fix: InfluxQL parser incompatibilities (#6034)

* fix: Parse regular expressions starting with a possible escape sequence

  This was failing because the previous combinator, `is_not`, returns an error when it consumes no input, which happens when the input begins with one of the characters in its set. That error prevented the remainder of the `regex_literal` parser from identifying and ignoring sequences like "\w".

* fix: Parse microsecond duration literals with correct unit suffix

* fix: Parse a var ref as a 3-part, segmented identifier

  Closes #6033

* chore: Address lint warnings

* chore: Additional test cases per feedback
2022-11-03 16:43:16 +11:00 · 2022-11-03 16:43:16 +11:00 · f54124102e
parent 4fb2843d05
commit f54124102e
4 changed files with 173 additions and 19 deletions
--- a/influxdb_influxql_parser/src/expression/arithmetic.rs
+++ b/influxdb_influxql_parser/src/expression/arithmetic.rs
@ -12,7 +12,7 @@ use nom::bytes::complete::tag;
 use nom::character::complete::{char, multispace0};
 use nom::combinator::{cut, map, opt, value};
 use nom::multi::{many0, separated_list0};
-use nom::sequence::{delimited, pair, preceded, separated_pair, tuple};
+use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
 use std::fmt::{Display, Formatter, Write};

 /// An InfluxQL arithmetic expression.
@ -316,11 +316,54 @@ where
    )(i)
 }

-/// Parse a variable reference, which is an identifier followed by an optional cast expression.
+/// Parse a segmented identifier
+///
+/// ```text
+/// segmented_identifier ::= identifier |
+///                          ( identifier "." identifier ) |
+///                          ( identifier "." identifier? "." identifier )
+/// ```
+fn segmented_identifier(i: &str) -> ParseResult<&str, Identifier> {
+    let (remaining, (opt_prefix, name)) = pair(
+        opt(alt((
+            // ident2 "." ident1 "."
+            map(
+                pair(
+                    terminated(identifier, tag(".")),
+                    terminated(identifier, tag(".")),
+                ),
+                |(ident2, ident1)| (Some(ident2), Some(ident1)),
+            ),
+            // identifier ".."
+            map(terminated(identifier, tag("..")), |ident2| {
+                (Some(ident2), None)
+            }),
+            // identifier "."
+            map(terminated(identifier, tag(".")), |ident1| {
+                (None, Some(ident1))
+            }),
+        ))),
+        identifier,
+    )(i)?;
+
+    Ok((
+        remaining,
+        match opt_prefix {
+            Some((None, Some(ident1))) => format!("{}.{}", ident1.0, name.0).into(),
+            Some((Some(ident2), None)) => format!("{}..{}", ident2.0, name.0).into(),
+            Some((Some(ident2), Some(ident1))) => {
+                format!("{}.{}.{}", ident2.0, ident1.0, name.0).into()
+            }
+            _ => name,
+        },
+    ))
+}
+
+/// Parse a variable reference, which is a segmented identifier followed by an optional cast expression.
 pub(crate) fn var_ref(i: &str) -> ParseResult<&str, Expr> {
    map(
        pair(
-            identifier,
+            segmented_identifier,
            opt(preceded(
                tag("::"),
                expect(
@ -515,6 +558,19 @@ mod test {
        let (_, got) = var_ref("foo").unwrap();
        assert_eq!(got, var_ref!("foo"));

+        // Whilst this is parsed as a 3-part name, it is treated as a quoted string 🙄
+        // VarRefs are parsed as segmented identifiers
+        //
+        //   * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2515-L2516
+        //
+        // and then the segments are joined as a single string
+        //
+        //   * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2551
+        let (rem, got) = var_ref("db.rp.foo").unwrap();
+        assert_eq!(got, var_ref!("db.rp.foo"));
+        assert_eq!(format!("{}", got), r#""db.rp.foo""#);
+        assert_eq!(rem, "");
+
        // with cast operator
        let (_, got) = var_ref("foo::tag").unwrap();
        assert_eq!(got, var_ref!("foo", Tag));
@ -539,6 +595,62 @@ mod test {
        assert!(got.is_empty())
    }

+    #[test]
+    fn test_segmented_identifier() {
+        // Unquoted
+        let (rem, id) = segmented_identifier("part0").unwrap();
+        assert_eq!(rem, "");
+        assert_eq!(format!("{}", id), "part0");
+
+        // id.id
+        let (rem, id) = segmented_identifier("part1.part0").unwrap();
+        assert_eq!(rem, "");
+        assert_eq!(format!("{}", id), "\"part1.part0\"");
+
+        // id..id
+        let (rem, id) = segmented_identifier("part2..part0").unwrap();
+        assert_eq!(rem, "");
+        assert_eq!(format!("{}", id), "\"part2..part0\"");
+
+        // id.id.id
+        let (rem, id) = segmented_identifier("part2.part1.part0").unwrap();
+        assert_eq!(rem, "");
+        assert_eq!(format!("{}", id), "\"part2.part1.part0\"");
+
+        // "id"."id".id
+        let (rem, id) = segmented_identifier(r#""part 2"."part 1".part0"#).unwrap();
+        assert_eq!(rem, "");
+        assert_eq!(format!("{}", id), "\"part 2.part 1.part0\"");
+
+        // Only parses 3 segments
+        let (rem, id) = segmented_identifier("part2.part1.part0.foo").unwrap();
+        assert_eq!(rem, ".foo");
+        assert_eq!(format!("{}", id), "\"part2.part1.part0\"");
+
+        // Quoted
+        let (rem, id) = segmented_identifier("\"part0\"").unwrap();
+        assert_eq!(rem, "");
+        assert_eq!(format!("{}", id), "part0");
+
+        // Additional test cases, with compatibility proven via https://go.dev/play/p/k2150CJocVl
+
+        let (rem, id) = segmented_identifier(r#""part" 2"."part 1".part0"#).unwrap();
+        assert_eq!(rem, r#" 2"."part 1".part0"#);
+        assert_eq!(format!("{}", id), "part");
+
+        let (rem, id) = segmented_identifier(r#""part" 2."part 1".part0"#).unwrap();
+        assert_eq!(rem, r#" 2."part 1".part0"#);
+        assert_eq!(format!("{}", id), "part");
+
+        let (rem, id) = segmented_identifier(r#""part "2"."part 1".part0"#).unwrap();
+        assert_eq!(rem, r#"2"."part 1".part0"#);
+        assert_eq!(format!("{}", id), r#""part ""#);
+
+        let (rem, id) = segmented_identifier(r#""part ""2"."part 1".part0"#).unwrap();
+        assert_eq!(rem, r#""2"."part 1".part0"#);
+        assert_eq!(format!("{}", id), r#""part ""#);
+    }
+
    #[test]
    fn test_display_expr() {
        let (_, e) = arithmetic_expression("5 + 51").unwrap();
--- a/influxdb_influxql_parser/src/literal.rs
+++ b/influxdb_influxql_parser/src/literal.rs
@ -249,15 +249,15 @@ fn single_duration(i: &str) -> ParseResult<&str, i64> {
        pair(
            integer,
            alt((
-                value(Nanosecond, tag("ns")),   // nanoseconds
-                value(Microsecond, tag("µs")), // microseconds
-                value(Microsecond, tag("us")),  // microseconds
-                value(Millisecond, tag("ms")),  // milliseconds
-                value(Second, tag("s")),        // seconds
-                value(Minute, tag("m")),        // minutes
-                value(Hour, tag("h")),          // hours
-                value(Day, tag("d")),           // days
-                value(Week, tag("w")),          // weeks
+                value(Nanosecond, tag("ns")),  // nanoseconds
+                value(Microsecond, tag("µ")), // microseconds
+                value(Microsecond, tag("u")),  // microseconds
+                value(Millisecond, tag("ms")), // milliseconds
+                value(Second, tag("s")),       // seconds
+                value(Minute, tag("m")),       // minutes
+                value(Hour, tag("h")),         // hours
+                value(Day, tag("d")),          // days
+                value(Week, tag("w")),         // weeks
            )),
        ),
        |(v, unit)| match unit {
@ -410,10 +410,14 @@ mod test {
        let (_, got) = single_duration("38ns").unwrap();
        assert_eq!(got, 38);

-        let (_, got) = single_duration("22us").unwrap();
+        let (_, got) = single_duration("22u").unwrap();
        assert_eq!(got, 22 * NANOS_PER_MICRO);

-        let (_, got) = single_duration("7µs").unwrap();
+        let (rem, got) = single_duration("22us").unwrap();
+        assert_eq!(got, 22 * NANOS_PER_MICRO);
+        assert_eq!(rem, "s"); // prove that we ignore the trailing s
+
+        let (_, got) = single_duration("7µ").unwrap();
        assert_eq!(got, 7 * NANOS_PER_MICRO);

        let (_, got) = single_duration("15ms").unwrap();
--- a/influxdb_influxql_parser/src/select.rs
+++ b/influxdb_influxql_parser/src/select.rs
@ -774,6 +774,12 @@ mod test {
            select_statement("SELECT value FROM cpu WHERE time <= now()TZ('Australia/Hobart')")
                .unwrap();
        assert_eq!(rem, "");
+
+        // segmented var ref identifiers
+        let (rem, _) =
+            select_statement(r#"SELECT LAST("n.usage_user") FROM cpu WHERE n.usage_user > 0"#)
+                .unwrap();
+        assert_eq!(rem, "");
    }

    #[test]
@ -848,6 +854,16 @@ mod test {
            }
        );

+        // Parse a call wrapping a quoted identifier that contains a '.', with no alias
+        let (_, got) = Field::parse("LAST(\"n.asks\")").unwrap();
+        assert_eq!(
+            got,
+            Field {
+                expr: call!("LAST", var_ref!("n.asks")),
+                alias: None
+            }
+        );
+
        // Parse a call with a VarRef
        let (_, got) = Field::parse("DISTINCT foo AS bar").unwrap();
        assert_eq!(
--- a/influxdb_influxql_parser/src/string.rs
+++ b/influxdb_influxql_parser/src/string.rs
@ -7,8 +7,8 @@
 use crate::impl_tuple_clause;
 use crate::internal::{expect, ParseError, ParseResult};
 use nom::branch::alt;
-use nom::bytes::complete::{is_not, tag};
-use nom::character::complete::char;
+use nom::bytes::complete::{is_not, tag, take_till};
+use nom::character::complete::{anychar, char};
 use nom::combinator::{map, value, verify};
 use nom::error::Error;
 use nom::multi::fold_many0;
@ -137,13 +137,24 @@ fn regex_literal(i: &str) -> ParseResult<&str, &str> {

    loop {
        // match everything except `\`, `/` or `\n`
-        let (_, match_i) = is_not("\\/\n")(remaining)?;
+        let (_, match_i) = take_till(|c| c == '\\' || c == '/' || c == '\n')(remaining)?;
        consumed = &i[..(consumed.len() + match_i.len())];
        remaining = &i[consumed.len()..];

+        // If we didn't consume anything, check whether it is a newline or regex delimiter,
+        // which signals we should leave this parser for outer processing.
+        if consumed.is_empty() {
+            is_not("/\n")(remaining)?;
+        }
+
        // Try and consume '\' followed by a '/'
        if let Ok((remaining_i, _)) = char::<_, Error<&str>>('\\')(remaining) {
            if char::<_, Error<&str>>('/')(remaining_i).is_ok() {
+                // If we didn't consume anything but found a "\/" sequence, we must
+                // return an error here; otherwise the outer fold_many0 parser would see a
+                // successful parse that consumed no input and fail with its infinite-loop guard.
+                anychar(consumed)?;
+
                // We're escaping a '/' (a regex delimiter), so finish and let
                // the outer parser match and unescape
                return Ok((remaining, consumed));
@ -201,6 +212,10 @@ mod test {
        let (_, got) = double_quoted_string(r#""quick draw""#).unwrap();
        assert_eq!(got, "quick draw");

+        // ascii
+        let (_, got) = double_quoted_string(r#""n.asks""#).unwrap();
+        assert_eq!(got, "n.asks");
+
        // unicode
        let (_, got) = double_quoted_string("\"quick draw\u{1f47d}\"").unwrap();
        assert_eq!(
@ -265,6 +280,9 @@ mod test {
        let (_, got) = single_quoted_string(r#"'\n\''"#).unwrap();
        assert_eq!(got, "\n'");

+        let (_, got) = single_quoted_string(r#"'\'hello\''"#).unwrap();
+        assert_eq!(got, "'hello'");
+
        // literal tab
        let (_, got) = single_quoted_string("'quick\tdraw'").unwrap();
        assert_eq!(got, "quick\tdraw");
@ -300,13 +318,17 @@ mod test {
        assert_eq!(got, "hello".into());

        // handle escaped delimiters "\/"
-        let (_, got) = regex(r#"/this\/is\/a\/path/"#).unwrap();
-        assert_eq!(got, "this/is/a/path".into());
+        let (_, got) = regex(r#"/\/this\/is\/a\/path/"#).unwrap();
+        assert_eq!(got, "/this/is/a/path".into());

        // ignores any other possible escape sequence
        let (_, got) = regex(r#"/hello\n/"#).unwrap();
        assert_eq!(got, "hello\\n".into());

+        // can parse possible escape sequence at beginning of regex
+        let (_, got) = regex(r#"/\w.*/"#).unwrap();
+        assert_eq!(got, "\\w.*".into());
+
        // Empty regex
        let (i, got) = regex("//").unwrap();
        assert_eq!(i, "");