fix: InfluxQL parser incompatibilities (#6034)

* fix: Parse regular expressions starting with possible escape sequence

This was failing because the previous combinator, `is_not`, returns an
error when it consumes no input, which happens when the input begins with
one of the characters in its set. That error prevented the remainder of the
`regex_literal` parser from identifying and ignoring sequences like "\w".

* fix: Parse microsecond duration literals with correct unit suffix

* fix: Parse a var ref as a 3-part, segmented identifier

Closes #6033

* chore: Address lint warnings

* chore: Additional test cases per feedback
pull/24376/head
Stuart Carnie 2022-11-03 16:43:16 +11:00 committed by GitHub
parent 4fb2843d05
commit f54124102e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 173 additions and 19 deletions

View File

@ -12,7 +12,7 @@ use nom::bytes::complete::tag;
use nom::character::complete::{char, multispace0};
use nom::combinator::{cut, map, opt, value};
use nom::multi::{many0, separated_list0};
use nom::sequence::{delimited, pair, preceded, separated_pair, tuple};
use nom::sequence::{delimited, pair, preceded, separated_pair, terminated, tuple};
use std::fmt::{Display, Formatter, Write};
/// An InfluxQL arithmetic expression.
@ -316,11 +316,54 @@ where
)(i)
}
/// Parse a variable reference, which is an identifier followed by an optional cast expression.
/// Parse a segmented identifier of up to three parts and join the parts
/// into a single [`Identifier`].
///
/// ```text
/// segmented_identifier ::= identifier |
///                          ( identifier "." identifier ) |
///                          ( identifier "." identifier? "." identifier )
/// ```
///
/// A lone identifier is returned unchanged. When leading segments are present,
/// the segments are re-joined with their `.` separators (an omitted middle
/// segment yields `..`) and converted into one identifier.
fn segmented_identifier(i: &str) -> ParseResult<&str, Identifier> {
    // Every alternative renders the leading segment(s) together with the
    // trailing separator, so the final segment only has to be appended.
    // The two-segment form is tried first so it wins over the shorter ones.
    let leading_segments = alt((
        // identifier "." identifier "."
        map(
            pair(
                terminated(identifier, tag(".")),
                terminated(identifier, tag(".")),
            ),
            |(outer, inner): (Identifier, Identifier)| format!("{}.{}.", outer.0, inner.0),
        ),
        // identifier ".."
        map(terminated(identifier, tag("..")), |outer: Identifier| {
            format!("{}..", outer.0)
        }),
        // identifier "."
        map(terminated(identifier, tag(".")), |inner: Identifier| {
            format!("{}.", inner.0)
        }),
    ));

    let (rest, (prefix, last)) = pair(opt(leading_segments), identifier)(i)?;

    let joined: Identifier = match prefix {
        Some(prefix) => format!("{}{}", prefix, last.0).into(),
        // No leading segments: hand back the identifier exactly as parsed.
        None => last,
    };

    Ok((rest, joined))
}
/// Parse a variable reference, which is a segmented identifier followed by an optional cast expression.
pub(crate) fn var_ref(i: &str) -> ParseResult<&str, Expr> {
map(
pair(
identifier,
segmented_identifier,
opt(preceded(
tag("::"),
expect(
@ -515,6 +558,19 @@ mod test {
let (_, got) = var_ref("foo").unwrap();
assert_eq!(got, var_ref!("foo"));
// Whilst this is parsed as a 3-part name, it is treated as a quoted string 🙄
// VarRefs are parsed as segmented identifiers
//
// * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2515-L2516
//
// and then the segments are joined as a single string
//
// * https://github.com/influxdata/influxql/blob/7e7d61973256ffeef4b99edd0a89f18a9e52fa2d/parser.go#L2551
let (rem, got) = var_ref("db.rp.foo").unwrap();
assert_eq!(got, var_ref!("db.rp.foo"));
assert_eq!(format!("{}", got), r#""db.rp.foo""#);
assert_eq!(rem, "");
// with cast operator
let (_, got) = var_ref("foo::tag").unwrap();
assert_eq!(got, var_ref!("foo", Tag));
@ -539,6 +595,62 @@ mod test {
assert!(got.is_empty())
}
/// Exercises `segmented_identifier` over one-, two- and three-part names,
/// quoted segments, and inputs where parsing stops early.
#[test]
fn test_segmented_identifier() {
    // Each case is (input, expected remaining input, expected `Display` output).
    let cases = [
        // Unquoted
        ("part0", "", "part0"),
        // id.id
        ("part1.part0", "", "\"part1.part0\""),
        // id..id
        ("part2..part0", "", "\"part2..part0\""),
        // id.id.id
        ("part2.part1.part0", "", "\"part2.part1.part0\""),
        // "id"."id".id
        (
            r#""part 2"."part 1".part0"#,
            "",
            "\"part 2.part 1.part0\"",
        ),
        // Only parses 3 segments
        ("part2.part1.part0.foo", ".foo", "\"part2.part1.part0\""),
        // Quoted
        ("\"part0\"", "", "part0"),
        // Additional test cases, with compatibility proven via https://go.dev/play/p/k2150CJocVl
        (
            r#""part" 2"."part 1".part0"#,
            r#" 2"."part 1".part0"#,
            "part",
        ),
        (
            r#""part" 2."part 1".part0"#,
            r#" 2."part 1".part0"#,
            "part",
        ),
        (
            r#""part "2"."part 1".part0"#,
            r#"2"."part 1".part0"#,
            r#""part ""#,
        ),
        (
            r#""part ""2"."part 1".part0"#,
            r#""2"."part 1".part0"#,
            r#""part ""#,
        ),
    ];

    for &(input, want_rem, want_display) in &cases {
        let (rem, id) = segmented_identifier(input).unwrap();
        assert_eq!(rem, want_rem);
        assert_eq!(format!("{}", id), want_display);
    }
}
#[test]
fn test_display_expr() {
let (_, e) = arithmetic_expression("5 + 51").unwrap();

View File

@ -249,15 +249,15 @@ fn single_duration(i: &str) -> ParseResult<&str, i64> {
pair(
integer,
alt((
value(Nanosecond, tag("ns")), // nanoseconds
value(Microsecond, tag("µs")), // microseconds
value(Microsecond, tag("us")), // microseconds
value(Millisecond, tag("ms")), // milliseconds
value(Second, tag("s")), // seconds
value(Minute, tag("m")), // minutes
value(Hour, tag("h")), // hours
value(Day, tag("d")), // days
value(Week, tag("w")), // weeks
value(Nanosecond, tag("ns")), // nanoseconds
value(Microsecond, tag("µ")), // microseconds
value(Microsecond, tag("u")), // microseconds
value(Millisecond, tag("ms")), // milliseconds
value(Second, tag("s")), // seconds
value(Minute, tag("m")), // minutes
value(Hour, tag("h")), // hours
value(Day, tag("d")), // days
value(Week, tag("w")), // weeks
)),
),
|(v, unit)| match unit {
@ -410,10 +410,14 @@ mod test {
let (_, got) = single_duration("38ns").unwrap();
assert_eq!(got, 38);
let (_, got) = single_duration("22us").unwrap();
let (_, got) = single_duration("22u").unwrap();
assert_eq!(got, 22 * NANOS_PER_MICRO);
let (_, got) = single_duration("7µs").unwrap();
let (rem, got) = single_duration("22us").unwrap();
assert_eq!(got, 22 * NANOS_PER_MICRO);
assert_eq!(rem, "s"); // prove that we ignore the trailing s
let (_, got) = single_duration("7µ").unwrap();
assert_eq!(got, 7 * NANOS_PER_MICRO);
let (_, got) = single_duration("15ms").unwrap();

View File

@ -774,6 +774,12 @@ mod test {
select_statement("SELECT value FROM cpu WHERE time <= now()TZ('Australia/Hobart')")
.unwrap();
assert_eq!(rem, "");
// segmented var ref identifiers
let (rem, _) =
select_statement(r#"SELECT LAST("n.usage_user") FROM cpu WHERE n.usage_user > 0"#)
.unwrap();
assert_eq!(rem, "");
}
#[test]
@ -848,6 +854,16 @@ mod test {
}
);
// Parse a call whose argument is a quoted identifier containing a dot, with no alias
let (_, got) = Field::parse("LAST(\"n.asks\")").unwrap();
assert_eq!(
got,
Field {
expr: call!("LAST", var_ref!("n.asks")),
alias: None
}
);
// Parse a call with a VarRef
let (_, got) = Field::parse("DISTINCT foo AS bar").unwrap();
assert_eq!(

View File

@ -7,8 +7,8 @@
use crate::impl_tuple_clause;
use crate::internal::{expect, ParseError, ParseResult};
use nom::branch::alt;
use nom::bytes::complete::{is_not, tag};
use nom::character::complete::char;
use nom::bytes::complete::{is_not, tag, take_till};
use nom::character::complete::{anychar, char};
use nom::combinator::{map, value, verify};
use nom::error::Error;
use nom::multi::fold_many0;
@ -137,13 +137,24 @@ fn regex_literal(i: &str) -> ParseResult<&str, &str> {
loop {
// match everything except `\`, `/` or `\n`
let (_, match_i) = is_not("\\/\n")(remaining)?;
let (_, match_i) = take_till(|c| c == '\\' || c == '/' || c == '\n')(remaining)?;
consumed = &i[..(consumed.len() + match_i.len())];
remaining = &i[consumed.len()..];
// If we didn't consume anything, check whether it is a newline or regex delimiter,
// which signals we should leave this parser for outer processing.
if consumed.is_empty() {
is_not("/\n")(remaining)?;
}
// Try and consume '\' followed by a '/'
if let Ok((remaining_i, _)) = char::<_, Error<&str>>('\\')(remaining) {
if char::<_, Error<&str>>('/')(remaining_i).is_ok() {
// If we didn't consume anything, but we found "\/" sequence,
// we need to return an error so the outer fold_many0 parser does not trigger
// an infinite recursion error.
anychar(consumed)?;
// We're escaping a '/' (a regex delimiter), so finish and let
// the outer parser match and unescape
return Ok((remaining, consumed));
@ -201,6 +212,10 @@ mod test {
let (_, got) = double_quoted_string(r#""quick draw""#).unwrap();
assert_eq!(got, "quick draw");
// ascii
let (_, got) = double_quoted_string(r#""n.asks""#).unwrap();
assert_eq!(got, "n.asks");
// unicode
let (_, got) = double_quoted_string("\"quick draw\u{1f47d}\"").unwrap();
assert_eq!(
@ -265,6 +280,9 @@ mod test {
let (_, got) = single_quoted_string(r#"'\n\''"#).unwrap();
assert_eq!(got, "\n'");
let (_, got) = single_quoted_string(r#"'\'hello\''"#).unwrap();
assert_eq!(got, "'hello'");
// literal tab
let (_, got) = single_quoted_string("'quick\tdraw'").unwrap();
assert_eq!(got, "quick\tdraw");
@ -300,13 +318,17 @@ mod test {
assert_eq!(got, "hello".into());
// handle escaped delimiters "\/"
let (_, got) = regex(r#"/this\/is\/a\/path/"#).unwrap();
assert_eq!(got, "this/is/a/path".into());
let (_, got) = regex(r#"/\/this\/is\/a\/path/"#).unwrap();
assert_eq!(got, "/this/is/a/path".into());
// ignores any other possible escape sequence
let (_, got) = regex(r#"/hello\n/"#).unwrap();
assert_eq!(got, "hello\\n".into());
// can parse possible escape sequence at beginning of regex
let (_, got) = regex(r#"/\w.*/"#).unwrap();
assert_eq!(got, "\\w.*".into());
// Empty regex
let (i, got) = regex("//").unwrap();
assert_eq!(i, "");