diff --git a/Cargo.lock b/Cargo.lock
index e383b52036..ecbcaf2e77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -340,8 +340,11 @@ dependencies = [
  "datafusion",
  "hashbrown 0.13.2",
  "num-traits",
+ "once_cell",
  "rand",
+ "regex",
  "snafu",
+ "uuid",
  "workspace-hack",
 ]
 
@@ -4513,7 +4516,6 @@ dependencies = [
  "predicate",
  "prost",
  "rand",
- "regex",
  "schema",
  "service_common",
  "service_grpc_catalog",
@@ -5656,7 +5658,6 @@ dependencies = [
  "parking_lot 0.12.1",
  "prost",
  "rand",
- "regex",
  "reqwest",
  "snafu",
  "sqlx",
@@ -5665,7 +5666,6 @@
  "tokio",
  "tokio-util",
  "tonic",
- "uuid",
  "workspace-hack",
 ]
diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml
index 0644ef0dd4..5bea185e07 100644
--- a/arrow_util/Cargo.toml
+++ b/arrow_util/Cargo.toml
@@ -15,7 +15,10 @@ chrono = { version = "0.4", default-features = false }
 comfy-table = { version = "6.1", default-features = false }
 hashbrown = { workspace = true }
 num-traits = "0.2"
+once_cell = { version = "1.17", features = ["parking_lot"] }
+regex = "1.7.0"
 snafu = "0.7"
+uuid = "1"
 workspace-hack = { version = "0.1", path = "../workspace-hack" }
 
 [dev-dependencies]
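The three new `arrow_util` dependencies all serve the normalization helpers added to `test_util.rs` below: `regex` does the matching, `once_cell` lets each pattern compile exactly once, and `uuid` builds the deterministic placeholders. A minimal sketch of the `Lazy`-static pattern these enable (illustrative names, not part of this diff):

```rust
use once_cell::sync::Lazy;
use regex::Regex;

// Compiled on first use, then shared by every subsequent caller.
static EXAMPLE_UUID_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}")
        .expect("valid regex")
});

fn contains_uuid(s: &str) -> bool {
    EXAMPLE_UUID_RE.is_match(s)
}
```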
diff --git a/arrow_util/src/test_util.rs b/arrow_util/src/test_util.rs
index ba47946618..d0bb162d8f 100644
--- a/arrow_util/src/test_util.rs
+++ b/arrow_util/src/test_util.rs
@@ -1,6 +1,7 @@
 //! A collection of testing functions for arrow based code
 use std::sync::Arc;
 
+use crate::display::pretty_format_batches;
 use arrow::{
     array::{new_null_array, ArrayRef, StringArray},
     compute::kernels::sort::{lexsort, SortColumn, SortOptions},
@@ -8,6 +9,10 @@ use arrow::{
     error::ArrowError,
     record_batch::RecordBatch,
 };
+use once_cell::sync::Lazy;
+use regex::{Captures, Regex};
+use std::{borrow::Cow, collections::HashMap};
+use uuid::Uuid;
 
 /// Compares the formatted output with the pretty formatted results of
 /// record batches. This is a macro so errors appear on the correct line
@@ -181,3 +186,195 @@ pub fn equalize_batch_schemas(batches: Vec<RecordBatch>) -> Result<Vec<RecordBatch>, ArrowError> {
         })
         .collect())
 }
+
+/// Match the parquet UUID
+///
+/// For example, given
+/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
+///
+/// matches `1d325760-2b20-48de-ab48-2267b034133d`
+static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
+    Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex")
+});
+
+/// Match the parquet directory names
+/// For example, given
+/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
+///
+/// matches `32/51/216/13452`
+static REGEX_DIRS: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"[0-9]+/[0-9]+/[0-9]+/[0-9]+"#).expect("directory regex"));
+
+/// Replace table row separators of flexible width with fixed width. This is required
+/// because the original timing values may differ in "printed width", so the table
+/// cells have different widths and hence the separators / borders. E.g.:
+///
+/// `+--+--+` -> `----------`
+/// `+--+------+` -> `----------`
+///
+/// Note that we're kinda inexact with our regex here, but it gets the job done.
+static REGEX_LINESEP: Lazy<Regex> = Lazy::new(|| Regex::new(r#"[+-]{6,}"#).expect("linesep regex"));
+
+/// Similar to the row separator issue above, the table columns are right-padded
+/// with spaces. Due to the different "printed width" of the timing values, we need
+/// to normalize this padding as well. E.g.:
+///
+/// `     |` -> ` |`
+/// `   |` -> ` |`
+static REGEX_COL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+\|"#).expect("col regex"));
+
+/// Matches lines like `metrics=[foo=1, bar=2]`
+static REGEX_METRICS: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"metrics=\[([^\]]*)\]"#).expect("metrics regex"));
+
+/// Matches things like `1s`, `1.2ms` and `10.2μs`
+static REGEX_TIMING: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"[0-9]+(\.[0-9]+)?.s"#).expect("timing regex"));
+
+/// Matches things like `FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000`
+static REGEX_FILTER: Lazy<Regex> =
+    Lazy::new(|| Regex::new("FilterExec: .*").expect("filter regex"));
+
+fn normalize_for_variable_width(s: Cow<'_, str>) -> String {
+    let s = REGEX_LINESEP.replace_all(&s, "----------");
+    REGEX_COL.replace_all(&s, " |").to_string()
+}
+
+/// Options for normalizing the output of queries before comparison
+#[derive(Debug, PartialEq, Eq, Default, Clone, Copy)]
+pub struct Normalizer {
+    /// If true, results are sorted first
+    pub sorted_compare: bool,
+
+    /// If true, replace UUIDs with static placeholders.
+    pub normalized_uuids: bool,
+
+    /// If true, normalize timings in queries by replacing them with
+    /// static placeholders, for example:
+    ///
+    /// `1s` -> `1.234ms`
+    pub normalized_metrics: bool,
+
+    /// If true, normalize filter predicates for explain plans
+    /// `FilterExec: <REDACTED>`
+    pub normalized_filters: bool,
+}
+
+impl Normalizer {
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Take the output of running the query and apply the specified normalizations to them
+    pub fn normalize_results(&self, mut results: Vec<RecordBatch>) -> Vec<String> {
+        // compare against sorted results, if requested
+        if self.sorted_compare && !results.is_empty() {
+            let schema = results[0].schema();
+            let batch =
+                arrow::compute::concat_batches(&schema, &results).expect("concatenating batches");
+            results = vec![sort_record_batch(batch)];
+        }
+
+        let mut current_results = pretty_format_batches(&results)
+            .unwrap()
+            .trim()
+            .lines()
+            .map(|s| s.to_string())
+            .collect::<Vec<_>>();
+
+        // normalize UUIDs, if requested
+        if self.normalized_uuids {
+            let mut seen: HashMap<String, u128> = HashMap::new();
+            current_results = current_results
+                .into_iter()
+                .map(|s| {
+                    // Rewrite parquet directory names like
+                    // `51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
+                    //
+                    // to:
+                    // 1/1/1/1/00000000-0000-0000-0000-000000000000.parquet
+
+                    let s = REGEX_UUID.replace_all(&s, |s: &Captures<'_>| {
+                        let next = seen.len() as u128;
+                        Uuid::from_u128(
+                            *seen
+                                .entry(s.get(0).unwrap().as_str().to_owned())
+                                .or_insert(next),
+                        )
+                        .to_string()
+                    });
+
+                    let s = normalize_for_variable_width(s);
+                    REGEX_DIRS.replace_all(&s, "1/1/1/1").to_string()
+                })
+                .collect();
+        }
+
+        // normalize metrics, if requested
+        if self.normalized_metrics {
+            current_results = current_results
+                .into_iter()
+                .map(|s| {
+                    // Replace timings with fixed value, e.g.:
+                    //
+                    // `1s` -> `1.234ms`
+                    // `1.2ms` -> `1.234ms`
+                    // `10.2μs` -> `1.234ms`
+                    let s = REGEX_TIMING.replace_all(&s, "1.234ms");
+
+                    let s = normalize_for_variable_width(s);
+
+                    // Metrics are currently ordered by value (not by key), so different timings may
+                    // reorder them. We "parse" the list and normalize the sorting. E.g.:
+                    //
+                    // `metrics=[]` => `metrics=[]`
+                    // `metrics=[foo=1, bar=2]` => `metrics=[bar=2, foo=1]`
+                    // `metrics=[foo=2, bar=1]` => `metrics=[bar=1, foo=2]`
+                    REGEX_METRICS
+                        .replace_all(&s, |c: &Captures<'_>| {
+                            let mut metrics: Vec<_> = c[1].split(", ").collect();
+                            metrics.sort();
+                            format!("metrics=[{}]", metrics.join(", "))
+                        })
+                        .to_string()
+                })
+                .collect();
+        }
+
+        // normalize Filters, if requested
+        //
+        // Converts:
+        //   FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000
+        //
+        // to
+        //   FilterExec: <REDACTED>
+        if self.normalized_filters {
+            current_results = current_results
+                .into_iter()
+                .map(|s| {
+                    REGEX_FILTER
+                        .replace_all(&s, |_: &Captures<'_>| "FilterExec: <REDACTED>")
+                        .to_string()
+                })
+                .collect();
+        }
+
+        current_results
+    }
+
+    /// Adds information on what normalizations were applied to the input
+    pub fn add_description(&self, output: &mut Vec<String>) {
+        if self.sorted_compare {
+            output.push("-- Results After Sorting".into())
+        }
+        if self.normalized_uuids {
+            output.push("-- Results After Normalizing UUIDs".into())
+        }
+        if self.normalized_metrics {
+            output.push("-- Results After Normalizing Metrics".into())
+        }
+        if self.normalized_filters {
+            output.push("-- Results After Normalizing Filters".into())
+        }
+    }
+}
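For context, a sketch of how the relocated `Normalizer` is driven from a test. The batch construction is elided and the flag choice is illustrative, but the struct fields and the `normalize_results` signature are exactly the ones added above:

```rust
use arrow::record_batch::RecordBatch;
use arrow_util::test_util::Normalizer;

fn normalized_lines(results: Vec<RecordBatch>) -> Vec<String> {
    let normalizer = Normalizer {
        sorted_compare: true,
        normalized_uuids: true,
        ..Default::default()
    };
    // Pretty-prints the batches, sorts the rows, and rewrites UUIDs,
    // id paths, and separator widths to stable placeholders.
    normalizer.normalize_results(results)
}
```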
diff --git a/querier/Cargo.toml b/querier/Cargo.toml
index 5bdb8d69c1..f3693581e8 100644
--- a/querier/Cargo.toml
+++ b/querier/Cargo.toml
@@ -56,5 +56,4 @@ insta = { version = "1.28.0", features = ["yaml"] }
 iox_tests = { path = "../iox_tests" }
 mutable_batch_lp = { path = "../mutable_batch_lp" }
 object_store_metrics = { path = "../object_store_metrics" }
-regex = "1.7.1"
 test_helpers = { path = "../test_helpers" }
diff --git a/querier/src/namespace/query_access.rs b/querier/src/namespace/query_access.rs
index 36a11f0f5c..c0e4a81643 100644
--- a/querier/src/namespace/query_access.rs
+++ b/querier/src/namespace/query_access.rs
@@ -199,13 +199,12 @@ mod tests {
     use super::*;
     use crate::namespace::test_util::{clear_parquet_cache, querier_namespace};
     use arrow::record_batch::RecordBatch;
-    use arrow_util::test_util::batches_to_sorted_lines;
+    use arrow_util::test_util::{batches_to_sorted_lines, Normalizer};
     use data_types::ColumnType;
     use datafusion::common::DataFusionError;
     use iox_query::frontend::sql::SqlQueryPlanner;
     use iox_tests::{TestCatalog, TestParquetFileBuilder};
     use metric::{Observation, RawReporter};
-    use regex::Regex;
     use snafu::{ResultExt, Snafu};
     use trace::{span::SpanStatus, RingBufferTraceCollector};
@@ -492,13 +491,13 @@
             format_explain(&querier_namespace, "EXPLAIN SELECT * FROM cpu").await,
             @r###"
         ---
-        - +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-        - "| plan_type | plan |"
-        - +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-        - "| logical_plan | TableScan: cpu projection=[foo, host, load, time] |"
-        - "| physical_plan | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/.parquet, 1/1/1/1/.parquet, 1/1/1/1/.parquet, 1/1/2/2/.parquet, 1/1/1/3/.parquet]]}, projection=[foo, host, load, time] |"
-        - "| | |"
-        - +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        - "----------"
+        - "| plan_type | plan |"
+        - "----------"
+        - "| logical_plan | TableScan: cpu projection=[foo, host, load, time] |"
+        - "| physical_plan | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000000.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000001.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000002.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000003.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000004.parquet]]}, projection=[foo, host, load, time] |"
+        - "| | |"
+        - "----------"
         "###
         );
 
@@ -509,22 +508,22 @@
             format_explain(&querier_namespace, "EXPLAIN SELECT * FROM mem ORDER BY host,time").await,
             @r###"
         ---
-        - +---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
-        - "| plan_type | plan |"
-        - +---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
-        - "| logical_plan | Sort: mem.host ASC NULLS LAST, mem.time ASC NULLS LAST |"
-        - "| | TableScan: mem projection=[host, perc, time] |"
-        - "| physical_plan | SortExec: expr=[host@0 ASC NULLS LAST,time@2 ASC NULLS LAST] |"
-        - "| | CoalescePartitionsExec |"
-        - "| | UnionExec |"
-        - "| | CoalesceBatchesExec: target_batch_size=8192 |"
-        - "| | FilterExec: time@2 < 1 OR time@2 > 13 OR NOT host@0 = CAST(d AS Dictionary(Int32, Utf8)) |"
-        - "| | ParquetExec: limit=None, partitions={1 group: [[1/2/1/4/.parquet]]}, projection=[host, perc, time] |"
-        - "| | CoalesceBatchesExec: target_batch_size=8192 |"
-        - "| | FilterExec: time@2 < 1 OR time@2 > 13 OR NOT host@0 = CAST(d AS Dictionary(Int32, Utf8)) |"
-        - "| | ParquetExec: limit=None, partitions={1 group: [[1/2/1/4/.parquet]]}, projection=[host, perc, time] |"
-        - "| | |"
-        - +---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
+        - "----------"
+        - "| plan_type | plan |"
+        - "----------"
+        - "| logical_plan | Sort: mem.host ASC NULLS LAST, mem.time ASC NULLS LAST |"
+        - "| | TableScan: mem projection=[host, perc, time] |"
+        - "| physical_plan | SortExec: expr=[host@0 ASC NULLS LAST,time@2 ASC NULLS LAST] |"
+        - "| | CoalescePartitionsExec |"
+        - "| | UnionExec |"
+        - "| | CoalesceBatchesExec: target_batch_size=8192 |"
+        - "| | FilterExec: time@2 < 1 OR time@2 > 13 OR NOT host@0 = CAST(d AS Dictionary(Int32, Utf8)) |"
+        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000000.parquet]]}, projection=[host, perc, time] |"
+        - "| | CoalesceBatchesExec: target_batch_size=8192 |"
+        - "| | FilterExec: time@2 < 1 OR time@2 > 13 OR NOT host@0 = CAST(d AS Dictionary(Int32, Utf8)) |"
+        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000001.parquet]]}, projection=[host, perc, time] |"
+        - "| | |"
+        - "----------"
         "###
         );
 
@@ -567,19 +566,19 @@
             format_explain(&querier_namespace, "EXPLAIN SELECT * FROM cpu").await,
             @r###"
         ---
-        - +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-        - "| plan_type | plan |"
-        - +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-        - "| logical_plan | TableScan: cpu projection=[foo, host, load, time] |"
-        - "| physical_plan | UnionExec |"
-        - "| | DeduplicateExec: [host@1 ASC,time@3 ASC] |"
-        - "| | SortPreservingMergeExec: [host@1 ASC,time@3 ASC] |"
-        - "| | UnionExec |"
-        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/2/2/.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |"
-        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/2/2/.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |"
-        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/.parquet, 1/1/1/1/.parquet, 1/1/1/1/.parquet, 1/1/1/3/.parquet]]}, projection=[foo, host, load, time] |"
-        - "| | |"
-        - +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        - "----------"
+        - "| plan_type | plan |"
+        - "----------"
+        - "| logical_plan | TableScan: cpu projection=[foo, host, load, time] |"
+        - "| physical_plan | UnionExec |"
+        - "| | DeduplicateExec: [host@1 ASC,time@3 ASC] |"
+        - "| | SortPreservingMergeExec: [host@1 ASC,time@3 ASC] |"
+        - "| | UnionExec |"
+        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000000.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |"
+        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000001.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |"
+        - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000002.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000003.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000004.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000005.parquet]]}, projection=[foo, host, load, time] |"
+        - "| | |"
+        - "----------"
         "###
         );
     }
@@ -599,15 +598,11 @@
     async fn format_explain(querier_namespace: &Arc<QuerierNamespace>, sql: &str) -> Vec<String> {
         let results = run(querier_namespace, sql, None).await;
-        let formatted = arrow_util::display::pretty_format_batches(&results).unwrap();
-
-        let regex = Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}")
-            .expect("UUID regex");
-        formatted
-            .trim()
-            .split('\n')
-            .map(|s| regex.replace_all(s, "").to_string())
-            .collect::<Vec<_>>()
+        let normalizer = Normalizer {
+            normalized_uuids: true,
+            ..Default::default()
+        };
+        normalizer.normalize_results(results)
     }
 
     async fn run(
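The snapshot churn above follows directly from the UUID remapping being deterministic: the first distinct UUID in the output becomes `00000000-0000-0000-0000-000000000000`, the second `...0001`, and so on, while every id path collapses to `1/1/1/1`. The core of that remapping, restated in isolation (a hypothetical helper mirroring the closure in `normalize_results`):

```rust
use std::collections::HashMap;
use uuid::Uuid;

// Each distinct input UUID maps to a placeholder numbered by order of
// first appearance, so repeated runs produce identical snapshots.
fn remap_uuid(seen: &mut HashMap<String, u128>, raw: &str) -> String {
    let next = seen.len() as u128;
    let id = *seen.entry(raw.to_owned()).or_insert(next);
    Uuid::from_u128(id).to_string()
}
```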
diff --git a/test_helpers_end_to_end/Cargo.toml b/test_helpers_end_to_end/Cargo.toml
index f9148131f9..ce37ecf357 100644
--- a/test_helpers_end_to_end/Cargo.toml
+++ b/test_helpers_end_to_end/Cargo.toml
@@ -26,7 +26,6 @@ once_cell = { version = "1.17", features = ["parking_lot"] }
 parking_lot = "0.12"
 prost = "0.11"
 rand = "0.8.3"
-regex = "1.7.0"
 reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
 snafu = "0.7"
 sqlx = { version = "0.6", features = [ "runtime-tokio-rustls" , "postgres", "uuid" ] }
@@ -35,5 +34,4 @@ test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
 tokio = { version = "1.25", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
 tokio-util = "0.7"
 tonic = "0.8"
-uuid = "1"
 workspace-hack = { version = "0.1", path = "../workspace-hack" }
"----------" "### ); @@ -567,19 +566,19 @@ mod tests { format_explain(&querier_namespace, "EXPLAIN SELECT * FROM cpu").await, @r###" --- - - +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - - "| plan_type | plan |" - - +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - - "| logical_plan | TableScan: cpu projection=[foo, host, load, time] |" - - "| physical_plan | UnionExec |" - - "| | DeduplicateExec: [host@1 ASC,time@3 ASC] |" - - "| | SortPreservingMergeExec: [host@1 ASC,time@3 ASC] |" - - "| | UnionExec |" - - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/2/2/.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |" - - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/2/2/.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |" - - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/.parquet, 1/1/1/1/.parquet, 1/1/1/1/.parquet, 1/1/1/3/.parquet]]}, projection=[foo, host, load, time] |" - - "| | |" - - +---------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + - "----------" + - "| plan_type | plan |" + - "----------" + - "| logical_plan | TableScan: cpu projection=[foo, host, load, time] |" + - "| physical_plan | UnionExec |" + - "| | DeduplicateExec: [host@1 ASC,time@3 ASC] |" + - "| | SortPreservingMergeExec: [host@1 ASC,time@3 ASC] |" + - "| | UnionExec |" + - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000000.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |" + - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000001.parquet]]}, output_ordering=[host@1 ASC, time@3 ASC], projection=[foo, host, load, time] |" + - "| | ParquetExec: limit=None, partitions={1 group: [[1/1/1/1/00000000-0000-0000-0000-000000000002.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000003.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000004.parquet, 1/1/1/1/00000000-0000-0000-0000-000000000005.parquet]]}, projection=[foo, host, load, time] |" + - "| | |" + - "----------" "### ); } @@ -599,15 +598,11 @@ mod tests { async fn format_explain(querier_namespace: &Arc, sql: &str) -> Vec { let results = run(querier_namespace, sql, None).await; - let formatted = arrow_util::display::pretty_format_batches(&results).unwrap(); - - let regex = Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}") - .expect("UUID regex"); - formatted - .trim() - .split('\n') - .map(|s| regex.replace_all(s, "").to_string()) - .collect::>() + let normalizer = Normalizer { + normalized_uuids: true, + ..Default::default() + }; + normalizer.normalize_results(results) } async fn run( diff --git 
diff --git a/test_helpers_end_to_end/src/snapshot_comparison.rs b/test_helpers_end_to_end/src/snapshot_comparison.rs
index 118b33db54..44a4bc3c92 100644
--- a/test_helpers_end_to_end/src/snapshot_comparison.rs
+++ b/test_helpers_end_to_end/src/snapshot_comparison.rs
@@ -1,11 +1,9 @@
-mod normalization;
 mod queries;
 
-use crate::snapshot_comparison::queries::TestQueries;
-use crate::{run_influxql, run_sql, MiniCluster};
+use crate::{run_influxql, run_sql, snapshot_comparison::queries::TestQueries, MiniCluster};
 use snafu::{OptionExt, ResultExt, Snafu};
-use std::fmt::{Display, Formatter};
 use std::{
+    fmt::{Display, Formatter},
     fs,
     path::{Path, PathBuf},
 };
diff --git a/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs b/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs
deleted file mode 100644
index 8a01d777ab..0000000000
--- a/test_helpers_end_to_end/src/snapshot_comparison/normalization.rs
+++ /dev/null
@@ -1,199 +0,0 @@
-use arrow::record_batch::RecordBatch;
-use arrow_util::{display::pretty_format_batches, test_util::sort_record_batch};
-use once_cell::sync::Lazy;
-use regex::{Captures, Regex};
-use std::{borrow::Cow, collections::HashMap};
-use uuid::Uuid;
-
-/// Match the parquet UUID
-///
-/// For example, given
-/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
-///
-/// matches `1d325760-2b20-48de-ab48-2267b034133d`
-static REGEX_UUID: Lazy<Regex> = Lazy::new(|| {
-    Regex::new("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").expect("UUID regex")
-});
-
-/// Match the parquet directory names
-/// For example, given
-/// `32/51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
-///
-/// matches `32/51/216/13452`
-static REGEX_DIRS: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"[0-9]+/[0-9]+/[0-9]+/[0-9]+"#).expect("directory regex"));
-
-/// Replace table row separators of flexible width with fixed with. This is required
-/// because the original timing values may differ in "printed width", so the table
-/// cells have different widths and hence the separators / borders. E.g.:
-///
-/// `+--+--+` -> `----------`
-/// `+--+------+` -> `----------`
-///
-/// Note that we're kinda inexact with our regex here, but it gets the job done.
-static REGEX_LINESEP: Lazy<Regex> = Lazy::new(|| Regex::new(r#"[+-]{6,}"#).expect("linesep regex"));
-
-/// Similar to the row separator issue above, the table columns are right-padded
-/// with spaces. Due to the different "printed width" of the timing values, we need
-/// to normalize this padding as well. E.g.:
-///
-/// `     |` -> ` |`
-/// `   |` -> ` |`
-static REGEX_COL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+\|"#).expect("col regex"));
-
-/// Matches line like `metrics=[foo=1, bar=2]`
-static REGEX_METRICS: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"metrics=\[([^\]]*)\]"#).expect("metrics regex"));
-
-/// Matches things like `1s`, `1.2ms` and `10.2μs`
-static REGEX_TIMING: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"[0-9]+(\.[0-9]+)?.s"#).expect("timing regex"));
-
-/// Matches things like `FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000`
-static REGEX_FILTER: Lazy<Regex> =
-    Lazy::new(|| Regex::new("FilterExec: .*").expect("filter regex"));
-
-fn normalize_for_variable_width(s: Cow<str>) -> String {
-    let s = REGEX_LINESEP.replace_all(&s, "----------");
-    REGEX_COL.replace_all(&s, " |").to_string()
-}
-
-/// A query to run with optional annotations
-#[derive(Debug, PartialEq, Eq, Default)]
-pub struct Normalizer {
-    /// If true, results are sorted first
-    pub sorted_compare: bool,
-
-    /// If true, replace UUIDs with static placeholders.
-    pub normalized_uuids: bool,
-
-    /// If true, normalize timings in queries by replacing them with
-    /// static placeholders, for example:
-    ///
-    /// `1s` -> `1.234ms`
-    pub normalized_metrics: bool,
-
-    /// if true, normalize filter predicates for explain plans
-    /// `FilterExec: <REDACTED>`
-    pub normalized_filters: bool,
-}
-
-impl Normalizer {
-    #[cfg(test)]
-    pub fn new() -> Self {
-        Default::default()
-    }
-
-    /// Take the output of running the query and apply the specified normalizations to them
-    pub fn normalize_results(&self, mut results: Vec<RecordBatch>) -> Vec<String> {
-        // compare against sorted results, if requested
-        if self.sorted_compare && !results.is_empty() {
-            let schema = results[0].schema();
-            let batch =
-                arrow::compute::concat_batches(&schema, &results).expect("concatenating batches");
-            results = vec![sort_record_batch(batch)];
-        }
-
-        let mut current_results = pretty_format_batches(&results)
-            .unwrap()
-            .trim()
-            .lines()
-            .map(|s| s.to_string())
-            .collect::<Vec<_>>();
-
-        // normalize UUIDs, if requested
-        if self.normalized_uuids {
-            let mut seen: HashMap<String, u128> = HashMap::new();
-            current_results = current_results
-                .into_iter()
-                .map(|s| {
-                    // Rewrite parquet directory names like
-                    // `51/216/13452/1d325760-2b20-48de-ab48-2267b034133d.parquet`
-                    //
-                    // to:
-                    // 1/1/1/1/00000000-0000-0000-0000-000000000000.parquet
-
-                    let s = REGEX_UUID.replace_all(&s, |s: &Captures| {
-                        let next = seen.len() as u128;
-                        Uuid::from_u128(
-                            *seen
-                                .entry(s.get(0).unwrap().as_str().to_owned())
-                                .or_insert(next),
-                        )
-                        .to_string()
-                    });
-
-                    let s = normalize_for_variable_width(s);
-                    REGEX_DIRS.replace_all(&s, "1/1/1/1").to_string()
-                })
-                .collect();
-        }
-
-        // normalize metrics, if requested
-        if self.normalized_metrics {
-            current_results = current_results
-                .into_iter()
-                .map(|s| {
-                    // Replace timings with fixed value, e.g.:
-                    //
-                    // `1s` -> `1.234ms`
-                    // `1.2ms` -> `1.234ms`
-                    // `10.2μs` -> `1.234ms`
-                    let s = REGEX_TIMING.replace_all(&s, "1.234ms");
-
-                    let s = normalize_for_variable_width(s);
-
-                    // Metrics are currently ordered by value (not by key), so different timings may
-                    // reorder them. We "parse" the list and normalize the sorting. E.g.:
-                    //
-                    // `metrics=[]` => `metrics=[]`
-                    // `metrics=[foo=1, bar=2]` => `metrics=[bar=2, foo=1]`
-                    // `metrics=[foo=2, bar=1]` => `metrics=[bar=1, foo=2]`
-                    REGEX_METRICS
-                        .replace_all(&s, |c: &Captures| {
-                            let mut metrics: Vec<_> = c[1].split(", ").collect();
-                            metrics.sort();
-                            format!("metrics=[{}]", metrics.join(", "))
-                        })
-                        .to_string()
-                })
-                .collect();
-        }
-
-        // normalize Filters, if requested
-        //
-        // Converts:
-        //   FilterExec: time@2 < -9223372036854775808 OR time@2 > 1640995204240217000
-        //
-        // to
-        //   FilterExec: <REDACTED>
-        if self.normalized_filters {
-            current_results = current_results
-                .into_iter()
-                .map(|s| {
-                    REGEX_FILTER
-                        .replace_all(&s, |_: &Captures| "FilterExec: <REDACTED>")
-                        .to_string()
-                })
-                .collect();
-        }
-
-        current_results
-    }
-
-    /// Adds information on what normalizations were applied to the input
-    pub fn add_description(&self, output: &mut Vec<String>) {
-        if self.sorted_compare {
-            output.push("-- Results After Sorting".into())
-        }
-        if self.normalized_uuids {
-            output.push("-- Results After Normalizing UUIDs".into())
-        }
-        if self.normalized_metrics {
-            output.push("-- Results After Normalizing Metrics".into())
-        }
-        if self.normalized_filters {
-            output.push("-- Results After Normalizing Filters".into())
-        }
-    }
-}
diff --git a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs
index 70a5f8da4c..d5299c2341 100644
--- a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs
+++ b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs
@@ -1,6 +1,5 @@
 use arrow::record_batch::RecordBatch;
-
-use super::normalization::Normalizer;
+use arrow_util::test_util::Normalizer;
 
 /// A query to run with optional annotations
 #[derive(Debug, PartialEq, Eq, Default)]
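With the import swapped, `queries.rs` can drive both halves of the relocated API. A sketch of the intended call pattern (the helper function and its name are illustrative, not part of this diff; the two `Normalizer` methods are the ones defined above):

```rust
use arrow::record_batch::RecordBatch;
use arrow_util::test_util::Normalizer;

// Build a snapshot body: a description of the normalizations that were
// applied, followed by the normalized result lines themselves.
fn snapshot_body(normalizer: &Normalizer, results: Vec<RecordBatch>) -> Vec<String> {
    let mut output = Vec::new();
    normalizer.add_description(&mut output);
    output.extend(normalizer.normalize_results(results));
    output
}
```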