From 8b406b4f916252e40446d9cfea18cec2c7c06201 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 22 Oct 2021 12:06:56 +0100 Subject: [PATCH 1/5] test: update table_summaries test --- read_buffer/src/chunk.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index 94ac209ba4..82b4ae587f 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -873,7 +873,7 @@ mod test { .collect::>(), ), Arc::new( - (vec![None, None, None] as Vec>) + (vec![Some("host a"), None, Some("host b")] as Vec>) .into_iter() .collect::>(), ), @@ -881,7 +881,11 @@ mod test { Arc::new(UInt64Array::from(vec![1000, 3000, 5000])), Arc::new(Int64Array::from(vec![1000, -1000, 4000])), Arc::new(BooleanArray::from(vec![true, true, false])), - Arc::new(StringArray::from(vec![Some("msg a"), Some("msg b"), None])), + Arc::new(StringArray::from(vec![ + Some("msg a"), + Some("msg b"), + Some("msg b"), + ])), Arc::new(TimestampNanosecondArray::from_vec( vec![11111111, 222222, 3333], None, @@ -925,7 +929,13 @@ mod test { ColumnSummary { name: "host".into(), influxdb_type: Some(InfluxDbType::Tag), - stats: Statistics::String(StatValues::new_all_null(3)), + stats: Statistics::String(StatValues { + min: Some("host a".into()), + max: Some("host b".into()), + total_count: 3, + null_count: 1, + distinct_count: Some(NonZeroU64::new(3).unwrap()), + }), }, ColumnSummary { name: "icounter".into(), @@ -939,8 +949,8 @@ mod test { min: Some("msg a".into()), max: Some("msg b".into()), total_count: 3, - null_count: 1, - distinct_count: Some(NonZeroU64::new(3).unwrap()), + null_count: 0, + distinct_count: Some(NonZeroU64::new(2).unwrap()), }), }, ColumnSummary { From 09a2d805b99c8a109f8fff8e5a779f76173d18d8 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 22 Oct 2021 13:03:16 +0100 Subject: [PATCH 2/5] refactor: restore concept of NULL value --- read_buffer/src/value.rs | 57 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/read_buffer/src/value.rs b/read_buffer/src/value.rs index 08a9e8ef19..e979695ed1 100644 --- a/read_buffer/src/value.rs +++ b/read_buffer/src/value.rs @@ -1139,6 +1139,9 @@ impl std::fmt::Display for &Scalar { #[derive(Debug, PartialEq, Clone)] pub enum OwnedValue { + // Represents a NULL value in a column row. + Null, + /// A UTF-8 valid string. String(String), @@ -1167,11 +1170,35 @@ impl PartialOrd for OwnedValue { impl OwnedValue { pub fn new_null() -> Self { - Self::Scalar(Scalar::Null) + Self::Null } pub fn is_null(&self) -> bool { - matches!(self, Self::Scalar(Scalar::Null)) + matches!(self, Self::Null) + } + + pub fn as_string(&self) -> Option { + match self { + OwnedValue::Null => None, + OwnedValue::String(s) => Some(s.clone()), + v => panic!("{:?} cannot be unwrapped as string", v), + } + } + + pub fn as_byte_array(&self) -> Option> { + match self { + OwnedValue::Null => None, + OwnedValue::ByteArray(arr) => Some(arr.clone()), + v => panic!("{:?} cannot be unwrapped as byte array", v), + } + } + + pub fn as_bool(&self) -> Option { + match self { + OwnedValue::Null => None, + OwnedValue::Boolean(b) => Some(*b), + v => panic!("{:?} cannot be unwrapped as string", v), + } } /// Update self to the min of self and other, taking into @@ -1203,9 +1230,35 @@ impl OwnedValue { } } +// Implementations of as_type for various scalar types. +macro_rules! owned_value_as_impls { + ($(($type:ident, $name:ident),)*) => { + $( + impl OwnedValue { + pub fn $name(&self) -> Option<$type> { + match self { + OwnedValue::Null => None, + OwnedValue::Scalar(s) => { + (!s.is_null()).then(|| s.$name()) + } + v => panic!("{:?} cannot be unwrapped as {:?}", v, stringify!($type)), + } + } + } + )* + }; +} + +owned_value_as_impls! { + (i64, as_i64), + (f64, as_f64), + (u64, as_u64), +} + impl std::fmt::Display for &OwnedValue { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + OwnedValue::Null => write!(f, "NULL"), OwnedValue::String(s) => s.fmt(f), OwnedValue::ByteArray(s) => write!(f, "{}", String::from_utf8_lossy(s)), OwnedValue::Boolean(b) => b.fmt(f), From cfd980d1bc18bafc1c02d2c0c9cb4c61e5efed8e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 22 Oct 2021 13:04:10 +0100 Subject: [PATCH 3/5] fix: only string columns have distinct counts --- data_types/src/partition_metadata.rs | 33 +++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/data_types/src/partition_metadata.rs b/data_types/src/partition_metadata.rs index 643c08c9ca..a71ab75a3d 100644 --- a/data_types/src/partition_metadata.rs +++ b/data_types/src/partition_metadata.rs @@ -494,12 +494,21 @@ where } /// Create statistics for a column that only has nulls up to now - pub fn new_all_null(total_count: u64) -> Self { + pub fn new_all_null(total_count: u64, distinct_count: Option) -> Self { let min = None; let max = None; let null_count = total_count; - let distinct_count = NonZeroU64::new(1); - Self::new_with_distinct(min, max, total_count, null_count, distinct_count) + + if let Some(count) = distinct_count { + assert!(count > 0); + } + Self::new_with_distinct( + min, + max, + total_count, + null_count, + distinct_count.map(|c| NonZeroU64::new(c).unwrap()), + ) } pub fn update_from(&mut self, other: &Self) { @@ -660,6 +669,8 @@ impl IsNan for f64 { #[cfg(test)] mod tests { + use std::convert::TryFrom; + use super::*; #[test] @@ -677,13 +688,25 @@ mod tests { #[test] fn statistics_new_all_null() { - let actual = StatValues::::new_all_null(3); + // i64 values do not have a distinct count + let actual = StatValues::::new_all_null(3, None); let expected = StatValues { min: None, max: None, total_count: 3, null_count: 3, - distinct_count: NonZeroU64::new(1), + distinct_count: None, + }; + assert_eq!(actual, expected); + + // string columns can have a distinct count + let actual = StatValues::::new_all_null(3, Some(1_u64)); + let expected = StatValues { + min: None, + max: None, + total_count: 3, + null_count: 3, + distinct_count: Some(NonZeroU64::try_from(1_u64).unwrap()), }; assert_eq!(actual, expected); } From 7a5263d7094641a7d5943a001d4a42e48ecf7445 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 22 Oct 2021 13:04:42 +0100 Subject: [PATCH 4/5] refactor: tidy up column summary to stats --- read_buffer/src/chunk.rs | 42 ++++++++++++ read_buffer/src/table.rs | 144 ++++++++++++++------------------------- 2 files changed, 92 insertions(+), 94 deletions(-) diff --git a/read_buffer/src/chunk.rs b/read_buffer/src/chunk.rs index 82b4ae587f..7be6fae4e3 100644 --- a/read_buffer/src/chunk.rs +++ b/read_buffer/src/chunk.rs @@ -495,6 +495,7 @@ mod test { use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics}; use metric::{MetricKind, Observation, ObservationSet, RawReporter}; use schema::builder::SchemaBuilder; + use std::iter::FromIterator; use std::{num::NonZeroU64, sync::Arc}; // helper to make the `add_remove_tables` test simpler to read. @@ -854,6 +855,8 @@ mod test { #[test] fn table_summaries() { + use std::iter::repeat; + let schema = SchemaBuilder::new() .non_null_tag("env") .tag("host") @@ -862,6 +865,11 @@ mod test { .non_null_field("icounter", Int64) .non_null_field("active", Boolean) .non_null_field("msg", Utf8) + .field("zf64", Float64) + .field("zu64", UInt64) + .field("zi64", Int64) + .field("zbool", Boolean) + .field("zstr", Utf8) .timestamp() .build() .unwrap(); @@ -886,6 +894,15 @@ mod test { Some("msg b"), Some("msg b"), ])), + // all null columns + Arc::new(Float64Array::from_iter(repeat(None).take(3))), + Arc::new(UInt64Array::from_iter(repeat(None).take(3))), + Arc::new(Int64Array::from_iter(repeat(None).take(3))), + Arc::new(BooleanArray::from_iter(repeat(None).take(3))), + Arc::new(StringArray::from_iter( + repeat::>(None).take(3), + )), + // timestamp column Arc::new(TimestampNanosecondArray::from_vec( vec![11111111, 222222, 3333], None, @@ -963,6 +980,31 @@ mod test { influxdb_type: Some(InfluxDbType::Timestamp), stats: Statistics::I64(StatValues::new_non_null(Some(3333), Some(11111111), 3)), }, + ColumnSummary { + name: "zbool".into(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::Bool(StatValues::new_all_null(3, None)), + }, + ColumnSummary { + name: "zf64".into(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::F64(StatValues::new_all_null(3, None)), + }, + ColumnSummary { + name: "zi64".into(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::I64(StatValues::new_all_null(3, None)), + }, + ColumnSummary { + name: "zstr".into(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::String(StatValues::new_all_null(3, Some(1))), + }, + ColumnSummary { + name: "zu64".into(), + influxdb_type: Some(InfluxDbType::Field), + stats: Statistics::U64(StatValues::new_all_null(3, None)), + }, ]; assert_eq!( diff --git a/read_buffer/src/table.rs b/read_buffer/src/table.rs index 6db494b7aa..0ca0aa1691 100644 --- a/read_buffer/src/table.rs +++ b/read_buffer/src/table.rs @@ -785,90 +785,43 @@ impl MetaData { let null_count = column_meta.null_count as u64; let distinct_count = column_meta.distinct_count; - let stats = match &column_meta.range { - (OwnedValue::String(min), OwnedValue::String(max)) => { - Statistics::String(StatValues { - min: Some(min.to_string()), - max: Some(max.to_string()), - total_count, - null_count, - distinct_count, - }) - } - (OwnedValue::String(_), mismatch) => { - panic!("inconsistent min/max expected String got {}", mismatch) - } - (OwnedValue::Boolean(min), OwnedValue::Boolean(max)) => { - Statistics::Bool(StatValues { - min: Some(*min), - max: Some(*max), - total_count, - null_count, - distinct_count, - }) - } - (OwnedValue::Boolean(_), mismatch) => { - panic!("inconsistent min/max expected Boolean got {}", mismatch) - } - (OwnedValue::Scalar(min), OwnedValue::Scalar(max)) => match (min, max) { - (Scalar::I64(min), Scalar::I64(max)) => Statistics::I64(StatValues { - min: Some(*min), - max: Some(*max), - total_count, - null_count, - distinct_count, - }), - (Scalar::I64(_), mismatch) => { - panic!("inconsistent min/max expected I64 got {}", mismatch) - } - (Scalar::U64(min), Scalar::U64(max)) => Statistics::U64(StatValues { - min: Some(*min), - max: Some(*max), - total_count, - null_count, - distinct_count, - }), - (Scalar::U64(_), mismatch) => { - panic!("inconsistent min/max expected U64 got {}", mismatch) - } - (Scalar::F64(min), Scalar::F64(max)) => Statistics::F64(StatValues { - min: Some(*min), - max: Some(*max), - total_count, - null_count, - distinct_count, - }), - (Scalar::F64(_), mismatch) => { - panic!("inconsistent min/max expected F64 got {}", mismatch) - } - (Scalar::Null, Scalar::Null) => { - assert_eq!( - total_count, null_count, - "expected only null values: {:?}", - column_meta, - ); - assert_eq!( - distinct_count, - std::num::NonZeroU64::new(1), - "distinct count for all null was not 1: {:?}", - column_meta, - ); - - make_null_stats(total_count, &column_meta.logical_data_type) - } - (Scalar::Null, mismatch) => { - panic!("inconsistent min/max expected NULL got {}", mismatch) - } - }, - (OwnedValue::Scalar(_), mismatch) => { - panic!("inconsistent min/max expected Scalar got {}", mismatch) - } - (OwnedValue::ByteArray(_), OwnedValue::ByteArray(_)) => { - panic!("unsupported type statistcs type ByteArray") - } - (OwnedValue::ByteArray(_), mismatch) => { - panic!("inconsistent min/max expected ByteArray got {}", mismatch) - } + let stats = match column_meta.logical_data_type { + LogicalDataType::Integer => Statistics::I64(StatValues { + min: column_meta.range.0.as_i64(), + max: column_meta.range.1.as_i64(), + total_count, + null_count, + distinct_count, + }), + LogicalDataType::Unsigned => Statistics::U64(StatValues { + min: column_meta.range.0.as_u64(), + max: column_meta.range.1.as_u64(), + total_count, + null_count, + distinct_count, + }), + LogicalDataType::Float => Statistics::F64(StatValues { + min: column_meta.range.0.as_f64(), + max: column_meta.range.1.as_f64(), + total_count, + null_count, + distinct_count, + }), + LogicalDataType::String => Statistics::String(StatValues { + min: column_meta.range.0.as_string(), + max: column_meta.range.1.as_string(), + total_count, + null_count, + distinct_count, + }), + LogicalDataType::Binary => panic!("unsupported type statistcs type ByteArray"), + LogicalDataType::Boolean => Statistics::Bool(StatValues { + min: column_meta.range.0.as_bool(), + max: column_meta.range.1.as_bool(), + total_count, + null_count, + distinct_count, + }), }; ColumnSummary { @@ -899,12 +852,12 @@ fn make_null_stats( use LogicalDataType::*; match logical_data_type { - Integer => Statistics::I64(StatValues::new_all_null(total_count)), - Unsigned => Statistics::U64(StatValues::new_all_null(total_count)), - Float => Statistics::F64(StatValues::new_all_null(total_count)), - String => Statistics::String(StatValues::new_all_null(total_count)), + Integer => Statistics::I64(StatValues::new_all_null(total_count, None)), + Unsigned => Statistics::U64(StatValues::new_all_null(total_count, None)), + Float => Statistics::F64(StatValues::new_all_null(total_count, None)), + String => Statistics::String(StatValues::new_all_null(total_count, Some(1))), Binary => panic!("Binary statistics not supported"), - Boolean => Statistics::Bool(StatValues::new_all_null(total_count)), + Boolean => Statistics::Bool(StatValues::new_all_null(total_count, None)), } } @@ -1847,25 +1800,28 @@ west,host-b,100 #[test] fn null_stats_ifield() { let actual = make_null_stats(12, &LogicalDataType::Integer); - assert_eq!(actual, Statistics::I64(StatValues::new_all_null(12))); + assert_eq!(actual, Statistics::I64(StatValues::new_all_null(12, None))); } #[test] fn null_stats_ufield() { let actual = make_null_stats(12, &LogicalDataType::Unsigned); - assert_eq!(actual, Statistics::U64(StatValues::new_all_null(12))); + assert_eq!(actual, Statistics::U64(StatValues::new_all_null(12, None))); } #[test] fn null_stats_float() { let actual = make_null_stats(12, &LogicalDataType::Float); - assert_eq!(actual, Statistics::F64(StatValues::new_all_null(12))); + assert_eq!(actual, Statistics::F64(StatValues::new_all_null(12, None))); } #[test] fn null_stats_string() { let actual = make_null_stats(12, &LogicalDataType::String); - assert_eq!(actual, Statistics::String(StatValues::new_all_null(12))); + assert_eq!( + actual, + Statistics::String(StatValues::new_all_null(12, Some(1_u64))) + ); } #[test] @@ -1877,6 +1833,6 @@ west,host-b,100 #[test] fn null_stats_boolean() { let actual = make_null_stats(12, &LogicalDataType::Boolean); - assert_eq!(actual, Statistics::Bool(StatValues::new_all_null(12))); + assert_eq!(actual, Statistics::Bool(StatValues::new_all_null(12, None))); } } From b3242277eb69c78a291017500b02835e5cb4c8f3 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 22 Oct 2021 13:07:36 +0100 Subject: [PATCH 5/5] test: test_field_name_plan_with_delete is fixed --- mutable_batch/src/column.rs | 27 ++++++++++++---------- query_tests/src/influxrpc/field_columns.rs | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/mutable_batch/src/column.rs b/mutable_batch/src/column.rs index 0721785410..b881fa36c1 100644 --- a/mutable_batch/src/column.rs +++ b/mutable_batch/src/column.rs @@ -113,30 +113,33 @@ impl Column { | InfluxColumnType::Field(InfluxFieldType::Boolean) => { let mut data = BitSet::new(); data.append_unset(row_count); - ColumnData::Bool(data, StatValues::new_all_null(total_count)) + ColumnData::Bool(data, StatValues::new_all_null(total_count, None)) } InfluxColumnType::IOx(IOxValueType::U64) - | InfluxColumnType::Field(InfluxFieldType::UInteger) => { - ColumnData::U64(vec![0; row_count], StatValues::new_all_null(total_count)) - } + | InfluxColumnType::Field(InfluxFieldType::UInteger) => ColumnData::U64( + vec![0; row_count], + StatValues::new_all_null(total_count, None), + ), InfluxColumnType::IOx(IOxValueType::F64) - | InfluxColumnType::Field(InfluxFieldType::Float) => { - ColumnData::F64(vec![0.0; row_count], StatValues::new_all_null(total_count)) - } + | InfluxColumnType::Field(InfluxFieldType::Float) => ColumnData::F64( + vec![0.0; row_count], + StatValues::new_all_null(total_count, None), + ), InfluxColumnType::IOx(IOxValueType::I64) | InfluxColumnType::Field(InfluxFieldType::Integer) - | InfluxColumnType::Timestamp => { - ColumnData::I64(vec![0; row_count], StatValues::new_all_null(total_count)) - } + | InfluxColumnType::Timestamp => ColumnData::I64( + vec![0; row_count], + StatValues::new_all_null(total_count, None), + ), InfluxColumnType::IOx(IOxValueType::String) | InfluxColumnType::Field(InfluxFieldType::String) => ColumnData::String( PackedStringArray::new_empty(row_count), - StatValues::new_all_null(total_count), + StatValues::new_all_null(total_count, Some(1)), ), InfluxColumnType::Tag => ColumnData::Tag( vec![INVALID_DID; row_count], Default::default(), - StatValues::new_all_null(total_count), + StatValues::new_all_null(total_count, Some(1)), ), InfluxColumnType::IOx(IOxValueType::Bytes) => todo!(), }; diff --git a/query_tests/src/influxrpc/field_columns.rs b/query_tests/src/influxrpc/field_columns.rs index 995628078d..ff1309d9a1 100644 --- a/query_tests/src/influxrpc/field_columns.rs +++ b/query_tests/src/influxrpc/field_columns.rs @@ -191,8 +191,6 @@ async fn test_field_name_plan() { } } -// BUG: https://github.com/influxdata/influxdb_iox/issues/2860 -#[ignore] #[tokio::test] async fn test_field_name_plan_with_delete() { test_helpers::maybe_start_logging();