Merge pull request #2937 from influxdata/er/fix/read_buffer/null_values

fix: ensure correct distinct count on all-null Read Buffer columns
pull/24376/head
kodiakhq[bot] 2021-10-22 12:53:37 +00:00 committed by GitHub
commit 7367c75dac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 205 additions and 120 deletions

View File

@ -494,12 +494,21 @@ where
}
/// Create statistics for a column that only has nulls up to now
pub fn new_all_null(total_count: u64) -> Self {
pub fn new_all_null(total_count: u64, distinct_count: Option<u64>) -> Self {
let min = None;
let max = None;
let null_count = total_count;
let distinct_count = NonZeroU64::new(1);
Self::new_with_distinct(min, max, total_count, null_count, distinct_count)
if let Some(count) = distinct_count {
assert!(count > 0);
}
Self::new_with_distinct(
min,
max,
total_count,
null_count,
distinct_count.map(|c| NonZeroU64::new(c).unwrap()),
)
}
pub fn update_from(&mut self, other: &Self) {
@ -660,6 +669,8 @@ impl IsNan for f64 {
#[cfg(test)]
mod tests {
use std::convert::TryFrom;
use super::*;
#[test]
@ -677,13 +688,25 @@ mod tests {
#[test]
fn statistics_new_all_null() {
let actual = StatValues::<i64>::new_all_null(3);
// i64 values do not have a distinct count
let actual = StatValues::<i64>::new_all_null(3, None);
let expected = StatValues {
min: None,
max: None,
total_count: 3,
null_count: 3,
distinct_count: NonZeroU64::new(1),
distinct_count: None,
};
assert_eq!(actual, expected);
// string columns can have a distinct count
let actual = StatValues::<i64>::new_all_null(3, Some(1_u64));
let expected = StatValues {
min: None,
max: None,
total_count: 3,
null_count: 3,
distinct_count: Some(NonZeroU64::try_from(1_u64).unwrap()),
};
assert_eq!(actual, expected);
}

View File

@ -113,30 +113,33 @@ impl Column {
| InfluxColumnType::Field(InfluxFieldType::Boolean) => {
let mut data = BitSet::new();
data.append_unset(row_count);
ColumnData::Bool(data, StatValues::new_all_null(total_count))
ColumnData::Bool(data, StatValues::new_all_null(total_count, None))
}
InfluxColumnType::IOx(IOxValueType::U64)
| InfluxColumnType::Field(InfluxFieldType::UInteger) => {
ColumnData::U64(vec![0; row_count], StatValues::new_all_null(total_count))
}
| InfluxColumnType::Field(InfluxFieldType::UInteger) => ColumnData::U64(
vec![0; row_count],
StatValues::new_all_null(total_count, None),
),
InfluxColumnType::IOx(IOxValueType::F64)
| InfluxColumnType::Field(InfluxFieldType::Float) => {
ColumnData::F64(vec![0.0; row_count], StatValues::new_all_null(total_count))
}
| InfluxColumnType::Field(InfluxFieldType::Float) => ColumnData::F64(
vec![0.0; row_count],
StatValues::new_all_null(total_count, None),
),
InfluxColumnType::IOx(IOxValueType::I64)
| InfluxColumnType::Field(InfluxFieldType::Integer)
| InfluxColumnType::Timestamp => {
ColumnData::I64(vec![0; row_count], StatValues::new_all_null(total_count))
}
| InfluxColumnType::Timestamp => ColumnData::I64(
vec![0; row_count],
StatValues::new_all_null(total_count, None),
),
InfluxColumnType::IOx(IOxValueType::String)
| InfluxColumnType::Field(InfluxFieldType::String) => ColumnData::String(
PackedStringArray::new_empty(row_count),
StatValues::new_all_null(total_count),
StatValues::new_all_null(total_count, Some(1)),
),
InfluxColumnType::Tag => ColumnData::Tag(
vec![INVALID_DID; row_count],
Default::default(),
StatValues::new_all_null(total_count),
StatValues::new_all_null(total_count, Some(1)),
),
InfluxColumnType::IOx(IOxValueType::Bytes) => todo!(),
};

View File

@ -191,8 +191,6 @@ async fn test_field_name_plan() {
}
}
// BUG: https://github.com/influxdata/influxdb_iox/issues/2860
#[ignore]
#[tokio::test]
async fn test_field_name_plan_with_delete() {
test_helpers::maybe_start_logging();

View File

@ -495,6 +495,7 @@ mod test {
use data_types::partition_metadata::{ColumnSummary, InfluxDbType, StatValues, Statistics};
use metric::{MetricKind, Observation, ObservationSet, RawReporter};
use schema::builder::SchemaBuilder;
use std::iter::FromIterator;
use std::{num::NonZeroU64, sync::Arc};
// helper to make the `add_remove_tables` test simpler to read.
@ -854,6 +855,8 @@ mod test {
#[test]
fn table_summaries() {
use std::iter::repeat;
let schema = SchemaBuilder::new()
.non_null_tag("env")
.tag("host")
@ -862,6 +865,11 @@ mod test {
.non_null_field("icounter", Int64)
.non_null_field("active", Boolean)
.non_null_field("msg", Utf8)
.field("zf64", Float64)
.field("zu64", UInt64)
.field("zi64", Int64)
.field("zbool", Boolean)
.field("zstr", Utf8)
.timestamp()
.build()
.unwrap();
@ -873,7 +881,7 @@ mod test {
.collect::<DictionaryArray<Int32Type>>(),
),
Arc::new(
(vec![None, None, None] as Vec<Option<&str>>)
(vec![Some("host a"), None, Some("host b")] as Vec<Option<&str>>)
.into_iter()
.collect::<DictionaryArray<Int32Type>>(),
),
@ -881,7 +889,20 @@ mod test {
Arc::new(UInt64Array::from(vec![1000, 3000, 5000])),
Arc::new(Int64Array::from(vec![1000, -1000, 4000])),
Arc::new(BooleanArray::from(vec![true, true, false])),
Arc::new(StringArray::from(vec![Some("msg a"), Some("msg b"), None])),
Arc::new(StringArray::from(vec![
Some("msg a"),
Some("msg b"),
Some("msg b"),
])),
// all null columns
Arc::new(Float64Array::from_iter(repeat(None).take(3))),
Arc::new(UInt64Array::from_iter(repeat(None).take(3))),
Arc::new(Int64Array::from_iter(repeat(None).take(3))),
Arc::new(BooleanArray::from_iter(repeat(None).take(3))),
Arc::new(StringArray::from_iter(
repeat::<Option<String>>(None).take(3),
)),
// timestamp column
Arc::new(TimestampNanosecondArray::from_vec(
vec![11111111, 222222, 3333],
None,
@ -925,7 +946,13 @@ mod test {
ColumnSummary {
name: "host".into(),
influxdb_type: Some(InfluxDbType::Tag),
stats: Statistics::String(StatValues::new_all_null(3)),
stats: Statistics::String(StatValues {
min: Some("host a".into()),
max: Some("host b".into()),
total_count: 3,
null_count: 1,
distinct_count: Some(NonZeroU64::new(3).unwrap()),
}),
},
ColumnSummary {
name: "icounter".into(),
@ -939,8 +966,8 @@ mod test {
min: Some("msg a".into()),
max: Some("msg b".into()),
total_count: 3,
null_count: 1,
distinct_count: Some(NonZeroU64::new(3).unwrap()),
null_count: 0,
distinct_count: Some(NonZeroU64::new(2).unwrap()),
}),
},
ColumnSummary {
@ -953,6 +980,31 @@ mod test {
influxdb_type: Some(InfluxDbType::Timestamp),
stats: Statistics::I64(StatValues::new_non_null(Some(3333), Some(11111111), 3)),
},
ColumnSummary {
name: "zbool".into(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::Bool(StatValues::new_all_null(3, None)),
},
ColumnSummary {
name: "zf64".into(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::F64(StatValues::new_all_null(3, None)),
},
ColumnSummary {
name: "zi64".into(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::I64(StatValues::new_all_null(3, None)),
},
ColumnSummary {
name: "zstr".into(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::String(StatValues::new_all_null(3, Some(1))),
},
ColumnSummary {
name: "zu64".into(),
influxdb_type: Some(InfluxDbType::Field),
stats: Statistics::U64(StatValues::new_all_null(3, None)),
},
];
assert_eq!(

View File

@ -785,90 +785,43 @@ impl MetaData {
let null_count = column_meta.null_count as u64;
let distinct_count = column_meta.distinct_count;
let stats = match &column_meta.range {
(OwnedValue::String(min), OwnedValue::String(max)) => {
Statistics::String(StatValues {
min: Some(min.to_string()),
max: Some(max.to_string()),
total_count,
null_count,
distinct_count,
})
}
(OwnedValue::String(_), mismatch) => {
panic!("inconsistent min/max expected String got {}", mismatch)
}
(OwnedValue::Boolean(min), OwnedValue::Boolean(max)) => {
Statistics::Bool(StatValues {
min: Some(*min),
max: Some(*max),
total_count,
null_count,
distinct_count,
})
}
(OwnedValue::Boolean(_), mismatch) => {
panic!("inconsistent min/max expected Boolean got {}", mismatch)
}
(OwnedValue::Scalar(min), OwnedValue::Scalar(max)) => match (min, max) {
(Scalar::I64(min), Scalar::I64(max)) => Statistics::I64(StatValues {
min: Some(*min),
max: Some(*max),
total_count,
null_count,
distinct_count,
}),
(Scalar::I64(_), mismatch) => {
panic!("inconsistent min/max expected I64 got {}", mismatch)
}
(Scalar::U64(min), Scalar::U64(max)) => Statistics::U64(StatValues {
min: Some(*min),
max: Some(*max),
total_count,
null_count,
distinct_count,
}),
(Scalar::U64(_), mismatch) => {
panic!("inconsistent min/max expected U64 got {}", mismatch)
}
(Scalar::F64(min), Scalar::F64(max)) => Statistics::F64(StatValues {
min: Some(*min),
max: Some(*max),
total_count,
null_count,
distinct_count,
}),
(Scalar::F64(_), mismatch) => {
panic!("inconsistent min/max expected F64 got {}", mismatch)
}
(Scalar::Null, Scalar::Null) => {
assert_eq!(
total_count, null_count,
"expected only null values: {:?}",
column_meta,
);
assert_eq!(
distinct_count,
std::num::NonZeroU64::new(1),
"distinct count for all null was not 1: {:?}",
column_meta,
);
make_null_stats(total_count, &column_meta.logical_data_type)
}
(Scalar::Null, mismatch) => {
panic!("inconsistent min/max expected NULL got {}", mismatch)
}
},
(OwnedValue::Scalar(_), mismatch) => {
panic!("inconsistent min/max expected Scalar got {}", mismatch)
}
(OwnedValue::ByteArray(_), OwnedValue::ByteArray(_)) => {
panic!("unsupported type statistcs type ByteArray")
}
(OwnedValue::ByteArray(_), mismatch) => {
panic!("inconsistent min/max expected ByteArray got {}", mismatch)
}
let stats = match column_meta.logical_data_type {
LogicalDataType::Integer => Statistics::I64(StatValues {
min: column_meta.range.0.as_i64(),
max: column_meta.range.1.as_i64(),
total_count,
null_count,
distinct_count,
}),
LogicalDataType::Unsigned => Statistics::U64(StatValues {
min: column_meta.range.0.as_u64(),
max: column_meta.range.1.as_u64(),
total_count,
null_count,
distinct_count,
}),
LogicalDataType::Float => Statistics::F64(StatValues {
min: column_meta.range.0.as_f64(),
max: column_meta.range.1.as_f64(),
total_count,
null_count,
distinct_count,
}),
LogicalDataType::String => Statistics::String(StatValues {
min: column_meta.range.0.as_string(),
max: column_meta.range.1.as_string(),
total_count,
null_count,
distinct_count,
}),
LogicalDataType::Binary => panic!("unsupported type statistcs type ByteArray"),
LogicalDataType::Boolean => Statistics::Bool(StatValues {
min: column_meta.range.0.as_bool(),
max: column_meta.range.1.as_bool(),
total_count,
null_count,
distinct_count,
}),
};
ColumnSummary {
@ -899,12 +852,12 @@ fn make_null_stats(
use LogicalDataType::*;
match logical_data_type {
Integer => Statistics::I64(StatValues::new_all_null(total_count)),
Unsigned => Statistics::U64(StatValues::new_all_null(total_count)),
Float => Statistics::F64(StatValues::new_all_null(total_count)),
String => Statistics::String(StatValues::new_all_null(total_count)),
Integer => Statistics::I64(StatValues::new_all_null(total_count, None)),
Unsigned => Statistics::U64(StatValues::new_all_null(total_count, None)),
Float => Statistics::F64(StatValues::new_all_null(total_count, None)),
String => Statistics::String(StatValues::new_all_null(total_count, Some(1))),
Binary => panic!("Binary statistics not supported"),
Boolean => Statistics::Bool(StatValues::new_all_null(total_count)),
Boolean => Statistics::Bool(StatValues::new_all_null(total_count, None)),
}
}
@ -1847,25 +1800,28 @@ west,host-b,100
#[test]
fn null_stats_ifield() {
let actual = make_null_stats(12, &LogicalDataType::Integer);
assert_eq!(actual, Statistics::I64(StatValues::new_all_null(12)));
assert_eq!(actual, Statistics::I64(StatValues::new_all_null(12, None)));
}
#[test]
fn null_stats_ufield() {
let actual = make_null_stats(12, &LogicalDataType::Unsigned);
assert_eq!(actual, Statistics::U64(StatValues::new_all_null(12)));
assert_eq!(actual, Statistics::U64(StatValues::new_all_null(12, None)));
}
#[test]
fn null_stats_float() {
let actual = make_null_stats(12, &LogicalDataType::Float);
assert_eq!(actual, Statistics::F64(StatValues::new_all_null(12)));
assert_eq!(actual, Statistics::F64(StatValues::new_all_null(12, None)));
}
#[test]
fn null_stats_string() {
let actual = make_null_stats(12, &LogicalDataType::String);
assert_eq!(actual, Statistics::String(StatValues::new_all_null(12)));
assert_eq!(
actual,
Statistics::String(StatValues::new_all_null(12, Some(1_u64)))
);
}
#[test]
@ -1877,6 +1833,6 @@ west,host-b,100
#[test]
fn null_stats_boolean() {
let actual = make_null_stats(12, &LogicalDataType::Boolean);
assert_eq!(actual, Statistics::Bool(StatValues::new_all_null(12)));
assert_eq!(actual, Statistics::Bool(StatValues::new_all_null(12, None)));
}
}

View File

@ -1139,6 +1139,9 @@ impl std::fmt::Display for &Scalar {
#[derive(Debug, PartialEq, Clone)]
pub enum OwnedValue {
// Represents a NULL value in a column row.
Null,
/// A UTF-8 valid string.
String(String),
@ -1167,11 +1170,35 @@ impl PartialOrd for OwnedValue {
impl OwnedValue {
pub fn new_null() -> Self {
Self::Scalar(Scalar::Null)
Self::Null
}
pub fn is_null(&self) -> bool {
matches!(self, Self::Scalar(Scalar::Null))
matches!(self, Self::Null)
}
pub fn as_string(&self) -> Option<String> {
match self {
OwnedValue::Null => None,
OwnedValue::String(s) => Some(s.clone()),
v => panic!("{:?} cannot be unwrapped as string", v),
}
}
pub fn as_byte_array(&self) -> Option<Vec<u8>> {
match self {
OwnedValue::Null => None,
OwnedValue::ByteArray(arr) => Some(arr.clone()),
v => panic!("{:?} cannot be unwrapped as byte array", v),
}
}
pub fn as_bool(&self) -> Option<bool> {
match self {
OwnedValue::Null => None,
OwnedValue::Boolean(b) => Some(*b),
v => panic!("{:?} cannot be unwrapped as string", v),
}
}
/// Update self to the min of self and other, taking into
@ -1203,9 +1230,35 @@ impl OwnedValue {
}
}
// Implementations of as_type for various scalar types.
macro_rules! owned_value_as_impls {
($(($type:ident, $name:ident),)*) => {
$(
impl OwnedValue {
pub fn $name(&self) -> Option<$type> {
match self {
OwnedValue::Null => None,
OwnedValue::Scalar(s) => {
(!s.is_null()).then(|| s.$name())
}
v => panic!("{:?} cannot be unwrapped as {:?}", v, stringify!($type)),
}
}
}
)*
};
}
owned_value_as_impls! {
(i64, as_i64),
(f64, as_f64),
(u64, as_u64),
}
impl std::fmt::Display for &OwnedValue {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
OwnedValue::Null => write!(f, "NULL"),
OwnedValue::String(s) => s.fmt(f),
OwnedValue::ByteArray(s) => write!(f, "{}", String::from_utf8_lossy(s)),
OwnedValue::Boolean(b) => b.fmt(f),