diff --git a/read_buffer/src/column/integer.rs b/read_buffer/src/column/integer.rs index 23940aeed5..f80774574a 100644 --- a/read_buffer/src/column/integer.rs +++ b/read_buffer/src/column/integer.rs @@ -1,12 +1,19 @@ use std::mem::size_of; -use arrow::{self, array::Array}; +use arrow::{ + self, array::Array, datatypes::Int16Type as ArrowInt16Type, + datatypes::Int32Type as ArrowInt32Type, datatypes::Int64Type as ArrowInt64Type, + datatypes::Int8Type as ArrowInt8Type, datatypes::UInt16Type as ArrowUInt16Type, + datatypes::UInt32Type as ArrowUInt32Type, datatypes::UInt64Type as ArrowUInt64Type, + datatypes::UInt8Type as ArrowUInt8Type, +}; use super::encoding::{scalar::Fixed, scalar::FixedNull}; use super::{cmp, Statistics}; use crate::column::{EncodedValues, RowIDs, Scalar, Value, Values}; pub enum IntegerEncoding { + // non-null encodings. These are backed by `Vec` I64I64(Fixed), I64I32(Fixed), I64U32(Fixed), @@ -14,15 +21,23 @@ pub enum IntegerEncoding { I64U16(Fixed), I64I8(Fixed), I64U8(Fixed), - U64U64(Fixed), U64U32(Fixed), U64U16(Fixed), U64U8(Fixed), - // Nullable encodings - TODO, add variants for smaller physical types. - I64I64N(FixedNull), - U64U64N(FixedNull), + // Nullable encodings. These are backed by an Arrow array. + I64I64N(FixedNull), + I64I32N(FixedNull), + I64U32N(FixedNull), + I64I16N(FixedNull), + I64U16N(FixedNull), + I64I8N(FixedNull), + I64U8N(FixedNull), + U64U64N(FixedNull), + U64U32N(FixedNull), + U64U16N(FixedNull), + U64U8N(FixedNull), } impl PartialEq for IntegerEncoding { @@ -40,12 +55,57 @@ impl PartialEq for IntegerEncoding { (Self::U64U16(a), Self::U64U16(b)) => a == b, (Self::U64U8(a), Self::U64U8(b)) => a == b, (Self::I64I64N(a), Self::I64I64N(b)) => { - let a = a.all_values(vec![]); + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::I64I32N(a), Self::I64I32N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::I64U32N(a), Self::I64U32N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::I64I16N(a), Self::I64I16N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::I64U16N(a), Self::I64U16N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::I64I8N(a), Self::I64I8N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::I64U8N(a), Self::I64U8N(b)) => { + let a = a.all_values::(vec![]); let b = b.all_values(vec![]); a == b } (Self::U64U64N(a), Self::U64U64N(b)) => { - let a = a.all_values(vec![]); + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::U64U32N(a), Self::U64U32N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::U64U16N(a), Self::U64U16N(b)) => { + let a = a.all_values::(vec![]); + let b = b.all_values(vec![]); + a == b + } + (Self::U64U8N(a), Self::U64U8N(b)) => { + let a = a.all_values::(vec![]); let b = b.all_values(vec![]); a == b } @@ -70,7 +130,16 @@ impl IntegerEncoding { Self::U64U16(enc) => enc.size(), Self::U64U8(enc) => enc.size(), Self::I64I64N(enc) => enc.size(), + Self::I64I32N(enc) => enc.size(), + Self::I64U32N(enc) => enc.size(), + Self::I64I16N(enc) => enc.size(), + Self::I64U16N(enc) => enc.size(), + Self::I64I8N(enc) => enc.size(), + Self::I64U8N(enc) => enc.size(), Self::U64U64N(enc) => enc.size(), + Self::U64U32N(enc) => enc.size(), + Self::U64U16N(enc) => enc.size(), + Self::U64U8N(enc) => enc.size(), } } @@ -97,7 +166,16 @@ impl IntegerEncoding { } Self::I64I64N(enc) => enc.size_raw(include_nulls), + Self::I64I32N(enc) => enc.size_raw(include_nulls), + Self::I64U32N(enc) => enc.size_raw(include_nulls), + Self::I64I16N(enc) => enc.size_raw(include_nulls), + Self::I64U16N(enc) => enc.size_raw(include_nulls), + Self::I64I8N(enc) => enc.size_raw(include_nulls), + Self::I64U8N(enc) => enc.size_raw(include_nulls), Self::U64U64N(enc) => enc.size_raw(include_nulls), + Self::U64U32N(enc) => enc.size_raw(include_nulls), + Self::U64U16N(enc) => enc.size_raw(include_nulls), + Self::U64U8N(enc) => enc.size_raw(include_nulls), } } @@ -116,7 +194,16 @@ impl IntegerEncoding { Self::U64U16(enc) => enc.num_rows(), Self::U64U8(enc) => enc.num_rows(), Self::I64I64N(enc) => enc.num_rows(), + Self::I64I32N(enc) => enc.num_rows(), + Self::I64U32N(enc) => enc.num_rows(), + Self::I64I16N(enc) => enc.num_rows(), + Self::I64U16N(enc) => enc.num_rows(), + Self::I64I8N(enc) => enc.num_rows(), + Self::I64U8N(enc) => enc.num_rows(), Self::U64U64N(enc) => enc.num_rows(), + Self::U64U32N(enc) => enc.num_rows(), + Self::U64U16N(enc) => enc.num_rows(), + Self::U64U8N(enc) => enc.num_rows(), } } @@ -137,7 +224,16 @@ impl IntegerEncoding { pub fn contains_null(&self) -> bool { match self { Self::I64I64N(enc) => enc.contains_null(), + Self::I64I32N(enc) => enc.contains_null(), + Self::I64U32N(enc) => enc.contains_null(), + Self::I64I16N(enc) => enc.contains_null(), + Self::I64U16N(enc) => enc.contains_null(), + Self::I64I8N(enc) => enc.contains_null(), + Self::I64U8N(enc) => enc.contains_null(), Self::U64U64N(enc) => enc.contains_null(), + Self::U64U32N(enc) => enc.contains_null(), + Self::U64U16N(enc) => enc.contains_null(), + Self::U64U8N(enc) => enc.contains_null(), _ => false, } } @@ -157,7 +253,16 @@ impl IntegerEncoding { Self::U64U16(_) => 0, Self::U64U8(_) => 0, Self::I64I64N(enc) => enc.null_count(), + Self::I64I32N(enc) => enc.null_count(), + Self::I64U32N(enc) => enc.null_count(), + Self::I64I16N(enc) => enc.null_count(), + Self::I64U16N(enc) => enc.null_count(), + Self::I64I8N(enc) => enc.null_count(), + Self::I64U8N(enc) => enc.null_count(), Self::U64U64N(enc) => enc.null_count(), + Self::U64U32N(enc) => enc.null_count(), + Self::U64U16N(enc) => enc.null_count(), + Self::U64U8N(enc) => enc.null_count(), } } @@ -165,7 +270,16 @@ impl IntegerEncoding { pub fn has_any_non_null_value(&self) -> bool { match self { Self::I64I64N(enc) => enc.has_any_non_null_value(), + Self::I64I32N(enc) => enc.has_any_non_null_value(), + Self::I64U32N(enc) => enc.has_any_non_null_value(), + Self::I64I16N(enc) => enc.has_any_non_null_value(), + Self::I64U16N(enc) => enc.has_any_non_null_value(), + Self::I64I8N(enc) => enc.has_any_non_null_value(), + Self::I64U8N(enc) => enc.has_any_non_null_value(), Self::U64U64N(enc) => enc.has_any_non_null_value(), + Self::U64U32N(enc) => enc.has_any_non_null_value(), + Self::U64U16N(enc) => enc.has_any_non_null_value(), + Self::U64U8N(enc) => enc.has_any_non_null_value(), _ => true, } } @@ -175,7 +289,16 @@ impl IntegerEncoding { pub fn has_non_null_value(&self, row_ids: &[u32]) -> bool { match self { Self::I64I64N(enc) => enc.has_non_null_value(row_ids), + Self::I64I32N(enc) => enc.has_non_null_value(row_ids), + Self::I64U32N(enc) => enc.has_non_null_value(row_ids), + Self::I64I16N(enc) => enc.has_non_null_value(row_ids), + Self::I64U16N(enc) => enc.has_non_null_value(row_ids), + Self::I64I8N(enc) => enc.has_non_null_value(row_ids), + Self::I64U8N(enc) => enc.has_non_null_value(row_ids), Self::U64U64N(enc) => enc.has_non_null_value(row_ids), + Self::U64U32N(enc) => enc.has_non_null_value(row_ids), + Self::U64U16N(enc) => enc.has_non_null_value(row_ids), + Self::U64U8N(enc) => enc.has_non_null_value(row_ids), _ => !row_ids.is_empty(), // all rows will be non-null } } @@ -187,25 +310,64 @@ impl IntegerEncoding { // `c.value` should return as the logical type // signed 64-bit variants - logical type is i64 for all these - Self::I64I64(c) => Value::Scalar(Scalar::I64(c.value(row_id))), - Self::I64I32(c) => Value::Scalar(Scalar::I64(c.value(row_id))), - Self::I64U32(c) => Value::Scalar(Scalar::I64(c.value(row_id))), - Self::I64I16(c) => Value::Scalar(Scalar::I64(c.value(row_id))), - Self::I64U16(c) => Value::Scalar(Scalar::I64(c.value(row_id))), - Self::I64I8(c) => Value::Scalar(Scalar::I64(c.value(row_id))), - Self::I64U8(c) => Value::Scalar(Scalar::I64(c.value(row_id))), + Self::I64I64(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), + Self::I64I32(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), + Self::I64U32(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), + Self::I64I16(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), + Self::I64U16(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), + Self::I64I8(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), + Self::I64U8(enc) => Value::Scalar(Scalar::I64(enc.value(row_id))), // unsigned 64-bit variants - logical type is u64 for all these - Self::U64U64(c) => Value::Scalar(Scalar::U64(c.value(row_id))), - Self::U64U32(c) => Value::Scalar(Scalar::U64(c.value(row_id))), - Self::U64U16(c) => Value::Scalar(Scalar::U64(c.value(row_id))), - Self::U64U8(c) => Value::Scalar(Scalar::U64(c.value(row_id))), + Self::U64U64(enc) => Value::Scalar(Scalar::U64(enc.value(row_id))), + Self::U64U32(enc) => Value::Scalar(Scalar::U64(enc.value(row_id))), + Self::U64U16(enc) => Value::Scalar(Scalar::U64(enc.value(row_id))), + Self::U64U8(enc) => Value::Scalar(Scalar::U64(enc.value(row_id))), - Self::I64I64N(c) => match c.value(row_id) { + // signed 64-bit variants + Self::I64I64N(enc) => match enc.value(row_id) { Some(v) => Value::Scalar(Scalar::I64(v)), None => Value::Null, }, - Self::U64U64N(c) => match c.value(row_id) { + Self::I64I32N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U32N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64I16N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U16N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64I8N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U8N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + + // unsigned 64-bit variants + Self::U64U64N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U32N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U16N(enc) => match enc.value(row_id) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U8N(enc) => match enc.value(row_id) { Some(v) => Value::Scalar(Scalar::U64(v)), None => Value::Null, }, @@ -219,22 +381,34 @@ impl IntegerEncoding { pub fn values(&self, row_ids: &[u32]) -> Values<'_> { match &self { // signed 64-bit variants - logical type is i64 for all these - Self::I64I64(c) => Values::I64(c.values::(row_ids, vec![])), - Self::I64I32(c) => Values::I64(c.values::(row_ids, vec![])), - Self::I64U32(c) => Values::I64(c.values::(row_ids, vec![])), - Self::I64I16(c) => Values::I64(c.values::(row_ids, vec![])), - Self::I64U16(c) => Values::I64(c.values::(row_ids, vec![])), - Self::I64I8(c) => Values::I64(c.values::(row_ids, vec![])), - Self::I64U8(c) => Values::I64(c.values::(row_ids, vec![])), + Self::I64I64(enc) => Values::I64(enc.values::(row_ids, vec![])), + Self::I64I32(enc) => Values::I64(enc.values::(row_ids, vec![])), + Self::I64U32(enc) => Values::I64(enc.values::(row_ids, vec![])), + Self::I64I16(enc) => Values::I64(enc.values::(row_ids, vec![])), + Self::I64U16(enc) => Values::I64(enc.values::(row_ids, vec![])), + Self::I64I8(enc) => Values::I64(enc.values::(row_ids, vec![])), + Self::I64U8(enc) => Values::I64(enc.values::(row_ids, vec![])), // unsigned 64-bit variants - logical type is u64 for all these - Self::U64U64(c) => Values::U64(c.values::(row_ids, vec![])), - Self::U64U32(c) => Values::U64(c.values::(row_ids, vec![])), - Self::U64U16(c) => Values::U64(c.values::(row_ids, vec![])), - Self::U64U8(c) => Values::U64(c.values::(row_ids, vec![])), + Self::U64U64(enc) => Values::U64(enc.values::(row_ids, vec![])), + Self::U64U32(enc) => Values::U64(enc.values::(row_ids, vec![])), + Self::U64U16(enc) => Values::U64(enc.values::(row_ids, vec![])), + Self::U64U8(enc) => Values::U64(enc.values::(row_ids, vec![])), - Self::I64I64N(c) => Values::I64N(c.values(row_ids, vec![])), - Self::U64U64N(c) => Values::U64N(c.values(row_ids, vec![])), + // signed 64-bit nullable variants - logical type is i64 for all these. + Self::I64I64N(enc) => Values::I64N(enc.values(row_ids, vec![])), + Self::I64I32N(enc) => Values::I64N(enc.values(row_ids, vec![])), + Self::I64U32N(enc) => Values::I64N(enc.values(row_ids, vec![])), + Self::I64I16N(enc) => Values::I64N(enc.values(row_ids, vec![])), + Self::I64U16N(enc) => Values::I64N(enc.values(row_ids, vec![])), + Self::I64I8N(enc) => Values::I64N(enc.values(row_ids, vec![])), + Self::I64U8N(enc) => Values::I64N(enc.values(row_ids, vec![])), + + // unsigned 64-bit nullable variants - logical type is u64 for all these. + Self::U64U64N(enc) => Values::U64N(enc.values(row_ids, vec![])), + Self::U64U32N(enc) => Values::U64N(enc.values(row_ids, vec![])), + Self::U64U16N(enc) => Values::U64N(enc.values(row_ids, vec![])), + Self::U64U8N(enc) => Values::U64N(enc.values(row_ids, vec![])), } } @@ -259,8 +433,20 @@ impl IntegerEncoding { Self::U64U16(c) => Values::U64(c.all_values::(vec![])), Self::U64U8(c) => Values::U64(c.all_values::(vec![])), - Self::I64I64N(c) => Values::I64N(c.all_values(vec![])), - Self::U64U64N(c) => Values::U64N(c.all_values(vec![])), + // signed 64-bit nullable variants - logical type is i64 for all these. + Self::I64I64N(enc) => Values::I64N(enc.all_values(vec![])), + Self::I64I32N(enc) => Values::I64N(enc.all_values(vec![])), + Self::I64U32N(enc) => Values::I64N(enc.all_values(vec![])), + Self::I64I16N(enc) => Values::I64N(enc.all_values(vec![])), + Self::I64U16N(enc) => Values::I64N(enc.all_values(vec![])), + Self::I64I8N(enc) => Values::I64N(enc.all_values(vec![])), + Self::I64U8N(enc) => Values::I64N(enc.all_values(vec![])), + + // unsigned 64-bit nullable variants - logical type is u64 for all these. + Self::U64U64N(enc) => Values::U64N(enc.all_values(vec![])), + Self::U64U32N(enc) => Values::U64N(enc.all_values(vec![])), + Self::U64U16N(enc) => Values::U64N(enc.all_values(vec![])), + Self::U64U8N(enc) => Values::U64N(enc.all_values(vec![])), } } @@ -326,8 +512,20 @@ impl IntegerEncoding { Self::U64U16(c) => c.row_ids_filter(value.as_u16(), op, dst), Self::U64U8(c) => c.row_ids_filter(value.as_u8(), op, dst), - Self::I64I64N(c) => c.row_ids_filter(value.as_i64(), op, dst), - Self::U64U64N(c) => c.row_ids_filter(value.as_u64(), op, dst), + // signed 64-bit nullable variants - logical type is i64 for all these. + Self::I64I64N(enc) => enc.row_ids_filter(value.as_i64(), op, dst), + Self::I64I32N(enc) => enc.row_ids_filter(value.as_i32(), op, dst), + Self::I64U32N(enc) => enc.row_ids_filter(value.as_u32(), op, dst), + Self::I64I16N(enc) => enc.row_ids_filter(value.as_i16(), op, dst), + Self::I64U16N(enc) => enc.row_ids_filter(value.as_u16(), op, dst), + Self::I64I8N(enc) => enc.row_ids_filter(value.as_i8(), op, dst), + Self::I64U8N(enc) => enc.row_ids_filter(value.as_u8(), op, dst), + + // unsigned 64-bit nullable variants - logical type is u64 for all these. + Self::U64U64N(enc) => enc.row_ids_filter(value.as_u64(), op, dst), + Self::U64U32N(enc) => enc.row_ids_filter(value.as_u32(), op, dst), + Self::U64U16N(enc) => enc.row_ids_filter(value.as_u16(), op, dst), + Self::U64U8N(enc) => enc.row_ids_filter(value.as_u8(), op, dst), } } @@ -378,8 +576,41 @@ impl IntegerEncoding { c.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) } - Self::I64I64N(_) => todo!(), - Self::U64U64N(_) => todo!(), + Self::I64I64N(enc) => { + enc.row_ids_filter_range((low.1.as_i64(), low.0), (high.1.as_i64(), high.0), dst) + } + Self::I64I32N(enc) => { + enc.row_ids_filter_range((low.1.as_i32(), low.0), (high.1.as_i32(), high.0), dst) + } + Self::I64U32N(enc) => { + enc.row_ids_filter_range((low.1.as_u32(), low.0), (high.1.as_u32(), high.0), dst) + } + Self::I64I16N(enc) => { + enc.row_ids_filter_range((low.1.as_i16(), low.0), (high.1.as_i16(), high.0), dst) + } + Self::I64U16N(enc) => { + enc.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::I64I8N(enc) => { + enc.row_ids_filter_range((low.1.as_i8(), low.0), (high.1.as_i8(), high.0), dst) + } + Self::I64U8N(enc) => { + enc.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } + + // unsigned 64-bit nullable variants - logical type is u64 for all these. + Self::U64U64N(enc) => { + enc.row_ids_filter_range((low.1.as_u64(), low.0), (high.1.as_u64(), high.0), dst) + } + Self::U64U32N(enc) => { + enc.row_ids_filter_range((low.1.as_u32(), low.0), (high.1.as_u32(), high.0), dst) + } + Self::U64U16N(enc) => { + enc.row_ids_filter_range((low.1.as_u16(), low.0), (high.1.as_u16(), high.0), dst) + } + Self::U64U8N(enc) => { + enc.row_ids_filter_range((low.1.as_u8(), low.0), (high.1.as_u8(), high.0), dst) + } } } @@ -396,11 +627,49 @@ impl IntegerEncoding { Self::U64U32(c) => Value::Scalar(Scalar::U64(c.min(row_ids))), Self::U64U16(c) => Value::Scalar(Scalar::U64(c.min(row_ids))), Self::U64U8(c) => Value::Scalar(Scalar::U64(c.min(row_ids))), - Self::I64I64N(c) => match c.min(row_ids) { + + Self::I64I64N(enc) => match enc.min(row_ids) { Some(v) => Value::Scalar(Scalar::I64(v)), None => Value::Null, }, - Self::U64U64N(c) => match c.min(row_ids) { + Self::I64I32N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U32N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64I16N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U16N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64I8N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U8N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + + Self::U64U64N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U32N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U16N(enc) => match enc.min(row_ids) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U8N(enc) => match enc.min(row_ids) { Some(v) => Value::Scalar(Scalar::U64(v)), None => Value::Null, }, @@ -420,11 +689,48 @@ impl IntegerEncoding { Self::U64U32(c) => Value::Scalar(Scalar::U64(c.max(row_ids))), Self::U64U16(c) => Value::Scalar(Scalar::U64(c.max(row_ids))), Self::U64U8(c) => Value::Scalar(Scalar::U64(c.max(row_ids))), - Self::I64I64N(c) => match c.max(row_ids) { + Self::I64I64N(enc) => match enc.max(row_ids) { Some(v) => Value::Scalar(Scalar::I64(v)), None => Value::Null, }, - Self::U64U64N(c) => match c.max(row_ids) { + Self::I64I32N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U32N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64I16N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U16N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64I8N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + Self::I64U8N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::I64(v)), + None => Value::Null, + }, + + Self::U64U64N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U32N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U16N(enc) => match enc.max(row_ids) { + Some(v) => Value::Scalar(Scalar::U64(v)), + None => Value::Null, + }, + Self::U64U8N(enc) => match enc.max(row_ids) { Some(v) => Value::Scalar(Scalar::U64(v)), None => Value::Null, }, @@ -444,11 +750,48 @@ impl IntegerEncoding { Self::U64U32(c) => Scalar::U64(c.sum(row_ids)), Self::U64U16(c) => Scalar::U64(c.sum(row_ids)), Self::U64U8(c) => Scalar::U64(c.sum(row_ids)), - Self::I64I64N(c) => match c.sum(row_ids) { + Self::I64I64N(enc) => match enc.sum(row_ids) { Some(v) => Scalar::I64(v), None => Scalar::Null, }, - Self::U64U64N(c) => match c.sum(row_ids) { + Self::I64I32N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::I64(v), + None => Scalar::Null, + }, + Self::I64U32N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::I64(v), + None => Scalar::Null, + }, + Self::I64I16N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::I64(v), + None => Scalar::Null, + }, + Self::I64U16N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::I64(v), + None => Scalar::Null, + }, + Self::I64I8N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::I64(v), + None => Scalar::Null, + }, + Self::I64U8N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::I64(v), + None => Scalar::Null, + }, + + Self::U64U64N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::U64(v), + None => Scalar::Null, + }, + Self::U64U32N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::U64(v), + None => Scalar::Null, + }, + Self::U64U16N(enc) => match enc.sum(row_ids) { + Some(v) => Scalar::U64(v), + None => Scalar::Null, + }, + Self::U64U8N(enc) => match enc.sum(row_ids) { Some(v) => Scalar::U64(v), None => Scalar::Null, }, @@ -468,8 +811,17 @@ impl IntegerEncoding { Self::U64U32(c) => c.count(row_ids), Self::U64U16(c) => c.count(row_ids), Self::U64U8(c) => c.count(row_ids), - Self::I64I64N(c) => c.count(row_ids), - Self::U64U64N(c) => c.count(row_ids), + Self::I64I64N(enc) => enc.count(row_ids), + Self::I64I32N(enc) => enc.count(row_ids), + Self::I64U32N(enc) => enc.count(row_ids), + Self::I64I16N(enc) => enc.count(row_ids), + Self::I64U16N(enc) => enc.count(row_ids), + Self::I64I8N(enc) => enc.count(row_ids), + Self::I64U8N(enc) => enc.count(row_ids), + Self::U64U64N(enc) => enc.count(row_ids), + Self::U64U32N(enc) => enc.count(row_ids), + Self::U64U16N(enc) => enc.count(row_ids), + Self::U64U8N(enc) => enc.count(row_ids), } } @@ -488,7 +840,16 @@ impl IntegerEncoding { Self::U64U16(_) => "BT_U16", Self::U64U8(_) => "BT_U8", Self::I64I64N(_) => "None", + Self::I64I32N(_) => "BT_I32N", + Self::I64U32N(_) => "BT_U32N", + Self::I64I16N(_) => "BT_U16N", + Self::I64U16N(_) => "BT_U16N", + Self::I64I8N(_) => "BT_I8N", + Self::I64U8N(_) => "BT_U8N", Self::U64U64N(_) => "None", + Self::U64U32N(_) => "BT_U32N", + Self::U64U16N(_) => "BT_U16N", + Self::U64U8N(_) => "BT_U8N", } } @@ -507,7 +868,16 @@ impl IntegerEncoding { Self::U64U16(_) => "u64", Self::U64U8(_) => "u64", Self::I64I64N(_) => "i64", + Self::I64I32N(_) => "i64", + Self::I64U32N(_) => "i64", + Self::I64I16N(_) => "i64", + Self::I64U16N(_) => "i64", + Self::I64I8N(_) => "i64", + Self::I64U8N(_) => "i64", Self::U64U64N(_) => "u64", + Self::U64U32N(_) => "u64", + Self::U64U16N(_) => "u64", + Self::U64U8N(_) => "u64", } } } @@ -528,7 +898,16 @@ impl std::fmt::Display for IntegerEncoding { Self::U64U16(enc) => write!(f, "[{}]: {}", name, enc), Self::U64U8(enc) => write!(f, "[{}]: {}", name, enc), Self::I64I64N(enc) => write!(f, "[{}]: {}", name, enc), + Self::I64I32N(enc) => write!(f, "[{}]: {}", name, enc), + Self::I64U32N(enc) => write!(f, "[{}]: {}", name, enc), + Self::I64I16N(enc) => write!(f, "[{}]: {}", name, enc), + Self::I64U16N(enc) => write!(f, "[{}]: {}", name, enc), + Self::I64I8N(enc) => write!(f, "[{}]: {}", name, enc), + Self::I64U8N(enc) => write!(f, "[{}]: {}", name, enc), Self::U64U64N(enc) => write!(f, "[{}]: {}", name, enc), + Self::U64U32N(enc) => write!(f, "[{}]: {}", name, enc), + Self::U64U16N(enc) => write!(f, "[{}]: {}", name, enc), + Self::U64U8N(enc) => write!(f, "[{}]: {}", name, enc), } } } @@ -549,7 +928,16 @@ impl std::fmt::Debug for IntegerEncoding { Self::U64U16(enc) => write!(f, "[{}]: {:?}", name, enc), Self::U64U8(enc) => write!(f, "[{}]: {:?}", name, enc), Self::I64I64N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::I64I32N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::I64U32N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::I64I16N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::I64U16N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::I64I8N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::I64U8N(enc) => write!(f, "[{}]: {:?}", name, enc), Self::U64U64N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::U64U32N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::U64U16N(enc) => write!(f, "[{}]: {:?}", name, enc), + Self::U64U8N(enc) => write!(f, "[{}]: {:?}", name, enc), } } } @@ -603,17 +991,48 @@ impl From<&[i64]> for IntegerEncoding { /// Converts an Arrow array into an IntegerEncoding. /// -/// TODO(edd): convert underlying type of Arrow data to smallest physical -/// representation. +/// The most compact physical Arrow array type is used to store the column +/// within a `FixedNull` encoding. impl From for IntegerEncoding { fn from(arr: arrow::array::Int64Array) -> Self { if arr.null_count() == 0 { return Self::from(arr.values()); } - // TODO(edd): currently fixed null only supports 64-bit logical/physical - // types. Need to add support for storing as smaller physical types. - Self::I64I64N(FixedNull::::from(arr)) + // determine min and max values. + let min = arrow::compute::kernels::aggregate::min(&arr); + let max = arrow::compute::kernels::aggregate::max(&arr); + + // This match is carefully ordered. It prioritises smaller physical + // datatypes that can safely represent the provided logical data + match (min, max) { + // encode as u8 values + (min, max) if min >= Some(0) && max <= Some(u8::MAX as i64) => { + Self::I64U8N(FixedNull::::from(arr)) + } + // encode as i8 values + (min, max) if min >= Some(i8::MIN as i64) && max <= Some(i8::MAX as i64) => { + Self::I64I8N(FixedNull::::from(arr)) + } + // encode as u16 values + (min, max) if min >= Some(0) && max <= Some(u16::MAX as i64) => { + Self::I64U16N(FixedNull::::from(arr)) + } + // encode as i16 values + (min, max) if min >= Some(i16::MIN as i64) && max <= Some(i16::MAX as i64) => { + Self::I64I16N(FixedNull::::from(arr)) + } + // encode as u32 values + (min, max) if min >= Some(0) && max <= Some(u32::MAX as i64) => { + Self::I64U32N(FixedNull::::from(arr)) + } + // encode as i32 values + (min, max) if min >= Some(i32::MIN as i64) && max <= Some(i32::MAX as i64) => { + Self::I64I32N(FixedNull::::from(arr)) + } + // otherwise, encode with the same physical type (i64) + (_, _) => Self::I64I64N(FixedNull::::from(arr)), + } } } @@ -650,23 +1069,42 @@ impl From<&[u64]> for IntegerEncoding { /// Converts an Arrow array into an IntegerEncoding. /// -/// TODO(edd): convert underlying type of Arrow data to smallest physical -/// representation. +/// The most compact physical Arrow array type is used to store the column +/// within a `FixedNull` encoding. impl From for IntegerEncoding { fn from(arr: arrow::array::UInt64Array) -> Self { if arr.null_count() == 0 { return Self::from(arr.values()); } - // TODO(edd): currently fixed null only supports 64-bit logical/physical - // types. Need to add support for storing as smaller physical types. - Self::U64U64N(FixedNull::::from(arr)) + // determine max value. + let max = arrow::compute::kernels::aggregate::max(&arr); + + // This match is carefully ordered. It prioritises smaller physical + // datatypes that can safely represent the provided logical data + match max { + // encode as u8 values + max if max <= Some(u8::MAX as u64) => { + Self::U64U8N(FixedNull::::from(arr)) + } + // encode as u16 values + max if max <= Some(u16::MAX as u64) => { + Self::U64U16N(FixedNull::::from(arr)) + } + // encode as u32 values + max if max <= Some(u32::MAX as u64) => { + Self::U64U32N(FixedNull::::from(arr)) + } + // otherwise, encode with the same physical type (u64) + _ => Self::U64U64N(FixedNull::::from(arr)), + } } } #[cfg(test)] mod test { - use arrow::datatypes::Int64Type; + use arrow::array::{Int64Array, UInt64Array}; + use std::iter; use super::*; @@ -718,6 +1156,133 @@ mod test { } } + #[test] + fn from_arrow_i64_array() { + let cases = vec![ + vec![0_i64, 2, 245, 3], + vec![0_i64, -120, 127, 3], + vec![399_i64, 2, 2452, 3], + vec![-399_i64, 2, 2452, 3], + vec![u32::MAX as i64, 2, 245, 3], + vec![i32::MIN as i64, 2, 245, 3], + vec![0_i64, 2, 245, u32::MAX as i64 + 1], + ]; + + let exp = vec![ + IntegerEncoding::I64U8(Fixed::::from(cases[0].as_slice())), + IntegerEncoding::I64I8(Fixed::::from(cases[1].as_slice())), + IntegerEncoding::I64U16(Fixed::::from(cases[2].as_slice())), + IntegerEncoding::I64I16(Fixed::::from(cases[3].as_slice())), + IntegerEncoding::I64U32(Fixed::::from(cases[4].as_slice())), + IntegerEncoding::I64I32(Fixed::::from(cases[5].as_slice())), + IntegerEncoding::I64I64(Fixed::::from(cases[6].as_slice())), + ]; + + // for Arrow arrays with no nulls we can store the column using a + // non-nullable fixed encoding + for (case, exp) in cases.iter().cloned().zip(exp.into_iter()) { + let arr = Int64Array::from(case); + assert_eq!(IntegerEncoding::from(arr), exp); + } + + // Tack a NULL onto each of the input cases. + let cases = cases + .iter() + .map(|case| { + case.iter() + .map(|x| Some(*x)) + .chain(iter::repeat(None).take(1)) + .collect::>() + }) + .collect::>(); + + // when a NULL value is present then we need to use a nullable encoding. + let exp = vec![ + IntegerEncoding::I64U8N(FixedNull::::from(Int64Array::from( + cases[0].clone(), + ))), + IntegerEncoding::I64I8N(FixedNull::::from(Int64Array::from( + cases[1].clone(), + ))), + IntegerEncoding::I64U16N(FixedNull::::from(Int64Array::from( + cases[2].clone(), + ))), + IntegerEncoding::I64I16N(FixedNull::::from(Int64Array::from( + cases[3].clone(), + ))), + IntegerEncoding::I64U32N(FixedNull::::from(Int64Array::from( + cases[4].clone(), + ))), + IntegerEncoding::I64I32N(FixedNull::::from(Int64Array::from( + cases[5].clone(), + ))), + IntegerEncoding::I64I64N(FixedNull::::from(Int64Array::from( + cases[6].clone(), + ))), + ]; + + for (case, exp) in cases.into_iter().zip(exp.into_iter()) { + let arr = Int64Array::from(case.clone()); + assert_eq!(IntegerEncoding::from(arr), exp); + } + } + + #[test] + fn from_arrow_u64_array() { + let cases = vec![ + vec![0_u64, 2, 245, 3], + vec![399_u64, 2, 2452, 3], + vec![u32::MAX as u64, 2, 245, 3], + vec![0_u64, 2, 245, u32::MAX as u64 + 1], + ]; + + let exp = vec![ + IntegerEncoding::U64U8(Fixed::::from(cases[0].as_slice())), + IntegerEncoding::U64U16(Fixed::::from(cases[1].as_slice())), + IntegerEncoding::U64U32(Fixed::::from(cases[2].as_slice())), + IntegerEncoding::U64U64(Fixed::::from(cases[3].as_slice())), + ]; + + // for Arrow arrays with no nulls we can store the column using a + // non-nullable fixed encoding + for (case, exp) in cases.iter().cloned().zip(exp.into_iter()) { + let arr = UInt64Array::from(case); + assert_eq!(IntegerEncoding::from(arr), exp); + } + + // Tack a NULL onto each of the input cases. + let cases = cases + .iter() + .map(|case| { + case.iter() + .map(|x| Some(*x)) + .chain(iter::repeat(None).take(1)) + .collect::>() + }) + .collect::>(); + + // when a NULL value is present then we need to use a nullable encoding. + let exp = vec![ + IntegerEncoding::U64U8N(FixedNull::::from(UInt64Array::from( + cases[0].clone(), + ))), + IntegerEncoding::U64U16N(FixedNull::::from(UInt64Array::from( + cases[1].clone(), + ))), + IntegerEncoding::U64U32N(FixedNull::::from(UInt64Array::from( + cases[2].clone(), + ))), + IntegerEncoding::U64U64N(FixedNull::::from(UInt64Array::from( + cases[3].clone(), + ))), + ]; + + for (case, exp) in cases.into_iter().zip(exp.into_iter()) { + let arr = UInt64Array::from(case.clone()); + assert_eq!(IntegerEncoding::from(arr), exp); + } + } + #[test] fn size_raw() { let enc = IntegerEncoding::I64U8(Fixed::::from(&[2, 22, 12, 31][..])); @@ -730,12 +1295,7 @@ mod test { assert_eq!(enc.size_raw(true), 56); assert_eq!(enc.size_raw(false), 56); - let enc = IntegerEncoding::I64I64N(FixedNull::::from(&[2, 22, 12, 31][..])); - // (4 * 8) + 24 - assert_eq!(enc.size_raw(true), 56); - assert_eq!(enc.size_raw(false), 56); - - let enc = IntegerEncoding::I64I64N(FixedNull::::from( + let enc = IntegerEncoding::I64I64N(FixedNull::::from( &[Some(2), Some(22), Some(12), None, None, Some(31)][..], )); // (6 * 8) + 24