From fef30a7cc4dfbc143e14349bd16def77eaca15dc Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 27 Oct 2020 16:07:29 +0000 Subject: [PATCH] feat: add values implementation on column --- delorean_segment_store/src/column.rs | 212 ++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 8 deletions(-) diff --git a/delorean_segment_store/src/column.rs b/delorean_segment_store/src/column.rs index 10270c00a1..7541cf3bdf 100644 --- a/delorean_segment_store/src/column.rs +++ b/delorean_segment_store/src/column.rs @@ -7,6 +7,10 @@ use std::collections::BTreeSet; use croaring::Bitmap; +use delorean_arrow::arrow::array::{ + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, +}; use delorean_arrow::{arrow, arrow::array::Array}; /// The possible logical types that column values can have. All values in a @@ -91,7 +95,23 @@ impl Column { /// All values present at the provided logical row ids. pub fn values(&self, row_ids: &[u32]) -> Values { - todo!() + assert!( + row_ids.len() as u32 <= self.num_rows(), + format!( + "too many row ids {:?} provided for column with {:?} rows", + row_ids.len(), + self.num_rows() + ) + ); + + match &self { + Column::String(_, data) => data.values(row_ids), + Column::Float(_, data) => data.values(row_ids), + Column::Integer(_, data) => data.values(row_ids), + Column::Unsigned(_, data) => data.values(row_ids), + Column::Bool => todo!(), + Column::ByteArray(_, _) => todo!(), + } } // The distinct set of values found at the logical row ids. @@ -209,6 +229,13 @@ impl StringEncoding { } } + /// Returns the logical values found at the provided row ids. 
+ pub fn values(&self, row_ids: &[u32]) -> Values { + match &self { + Self::RLE(c) => Values::String(StringArray::from(c.values(row_ids, vec![]))), + } + } + fn from_arrow_string_array(arr: arrow::array::StringArray) -> Self { // // TODO(edd): potentially switch on things like cardinality in the input @@ -469,6 +496,58 @@ impl IntegerEncoding { }, } } + + /// Returns the logical values found at the provided row ids. + /// + /// TODO(edd): perf - provide a pooling mechanism for these destination vectors + /// so that they can be re-used. + pub fn values(&self, row_ids: &[u32]) -> Values { + match &self { + // signed 64-bit variants - logical type is i64 for all these + Self::I64I64(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + Self::I64I32(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + Self::I64U32(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + Self::I64I16(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + Self::I64U16(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + Self::I64I8(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + Self::I64U8(c) => Values::I64(Int64Array::from(c.values::(row_ids, vec![]))), + + // signed 32-bit variants - logical type is i32 for all these + Self::I32I32(c) => Values::I32(Int32Array::from(c.values::(row_ids, vec![]))), + Self::I32I16(c) => Values::I32(Int32Array::from(c.values::(row_ids, vec![]))), + Self::I32U16(c) => Values::I32(Int32Array::from(c.values::(row_ids, vec![]))), + Self::I32I8(c) => Values::I32(Int32Array::from(c.values::(row_ids, vec![]))), + Self::I32U8(c) => Values::I32(Int32Array::from(c.values::(row_ids, vec![]))), + + // signed 16-bit variants - logical type is i16 for all these + Self::I16I16(c) => Values::I16(Int16Array::from(c.values::(row_ids, vec![]))), + Self::I16I8(c) => Values::I16(Int16Array::from(c.values::(row_ids, vec![]))), + Self::I16U8(c) => 
Values::I16(Int16Array::from(c.values::(row_ids, vec![]))), + + // signed 8-bit variant - logical type is i8 + Self::I8I8(c) => Values::I8(Int8Array::from(c.values::(row_ids, vec![]))), + + // unsigned 64-bit variants - logical type is u64 for all these + Self::U64U64(c) => Values::U64(UInt64Array::from(c.values::(row_ids, vec![]))), + Self::U64U32(c) => Values::U64(UInt64Array::from(c.values::(row_ids, vec![]))), + Self::U64U16(c) => Values::U64(UInt64Array::from(c.values::(row_ids, vec![]))), + Self::U64U8(c) => Values::U64(UInt64Array::from(c.values::(row_ids, vec![]))), + + // unsigned 32-bit variants - logical type is u32 for all these + Self::U32U32(c) => Values::U32(UInt32Array::from(c.values::(row_ids, vec![]))), + Self::U32U16(c) => Values::U32(UInt32Array::from(c.values::(row_ids, vec![]))), + Self::U32U8(c) => Values::U32(UInt32Array::from(c.values::(row_ids, vec![]))), + + // unsigned 16-bit variants - logical type is u16 for all these + Self::U16U16(c) => Values::U16(UInt16Array::from(c.values::(row_ids, vec![]))), + Self::U16U8(c) => Values::U16(UInt16Array::from(c.values::(row_ids, vec![]))), + + // unsigned 8-bit variant - logical type is u8 + Self::U8U8(c) => Values::U8(UInt8Array::from(c.values::(row_ids, vec![]))), + + Self::I64I64N(c) => Values::I64(Int64Array::from(c.values(row_ids, vec![]))), + } + } } pub enum FloatEncoding { @@ -487,6 +566,14 @@ impl FloatEncoding { Self::Fixed32(c) => Value::Scalar(Scalar::F32(c.value(row_id))), } } + + /// Returns the logical values found at the provided row ids. + pub fn values(&self, row_ids: &[u32]) -> Values { + match &self { + Self::Fixed64(c) => Values::F64(Float64Array::from(c.values::(row_ids, vec![]))), + Self::Fixed32(c) => Values::F32(Float32Array::from(c.values::(row_ids, vec![]))), + } + } } // Converts an Arrow `StringArray` into a column, currently using the RLE @@ -985,6 +1072,28 @@ impl From<&[f64]> for Column { } } +/// Converts a slice of `f32` values into a fixed-width column encoding. 
+impl From<&[f32]> for Column { + fn from(arr: &[f32]) -> Self { + // determine min and max values. + let mut min = arr[0]; + let mut max = arr[0]; + for &v in arr.iter().skip(1) { + min = min.min(v); + max = max.max(v); + } + + let data = fixed::Fixed::::from(arr); + let meta = MetaData { + size: data.size(), + rows: data.num_rows(), + range: Some((min as f64, max as f64)), + }; + + Column::Float(meta, FloatEncoding::Fixed32(data)) + } +} + /// These variants describe supported aggregates that can applied to columnar /// data. pub enum AggregateType { @@ -1068,18 +1177,23 @@ pub enum Value<'a> { /// Each variant is a typed vector of materialised values for a column. NULL /// values are represented as None +#[derive(Debug, PartialEq)] pub enum Values { // UTF-8 valid unicode strings String(arrow::array::StringArray), - // 64-bit floating point values - Float(arrow::array::Float64Array), + F64(arrow::array::Float64Array), + F32(arrow::array::Float32Array), - // 64-bit signed integer values - Integer(arrow::array::Int64Array), + I64(arrow::array::Int64Array), + I32(arrow::array::Int32Array), + I16(arrow::array::Int16Array), + I8(arrow::array::Int8Array), - // 64-bit unsigned integer values - Unsigned(arrow::array::UInt64Array), + U64(arrow::array::UInt64Array), + U32(arrow::array::UInt32Array), + U16(arrow::array::UInt16Array), + U8(arrow::array::UInt8Array), // Boolean values Bool(arrow::array::BooleanArray), @@ -1150,7 +1264,10 @@ impl RowIDs { #[cfg(test)] mod test { use super::*; - use delorean_arrow::arrow::array::StringArray; + use delorean_arrow::arrow::array::{ + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, + }; #[test] fn from_arrow_string_array() { @@ -1504,4 +1621,83 @@ mod test { assert_eq!(col.value(1), Value::String("b")); assert_eq!(col.value(2), Value::Null); } + + #[test] + fn values() { + // physical type of `col` will be `i16` but logical type is `i64` + 
let col = Column::from(&[0_i64, 1, 200, 20, -1][..]); + assert_eq!( + col.values(&[0, 2, 3]), + Values::I64(Int64Array::from(vec![0, 200, 20])) + ); + + // physical type of `col` will be `i16` but logical type is `i32` + let col = Column::from(&[0_i32, 1, 200, 20, -1][..]); + assert_eq!( + col.values(&[0, 2, 3]), + Values::I32(Int32Array::from(vec![0, 200, 20])) + ); + + // physical and logical type of `col` will be `i16` + let col = Column::from(&[0_i16, 1, 200, 20, -1][..]); + assert_eq!( + col.values(&[0, 2, 3]), + Values::I16(Int16Array::from(vec![0, 200, 20])) + ); + + // physical and logical type of `col` will be `i8` + let col = Column::from(&[0_i8, 1, 127, 20, -1][..]); + assert_eq!( + col.values(&[0, 2, 3]), + Values::I8(Int8Array::from(vec![0, 127, 20])) + ); + + // physical type of `col` will be `u8` but logical type is `u64` + let col = Column::from(&[0_u64, 1, 200, 20, 100][..]); + assert_eq!( + col.values(&[3, 4]), + Values::U64(UInt64Array::from(vec![20, 100])) + ); + + // physical type of `col` will be `u8` but logical type is `u32` + let col = Column::from(&[0_u32, 1, 200, 20, 100][..]); + assert_eq!( + col.values(&[3, 4]), + Values::U32(UInt32Array::from(vec![20, 100])) + ); + + // physical type of `col` will be `u8` but logical type is `u16` + let col = Column::from(&[0_u16, 1, 200, 20, 100][..]); + assert_eq!( + col.values(&[3, 4]), + Values::U16(UInt16Array::from(vec![20, 100])) + ); + + // physical and logical type of `col` will be `u8` + let col = Column::from(&[0_u8, 1, 200, 20, 100][..]); + assert_eq!( + col.values(&[3, 4]), + Values::U8(UInt8Array::from(vec![20, 100])) + ); + + // physical and logical type of `col` will be `f64` + let col = Column::from(&[0.0, 1.1, 20.2, 22.3, 100.1324][..]); + assert_eq!( + col.values(&[1, 3]), + Values::F64(Float64Array::from(vec![1.1, 22.3])) + ); + + // physical and logical type of `col` will be `f32` + let col = Column::from(&[0.0_f32, 1.1, 20.2, 22.3, 100.1324][..]); + assert_eq!( + col.values(&[1, 
3]), + Values::F32(Float32Array::from(vec![1.1, 22.3])) + ); + + let col = Column::from(&[Some("a"), Some("b"), None, Some("c")][..]); + assert_eq!( + col.values(&[1, 2, 3]), + Values::String(StringArray::from(vec![Some("b"), None, Some("c")])) + ); + } }