feat: add support for determining logical column types

pull/24376/head
Edd Robinson 2021-01-08 12:37:11 +00:00
parent 4dff270c34
commit c3019a91bd
3 changed files with 81 additions and 13 deletions

View File

@ -69,6 +69,18 @@ impl Column {
}
}
/// Returns the logical data-type associated with the column.
pub fn logical_datatype(&self) -> LogicalDataType {
match self {
Column::String(_, _) => LogicalDataType::String,
Column::Float(_, _) => LogicalDataType::Float,
Column::Integer(_, _) => LogicalDataType::Integer,
Column::Unsigned(_, _) => LogicalDataType::Unsigned,
Column::Bool => LogicalDataType::Boolean,
Column::ByteArray(_, _) => LogicalDataType::Binary,
}
}
pub fn size(&self) -> u64 {
0
}
@ -600,6 +612,17 @@ pub struct ColumnProperties {
pub has_pre_computed_row_ids: bool,
}
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
/// The logical data-type for a column.
pub enum LogicalDataType {
Integer, // Signed integer
Unsigned, // Unsigned integer
Float, //
String, // UTF-8 valid string
Binary, // Arbitrary collection of bytes
Boolean, //
}
#[derive(Default, Debug, PartialEq)]
// The meta-data for a column
pub struct MetaData<T>

View File

@ -7,9 +7,9 @@ use arrow_deps::arrow;
use arrow_deps::arrow::record_batch::RecordBatch;
use crate::column::{
cmp::Operator, AggregateResult, AggregateType, Column, EncodedValues, OwnedValue, RowIDs,
RowIDsOption, Scalar, Value, Values, ValuesIterator, FIELD_COLUMN_TYPE, TAG_COLUMN_TYPE,
TIME_COLUMN_TYPE,
self, cmp::Operator, AggregateResult, AggregateType, Column, EncodedValues, LogicalDataType,
OwnedValue, RowIDs, RowIDsOption, Scalar, Value, Values, ValuesIterator, FIELD_COLUMN_TYPE,
TAG_COLUMN_TYPE, TIME_COLUMN_TYPE,
};
/// The name used for a timestamp column.
@ -48,6 +48,7 @@ impl RowGroup {
ColumnType::Tag(c) => {
assert_eq!(c.num_rows(), rows);
meta.column_types.insert(name.clone(), c.logical_datatype());
meta.column_ranges
.insert(name.clone(), c.column_range().unwrap());
all_columns_by_name.insert(name.clone(), all_columns.len());
@ -57,6 +58,7 @@ impl RowGroup {
ColumnType::Field(c) => {
assert_eq!(c.num_rows(), rows);
meta.column_types.insert(name.clone(), c.logical_datatype());
meta.column_ranges
.insert(name.clone(), c.column_range().unwrap());
all_columns_by_name.insert(name.clone(), all_columns.len());
@ -75,6 +77,7 @@ impl RowGroup {
Some((_, _)) => unreachable!("unexpected types for time range"),
};
meta.column_types.insert(name.clone(), c.logical_datatype());
meta.column_ranges
.insert(name.clone(), c.column_range().unwrap());
@ -85,6 +88,12 @@ impl RowGroup {
}
}
// Meta data should have same columns for types and ranges.
assert_eq!(
meta.column_types.keys().collect::<Vec<_>>(),
meta.column_ranges.keys().collect::<Vec<_>>(),
);
Self {
meta,
columns: all_columns,
@ -111,6 +120,11 @@ impl RowGroup {
&self.meta.column_ranges
}
/// The logical data-type of each column.
pub fn column_logical_types(&self) -> &BTreeMap<String, column::LogicalDataType> {
&self.meta.column_types
}
// Returns a reference to a column from the column name.
//
// It is the caller's responsibility to ensure the column exists in the read
@ -953,7 +967,7 @@ impl Ord for GroupKey<'_> {
// A representation of a column name.
pub type ColumnName<'a> = &'a str;
/// The logical type that a column could have.
/// The InfluxDB-specific semantic meaning of a column.
pub enum ColumnType {
Tag(Column),
Field(Column),
@ -987,6 +1001,9 @@ struct MetaData {
// possibly match based on the range of values a column has.
column_ranges: BTreeMap<String, (OwnedValue, OwnedValue)>,
// The logical column types for each column in the `RowGroup`.
column_types: BTreeMap<String, LogicalDataType>,
// The total time range of this table spanning all of the `RowGroup`s within
// the table.
//

View File

@ -2,7 +2,10 @@ use std::collections::{BTreeMap, BTreeSet};
use std::fmt::Display;
use std::slice::Iter;
use crate::row_group::{ColumnName, GroupKey, Predicate, RowGroup};
use crate::{
column,
row_group::{ColumnName, GroupKey, Predicate, RowGroup},
};
use crate::{
column::{AggregateResult, AggregateType, OwnedValue, Scalar, Value},
row_group::{ReadFilterResult, ReadGroupResult},
@ -94,6 +97,11 @@ impl Table {
todo!()
}
/// The logical data-type of each column in the `Table`'s schema.
pub fn column_logical_types(&self) -> &BTreeMap<String, column::LogicalDataType> {
&self.meta.column_types
}
// Determines if schema contains all the provided column names.
fn has_all_columns(&self, names: &[ColumnName<'_>]) -> bool {
for &name in names {
@ -451,6 +459,9 @@ struct MetaData {
// possibly match based on the range of values a column has.
column_ranges: BTreeMap<String, (OwnedValue, OwnedValue)>,
// The `ReadBuffer` logical types associated with the columns in the table
column_types: BTreeMap<String, column::LogicalDataType>,
// The total time range of this table spanning all of the row groups within
// the table.
//
@ -460,16 +471,21 @@ struct MetaData {
}
impl MetaData {
pub fn new(segment: &RowGroup) -> Self {
pub fn new(rg: &RowGroup) -> Self {
Self {
size: segment.size(),
rows: u64::from(segment.rows()),
column_ranges: segment
size: rg.size(),
rows: u64::from(rg.rows()),
column_ranges: rg
.column_ranges()
.iter()
.map(|(k, v)| (k.to_string(), (v.0.clone(), v.1.clone())))
.collect(),
time_range: Some(segment.time_range()),
column_types: rg
.column_logical_types()
.iter()
.map(|(k, v)| (k.clone(), *v))
.collect(),
time_range: Some(rg.time_range()),
}
}
@ -478,6 +494,10 @@ impl MetaData {
self.size += rg.size();
self.rows += u64::from(rg.rows());
// The incoming row group must have the same schema as the existing row
// groups in the table.
assert_eq!(&self.column_types, rg.column_logical_types());
assert_eq!(self.column_ranges.len(), rg.column_ranges().len());
for (column_name, (column_range_min, column_range_max)) in rg.column_ranges() {
let mut curr_range = self
@ -579,7 +599,7 @@ impl std::fmt::Display for ReadGroupResults<'_, '_> {
#[cfg(test)]
mod test {
use super::*;
use crate::column::{cmp::Operator, Column};
use crate::column::{cmp::Operator, Column, LogicalDataType};
use crate::row_group::{ColumnType, TIME_COLUMN_NAME};
fn build_predicates(
@ -617,9 +637,17 @@ mod test {
let fc = ColumnType::Field(Column::from(&[100_u64, 101, 200, 203, 203, 10][..]));
columns.insert("count".to_string(), fc);
let segment = RowGroup::new(6, columns);
let rg = RowGroup::new(6, columns);
let mut table = Table::new("cpu".to_owned(), segment);
let mut table = Table::new("cpu".to_owned(), rg);
let exp_col_types = vec![
("region".to_owned(), LogicalDataType::String),
("count".to_owned(), LogicalDataType::Unsigned),
("time".to_owned(), LogicalDataType::Integer),
]
.into_iter()
.collect::<BTreeMap<_, _>>();
assert_eq!(table.column_logical_types(), &exp_col_types);
// Build another segment.
let mut columns = BTreeMap::new();