use std::{convert::TryFrom, fmt::Display}; use arrow_deps::arrow; use data_types::schema::InfluxFieldType; /// A schema that is used to track the names and semantics of columns returned /// in results out of various operations on a row group. /// /// This schema is useful for helping with displaying information in tests and /// decorating Arrow record batches when results are converted before leaving /// the read buffer. #[derive(Default, PartialEq, Debug, Clone)] pub struct ResultSchema { pub select_columns: Vec<(ColumnType, LogicalDataType)>, pub group_columns: Vec<(ColumnType, LogicalDataType)>, pub aggregate_columns: Vec<(ColumnType, AggregateType, LogicalDataType)>, } impl ResultSchema { pub fn select_column_names_iter(&self) -> impl Iterator { self.select_columns.iter().map(|(name, _)| match name { ColumnType::Tag(name) => name, ColumnType::Field(name) => name, ColumnType::Timestamp(name) => name, ColumnType::Other(name) => name, }) } pub fn group_column_names_iter(&self) -> impl Iterator { self.group_columns.iter().map(|(name, _)| match name { ColumnType::Tag(name) => name, ColumnType::Field(name) => name, ColumnType::Timestamp(name) => name, ColumnType::Other(name) => name, }) } pub fn aggregate_column_names_iter(&self) -> impl Iterator { self.aggregate_columns .iter() .map(|(name, _, _)| match name { ColumnType::Tag(name) => name, ColumnType::Field(name) => name, ColumnType::Timestamp(name) => name, ColumnType::Other(name) => name, }) } pub fn is_empty(&self) -> bool { self.len() == 0 } pub fn len(&self) -> usize { self.select_columns.len() + self.group_columns.len() + self.aggregate_columns.len() } // How to display the name for a column that was constructed as an aggregate // result. // // TODO(edd): support multiple instances of the same aggregation on the same // column? E.g., `temp_sum_1`, `temp_sum_2` etc?? fn aggregate_result_column_name(&self, i: usize) -> String { let (col_type, agg_type, _) = self.aggregate_columns.get(i).unwrap(); format!("{}_{}", col_type, agg_type) } } /// Effectively emits a header line for a CSV-like table. impl Display for ResultSchema { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // do we need to emit the group by or aggregate columns? let has_agg_columns = !self.aggregate_columns.is_empty(); for (i, (name, _)) in self.select_columns.iter().enumerate() { if has_agg_columns || i < self.select_columns.len() - 1 { write!(f, "{},", name)?; } else if !has_agg_columns { return write!(f, "{}", name); // last value in header row } } // write out group by columns, if any for (i, (name, _)) in self.group_columns.iter().enumerate() { write!(f, "{},", name)?; } // finally, emit the aggregate columns for (i, _) in self.aggregate_columns.iter().enumerate() { write!(f, "{}", self.aggregate_result_column_name(i))?; if i < self.aggregate_columns.len() - 1 { write!(f, ",")?; } } writeln!(f) } } impl TryFrom<&ResultSchema> for data_types::schema::Schema { type Error = data_types::schema::builder::Error; fn try_from(rs: &ResultSchema) -> Result { let mut builder = data_types::schema::builder::SchemaBuilder::new(); for (col_type, data_type) in &rs.select_columns { match col_type { ColumnType::Tag(name) => builder = builder.tag(name.as_str()), ColumnType::Field(name) => { builder = builder.influx_field(name.as_str(), data_type.into()) } ColumnType::Timestamp(_) => builder = builder.timestamp(), ColumnType::Other(name) => builder = builder.field(name.as_str(), data_type.into()), } } for (col_type, data_type) in &rs.group_columns { match col_type { ColumnType::Tag(name) => builder = builder.tag(name.as_str()), ColumnType::Field(name) => { builder = builder.influx_field(name.as_str(), data_type.into()) } ColumnType::Timestamp(_) => builder = builder.timestamp(), ColumnType::Other(name) => builder = builder.field(name.as_str(), data_type.into()), } } for (i, (col_type, _, data_type)) in rs.aggregate_columns.iter().enumerate() { let col_name = rs.aggregate_result_column_name(i); match col_type { ColumnType::Field(_) => { builder = builder.influx_field(col_name.as_str(), data_type.into()) } ColumnType::Other(_) => { builder = builder.field(col_name.as_str(), data_type.into()) } ct => unreachable!("not possible to aggregate {:?} columns", ct), } } builder.build() } } #[derive(Debug, Copy, Clone, Eq, PartialEq)] /// The logical data-type for a column. pub enum LogicalDataType { Integer, // Signed integer Unsigned, // Unsigned integer Float, // String, // UTF-8 valid string Binary, // Arbitrary collection of bytes Boolean, // } impl From<&LogicalDataType> for arrow::datatypes::DataType { fn from(logical_type: &LogicalDataType) -> Self { match logical_type { LogicalDataType::Integer => arrow::datatypes::DataType::Int64, LogicalDataType::Unsigned => arrow::datatypes::DataType::UInt64, LogicalDataType::Float => arrow::datatypes::DataType::Float64, LogicalDataType::String => arrow::datatypes::DataType::Utf8, LogicalDataType::Binary => arrow::datatypes::DataType::Binary, LogicalDataType::Boolean => arrow::datatypes::DataType::Boolean, } } } impl From<&LogicalDataType> for InfluxFieldType { fn from(logical_type: &LogicalDataType) -> Self { match logical_type { LogicalDataType::Integer => InfluxFieldType::Integer, LogicalDataType::Unsigned => InfluxFieldType::UInteger, LogicalDataType::Float => InfluxFieldType::Float, LogicalDataType::String => InfluxFieldType::String, LogicalDataType::Binary => { unimplemented!("binary data type cannot be represented as InfluxFieldType") } LogicalDataType::Boolean => InfluxFieldType::Boolean, } } } /// These variants describe supported aggregates that can applied to columnar /// data in the Read Buffer. #[derive(Copy, Clone, PartialEq, Debug)] pub enum AggregateType { Count, First, Last, Min, Max, Sum, /* TODO - support: * Distinct - (edd): not sure this counts as an aggregations. Seems more like a special * filter. CountDistinct * Percentile */ } impl std::fmt::Display for AggregateType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{}", match self { AggregateType::Count => "count", AggregateType::First => "first", AggregateType::Last => "last", AggregateType::Min => "min", AggregateType::Max => "max", AggregateType::Sum => "sum", } ) } } /// Describes the semantic meaning of the column in a set of results. That is, /// whether the column is a "tag", "field", "timestamp", or "other". #[derive(PartialEq, Debug, PartialOrd, Clone)] pub enum ColumnType { Tag(String), Field(String), Timestamp(String), Other(String), } impl ColumnType { pub fn as_str(&self) -> &str { match self { ColumnType::Tag(name) => name.as_str(), ColumnType::Field(name) => name.as_str(), ColumnType::Timestamp(name) => name.as_str(), ColumnType::Other(name) => name.as_str(), } } } impl Display for ColumnType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "{}", match self { ColumnType::Tag(name) => name, ColumnType::Field(name) => name, ColumnType::Timestamp(name) => name, ColumnType::Other(name) => name, } ) } }