feat: use bitmasks within MUB (#1274) (#1289)

* feat: use bitmasks within MUB (#1274)

* chore: review feedback

Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>
Raphael Taylor-Davies 2021-04-26 19:00:16 +01:00 committed by GitHub
parent e33af0c084
commit 0a835436ac
16 changed files with 866 additions and 626 deletions
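At its core the change replaces the mutable buffer's per-column Vec<Option<T>> storage with a dense Vec<T> plus an Arrow-style validity bitmask, so null tracking costs one bit per row instead of widening every element into an Option. A minimal sketch of the representational shift (simplified types, not the IOx definitions):

// Before: one Option per value; Option<f64> is 16 bytes on a 64-bit target.
struct BeforeColumn {
    values: Vec<Option<f64>>,
}

// After: values stay dense; validity is packed one bit per row,
// least significant bit first, matching Arrow's boolean encoding.
struct AfterColumn {
    valid: Vec<u8>,
    values: Vec<f64>,
}

The smaller estimated_bytes values in the test expectations further down (161 becomes 132, for example) fall directly out of this packing.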

Cargo.lock

@ -1889,6 +1889,7 @@ dependencies = [
"internal_types",
"observability_deps",
"parking_lot",
"rand 0.8.3",
"snafu",
"test_helpers",
"tokio",


@ -1,10 +1,10 @@
//! This module contains structs that describe the metadata for a partition
//! including schema, summary statistics, and file locations in storage.
use std::fmt::{Debug, Display};
use std::mem;
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
/// Describes the schema, summary statistics for each column in each table and
/// the location of the partition in storage.
@ -222,8 +222,8 @@ impl Statistics {
}
/// Summary statistics for a column.
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq)]
pub struct StatValues<T: PartialEq + PartialOrd + Debug + Display + Clone> {
#[derive(Debug, Deserialize, Serialize, Clone, PartialEq, Default)]
pub struct StatValues<T> {
pub min: T,
pub max: T,
/// number of non-nil values in this column
@ -232,51 +232,38 @@ pub struct StatValues<T: PartialEq + PartialOrd + Debug + Display + Clone> {
impl<T> StatValues<T>
where
T: PartialEq + PartialOrd + Debug + Display + Clone,
T: Default + Clone,
{
pub fn new(starting_value: T) -> Self {
pub fn new() -> Self {
Self::default()
}
pub fn new_with_value(starting_value: T) -> Self {
Self {
min: starting_value.clone(),
max: starting_value,
count: 1,
}
}
}
impl<T> StatValues<T> {
/// Updates the statistics, keeping min and max and incrementing the count.
pub fn update(&mut self, other: T) {
///
/// The type plumbing exists to allow calling with &str on a StatValues<String>
pub fn update<U: ?Sized>(&mut self, other: &U)
where
T: Borrow<U>,
U: ToOwned<Owned = T> + PartialOrd,
{
self.count += 1;
let set_min = self.min > other;
let set_max = self.max < other;
match (set_min, set_max) {
(true, true) => {
self.min = other.clone();
self.max = other;
}
(true, false) => {
self.min = other;
}
(false, true) => {
self.max = other;
}
(false, false) => (),
}
}
if self.count == 1 || self.min.borrow() > other {
self.min = other.to_owned();
}
impl StatValues<String> {
/// Function for string stats to avoid allocating if we're not updating min
/// or max
pub fn update_string(stats: &mut Self, other: &str) {
stats.count += 1;
if stats.min.as_str() > other {
stats.min = other.to_string();
}
if stats.max.as_str() < other {
stats.max = other.to_string();
if self.count == 1 || self.max.borrow() < other {
self.max = other.to_owned();
}
}
}
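The Borrow/ToOwned bounds on update are what let it subsume the old update_string: a StatValues<String> can be probed with a plain &str and only allocates when min or max actually changes. A standalone sketch of the same trait plumbing (illustrative names, not IOx code):

use std::borrow::Borrow;

// `current` owns a T; `candidate` is any borrowed form U of it, so a String
// statistic can be compared against a &str without allocating first.
fn should_replace<T, U: ?Sized>(current: &T, candidate: &U) -> bool
where
    T: Borrow<U>,
    U: PartialOrd,
{
    current.borrow() > candidate
}

fn main() {
    assert!(should_replace(&String::from("zzz"), "aaa")); // String vs str
    assert!(should_replace(&5_i64, &1_i64)); // T and U can also coincide
}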
@ -287,45 +274,73 @@ mod tests {
#[test]
fn statistics_update() {
let mut stat = StatValues::new(23);
let mut stat = StatValues::new_with_value(23);
assert_eq!(stat.min, 23);
assert_eq!(stat.max, 23);
assert_eq!(stat.count, 1);
stat.update(55);
stat.update(&55);
assert_eq!(stat.min, 23);
assert_eq!(stat.max, 55);
assert_eq!(stat.count, 2);
stat.update(6);
stat.update(&6);
assert_eq!(stat.min, 6);
assert_eq!(stat.max, 55);
assert_eq!(stat.count, 3);
stat.update(30);
stat.update(&30);
assert_eq!(stat.min, 6);
assert_eq!(stat.max, 55);
assert_eq!(stat.count, 4);
}
#[test]
fn statistics_default() {
let mut stat = StatValues::new();
assert_eq!(stat.min, 0);
assert_eq!(stat.max, 0);
assert_eq!(stat.count, 0);
stat.update(&55);
assert_eq!(stat.min, 55);
assert_eq!(stat.max, 55);
assert_eq!(stat.count, 1);
let mut stat = StatValues::new();
assert_eq!(&stat.min, "");
assert_eq!(&stat.max, "");
assert_eq!(stat.count, 0);
stat.update("cupcakes");
assert_eq!(&stat.min, "cupcakes");
assert_eq!(&stat.max, "cupcakes");
assert_eq!(stat.count, 1);
stat.update("woo");
assert_eq!(&stat.min, "cupcakes");
assert_eq!(&stat.max, "woo");
assert_eq!(stat.count, 2);
}
#[test]
fn update_string() {
let mut stat = StatValues::new("bbb".to_string());
let mut stat = StatValues::new_with_value("bbb".to_string());
assert_eq!(stat.min, "bbb".to_string());
assert_eq!(stat.max, "bbb".to_string());
assert_eq!(stat.count, 1);
StatValues::update_string(&mut stat, "aaa");
stat.update("aaa");
assert_eq!(stat.min, "aaa".to_string());
assert_eq!(stat.max, "bbb".to_string());
assert_eq!(stat.count, 2);
StatValues::update_string(&mut stat, "z");
stat.update("z");
assert_eq!(stat.min, "aaa".to_string());
assert_eq!(stat.max, "z".to_string());
assert_eq!(stat.count, 3);
StatValues::update_string(&mut stat, "p");
stat.update("p");
assert_eq!(stat.min, "aaa".to_string());
assert_eq!(stat.max, "z".to_string());
assert_eq!(stat.count, 4);
@ -333,22 +348,22 @@ mod tests {
#[test]
fn table_update_from() {
let mut string_stats = StatValues::new("foo".to_string());
string_stats.update("bar".to_string());
let mut string_stats = StatValues::new_with_value("foo".to_string());
string_stats.update("bar");
let string_col = ColumnSummary {
name: "string".to_string(),
stats: Statistics::String(string_stats),
};
let mut int_stats = StatValues::new(1);
int_stats.update(5);
let mut int_stats = StatValues::new_with_value(1);
int_stats.update(&5);
let int_col = ColumnSummary {
name: "int".to_string(),
stats: Statistics::I64(int_stats),
};
let mut float_stats = StatValues::new(9.1);
float_stats.update(1.3);
let mut float_stats = StatValues::new_with_value(9.1);
float_stats.update(&1.3);
let float_col = ColumnSummary {
name: "float".to_string(),
stats: Statistics::F64(float_stats),
@ -359,15 +374,15 @@ mod tests {
columns: vec![string_col, int_col, float_col],
};
let mut string_stats = StatValues::new("aaa".to_string());
string_stats.update("zzz".to_string());
let mut string_stats = StatValues::new_with_value("aaa".to_string());
string_stats.update("zzz");
let string_col = ColumnSummary {
name: "string".to_string(),
stats: Statistics::String(string_stats),
};
let mut int_stats = StatValues::new(3);
int_stats.update(9);
let mut int_stats = StatValues::new_with_value(3);
int_stats.update(&9);
let int_col = ColumnSummary {
name: "int".to_string(),
stats: Statistics::I64(int_stats),
@ -446,15 +461,15 @@ mod tests {
#[test]
fn from_table_summaries() {
let mut string_stats = StatValues::new("foo".to_string());
string_stats.update("bar".to_string());
let mut string_stats = StatValues::new_with_value("foo".to_string());
string_stats.update("bar");
let string_col = ColumnSummary {
name: "string".to_string(),
stats: Statistics::String(string_stats),
};
let mut int_stats = StatValues::new(1);
int_stats.update(5);
let mut int_stats = StatValues::new_with_value(1);
int_stats.update(&5);
let int_col = ColumnSummary {
name: "int".to_string(),
stats: Statistics::I64(int_stats),
@ -467,7 +482,7 @@ mod tests {
let int_col = ColumnSummary {
name: "int".to_string(),
stats: Statistics::I64(StatValues::new(10)),
stats: Statistics::I64(StatValues::new_with_value(10)),
};
let table_b = TableSummary {
name: "b".to_string(),
@ -481,7 +496,7 @@ mod tests {
let int_col = ColumnSummary {
name: "int".to_string(),
stats: Statistics::I64(StatValues::new(203)),
stats: Statistics::I64(StatValues::new_with_value(203)),
};
let table_b_2 = TableSummary {
name: "b".to_string(),


@ -1,7 +1,7 @@
//! This module contains helper code for building `Entry` and `SequencedEntry`
//! from line protocol and the `DatabaseRules` configuration.
use crate::schema::TIME_COLUMN_NAME;
use crate::schema::{InfluxColumnType, InfluxFieldType, TIME_COLUMN_NAME};
use data_types::database_rules::{Error as DataError, Partitioner, ShardId, Sharder, WriterId};
use generated_types::entry as entry_fb;
use influxdb_line_protocol::{FieldValue, ParsedLine};
@ -445,6 +445,37 @@ impl<'a> Column<'a> {
.expect("name must be present in flatbuffers Column")
}
pub fn inner(&self) -> &entry_fb::Column<'a> {
&self.fb
}
pub fn influx_type(&self) -> InfluxColumnType {
match (self.fb.values_type(), self.fb.logical_column_type()) {
(entry_fb::ColumnValues::BoolValues, entry_fb::LogicalColumnType::Field) => {
InfluxColumnType::Field(InfluxFieldType::Boolean)
}
(entry_fb::ColumnValues::U64Values, entry_fb::LogicalColumnType::Field) => {
InfluxColumnType::Field(InfluxFieldType::UInteger)
}
(entry_fb::ColumnValues::F64Values, entry_fb::LogicalColumnType::Field) => {
InfluxColumnType::Field(InfluxFieldType::Float)
}
(entry_fb::ColumnValues::I64Values, entry_fb::LogicalColumnType::Field) => {
InfluxColumnType::Field(InfluxFieldType::Integer)
}
(entry_fb::ColumnValues::StringValues, entry_fb::LogicalColumnType::Tag) => {
InfluxColumnType::Tag
}
(entry_fb::ColumnValues::StringValues, entry_fb::LogicalColumnType::Field) => {
InfluxColumnType::Field(InfluxFieldType::String)
}
(entry_fb::ColumnValues::I64Values, entry_fb::LogicalColumnType::Time) => {
InfluxColumnType::Timestamp
}
_ => unreachable!(),
}
}
pub fn logical_type(&self) -> entry_fb::LogicalColumnType {
self.fb.logical_column_type()
}


@ -617,10 +617,10 @@ impl From<&InfluxColumnType> for &'static str {
}
}
impl ToString for InfluxColumnType {
fn to_string(&self) -> String {
impl std::fmt::Display for InfluxColumnType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s: &str = self.into();
s.into()
write!(f, "{}", s)
}
}
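Implementing Display rather than ToString is the idiomatic change here: std's blanket impl<T: Display> ToString for T keeps every existing to_string() caller working, and the type becomes usable in format strings. A minimal illustration (the rendered text comes from the existing From<&InfluxColumnType> for &'static str impl):

// Both forms now work; only Display had to be written by hand.
let rendered = format!("{}", InfluxColumnType::Tag);
assert_eq!(rendered, InfluxColumnType::Tag.to_string());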


@ -95,6 +95,15 @@ impl SchemaBuilder {
)
}
/// Add a new column with the specified InfluxDB data model type
pub fn influx_column(self, column_name: &str, column_type: InfluxColumnType) -> Self {
match column_type {
InfluxColumnType::Tag => self.tag(column_name),
InfluxColumnType::Field(field) => self.field(column_name, field.into()),
InfluxColumnType::Timestamp => self.timestamp(),
}
}
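This lets a schema be rebuilt directly from each column's InfluxColumnType, as the rewritten schema_impl below does, instead of matching on storage types at every call site. A usage sketch against the builder API shown in this diff:

use internal_types::schema::builder::SchemaBuilder;
use internal_types::schema::{InfluxColumnType, InfluxFieldType};

// One builder call per column; tag/field/timestamp dispatch happens inside.
let schema = SchemaBuilder::new()
    .influx_column("host", InfluxColumnType::Tag)
    .influx_column("usage", InfluxColumnType::Field(InfluxFieldType::Float))
    .influx_column("time", InfluxColumnType::Timestamp)
    .build()
    .unwrap();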
/// Add a new nullable field column with the specified Arrow datatype.
pub fn field(self, column_name: &str, arrow_type: ArrowDataType) -> Self {
let influxdb_column_type = arrow_type


@ -35,6 +35,7 @@ tracker = { path = "../tracker" }
test_helpers = { path = "../test_helpers" }
criterion = "0.3"
flate2 = "1.0.20"
rand = "0.8.3"
[features]
default = []


@ -0,0 +1,264 @@
use arrow_deps::arrow::buffer::Buffer;
/// An arrow-compatible mutable bitset implementation
///
/// Note: this currently operates on individual bytes at a time;
/// it could be optimised to instead operate on usize blocks
#[derive(Debug)]
pub struct BitSet {
/// The underlying data
///
/// Data is stored least significant bit first within each byte
buffer: Vec<u8>,
/// The length of this mask in bits
len: usize,
}
impl BitSet {
/// Creates a new BitSet
pub fn new() -> Self {
Self {
buffer: Default::default(),
len: 0,
}
}
/// Appends `count` unset bits
pub fn append_unset(&mut self, count: usize) {
self.len += count;
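// (self.len + 7) >> 3 is ceil(self.len / 8): bits rounded up to whole bytes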
let new_buf_len = (self.len + 7) >> 3;
self.buffer.resize(new_buf_len, 0);
}
/// Appends `count` boolean values from the slice of packed bits
pub fn append_bits(&mut self, count: usize, to_set: &[u8]) {
let new_len = self.len + count;
let new_buf_len = (new_len + 7) >> 3;
self.buffer.reserve(new_buf_len - self.buffer.len());
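// `whole_bytes` complete input bytes, `overrun` leftover input bits,
// `skew` bits already occupied in the last byte of the buffer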
let whole_bytes = count >> 3;
let overrun = count & 7;
let skew = self.len & 7;
if skew == 0 {
self.buffer.extend_from_slice(&to_set[..whole_bytes]);
if overrun > 0 {
let masked = to_set[whole_bytes] & ((1 << overrun) - 1);
self.buffer.push(masked)
}
self.len = new_len;
debug_assert_eq!(self.buffer.len(), new_buf_len);
return;
}
for to_set_byte in &to_set[..whole_bytes] {
let low = *to_set_byte << skew;
let high = *to_set_byte >> (8 - skew);
*self.buffer.last_mut().unwrap() |= low;
self.buffer.push(high);
}
if overrun > 0 {
let masked = to_set[whole_bytes] & ((1 << overrun) - 1);
let low = masked << skew;
*self.buffer.last_mut().unwrap() |= low;
if overrun > 8 - skew {
let high = masked >> (8 - skew);
self.buffer.push(high)
}
}
self.len = new_len;
debug_assert_eq!(self.buffer.len(), new_buf_len);
}
/// Sets a given bit
pub fn set(&mut self, idx: usize) {
let byte_idx = idx >> 3;
let bit_idx = idx & 7;
self.buffer[byte_idx] |= 1 << bit_idx;
}
/// Returns if the given index is set
pub fn get(&self, idx: usize) -> bool {
let byte_idx = idx >> 3;
let bit_idx = idx & 7;
(self.buffer[byte_idx] >> bit_idx) & 1 != 0
}
/// Converts this BitSet to a buffer compatible with arrow's boolean encoding
pub fn to_arrow(&self) -> Buffer {
Buffer::from(&self.buffer)
}
/// Returns the number of values stored in the bitset
pub fn len(&self) -> usize {
self.len
}
/// Returns the number of bytes used by this bitset
pub fn byte_len(&self) -> usize {
self.buffer.len()
}
}
/// Returns an iterator over set bit positions in increasing order
pub fn iter_set_positions(bytes: &[u8]) -> impl Iterator<Item = usize> + '_ {
let mut byte_idx = 0;
let mut in_progress = bytes.get(0).cloned().unwrap_or(0);
std::iter::from_fn(move || loop {
if in_progress != 0 {
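// trailing_zeros finds the lowest set bit; the XOR below clears it so
// the next iteration sees the following one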
let bit_pos = in_progress.trailing_zeros();
in_progress ^= 1 << bit_pos;
return Some((byte_idx << 3) + (bit_pos as usize));
}
byte_idx += 1;
in_progress = *bytes.get(byte_idx)?;
})
}
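A worked example of the skewed append path above, under the LSB-first layout: appending 3 bits onto a bitset that already holds 5 leaves skew = 5, so the masked input byte is shifted into the tail byte, and no spill byte is needed because overrun <= 8 - skew (the numbers here are illustrative):

fn main() {
    let mut buffer: Vec<u8> = vec![0b0001_1111]; // 5 bits already appended
    let len = 5_usize;
    let (count, to_set) = (3_usize, 0b0000_0010_u8); // append bits 0, 1, 0

    let skew = len & 7; // 5 bits used in the last byte
    let masked = to_set & ((1 << count) - 1); // keep only `count` bits
    *buffer.last_mut().unwrap() |= masked << skew;
    if count > 8 - skew {
        // only needed when the new bits spill into a second byte
        buffer.push(masked >> (8 - skew));
    }

    assert_eq!(buffer, vec![0b0101_1111]); // bit 6 of the bitset is now set
}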
#[cfg(test)]
mod tests {
use super::*;
use arrow_deps::arrow::array::BooleanBufferBuilder;
use rand::RngCore;
/// Computes a compacted representation of a given bool array
fn compact_bools(bools: &[bool]) -> Vec<u8> {
bools
.chunks(8)
.map(|x| {
let mut collect = 0_u8;
for (idx, set) in x.iter().enumerate() {
if *set {
collect |= 1 << idx
}
}
collect
})
.collect()
}
fn iter_set_bools(bools: &[bool]) -> impl Iterator<Item = usize> + '_ {
bools.iter().enumerate().filter_map(|(x, y)| y.then(|| x))
}
#[test]
fn test_compact_bools() {
let bools = &[
false, false, true, true, false, false, true, false, true, false,
];
let collected = compact_bools(bools);
let indexes: Vec<_> = iter_set_bools(bools).collect();
assert_eq!(collected.as_slice(), &[0b01001100, 0b00000001]);
assert_eq!(indexes.as_slice(), &[2, 3, 6, 8])
}
#[test]
fn test_bit_mask() {
let mut mask = BitSet::new();
mask.append_bits(8, &[0b11111111]);
let d1 = mask.buffer.clone();
mask.append_bits(3, &[0b01010010]);
let d2 = mask.buffer.clone();
mask.append_bits(5, &[0b00010100]);
let d3 = mask.buffer.clone();
mask.append_bits(2, &[0b11110010]);
let d4 = mask.buffer.clone();
mask.append_bits(15, &[0b11011010, 0b01010101]);
let d5 = mask.buffer.clone();
assert_eq!(d1.as_slice(), &[0b11111111]);
assert_eq!(d2.as_slice(), &[0b11111111, 0b00000010]);
assert_eq!(d3.as_slice(), &[0b11111111, 0b10100010]);
assert_eq!(d4.as_slice(), &[0b11111111, 0b10100010, 0b00000010]);
assert_eq!(
d5.as_slice(),
&[0b11111111, 0b10100010, 0b01101010, 0b01010111, 0b00000001]
);
assert!(mask.get(0));
assert!(!mask.get(8));
assert!(mask.get(9));
assert!(mask.get(19));
}
#[test]
fn test_bit_mask_all_set() {
let mut mask = BitSet::new();
let mut all_bools = vec![];
let mut rng = rand::thread_rng();
for _ in 0..100 {
let mask_length = (rng.next_u32() % 50) as usize;
let bools: Vec<_> = std::iter::repeat(true).take(mask_length).collect();
let collected = compact_bools(&bools);
mask.append_bits(mask_length, &collected);
all_bools.extend_from_slice(&bools);
}
let collected = compact_bools(&all_bools);
assert_eq!(mask.buffer, collected);
let expected_indexes: Vec<_> = iter_set_bools(&all_bools).collect();
let actual_indexes: Vec<_> = iter_set_positions(&mask.buffer).collect();
assert_eq!(expected_indexes, actual_indexes);
}
#[test]
fn test_bit_mask_fuzz() {
let mut mask = BitSet::new();
let mut all_bools = vec![];
let mut rng = rand::thread_rng();
for _ in 0..100 {
let mask_length = (rng.next_u32() % 50) as usize;
let bools: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0))
.take(mask_length)
.collect();
let collected = compact_bools(&bools);
mask.append_bits(mask_length, &collected);
all_bools.extend_from_slice(&bools);
}
let collected = compact_bools(&all_bools);
assert_eq!(mask.buffer, collected);
let expected_indexes: Vec<_> = iter_set_bools(&all_bools).collect();
let actual_indexes: Vec<_> = iter_set_positions(&mask.buffer).collect();
assert_eq!(expected_indexes, actual_indexes);
for index in actual_indexes {
assert!(mask.get(index));
}
}
#[test]
fn test_arrow_compat() {
let bools = &[
false, false, true, true, false, false, true, false, true, false, false, true,
];
let mut builder = BooleanBufferBuilder::new(bools.len());
builder.append_slice(bools);
let buffer = builder.finish();
let collected = compact_bools(bools);
let mut mask = BitSet::new();
mask.append_bits(bools.len(), &collected);
let mask_buffer = mask.to_arrow();
assert_eq!(collected.as_slice(), buffer.as_slice());
assert_eq!(buffer.as_slice(), mask_buffer.as_slice());
}
}


@ -8,6 +8,7 @@ use internal_types::selection::Selection;
use snafu::{OptionExt, ResultExt, Snafu};
use super::Chunk;
use data_types::partition_metadata::Statistics;
#[derive(Debug, Snafu)]
pub enum Error {
@ -64,11 +65,14 @@ impl ChunkSnapshot {
.lookup_value(TIME_COLUMN_NAME)
.ok()
.and_then(|column_id| {
table.column(column_id).ok().and_then(|column| {
// TimestampRange has an exclusive upper bound
column
.get_i64_stats()
.map(|x| TimestampRange::new(x.min, x.max + 1))
table
.column(column_id)
.ok()
.and_then(|column| match column.stats() {
Statistics::I64(stats) => {
Some(TimestampRange::new(stats.min, stats.max + 1))
}
_ => None,
})
});


@ -1,341 +1,253 @@
use snafu::Snafu;
use snafu::{ensure, Snafu};
use crate::dictionary::{Dictionary, DID};
use data_types::partition_metadata::StatValues;
use generated_types::entry::LogicalColumnType;
use internal_types::entry::TypedValuesIterator;
use data_types::partition_metadata::{StatValues, Statistics};
use internal_types::entry::Column as EntryColumn;
use crate::bitset::{iter_set_positions, BitSet};
use arrow_deps::arrow::array::{
ArrayDataBuilder, ArrayRef, BooleanArray, Float64Array, Int64Array, StringArray,
TimestampNanosecondArray, UInt64Array,
};
use arrow_deps::arrow::datatypes::DataType;
use internal_types::schema::{InfluxColumnType, InfluxFieldType, TIME_DATA_TYPE};
use std::iter::FromIterator;
use std::mem;
use std::sync::Arc;
#[derive(Debug, Snafu)]
#[allow(missing_copy_implementations)]
pub enum Error {
#[snafu(display("Don't know how to insert a column of type {}", inserted_value_type))]
UnknownColumnType { inserted_value_type: String },
#[snafu(display(
"Unable to insert {} type into a column of {}",
inserted_value_type,
existing_column_type
))]
#[snafu(display("Unable to insert {} type into a column of {}", inserted, existing,))]
TypeMismatch {
existing_column_type: String,
inserted_value_type: String,
existing: InfluxColumnType,
inserted: InfluxColumnType,
},
#[snafu(display("InternalError: Applying i64 range on a column with non-i64 type"))]
InternalTypeMismatchForTimePredicate,
#[snafu(display(
"Invalid null mask, expected to be {} bytes but was {}",
expected_bytes,
actual_bytes
))]
InvalidNullMask {
expected_bytes: usize,
actual_bytes: usize,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
/// Stores the actual data for columns in a chunk along with summary
/// statistics
#[derive(Debug, Clone)]
pub enum Column {
F64(Vec<Option<f64>>, StatValues<f64>),
I64(Vec<Option<i64>>, StatValues<i64>),
U64(Vec<Option<u64>>, StatValues<u64>),
String(Vec<Option<String>>, StatValues<String>),
Bool(Vec<Option<bool>>, StatValues<bool>),
#[derive(Debug)]
pub struct Column {
influx_type: InfluxColumnType,
valid: BitSet,
data: ColumnData,
}
#[derive(Debug)]
pub enum ColumnData {
F64(Vec<f64>, StatValues<f64>),
I64(Vec<i64>, StatValues<i64>),
U64(Vec<u64>, StatValues<u64>),
String(Vec<String>, StatValues<String>),
Bool(BitSet, StatValues<bool>),
Tag(Vec<DID>, StatValues<String>),
}
impl Column {
/// Initializes a new column from typed values (the column of a table write
/// batch on an Entry). Will initialize the stats with the first
/// non-null value and update with any other non-null values included.
pub fn new_from_typed_values(
dictionary: &mut Dictionary,
row_count: usize,
logical_type: LogicalColumnType,
values: TypedValuesIterator<'_>,
) -> Self {
match values {
TypedValuesIterator::String(vals) => match logical_type {
LogicalColumnType::Tag => {
let mut tag_values = vec![DID::invalid(); row_count];
let mut stats: Option<StatValues<String>> = None;
pub fn new(row_count: usize, column_type: InfluxColumnType) -> Self {
let mut valid = BitSet::new();
valid.append_unset(row_count);
let mut added_tag_values: Vec<_> = vals
.map(|tag| match tag {
Some(tag) => {
match stats.as_mut() {
Some(s) => StatValues::update_string(s, tag),
None => {
stats = Some(StatValues::new(tag.to_string()));
let data = match column_type {
InfluxColumnType::Field(InfluxFieldType::Boolean) => {
let mut data = BitSet::new();
data.append_unset(row_count);
ColumnData::Bool(data, StatValues::new())
}
InfluxColumnType::Field(InfluxFieldType::UInteger) => {
ColumnData::U64(vec![0; row_count], StatValues::new())
}
dictionary.lookup_value_or_insert(tag)
InfluxColumnType::Field(InfluxFieldType::Float) => {
ColumnData::F64(vec![0.0; row_count], StatValues::new())
}
None => DID::invalid(),
})
.collect();
tag_values.append(&mut added_tag_values);
Self::Tag(
tag_values,
stats.expect("can't insert tag column with no values"),
)
InfluxColumnType::Field(InfluxFieldType::Integer) | InfluxColumnType::Timestamp => {
ColumnData::I64(vec![0; row_count], StatValues::new())
}
LogicalColumnType::Field => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<String>> = None;
for value in vals {
match value {
Some(v) => {
match stats.as_mut() {
Some(s) => StatValues::update_string(s, v),
None => stats = Some(StatValues::new(v.to_string())),
InfluxColumnType::Field(InfluxFieldType::String) => {
ColumnData::String(vec![String::new(); row_count], StatValues::new())
}
values.push(Some(v.to_string()));
InfluxColumnType::Tag => {
ColumnData::Tag(vec![DID::invalid(); row_count], StatValues::new())
}
None => values.push(None),
}
}
Self::String(
values,
stats.expect("can't insert string column with no values"),
)
}
_ => panic!("unsupported!"),
},
TypedValuesIterator::I64(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<i64>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::I64(
values,
stats.expect("can't insert i64 column with no values"),
)
}
TypedValuesIterator::F64(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<f64>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::F64(
values,
stats.expect("can't insert f64 column with no values"),
)
}
TypedValuesIterator::U64(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<u64>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::U64(
values,
stats.expect("can't insert u64 column with no values"),
)
}
TypedValuesIterator::Bool(vals) => {
let mut values = vec![None; row_count];
let mut stats: Option<StatValues<bool>> = None;
for v in vals {
if let Some(val) = v {
match stats.as_mut() {
Some(s) => s.update(val),
None => stats = Some(StatValues::new(val)),
}
}
values.push(v);
}
Self::Bool(
values,
stats.expect("can't insert bool column with no values"),
)
}
}
}
/// Pushes typed values (the column from a table write batch on an Entry).
/// Updates statistics for any non-null values.
pub fn push_typed_values(
&mut self,
dictionary: &mut Dictionary,
logical_type: LogicalColumnType,
values: TypedValuesIterator<'_>,
) -> Result<()> {
match (self, values) {
(Self::Bool(col, stats), TypedValuesIterator::Bool(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
Self {
influx_type: column_type,
valid,
data,
}
}
(Self::I64(col, stats), TypedValuesIterator::I64(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::F64(col, stats), TypedValuesIterator::F64(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::U64(col, stats), TypedValuesIterator::U64(values)) => {
for val in values {
if let Some(v) = val {
stats.update(v)
};
col.push(val);
}
}
(Self::String(col, stats), TypedValuesIterator::String(values)) => {
if logical_type != LogicalColumnType::Field {
pub fn validate_schema(&self, entry: &EntryColumn<'_>) -> Result<()> {
let entry_type = entry.influx_type();
ensure!(
entry_type == self.influx_type,
TypeMismatch {
existing_column_type: "String",
inserted_value_type: "tag",
}
.fail()?;
}
for val in values {
match val {
Some(v) => {
StatValues::update_string(stats, v);
col.push(Some(v.to_string()));
}
None => col.push(None),
}
}
}
(Self::Tag(col, stats), TypedValuesIterator::String(values)) => {
if logical_type != LogicalColumnType::Tag {
TypeMismatch {
existing_column_type: "tag",
inserted_value_type: "String",
}
.fail()?;
}
for val in values {
match val {
Some(v) => {
StatValues::update_string(stats, v);
let id = dictionary.lookup_value_or_insert(v);
col.push(id);
}
None => col.push(DID::invalid()),
}
}
}
(existing, values) => TypeMismatch {
existing_column_type: existing.type_description(),
inserted_value_type: values.type_description(),
}
.fail()?,
existing: self.influx_type,
inserted: entry_type
}
);
Ok(())
}
/// Pushes None values onto the column until its length equals the value
/// passed in
pub fn influx_type(&self) -> InfluxColumnType {
self.influx_type
}
pub fn append(&mut self, entry: &EntryColumn<'_>, dictionary: &mut Dictionary) -> Result<()> {
self.validate_schema(entry)?;
let row_count = entry.row_count;
if row_count == 0 {
return Ok(());
}
let mask = construct_valid_mask(entry)?;
match &mut self.data {
ColumnData::Bool(col_data, stats) => {
let entry_data = entry
.inner()
.values_as_bool_values()
.expect("invalid flatbuffer")
.values()
.expect("invalid payload");
let data_offset = col_data.len();
col_data.append_unset(row_count);
let initial_non_null_count = stats.count;
for (idx, value) in iter_set_positions(&mask).zip(entry_data) {
stats.update(value);
if *value {
col_data.set(data_offset + idx);
}
}
assert_eq!(
stats.count - initial_non_null_count,
entry_data.len() as u64
);
}
ColumnData::U64(col_data, stats) => {
let entry_data = entry
.inner()
.values_as_u64values()
.expect("invalid flatbuffer")
.values()
.expect("invalid payload")
.into_iter();
handle_write(row_count, &mask, entry_data, col_data, stats);
}
ColumnData::F64(col_data, stats) => {
let entry_data = entry
.inner()
.values_as_f64values()
.expect("invalid flatbuffer")
.values()
.expect("invalid payload")
.into_iter();
handle_write(row_count, &mask, entry_data, col_data, stats);
}
ColumnData::I64(col_data, stats) => {
let entry_data = entry
.inner()
.values_as_i64values()
.expect("invalid flatbuffer")
.values()
.expect("invalid payload")
.into_iter();
handle_write(row_count, &mask, entry_data, col_data, stats);
}
ColumnData::String(col_data, stats) => {
let entry_data = entry
.inner()
.values_as_string_values()
.expect("invalid flatbuffer")
.values()
.expect("invalid payload")
.into_iter()
.map(ToString::to_string);
handle_write(row_count, &mask, entry_data, col_data, stats);
}
ColumnData::Tag(col_data, stats) => {
let entry_data = entry
.inner()
.values_as_string_values()
.expect("invalid flatbuffer")
.values()
.expect("invalid payload");
let data_offset = col_data.len();
col_data.resize(data_offset + row_count, DID::invalid());
let initial_non_null_count = stats.count;
let to_add = entry_data.len();
for (idx, value) in iter_set_positions(&mask).zip(entry_data) {
stats.update(value);
col_data[data_offset + idx] = dictionary.lookup_value_or_insert(value);
}
assert_eq!(stats.count - initial_non_null_count, to_add as u64);
}
};
self.valid.append_bits(entry.row_count, &mask);
Ok(())
}
pub fn push_nulls_to_len(&mut self, len: usize) {
match self {
Self::Tag(vals, _) => {
if len > vals.len() {
vals.resize(len, DID::invalid());
}
}
Self::I64(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::F64(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::U64(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::Bool(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
}
Self::String(vals, _) => {
if len > vals.len() {
vals.resize(len, None);
}
if self.valid.len() == len {
return;
}
assert!(len > self.valid.len(), "cannot shrink column");
let delta = len - self.valid.len();
self.valid.append_unset(delta);
match &mut self.data {
ColumnData::F64(data, _) => data.resize(len, 0.),
ColumnData::I64(data, _) => data.resize(len, 0),
ColumnData::U64(data, _) => data.resize(len, 0),
ColumnData::String(data, _) => data.resize(len, String::new()),
ColumnData::Bool(data, _) => data.append_unset(delta),
ColumnData::Tag(data, _) => data.resize(len, DID::invalid()),
}
}
pub fn len(&self) -> usize {
match self {
Self::F64(v, _) => v.len(),
Self::I64(v, _) => v.len(),
Self::U64(v, _) => v.len(),
Self::String(v, _) => v.len(),
Self::Bool(v, _) => v.len(),
Self::Tag(v, _) => v.len(),
}
self.valid.len()
}
pub fn type_description(&self) -> &'static str {
match self {
Self::F64(_, _) => "f64",
Self::I64(_, _) => "i64",
Self::U64(_, _) => "u64",
Self::String(_, _) => "String",
Self::Bool(_, _) => "bool",
Self::Tag(_, _) => "tag",
pub fn stats(&self) -> Statistics {
match &self.data {
ColumnData::F64(_, stats) => Statistics::F64(stats.clone()),
ColumnData::I64(_, stats) => Statistics::I64(stats.clone()),
ColumnData::U64(_, stats) => Statistics::U64(stats.clone()),
ColumnData::Bool(_, stats) => Statistics::Bool(stats.clone()),
ColumnData::String(_, stats) | ColumnData::Tag(_, stats) => {
Statistics::String(stats.clone())
}
}
pub fn get_i64_stats(&self) -> Option<StatValues<i64>> {
match self {
Self::I64(_, values) => Some(values.clone()),
_ => None,
}
}
/// The approximate memory size of the data in the column. Note that
@ -343,27 +255,150 @@ impl Column {
/// the dictionary size in the chunk that holds the table that has this
/// column. The size returned here is only for their identifiers.
pub fn size(&self) -> usize {
match self {
Self::F64(v, stats) => {
mem::size_of::<Option<f64>>() * v.len() + mem::size_of_val(&stats)
}
Self::I64(v, stats) => {
mem::size_of::<Option<i64>>() * v.len() + mem::size_of_val(&stats)
}
Self::U64(v, stats) => {
mem::size_of::<Option<u64>>() * v.len() + mem::size_of_val(&stats)
}
Self::Bool(v, stats) => {
mem::size_of::<Option<bool>>() * v.len() + mem::size_of_val(&stats)
}
Self::Tag(v, stats) => mem::size_of::<DID>() * v.len() + mem::size_of_val(&stats),
Self::String(v, stats) => {
let string_bytes_size = v
.iter()
.fold(0, |acc, val| acc + val.as_ref().map_or(0, |s| s.len()));
let vec_pointer_sizes = mem::size_of::<Option<String>>() * v.len();
let data_size = match &self.data {
ColumnData::F64(v, stats) => mem::size_of::<f64>() * v.len() + mem::size_of_val(&stats),
ColumnData::I64(v, stats) => mem::size_of::<i64>() * v.len() + mem::size_of_val(&stats),
ColumnData::U64(v, stats) => mem::size_of::<u64>() * v.len() + mem::size_of_val(&stats),
ColumnData::Bool(v, stats) => v.byte_len() + mem::size_of_val(&stats),
ColumnData::Tag(v, stats) => mem::size_of::<DID>() * v.len() + mem::size_of_val(&stats),
ColumnData::String(v, stats) => {
let string_bytes_size = v.iter().fold(0, |acc, val| acc + val.len());
let vec_pointer_sizes = mem::size_of::<String>() * v.len();
string_bytes_size + vec_pointer_sizes + mem::size_of_val(&stats)
}
};
data_size + self.valid.byte_len()
}
pub fn to_arrow(&self, dictionary: &Dictionary) -> Result<ArrayRef> {
let nulls = self.valid.to_arrow();
let data: ArrayRef = match &self.data {
ColumnData::F64(data, _) => {
let data = ArrayDataBuilder::new(DataType::Float64)
.len(data.len())
.add_buffer(data.iter().cloned().collect())
.null_bit_buffer(nulls)
.build();
Arc::new(Float64Array::from(data))
}
ColumnData::I64(data, _) => match self.influx_type {
InfluxColumnType::Timestamp => {
let data = ArrayDataBuilder::new(TIME_DATA_TYPE())
.len(data.len())
.add_buffer(data.iter().cloned().collect())
.null_bit_buffer(nulls)
.build();
Arc::new(TimestampNanosecondArray::from(data))
}
InfluxColumnType::Field(InfluxFieldType::Integer) => {
let data = ArrayDataBuilder::new(DataType::Int64)
.len(data.len())
.add_buffer(data.iter().cloned().collect())
.null_bit_buffer(nulls)
.build();
Arc::new(Int64Array::from(data))
}
_ => unreachable!(),
},
ColumnData::U64(data, _) => {
let data = ArrayDataBuilder::new(DataType::UInt64)
.len(data.len())
.add_buffer(data.iter().cloned().collect())
.null_bit_buffer(nulls)
.build();
Arc::new(UInt64Array::from(data))
}
ColumnData::String(data, _) => {
// TODO: Store this closer to the arrow representation
let iter = data
.iter()
.enumerate()
.map(|(idx, value)| self.valid.get(idx).then(|| value.as_str()));
let array = StringArray::from_iter(iter);
Arc::new(array)
}
ColumnData::Bool(data, _) => {
let data = ArrayDataBuilder::new(DataType::Boolean)
.len(data.len())
.add_buffer(data.to_arrow())
.null_bit_buffer(nulls)
.build();
Arc::new(BooleanArray::from(data))
}
ColumnData::Tag(data, _) => {
// TODO: Store this closer to the arrow representation
let iter = data.iter().enumerate().map(|(idx, id)| {
self.valid.get(idx).then(|| {
dictionary
.lookup_id(*id)
.expect("dictionary had mapping for tag value")
})
});
let array = StringArray::from_iter(iter);
Arc::new(array)
}
};
assert_eq!(data.len(), self.len());
Ok(data)
}
}
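The same ArrayDataBuilder pattern in isolation: a dense values buffer plus the bitset's validity buffer yields a nullable Arrow array without materialising any Option values. A sketch against the arrow API as vendored at the time of this commit (later arrow releases make build() fallible and wrap null_bit_buffer's argument in an Option):

// `mask` is a BitSet with bits 0 and 2 set; row 1 is null.
let values = vec![1.0_f64, 0.0, 3.0];
let data = ArrayDataBuilder::new(DataType::Float64)
    .len(values.len())
    .add_buffer(values.iter().cloned().collect())
    .null_bit_buffer(mask.to_arrow())
    .build();
let array = Float64Array::from(data);
assert!(array.is_null(1));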
/// Construct a validity mask from the given column's null mask
fn construct_valid_mask(column: &EntryColumn<'_>) -> Result<Vec<u8>> {
let buf_len = (column.row_count + 7) >> 3;
match column.inner().null_mask() {
Some(data) => {
ensure!(
data.len() == buf_len,
InvalidNullMask {
expected_bytes: buf_len,
actual_bytes: data.len()
}
);
Ok(data
.iter()
.map(|x| {
// Currently the bit mask is backwards
!x.reverse_bits()
})
.collect())
}
None => {
// If no null mask they're all valid
let mut data = Vec::new();
data.resize(buf_len, 0xFF);
Ok(data)
}
}
}
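A worked example of the mask fix-up: the incoming mask marks nulls rather than valid rows, and its bit order is reversed relative to the LSB-first layout used by BitSet, so each byte is bit-reversed and then inverted (the row-to-bit mapping of the input is inferred from the reverse_bits call):

fn main() {
    let null_mask: u8 = 0b1000_0000; // row 0 is null in the incoming mask
    let valid = !null_mask.reverse_bits();
    assert_eq!(valid, 0b1111_1110); // rows 1..=7 valid, LSB-first
}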
/// Writes entry data into a column based on the valid mask
fn handle_write<T, E>(
row_count: usize,
valid_mask: &[u8],
entry_data: E,
col_data: &mut Vec<T>,
stats: &mut StatValues<T>,
) where
T: Clone + Default + PartialOrd,
E: Iterator<Item = T> + ExactSizeIterator,
{
let data_offset = col_data.len();
col_data.resize(data_offset + row_count, Default::default());
let initial_non_null_count = stats.count;
let to_add = entry_data.len();
for (idx, value) in iter_set_positions(valid_mask).zip(entry_data) {
stats.update(&value);
col_data[data_offset + idx] = value;
}
assert_eq!(stats.count - initial_non_null_count, to_add as u64);
}
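The scatter that handle_write performs, reduced to its essentials: entry values arrive dense (one per non-null row) and land at the set positions of the valid mask. The helper below is a simpler stand-in for iter_set_positions, which skips zero bits faster via trailing_zeros:

// Illustrative stand-in for iter_set_positions.
fn set_positions(bytes: &[u8]) -> Vec<usize> {
    (0..bytes.len() * 8)
        .filter(|&i| (bytes[i >> 3] >> (i & 7)) & 1 != 0)
        .collect()
}

fn main() {
    let mask = [0b0000_0101_u8]; // rows 0 and 2 are valid
    let entry_data = vec![10_i64, 30];
    let mut col_data = vec![0_i64; 3]; // row_count pre-sized with defaults

    for (idx, value) in set_positions(&mask).into_iter().zip(entry_data) {
        col_data[idx] = value;
    }
    assert_eq!(col_data, vec![10, 0, 30]);
}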


@ -57,6 +57,7 @@
clippy::clone_on_ref_ptr
)]
mod bitset;
pub mod chunk;
mod column;
mod dictionary;


@ -1,33 +1,20 @@
use std::{cmp, collections::BTreeMap, iter::FromIterator, sync::Arc};
use std::collections::BTreeMap;
use crate::{
column,
column::Column,
dictionary::{Dictionary, Error as DictionaryError, DID},
};
use data_types::{
database_rules::WriterId,
partition_metadata::{ColumnSummary, Statistics},
};
use data_types::{database_rules::WriterId, partition_metadata::ColumnSummary};
use internal_types::{
entry::{self, ClockValue},
schema::{builder::SchemaBuilder, Schema, TIME_COLUMN_NAME},
schema::{builder::SchemaBuilder, Schema},
selection::Selection,
};
use snafu::{OptionExt, ResultExt, Snafu};
use snafu::{ensure, OptionExt, ResultExt, Snafu};
use arrow_deps::{
arrow,
arrow::{
array::{
ArrayRef, BooleanArray, Float64Array, Int64Array, StringArray,
TimestampNanosecondArray, UInt64Array,
},
datatypes::DataType as ArrowDataType,
record_batch::RecordBatch,
},
};
use arrow_deps::{arrow, arrow::record_batch::RecordBatch};
#[derive(Debug, Snafu)]
pub enum Error {
@ -37,30 +24,13 @@ pub enum Error {
source: column::Error,
},
#[snafu(display(
"Expected column {} to be type {} but was {}",
column,
expected_column_type,
actual_column_type
))]
ColumnTypeMismatch {
#[snafu(display("Column {} had {} rows, expected {}", column, expected, actual))]
IncorrectRowCount {
column: String,
expected_column_type: String,
actual_column_type: String,
expected: usize,
actual: usize,
},
#[snafu(display(
"Expected column {} to be a tag but received it as a string field",
column
))]
ExpectedTag { column: String },
#[snafu(display(
"Expected column {} to be a string field but received it as a tag",
column
))]
ExpectedField { column: String },
#[snafu(display("Internal error: unexpected aggregate request for None aggregate",))]
InternalUnexpectedNoneAggregate {},
@ -115,7 +85,7 @@ pub enum Error {
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
#[derive(Debug, Clone)]
#[derive(Debug)]
pub struct Table {
/// Name of the table as a DID in the chunk dictionary
pub id: DID,
@ -164,83 +134,51 @@ impl Table {
_writer_id: WriterId,
columns: Vec<entry::Column<'_>>,
) -> Result<()> {
// get the column ids and validate schema for those that already exist
let columns_with_inserts = columns
.into_iter()
.map(|insert_column| {
let column_id = dictionary.lookup_value_or_insert(insert_column.name());
let values = insert_column.values();
if let Some(c) = self.columns.get(&column_id) {
match (&values, c) {
(entry::TypedValuesIterator::Bool(_), Column::Bool(_, _)) => (),
(entry::TypedValuesIterator::U64(_), Column::U64(_, _)) => (),
(entry::TypedValuesIterator::F64(_), Column::F64(_, _)) => (),
(entry::TypedValuesIterator::I64(_), Column::I64(_, _)) => (),
(entry::TypedValuesIterator::String(_), Column::String(_, _)) => {
if !insert_column.is_field() {
ExpectedField {
column: insert_column.name(),
}
.fail()?
};
}
(entry::TypedValuesIterator::String(_), Column::Tag(_, _)) => {
if !insert_column.is_tag() {
ExpectedTag {
column: insert_column.name(),
}
.fail()?
};
}
_ => ColumnTypeMismatch {
column: insert_column.name(),
expected_column_type: c.type_description(),
actual_column_type: values.type_description(),
}
.fail()?,
}
}
Ok((column_id, insert_column.logical_type(), values))
})
.collect::<Result<Vec<_>>>()?;
let row_count_before_insert = self.row_count();
let additional_rows = columns.first().map(|x| x.row_count).unwrap_or_default();
let final_row_count = row_count_before_insert + additional_rows;
for (column_id, logical_type, values) in columns_with_inserts.into_iter() {
match self.columns.get_mut(&column_id) {
Some(c) => c
.push_typed_values(dictionary, logical_type, values)
.with_context(|| {
let column = dictionary
.lookup_id(column_id)
.expect("column name must be present in dictionary");
ColumnError { column }
})?,
None => {
self.columns.insert(
column_id,
Column::new_from_typed_values(
dictionary,
row_count_before_insert,
logical_type,
values,
),
// get the column ids and validate schema for those that already exist
let column_ids = columns
.iter()
.map(|column| {
ensure!(
column.row_count == additional_rows,
IncorrectRowCount {
column: column.name(),
expected: additional_rows,
actual: column.row_count,
}
);
}
}
let id = dictionary.lookup_value_or_insert(column.name());
if let Some(c) = self.columns.get(&id) {
c.validate_schema(&column).context(ColumnError {
column: column.name(),
})?;
}
// ensure all columns have the same number of rows as the one with the most.
// This adds nulls to the columns that weren't included in this write
let max_row_count = self
Ok(id)
})
.collect::<Result<Vec<_>, _>>()?;
for (fb_column, column_id) in columns.into_iter().zip(column_ids.into_iter()) {
let influx_type = fb_column.influx_type();
let column = self
.columns
.values()
.fold(row_count_before_insert, |max, col| cmp::max(max, col.len()));
.entry(column_id)
.or_insert_with(|| Column::new(row_count_before_insert, influx_type));
column.append(&fb_column, dictionary).context(ColumnError {
column: fb_column.name(),
})?;
assert_eq!(column.len(), final_row_count);
}
for c in self.columns.values_mut() {
c.push_nulls_to_len(max_row_count);
c.push_nulls_to_len(final_row_count);
}
Ok(())
@ -324,27 +262,10 @@ impl Table {
/// Returns the Schema of this table
fn schema_impl(&self, selection: &TableColSelection<'_>) -> Result<Schema> {
let mut schema_builder = SchemaBuilder::new();
for col in &selection.cols {
let column_name = col.column_name;
let column = self.column(col.column_id)?;
schema_builder = match column {
Column::String(_, _) => schema_builder.field(column_name, ArrowDataType::Utf8),
Column::Tag(_, _) => schema_builder.tag(column_name),
Column::F64(_, _) => schema_builder.field(column_name, ArrowDataType::Float64),
Column::I64(_, _) => {
if column_name == TIME_COLUMN_NAME {
schema_builder.timestamp()
} else {
schema_builder.field(column_name, ArrowDataType::Int64)
schema_builder = schema_builder.influx_column(col.column_name, column.influx_type());
}
}
Column::U64(_, _) => schema_builder.field(column_name, ArrowDataType::UInt64),
Column::Bool(_, _) => schema_builder.field(column_name, ArrowDataType::Boolean),
};
}
schema_builder.build().context(InternalSchema)
}
@ -356,60 +277,18 @@ impl Table {
dictionary: &Dictionary,
selection: &TableColSelection<'_>,
) -> Result<RecordBatch> {
let mut columns = Vec::with_capacity(selection.cols.len());
for col in &selection.cols {
let columns = selection
.cols
.iter()
.map(|col| {
let column = self.column(col.column_id)?;
let array: ArrayRef = match column {
Column::String(vals, _) => {
let iter = vals.iter().map(|s| s.as_deref());
let array = StringArray::from_iter(iter);
Arc::new(array)
}
Column::Tag(vals, _) => {
let iter = vals.iter().map(|id| {
if *id == DID::invalid() {
return None;
}
Some(
dictionary
.lookup_id(*id)
.expect("dictionary had mapping for tag value"),
)
});
let array = StringArray::from_iter(iter);
Arc::new(array)
}
Column::F64(vals, _) => {
let array = Float64Array::from_iter(vals.iter());
Arc::new(array)
}
Column::I64(vals, _) => {
if col.column_name == TIME_COLUMN_NAME {
let array = TimestampNanosecondArray::from_iter(vals.iter());
Arc::new(array)
} else {
let array = Int64Array::from_iter(vals.iter());
Arc::new(array)
}
}
Column::U64(vals, _) => {
let array = UInt64Array::from_iter(vals.iter());
Arc::new(array)
}
Column::Bool(vals, _) => {
let array = BooleanArray::from_iter(vals.iter());
Arc::new(array)
}
};
columns.push(array);
}
column.to_arrow(dictionary).context(ColumnError {
column: col.column_name,
})
})
.collect::<Result<Vec<_>>>()?;
let schema = self.schema_impl(selection)?.into();
RecordBatch::try_new(schema, columns).context(ArrowError {})
}
@ -421,19 +300,9 @@ impl Table {
.lookup_id(*column_id)
.expect("column name in dictionary");
let stats = match c {
Column::F64(_, stats) => Statistics::F64(stats.clone()),
Column::I64(_, stats) => Statistics::I64(stats.clone()),
Column::U64(_, stats) => Statistics::U64(stats.clone()),
Column::Bool(_, stats) => Statistics::Bool(stats.clone()),
Column::String(_, stats) | Column::Tag(_, stats) => {
Statistics::String(stats.clone())
}
};
ColumnSummary {
name: column_name.to_string(),
stats,
stats: c.stats(),
}
})
.collect()
@ -461,7 +330,9 @@ impl<'a> TableColSelection<'a> {
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use internal_types::entry::test_helpers::lp_to_entry;
use internal_types::schema::{InfluxColumnType, InfluxFieldType};
use super::*;
@ -476,15 +347,15 @@ mod tests {
];
write_lines_to_table(&mut table, &mut dictionary, lp_lines.clone());
assert_eq!(112, table.size());
assert_eq!(84, table.size());
// doesn't double because of the stats overhead
write_lines_to_table(&mut table, &mut dictionary, lp_lines.clone());
assert_eq!(192, table.size());
assert_eq!(132, table.size());
// now make sure it increased by the same amount minus stats overhead
write_lines_to_table(&mut table, &mut dictionary, lp_lines);
assert_eq!(272, table.size());
assert_eq!(180, table.size());
}
#[test]
@ -588,8 +459,12 @@ mod tests {
assert!(
matches!(
&response,
Error::ExpectedTag {
Error::ColumnError {
column,
source: column::Error::TypeMismatch {
existing: InfluxColumnType::Tag,
inserted: InfluxColumnType::Field(InfluxFieldType::String)
}
} if column == "t1"
),
"didn't match returned error: {:?}",
@ -618,13 +493,13 @@ mod tests {
assert!(
matches!(
&response,
Error::ColumnTypeMismatch {
expected_column_type,
actual_column_type,
column
} if expected_column_type == "i64"
&& actual_column_type == "u64"
&& column == "iv"
Error::ColumnError {
column,
source: column::Error::TypeMismatch {
inserted: InfluxColumnType::Field(InfluxFieldType::UInteger),
existing: InfluxColumnType::Field(InfluxFieldType::Integer)
}
} if column == "iv"
),
"didn't match returned error: {:?}",
response
@ -652,13 +527,13 @@ mod tests {
assert!(
matches!(
&response,
Error::ColumnTypeMismatch {
expected_column_type,
actual_column_type,
column
} if expected_column_type == "f64"
&& actual_column_type == "i64"
&& column == "fv"
Error::ColumnError {
column,
source: column::Error::TypeMismatch {
existing: InfluxColumnType::Field(InfluxFieldType::Float),
inserted: InfluxColumnType::Field(InfluxFieldType::Integer)
}
} if column == "fv"
),
"didn't match returned error: {:?}",
response
@ -686,13 +561,13 @@ mod tests {
assert!(
matches!(
&response,
Error::ColumnTypeMismatch {
expected_column_type,
actual_column_type,
column
} if expected_column_type == "bool"
&& actual_column_type == "f64"
&& column == "bv"
Error::ColumnError {
column,
source: column::Error::TypeMismatch {
existing: InfluxColumnType::Field(InfluxFieldType::Boolean),
inserted: InfluxColumnType::Field(InfluxFieldType::Float)
}
} if column == "bv"
),
"didn't match returned error: {:?}",
response
@ -720,13 +595,13 @@ mod tests {
assert!(
matches!(
&response,
Error::ColumnTypeMismatch {
expected_column_type,
actual_column_type,
column
} if expected_column_type == "String"
&& actual_column_type == "bool"
&& column == "sv"
Error::ColumnError {
column,
source: column::Error::TypeMismatch {
existing: InfluxColumnType::Field(InfluxFieldType::String),
inserted: InfluxColumnType::Field(InfluxFieldType::Boolean),
}
} if column == "sv"
),
"didn't match returned error: {:?}",
response
@ -754,8 +629,12 @@ mod tests {
assert!(
matches!(
&response,
Error::ExpectedField {
column
Error::ColumnError {
column,
source: column::Error::TypeMismatch {
existing: InfluxColumnType::Field(InfluxFieldType::String),
inserted: InfluxColumnType::Tag,
}
} if column == "sv"
),
"didn't match returned error: {:?}",


@ -1599,7 +1599,7 @@ mod tests {
to_arc("cpu"),
0,
ChunkStorage::OpenMutableBuffer,
127,
106,
)];
let size: usize = db
@ -1711,21 +1711,21 @@ mod tests {
to_arc("cpu"),
1,
ChunkStorage::OpenMutableBuffer,
121,
100,
),
ChunkSummary::new_without_timestamps(
to_arc("1970-01-05T15"),
to_arc("cpu"),
0,
ChunkStorage::ClosedMutableBuffer,
157,
129,
),
ChunkSummary::new_without_timestamps(
to_arc("1970-01-05T15"),
to_arc("cpu"),
1,
ChunkStorage::OpenMutableBuffer,
159,
131,
),
];
@ -1735,7 +1735,7 @@ mod tests {
expected, chunk_summaries
);
assert_eq!(db.memory_registries.mutable_buffer.bytes(), 121 + 157 + 159);
assert_eq!(db.memory_registries.mutable_buffer.bytes(), 100 + 129 + 131);
assert_eq!(db.memory_registries.read_buffer.bytes(), 1213);
}


@ -306,11 +306,11 @@ mod tests {
columns: vec![
ColumnSummary {
name: "c1".to_string(),
stats: Statistics::I64(StatValues::new(23)),
stats: Statistics::I64(StatValues::new_with_value(23)),
},
ColumnSummary {
name: "c2".to_string(),
stats: Statistics::I64(StatValues::new(43)),
stats: Statistics::I64(StatValues::new_with_value(43)),
},
],
}],


@ -268,8 +268,8 @@ async fn sql_select_from_system_chunks() {
"+----+---------------+------------+-------------------+-----------------+",
"| id | partition_key | table_name | storage | estimated_bytes |",
"+----+---------------+------------+-------------------+-----------------+",
"| 0 | 1970-01-01T00 | h2o | OpenMutableBuffer | 324 |",
"| 0 | 1970-01-01T00 | o2 | OpenMutableBuffer | 264 |",
"| 0 | 1970-01-01T00 | h2o | OpenMutableBuffer | 257 |",
"| 0 | 1970-01-01T00 | o2 | OpenMutableBuffer | 221 |",
"+----+---------------+------------+-------------------+-----------------+",
];
run_sql_test_case!(

View File

@ -278,7 +278,7 @@ async fn test_chunk_get() {
table_name: "cpu".into(),
id: 0,
storage: ChunkStorage::OpenMutableBuffer as i32,
estimated_bytes: 161,
estimated_bytes: 132,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -288,7 +288,7 @@ async fn test_chunk_get() {
table_name: "disk".into(),
id: 0,
storage: ChunkStorage::OpenMutableBuffer as i32,
estimated_bytes: 127,
estimated_bytes: 114,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,
@ -455,7 +455,7 @@ async fn test_list_partition_chunks() {
table_name: "cpu".into(),
id: 0,
storage: ChunkStorage::OpenMutableBuffer as i32,
estimated_bytes: 161,
estimated_bytes: 132,
time_of_first_write: None,
time_of_last_write: None,
time_closing: None,


@ -191,7 +191,7 @@ async fn test_get_chunks() {
.and(predicate::str::contains(
r#""storage": "OpenMutableBuffer","#,
))
.and(predicate::str::contains(r#""estimated_bytes": 161"#))
.and(predicate::str::contains(r#""estimated_bytes": 132"#))
// Check for a non empty timestamp such as
// "time_of_first_write": "2021-03-30T17:11:10.723866Z",
.and(predicate::str::contains(r#""time_of_first_write": "20"#));