* feat: MutableBatch write API (#2090) (#2724) * chore: fix lint * fix: handle dictionaries with unused mappings * chore: review feedback * chore: further review feedback Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>pull/24376/head
parent
b55ca06fe3
commit
ce0127a1f7
|
@ -2261,6 +2261,7 @@ dependencies = [
|
||||||
"data_types",
|
"data_types",
|
||||||
"entry",
|
"entry",
|
||||||
"hashbrown",
|
"hashbrown",
|
||||||
|
"itertools",
|
||||||
"schema",
|
"schema",
|
||||||
"snafu",
|
"snafu",
|
||||||
]
|
]
|
||||||
|
|
|
@ -440,14 +440,19 @@ where
|
||||||
Self::new_with_distinct(min, max, total_count, null_count, distinct_count)
|
Self::new_with_distinct(min, max, total_count, null_count, distinct_count)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create new statitics with the specified count and null count
|
/// Create new statistics with no values
|
||||||
|
pub fn new_empty() -> Self {
|
||||||
|
Self::new_with_distinct(None, None, 0, 0, None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create new statistics with the specified count and null count
|
||||||
pub fn new(min: Option<T>, max: Option<T>, total_count: u64, null_count: u64) -> Self {
|
pub fn new(min: Option<T>, max: Option<T>, total_count: u64, null_count: u64) -> Self {
|
||||||
let distinct_count = None;
|
let distinct_count = None;
|
||||||
Self::new_with_distinct(min, max, total_count, null_count, distinct_count)
|
Self::new_with_distinct(min, max, total_count, null_count, distinct_count)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create new statitics with the specified count and null count and distinct values
|
/// Create new statistics with the specified count and null count and distinct values
|
||||||
fn new_with_distinct(
|
pub fn new_with_distinct(
|
||||||
min: Option<T>,
|
min: Option<T>,
|
||||||
max: Option<T>,
|
max: Option<T>,
|
||||||
total_count: u64,
|
total_count: u64,
|
||||||
|
|
|
@ -12,6 +12,7 @@ entry = { path = "../entry" }
|
||||||
schema = { path = "../schema" }
|
schema = { path = "../schema" }
|
||||||
snafu = "0.6"
|
snafu = "0.6"
|
||||||
hashbrown = "0.11"
|
hashbrown = "0.11"
|
||||||
|
itertools = "0.10"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
arrow_util = { path = "../arrow_util" }
|
arrow_util = { path = "../arrow_util" }
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
//! A [`Column`] stores the rows for a given column name
|
//! A [`Column`] stores the rows for a given column name
|
||||||
|
|
||||||
|
use std::fmt::Formatter;
|
||||||
use std::iter::Enumerate;
|
use std::iter::Enumerate;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
@ -28,10 +29,10 @@ use schema::{IOxValueType, InfluxColumnType, InfluxFieldType, TIME_DATA_TYPE};
|
||||||
///
|
///
|
||||||
/// An i32 is used to match the default for Arrow dictionaries
|
/// An i32 is used to match the default for Arrow dictionaries
|
||||||
#[allow(clippy::upper_case_acronyms)]
|
#[allow(clippy::upper_case_acronyms)]
|
||||||
type DID = i32;
|
pub(crate) type DID = i32;
|
||||||
|
|
||||||
/// An invalid DID used for NULL rows
|
/// An invalid DID used for NULL rows
|
||||||
const INVALID_DID: DID = -1;
|
pub(crate) const INVALID_DID: DID = -1;
|
||||||
|
|
||||||
/// The type of the dictionary used
|
/// The type of the dictionary used
|
||||||
type Dictionary = arrow_util::dictionary::StringDictionary<DID>;
|
type Dictionary = arrow_util::dictionary::StringDictionary<DID>;
|
||||||
|
@ -66,13 +67,13 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||||
/// statistics
|
/// statistics
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Column {
|
pub struct Column {
|
||||||
influx_type: InfluxColumnType,
|
pub(crate) influx_type: InfluxColumnType,
|
||||||
valid: BitSet,
|
pub(crate) valid: BitSet,
|
||||||
data: ColumnData,
|
pub(crate) data: ColumnData,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
enum ColumnData {
|
pub(crate) enum ColumnData {
|
||||||
F64(Vec<f64>, StatValues<f64>),
|
F64(Vec<f64>, StatValues<f64>),
|
||||||
I64(Vec<i64>, StatValues<i64>),
|
I64(Vec<i64>, StatValues<i64>),
|
||||||
U64(Vec<u64>, StatValues<u64>),
|
U64(Vec<u64>, StatValues<u64>),
|
||||||
|
@ -81,6 +82,24 @@ enum ColumnData {
|
||||||
Tag(Vec<DID>, Dictionary, StatValues<String>),
|
Tag(Vec<DID>, Dictionary, StatValues<String>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Display for ColumnData {
|
||||||
|
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
ColumnData::F64(col_data, _) => write!(f, "F64({})", col_data.len()),
|
||||||
|
ColumnData::I64(col_data, _) => write!(f, "I64({})", col_data.len()),
|
||||||
|
ColumnData::U64(col_data, _) => write!(f, "U64({})", col_data.len()),
|
||||||
|
ColumnData::String(col_data, _) => write!(f, "String({})", col_data.len()),
|
||||||
|
ColumnData::Bool(col_data, _) => write!(f, "Bool({})", col_data.len()),
|
||||||
|
ColumnData::Tag(col_data, dictionary, _) => write!(
|
||||||
|
f,
|
||||||
|
"Tag(keys:{},values:{})",
|
||||||
|
col_data.len(),
|
||||||
|
dictionary.values().len()
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Column {
|
impl Column {
|
||||||
pub(crate) fn new(row_count: usize, column_type: InfluxColumnType) -> Self {
|
pub(crate) fn new(row_count: usize, column_type: InfluxColumnType) -> Self {
|
||||||
let mut valid = BitSet::new();
|
let mut valid = BitSet::new();
|
||||||
|
|
|
@ -9,11 +9,12 @@
|
||||||
clippy::clone_on_ref_ptr
|
clippy::clone_on_ref_ptr
|
||||||
)]
|
)]
|
||||||
|
|
||||||
//! A mutable data structure for a collection of writes
|
//! A mutable data structure for a collection of writes.
|
||||||
|
//!
|
||||||
|
//! Can be viewed as a mutable version of [`RecordBatch`] that remains the exclusive
|
||||||
|
//! owner of its buffers, permitting mutability. The in-memory layout is similar, however,
|
||||||
|
//! permitting fast conversion to [`RecordBatch`]
|
||||||
//!
|
//!
|
||||||
//! Currently supports:
|
|
||||||
//! - `[TableBatch`] writes
|
|
||||||
//! - [`RecordBatch`] conversion
|
|
||||||
|
|
||||||
use crate::column::Column;
|
use crate::column::Column;
|
||||||
use arrow::record_batch::RecordBatch;
|
use arrow::record_batch::RecordBatch;
|
||||||
|
@ -24,6 +25,7 @@ use schema::{builder::SchemaBuilder, Schema};
|
||||||
use snafu::{ensure, OptionExt, ResultExt, Snafu};
|
use snafu::{ensure, OptionExt, ResultExt, Snafu};
|
||||||
|
|
||||||
pub mod column;
|
pub mod column;
|
||||||
|
pub mod writer;
|
||||||
|
|
||||||
#[allow(missing_docs)]
|
#[allow(missing_docs)]
|
||||||
#[derive(Debug, Snafu)]
|
#[derive(Debug, Snafu)]
|
||||||
|
@ -61,15 +63,23 @@ pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||||
/// can be appended to and converted into an Arrow `RecordBatch`
|
/// can be appended to and converted into an Arrow `RecordBatch`
|
||||||
#[derive(Debug, Default)]
|
#[derive(Debug, Default)]
|
||||||
pub struct MutableBatch {
|
pub struct MutableBatch {
|
||||||
/// Map of column id from the chunk dictionary to the column
|
/// Map of column name to index in `MutableBatch::columns`
|
||||||
columns: HashMap<String, Column>,
|
column_names: HashMap<String, usize>,
|
||||||
|
|
||||||
|
/// Columns contained within this MutableBatch
|
||||||
|
columns: Vec<Column>,
|
||||||
|
|
||||||
|
/// The number of rows in this MutableBatch
|
||||||
|
row_count: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MutableBatch {
|
impl MutableBatch {
|
||||||
/// Create a new empty batch
|
/// Create a new empty batch
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
column_names: Default::default(),
|
||||||
columns: Default::default(),
|
columns: Default::default(),
|
||||||
|
row_count: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -93,7 +103,8 @@ impl MutableBatch {
|
||||||
let mut schema_builder = SchemaBuilder::new();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let schema = match selection {
|
let schema = match selection {
|
||||||
Selection::All => {
|
Selection::All => {
|
||||||
for (column_name, column) in self.columns.iter() {
|
for (column_name, column_idx) in self.column_names.iter() {
|
||||||
|
let column = &self.columns[*column_idx];
|
||||||
schema_builder.influx_column(column_name, column.influx_type());
|
schema_builder.influx_column(column_name, column.influx_type());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,8 +132,7 @@ impl MutableBatch {
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(_, field)| {
|
.map(|(_, field)| {
|
||||||
let column = self
|
let column = self
|
||||||
.columns
|
.column(field.name())
|
||||||
.get(field.name())
|
|
||||||
.expect("schema contains non-existent column");
|
.expect("schema contains non-existent column");
|
||||||
|
|
||||||
column.to_arrow().context(ColumnError {
|
column.to_arrow().context(ColumnError {
|
||||||
|
@ -136,21 +146,24 @@ impl MutableBatch {
|
||||||
|
|
||||||
/// Returns an iterator over the columns in this batch in no particular order
|
/// Returns an iterator over the columns in this batch in no particular order
|
||||||
pub fn columns(&self) -> impl Iterator<Item = (&String, &Column)> + '_ {
|
pub fn columns(&self) -> impl Iterator<Item = (&String, &Column)> + '_ {
|
||||||
self.columns.iter()
|
self.column_names
|
||||||
|
.iter()
|
||||||
|
.map(move |(name, idx)| (name, &self.columns[*idx]))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the number of rows in this chunk
|
/// Return the number of rows in this chunk
|
||||||
pub fn rows(&self) -> usize {
|
pub fn rows(&self) -> usize {
|
||||||
self.columns
|
self.row_count
|
||||||
.values()
|
|
||||||
.next()
|
|
||||||
.map(|col| col.len())
|
|
||||||
.unwrap_or(0)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a reference to the specified column
|
/// Returns a reference to the specified column
|
||||||
pub(crate) fn column(&self, column: &str) -> Result<&Column> {
|
pub(crate) fn column(&self, column: &str) -> Result<&Column> {
|
||||||
self.columns.get(column).context(ColumnNotFound { column })
|
let idx = self
|
||||||
|
.column_names
|
||||||
|
.get(column)
|
||||||
|
.context(ColumnNotFound { column })?;
|
||||||
|
|
||||||
|
Ok(&self.columns[*idx])
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Validates the schema of the passed in columns, then adds their values to
|
/// Validates the schema of the passed in columns, then adds their values to
|
||||||
|
@ -189,8 +202,10 @@ impl MutableBatch {
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
if let Some(c) = self.columns.get(column.name()) {
|
if let Some(c_idx) = self.column_names.get(column.name()) {
|
||||||
c.validate_schema(column).context(ColumnError {
|
self.columns[*c_idx]
|
||||||
|
.validate_schema(column)
|
||||||
|
.context(ColumnError {
|
||||||
column: column.name(),
|
column: column.name(),
|
||||||
})?;
|
})?;
|
||||||
}
|
}
|
||||||
|
@ -200,19 +215,24 @@ impl MutableBatch {
|
||||||
|
|
||||||
for fb_column in columns {
|
for fb_column in columns {
|
||||||
let influx_type = fb_column.influx_type();
|
let influx_type = fb_column.influx_type();
|
||||||
|
let columns_len = self.columns.len();
|
||||||
|
|
||||||
let column = self
|
let column_idx = *self
|
||||||
.columns
|
.column_names
|
||||||
.raw_entry_mut()
|
.raw_entry_mut()
|
||||||
.from_key(fb_column.name())
|
.from_key(fb_column.name())
|
||||||
.or_insert_with(|| {
|
.or_insert_with(|| (fb_column.name().to_string(), columns_len))
|
||||||
(
|
|
||||||
fb_column.name().to_string(),
|
|
||||||
Column::new(row_count_before_insert, influx_type),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.1;
|
.1;
|
||||||
|
|
||||||
|
if columns_len == column_idx {
|
||||||
|
self.columns
|
||||||
|
.push(Column::new(row_count_before_insert, influx_type))
|
||||||
|
}
|
||||||
|
|
||||||
|
let column = &mut self.columns[column_idx];
|
||||||
|
|
||||||
|
assert_eq!(column.len(), row_count_before_insert);
|
||||||
|
|
||||||
column.append(&fb_column, mask).context(ColumnError {
|
column.append(&fb_column, mask).context(ColumnError {
|
||||||
column: fb_column.name(),
|
column: fb_column.name(),
|
||||||
})?;
|
})?;
|
||||||
|
@ -221,9 +241,10 @@ impl MutableBatch {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pad any columns that did not have values in this batch with NULLs
|
// Pad any columns that did not have values in this batch with NULLs
|
||||||
for c in self.columns.values_mut() {
|
for c in &mut self.columns {
|
||||||
c.push_nulls_to_len(final_row_count);
|
c.push_nulls_to_len(final_row_count);
|
||||||
}
|
}
|
||||||
|
self.row_count = final_row_count;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,581 @@
|
||||||
|
//! A panic-safe write abstraction for [`MutableBatch`]
|
||||||
|
|
||||||
|
use crate::column::{Column, ColumnData, INVALID_DID};
|
||||||
|
use crate::MutableBatch;
|
||||||
|
use arrow_util::bitset::iter_set_positions;
|
||||||
|
use data_types::partition_metadata::{StatValues, Statistics};
|
||||||
|
use schema::{InfluxColumnType, InfluxFieldType};
|
||||||
|
use snafu::Snafu;
|
||||||
|
use std::num::NonZeroU64;
|
||||||
|
|
||||||
|
#[allow(missing_docs, missing_copy_implementations)]
|
||||||
|
#[derive(Debug, Snafu)]
|
||||||
|
pub enum Error {
|
||||||
|
#[snafu(display("Unable to insert {} type into a column of {}", inserted, existing))]
|
||||||
|
TypeMismatch {
|
||||||
|
existing: InfluxColumnType,
|
||||||
|
inserted: InfluxColumnType,
|
||||||
|
},
|
||||||
|
|
||||||
|
#[snafu(display("Incorrect number of values provided"))]
|
||||||
|
InsufficientValues,
|
||||||
|
|
||||||
|
#[snafu(display("Key not found in dictionary: {}", key))]
|
||||||
|
KeyNotFound { key: usize },
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A specialized `Result` for [`Writer`] operations, with the error type defaulting to [`Error`]
|
||||||
|
pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||||
|
|
||||||
|
/// [`Writer`] provides a panic-safe abstraction to append a number of rows to a [`MutableBatch`]
|
||||||
|
///
|
||||||
|
/// If a [`Writer`] is dropped without calling [`Writer::commit`], the [`MutableBatch`] will be
|
||||||
|
/// truncated to the original number of rows, and the statistics not updated
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Writer<'a> {
|
||||||
|
/// The mutable batch that is being mutated
|
||||||
|
batch: &'a mut MutableBatch,
|
||||||
|
/// A list of column index paired with Statistics
|
||||||
|
///
|
||||||
|
/// Statistics updates are deferred to commit time
|
||||||
|
statistics: Vec<(usize, Statistics)>,
|
||||||
|
/// The initial number of rows in the MutableBatch
|
||||||
|
initial_rows: usize,
|
||||||
|
/// The number of rows to insert
|
||||||
|
to_insert: usize,
|
||||||
|
/// If this Writer committed successfully
|
||||||
|
success: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Writer<'a> {
|
||||||
|
/// Create a [`Writer`] for inserting `to_insert` rows to the provided `batch`
|
||||||
|
///
|
||||||
|
/// If the writer is dropped without calling commit all changes will be rolled back
|
||||||
|
pub fn new(batch: &'a mut MutableBatch, to_insert: usize) -> Self {
|
||||||
|
let initial_rows = batch.rows();
|
||||||
|
Self {
|
||||||
|
batch,
|
||||||
|
statistics: vec![],
|
||||||
|
initial_rows,
|
||||||
|
to_insert,
|
||||||
|
success: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the f64 typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_f64<I>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut values: I,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = f64>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) =
|
||||||
|
self.column_mut(name, InfluxColumnType::Field(InfluxFieldType::Float))?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::F64(col_data, _) => {
|
||||||
|
col_data.resize(initial_rows + to_insert, 0_f64);
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
col_data[initial_rows + idx] = value;
|
||||||
|
stats.update(&value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected f64 got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::F64(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the i64 typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_i64<I>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut values: I,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = i64>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) =
|
||||||
|
self.column_mut(name, InfluxColumnType::Field(InfluxFieldType::Integer))?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::I64(col_data, _) => {
|
||||||
|
col_data.resize(initial_rows + to_insert, 0_i64);
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
col_data[initial_rows + idx] = value;
|
||||||
|
stats.update(&value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected i64 got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::I64(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the u64 typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_u64<I>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut values: I,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = u64>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) =
|
||||||
|
self.column_mut(name, InfluxColumnType::Field(InfluxFieldType::UInteger))?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::U64(col_data, _) => {
|
||||||
|
col_data.resize(initial_rows + to_insert, 0_u64);
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
col_data[initial_rows + idx] = value;
|
||||||
|
stats.update(&value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected u64 got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::U64(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the boolean typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_bool<I>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut values: I,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = bool>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) =
|
||||||
|
self.column_mut(name, InfluxColumnType::Field(InfluxFieldType::Boolean))?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::Bool(col_data, _) => {
|
||||||
|
col_data.append_unset(to_insert);
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
if value {
|
||||||
|
col_data.set(initial_rows + idx);
|
||||||
|
}
|
||||||
|
stats.update(&value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected bool got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::Bool(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the string field typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_string<'s, I>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut values: I,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = &'s str>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) =
|
||||||
|
self.column_mut(name, InfluxColumnType::Field(InfluxFieldType::String))?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::String(col_data, _) => {
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
col_data.extend(initial_rows + idx - col_data.len());
|
||||||
|
col_data.append(value);
|
||||||
|
stats.update(value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected tag got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::String(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the tag typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_tag<'s, I>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut values: I,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = &'s str>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) = self.column_mut(name, InfluxColumnType::Tag)?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::Tag(col_data, dict, _) => {
|
||||||
|
col_data.resize(initial_rows + to_insert, INVALID_DID);
|
||||||
|
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
col_data[initial_rows + idx] = dict.lookup_value_or_insert(value);
|
||||||
|
stats.update(value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected tag got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::String(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the tag typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_tag_dict<'s, K, V>(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
mut keys: K,
|
||||||
|
values: V,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
K: Iterator<Item = usize>,
|
||||||
|
V: Iterator<Item = &'s str>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) = self.column_mut(name, InfluxColumnType::Tag)?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::Tag(col_data, dict, _) => {
|
||||||
|
// Lazily compute mappings to handle dictionaries with unused mappings
|
||||||
|
let mut mapping: Vec<_> = values.map(|value| (value, None)).collect();
|
||||||
|
|
||||||
|
col_data.resize(initial_rows + to_insert, INVALID_DID);
|
||||||
|
|
||||||
|
for idx in set_position_iterator(valid_mask, to_insert) {
|
||||||
|
let key = keys.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
let (value, maybe_did) =
|
||||||
|
mapping.get_mut(key).ok_or(Error::KeyNotFound { key })?;
|
||||||
|
|
||||||
|
match maybe_did {
|
||||||
|
Some(did) => col_data[initial_rows + idx] = *did,
|
||||||
|
None => {
|
||||||
|
let did = dict.lookup_value_or_insert(value);
|
||||||
|
*maybe_did = Some(did);
|
||||||
|
col_data[initial_rows + idx] = did
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stats.update(*value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected tag got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, valid_mask, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::String(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the time typed column identified by `name`
|
||||||
|
///
|
||||||
|
/// For each set bit in `valid_mask` an a value from `values` is inserted at the
|
||||||
|
/// corresponding index in the column. Nulls are inserted for the other rows
|
||||||
|
///
|
||||||
|
/// # Panic
|
||||||
|
///
|
||||||
|
/// - panics if this column has already been written to by this `Writer`
|
||||||
|
///
|
||||||
|
pub fn write_time<I>(&mut self, name: &str, mut values: I) -> Result<()>
|
||||||
|
where
|
||||||
|
I: Iterator<Item = i64>,
|
||||||
|
{
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let to_insert = self.to_insert;
|
||||||
|
|
||||||
|
let (col_idx, col) = self.column_mut(name, InfluxColumnType::Timestamp)?;
|
||||||
|
|
||||||
|
let mut stats = StatValues::new_empty();
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::I64(col_data, _) => {
|
||||||
|
col_data.resize(initial_rows + to_insert, 0_i64);
|
||||||
|
for idx in 0..to_insert {
|
||||||
|
let value = values.next().ok_or(Error::InsufficientValues)?;
|
||||||
|
col_data[initial_rows + idx] = value;
|
||||||
|
stats.update(&value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x => unreachable!("expected i64 got {} for column \"{}\"", x, name),
|
||||||
|
}
|
||||||
|
|
||||||
|
append_valid_mask(col, None, to_insert);
|
||||||
|
|
||||||
|
stats.update_for_nulls(to_insert as u64 - stats.total_count);
|
||||||
|
self.statistics.push((col_idx, Statistics::I64(stats)));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn column_mut(
|
||||||
|
&mut self,
|
||||||
|
name: &str,
|
||||||
|
influx_type: InfluxColumnType,
|
||||||
|
) -> Result<(usize, &mut Column)> {
|
||||||
|
let columns_len = self.batch.columns.len();
|
||||||
|
|
||||||
|
let column_idx = *self
|
||||||
|
.batch
|
||||||
|
.column_names
|
||||||
|
.raw_entry_mut()
|
||||||
|
.from_key(name)
|
||||||
|
.or_insert_with(|| (name.to_string(), columns_len))
|
||||||
|
.1;
|
||||||
|
|
||||||
|
if columns_len == column_idx {
|
||||||
|
self.batch
|
||||||
|
.columns
|
||||||
|
.push(Column::new(self.initial_rows, influx_type))
|
||||||
|
}
|
||||||
|
|
||||||
|
let col = &mut self.batch.columns[column_idx];
|
||||||
|
|
||||||
|
if col.influx_type != influx_type {
|
||||||
|
return Err(Error::TypeMismatch {
|
||||||
|
existing: col.influx_type,
|
||||||
|
inserted: influx_type,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
col.valid.len(),
|
||||||
|
self.initial_rows,
|
||||||
|
"expected {} rows in column \"{}\" got {} when performing write of {} rows",
|
||||||
|
self.initial_rows,
|
||||||
|
name,
|
||||||
|
col.valid.len(),
|
||||||
|
self.to_insert
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok((column_idx, col))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Commits the writes performed on this [`Writer`]. This will update the statistics
|
||||||
|
/// and pad any unwritten columns with nulls
|
||||||
|
pub fn commit(mut self) {
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
let final_rows = initial_rows + self.to_insert;
|
||||||
|
|
||||||
|
self.statistics
|
||||||
|
.sort_unstable_by_key(|(col_idx, _)| *col_idx);
|
||||||
|
let mut statistics = self.statistics.iter();
|
||||||
|
|
||||||
|
for (col_idx, col) in self.batch.columns.iter_mut().enumerate() {
|
||||||
|
// All columns should either have received a write and have statistics or not
|
||||||
|
if col.valid.len() == initial_rows {
|
||||||
|
col.push_nulls_to_len(final_rows);
|
||||||
|
} else {
|
||||||
|
assert_eq!(
|
||||||
|
col.valid.len(),
|
||||||
|
final_rows,
|
||||||
|
"expected {} rows in column index {} got {} when performing write of {} rows",
|
||||||
|
final_rows,
|
||||||
|
col_idx,
|
||||||
|
col.valid.len(),
|
||||||
|
self.to_insert
|
||||||
|
);
|
||||||
|
|
||||||
|
let (stats_col_idx, stats) = statistics.next().unwrap();
|
||||||
|
assert_eq!(*stats_col_idx, col_idx);
|
||||||
|
|
||||||
|
match (&mut col.data, stats) {
|
||||||
|
(ColumnData::F64(_, stats), Statistics::F64(new)) => {
|
||||||
|
stats.update_from(new);
|
||||||
|
}
|
||||||
|
(ColumnData::I64(_, stats), Statistics::I64(new)) => {
|
||||||
|
stats.update_from(new);
|
||||||
|
}
|
||||||
|
(ColumnData::U64(_, stats), Statistics::U64(new)) => {
|
||||||
|
stats.update_from(new);
|
||||||
|
}
|
||||||
|
(ColumnData::String(_, stats), Statistics::String(new)) => {
|
||||||
|
stats.update_from(new);
|
||||||
|
}
|
||||||
|
(ColumnData::Bool(_, stats), Statistics::Bool(new)) => {
|
||||||
|
stats.update_from(new);
|
||||||
|
}
|
||||||
|
(ColumnData::Tag(_, dict, stats), Statistics::String(new)) => {
|
||||||
|
stats.update_from(new);
|
||||||
|
stats.distinct_count = match stats.null_count {
|
||||||
|
0 => NonZeroU64::new(dict.values().len() as u64),
|
||||||
|
_ => NonZeroU64::new(dict.values().len() as u64 + 1),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => unreachable!("column: {}, statistics: {}", col.data, stats.type_name()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.batch.row_count = final_rows;
|
||||||
|
self.success = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_position_iterator(
|
||||||
|
valid_mask: Option<&[u8]>,
|
||||||
|
to_insert: usize,
|
||||||
|
) -> impl Iterator<Item = usize> + '_ {
|
||||||
|
match valid_mask {
|
||||||
|
Some(mask) => itertools::Either::Left(
|
||||||
|
iter_set_positions(mask).take_while(move |idx| *idx < to_insert),
|
||||||
|
),
|
||||||
|
None => itertools::Either::Right(0..to_insert),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn append_valid_mask(column: &mut Column, valid_mask: Option<&[u8]>, to_insert: usize) {
|
||||||
|
match valid_mask {
|
||||||
|
Some(mask) => column.valid.append_bits(to_insert, mask),
|
||||||
|
None => column.valid.append_set(to_insert),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Drop for Writer<'a> {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.success {
|
||||||
|
let initial_rows = self.initial_rows;
|
||||||
|
for col in &mut self.batch.columns {
|
||||||
|
col.valid.truncate(initial_rows);
|
||||||
|
match &mut col.data {
|
||||||
|
ColumnData::F64(col_data, _) => col_data.truncate(initial_rows),
|
||||||
|
ColumnData::I64(col_data, _) => col_data.truncate(initial_rows),
|
||||||
|
ColumnData::U64(col_data, _) => col_data.truncate(initial_rows),
|
||||||
|
ColumnData::String(col_data, _) => col_data.truncate(initial_rows),
|
||||||
|
ColumnData::Bool(col_data, _) => col_data.truncate(initial_rows),
|
||||||
|
ColumnData::Tag(col_data, dict, _) => {
|
||||||
|
col_data.truncate(initial_rows);
|
||||||
|
match col_data.iter().max() {
|
||||||
|
Some(max) => dict.truncate(*max),
|
||||||
|
None => dict.clear(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,336 @@
|
||||||
|
use arrow_util::assert_batches_eq;
|
||||||
|
use data_types::partition_metadata::{StatValues, Statistics};
|
||||||
|
use mutable_batch::writer::Writer;
|
||||||
|
use mutable_batch::MutableBatch;
|
||||||
|
use schema::selection::Selection;
|
||||||
|
use std::num::NonZeroU64;
|
||||||
|
|
||||||
|
/// Collects a `(column name, statistics)` pair for every column of `batch`,
/// sorted by column name so the result can be compared against a fixed list.
fn get_stats(batch: &MutableBatch) -> Vec<(&str, Statistics)> {
    let mut by_name = Vec::new();
    for (name, col) in batch.columns() {
        by_name.push((name.as_str(), col.stats()));
    }

    by_name.sort_unstable_by(|lhs, rhs| lhs.0.cmp(rhs.0));
    by_name
}
|
||||||
|
|
||||||
|
/// End-to-end exercise of the [`Writer`] API: an initial committed write, a
/// dropped (rolled back) write, several rejected writes, and a second committed
/// write, checking both the arrow output and the column statistics each time.
#[test]
fn test_basic() {
    // Builds the expected statistics for a tag column.
    let tag_stats = |min: &str, max: &str, total: u64, nulls: u64, distinct: u64| {
        Statistics::String(StatValues::new_with_distinct(
            Some(min.to_string()),
            Some(max.to_string()),
            total,
            nulls,
            NonZeroU64::new(distinct),
        ))
    };

    let mut batch = MutableBatch::new();

    // First write: 5 rows across every supported column type.
    let mut first = Writer::new(&mut batch, 5);

    first
        .write_bool(
            "b1",
            None,
            vec![true, true, false, false, false].into_iter(),
        )
        .unwrap();

    first
        .write_bool(
            "b2",
            Some(&[0b00011101]),
            vec![true, false, false, true].into_iter(),
        )
        .unwrap();

    first
        .write_f64(
            "f64",
            Some(&[0b00011011]),
            vec![343.3, 443., 477., -24.].into_iter(),
        )
        .unwrap();

    first
        .write_i64("i64", None, vec![234, 6, 2, 6, -3].into_iter())
        .unwrap();

    first
        .write_i64("i64_2", Some(&[0b00000001]), vec![-8].into_iter())
        .unwrap();

    first
        .write_u64("u64", Some(&[0b00001001]), vec![23, 5].into_iter())
        .unwrap();

    first
        .write_time("time", vec![7, 5, 7, 3, 5].into_iter())
        .unwrap();

    first
        .write_tag("tag1", None, vec!["v1", "v1", "v2", "v2", "v1"].into_iter())
        .unwrap();

    first
        .write_tag(
            "tag2",
            Some(&[0b00001011]),
            vec!["v1", "v2", "v2"].into_iter(),
        )
        .unwrap();

    first
        .write_tag_dict(
            "tag3",
            Some(&[0b00011011]),
            vec![1, 0, 0, 1].into_iter(),
            vec!["v1", "v2"].into_iter(),
        )
        .unwrap();

    first.commit();

    let observed = get_stats(&batch);

    let initial_data = &[
        "+-------+-------+-------+-----+-------+------+------+------+--------------------------------+-----+",
        "| b1 | b2 | f64 | i64 | i64_2 | tag1 | tag2 | tag3 | time | u64 |",
        "+-------+-------+-------+-----+-------+------+------+------+--------------------------------+-----+",
        "| true | true | 343.3 | 234 | -8 | v1 | v1 | v2 | 1970-01-01T00:00:00.000000007Z | 23 |",
        "| true | | 443 | 6 | | v1 | v2 | v1 | 1970-01-01T00:00:00.000000005Z | |",
        "| false | false | | 2 | | v2 | | | 1970-01-01T00:00:00.000000007Z | |",
        "| false | false | 477 | 6 | | v2 | v2 | v1 | 1970-01-01T00:00:00.000000003Z | 5 |",
        "| false | true | -24 | -3 | | v1 | | v2 | 1970-01-01T00:00:00.000000005Z | |",
        "+-------+-------+-------+-----+-------+------+------+------+--------------------------------+-----+",
    ];

    let initial_stats = vec![
        (
            "b1",
            Statistics::Bool(StatValues::new(Some(false), Some(true), 5, 0)),
        ),
        (
            "b2",
            Statistics::Bool(StatValues::new(Some(false), Some(true), 5, 1)),
        ),
        (
            "f64",
            Statistics::F64(StatValues::new(Some(-24.), Some(477.), 5, 1)),
        ),
        (
            "i64",
            Statistics::I64(StatValues::new(Some(-3), Some(234), 5, 0)),
        ),
        (
            "i64_2",
            Statistics::I64(StatValues::new(Some(-8), Some(-8), 5, 4)),
        ),
        ("tag1", tag_stats("v1", "v2", 5, 0, 2)),
        ("tag2", tag_stats("v1", "v2", 5, 2, 3)),
        ("tag3", tag_stats("v1", "v2", 5, 1, 3)),
        (
            "time",
            Statistics::I64(StatValues::new(Some(3), Some(7), 5, 0)),
        ),
        (
            "u64",
            Statistics::U64(StatValues::new(Some(5), Some(23), 5, 3)),
        ),
    ];

    assert_batches_eq!(initial_data, &[batch.to_arrow(Selection::All).unwrap()]);
    assert_eq!(observed, initial_stats);

    // Second write is abandoned without a commit.
    let mut abandoned = Writer::new(&mut batch, 4);
    abandoned
        .write_time("time", vec![4, 6, 21, 7].into_iter())
        .unwrap();

    abandoned
        .write_tag("tag1", None, vec!["v6", "v7", "v8", "v4"].into_iter())
        .unwrap();

    drop(abandoned);

    let observed = get_stats(&batch);

    // Writer dropped, should not impact stats or data
    assert_batches_eq!(initial_data, &[batch.to_arrow(Selection::All).unwrap()]);
    assert_eq!(observed, initial_stats);

    // Writes of the wrong type, or with an unknown dictionary key, are rejected.
    let message = Writer::new(&mut batch, 1)
        .write_tag("b1", None, vec!["err"].into_iter())
        .unwrap_err()
        .to_string();
    assert_eq!(message, "Unable to insert iox::column_type::tag type into a column of iox::column_type::field::boolean");

    let message = Writer::new(&mut batch, 1)
        .write_i64("f64", None, vec![3].into_iter())
        .unwrap_err()
        .to_string();
    assert_eq!(message, "Unable to insert iox::column_type::field::integer type into a column of iox::column_type::field::float");

    let message = Writer::new(&mut batch, 1)
        .write_string("tag3", None, vec!["sd"].into_iter())
        .unwrap_err()
        .to_string();
    assert_eq!(message, "Unable to insert iox::column_type::field::string type into a column of iox::column_type::tag");

    let message = Writer::new(&mut batch, 1)
        .write_tag_dict("tag3", None, vec![1].into_iter(), vec!["v1"].into_iter())
        .unwrap_err()
        .to_string();
    assert_eq!(message, "Key not found in dictionary: 1");

    let observed = get_stats(&batch);

    // Writer not committed, should not impact stats or data
    assert_batches_eq!(initial_data, &[batch.to_arrow(Selection::All).unwrap()]);
    assert_eq!(observed, initial_stats);

    // Final write: 17 more rows touching only a subset of the columns.
    let mut second = Writer::new(&mut batch, 17);

    second.write_time("time", 0..17).unwrap();

    second
        .write_f64(
            "f64",
            Some(&[0b01000010, 0b00100100, 0b00000001]),
            vec![4., 945., -222., 4., 7.].into_iter(),
        )
        .unwrap();

    second
        .write_tag("tag3", None, std::iter::repeat("v2"))
        .unwrap();

    second
        .write_tag_dict(
            "tag2",
            Some(&[0b11011111, 0b11011101, 0b00000000]),
            vec![0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1].into_iter(),
            vec!["v4", "v1", "v7"].into_iter(), // Intentional extra key
        )
        .unwrap();

    second.commit();

    let observed = get_stats(&batch);

    let final_data = &[
        "+-------+-------+-------+-----+-------+------+------+------+--------------------------------+-----+",
        "| b1 | b2 | f64 | i64 | i64_2 | tag1 | tag2 | tag3 | time | u64 |",
        "+-------+-------+-------+-----+-------+------+------+------+--------------------------------+-----+",
        "| true | true | 343.3 | 234 | -8 | v1 | v1 | v2 | 1970-01-01T00:00:00.000000007Z | 23 |",
        "| true | | 443 | 6 | | v1 | v2 | v1 | 1970-01-01T00:00:00.000000005Z | |",
        "| false | false | | 2 | | v2 | | | 1970-01-01T00:00:00.000000007Z | |",
        "| false | false | 477 | 6 | | v2 | v2 | v1 | 1970-01-01T00:00:00.000000003Z | 5 |",
        "| false | true | -24 | -3 | | v1 | | v2 | 1970-01-01T00:00:00.000000005Z | |",
        "| | | | | | | v4 | v2 | 1970-01-01T00:00:00Z | |",
        "| | | 4 | | | | v1 | v2 | 1970-01-01T00:00:00.000000001Z | |",
        "| | | | | | | v1 | v2 | 1970-01-01T00:00:00.000000002Z | |",
        "| | | | | | | v4 | v2 | 1970-01-01T00:00:00.000000003Z | |",
        "| | | | | | | v1 | v2 | 1970-01-01T00:00:00.000000004Z | |",
        "| | | | | | | | v2 | 1970-01-01T00:00:00.000000005Z | |",
        "| | | 945 | | | | v1 | v2 | 1970-01-01T00:00:00.000000006Z | |",
        "| | | | | | | v1 | v2 | 1970-01-01T00:00:00.000000007Z | |",
        "| | | | | | | v4 | v2 | 1970-01-01T00:00:00.000000008Z | |",
        "| | | | | | | | v2 | 1970-01-01T00:00:00.000000009Z | |",
        "| | | -222 | | | | v4 | v2 | 1970-01-01T00:00:00.000000010Z | |",
        "| | | | | | | v4 | v2 | 1970-01-01T00:00:00.000000011Z | |",
        "| | | | | | | v4 | v2 | 1970-01-01T00:00:00.000000012Z | |",
        "| | | 4 | | | | | v2 | 1970-01-01T00:00:00.000000013Z | |",
        "| | | | | | | v1 | v2 | 1970-01-01T00:00:00.000000014Z | |",
        "| | | | | | | v1 | v2 | 1970-01-01T00:00:00.000000015Z | |",
        "| | | 7 | | | | | v2 | 1970-01-01T00:00:00.000000016Z | |",
        "+-------+-------+-------+-----+-------+------+------+------+--------------------------------+-----+",
    ];

    let final_stats = vec![
        (
            "b1",
            Statistics::Bool(StatValues::new(Some(false), Some(true), 22, 17)),
        ),
        (
            "b2",
            Statistics::Bool(StatValues::new(Some(false), Some(true), 22, 18)),
        ),
        (
            "f64",
            Statistics::F64(StatValues::new(Some(-222.), Some(945.), 22, 13)),
        ),
        (
            "i64",
            Statistics::I64(StatValues::new(Some(-3), Some(234), 22, 17)),
        ),
        (
            "i64_2",
            Statistics::I64(StatValues::new(Some(-8), Some(-8), 22, 21)),
        ),
        ("tag1", tag_stats("v1", "v2", 22, 17, 3)),
        ("tag2", tag_stats("v1", "v4", 22, 6, 4)),
        ("tag3", tag_stats("v1", "v2", 22, 1, 3)),
        (
            "time",
            Statistics::I64(StatValues::new(Some(0), Some(16), 22, 0)),
        ),
        (
            "u64",
            Statistics::U64(StatValues::new(Some(5), Some(23), 22, 20)),
        ),
    ];

    assert_batches_eq!(final_data, &[batch.to_arrow(Selection::All).unwrap()]);
    assert_eq!(observed, final_stats);
}
|
|
@ -183,8 +183,8 @@ async fn sql_select_from_system_chunks() {
|
||||||
"+---------------+------------+-------------------+--------------+-----------+",
|
"+---------------+------------+-------------------+--------------+-----------+",
|
||||||
"| partition_key | table_name | storage | memory_bytes | row_count |",
|
"| partition_key | table_name | storage | memory_bytes | row_count |",
|
||||||
"+---------------+------------+-------------------+--------------+-----------+",
|
"+---------------+------------+-------------------+--------------+-----------+",
|
||||||
"| 1970-01-01T00 | h2o | OpenMutableBuffer | 1639 | 3 |",
|
"| 1970-01-01T00 | h2o | OpenMutableBuffer | 1671 | 3 |",
|
||||||
"| 1970-01-01T00 | o2 | OpenMutableBuffer | 1635 | 2 |",
|
"| 1970-01-01T00 | o2 | OpenMutableBuffer | 1667 | 2 |",
|
||||||
"+---------------+------------+-------------------+--------------+-----------+",
|
"+---------------+------------+-------------------+--------------+-----------+",
|
||||||
];
|
];
|
||||||
run_sql_test_case(
|
run_sql_test_case(
|
||||||
|
|
|
@ -1738,7 +1738,7 @@ mod tests {
|
||||||
assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 0);
|
assert_storage_gauge(registry, "catalog_loaded_rows", "object_store", 0);
|
||||||
|
|
||||||
// verify chunk size updated
|
// verify chunk size updated
|
||||||
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 700);
|
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 732);
|
||||||
|
|
||||||
// write into same chunk again.
|
// write into same chunk again.
|
||||||
time.inc(Duration::from_secs(1));
|
time.inc(Duration::from_secs(1));
|
||||||
|
@ -1754,7 +1754,7 @@ mod tests {
|
||||||
write_lp(db.as_ref(), "cpu bar=5 50").await;
|
write_lp(db.as_ref(), "cpu bar=5 50").await;
|
||||||
|
|
||||||
// verify chunk size updated
|
// verify chunk size updated
|
||||||
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 764);
|
catalog_chunk_size_bytes_metric_eq(registry, "mutable_buffer", 796);
|
||||||
|
|
||||||
// Still only one chunk open
|
// Still only one chunk open
|
||||||
assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 1);
|
assert_storage_gauge(registry, "catalog_loaded_chunks", "mutable_buffer", 1);
|
||||||
|
@ -2605,7 +2605,7 @@ mod tests {
|
||||||
id: ChunkId::new_test(0),
|
id: ChunkId::new_test(0),
|
||||||
storage: ChunkStorage::OpenMutableBuffer,
|
storage: ChunkStorage::OpenMutableBuffer,
|
||||||
lifecycle_action: None,
|
lifecycle_action: None,
|
||||||
memory_bytes: 1006, // memory_size
|
memory_bytes: 1038, // memory_size
|
||||||
object_store_bytes: 0, // os_size
|
object_store_bytes: 0, // os_size
|
||||||
row_count: 1,
|
row_count: 1,
|
||||||
time_of_last_access: None,
|
time_of_last_access: None,
|
||||||
|
@ -2864,7 +2864,7 @@ mod tests {
|
||||||
id: chunk_summaries[2].id,
|
id: chunk_summaries[2].id,
|
||||||
storage: ChunkStorage::OpenMutableBuffer,
|
storage: ChunkStorage::OpenMutableBuffer,
|
||||||
lifecycle_action,
|
lifecycle_action,
|
||||||
memory_bytes: 1303,
|
memory_bytes: 1335,
|
||||||
object_store_bytes: 0, // no OS chunks
|
object_store_bytes: 0, // no OS chunks
|
||||||
row_count: 1,
|
row_count: 1,
|
||||||
time_of_last_access: None,
|
time_of_last_access: None,
|
||||||
|
@ -2885,7 +2885,7 @@ mod tests {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
assert_eq!(db.catalog.metrics().memory().mutable_buffer(), 2486 + 1303);
|
assert_eq!(db.catalog.metrics().memory().mutable_buffer(), 2486 + 1335);
|
||||||
assert_eq!(db.catalog.metrics().memory().read_buffer(), 2550);
|
assert_eq!(db.catalog.metrics().memory().read_buffer(), 2550);
|
||||||
assert_eq!(db.catalog.metrics().memory().object_store(), 1529);
|
assert_eq!(db.catalog.metrics().memory().object_store(), 1529);
|
||||||
}
|
}
|
||||||
|
|
|
@ -527,7 +527,7 @@ async fn test_chunk_get() {
|
||||||
id: ChunkId::new_test(0).into(),
|
id: ChunkId::new_test(0).into(),
|
||||||
storage: ChunkStorage::OpenMutableBuffer.into(),
|
storage: ChunkStorage::OpenMutableBuffer.into(),
|
||||||
lifecycle_action,
|
lifecycle_action,
|
||||||
memory_bytes: 1016,
|
memory_bytes: 1048,
|
||||||
object_store_bytes: 0,
|
object_store_bytes: 0,
|
||||||
row_count: 2,
|
row_count: 2,
|
||||||
time_of_last_access: None,
|
time_of_last_access: None,
|
||||||
|
@ -541,7 +541,7 @@ async fn test_chunk_get() {
|
||||||
id: ChunkId::new_test(0).into(),
|
id: ChunkId::new_test(0).into(),
|
||||||
storage: ChunkStorage::OpenMutableBuffer.into(),
|
storage: ChunkStorage::OpenMutableBuffer.into(),
|
||||||
lifecycle_action,
|
lifecycle_action,
|
||||||
memory_bytes: 1018,
|
memory_bytes: 1050,
|
||||||
object_store_bytes: 0,
|
object_store_bytes: 0,
|
||||||
row_count: 1,
|
row_count: 1,
|
||||||
time_of_last_access: None,
|
time_of_last_access: None,
|
||||||
|
@ -712,7 +712,7 @@ async fn test_list_partition_chunks() {
|
||||||
id: ChunkId::new_test(0).into(),
|
id: ChunkId::new_test(0).into(),
|
||||||
storage: ChunkStorage::OpenMutableBuffer.into(),
|
storage: ChunkStorage::OpenMutableBuffer.into(),
|
||||||
lifecycle_action: ChunkLifecycleAction::Unspecified.into(),
|
lifecycle_action: ChunkLifecycleAction::Unspecified.into(),
|
||||||
memory_bytes: 1016,
|
memory_bytes: 1048,
|
||||||
object_store_bytes: 0,
|
object_store_bytes: 0,
|
||||||
row_count: 2,
|
row_count: 2,
|
||||||
time_of_last_access: None,
|
time_of_last_access: None,
|
||||||
|
|
|
@ -482,7 +482,7 @@ async fn test_get_chunks() {
|
||||||
.and(predicate::str::contains(
|
.and(predicate::str::contains(
|
||||||
r#""storage": "CHUNK_STORAGE_OPEN_MUTABLE_BUFFER","#,
|
r#""storage": "CHUNK_STORAGE_OPEN_MUTABLE_BUFFER","#,
|
||||||
))
|
))
|
||||||
.and(predicate::str::contains(r#""memoryBytes": "1016""#))
|
.and(predicate::str::contains(r#""memoryBytes": "1048""#))
|
||||||
// Check for a non empty timestamp such as
|
// Check for a non empty timestamp such as
|
||||||
// "time_of_first_write": "2021-03-30T17:11:10.723866Z",
|
// "time_of_first_write": "2021-03-30T17:11:10.723866Z",
|
||||||
.and(predicate::str::contains(r#""timeOfFirstWrite": "20"#));
|
.and(predicate::str::contains(r#""timeOfFirstWrite": "20"#));
|
||||||
|
|
Loading…
Reference in New Issue