//! Implementation of an in-memory buffer for writes that persists data into a WAL if one is configured.

pub(crate) mod buffer_segment;
mod flusher;
mod loader;
mod segment_state;
mod table_buffer;

use crate::cache::ParquetCache;
use crate::catalog::{Catalog, DatabaseSchema, TableDefinition, TIME_COLUMN_NAME};
use crate::chunk::ParquetChunk;
use crate::persister::PersisterImpl;
use crate::write_buffer::flusher::WriteBufferFlusher;
use crate::write_buffer::loader::load_starting_state;
use crate::write_buffer::segment_state::{run_buffer_segment_persist_and_cleanup, SegmentState};
use crate::{
    BufferedWriteRequest, Bufferer, ChunkContainer, LpWriteOp, Persister, Precision,
    SegmentDuration, SequenceNumber, Wal, WalOp, WriteBuffer, WriteLineError,
};
use async_trait::async_trait;
use data_types::{
    column_type_from_field, ChunkId, ChunkOrder, ColumnType, NamespaceName, NamespaceNameError,
};
use datafusion::common::DataFusionError;
use datafusion::execution::context::SessionState;
use datafusion::logical_expr::Expr;
use datafusion::physical_plan::SendableRecordBatchStream;
use influxdb_line_protocol::{parse_lines, FieldValue, ParsedLine};
use iox_query::chunk_statistics::create_chunk_statistics;
use iox_query::QueryChunk;
use iox_time::{Time, TimeProvider};
use object_store::path::Path as ObjPath;
use object_store::ObjectMeta;
use observability_deps::tracing::{debug, error};
use parking_lot::{Mutex, RwLock};
use parquet_file::storage::ParquetExecInput;
use sha2::Digest;
use sha2::Sha256;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::i64;
use std::sync::{Arc, OnceLock};
use thiserror::Error;
use tokio::sync::watch;

#[derive(Debug, Error)]
pub enum Error {
    #[error("parsing of line protocol failed")]
    ParseError(WriteLineError),

    #[error("column type mismatch for column {name}: existing: {existing:?}, new: {new:?}")]
    ColumnTypeMismatch {
        name: String,
        existing: ColumnType,
        new: ColumnType,
    },

    #[error("catalog update error: {0}")]
    CatalogUpdateError(#[from] crate::catalog::Error),

    #[error("error from wal: {0}")]
    WalError(#[from] crate::wal::Error),

    #[error("error from buffer segment: {0}")]
    BufferSegmentError(String),

    #[error("error from persister: {0}")]
    PersisterError(#[from] crate::persister::Error),

    #[error("corrupt load state: {0}")]
    CorruptLoadState(String),

    #[error("database name error: {0}")]
    DatabaseNameError(#[from] NamespaceNameError),

    #[error("WalOp in file {0} contained data for more than one segment, which is invalid")]
    WalOpForMultipleSegments(String),

    #[error("error from table buffer: {0}")]
    TableBufferError(#[from] table_buffer::Error),
}

pub type Result<T, E = Error> = std::result::Result<T, E>;

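/// A write of line protocol into a single database, along with the default time to use for
/// lines that don't supply a timestamp.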
#[derive(Debug)]
pub struct WriteRequest<'a> {
    pub db_name: NamespaceName<'static>,
    pub line_protocol: &'a str,
    pub default_time: u64,
}

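/// The in-memory write buffer. Incoming line protocol is validated against the catalog,
/// buffered into the open segment, and written to the WAL if one is configured. A background
/// task persists and cleans up buffer segments.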
#[derive(Debug)]
pub struct WriteBufferImpl<W, T> {
    catalog: Arc<Catalog>,
    persister: Arc<PersisterImpl>,
    parquet_cache: Arc<ParquetCache>,
    segment_state: Arc<RwLock<SegmentState<T, W>>>,
    wal: Option<Arc<W>>,
    write_buffer_flusher: WriteBufferFlusher,
    segment_duration: SegmentDuration,
    #[allow(dead_code)]
    time_provider: Arc<T>,
    #[allow(dead_code)]
    segment_persist_handle: Mutex<tokio::task::JoinHandle<()>>,
    #[allow(dead_code)]
    shutdown_segment_persist_tx: watch::Sender<()>,
}

impl<W: Wal, T: TimeProvider> WriteBufferImpl<W, T> {
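    /// Loads the starting state (catalog, open and persisted segments) from the persister and
    /// WAL, then spawns the background task that persists and cleans up buffer segments.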
    pub async fn new(
        persister: Arc<PersisterImpl>,
        wal: Option<Arc<W>>,
        time_provider: Arc<T>,
        segment_duration: SegmentDuration,
        executor: Arc<iox_query::exec::Executor>,
    ) -> Result<Self> {
        let now = time_provider.now();
        let loaded_state =
            load_starting_state(Arc::clone(&persister), wal.clone(), now, segment_duration).await?;

        let segment_state = Arc::new(RwLock::new(SegmentState::new(
            segment_duration,
            loaded_state.last_segment_id,
            Arc::clone(&loaded_state.catalog),
            Arc::clone(&time_provider),
            loaded_state.open_segments,
            loaded_state.persisting_buffer_segments,
            loaded_state.persisted_segments,
            wal.clone(),
        )));

        let write_buffer_flusher = WriteBufferFlusher::new(Arc::clone(&segment_state));

        let segment_state_persister = Arc::clone(&segment_state);
        let time_provider_persister = Arc::clone(&time_provider);
        let wal_persister = wal.clone();
        let cloned_persister = Arc::clone(&persister);

        let (shutdown_segment_persist_tx, shutdown_rx) = watch::channel(());
        let segment_persist_handle = tokio::task::spawn(async move {
            run_buffer_segment_persist_and_cleanup(
                cloned_persister,
                segment_state_persister,
                shutdown_rx,
                time_provider_persister,
                wal_persister,
                executor,
            )
            .await;
        });

        Ok(Self {
            catalog: loaded_state.catalog,
            segment_state,
            parquet_cache: Arc::new(ParquetCache::new(&persister.mem_pool)),
            persister,
            wal,
            write_buffer_flusher,
            time_provider,
            segment_duration,
            segment_persist_handle: Mutex::new(segment_persist_handle),
            shutdown_segment_persist_tx,
        })
    }

    pub fn catalog(&self) -> Arc<Catalog> {
        Arc::clone(&self.catalog)
    }

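    /// Parses and validates the line protocol, applies any catalog updates it implies, and
    /// sends the valid, segmented data to the flusher to be written into the open segment
    /// (and the WAL, if one is configured).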
    async fn write_lp(
        &self,
        db_name: NamespaceName<'static>,
        lp: &str,
        ingest_time: Time,
        accept_partial: bool,
        precision: Precision,
    ) -> Result<BufferedWriteRequest> {
        debug!("write_lp to {} in writebuffer", db_name);

        let result = parse_validate_and_update_catalog(
            db_name.clone(),
            lp,
            &self.catalog,
            ingest_time,
            self.segment_duration,
            accept_partial,
            precision,
        )?;

        self.write_buffer_flusher
            .write_to_open_segment(result.valid_segmented_data)
            .await?;

        Ok(BufferedWriteRequest {
            db_name,
            invalid_lines: result.errors,
            line_count: result.line_count,
            field_count: result.field_count,
            tag_count: result.tag_count,
        })
    }

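    /// Gathers all chunks a query should see for a table: chunks from the open buffer
    /// segments, parquet files from persisted segments, and parquet files held in the
    /// in-memory cache.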
    fn get_table_chunks(
        &self,
        database_name: &str,
        table_name: &str,
        filters: &[Expr],
        projection: Option<&Vec<usize>>,
        ctx: &SessionState,
    ) -> Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
        let db_schema = self
            .catalog
            .db_schema(database_name)
            .ok_or_else(|| DataFusionError::Execution(format!("db {} not found", database_name)))?;

        let table_schema = {
            let table = db_schema.tables.get(table_name).ok_or_else(|| {
                DataFusionError::Execution(format!(
                    "table {} not found in db {}",
                    table_name, database_name
                ))
            })?;

            table.schema.clone()
        };

        let segment_state = self.segment_state.read();
        let mut chunks =
            segment_state.get_table_chunks(db_schema, table_name, filters, projection, ctx)?;
        let parquet_files = segment_state.get_parquet_files(database_name, table_name);

        let mut chunk_order = chunks.len() as i64;
        let object_store_url = self.persister.object_store_url();

        for parquet_file in parquet_files {
            // TODO: update persisted segments to serialize their key to use here
            let partition_key = data_types::PartitionKey::from(parquet_file.path.clone());
            let partition_id = data_types::partition::TransitionPartitionId::new(
                data_types::TableId::new(0),
                &partition_key,
            );

            let chunk_stats = create_chunk_statistics(
                Some(parquet_file.row_count as usize),
                &table_schema,
                Some(parquet_file.timestamp_min_max()),
                None,
            );

            let location = ObjPath::from(parquet_file.path.clone());

            let parquet_exec = ParquetExecInput {
                object_store_url: object_store_url.clone(),
                object_meta: ObjectMeta {
                    location,
                    last_modified: Default::default(),
                    size: parquet_file.size_bytes as usize,
                    e_tag: None,
                    version: None,
                },
                object_store: self.persister.object_store(),
            };

            let parquet_chunk = ParquetChunk {
                schema: table_schema.clone(),
                stats: Arc::new(chunk_stats),
                partition_id,
                sort_key: None,
                id: ChunkId::new(),
                chunk_order: ChunkOrder::new(chunk_order),
                parquet_exec,
            };

            chunk_order += 1;

            chunks.push(Arc::new(parquet_chunk));
        }

        // Get any cached files and add them to the query
        // This is mostly the same as above, but we change the object store to
        // point to the in memory cache
        for parquet_file in self
            .parquet_cache
            .get_parquet_files(database_name, table_name)
        {
            let partition_key = data_types::PartitionKey::from(parquet_file.path.clone());
            let partition_id = data_types::partition::TransitionPartitionId::new(
                data_types::TableId::new(0),
                &partition_key,
            );

            let chunk_stats = create_chunk_statistics(
                Some(parquet_file.row_count as usize),
                &table_schema,
                Some(parquet_file.timestamp_min_max()),
                None,
            );

            let location = ObjPath::from(parquet_file.path.clone());

            let parquet_exec = ParquetExecInput {
                object_store_url: object_store_url.clone(),
                object_meta: ObjectMeta {
                    location,
                    last_modified: Default::default(),
                    size: parquet_file.size_bytes as usize,
                    e_tag: None,
                    version: None,
                },
                object_store: Arc::clone(&self.parquet_cache.object_store()),
            };

            let parquet_chunk = ParquetChunk {
                schema: table_schema.clone(),
                stats: Arc::new(chunk_stats),
                partition_id,
                sort_key: None,
                id: ChunkId::new(),
                chunk_order: ChunkOrder::new(chunk_order),
                parquet_exec,
            };

            chunk_order += 1;

            chunks.push(Arc::new(parquet_chunk));
        }

        Ok(chunks)
    }

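    /// Writes the record batch stream for the given table and time range into the in-memory
    /// parquet cache.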
    pub async fn cache_parquet(
        &self,
        db_name: &str,
        table_name: &str,
        min_time: i64,
        max_time: i64,
        records: SendableRecordBatchStream,
    ) -> Result<(), Error> {
        Ok(self
            .parquet_cache
            .persist_parquet_file(db_name, table_name, min_time, max_time, records, None)
            .await?)
    }

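    /// Writes the record batch stream into the in-memory parquet cache at the given path.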
    pub async fn update_parquet(
        &self,
        db_name: &str,
        table_name: &str,
        min_time: i64,
        max_time: i64,
        path: ObjPath,
        records: SendableRecordBatchStream,
    ) -> Result<(), Error> {
        Ok(self
            .parquet_cache
            .persist_parquet_file(db_name, table_name, min_time, max_time, records, Some(path))
            .await?)
    }

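    /// Removes the parquet file at the given path from the in-memory cache.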
    pub async fn remove_parquet(&self, path: ObjPath) -> Result<(), Error> {
        Ok(self.parquet_cache.remove_parquet_file(path).await?)
    }

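    /// Clears all files from the in-memory parquet cache.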
    pub async fn purge_cache(&self) -> Result<(), Error> {
        Ok(self.parquet_cache.purge_cache().await?)
    }

    #[cfg(test)]
    fn get_table_record_batches(
        &self,
        database_name: &str,
        table_name: &str,
    ) -> Vec<arrow::record_batch::RecordBatch> {
        let db_schema = self.catalog.db_schema(database_name).unwrap();
        let table = db_schema.tables.get(table_name).unwrap();
        let schema = table.schema.clone();

        let segment_state = self.segment_state.read();
        segment_state.open_segments_table_record_batches(database_name, table_name, &schema)
    }
}

#[async_trait]
impl<W: Wal, T: TimeProvider> Bufferer for WriteBufferImpl<W, T> {
    async fn write_lp(
        &self,
        database: NamespaceName<'static>,
        lp: &str,
        ingest_time: Time,
        accept_partial: bool,
        precision: Precision,
    ) -> Result<BufferedWriteRequest> {
        self.write_lp(database, lp, ingest_time, accept_partial, precision)
            .await
    }

    fn wal(&self) -> Option<Arc<impl Wal>> {
        self.wal.clone()
    }

    fn catalog(&self) -> Arc<Catalog> {
        self.catalog()
    }
}

impl<W: Wal, T: TimeProvider> ChunkContainer for WriteBufferImpl<W, T> {
    fn get_table_chunks(
        &self,
        database_name: &str,
        table_name: &str,
        filters: &[Expr],
        projection: Option<&Vec<usize>>,
        ctx: &SessionState,
    ) -> crate::Result<Vec<Arc<dyn QueryChunk>>, DataFusionError> {
        self.get_table_chunks(database_name, table_name, filters, projection, ctx)
    }
}

impl<W: Wal, T: TimeProvider> WriteBuffer for WriteBufferImpl<W, T> {}

/// Returns a validated result and the sequence number of the catalog before any updates were
/// applied.
pub(crate) fn parse_validate_and_update_catalog(
    db_name: NamespaceName<'static>,
    lp: &str,
    catalog: &Catalog,
    ingest_time: Time,
    segment_duration: SegmentDuration,
    accept_partial: bool,
    precision: Precision,
) -> Result<ValidationResult> {
    let (sequence, db) = catalog.db_or_create(db_name.as_str())?;
    let mut result = parse_validate_and_update_schema(
        lp,
        &db,
        db_name,
        ingest_time,
        segment_duration,
        accept_partial,
        precision,
        sequence,
    )?;

    if let Some(schema) = result.schema.take() {
        debug!("replacing schema for {:?}", schema);

        catalog.replace_database(sequence, Arc::new(schema))?;
    }

    Ok(result)
}

/// Takes a `&str` of line protocol, parses the lines, validates them against the schema, and
/// inserts any new columns that are present. Assigns the default time to any lines that do not
/// include a timestamp.
#[allow(clippy::too_many_arguments)]
pub(crate) fn parse_validate_and_update_schema(
    lp: &str,
    schema: &DatabaseSchema,
    db_name: NamespaceName<'static>,
    ingest_time: Time,
    segment_duration: SegmentDuration,
    accept_partial: bool,
    precision: Precision,
    starting_catalog_sequence_number: SequenceNumber,
) -> Result<ValidationResult> {
    let mut errors = vec![];
    let mut lp_lines = lp.lines();

    let mut valid_parsed_and_raw_lines: Vec<(ParsedLine, &str)> = vec![];

    for (line_idx, maybe_line) in parse_lines(lp).enumerate() {
        let line = match maybe_line {
            Ok(line) => line,
            Err(e) => {
                if !accept_partial {
                    return Err(Error::ParseError(WriteLineError {
                        // This unwrap is fine because we're moving line by line
                        // alongside the output from parse_lines
                        original_line: lp_lines.next().unwrap().to_string(),
                        line_number: line_idx + 1,
                        error_message: e.to_string(),
                    }));
                } else {
                    errors.push(WriteLineError {
                        original_line: lp_lines.next().unwrap().to_string(),
                        // This unwrap is fine because we're moving line by line
                        // alongside the output from parse_lines
                        line_number: line_idx + 1,
                        error_message: e.to_string(),
                    });
                }
                continue;
            }
        };
        // This unwrap is fine because we're moving line by line
        // alongside the output from parse_lines
        valid_parsed_and_raw_lines.push((line, lp_lines.next().unwrap()));
    }

    validate_or_insert_schema_and_partitions(
        valid_parsed_and_raw_lines,
        schema,
        db_name,
        ingest_time,
        segment_duration,
        precision,
        starting_catalog_sequence_number,
    )
    .map(move |mut result| {
        result.errors = errors;
        result
    })
}

/// Takes parsed lines and validates them against the schema. If new tables or columns are
/// defined, they are passed back as a new DatabaseSchema as part of the ValidationResult.
/// Lines are split into segments, and the validation result contains the data that can then
/// be serialized into the WAL.
pub(crate) fn validate_or_insert_schema_and_partitions(
    lines: Vec<(ParsedLine<'_>, &str)>,
    schema: &DatabaseSchema,
    db_name: NamespaceName<'static>,
    ingest_time: Time,
    segment_duration: SegmentDuration,
    precision: Precision,
    starting_catalog_sequence_number: SequenceNumber,
) -> Result<ValidationResult> {
    // The (potentially updated) DatabaseSchema to return to the caller.
    let mut schema = Cow::Borrowed(schema);

    // The parsed and validated table_batches
    let mut segment_table_batches: HashMap<Time, TableBatchMap> = HashMap::new();

    let line_count = lines.len();
    let mut field_count = 0;
    let mut tag_count = 0;

    for (line, raw_line) in lines.into_iter() {
        field_count += line.field_set.len();
        tag_count += line.series.tag_set.as_ref().map(|t| t.len()).unwrap_or(0);

        validate_and_convert_parsed_line(
            line,
            raw_line,
            &mut segment_table_batches,
            &mut schema,
            ingest_time,
            segment_duration,
            precision,
        )?;
    }

    let schema = match schema {
        Cow::Owned(s) => Some(s),
        Cow::Borrowed(_) => None,
    };

    let valid_segmented_data = segment_table_batches
        .into_iter()
        .map(|(segment_start, table_batches)| ValidSegmentedData {
            database_name: db_name.clone(),
            segment_start,
            table_batches: table_batches.table_batches,
            wal_op: WalOp::LpWrite(LpWriteOp {
                db_name: db_name.to_string(),
                lp: table_batches.lines.join("\n"),
                default_time: ingest_time.timestamp_nanos(),
                precision,
            }),
            starting_catalog_sequence_number,
        })
        .collect();

    Ok(ValidationResult {
        schema,
        line_count,
        field_count,
        tag_count,
        errors: vec![],
        valid_segmented_data,
    })
}

/// Check if the table exists in the schema and update the schema if it does not
// Because the entry API requires &mut it is not used to avoid a premature
// clone of the Cow.
fn validate_and_update_schema(line: &ParsedLine<'_>, schema: &mut Cow<'_, DatabaseSchema>) {
    let table_name = line.series.measurement.as_str();
    match schema.tables.get(table_name) {
        Some(t) => {
            // Collect new column definitions
            let mut new_cols = Vec::with_capacity(line.column_count() + 1);
            if let Some(tagset) = &line.series.tag_set {
                for (tag_key, _) in tagset {
                    if !t.column_exists(tag_key.as_str()) {
                        new_cols.push((tag_key.to_string(), ColumnType::Tag as i16));
                    }
                }
            }
            for (field_name, value) in &line.field_set {
                if !t.column_exists(field_name.as_str()) {
                    new_cols.push((field_name.to_string(), column_type_from_field(value) as i16));
                }
            }

            if !new_cols.is_empty() {
                let t = schema.to_mut().tables.get_mut(table_name).unwrap();
                t.add_columns(new_cols);
            }
        }
        None => {
            let mut columns = BTreeMap::new();
            if let Some(tag_set) = &line.series.tag_set {
                for (tag_key, _) in tag_set {
                    columns.insert(tag_key.to_string(), ColumnType::Tag as i16);
                }
            }
            for (field_name, value) in &line.field_set {
                columns.insert(field_name.to_string(), column_type_from_field(value) as i16);
            }

            columns.insert(TIME_COLUMN_NAME.to_string(), ColumnType::Time as i16);

            let table = TableDefinition::new(table_name, columns);

            assert!(schema
                .to_mut()
                .tables
                .insert(table_name.to_string(), table)
                .is_none());
        }
    };
}

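/// Converts a single parsed line into a `Row`: updates the schema with any new tables or
/// columns, converts tag and field values, assigns the ingest time to lines without a
/// timestamp, and adds the row (plus the raw line) to the table batch for the segment its
/// timestamp falls into.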
fn validate_and_convert_parsed_line<'a>(
    line: ParsedLine<'_>,
    raw_line: &'a str,
    segment_table_batches: &mut HashMap<Time, TableBatchMap<'a>>,
    schema: &mut Cow<'_, DatabaseSchema>,
    ingest_time: Time,
    segment_duration: SegmentDuration,
    precision: Precision,
) -> Result<()> {
    validate_and_update_schema(&line, schema);

    // now that we've ensured all columns exist in the schema, construct the actual row and values
    // while validating the column types match.
    let mut values = Vec::with_capacity(line.column_count() + 1);

    // validate tags, collecting any new ones that must be inserted, or adding the values
    if let Some(tag_set) = line.series.tag_set {
        for (tag_key, value) in tag_set {
            let value = Field {
                name: tag_key.to_string(),
                value: FieldData::Tag(value.to_string()),
            };
            values.push(value);
        }
    }

    // validate fields, collecting any new ones that must be inserted, or adding values
    for (field_name, value) in line.field_set {
        let field_data = match value {
            FieldValue::I64(v) => FieldData::Integer(v),
            FieldValue::F64(v) => FieldData::Float(v),
            FieldValue::U64(v) => FieldData::UInteger(v),
            FieldValue::Boolean(v) => FieldData::Boolean(v),
            FieldValue::String(v) => FieldData::String(v.to_string()),
        };
        let value = Field {
            name: field_name.to_string(),
            value: field_data,
        };
        values.push(value);
    }

    // set the time value
    let time_value_nanos = line
        .timestamp
        .map(|ts| {
            let multiplier = match precision {
                Precision::Auto => match crate::guess_precision(ts) {
                    Precision::Second => 1_000_000_000,
                    Precision::Millisecond => 1_000_000,
                    Precision::Microsecond => 1_000,
                    Precision::Nanosecond => 1,

                    Precision::Auto => unreachable!(),
                },
                Precision::Second => 1_000_000_000,
                Precision::Millisecond => 1_000_000,
                Precision::Microsecond => 1_000,
                Precision::Nanosecond => 1,
            };

            ts * multiplier
        })
        .unwrap_or(ingest_time.timestamp_nanos());

    let segment_start = segment_duration.start_time(time_value_nanos / 1_000_000_000);

    values.push(Field {
        name: TIME_COLUMN_NAME.to_string(),
        value: FieldData::Timestamp(time_value_nanos),
    });

    let table_batch_map = segment_table_batches.entry(segment_start).or_default();

    let table_batch = table_batch_map
        .table_batches
        .entry(line.series.measurement.to_string())
        .or_default();
    table_batch.rows.push(Row {
        time: time_value_nanos,
        fields: values,
    });

    table_batch_map.lines.push(raw_line);

    Ok(())
}

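/// The rows buffered for a single table within one segmented write.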
#[derive(Debug, Default)]
pub(crate) struct TableBatch {
    #[allow(dead_code)]
    pub(crate) name: String,
    pub(crate) rows: Vec<Row>,
}

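/// A single row of data: its time and the field values (tags and the time column are carried
/// as fields here as well).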
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Row {
    pub(crate) time: i64,
    pub(crate) fields: Vec<Field>,
}

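/// A named column value within a row.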
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Field {
    pub(crate) name: String,
    pub(crate) value: FieldData,
}

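/// The value of a column in a row: a timestamp, a tag, or one of the supported field types.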
#[derive(Clone, Debug)]
pub(crate) enum FieldData {
    Timestamp(i64),
    Tag(String),
    String(String),
    Integer(i64),
    UInteger(u64),
    Float(f64),
    Boolean(bool),
}

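// Equality is implemented by hand because `Eq` can't be derived for a type containing `f64`;
// float values are compared with `==` here.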
impl PartialEq for FieldData {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (FieldData::Timestamp(a), FieldData::Timestamp(b)) => a == b,
            (FieldData::Tag(a), FieldData::Tag(b)) => a == b,
            (FieldData::String(a), FieldData::String(b)) => a == b,
            (FieldData::Integer(a), FieldData::Integer(b)) => a == b,
            (FieldData::UInteger(a), FieldData::UInteger(b)) => a == b,
            (FieldData::Float(a), FieldData::Float(b)) => a == b,
            (FieldData::Boolean(a), FieldData::Boolean(b)) => a == b,
            _ => false,
        }
    }
}

impl Eq for FieldData {}

/// Result of the validation. If the DatabaseSchema was updated, it will be in the result.
#[derive(Debug, Default)]
#[allow(dead_code)]
pub(crate) struct ValidationResult {
    /// If the database schema is updated with new tables or columns it will be here, which
    /// can be used to update the cache.
    pub(crate) schema: Option<DatabaseSchema>,
    /// Number of lines passed in
    pub(crate) line_count: usize,
    /// Number of fields passed in
    pub(crate) field_count: usize,
    /// Number of tags passed in
    pub(crate) tag_count: usize,
    /// Any errors that occurred while parsing the lines
    pub(crate) errors: Vec<crate::WriteLineError>,
    /// Only valid lines from what was passed in to validate, segmented based on the
    /// timestamps of the data.
    pub(crate) valid_segmented_data: Vec<ValidSegmentedData>,
}

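/// The validated data that falls within a single segment: the per-table batches to buffer and
/// the WAL operation representing the write.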
#[derive(Debug)]
pub(crate) struct ValidSegmentedData {
    pub(crate) database_name: NamespaceName<'static>,
    pub(crate) segment_start: Time,
    pub(crate) table_batches: HashMap<String, TableBatch>,
    pub(crate) wal_op: WalOp,
    /// The sequence number of the catalog before any updates were applied based on this write.
    pub(crate) starting_catalog_sequence_number: SequenceNumber,
}

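/// The raw lines and per-table batches that fall within a single segment's time range.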
#[derive(Debug, Default)]
pub(crate) struct TableBatchMap<'a> {
    pub(crate) lines: Vec<&'a str>,
    pub(crate) table_batches: HashMap<String, TableBatch>,
}

/// The 32 byte SHA256 digest of the full tag set for a line of measurement data
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
pub struct SeriesId([u8; 32]);

impl std::fmt::Display for SeriesId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", hex::encode(self.0))
    }
}

fn default_series_sha() -> &'static [u8; 32] {
    static DEFAULT_SERIES_ID_SHA: OnceLock<[u8; 32]> = OnceLock::new();
    // the unwrap is safe here because the Sha256 digest will always be 32 bytes:
    DEFAULT_SERIES_ID_SHA.get_or_init(|| Sha256::digest("")[..].try_into().unwrap())
}

impl Default for SeriesId {
    fn default() -> Self {
        Self(default_series_sha().to_owned())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::persister::PersisterImpl;
    use crate::wal::WalImpl;
    use crate::{SegmentId, SequenceNumber, WalOpBatch};
    use arrow::record_batch::RecordBatch;
    use arrow_util::assert_batches_eq;
    use datafusion_util::config::register_iox_object_store;
    use iox_query::exec::IOxSessionContext;
    use iox_time::{MockProvider, Time};
    use object_store::memory::InMemory;
    use object_store::ObjectStore;

    #[test]
    fn parse_lp_into_buffer() {
        let db = Arc::new(DatabaseSchema::new("foo"));
        let db_name = NamespaceName::new("foo").unwrap();
        let lp = "cpu,region=west user=23.2 100\nfoo f1=1i";
        let result = parse_validate_and_update_schema(
            lp,
            &db,
            db_name,
            Time::from_timestamp_nanos(0),
            SegmentDuration::new_5m(),
            false,
            Precision::Nanosecond,
            SequenceNumber::new(0),
        )
        .unwrap();

        let db = result.schema.unwrap();

        assert_eq!(db.tables.len(), 2);
        assert_eq!(db.tables.get("cpu").unwrap().columns().len(), 3);
        assert_eq!(db.tables.get("foo").unwrap().columns().len(), 2);
    }

    #[tokio::test]
    async fn buffers_and_persists_to_wal() {
        let dir = test_helpers::tmp_dir().unwrap().into_path();
        let wal = WalImpl::new(dir.clone()).unwrap();
        let object_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        let persister = Arc::new(PersisterImpl::new(Arc::clone(&object_store)));
        let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
        let segment_duration = SegmentDuration::new_5m();
        let write_buffer = WriteBufferImpl::new(
            Arc::clone(&persister),
            Some(Arc::new(wal)),
            Arc::clone(&time_provider),
            segment_duration,
            crate::test_help::make_exec(),
        )
        .await
        .unwrap();

        let summary = write_buffer
            .write_lp(
                NamespaceName::new("foo").unwrap(),
                "cpu bar=1 10",
                Time::from_timestamp_nanos(123),
                false,
                Precision::Nanosecond,
            )
            .await
            .unwrap();
        assert_eq!(summary.line_count, 1);
        assert_eq!(summary.field_count, 1);
        assert_eq!(summary.tag_count, 0);

        // ensure the data is in the buffer
        let actual = write_buffer.get_table_record_batches("foo", "cpu");
        let expected = [
            "+-----+--------------------------------+",
            "| bar | time                           |",
            "+-----+--------------------------------+",
            "| 1.0 | 1970-01-01T00:00:00.000000010Z |",
            "+-----+--------------------------------+",
        ];
        assert_batches_eq!(&expected, &actual);

        // ensure the data is in the wal
        let wal = WalImpl::new(dir).unwrap();
        let mut reader = wal.open_segment_reader(SegmentId::new(1)).unwrap();
        let batch = reader.next_batch().unwrap().unwrap();
        let expected_batch = WalOpBatch {
            sequence_number: SequenceNumber::new(1),
            ops: vec![WalOp::LpWrite(LpWriteOp {
                db_name: "foo".to_string(),
                lp: "cpu bar=1 10".to_string(),
                default_time: 123,
                precision: Precision::Nanosecond,
            })],
        };
        assert_eq!(batch, expected_batch);

        // ensure we load state from the persister
        let write_buffer = WriteBufferImpl::new(
            persister,
            Some(Arc::new(wal)),
            time_provider,
            segment_duration,
            crate::test_help::make_exec(),
        )
        .await
        .unwrap();
        let actual = write_buffer.get_table_record_batches("foo", "cpu");
        assert_batches_eq!(&expected, &actual);
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn returns_chunks_across_buffered_persisted_and_persisting_data() {
        let dir = test_helpers::tmp_dir().unwrap().into_path();
        let wal = Some(Arc::new(WalImpl::new(dir.clone()).unwrap()));
        let object_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        let persister = Arc::new(PersisterImpl::new(Arc::clone(&object_store)));
        let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
        let segment_duration = SegmentDuration::new_5m();
        let write_buffer = WriteBufferImpl::new(
            Arc::clone(&persister),
            wal.clone(),
            Arc::clone(&time_provider),
            segment_duration,
            crate::test_help::make_exec(),
        )
        .await
        .unwrap();
        let session_context = IOxSessionContext::with_testing();
        let runtime_env = session_context.inner().runtime_env();
        register_iox_object_store(runtime_env, "influxdb3", Arc::clone(&object_store));

        let _ = write_buffer
            .write_lp(
                NamespaceName::new("foo").unwrap(),
                "cpu bar=1 10",
                Time::from_timestamp_nanos(123),
                false,
                Precision::Nanosecond,
            )
            .await
            .unwrap();

        let expected = [
            "+-----+--------------------------------+",
            "| bar | time                           |",
            "+-----+--------------------------------+",
            "| 1.0 | 1970-01-01T00:00:00.000000010Z |",
            "+-----+--------------------------------+",
        ];
        let actual = get_table_batches(&write_buffer, "foo", "cpu", &session_context).await;
        assert_batches_eq!(&expected, &actual);

        // advance the time and wait for it to persist
        time_provider.set(Time::from_timestamp(800, 0).unwrap());
        loop {
            let segment_state = write_buffer.segment_state.read();
            if !segment_state.persisted_segments().is_empty() {
                break;
            }
        }

        // nothing should be open at this point
        assert!(write_buffer
            .segment_state
            .read()
            .open_segment_times()
            .is_empty());

        // verify we get the persisted data
        let actual = get_table_batches(&write_buffer, "foo", "cpu", &session_context).await;
        assert_batches_eq!(&expected, &actual);

        // now write some into the next segment we're in and verify we get both buffer and persisted
        let _ = write_buffer
            .write_lp(
                NamespaceName::new("foo").unwrap(),
                "cpu bar=2",
                Time::from_timestamp(900, 0).unwrap(),
                false,
                Precision::Nanosecond,
            )
            .await
            .unwrap();
        let expected = [
            "+-----+--------------------------------+",
            "| bar | time                           |",
            "+-----+--------------------------------+",
            "| 2.0 | 1970-01-01T00:15:00Z           |",
            "| 1.0 | 1970-01-01T00:00:00.000000010Z |",
            "+-----+--------------------------------+",
        ];
        let actual = get_table_batches(&write_buffer, "foo", "cpu", &session_context).await;
        assert_batches_eq!(&expected, &actual);

        // and now reload the buffer and verify that we get persisted and the buffer again
        let write_buffer = WriteBufferImpl::new(
            Arc::clone(&persister),
            wal,
            Arc::clone(&time_provider),
            segment_duration,
            crate::test_help::make_exec(),
        )
        .await
        .unwrap();
        let actual = get_table_batches(&write_buffer, "foo", "cpu", &session_context).await;
        assert_batches_eq!(&expected, &actual);

        // and now add to the buffer and verify that we still only get two chunks
        let _ = write_buffer
            .write_lp(
                NamespaceName::new("foo").unwrap(),
                "cpu bar=3",
                Time::from_timestamp(950, 0).unwrap(),
                false,
                Precision::Nanosecond,
            )
            .await
            .unwrap();
        let expected = [
            "+-----+--------------------------------+",
            "| bar | time                           |",
            "+-----+--------------------------------+",
            "| 2.0 | 1970-01-01T00:15:00Z           |",
            "| 3.0 | 1970-01-01T00:15:50Z           |",
            "| 1.0 | 1970-01-01T00:00:00.000000010Z |",
            "+-----+--------------------------------+",
        ];
        let actual = get_table_batches(&write_buffer, "foo", "cpu", &session_context).await;
        assert_batches_eq!(&expected, &actual);
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn sets_starting_catalog_number_on_new_segment() {
        let dir = test_helpers::tmp_dir().unwrap().into_path();
        let wal = Some(Arc::new(WalImpl::new(dir.clone()).unwrap()));
        let object_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
        let persister = Arc::new(PersisterImpl::new(Arc::clone(&object_store)));
        let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_nanos(0)));
        let segment_duration = SegmentDuration::new_5m();
        let write_buffer = WriteBufferImpl::new(
            Arc::clone(&persister),
            wal.clone(),
            Arc::clone(&time_provider),
            segment_duration,
            crate::test_help::make_exec(),
        )
        .await
        .unwrap();
        let starting_catalog_sequence_number = write_buffer.catalog().sequence_number();

        let session_context = IOxSessionContext::with_testing();
        let runtime_env = session_context.inner().runtime_env();
        register_iox_object_store(runtime_env, "influxdb3", Arc::clone(&object_store));

        // write data into the buffer that will go into a new segment
        let new_segment_time = Time::from_timestamp(360, 0).unwrap();
        let _ = write_buffer
            .write_lp(
                NamespaceName::new("foo").unwrap(),
                "cpu bar=1",
                new_segment_time,
                false,
                Precision::Nanosecond,
            )
            .await
            .unwrap();

        let expected = [
            "+-----+----------------------+",
            "| bar | time                 |",
            "+-----+----------------------+",
            "| 1.0 | 1970-01-01T00:06:00Z |",
            "+-----+----------------------+",
        ];
        let actual = get_table_batches(&write_buffer, "foo", "cpu", &session_context).await;
        assert_batches_eq!(&expected, &actual);

        // get the segment for the new_segment_time and validate that it has the correct starting catalog sequence number
        let state = write_buffer.segment_state.read();
        let segment_start_time = SegmentDuration::new_5m().start_time(new_segment_time.timestamp());
        let segment = state.segment_for_time(segment_start_time).unwrap();
        assert_eq!(
            segment.starting_catalog_sequence_number(),
            starting_catalog_sequence_number
        );
    }

    async fn get_table_batches(
        write_buffer: &WriteBufferImpl<WalImpl, MockProvider>,
        database_name: &str,
        table_name: &str,
        ctx: &IOxSessionContext,
    ) -> Vec<RecordBatch> {
        let chunks = write_buffer
            .get_table_chunks(database_name, table_name, &[], None, &ctx.inner().state())
            .unwrap();
        let mut batches = vec![];
        for chunk in chunks {
            let chunk = chunk
                .data()
                .read_to_batches(chunk.schema(), ctx.inner())
                .await;
            batches.extend(chunk);
        }
        batches
    }
}