refactor: remove row_count from IoxMetadata

Remove the redundant row_count from the IoxMetadata structure that is
serialised into the Parquet file.

The reasoning is twofold:

    * The Parquet file's native metadata already contains a row count
    * Needing to know the number of rows up-front precludes streaming
pull/24376/head
Dom Dwyer 2022-05-20 17:18:11 +01:00
parent 71555ee55c
commit a142a9eb57
10 changed files with 10 additions and 40 deletions

View File

@ -746,12 +746,8 @@ impl Compactor {
);
let row_count: usize = output_batches.iter().map(|b| b.num_rows()).sum();
let row_count = row_count.try_into().context(RowCountTypeConversionSnafu)?;
debug!("got {} rows from stream {}", row_count, i);
if row_count == 0 {
continue;
}
// Compute min and max of the `time` column
let (min_time, max_time) =
@ -771,7 +767,6 @@ impl Compactor {
time_of_last_write: Time::from_timestamp_nanos(max_time),
min_sequence_number,
max_sequence_number,
row_count,
compaction_level: 1, // compacted result files always have level 1
sort_key: Some(sort_key.clone()),
};
@ -2410,7 +2405,6 @@ mod tests {
time_of_last_write: compactor.time_provider.now(),
min_sequence_number: SequenceNumber::new(5),
max_sequence_number: SequenceNumber::new(6),
row_count: 3,
compaction_level: 1, // level of compacted data is always 1
sort_key: Some(SortKey::from_columns(["tag1", "time"])),
};
@ -2558,7 +2552,6 @@ mod tests {
time_of_last_write: compactor.time_provider.now(),
min_sequence_number: SequenceNumber::new(5),
max_sequence_number: SequenceNumber::new(6),
row_count: 3,
compaction_level: 1, // file level of compacted file is always 1
sort_key: None,
};

View File

@ -7,6 +7,13 @@ import "google/protobuf/timestamp.proto";
// IOx-specific metadata that will be serialized into the file-level key-value Parquet metadata
// under a single key.
message IoxMetadata {
// Removed as the Parquet metadata itself contains the row count, and
// specifying it here creates a dependency that prevents streaming
// serialisation (needing to know the number of rows before you can serialise
// your parquet file with this metadata structure within it)
reserved 14;
reserved "row_count";
// Object store ID. Used in the parquet filename. 16 bytes in big-endian order.
bytes object_store_id = 1;
@ -46,9 +53,6 @@ message IoxMetadata {
// The maximum sequence number from a sequencer in this parquet file.
int64 max_sequence_number = 13;
// Number of rows of data in this file.
int64 row_count = 14;
// The sort key of this chunk
SortKey sort_key = 15;

View File

@ -105,9 +105,6 @@ pub async fn compact_persisting_batch(
.filter(|b| b.num_rows() != 0)
.collect();
let row_count: usize = output_batches.iter().map(|b| b.num_rows()).sum();
let row_count = row_count.try_into().context(RowCountTypeConversionSnafu)?;
// Compute min and max of the `time` column
let (min_time, max_time) =
compute_timenanosecond_min_max(&output_batches).context(MinMaxSnafu)?;
@ -129,7 +126,6 @@ pub async fn compact_persisting_batch(
time_of_last_write: Time::from_timestamp_nanos(max_time),
min_sequence_number: min_seq,
max_sequence_number: max_seq,
row_count,
compaction_level: INITIAL_COMPACTION_LEVEL,
sort_key: Some(metadata_sort_key),
};
@ -337,7 +333,6 @@ mod tests {
20000,
seq_num_start,
seq_num_end,
3,
INITIAL_COMPACTION_LEVEL,
Some(SortKey::from_columns(["tag1", "time"])),
);
@ -432,7 +427,6 @@ mod tests {
220000,
seq_num_start,
seq_num_end,
4,
INITIAL_COMPACTION_LEVEL,
// Sort key should now be set
Some(SortKey::from_columns(["tag1", "tag3", "time"])),
@ -530,7 +524,6 @@ mod tests {
220000,
seq_num_start,
seq_num_end,
4,
INITIAL_COMPACTION_LEVEL,
// The sort key in the metadata should be the same as specified (that is, not
// recomputed)
@ -628,7 +621,6 @@ mod tests {
220000,
seq_num_start,
seq_num_end,
4,
INITIAL_COMPACTION_LEVEL,
// The sort key in the metadata should be updated to include the new column just before
// the time column
@ -729,7 +721,6 @@ mod tests {
220000,
seq_num_start,
seq_num_end,
4,
INITIAL_COMPACTION_LEVEL,
// The sort key in the metadata should only contain the columns in this file
Some(SortKey::from_columns(["tag3", "tag1", "time"])),

View File

@ -77,7 +77,6 @@ mod tests {
time_of_last_write: now(),
min_sequence_number: SequenceNumber::new(5),
max_sequence_number: SequenceNumber::new(6),
row_count: 0,
compaction_level: INITIAL_COMPACTION_LEVEL,
sort_key: None,
};
@ -111,7 +110,6 @@ mod tests {
time_of_last_write: now(),
min_sequence_number: SequenceNumber::new(5),
max_sequence_number: SequenceNumber::new(6),
row_count: 3,
compaction_level: INITIAL_COMPACTION_LEVEL,
sort_key: None,
};

View File

@ -64,8 +64,6 @@ pub async fn make_persisting_batch_with_meta() -> (Arc<PersistingBatch>, Vec<Tom
let namespace_id = 1;
let table_id = 1;
let partition_id = 1;
let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
let row_count = row_count.try_into().unwrap();
// make the persisting batch
let persisting_batch = make_persisting_batch(
@ -95,7 +93,6 @@ pub async fn make_persisting_batch_with_meta() -> (Arc<PersistingBatch>, Vec<Tom
7000,
seq_num_start,
seq_num_end,
row_count,
INITIAL_COMPACTION_LEVEL,
Some(SortKey::from_columns(vec!["tag1", "tag2", "time"])),
);
@ -139,7 +136,6 @@ pub fn make_meta(
max_time: i64,
min_sequence_number: i64,
max_sequence_number: i64,
row_count: i64,
compaction_level: i16,
sort_key: Option<SortKey>,
) -> IoxMetadata {
@ -157,7 +153,6 @@ pub fn make_meta(
time_of_last_write: Time::from_timestamp_nanos(max_time),
min_sequence_number: SequenceNumber::new(min_sequence_number),
max_sequence_number: SequenceNumber::new(max_sequence_number),
row_count,
compaction_level,
sort_key,
}

View File

@ -521,7 +521,6 @@ impl TestPartition {
time_of_last_write: Time::from_timestamp_nanos(max_time),
min_sequence_number,
max_sequence_number,
row_count: row_count as i64,
compaction_level: INITIAL_COMPACTION_LEVEL,
sort_key: Some(sort_key.clone()),
};

View File

@ -279,9 +279,6 @@ pub struct IoxMetadata {
/// sequence number of the last write
pub max_sequence_number: SequenceNumber,
/// number of rows of data
pub row_count: i64,
/// the compaction level of the file
pub compaction_level: i16,
@ -317,7 +314,6 @@ impl IoxMetadata {
time_of_last_write: Some(self.time_of_last_write.date_time().into()),
min_sequence_number: self.min_sequence_number.get(),
max_sequence_number: self.max_sequence_number.get(),
row_count: self.row_count,
sort_key,
compaction_level: self.compaction_level as i32,
};
@ -377,7 +373,6 @@ impl IoxMetadata {
time_of_last_write,
min_sequence_number: SequenceNumber::new(proto_msg.min_sequence_number),
max_sequence_number: SequenceNumber::new(proto_msg.max_sequence_number),
row_count: proto_msg.row_count,
sort_key,
compaction_level: proto_msg.compaction_level as i16,
})
@ -394,6 +389,7 @@ impl IoxMetadata {
file_size_bytes: usize,
metadata: &IoxParquetMetaData,
) -> ParquetFileParams {
let row_count = metadata.decode().expect("invalid metadata").row_count();
ParquetFileParams {
sequencer_id: self.sequencer_id,
namespace_id: self.namespace_id,
@ -406,8 +402,8 @@ impl IoxMetadata {
max_time: Timestamp::new(self.time_of_last_write.timestamp_nanos()),
file_size_bytes: file_size_bytes as i64,
parquet_metadata: metadata.thrift_bytes().to_vec(),
row_count: self.row_count,
compaction_level: self.compaction_level,
row_count: row_count.try_into().expect("row count overflows i64"),
created_at: Timestamp::new(self.creation_timestamp.timestamp_nanos()),
}
}
@ -878,7 +874,6 @@ mod tests {
time_of_last_write: Time::from_timestamp(3234, 3456),
min_sequence_number: SequenceNumber::new(5),
max_sequence_number: SequenceNumber::new(6),
row_count: 3,
compaction_level: 0,
sort_key: Some(sort_key),
};
@ -906,7 +901,6 @@ mod tests {
time_of_last_write: Time::from_timestamp_nanos(424242),
min_sequence_number: SequenceNumber::new(10),
max_sequence_number: SequenceNumber::new(11),
row_count: 1000,
compaction_level: 1,
sort_key: None,
};

View File

@ -180,7 +180,6 @@ mod tests {
time_of_last_write: Time::from_timestamp_nanos(424242),
min_sequence_number: SequenceNumber::new(10),
max_sequence_number: SequenceNumber::new(11),
row_count: 1000,
compaction_level: 1,
sort_key: None,
};

View File

@ -294,7 +294,6 @@ mod tests {
time_of_last_write: Time::from_timestamp_nanos(424242),
min_sequence_number: SequenceNumber::new(10),
max_sequence_number: SequenceNumber::new(11),
row_count: 1000,
compaction_level: 1,
sort_key: None,
};

View File

@ -47,7 +47,6 @@ async fn test_decoded_iox_metadata() {
min_sequence_number: SequenceNumber::new(10),
max_sequence_number: SequenceNumber::new(11),
compaction_level: 1,
row_count: 12341234,
sort_key: None,
};
@ -116,7 +115,6 @@ async fn test_derive_parquet_file_params() {
min_sequence_number: SequenceNumber::new(10),
max_sequence_number: SequenceNumber::new(11),
compaction_level: 1,
row_count: 12341234,
sort_key: None,
};
@ -157,11 +155,11 @@ async fn test_derive_parquet_file_params() {
assert_eq!(catalog_data.file_size_bytes, file_size as i64);
assert_eq!(catalog_data.compaction_level, meta.compaction_level);
assert_eq!(catalog_data.created_at, Timestamp::new(1234));
assert_eq!(catalog_data.row_count, 3);
// NOTE: these DO NOT reflect the actual values! These values were not
// derived from the actual data, but instead trusted from the input
// IoxMetadata.
assert_eq!(catalog_data.row_count, meta.row_count);
assert_eq!(catalog_data.min_time, Timestamp::new(4242));
assert_eq!(catalog_data.max_time, Timestamp::new(424242));
}