diff --git a/Cargo.lock b/Cargo.lock index 509d306f21..273eff94bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -427,6 +427,21 @@ dependencies = [ "shlex", ] +[[package]] +name = "bit-set" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e11e16035ea35e4e5997b393eacbf6f63983188f7a2ad25bfb13465f5ad59de" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + [[package]] name = "bitflags" version = "1.3.2" @@ -1867,7 +1882,7 @@ dependencies = [ "log", "pest", "pest_derive", - "quick-error", + "quick-error 2.0.1", "serde", "serde_json", ] @@ -4402,15 +4417,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e0d9cc07f18492d879586c92b485def06bc850da3118075cd45d50e9c95b0e5" dependencies = [ + "bit-set", "bitflags", "byteorder", "lazy_static", "num-traits", - "quick-error", + "quick-error 2.0.1", "rand", "rand_chacha", "rand_xorshift", "regex-syntax", + "rusty-fork", + "tempfile", ] [[package]] @@ -4605,6 +4623,12 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quick-error" version = "2.0.1" @@ -4741,6 +4765,7 @@ dependencies = [ "packers", "parking_lot 0.12.0", "permutation", + "proptest", "rand", "rand_distr", "schema", @@ -5144,6 +5169,18 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error 1.2.3", + "tempfile", + "wait-timeout", +] + [[package]] name = "rustyline" version = "9.1.2" diff --git a/read_buffer/Cargo.toml b/read_buffer/Cargo.toml index bf88b78a37..1d2ca6ada1 100644 --- a/read_buffer/Cargo.toml +++ b/read_buffer/Cargo.toml @@ -30,6 +30,7 @@ workspace-hack = { path = "../workspace-hack"} [dev-dependencies] # In alphabetical order criterion = "0.3.3" packers = { path = "../packers" } +proptest = "1.0.0" rand = "0.8.3" rand_distr = "0.4.2" test_helpers = { path = "../test_helpers" } diff --git a/read_buffer/proptest-regressions/column/encoding/scalar/fixed_null.txt b/read_buffer/proptest-regressions/column/encoding/scalar/fixed_null.txt new file mode 100644 index 0000000000..2f8508e705 --- /dev/null +++ b/read_buffer/proptest-regressions/column/encoding/scalar/fixed_null.txt @@ -0,0 +1,13 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc 836582419d74d26bd8cf2f79d7bc39825100fe2d597d704fde9b7669c9f9b206 # shrinks to arr = []
+cc 503e571344ed7bf06f55f02058ad7ca9c53561652d04f8ff8583b8690548503c # shrinks to arr = []
+cc 0867ea10ca74e47a208731accc4ff17826a5c1a05f69cbab7c0e7db73886c3cb # shrinks to arr = []
+cc 98b86d843e5d9d685c3e9ea505f15b26c247d7f7e478d98037cbfdb5c7ec7791 # shrinks to arr = []
+cc 7adf8aadb9a325578d038998c20c1b7f4af0d6cd7bc16c5fa4379cf9a4471466 # shrinks to arr = []
+cc 5282c6eeaef6a1096e4bcb9ac460db53d9a815eca9d8a0f29b53d70ffe78f775 # shrinks to arr = []
+cc 7319175e674e0e859251ffc3c136cc163010c2ef0df90f830d067b99964081c6 # shrinks to arr = []
diff --git a/read_buffer/src/column/encoding/scalar/fixed_null.rs b/read_buffer/src/column/encoding/scalar/fixed_null.rs
index 064c13ed96..6b02c79385 100644
--- a/read_buffer/src/column/encoding/scalar/fixed_null.rs
+++ b/read_buffer/src/column/encoding/scalar/fixed_null.rs
@@ -516,8 +516,9 @@ mod test {
     use std::sync::Arc;
 
     use arrow::datatypes::*;
+    use proptest::prelude::*;
 
-    use super::super::transcoders::MockTranscoder;
+    use super::super::transcoders::{ByteTrimmer, MockTranscoder, NoOpTranscoder};
     use super::cmp::Operator;
     use super::*;
 
@@ -908,4 +909,120 @@ mod test {
         let (v, _) = new_mock_encoding(vec![None, Some(100), Some(222)]);
         assert!(v.has_any_non_null_value());
     }
+
+    // This macro builds out property tests for the integer byte trimmer encoder.
+    // Each of the supported logical types (i64, u64) is tested with transcoders
+    // that store encoded values physically as (i32, u32, i16, u16, i8, u8)
+    // depending on logical type and value range.
+    macro_rules! make_test_transcoder_integer_bytetrimer {
+        (($logical:ty, $logical_arrow:ty, $physical:ty, $physical_arrow:ty, $fn_name:ident)) => {
+            proptest! {
+                #[test]
+                // The proptest strategy will generate vectors of values within the physical type
+                // bounds, ensuring they can be safely encoded.
+                // The strategy effectively says:
+                //
+                // Generate vectors of `Option<$logical>` where ~10% of the values will be `None`.
+                // Generate values according to the provided range, and generate
+                // `n` of them according to the size range `0..=50`.
+                fn $fn_name(arr in prop::collection::vec(proptest::option::weighted(0.9, <$physical>::MIN as $logical ..=<$physical>::MAX as $logical), 0..=50)) {
+                    // The control encoding is just a null-supporting array
+                    // implementation with no compression. We will check that all
+                    // encodings under test behave in the same way as this one.
+                    let control = FixedNull::new(
+                        PrimitiveArray::<$logical_arrow>::from(arr.clone()),
+                        NoOpTranscoder {},
+                    );
+
+                    let transcoder = ByteTrimmer {};
+                    let byte_trimmed = FixedNull::<$physical_arrow, $logical, _>::new(
+                        arr.into_iter()
+                            .map(|v| v.map(|v| transcoder.encode(v)))
+                            .collect::<PrimitiveArray<$physical_arrow>>(), // e.g. encode u64 as u8
+                        transcoder,
+                    );
+
+                    // exercise some physical operations
+                    let mut cases = vec![];
+                    for op in ["<", "<=", ">", ">=", "=", "!="] {
+                        for v in [
+                            <$physical>::MIN,
+                            <$physical>::MIN + 1,
+                            <$physical>::MAX / 10,
+                            <$physical>::MAX / 4,
+                            <$physical>::MAX / 2,
+                            <$physical>::MAX - 1,
+                            <$physical>::MAX,
+                        ] {
+                            cases.push((op, v as $logical));
+                        }
+                    }
+
+                    for (op, v) in cases {
+                        let row_ids_control = control.row_ids_filter(
+                            v,
+                            &cmp::Operator::try_from(op).unwrap(),
+                            RowIDs::new_vector(),
+                        );
+                        let row_ids_trimmed = byte_trimmed.row_ids_filter(
+                            v,
+                            &cmp::Operator::try_from(op).unwrap(),
+                            RowIDs::new_vector(),
+                        );
+                        prop_assert_eq!(row_ids_control, row_ids_trimmed)
+                    }
+                }
+            }
+        };
+    }
+
+    make_test_transcoder_integer_bytetrimer!((
+        u64,
+        UInt64Type,
+        u8,
+        UInt8Type,
+        test_transcoder_byte_trim_u64_to_u8
+    ));
+    make_test_transcoder_integer_bytetrimer!((
+        u64,
+        UInt64Type,
+        u16,
+        UInt16Type,
+        test_transcoder_byte_trim_u64_to_u16
+    ));
+    make_test_transcoder_integer_bytetrimer!((
+        u64,
+        UInt64Type,
+        u32,
+        UInt32Type,
+        test_transcoder_byte_trim_u64_to_u32
+    ));
+    make_test_transcoder_integer_bytetrimer!((
+        i64,
+        Int64Type,
+        i8,
+        Int8Type,
+        test_transcoder_byte_trim_i64_to_i8
+    ));
+    make_test_transcoder_integer_bytetrimer!((
+        i64,
+        Int64Type,
+        u8,
+        UInt8Type,
+        test_transcoder_byte_trim_i64_to_u8
+    ));
+    make_test_transcoder_integer_bytetrimer!((
+        i64,
+        Int64Type,
+        i16,
+        Int16Type,
+        test_transcoder_byte_trim_i64_to_i16
+    ));
+    make_test_transcoder_integer_bytetrimer!((
+        i64,
+        Int64Type,
+        u16,
+        UInt16Type,
+        test_transcoder_byte_trim_i64_to_u16
+    ));
 }