feat: arrow_util truncate and bitset append (#2879)

* feat: arrow_util truncate and bitset append

* chore: check still mutable after truncate

* chore: review feedback

Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>
pull/24376/head
Raphael Taylor-Davies 2021-10-18 16:58:17 +01:00 committed by GitHub
parent f5a84122e3
commit 1518f30da3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 150 additions and 0 deletions

View File

@ -35,8 +35,41 @@ impl BitSet {
self.buffer.resize(new_buf_len, 0);
}
/// Appends `count` bits to the end of the bitset, every one of them set
pub fn append_set(&mut self, count: usize) {
    let total_bits = self.len + count;
    let total_bytes = (total_bits + 7) / 8;

    // A partially filled final byte has its unused high bits switched on
    // first, so the new run of set bits starts right after the existing ones.
    let occupied = self.len % 8;
    if occupied != 0 {
        let tail = self.buffer.last_mut().unwrap();
        *tail |= 0xFF << occupied;
    }

    // Any additional bytes are created with all bits set
    self.buffer.resize(total_bytes, 0xFF);

    // Zero whatever lies past the new logical end within the final byte
    let spill = total_bits % 8;
    if spill != 0 {
        let tail = self.buffer.last_mut().unwrap();
        *tail &= (1 << spill) - 1;
    }

    self.len = total_bits;
}
/// Truncates the bitset to the provided length
///
/// Bits at positions `len` and above are discarded and the unused high bits
/// of the last retained byte are cleared. If `len` is greater than the
/// current length the bitset is left untouched, mirroring `Vec::truncate`.
pub fn truncate(&mut self, len: usize) {
    // Growing is not truncation. Without this guard the mask below would be
    // applied to a byte that still holds live bits (or to an empty buffer,
    // panicking on `unwrap`), and `self.len` would point past the buffer.
    if len > self.len {
        return;
    }

    let new_buf_len = (len + 7) >> 3;
    self.buffer.truncate(new_buf_len);

    // Clear the bits at positions >= `len` in the last retained byte
    let overrun = len & 7;
    if overrun > 0 {
        *self.buffer.last_mut().unwrap() &= (1 << overrun) - 1;
    }
    self.len = len;
}
/// Appends `count` boolean values from the slice of packed bits
pub fn append_bits(&mut self, count: usize, to_set: &[u8]) {
assert_eq!((count + 7) >> 3, to_set.len());
let new_len = self.len + count;
let new_buf_len = (new_len + 7) >> 3;
self.buffer.reserve(new_buf_len - self.buffer.len());
@ -252,6 +285,53 @@ mod tests {
}
}
#[test]
fn test_append_fuzz() {
    let mut mask = BitSet::new();
    let mut expected = vec![];
    let mut rng = rand::thread_rng();

    for _ in 0..100 {
        // Draw a run length in 0..32, then whether the run is set or unset
        let run = (rng.next_u32() % 32) as usize;
        let value = rng.next_u32() & 1 == 0;

        if value {
            mask.append_set(run);
        } else {
            mask.append_unset(run);
        }

        // Mirror the append in the plain bool vector and compare the packed
        // form of that reference against the bitset's bytes
        expected.extend(std::iter::repeat(value).take(run));
        let packed = compact_bools(&expected);
        assert_eq!(mask.buffer, packed);
    }
}
#[test]
fn test_truncate_fuzz() {
    let mut mask = BitSet::new();
    let mut all_bools = vec![];
    let mut rng = rand::thread_rng();

    for _ in 0..100 {
        // Append a random run of 0..32 random bits to both representations
        let mask_length = (rng.next_u32() % 32) as usize;
        let bools: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() & 1 == 0))
            .take(mask_length)
            .collect();

        let collected = compact_bools(&bools);
        mask.append_bits(mask_length, &collected);
        all_bools.extend_from_slice(&bools);

        // Pick a target length in `0..=all_bools.len()`. Taking the modulus
        // by `len + 1` rather than `len` avoids a remainder-by-zero panic
        // when nothing has been accumulated yet (a zero-length append on an
        // empty mask), and also covers truncating to the full length.
        let truncate = rng.next_u32() as usize % (all_bools.len() + 1);
        mask.truncate(truncate);
        all_bools.truncate(truncate);

        let collected = compact_bools(&all_bools);
        assert_eq!(mask.buffer, collected);
    }
}
#[test]
fn test_arrow_compat() {
let bools = &[

View File

@ -112,6 +112,19 @@ impl<K: AsPrimitive<usize> + FromPrimitive + Zero> StringDictionary<K> {
pub fn into_inner(self) -> PackedStringArray<K> {
// Consumes the dictionary and hands its backing `storage` array to the caller
self.storage
}
/// Shrinks this dictionary so that only keys less than or equal to `id` remain
pub fn truncate(&mut self, id: K) {
    let max_key = id.as_();

    // Forget the dedup entries of every key above the cut-off
    self.dedup.retain(|key, _| key.as_() <= max_key);

    // Keys `0..=max_key` correspond to the first `max_key + 1` stored strings
    self.storage.truncate(max_key + 1);
}
/// Empties this dictionary, dropping both the dedup entries and the stored strings
pub fn clear(&mut self) {
    self.dedup.clear();
    self.storage.clear();
}
}
fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 {
@ -255,4 +268,30 @@ mod test {
let err = TryInto::<StringDictionary<_>>::try_into(data).expect_err("expected failure");
assert!(matches!(err, Error::DuplicateKeyFound { key } if &key == "cupcakes"))
}
#[test]
fn test_truncate() {
    let mut dict = StringDictionary::<i32>::new();

    // Insert three distinct values with repeats interleaved, remembering the
    // ids handed out for "bingo" and "bongo"
    dict.lookup_value_or_insert("cupcake");
    dict.lookup_value_or_insert("cupcake");
    dict.lookup_value_or_insert("bingo");
    let bingo_id = dict.lookup_value_or_insert("bingo");
    let bongo_id = dict.lookup_value_or_insert("bongo");
    dict.lookup_value_or_insert("bingo");
    dict.lookup_value_or_insert("cupcake");

    // Cut the dictionary back to "bingo": only "bongo" should disappear
    dict.truncate(bingo_id);

    assert_eq!(dict.values().len(), 2);
    assert_eq!(dict.dedup.len(), 2);

    assert_eq!(dict.lookup_value("cupcake"), Some(0));
    assert_eq!(dict.lookup_value("bingo"), Some(1));

    assert!(dict.lookup_value("bongo").is_none());
    assert!(dict.lookup_id(bongo_id).is_none());

    // The dictionary must remain usable: re-inserting "bongo" reuses id 2
    dict.lookup_value_or_insert("bongo");
    assert_eq!(dict.lookup_value("bongo"), Some(2));
}
}

View File

@ -76,6 +76,19 @@ impl<K: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<K> {
self.offsets.resize(self.offsets.len() + len, offset);
}
/// Truncates the array to the given length
///
/// Keeps the first `len` strings and drops the rest. A `len` that is not
/// smaller than the number of stored strings removes nothing.
pub fn truncate(&mut self, len: usize) {
    // One offset more than the number of strings is retained. Saturate so
    // that `len == usize::MAX` is a no-op instead of overflowing (which
    // would panic in debug builds and empty `offsets` in release builds).
    self.offsets.truncate(len.saturating_add(1));

    // The last remaining offset is the byte length of the retained strings
    let last_idx = self.offsets.last().expect("offsets empty");
    self.storage.truncate(last_idx.as_());
}
/// Empties this array, discarding every stored string
pub fn clear(&mut self) {
    self.storage.clear();
    // Only the first offset entry is kept
    self.offsets.truncate(1);
}
pub fn iter(&self) -> PackedStringIterator<'_, K> {
PackedStringIterator {
array: self,
@ -170,4 +183,22 @@ mod tests {
assert_eq!(array.get(9).unwrap(), "");
assert_eq!(array.get(3).unwrap(), "");
}
#[test]
fn test_truncate() {
    let mut packed = PackedStringArray::<i32>::new();
    packed.append("hello");
    packed.append("world");
    packed.append("cupcake");

    // Keep only the first string
    packed.truncate(1);
    assert_eq!(packed.len(), 1);
    assert_eq!(packed.get(0).unwrap(), "hello");

    // The array must still accept appends after being truncated
    packed.append("world");
    assert_eq!(packed.len(), 2);
    assert_eq!(packed.get(0).unwrap(), "hello");
    assert_eq!(packed.get(1).unwrap(), "world");
}
}