diff --git a/Cargo.lock b/Cargo.lock index f6511d99a8..c863aed6d6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -165,6 +165,7 @@ dependencies = [ "hashbrown 0.11.2", "num-traits", "rand 0.8.3", + "snafu", ] [[package]] diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml index c7c54152e5..e00f07f9ef 100644 --- a/arrow_util/Cargo.toml +++ b/arrow_util/Cargo.toml @@ -11,6 +11,7 @@ arrow = { path = "../arrow" } ahash = "0.7.2" num-traits = "0.2" futures = "0.3" +snafu = "0.6" hashbrown = "0.11" [dev-dependencies] diff --git a/arrow_util/src/dictionary.rs b/arrow_util/src/dictionary.rs index 257c7a497f..768761f768 100644 --- a/arrow_util/src/dictionary.rs +++ b/arrow_util/src/dictionary.rs @@ -4,6 +4,14 @@ use hashbrown::HashMap; use crate::string::PackedStringArray; use num_traits::{AsPrimitive, FromPrimitive, Zero}; +use snafu::Snafu; +use std::convert::TryFrom; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("duplicate key found {}", key))] + DuplicateKeyFound { key: String }, +} /// A String dictionary that builds on top of `PackedStringArray` adding O(1) /// index lookups for a given string @@ -88,6 +96,10 @@ impl + FromPrimitive + Zero> StringDictionary { pub fn values(&self) -> &PackedStringArray { &self.storage } + + pub fn into_inner(self) -> PackedStringArray { + self.storage + } } fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 { @@ -97,9 +109,54 @@ fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 { state.finish() } +impl TryFrom> for StringDictionary +where + K: AsPrimitive + FromPrimitive + Zero, +{ + type Error = Error; + + fn try_from(storage: PackedStringArray) -> Result { + use hashbrown::hash_map::RawEntryMut; + + let hasher = ahash::RandomState::new(); + let mut dedup: HashMap = HashMap::with_capacity_and_hasher(storage.len(), ()); + for (idx, value) in storage.iter().enumerate() { + let hash = hash_str(&hasher, value); + + let entry = dedup + .raw_entry_mut() + .from_hash(hash, |key| value == storage.get(key.as_()).unwrap()); + + match entry { + RawEntryMut::Occupied(_) => { + return Err(Error::DuplicateKeyFound { + key: value.to_string(), + }) + } + RawEntryMut::Vacant(entry) => { + let key = + K::from_usize(idx).expect("failed to fit string index into dictionary key"); + + entry.insert_with_hasher(hash, key, (), |key| { + let string = storage.get(key.as_()).unwrap(); + hash_str(&hasher, string) + }); + } + } + } + + Ok(Self { + hash: hasher, + dedup, + storage, + }) + } +} + #[cfg(test)] mod test { use super::*; + use std::convert::TryInto; #[test] fn test_dictionary() { @@ -130,4 +187,34 @@ mod test { assert!(dictionary.lookup_id(-1).is_none()); assert_eq!(arrow_expected, arrow_actual); } + + #[test] + fn from_string_array() { + let mut data = PackedStringArray::::new(); + data.append("cupcakes"); + data.append("foo"); + data.append("bingo"); + + let dictionary: StringDictionary<_> = data.try_into().unwrap(); + + assert_eq!(dictionary.lookup_value("cupcakes"), Some(0)); + assert_eq!(dictionary.lookup_value("foo"), Some(1)); + assert_eq!(dictionary.lookup_value("bingo"), Some(2)); + + assert_eq!(dictionary.lookup_id(0), Some("cupcakes")); + assert_eq!(dictionary.lookup_id(1), Some("foo")); + assert_eq!(dictionary.lookup_id(2), Some("bingo")); + } + + #[test] + fn from_string_array_duplicates() { + let mut data = PackedStringArray::::new(); + data.append("cupcakes"); + data.append("foo"); + data.append("bingo"); + data.append("cupcakes"); + + let err = TryInto::>::try_into(data).expect_err("expected failure"); + assert!(matches!(err, Error::DuplicateKeyFound { key } if &key == "cupcakes")) + } } diff --git a/arrow_util/src/string.rs b/arrow_util/src/string.rs index 1370466638..3ec339399e 100644 --- a/arrow_util/src/string.rs +++ b/arrow_util/src/string.rs @@ -7,25 +7,25 @@ use std::fmt::Debug; /// A packed string array that stores start and end indexes into /// a contiguous string slice. /// -/// The type parameter O alters the type used to store the offsets +/// The type parameter K alters the type used to store the offsets #[derive(Debug)] -pub struct PackedStringArray { +pub struct PackedStringArray { /// The start and end offsets of strings stored in storage - offsets: Vec, + offsets: Vec, /// A contiguous array of string data storage: String, } -impl Default for PackedStringArray { +impl Default for PackedStringArray { fn default() -> Self { Self { - offsets: vec![O::zero()], + offsets: vec![K::zero()], storage: String::new(), } } } -impl + FromPrimitive + Zero> PackedStringArray { +impl + FromPrimitive + Zero> PackedStringArray { pub fn new() -> Self { Self::default() } @@ -37,7 +37,7 @@ impl + FromPrimitive + Zero> PackedStringArray { let id = self.offsets.len() - 1; let offset = self.storage.len() + data.len(); - let offset = O::from_usize(offset).expect("failed to fit into offset type"); + let offset = K::from_usize(offset).expect("failed to fit into offset type"); self.offsets.push(offset); self.storage.push_str(data); @@ -53,9 +53,24 @@ impl + FromPrimitive + Zero> PackedStringArray { Some(&self.storage[start_offset..end_offset]) } + pub fn iter(&self) -> PackedStringIterator<'_, K> { + PackedStringIterator { + array: &self, + index: 0, + } + } + + pub fn len(&self) -> usize { + self.offsets.len() - 1 + } + + pub fn is_empty(&self) -> bool { + self.offsets.len() == 1 + } + /// Return the amount of memory in bytes taken up by this array pub fn size(&self) -> usize { - self.storage.len() + self.offsets.len() * std::mem::size_of::() + self.storage.len() + self.offsets.len() * std::mem::size_of::() } } @@ -76,6 +91,26 @@ impl PackedStringArray { } } +pub struct PackedStringIterator<'a, K> { + array: &'a PackedStringArray, + index: usize, +} + +impl<'a, K: AsPrimitive + FromPrimitive + Zero> Iterator for PackedStringIterator<'a, K> { + type Item = &'a str; + + fn next(&mut self) -> Option { + let item = self.array.get(self.index)?; + self.index += 1; + Some(item) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.array.len() - self.index; + (len, Some(len)) + } +} + #[cfg(test)] mod tests { use crate::string::PackedStringArray;