feat: construct StringDictionary from PackedStringArray (#1475)
* feat: construct StringDictionary from PackedStringArray
* chore: fix formatting
Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>
pull/24376/head
parent
67984fbff5
commit
b02105e47b
|
@ -165,6 +165,7 @@ dependencies = [
|
|||
"hashbrown 0.11.2",
|
||||
"num-traits",
|
||||
"rand 0.8.3",
|
||||
"snafu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -11,6 +11,7 @@ arrow = { path = "../arrow" }
|
|||
ahash = "0.7.2"
|
||||
num-traits = "0.2"
|
||||
futures = "0.3"
|
||||
snafu = "0.6"
|
||||
hashbrown = "0.11"
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
@ -4,6 +4,14 @@ use hashbrown::HashMap;
|
|||
|
||||
use crate::string::PackedStringArray;
|
||||
use num_traits::{AsPrimitive, FromPrimitive, Zero};
|
||||
use snafu::Snafu;
|
||||
use std::convert::TryFrom;
|
||||
|
||||
/// Errors that can occur when building a `StringDictionary` from existing data.
#[derive(Debug, Snafu)]
|
||||
pub enum Error {
|
||||
/// The source data contained the same string more than once; `key` holds
/// the offending string.
#[snafu(display("duplicate key found {}", key))]
|
||||
DuplicateKeyFound { key: String },
|
||||
}
|
||||
|
||||
/// A String dictionary that builds on top of `PackedStringArray` adding O(1)
|
||||
/// index lookups for a given string
|
||||
|
@ -88,6 +96,10 @@ impl<K: AsPrimitive<usize> + FromPrimitive + Zero> StringDictionary<K> {
|
|||
pub fn values(&self) -> &PackedStringArray<K> {
|
||||
&self.storage
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> PackedStringArray<K> {
|
||||
self.storage
|
||||
}
|
||||
}
|
||||
|
||||
fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 {
|
||||
|
@ -97,9 +109,54 @@ fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 {
|
|||
state.finish()
|
||||
}
|
||||
|
||||
impl<K> TryFrom<PackedStringArray<K>> for StringDictionary<K>
|
||||
where
|
||||
K: AsPrimitive<usize> + FromPrimitive + Zero,
|
||||
{
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(storage: PackedStringArray<K>) -> Result<Self, Error> {
|
||||
use hashbrown::hash_map::RawEntryMut;
|
||||
|
||||
let hasher = ahash::RandomState::new();
|
||||
let mut dedup: HashMap<K, (), ()> = HashMap::with_capacity_and_hasher(storage.len(), ());
|
||||
for (idx, value) in storage.iter().enumerate() {
|
||||
let hash = hash_str(&hasher, value);
|
||||
|
||||
let entry = dedup
|
||||
.raw_entry_mut()
|
||||
.from_hash(hash, |key| value == storage.get(key.as_()).unwrap());
|
||||
|
||||
match entry {
|
||||
RawEntryMut::Occupied(_) => {
|
||||
return Err(Error::DuplicateKeyFound {
|
||||
key: value.to_string(),
|
||||
})
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
let key =
|
||||
K::from_usize(idx).expect("failed to fit string index into dictionary key");
|
||||
|
||||
entry.insert_with_hasher(hash, key, (), |key| {
|
||||
let string = storage.get(key.as_()).unwrap();
|
||||
hash_str(&hasher, string)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
hash: hasher,
|
||||
dedup,
|
||||
storage,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use std::convert::TryInto;
|
||||
|
||||
#[test]
|
||||
fn test_dictionary() {
|
||||
|
@ -130,4 +187,34 @@ mod test {
|
|||
assert!(dictionary.lookup_id(-1).is_none());
|
||||
assert_eq!(arrow_expected, arrow_actual);
|
||||
}
|
||||
|
||||
#[test]
fn from_string_array() {
    // Converting an array of distinct strings keeps each string reachable
    // under its original position, in both lookup directions.
    let words = ["cupcakes", "foo", "bingo"];

    let mut data = PackedStringArray::<u64>::new();
    for word in words.iter().copied() {
        data.append(word);
    }

    let dictionary: StringDictionary<_> = data.try_into().unwrap();

    for (word, id) in words.iter().copied().zip(0u64..) {
        assert_eq!(dictionary.lookup_value(word), Some(id));
    }

    for (word, id) in words.iter().copied().zip(0u64..) {
        assert_eq!(dictionary.lookup_id(id), Some(word));
    }
}
|
||||
|
||||
#[test]
fn from_string_array_duplicates() {
    // A repeated string must make the conversion fail and be reported back
    // in the error.
    let mut data = PackedStringArray::<u64>::new();
    for word in ["cupcakes", "foo", "bingo", "cupcakes"].iter().copied() {
        data.append(word);
    }

    let converted: Result<StringDictionary<_>, _> = data.try_into();
    let err = converted.expect_err("expected failure");

    assert!(matches!(err, Error::DuplicateKeyFound { key } if key == "cupcakes"))
}
|
||||
}
|
||||
|
|
|
@ -7,25 +7,25 @@ use std::fmt::Debug;
|
|||
/// A packed string array that stores start and end indexes into
|
||||
/// a contiguous string slice.
|
||||
///
|
||||
/// The type parameter O alters the type used to store the offsets
|
||||
/// The type parameter K alters the type used to store the offsets
|
||||
#[derive(Debug)]
|
||||
pub struct PackedStringArray<O> {
|
||||
pub struct PackedStringArray<K> {
|
||||
/// The start and end offsets of strings stored in storage
|
||||
offsets: Vec<O>,
|
||||
offsets: Vec<K>,
|
||||
/// A contiguous array of string data
|
||||
storage: String,
|
||||
}
|
||||
|
||||
impl<O: Zero> Default for PackedStringArray<O> {
|
||||
impl<K: Zero> Default for PackedStringArray<K> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
offsets: vec![O::zero()],
|
||||
offsets: vec![K::zero()],
|
||||
storage: String::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<O: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<O> {
|
||||
impl<K: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<K> {
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
@ -37,7 +37,7 @@ impl<O: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<O> {
|
|||
let id = self.offsets.len() - 1;
|
||||
|
||||
let offset = self.storage.len() + data.len();
|
||||
let offset = O::from_usize(offset).expect("failed to fit into offset type");
|
||||
let offset = K::from_usize(offset).expect("failed to fit into offset type");
|
||||
|
||||
self.offsets.push(offset);
|
||||
self.storage.push_str(data);
|
||||
|
@ -53,9 +53,24 @@ impl<O: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<O> {
|
|||
Some(&self.storage[start_offset..end_offset])
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> PackedStringIterator<'_, K> {
|
||||
PackedStringIterator {
|
||||
array: &self,
|
||||
index: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.offsets.len() - 1
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.offsets.len() == 1
|
||||
}
|
||||
|
||||
/// Return the amount of memory in bytes taken up by this array
|
||||
pub fn size(&self) -> usize {
|
||||
self.storage.len() + self.offsets.len() * std::mem::size_of::<O>()
|
||||
self.storage.len() + self.offsets.len() * std::mem::size_of::<K>()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -76,6 +91,26 @@ impl PackedStringArray<i32> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Iterator over the strings of a borrowed `PackedStringArray`, yielding
/// them as `&str` in index order.
pub struct PackedStringIterator<'a, K> {
|
||||
/// The array being iterated.
array: &'a PackedStringArray<K>,
|
||||
/// Index of the next string to yield.
index: usize,
|
||||
}
|
||||
|
||||
impl<'a, K: AsPrimitive<usize> + FromPrimitive + Zero> Iterator for PackedStringIterator<'a, K> {
    type Item = &'a str;

    /// Yields the string at the current index and advances, or returns
    /// `None` (without advancing) once the array has no string there.
    fn next(&mut self) -> Option<Self::Item> {
        let current = self.index;
        let value = self.array.get(current)?;
        self.index = current + 1;
        Some(value)
    }

    /// Reports the count of strings not yet yielded as both the lower and
    /// the upper bound.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.array.len() - self.index;
        (remaining, Some(remaining))
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::string::PackedStringArray;
|
||||
|
|
Loading…
Reference in New Issue