feat: construct StringDictionary from PackedStringArray (#1475)

* feat: construct StringDictionary from PackedStringArray

* chore: fix formatting

Co-authored-by: kodiakhq[bot] <49736102+kodiakhq[bot]@users.noreply.github.com>
pull/24376/head
Raphael Taylor-Davies 2021-05-11 21:16:25 +01:00 committed by GitHub
parent 67984fbff5
commit b02105e47b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 132 additions and 8 deletions

1
Cargo.lock generated
View File

@ -165,6 +165,7 @@ dependencies = [
"hashbrown 0.11.2",
"num-traits",
"rand 0.8.3",
"snafu",
]
[[package]]

View File

@ -11,6 +11,7 @@ arrow = { path = "../arrow" }
ahash = "0.7.2"
num-traits = "0.2"
futures = "0.3"
snafu = "0.6"
hashbrown = "0.11"
[dev-dependencies]

View File

@ -4,6 +4,14 @@ use hashbrown::HashMap;
use crate::string::PackedStringArray;
use num_traits::{AsPrimitive, FromPrimitive, Zero};
use snafu::Snafu;
use std::convert::TryFrom;
/// Errors returned when converting a `PackedStringArray` into a
/// `StringDictionary`.
#[derive(Debug, Snafu)]
pub enum Error {
/// The source array contained the same string more than once; `key` is an
/// owned copy of the repeated string.
#[snafu(display("duplicate key found {}", key))]
DuplicateKeyFound { key: String },
}
/// A String dictionary that builds on top of `PackedStringArray` adding O(1)
/// index lookups for a given string
@ -88,6 +96,10 @@ impl<K: AsPrimitive<usize> + FromPrimitive + Zero> StringDictionary<K> {
/// Returns a shared reference to the `PackedStringArray` held in this
/// dictionary's `storage` field.
pub fn values(&self) -> &PackedStringArray<K> {
&self.storage
}
/// Consumes the dictionary and returns the `PackedStringArray` it was
/// storing its strings in; the remaining fields are dropped.
pub fn into_inner(self) -> PackedStringArray<K> {
self.storage
}
}
fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 {
@ -97,9 +109,54 @@ fn hash_str(hasher: &ahash::RandomState, value: &str) -> u64 {
state.finish()
}
/// Builds a `StringDictionary` on top of an already populated
/// `PackedStringArray`, reusing the array as the dictionary's storage.
///
/// Each string's position in the array becomes its dictionary key.
///
/// # Errors
///
/// Returns `Error::DuplicateKeyFound` for the first string that repeats an
/// earlier entry of the array.
///
/// # Panics
///
/// Panics if a string's position cannot be represented by the key type `K`.
impl<K> TryFrom<PackedStringArray<K>> for StringDictionary<K>
where
    K: AsPrimitive<usize> + FromPrimitive + Zero,
{
    type Error = Error;

    fn try_from(storage: PackedStringArray<K>) -> Result<Self, Self::Error> {
        use hashbrown::hash_map::RawEntryMut;

        let hash_state = ahash::RandomState::new();
        let mut dedup: HashMap<K, (), ()> = HashMap::with_capacity_and_hasher(storage.len(), ());

        for (position, value) in storage.iter().enumerate() {
            let digest = hash_str(&hash_state, value);

            // The map only stores keys, so a candidate is compared by
            // resolving its key back to the string held in `storage`.
            let lookup = dedup
                .raw_entry_mut()
                .from_hash(digest, |candidate| storage.get(candidate.as_()).unwrap() == value);

            let slot = match lookup {
                RawEntryMut::Vacant(slot) => slot,
                RawEntryMut::Occupied(_) => {
                    return Err(Error::DuplicateKeyFound {
                        key: value.to_string(),
                    });
                }
            };

            let id =
                K::from_usize(position).expect("failed to fit string index into dictionary key");

            // The closure re-hashes existing keys if the table has to grow.
            slot.insert_with_hasher(digest, id, (), |existing| {
                hash_str(&hash_state, storage.get(existing.as_()).unwrap())
            });
        }

        Ok(Self {
            hash: hash_state,
            dedup,
            storage,
        })
    }
}
#[cfg(test)]
mod test {
use super::*;
use std::convert::TryInto;
#[test]
fn test_dictionary() {
@ -130,4 +187,34 @@ mod test {
assert!(dictionary.lookup_id(-1).is_none());
assert_eq!(arrow_expected, arrow_actual);
}
/// A dictionary built from an array of distinct strings resolves every
/// string to its position in the array and every position back to its string.
#[test]
fn from_string_array() {
    let expected = ["cupcakes", "foo", "bingo"];

    let mut data = PackedStringArray::<u64>::new();
    for value in expected.iter().copied() {
        data.append(value);
    }

    let dictionary: StringDictionary<_> = data.try_into().unwrap();

    for (id, value) in (0_u64..).zip(expected.iter().copied()) {
        assert_eq!(dictionary.lookup_value(value), Some(id));
    }

    for (id, value) in (0_u64..).zip(expected.iter().copied()) {
        assert_eq!(dictionary.lookup_id(id), Some(value));
    }
}
/// Converting an array that contains a repeated string fails and reports
/// the repeated string in the error.
#[test]
fn from_string_array_duplicates() {
    let mut data = PackedStringArray::<u64>::new();
    for value in ["cupcakes", "foo", "bingo", "cupcakes"].iter().copied() {
        data.append(value);
    }

    let result: Result<StringDictionary<u64>, Error> = data.try_into();
    match result.expect_err("expected failure") {
        Error::DuplicateKeyFound { key } => assert!(&key == "cupcakes"),
    }
}
}

View File

@ -7,25 +7,25 @@ use std::fmt::Debug;
/// A packed string array that stores start and end indexes into
/// a contiguous string slice.
///
/// The type parameter O alters the type used to store the offsets
/// The type parameter K alters the type used to store the offsets
#[derive(Debug)]
pub struct PackedStringArray<O> {
pub struct PackedStringArray<K> {
/// The start and end offsets of strings stored in storage
offsets: Vec<O>,
offsets: Vec<K>,
/// A contiguous array of string data
storage: String,
}
impl<O: Zero> Default for PackedStringArray<O> {
impl<K: Zero> Default for PackedStringArray<K> {
fn default() -> Self {
Self {
offsets: vec![O::zero()],
offsets: vec![K::zero()],
storage: String::new(),
}
}
}
impl<O: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<O> {
impl<K: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<K> {
/// Creates a new array by delegating to the `Default` implementation.
pub fn new() -> Self {
Self::default()
}
@ -37,7 +37,7 @@ impl<O: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<O> {
let id = self.offsets.len() - 1;
let offset = self.storage.len() + data.len();
let offset = O::from_usize(offset).expect("failed to fit into offset type");
let offset = K::from_usize(offset).expect("failed to fit into offset type");
self.offsets.push(offset);
self.storage.push_str(data);
@ -53,9 +53,24 @@ impl<O: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<O> {
Some(&self.storage[start_offset..end_offset])
}
/// Returns an iterator over the strings in this array, starting at index 0.
///
/// The iterator borrows the array and yields `&str` slices of it.
pub fn iter(&self) -> PackedStringIterator<'_, K> {
    PackedStringIterator {
        // `self` is already a reference; borrowing it again would create a
        // `&&Self` that only type-checks through auto-deref.
        array: self,
        index: 0,
    }
}
/// Returns the length of the array, computed as one less than the number
/// of stored offsets.
pub fn len(&self) -> usize {
self.offsets.len() - 1
}
/// Returns `true` when exactly one offset is stored, which is the state
/// in which `len` reports zero.
pub fn is_empty(&self) -> bool {
self.offsets.len() == 1
}
/// Return the amount of memory in bytes taken up by this array
pub fn size(&self) -> usize {
self.storage.len() + self.offsets.len() * std::mem::size_of::<O>()
self.storage.len() + self.offsets.len() * std::mem::size_of::<K>()
}
}
@ -76,6 +91,26 @@ impl PackedStringArray<i32> {
}
}
/// An iterator over the strings of a `PackedStringArray`.
///
/// Holds a borrow of the array together with the index of the next string
/// to yield.
#[derive(Debug)]
pub struct PackedStringIterator<'a, K> {
    /// The array being iterated
    array: &'a PackedStringArray<K>,
    /// The index of the next string to return
    index: usize,
}
/// Yields the array's strings in index order by calling
/// `PackedStringArray::get` with an advancing index.
impl<'a, K> Iterator for PackedStringIterator<'a, K>
where
    K: AsPrimitive<usize> + FromPrimitive + Zero,
{
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        let array = self.array;
        let value = array.get(self.index);
        // Only advance after a successful lookup, so the index is left
        // unchanged once `get` returns `None`.
        if value.is_some() {
            self.index += 1;
        }
        value
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Lower and upper bound are both the array length minus the index.
        let remaining = self.array.len() - self.index;
        (remaining, Some(remaining))
    }
}
#[cfg(test)]
mod tests {
use crate::string::PackedStringArray;