319 lines
8.9 KiB
Rust
319 lines
8.9 KiB
Rust
use arrow::array::ArrayDataBuilder;
|
|
use arrow::array::StringArray;
|
|
use arrow::buffer::Buffer;
|
|
use num_traits::{AsPrimitive, FromPrimitive, Zero};
|
|
use std::fmt::Debug;
|
|
use std::ops::Range;
|
|
|
|
/// A packed string array that stores start and end indexes into
|
|
/// a contiguous string slice.
|
|
///
|
|
/// The type parameter K alters the type used to store the offsets
|
|
#[derive(Debug, Clone)]
|
|
pub struct PackedStringArray<K> {
|
|
/// The start and end offsets of strings stored in storage
|
|
offsets: Vec<K>,
|
|
/// A contiguous array of string data
|
|
storage: String,
|
|
}
|
|
|
|
impl<K: Zero> Default for PackedStringArray<K> {
|
|
fn default() -> Self {
|
|
Self {
|
|
offsets: vec![K::zero()],
|
|
storage: String::new(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<K: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<K> {
|
|
pub fn new() -> Self {
|
|
Self::default()
|
|
}
|
|
|
|
pub fn new_empty(len: usize) -> Self {
|
|
Self {
|
|
offsets: vec![K::zero(); len + 1],
|
|
storage: String::new(),
|
|
}
|
|
}
|
|
|
|
pub fn with_capacity(keys: usize, values: usize) -> Self {
|
|
let mut offsets = Vec::with_capacity(keys + 1);
|
|
offsets.push(K::zero());
|
|
|
|
Self {
|
|
offsets,
|
|
storage: String::with_capacity(values),
|
|
}
|
|
}
|
|
|
|
/// Append a value
|
|
///
|
|
/// Returns the index of the appended data
|
|
pub fn append(&mut self, data: &str) -> usize {
|
|
let id = self.offsets.len() - 1;
|
|
|
|
let offset = self.storage.len() + data.len();
|
|
let offset = K::from_usize(offset).expect("failed to fit into offset type");
|
|
|
|
self.offsets.push(offset);
|
|
self.storage.push_str(data);
|
|
|
|
id
|
|
}
|
|
|
|
/// Extends this [`PackedStringArray`] by the contents of `other`
|
|
pub fn extend_from(&mut self, other: &PackedStringArray<K>) {
|
|
let offset = self.storage.len();
|
|
self.storage.push_str(other.storage.as_str());
|
|
// Copy offsets skipping the first element as this string start delimiter is already
|
|
// provided by the end delimiter of the current offsets array
|
|
self.offsets.extend(
|
|
other
|
|
.offsets
|
|
.iter()
|
|
.skip(1)
|
|
.map(|x| K::from_usize(x.as_() + offset).expect("failed to fit into offset type")),
|
|
)
|
|
}
|
|
|
|
/// Extends this [`PackedStringArray`] by `range` elements from `other`
|
|
pub fn extend_from_range(&mut self, other: &PackedStringArray<K>, range: Range<usize>) {
|
|
let first_offset: usize = other.offsets[range.start].as_();
|
|
let end_offset: usize = other.offsets[range.end].as_();
|
|
|
|
let insert_offset = self.storage.len();
|
|
|
|
self.storage
|
|
.push_str(&other.storage[first_offset..end_offset]);
|
|
|
|
self.offsets.extend(
|
|
other.offsets[(range.start + 1)..(range.end + 1)]
|
|
.iter()
|
|
.map(|x| {
|
|
K::from_usize(x.as_() - first_offset + insert_offset)
|
|
.expect("failed to fit into offset type")
|
|
}),
|
|
)
|
|
}
|
|
|
|
/// Get the value at a given index
|
|
pub fn get(&self, index: usize) -> Option<&str> {
|
|
let start_offset = self.offsets.get(index)?.as_();
|
|
let end_offset = self.offsets.get(index + 1)?.as_();
|
|
|
|
Some(&self.storage[start_offset..end_offset])
|
|
}
|
|
|
|
/// Pads with empty strings to reach length
|
|
pub fn extend(&mut self, len: usize) {
|
|
let offset = K::from_usize(self.storage.len()).expect("failed to fit into offset type");
|
|
self.offsets.resize(self.offsets.len() + len, offset);
|
|
}
|
|
|
|
/// Truncates the array to the given length
|
|
pub fn truncate(&mut self, len: usize) {
|
|
self.offsets.truncate(len + 1);
|
|
let last_idx = self.offsets.last().expect("offsets empty");
|
|
self.storage.truncate(last_idx.as_());
|
|
}
|
|
|
|
/// Removes all elements from this array
|
|
pub fn clear(&mut self) {
|
|
self.offsets.truncate(1);
|
|
self.storage.clear();
|
|
}
|
|
|
|
pub fn iter(&self) -> PackedStringIterator<'_, K> {
|
|
PackedStringIterator {
|
|
array: self,
|
|
index: 0,
|
|
}
|
|
}
|
|
|
|
/// The number of strings in this array
|
|
pub fn len(&self) -> usize {
|
|
self.offsets.len() - 1
|
|
}
|
|
|
|
pub fn is_empty(&self) -> bool {
|
|
self.offsets.len() == 1
|
|
}
|
|
|
|
/// Return the amount of memory in bytes taken up by this array
|
|
pub fn size(&self) -> usize {
|
|
self.storage.capacity() + self.offsets.capacity() * std::mem::size_of::<K>()
|
|
}
|
|
|
|
pub fn inner(&self) -> (&[K], &str) {
|
|
(&self.offsets, &self.storage)
|
|
}
|
|
|
|
pub fn into_inner(self) -> (Vec<K>, String) {
|
|
(self.offsets, self.storage)
|
|
}
|
|
}
|
|
|
|
impl PackedStringArray<i32> {
|
|
/// Convert to an arrow with an optional null bitmask
|
|
pub fn to_arrow(&self, nulls: Option<Buffer>) -> StringArray {
|
|
let len = self.offsets.len() - 1;
|
|
let offsets = Buffer::from_slice_ref(&self.offsets);
|
|
let values = Buffer::from(self.storage.as_bytes());
|
|
|
|
let data = ArrayDataBuilder::new(arrow::datatypes::DataType::Utf8)
|
|
.len(len)
|
|
.add_buffer(offsets)
|
|
.add_buffer(values)
|
|
.null_bit_buffer(nulls)
|
|
.build()
|
|
// TODO consider skipping the validation checks by using
|
|
// `new_unchecked`
|
|
.expect("Valid array data");
|
|
StringArray::from(data)
|
|
}
|
|
}
|
|
|
|
#[derive(Debug)]
|
|
pub struct PackedStringIterator<'a, K> {
|
|
array: &'a PackedStringArray<K>,
|
|
index: usize,
|
|
}
|
|
|
|
impl<'a, K: AsPrimitive<usize> + FromPrimitive + Zero> Iterator for PackedStringIterator<'a, K> {
|
|
type Item = &'a str;
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
let item = self.array.get(self.index)?;
|
|
self.index += 1;
|
|
Some(item)
|
|
}
|
|
|
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
|
let len = self.array.len() - self.index;
|
|
(len, Some(len))
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use crate::string::PackedStringArray;
|
|
|
|
#[test]
|
|
fn test_storage() {
|
|
let mut array = PackedStringArray::<i32>::new();
|
|
|
|
array.append("hello");
|
|
array.append("world");
|
|
array.append("cupcake");
|
|
|
|
assert_eq!(array.get(0).unwrap(), "hello");
|
|
assert_eq!(array.get(1).unwrap(), "world");
|
|
assert_eq!(array.get(2).unwrap(), "cupcake");
|
|
assert!(array.get(-1_i32 as usize).is_none());
|
|
|
|
assert!(array.get(3).is_none());
|
|
|
|
array.extend(2);
|
|
assert_eq!(array.get(3).unwrap(), "");
|
|
assert_eq!(array.get(4).unwrap(), "");
|
|
assert!(array.get(5).is_none());
|
|
}
|
|
|
|
#[test]
|
|
fn test_empty() {
|
|
let array = PackedStringArray::<u8>::new_empty(20);
|
|
assert_eq!(array.get(12).unwrap(), "");
|
|
assert_eq!(array.get(9).unwrap(), "");
|
|
assert_eq!(array.get(3).unwrap(), "");
|
|
}
|
|
|
|
#[test]
|
|
fn test_truncate() {
|
|
let mut array = PackedStringArray::<i32>::new();
|
|
|
|
array.append("hello");
|
|
array.append("world");
|
|
array.append("cupcake");
|
|
|
|
array.truncate(1);
|
|
assert_eq!(array.len(), 1);
|
|
assert_eq!(array.get(0).unwrap(), "hello");
|
|
|
|
array.append("world");
|
|
assert_eq!(array.len(), 2);
|
|
assert_eq!(array.get(0).unwrap(), "hello");
|
|
assert_eq!(array.get(1).unwrap(), "world");
|
|
}
|
|
|
|
#[test]
|
|
fn test_extend_from() {
|
|
let mut a = PackedStringArray::<i32>::new();
|
|
|
|
a.append("hello");
|
|
a.append("world");
|
|
a.append("cupcake");
|
|
a.append("");
|
|
|
|
let mut b = PackedStringArray::<i32>::new();
|
|
|
|
b.append("foo");
|
|
b.append("bar");
|
|
|
|
a.extend_from(&b);
|
|
|
|
let a_content: Vec<_> = a.iter().collect();
|
|
assert_eq!(
|
|
a_content,
|
|
vec!["hello", "world", "cupcake", "", "foo", "bar"]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_extend_from_range() {
|
|
let mut a = PackedStringArray::<i32>::new();
|
|
|
|
a.append("hello");
|
|
a.append("world");
|
|
a.append("cupcake");
|
|
a.append("");
|
|
|
|
let mut b = PackedStringArray::<i32>::new();
|
|
|
|
b.append("foo");
|
|
b.append("bar");
|
|
b.append("");
|
|
b.append("fiz");
|
|
|
|
a.extend_from_range(&b, 1..3);
|
|
|
|
assert_eq!(a.len(), 6);
|
|
|
|
let a_content: Vec<_> = a.iter().collect();
|
|
assert_eq!(a_content, vec!["hello", "world", "cupcake", "", "bar", ""]);
|
|
|
|
// Should be a no-op
|
|
a.extend_from_range(&b, 0..0);
|
|
|
|
let a_content: Vec<_> = a.iter().collect();
|
|
assert_eq!(a_content, vec!["hello", "world", "cupcake", "", "bar", ""]);
|
|
|
|
a.extend_from_range(&b, 0..1);
|
|
|
|
let a_content: Vec<_> = a.iter().collect();
|
|
assert_eq!(
|
|
a_content,
|
|
vec!["hello", "world", "cupcake", "", "bar", "", "foo"]
|
|
);
|
|
|
|
a.extend_from_range(&b, 1..4);
|
|
|
|
let a_content: Vec<_> = a.iter().collect();
|
|
assert_eq!(
|
|
a_content,
|
|
vec!["hello", "world", "cupcake", "", "bar", "", "foo", "bar", "", "fiz"]
|
|
);
|
|
}
|
|
}
|