refactor: spike on arrow encoding
parent
9f299461ed
commit
47b2f7940b
|
@ -819,6 +819,7 @@ dependencies = [
|
|||
"datafusion 2.0.0-SNAPSHOT (git+https://github.com/alamb/arrow.git?rev=46f18c2602072e083809e0846b810e0cc3c59fdd)",
|
||||
"delorean_table",
|
||||
"env_logger",
|
||||
"heapsize",
|
||||
"human_format",
|
||||
"log",
|
||||
"parquet 2.0.0-SNAPSHOT (git+https://github.com/alamb/arrow.git?rev=46f18c2602072e083809e0846b810e0cc3c59fdd)",
|
||||
|
@ -1356,6 +1357,15 @@ dependencies = [
|
|||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapsize"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1679e6ea370dee694f91f1dc469bf94cf8f52051d147aec3e1f9497c6fc22461"
|
||||
dependencies = [
|
||||
"winapi 0.3.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.3.1"
|
||||
|
|
|
@ -12,9 +12,7 @@ delorean_table = { path = "../delorean_table" }
|
|||
arrow = { git = "https://github.com/alamb/arrow.git", rev="46f18c2602072e083809e0846b810e0cc3c59fdd", version = "2.0.0-SNAPSHOT" }
|
||||
parquet = { git = "https://github.com/alamb/arrow.git", rev="46f18c2602072e083809e0846b810e0cc3c59fdd", version = "2.0.0-SNAPSHOT" }
|
||||
datafusion = { git = "https://github.com/alamb/arrow.git", rev="46f18c2602072e083809e0846b810e0cc3c59fdd", version = "2.0.0-SNAPSHOT" }
|
||||
#arrow = { path = "/Users/alamb/Software/arrow/rust/arrow" }
|
||||
#parquet = { path = "/Users/alamb/Software/arrow/rust/parquet" }
|
||||
#datafusion = { path = "/Users/alamb/Software/arrow/rust/datafusion" }
|
||||
heapsize = "0.4.2"
|
||||
snafu = "0.6.8"
|
||||
croaring = "0.4.5"
|
||||
crossbeam = "0.7.3"
|
||||
|
|
|
@ -134,6 +134,7 @@ fn build_store(
|
|||
let mut segment = Segment::new(rb.num_rows(), schema);
|
||||
convert_record_batch(rb, &mut segment)?;
|
||||
|
||||
println!("{}", &segment);
|
||||
store.add_segment(segment);
|
||||
}
|
||||
Ok(None) => {
|
||||
|
|
|
@ -873,6 +873,23 @@ impl Column {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Column {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match &self {
|
||||
Column::String(c) => {
|
||||
write!(f, "{}", c)?;
|
||||
}
|
||||
Column::Float(c) => {
|
||||
write!(f, "{}", c)?;
|
||||
}
|
||||
Column::Integer(c) => {
|
||||
write!(f, "{}", c)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl AggregatableByRange for &Column {
|
||||
fn aggregate_by_id_range(
|
||||
&self,
|
||||
|
@ -964,6 +981,12 @@ impl String {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for String {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Meta: {}, Data: {}", self.meta, self.data)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Float {
|
||||
meta: metadata::F64,
|
||||
|
@ -1015,6 +1038,12 @@ impl Float {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Float {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Meta: {}, Data: {}", self.meta, self.data)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&[f64]> for Float {
|
||||
fn from(values: &[f64]) -> Self {
|
||||
let len = values.len();
|
||||
|
@ -1034,6 +1063,32 @@ impl From<&[f64]> for Float {
|
|||
}
|
||||
}
|
||||
|
||||
// use arrow::array::Array;
|
||||
// impl From<arrow::array::PrimitiveArray<arrow::datatypes::Float64Type>> for Float {
|
||||
// fn from(arr: arrow::array::PrimitiveArray<arrow::datatypes::Float64Type>) -> Self {
|
||||
// let len = arr.len();
|
||||
// let mut min = std::f64::MAX;
|
||||
// let mut max = std::f64::MIN;
|
||||
|
||||
// // calculate min/max for meta data
|
||||
// // TODO(edd): can use compute kernels for this.
|
||||
// for i in 0..arr.len() {
|
||||
// if arr.is_null(i) {
|
||||
// continue;
|
||||
// }
|
||||
|
||||
// let v = arr.value(i);
|
||||
// min = min.min(v);
|
||||
// max = max.max(v);
|
||||
// }
|
||||
|
||||
// Self {
|
||||
// meta: metadata::F64::new((min, max), len),
|
||||
// data: Box::new(encoding::PlainArrow { arr }),
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Integer {
|
||||
meta: metadata::I64,
|
||||
|
@ -1080,6 +1135,12 @@ impl Integer {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Integer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Meta: {}, Data: {}", self.meta, self.data)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&[i64]> for Integer {
|
||||
fn from(values: &[i64]) -> Self {
|
||||
let len = values.len();
|
||||
|
@ -1100,11 +1161,12 @@ impl From<&[i64]> for Integer {
|
|||
}
|
||||
|
||||
pub mod metadata {
|
||||
use std::mem::size_of;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Str {
|
||||
range: (Option<String>, Option<String>),
|
||||
num_rows: usize,
|
||||
// sparse_index: BTreeMap<String, usize>,
|
||||
}
|
||||
|
||||
impl Str {
|
||||
|
@ -1145,8 +1207,20 @@ pub mod metadata {
|
|||
}
|
||||
|
||||
pub fn size(&self) -> usize {
|
||||
// TODO!!!!
|
||||
0 //self.range.0.len() + self.range.1.len() + std::mem::size_of::<usize>()
|
||||
// size of types for num_rows and range
|
||||
let base_size = size_of::<usize>() + (2 * size_of::<Option<String>>());
|
||||
match &self.range {
|
||||
(None, None) => base_size,
|
||||
(Some(min), None) => base_size + min.len(),
|
||||
(None, Some(max)) => base_size + max.len(),
|
||||
(Some(min), Some(max)) => base_size + min.len() + max.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Str {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Range: ({:?})", self.range)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1184,7 +1258,13 @@ pub mod metadata {
|
|||
}
|
||||
|
||||
pub fn size(&self) -> usize {
|
||||
std::mem::size_of::<(f64, f64)>() + std::mem::size_of::<usize>()
|
||||
size_of::<usize>() + (size_of::<(f64, f64)>())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for F64 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Range: ({:?})", self.range)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1219,7 +1299,13 @@ pub mod metadata {
|
|||
}
|
||||
|
||||
pub fn size(&self) -> usize {
|
||||
std::mem::size_of::<(i64, i64)>() + std::mem::size_of::<usize>()
|
||||
size_of::<usize>() + (size_of::<(i64, i64)>())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for I64 {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Range: ({:?})", self.range)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::iter;
|
||||
use std::mem::size_of;
|
||||
|
||||
use arrow::array::{Array, PrimitiveArray};
|
||||
use arrow::datatypes::ArrowNumericType;
|
||||
use arrow::datatypes::*;
|
||||
|
||||
pub trait Encoding: Send + Sync {
|
||||
pub trait NumericEncoding: Send + Sync + std::fmt::Display + std::fmt::Debug {
|
||||
type Item;
|
||||
|
||||
fn size(&self) -> usize;
|
||||
|
@ -15,39 +15,15 @@ pub trait Encoding: Send + Sync {
|
|||
fn all_encoded_values(&self) -> Vec<Self::Item>;
|
||||
fn scan_from(&self, row_id: usize) -> &[Self::Item];
|
||||
|
||||
fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize;
|
||||
|
||||
// TODO(edd): clean up the API for getting row ids that match predicates.
|
||||
//
|
||||
// Ideally you should be able to provide a collection of predicates to
|
||||
// match on.
|
||||
//
|
||||
// A simpler approach would be to provide a method that matches on a single
|
||||
// predicate and then call that multiple times, unioning or intersecting the
|
||||
// resulting row sets.
|
||||
fn row_id_eq_value(&self, v: Self::Item) -> Option<usize>;
|
||||
fn row_ids_single_cmp_roaring(
|
||||
&self,
|
||||
wanted: &Self::Item,
|
||||
order: std::cmp::Ordering,
|
||||
) -> croaring::Bitmap;
|
||||
fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap;
|
||||
}
|
||||
|
||||
pub trait NumericEncoding: Send + Sync {
|
||||
type Item;
|
||||
|
||||
fn size(&self) -> usize;
|
||||
fn value(&self, row_id: usize) -> Self::Item;
|
||||
fn values(&self, row_ids: &[usize]) -> Vec<Self::Item>;
|
||||
fn encoded_values(&self, row_ids: &[usize]) -> Vec<Self::Item>;
|
||||
fn all_encoded_values(&self) -> Vec<Self::Item>;
|
||||
fn scan_from(&self, row_id: usize) -> &[Self::Item];
|
||||
fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item;
|
||||
fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item;
|
||||
|
||||
fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize;
|
||||
fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64;
|
||||
|
||||
fn row_id_eq_value(&self, v: Self::Item) -> Option<usize>;
|
||||
fn row_id_ge_value(&self, v: Self::Item) -> Option<usize>;
|
||||
|
||||
fn row_ids_single_cmp_roaring(
|
||||
&self,
|
||||
wanted: &Self::Item,
|
||||
|
@ -56,39 +32,37 @@ pub trait NumericEncoding: Send + Sync {
|
|||
fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap;
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for dyn NumericEncoding<Item = f64> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", "todo")
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for dyn NumericEncoding<Item = i64> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", "todo")
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct PlainArrow<T>
|
||||
where
|
||||
// T: ArrowNumericType + std::ops::Add,
|
||||
T: ArrowNumericType,
|
||||
// T: Default + PartialEq + PartialOrd + Copy + std::fmt::Debug + std::ops::AddAssign,
|
||||
T::Native: Default
|
||||
+ PartialEq
|
||||
+ PartialOrd
|
||||
+ Copy
|
||||
+ std::fmt::Debug
|
||||
+ std::ops::Add<Output = T::Native>,
|
||||
{
|
||||
arr: PrimitiveArray<T>,
|
||||
// _phantom: T,
|
||||
}
|
||||
|
||||
impl<T> PlainArrow<T>
|
||||
impl<T> NumericEncoding for PlainArrow<T>
|
||||
where
|
||||
// T: ArrowNumericType + std::ops::Add,
|
||||
T: ArrowNumericType,
|
||||
// T: Default + PartialEq + PartialOrd + Copy + std::fmt::Debug + std::ops::AddAssign,
|
||||
T: ArrowNumericType + std::fmt::Debug,
|
||||
T::Native: Default
|
||||
+ PartialEq
|
||||
+ PartialOrd
|
||||
+ Copy
|
||||
+ std::fmt::Debug
|
||||
+ std::ops::Add<Output = T::Native>,
|
||||
{
|
||||
pub fn size(&self) -> usize {
|
||||
type Item = Option<T::Native>;
|
||||
|
||||
fn size(&self) -> usize {
|
||||
self.arr.len()
|
||||
}
|
||||
|
||||
pub fn value(&self, row_id: usize) -> Option<T::Native> {
|
||||
fn value(&self, row_id: usize) -> Option<T::Native> {
|
||||
if self.arr.is_null(row_id) {
|
||||
return None;
|
||||
}
|
||||
|
@ -109,12 +83,12 @@ where
|
|||
}
|
||||
|
||||
/// Well this is terribly slow
|
||||
pub fn encoded_values(&self, row_ids: &[usize]) -> Vec<Option<T::Native>> {
|
||||
fn encoded_values(&self, row_ids: &[usize]) -> Vec<Option<T::Native>> {
|
||||
self.values(row_ids)
|
||||
}
|
||||
|
||||
/// TODO(edd): there must be a more efficient way.
|
||||
pub fn all_encoded_values(&self) -> Vec<Option<T::Native>> {
|
||||
fn all_encoded_values(&self) -> Vec<Option<T::Native>> {
|
||||
let mut out = Vec::with_capacity(self.arr.len());
|
||||
for i in 0..self.arr.len() {
|
||||
if self.arr.is_null(i) {
|
||||
|
@ -127,11 +101,10 @@ where
|
|||
out
|
||||
}
|
||||
|
||||
pub fn scan_from(&self, row_id: usize) -> &[Option<T::Native>] {
|
||||
// todo
|
||||
|
||||
&[]
|
||||
|
||||
// TODO(edd): problem here is returning a slice because we need to own the
|
||||
// backing vector.
|
||||
fn scan_from(&self, row_id: usize) -> &[Option<T::Native>] {
|
||||
unimplemented!("need to figure out returning a slice");
|
||||
// let mut out = Vec::with_capacity(self.arr.len() - row_id);
|
||||
// for i in row_id..self.arr.len() {
|
||||
// if self.arr.is_null(i) {
|
||||
|
@ -144,18 +117,22 @@ where
|
|||
// out.as_slice()
|
||||
}
|
||||
|
||||
pub fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Option<T::Native> {
|
||||
// let mut res = T::Native::default();
|
||||
|
||||
// // HMMMMM - materialising which has a memory cost.
|
||||
// let vec = row_ids.to_vec();
|
||||
// for v in vec {
|
||||
// res += self.arr.value(v as usize);
|
||||
// }
|
||||
None // todo
|
||||
fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Option<T::Native> {
|
||||
// TODO(edd): this is expensive - may pay to expose method to do this
|
||||
// where you accept an array.
|
||||
let mut res = T::Native::default();
|
||||
let vec = row_ids.to_vec();
|
||||
for row_id in vec {
|
||||
let i = row_id as usize;
|
||||
if self.arr.is_null(i) {
|
||||
return None;
|
||||
}
|
||||
res = res + self.arr.value(i);
|
||||
}
|
||||
Some(res)
|
||||
}
|
||||
|
||||
pub fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Option<T::Native> {
|
||||
fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Option<T::Native> {
|
||||
// if the column contains a null value between the range then the result
|
||||
// will be None.
|
||||
for i in from_row_id..to_row_id {
|
||||
|
@ -165,57 +142,35 @@ where
|
|||
}
|
||||
|
||||
// Otherwise sum all the values between in the range.
|
||||
// let mut res = f64::from(self.arr.value(from_row_id));
|
||||
// for i in from_row_id + 1..to_row_id {
|
||||
// res = res + self.arr.value(i);
|
||||
// }
|
||||
// Some(res)
|
||||
None
|
||||
}
|
||||
|
||||
pub fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize {
|
||||
// TODO - count values that are not null in the row range.
|
||||
0 // todo
|
||||
}
|
||||
}
|
||||
|
||||
impl NumericEncoding for PlainArrow<Float64Type> {
|
||||
type Item = Option<f64>;
|
||||
|
||||
fn size(&self) -> usize {
|
||||
self.size()
|
||||
}
|
||||
|
||||
fn value(&self, row_id: usize) -> Self::Item {
|
||||
self.value(row_id)
|
||||
}
|
||||
|
||||
fn values(&self, row_ids: &[usize]) -> Vec<Self::Item> {
|
||||
self.values(row_ids)
|
||||
}
|
||||
|
||||
fn encoded_values(&self, row_ids: &[usize]) -> Vec<Self::Item> {
|
||||
self.encoded_values(row_ids)
|
||||
}
|
||||
|
||||
fn all_encoded_values(&self) -> Vec<Self::Item> {
|
||||
self.all_encoded_values()
|
||||
}
|
||||
|
||||
fn scan_from(&self, row_id: usize) -> &[Self::Item] {
|
||||
self.scan_from(row_id)
|
||||
}
|
||||
|
||||
fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item {
|
||||
self.sum_by_ids(row_ids)
|
||||
}
|
||||
|
||||
fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item {
|
||||
self.sum_by_id_range(from_row_id, to_row_id)
|
||||
let mut res = T::Native::default();
|
||||
for i in from_row_id..to_row_id {
|
||||
res = res + self.arr.value(i);
|
||||
}
|
||||
Some(res)
|
||||
}
|
||||
|
||||
fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize {
|
||||
self.count_by_id_range(from_row_id, to_row_id)
|
||||
// TODO - count values that are not null in the row range.
|
||||
let mut count = 0;
|
||||
for i in from_row_id..to_row_id {
|
||||
if self.arr.is_null(i) {
|
||||
continue;
|
||||
}
|
||||
count += 1;
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn row_id_eq_value(&self, v: Self::Item) -> Option<usize> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn row_id_ge_value(&self, v: Self::Item) -> Option<usize> {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn row_ids_single_cmp_roaring(
|
||||
|
@ -223,15 +178,26 @@ impl NumericEncoding for PlainArrow<Float64Type> {
|
|||
wanted: &Self::Item,
|
||||
order: std::cmp::Ordering,
|
||||
) -> croaring::Bitmap {
|
||||
self.row_ids_single_cmp_roaring(wanted, order)
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap {
|
||||
self.row_ids_gte_lt_roaring(from, to)
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
fn row_id_eq_value(&self, v: Self::Item) -> Option<usize> {
|
||||
self.row_id_eq_value(v)
|
||||
impl<T: ArrowNumericType> std::fmt::Display for PlainArrow<T>
|
||||
where
|
||||
T: ArrowNumericType + std::fmt::Debug,
|
||||
T::Native: Default
|
||||
+ PartialEq
|
||||
+ PartialOrd
|
||||
+ Copy
|
||||
+ std::fmt::Debug
|
||||
+ std::ops::Add<Output = T::Native>,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[PlainArrow<T>] size: {}", self.size())
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -242,31 +208,60 @@ pub struct PlainFixed<T> {
|
|||
// total_order can be used as a hint to stop scanning the column early when
|
||||
// applying a comparison predicate to the column.
|
||||
total_order: bool,
|
||||
|
||||
size: usize,
|
||||
}
|
||||
|
||||
impl<T> PlainFixed<T>
|
||||
impl<T> std::fmt::Display for PlainFixed<T>
|
||||
where
|
||||
T: Default + PartialEq + PartialOrd + Copy + std::fmt::Debug + std::ops::AddAssign,
|
||||
T: Default
|
||||
+ PartialEq
|
||||
+ PartialOrd
|
||||
+ Copy
|
||||
+ std::fmt::Debug
|
||||
+ std::fmt::Display
|
||||
+ Sync
|
||||
+ Send
|
||||
+ std::ops::AddAssign,
|
||||
{
|
||||
pub fn size(&self) -> usize {
|
||||
self.values.len() * std::mem::size_of::<T>()
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[PlainFixed<T>] size: {}", self.size(),)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> NumericEncoding for PlainFixed<T>
|
||||
where
|
||||
T: Default
|
||||
+ PartialEq
|
||||
+ PartialOrd
|
||||
+ Copy
|
||||
+ std::fmt::Debug
|
||||
+ std::fmt::Display
|
||||
+ Sync
|
||||
+ Send
|
||||
+ std::ops::AddAssign,
|
||||
{
|
||||
type Item = T;
|
||||
|
||||
fn size(&self) -> usize {
|
||||
self.size
|
||||
}
|
||||
|
||||
pub fn row_id_eq_value(&self, v: T) -> Option<usize> {
|
||||
fn row_id_eq_value(&self, v: T) -> Option<usize> {
|
||||
self.values.iter().position(|x| *x == v)
|
||||
}
|
||||
|
||||
pub fn row_id_ge_value(&self, v: T) -> Option<usize> {
|
||||
fn row_id_ge_value(&self, v: T) -> Option<usize> {
|
||||
self.values.iter().position(|x| *x >= v)
|
||||
}
|
||||
|
||||
// get value at row_id. Panics if out of bounds.
|
||||
pub fn value(&self, row_id: usize) -> T {
|
||||
fn value(&self, row_id: usize) -> T {
|
||||
self.values[row_id]
|
||||
}
|
||||
|
||||
/// Return the decoded values for the provided logical row ids.
|
||||
pub fn values(&self, row_ids: &[usize]) -> Vec<T> {
|
||||
fn values(&self, row_ids: &[usize]) -> Vec<T> {
|
||||
let mut out = Vec::with_capacity(row_ids.len());
|
||||
for chunks in row_ids.chunks_exact(4) {
|
||||
out.push(self.values[chunks[3]]);
|
||||
|
@ -286,24 +281,24 @@ where
|
|||
|
||||
/// Return the raw encoded values for the provided logical row ids. For Plain
|
||||
/// encoding this is just the decoded values.
|
||||
pub fn encoded_values(&self, row_ids: &[usize]) -> Vec<T> {
|
||||
fn encoded_values(&self, row_ids: &[usize]) -> Vec<T> {
|
||||
self.values(row_ids)
|
||||
}
|
||||
|
||||
/// Return all encoded values. For this encoding this is just the decoded
|
||||
/// values
|
||||
pub fn all_encoded_values(&self) -> Vec<T> {
|
||||
fn all_encoded_values(&self) -> Vec<T> {
|
||||
self.values.clone()
|
||||
}
|
||||
|
||||
pub fn scan_from(&self, row_id: usize) -> &[T] {
|
||||
fn scan_from(&self, row_id: usize) -> &[T] {
|
||||
&self.values[row_id..]
|
||||
}
|
||||
|
||||
/// returns a set of row ids that match a single ordering on a desired value
|
||||
///
|
||||
/// This supports `value = x` , `value < x` or `value > x`.
|
||||
pub fn row_ids_single_cmp_roaring(
|
||||
fn row_ids_single_cmp_roaring(
|
||||
&self,
|
||||
wanted: &T,
|
||||
order: std::cmp::Ordering,
|
||||
|
@ -343,7 +338,7 @@ where
|
|||
/// returns a set of row ids that match the half open interval `[from, to)`.
|
||||
///
|
||||
/// The main use-case for this is time range filtering.
|
||||
pub fn row_ids_gte_lt_roaring(&self, from: &T, to: &T) -> croaring::Bitmap {
|
||||
fn row_ids_gte_lt_roaring(&self, from: &T, to: &T) -> croaring::Bitmap {
|
||||
let mut bm = croaring::Bitmap::create();
|
||||
|
||||
let mut found = false; //self.values[0];
|
||||
|
@ -376,7 +371,7 @@ where
|
|||
bm
|
||||
}
|
||||
|
||||
pub fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> T {
|
||||
fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> T {
|
||||
let mut res = T::default();
|
||||
for v in self.values[from_row_id..to_row_id].iter() {
|
||||
res += *v;
|
||||
|
@ -384,12 +379,12 @@ where
|
|||
res
|
||||
}
|
||||
|
||||
pub fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize {
|
||||
fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize {
|
||||
to_row_id - from_row_id
|
||||
}
|
||||
|
||||
// TODO(edd): make faster
|
||||
pub fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> T {
|
||||
fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> T {
|
||||
let mut res = T::default();
|
||||
// println!(
|
||||
// "cardinality is {:?} out of {:?}",
|
||||
|
@ -449,7 +444,7 @@ where
|
|||
res
|
||||
}
|
||||
|
||||
pub fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64 {
|
||||
fn count_by_ids(&self, row_ids: &croaring::Bitmap) -> u64 {
|
||||
row_ids.cardinality()
|
||||
}
|
||||
}
|
||||
|
@ -460,6 +455,10 @@ impl From<&[i64]> for PlainFixed<i64> {
|
|||
values: v.to_vec(),
|
||||
// buf: Vec::with_capacity(v.len()),
|
||||
total_order: false,
|
||||
size: size_of::<Vec<i64>>()
|
||||
+ (size_of::<i64>() * v.len())
|
||||
+ size_of::<bool>()
|
||||
+ size_of::<usize>(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -470,122 +469,14 @@ impl From<&[f64]> for PlainFixed<f64> {
|
|||
values: v.to_vec(),
|
||||
// buf: Vec::with_capacity(v.len()),
|
||||
total_order: false,
|
||||
size: size_of::<Vec<f64>>()
|
||||
+ (size_of::<f64>() * v.len())
|
||||
+ size_of::<bool>()
|
||||
+ size_of::<usize>(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl NumericEncoding for PlainFixed<f64> {
|
||||
type Item = f64;
|
||||
|
||||
fn size(&self) -> usize {
|
||||
self.size()
|
||||
}
|
||||
|
||||
fn value(&self, row_id: usize) -> Self::Item {
|
||||
self.value(row_id)
|
||||
}
|
||||
|
||||
fn values(&self, row_ids: &[usize]) -> Vec<Self::Item> {
|
||||
self.values(row_ids)
|
||||
}
|
||||
|
||||
fn encoded_values(&self, row_ids: &[usize]) -> Vec<Self::Item> {
|
||||
self.encoded_values(row_ids)
|
||||
}
|
||||
|
||||
fn all_encoded_values(&self) -> Vec<Self::Item> {
|
||||
self.all_encoded_values()
|
||||
}
|
||||
|
||||
fn scan_from(&self, row_id: usize) -> &[Self::Item] {
|
||||
self.scan_from(row_id)
|
||||
}
|
||||
|
||||
fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item {
|
||||
self.sum_by_ids(row_ids)
|
||||
}
|
||||
|
||||
fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item {
|
||||
self.sum_by_id_range(from_row_id, to_row_id)
|
||||
}
|
||||
|
||||
fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize {
|
||||
self.count_by_id_range(from_row_id, to_row_id)
|
||||
}
|
||||
|
||||
fn row_ids_single_cmp_roaring(
|
||||
&self,
|
||||
wanted: &Self::Item,
|
||||
order: std::cmp::Ordering,
|
||||
) -> croaring::Bitmap {
|
||||
self.row_ids_single_cmp_roaring(wanted, order)
|
||||
}
|
||||
|
||||
fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap {
|
||||
self.row_ids_gte_lt_roaring(from, to)
|
||||
}
|
||||
|
||||
fn row_id_eq_value(&self, v: Self::Item) -> Option<usize> {
|
||||
self.row_id_eq_value(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl NumericEncoding for PlainFixed<i64> {
|
||||
type Item = i64;
|
||||
|
||||
fn size(&self) -> usize {
|
||||
self.size()
|
||||
}
|
||||
|
||||
fn value(&self, row_id: usize) -> Self::Item {
|
||||
self.value(row_id)
|
||||
}
|
||||
|
||||
fn values(&self, row_ids: &[usize]) -> Vec<Self::Item> {
|
||||
self.values(row_ids)
|
||||
}
|
||||
|
||||
fn encoded_values(&self, row_ids: &[usize]) -> Vec<Self::Item> {
|
||||
self.encoded_values(row_ids)
|
||||
}
|
||||
|
||||
fn all_encoded_values(&self) -> Vec<Self::Item> {
|
||||
self.all_encoded_values()
|
||||
}
|
||||
|
||||
fn scan_from(&self, row_id: usize) -> &[Self::Item] {
|
||||
self.scan_from(row_id)
|
||||
}
|
||||
|
||||
fn sum_by_ids(&self, row_ids: &mut croaring::Bitmap) -> Self::Item {
|
||||
self.sum_by_ids(row_ids)
|
||||
}
|
||||
|
||||
fn sum_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> Self::Item {
|
||||
self.sum_by_id_range(from_row_id, to_row_id)
|
||||
}
|
||||
|
||||
fn count_by_id_range(&self, from_row_id: usize, to_row_id: usize) -> usize {
|
||||
self.count_by_id_range(from_row_id, to_row_id)
|
||||
}
|
||||
|
||||
fn row_ids_single_cmp_roaring(
|
||||
&self,
|
||||
wanted: &Self::Item,
|
||||
order: std::cmp::Ordering,
|
||||
) -> croaring::Bitmap {
|
||||
self.row_ids_single_cmp_roaring(wanted, order)
|
||||
}
|
||||
|
||||
fn row_ids_gte_lt_roaring(&self, from: &Self::Item, to: &Self::Item) -> croaring::Bitmap {
|
||||
self.row_ids_gte_lt_roaring(from, to)
|
||||
}
|
||||
|
||||
fn row_id_eq_value(&self, v: Self::Item) -> Option<usize> {
|
||||
self.row_id_eq_value(v)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DictionaryRLE {
|
||||
// stores the mapping between an entry and its assigned index.
|
||||
|
@ -602,7 +493,6 @@ pub struct DictionaryRLE {
|
|||
// stores tuples where each pair refers to a dictionary entry and the number
|
||||
// of times the entry repeats.
|
||||
run_lengths: Vec<(usize, u64)>,
|
||||
run_length_size: usize,
|
||||
|
||||
total: u64,
|
||||
}
|
||||
|
@ -615,7 +505,6 @@ impl DictionaryRLE {
|
|||
index_entry: BTreeMap::new(),
|
||||
map_size: 0,
|
||||
run_lengths: Vec::new(),
|
||||
run_length_size: 0,
|
||||
total: 0,
|
||||
}
|
||||
}
|
||||
|
@ -627,7 +516,6 @@ impl DictionaryRLE {
|
|||
index_entry: BTreeMap::new(),
|
||||
map_size: 0,
|
||||
run_lengths: Vec::new(),
|
||||
run_length_size: 0,
|
||||
total: 0,
|
||||
};
|
||||
|
||||
|
@ -663,7 +551,6 @@ impl DictionaryRLE {
|
|||
} else {
|
||||
// start a new run-length
|
||||
self.run_lengths.push((*idx, additional));
|
||||
self.run_length_size += std::mem::size_of::<(usize, u64)>();
|
||||
}
|
||||
self.index_row_ids
|
||||
.get_mut(&(*idx as u32))
|
||||
|
@ -690,7 +577,6 @@ impl DictionaryRLE {
|
|||
.get_mut(&(idx as u32))
|
||||
.unwrap()
|
||||
.add_range(self.total..self.total + additional);
|
||||
self.run_length_size += std::mem::size_of::<(usize, u64)>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -927,8 +813,28 @@ impl DictionaryRLE {
|
|||
}
|
||||
|
||||
pub fn size(&self) -> usize {
|
||||
// mapping and reverse mapping then the rles
|
||||
2 * self.map_size + self.run_length_size
|
||||
// entry_index: BTreeMap<Option<String>, usize>,
|
||||
|
||||
// // stores the mapping between an index and its entry.
|
||||
// index_entry: BTreeMap<usize, Option<String>>,
|
||||
|
||||
(self.index_entry.len() * size_of::<BTreeMap<usize, Option<String>>>())
|
||||
+ (self.index_row_ids.len() * size_of::<BTreeMap<u32, croaring::Bitmap>>())
|
||||
+ size_of::<usize>()
|
||||
+ (self.run_lengths.len() * size_of::<Vec<(usize, u64)>>())
|
||||
+ size_of::<u64>()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DictionaryRLE {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"[DictionaryRLE] size: {}, dict entries: {}, runs: {} ",
|
||||
self.size(),
|
||||
self.index_entry.len(),
|
||||
self.run_lengths.len()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -959,6 +865,7 @@ impl std::convert::From<&delorean_table::Packer<delorean_table::ByteArray>> for
|
|||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::NumericEncoding;
|
||||
|
||||
#[test]
|
||||
fn plain_arrow() {
|
||||
|
|
|
@ -934,6 +934,21 @@ impl Segment {
|
|||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Segment {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
writeln!(
|
||||
f,
|
||||
"Rows: {}\nSize: {} Columns: ",
|
||||
self.num_rows(),
|
||||
self.size()
|
||||
)?;
|
||||
for (c, name) in self.columns.iter().zip(self.column_names().iter()) {
|
||||
writeln!(f, "{} {}", name, c)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta data for a segment. This data is mainly used to determine if a segment
|
||||
/// may contain a value that can answer a query.
|
||||
#[derive(Debug)]
|
||||
|
|
Loading…
Reference in New Issue