feat: optimize dedup dictionaries (#1827)

* fix: handle sliced null bitmasks
* chore: review feedback

parent fef160e24f
commit 62d3305923
@@ -85,9 +85,13 @@ fn optimize_dict_col(
             None => -1,
         });
 
-    Ok(Arc::new(
-        new_dictionary.to_arrow(new_keys, keys.data().null_buffer().cloned()),
-    ))
+    let offset = keys.data().offset();
+    let nulls = keys
+        .data()
+        .null_buffer()
+        .map(|buffer| buffer.bit_slice(offset, keys.len()));
+
+    Ok(Arc::new(new_dictionary.to_arrow(new_keys, nulls)))
 }
 
 #[cfg(test)]
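Why the rebasing matters: slicing an arrow array does not rewrite the shared null buffer, it only records an offset, so passing the parent's buffer straight through (as the deleted `null_buffer().cloned()` did) attaches validity bits that are shifted relative to the sliced keys. A minimal sketch of the fix, assuming the same pre-NullBuffer arrow APIs this diff is written against:

use arrow::array::{Array, Int32Array};

fn main() {
    // Row 0 is null; after slice(1, 2) the logical rows are [Some(0), None],
    // but the shared null buffer still describes all three parent rows.
    let keys = Int32Array::from(vec![None, Some(0), None]);
    let sliced = keys.slice(1, 2);

    // Rebase the validity bits onto the slice, as the hunk above now does.
    let offset = sliced.data().offset();
    let nulls = sliced
        .data()
        .null_buffer()
        .map(|buffer| buffer.bit_slice(offset, sliced.len()));

    assert_eq!(offset, 1);
    assert!(nulls.is_some());
}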
@@ -118,19 +122,9 @@ mod tests {
             Some(3),
         ]);
 
-        let data = ArrayDataBuilder::new(DataType::Dictionary(
-            Box::new(DataType::Int32),
-            Box::new(DataType::Utf8),
-        ))
-        .len(keys.len())
-        .add_buffer(keys.data().buffers()[0].clone())
-        .null_bit_buffer(keys.data().null_buffer().unwrap().clone())
-        .add_child_data(values.data().clone())
-        .build();
-
         let batch = RecordBatch::try_from_iter(vec![(
             "foo",
-            Arc::new(DictionaryArray::<Int32Type>::from(data)) as ArrayRef,
+            Arc::new(build_dict(keys, values)) as ArrayRef,
         )])
         .unwrap();
 
@@ -244,4 +238,66 @@ mod tests {
             &[optimized]
         );
     }
+
+    #[test]
+    fn test_null() {
+        let values = StringArray::from(vec!["bananas"]);
+        let keys = Int32Array::from(vec![None, None, Some(0)]);
+        let col = Arc::new(build_dict(keys, values)) as ArrayRef;
+
+        let col = optimize_dict_col(&col, &DataType::Int32, &DataType::Utf8).unwrap();
+
+        let batch = RecordBatch::try_from_iter(vec![("t", col)]).unwrap();
+
+        assert_batches_eq!(
+            vec![
+                "+---------+",
+                "| t       |",
+                "+---------+",
+                "|         |",
+                "|         |",
+                "| bananas |",
+                "+---------+",
+            ],
+            &[batch]
+        );
+    }
+
+    #[test]
+    fn test_slice() {
+        let values = StringArray::from(vec!["bananas"]);
+        let keys = Int32Array::from(vec![None, Some(0), None]);
+        let col = Arc::new(build_dict(keys, values)) as ArrayRef;
+        let col = col.slice(1, 2);
+
+        let col = optimize_dict_col(&col, &DataType::Int32, &DataType::Utf8).unwrap();
+
+        let batch = RecordBatch::try_from_iter(vec![("t", col)]).unwrap();
+
+        assert_batches_eq!(
+            vec![
+                "+---------+",
+                "| t       |",
+                "+---------+",
+                "| bananas |",
+                "|         |",
+                "+---------+",
+            ],
+            &[batch]
+        );
+    }
+
+    fn build_dict(keys: Int32Array, values: StringArray) -> DictionaryArray<Int32Type> {
+        let data = ArrayDataBuilder::new(DataType::Dictionary(
+            Box::new(DataType::Int32),
+            Box::new(DataType::Utf8),
+        ))
+        .len(keys.len())
+        .add_buffer(keys.data().buffers()[0].clone())
+        .null_bit_buffer(keys.data().null_buffer().unwrap().clone())
+        .add_child_data(values.data().clone())
+        .build();
+
+        DictionaryArray::from(data)
+    }
 }
@@ -251,7 +251,7 @@ async fn deduplicate(
 #[cfg(test)]
 mod test {
     use arrow::compute::SortOptions;
-    use arrow::datatypes::SchemaRef;
+    use arrow::datatypes::{Int32Type, SchemaRef};
     use arrow::{
         array::{ArrayRef, Float64Array, StringArray},
         record_batch::RecordBatch,
@@ -260,6 +260,8 @@ mod test {
     use datafusion::physical_plan::{collect, expressions::col, memory::MemoryExec};
 
     use super::*;
+    use arrow::array::DictionaryArray;
+    use std::iter::FromIterator;
 
     #[tokio::test]
     async fn test_single_tag() {
@@ -777,6 +779,90 @@ mod test {
         );
     }
 
+    #[tokio::test]
+    async fn test_dictionary() {
+        let t1 = DictionaryArray::<Int32Type>::from_iter(vec![Some("a"), Some("a"), Some("b")]);
+        let t2 = DictionaryArray::<Int32Type>::from_iter(vec![Some("b"), Some("c"), Some("c")]);
+        let f1 = Float64Array::from(vec![Some(1.0), Some(3.0), Some(4.0)]);
+        let f2 = Float64Array::from(vec![Some(2.0), None, Some(5.0)]);
+
+        let batch1 = RecordBatch::try_from_iter(vec![
+            ("t1", Arc::new(t1) as ArrayRef),
+            ("t2", Arc::new(t2) as ArrayRef),
+            ("f1", Arc::new(f1) as ArrayRef),
+            ("f2", Arc::new(f2) as ArrayRef),
+        ])
+        .unwrap();
+
+        let t1 = DictionaryArray::<Int32Type>::from_iter(vec![Some("b"), Some("c")]);
+        let t2 = DictionaryArray::<Int32Type>::from_iter(vec![Some("c"), Some("d")]);
+        let f1 = Float64Array::from(vec![None, Some(7.0)]);
+        let f2 = Float64Array::from(vec![Some(6.0), Some(8.0)]);
+
+        let batch2 = RecordBatch::try_from_iter(vec![
+            ("t1", Arc::new(t1) as ArrayRef),
+            ("t2", Arc::new(t2) as ArrayRef),
+            ("f1", Arc::new(f1) as ArrayRef),
+            ("f2", Arc::new(f2) as ArrayRef),
+        ])
+        .unwrap();
+
+        let sort_keys = vec![
+            PhysicalSortExpr {
+                expr: col("t1"),
+                options: SortOptions {
+                    descending: false,
+                    nulls_first: false,
+                },
+            },
+            PhysicalSortExpr {
+                expr: col("t2"),
+                options: SortOptions {
+                    descending: false,
+                    nulls_first: false,
+                },
+            },
+        ];
+
+        let results = dedupe(vec![batch1, batch2], sort_keys).await;
+
+        let cols: Vec<_> = results
+            .output
+            .iter()
+            .map(|batch| {
+                batch
+                    .column(batch.schema().column_with_name("t1").unwrap().0)
+                    .as_any()
+                    .downcast_ref::<DictionaryArray<Int32Type>>()
+                    .unwrap()
+            })
+            .collect();
+
+        // Should produce optimised dictionaries
+        // The batching is not important
+        assert_eq!(cols.len(), 3);
+        assert_eq!(cols[0].keys().len(), 2);
+        assert_eq!(cols[0].values().len(), 1); // "a"
+        assert_eq!(cols[1].keys().len(), 1);
+        assert_eq!(cols[1].values().len(), 1); // "b"
+        assert_eq!(cols[2].keys().len(), 1);
+        assert_eq!(cols[2].values().len(), 1); // "c"
+
+        let expected = vec![
+            "+----+----+----+----+",
+            "| t1 | t2 | f1 | f2 |",
+            "+----+----+----+----+",
+            "| a  | b  | 1  | 2  |",
+            "| a  | c  | 3  |    |",
+            "| b  | c  | 4  | 6  |",
+            "| c  | d  | 7  | 8  |",
+            "+----+----+----+----+",
+        ];
+        assert_batches_eq!(&expected, &results.output);
+        // 5 rows in initial input, 4 rows in output ==> 1 dupe
+        assert_eq!(results.num_dupes(), 5 - 4);
+    }
+
     struct TestResults {
         output: Vec<RecordBatch>,
         exec: Arc<DeduplicateExec>,
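The `keys().len()` versus `values().len()` assertions above hinge on the builder behind `from_iter` interning each distinct value exactly once. A standalone illustration of that property, using only the APIs the test itself exercises (plus the `Array` trait import for `len()`):

use arrow::array::{Array, DictionaryArray};
use arrow::datatypes::Int32Type;
use std::iter::FromIterator;

fn main() {
    // Three keys but only two distinct values: from_iter dedups the values
    // array, which is what lets each optimized output batch above carry a
    // values array of length 1.
    let dict = DictionaryArray::<Int32Type>::from_iter(vec![Some("a"), Some("a"), Some("b")]);
    assert_eq!(dict.keys().len(), 3);
    assert_eq!(dict.values().len(), 2);
}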
@@ -9,6 +9,7 @@ use arrow::{
     record_batch::RecordBatch,
 };
 
+use arrow_util::optimize::optimize_dictionaries;
 use datafusion::physical_plan::{
     coalesce_batches::concat_batches, expressions::PhysicalSortExpr, PhysicalExpr, SQLMetric,
 };
@@ -177,7 +178,12 @@ impl RecordBatchDeduplicator {
                 }
             })
             .collect::<ArrowResult<Vec<ArrayRef>>>()?;
-        RecordBatch::try_new(batch.schema(), new_columns)
+
+        let batch = RecordBatch::try_new(batch.schema(), new_columns)?;
+        // At time of writing, `MutableArrayData` concatenates the
+        // contents of dictionaries as well; do a post pass to remove the
+        // redundancy if possible
+        optimize_dictionaries(&batch)
     }
 }
 
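`optimize_dictionaries` lives in `arrow_util::optimize`, the file changed in the first hunk; per column it amounts to re-encoding each dictionary so only the values its keys actually reference survive. A hedged per-column sketch of that idea — `compact` is a hypothetical name for illustration, not the crate's API, and it assumes (as the test above does) that `keys()` exposes the keys array:

use arrow::array::{Array, DictionaryArray, StringArray};
use arrow::datatypes::Int32Type;
use std::iter::FromIterator;

// Hypothetical helper: resolve every key to its string, then let the
// builder behind from_iter intern only the values that are actually
// referenced, dropping the redundancy left behind by concatenation.
fn compact(dict: &DictionaryArray<Int32Type>) -> DictionaryArray<Int32Type> {
    let values = dict
        .values()
        .as_any()
        .downcast_ref::<StringArray>()
        .expect("string dictionary");

    DictionaryArray::<Int32Type>::from_iter(
        dict.keys()
            .iter()
            .map(|key| key.map(|k| values.value(k as usize))),
    )
}

The real implementation must also cope with sliced inputs, rebasing the null bitmask by the array offset — exactly the bug the first hunk fixes.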
@@ -233,7 +239,12 @@ impl RecordBatchDeduplicator {
             .map(|old_column| old_column.slice(offset, len))
             .collect();
 
-        RecordBatch::try_new(schema, new_columns)
+        let batch = RecordBatch::try_new(schema, new_columns)?;
+
+        // At time of writing, `concat_batches` concatenates the
+        // contents of dictionaries as well; do a post pass to remove the
+        // redundancy if possible
+        optimize_dictionaries(&batch)
     }
 }