refactor: expose public API

2020-11-12 18:23:34 +00:00 · 2020-11-12 18:23:34 +00:00 · d54c30147e
parent fc881776dd
commit d54c30147e
3 changed files with 16 additions and 14 deletions
--- a/segment_store/src/column.rs
+++ b/segment_store/src/column.rs
@@ -22,7 +22,7 @@ use arrow_deps::{arrow, arrow::array::Array};
 // FWIW it's not the cardinality of the column that should drive the decision
 // it's how many run-lengths would be produced in an RLE column and whether that
 // compression is worth the memory and compute costs to work on it.
-pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 1_000_000;
+pub const TEMP_CARDINALITY_DICTIONARY_ENCODING_LIMIT: usize = 100_000;
 /// The possible logical types that column values can have. All values in a
 /// column have the same physical type.
 pub enum Column {
--- a/segment_store/src/column/dictionary.rs
+++ b/segment_store/src/column/dictionary.rs
@@ -20,7 +20,7 @@ pub enum Encoding {
 }

 impl Encoding {
-    fn debug_name(&self) -> &'static str {
+    pub fn debug_name(&self) -> &'static str {
        match &self {
            Encoding::RLE(_) => "RLE encoder",
            Encoding::Plain(_) => "plain encoder",
@@ -81,7 +81,7 @@ impl Encoding {

    /// Populates the provided destination container with the row ids satisfying
    /// the provided predicate.
-    fn row_ids_filter(&self, value: &str, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
+    pub fn row_ids_filter(&self, value: &str, op: &cmp::Operator, dst: RowIDs) -> RowIDs {
        match self {
            Encoding::RLE(enc) => enc.row_ids_filter(value, op, dst),
            Encoding::Plain(enc) => enc.row_ids_filter(value, op, dst),
@@ -161,7 +161,11 @@ impl Encoding {
    ///
    /// NULL values are represented by None. It is the caller's responsibility
    /// to ensure row ids are a monotonically increasing set.
-    fn values<'a>(&'a self, row_ids: &[u32], dst: Vec<Option<&'a str>>) -> Vec<Option<&'a str>> {
+    pub fn values<'a>(
+        &'a self,
+        row_ids: &[u32],
+        dst: Vec<Option<&'a str>>,
+    ) -> Vec<Option<&'a str>> {
        match self {
            Encoding::RLE(enc) => enc.values(row_ids, dst),
            Encoding::Plain(enc) => enc.values(row_ids, dst),
--- a/segment_store/src/column/dictionary/plain.rs
+++ b/segment_store/src/column/dictionary/plain.rs
@@ -24,10 +24,12 @@ pub struct Plain {
    contains_null: bool,
 }

-// The default initialisation of an Plain involves reserving the first id/index 0
-// for the NULL value.
+// The default initialisation of a `Plain` involves reserving the first
+// id/index `0`, which is the encoded representation of the NULL value.
 impl Default for Plain {
    fn default() -> Self {
+        // `entries[0]` is reserved for the NULL value, so NULL_ID must be `0`.
+        assert_eq!(NULL_ID, 0);
        Self {
            entries: vec![None],
            encoded_data: vec![],
@@ -118,8 +120,8 @@ impl Plain {
        }
    }

-    // Preferred to add values to the column. `id` is the encoded
-    // representation of a logical string value.
+    // Preferred method to add values to the column. `id` is the encoded
+    // representation of a logical value.
    fn push_encoded_values(&mut self, id: u32, additional: u32) {
        self.encoded_data
            .extend(std::iter::repeat(id).take(additional as usize));
@@ -142,11 +144,6 @@ impl Plain {
        }
    }

-    // correct way to determine next encoded id for a new value.
-    fn next_encoded_id(&self) -> u32 {
-        todo!()
-    }
-
    /// The number of logical rows encoded in this column.
    pub fn num_rows(&self) -> u32 {
        self.encoded_data.len() as u32
@@ -449,7 +446,8 @@ impl Plain {
        dst.clear();
        dst.reserve(row_ids.len());

-        // TODO - not sure at all about this deref...
+        // The `as_deref` is needed to convert an `&Option<String>` into an
+        // `Option<&str>`.
        for chunks in row_ids.chunks_exact(4) {
            dst.push(self.entries[self.encoded_data[chunks[0] as usize] as usize].as_deref());
            dst.push(self.entries[self.encoded_data[chunks[1] as usize] as usize].as_deref());