fix: Encode and decode string data as bytes

String data isn't guaranteed to be valid UTF-8, so encode, decode, and store it as raw bytes (`Vec<u8>`) instead of `String`.
2020-06-19 15:10:09 -04:00 · 2020-06-19 15:10:09 -04:00 · 1e341a7321
parent 672d3fe668
commit 1e341a7321
6 changed files with 36 additions and 23 deletions
--- a/delorean_ingest/src/lib.rs
+++ b/delorean_ingest/src/lib.rs
@@ -597,7 +597,7 @@ impl TSMFileConverter {
                            })?;

                            // this will create a column of repeated None values.
-                            let col: Vec<Option<String>> = vec![None; col_len];
+                            let col: Vec<Option<Vec<u8>>> = vec![None; col_len];
                            packed_columns[*idx] = Packers::from(col);
                        }

@@ -650,7 +650,7 @@ impl TSMFileConverter {
                                    packed_columns[*idx] = Packers::from(col);
                                }
                                BlockType::Str => {
-                                    let col: Vec<Option<String>> = vec![None; col_len];
+                                    let col: Vec<Option<Vec<u8>>> = vec![None; col_len];
                                    packed_columns[*idx] = Packers::from(col);
                                }
                                BlockType::Unsigned => {
--- a/delorean_table/src/packers.rs
+++ b/delorean_table/src/packers.rs
@@ -152,13 +152,13 @@ impl std::convert::From<delorean_table_schema::DataType> for Packers {
    }
 }

-impl std::convert::From<Vec<Option<String>>> for Packers {
-    fn from(values: Vec<Option<String>>) -> Self {
+impl std::convert::From<Vec<Option<Vec<u8>>>> for Packers {
+    fn from(values: Vec<Option<Vec<u8>>>) -> Self {
        // TODO(edd): convert this with an iterator?
        let mut as_byte_array: Vec<Option<ByteArray>> = Vec::with_capacity(values.len());
        for v in values {
            match v {
-                Some(v) => as_byte_array.push(Some(ByteArray::from(v.as_str()))),
+                Some(v) => as_byte_array.push(Some(ByteArray::from(v))),
                None => as_byte_array.push(None),
            }
        }
--- a/delorean_tsm/src/encoders/string.rs
+++ b/delorean_tsm/src/encoders/string.rs
@@ -9,8 +9,9 @@ const HEADER_LEN: usize = 1;
 /// Store `i32::MAX` as a `usize` for comparing with lengths in assertions
 const MAX_I32: usize = i32::MAX as usize;

-/// Encodes a slice of string slices into a vector of bytes. Currently uses Snappy compression.
-pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn Error>> {
+/// Encodes a slice of byte slices representing string data into a vector of bytes. Currently uses
+/// Snappy compression.
+pub fn encode(src: &[&[u8]], dst: &mut Vec<u8>) -> Result<(), Box<dyn Error>> {
    dst.truncate(0); // reset buffer
    if src.is_empty() {
        return Ok(());
@@ -21,7 +22,7 @@ pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn
    let sum_of_lengths: usize = src
        .iter()
        .map(|s| {
-            let len = s.as_ref().len();
+            let len = s.len();
            assert!(len < MAX_I32);
            len
        })
@@ -47,11 +48,10 @@ pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn
    let (compressed_data, data) = dst.split_at_mut(compressed_size);
    let mut n = 0;
    for s in src {
-        let s = s.as_ref();
        let len = s.len();
        let len_u64: u64 = len.try_into()?;
        n += len_u64.encode_var(&mut data[n..]);
-        data[n..n + len].copy_from_slice(s.as_bytes());
+        data[n..n + len].copy_from_slice(s);
        n += len;
    }
    let data = &data[..n];
@@ -69,8 +69,9 @@ pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn
    Ok(())
 }

-/// Decodes a slice of bytes representing Snappy-compressed data into a vector of `String`s.
-pub fn decode(src: &[u8], dst: &mut Vec<String>) -> Result<(), Box<dyn Error>> {
+/// Decodes a slice of bytes representing Snappy-compressed data into a vector of vectors of bytes
+/// representing string data, which may or may not be valid UTF-8.
+pub fn decode(src: &[u8], dst: &mut Vec<Vec<u8>>) -> Result<(), Box<dyn Error>> {
    if src.is_empty() {
        return Ok(());
    }
@@ -104,7 +105,7 @@ pub fn decode(src: &[u8], dst: &mut Vec<String>) -> Result<(), Box<dyn Error>> {
            return Err("short buffer".into());
        }

-        dst.push(std::str::from_utf8(&decoded_bytes[lower..upper])?.to_string());
+        dst.push(decoded_bytes[lower..upper].to_vec());

        // The length of this string plus the length of the variable byte encoded length
        i += length + num_bytes_read;
@@ -119,7 +120,7 @@ mod tests {

    #[test]
    fn encode_no_values() {
-        let src: Vec<&str> = vec![];
+        let src: Vec<&[u8]> = vec![];
        let mut dst = vec![];

        // check for error
@@ -131,7 +132,8 @@ mod tests {

    #[test]
    fn encode_single() {
-        let src = vec!["v1"];
+        let v1_bytes = b"v1";
+        let src = vec![&v1_bytes[..]];
        let mut dst = vec![];

        encode(&src, &mut dst).expect("failed to encode src");
@@ -140,7 +142,8 @@ mod tests {

    #[test]
    fn encode_multi_compressed() {
-        let src: Vec<_> = (0..10).map(|i| format!("value {}", i)).collect();
+        let src_strings: Vec<_> = (0..10).map(|i| format!("value {}", i)).collect();
+        let src: Vec<_> = src_strings.iter().map(|s| s.as_bytes()).collect();
        let mut dst = vec![];

        encode(&src, &mut dst).expect("failed to encode src");
@@ -172,7 +175,12 @@ mod tests {
        let mut dst = vec![];

        decode(&src, &mut dst).expect("failed to decode src");
-        assert_eq!(dst, vec!["v1"]);
+
+        let dst_as_strings: Vec<_> = dst
+            .iter()
+            .map(|s| std::str::from_utf8(s).unwrap())
+            .collect();
+        assert_eq!(dst_as_strings, vec!["v1"]);
    }

    #[test]
@@ -186,7 +194,11 @@ mod tests {

        decode(&src, &mut dst).expect("failed to decode src");

+        let dst_as_strings: Vec<_> = dst
+            .iter()
+            .map(|s| std::str::from_utf8(s).unwrap())
+            .collect();
        let expected: Vec<_> = (0..10).map(|i| format!("value {}", i)).collect();
-        assert_eq!(dst, expected);
+        assert_eq!(dst_as_strings, expected);
    }
 }
--- a/delorean_tsm/src/lib.rs
+++ b/delorean_tsm/src/lib.rs
@@ -129,7 +129,7 @@ pub enum BlockData {
    Float { ts: Vec<i64>, values: Vec<f64> },
    Integer { ts: Vec<i64>, values: Vec<i64> },
    Bool { ts: Vec<i64>, values: Vec<bool> },
-    Str { ts: Vec<i64>, values: Vec<String> },
+    Str { ts: Vec<i64>, values: Vec<Vec<u8>> },
    Unsigned { ts: Vec<i64>, values: Vec<u64> },
 }

--- a/delorean_tsm/src/mapper.rs
+++ b/delorean_tsm/src/mapper.rs
@@ -15,7 +15,7 @@ use std::iter::Peekable;
 /// The main purpose of the `TSMMeasurementMapper` is to provide a
 /// transformation step that allows one to convert per-series/per-field data
 /// into measurement-oriented table data.
-///  
+///
 #[derive(Debug)]
 pub struct TSMMeasurementMapper<R>
 where
@@ -199,7 +199,7 @@ pub enum ColumnData {
    Float(Vec<Option<f64>>),
    Integer(Vec<Option<i64>>),
    Bool(Vec<Option<bool>>),
-    Str(Vec<Option<String>>),
+    Str(Vec<Option<Vec<u8>>>),
    Unsigned(Vec<Option<u64>>),
 }

@@ -209,7 +209,7 @@ enum ValuePair {
    F64((i64, f64)),
    I64((i64, i64)),
    Bool((i64, bool)),
-    Str((i64, String)),
+    Str((i64, Vec<u8>)),
    U64((i64, u64)),
 }

--- a/src/storage/block.rs
+++ b/src/storage/block.rs
@@ -240,7 +240,8 @@ impl Encoder for Vec<u64> {

 impl Encoder for Vec<&str> {
    fn encode(&self, dst: &mut Vec<u8>) -> Result<(), StorageError> {
-        string::encode(&self, dst).map_err(|e| StorageError {
+        let bytes: Vec<_> = self.iter().map(|s| s.as_bytes()).collect();
+        string::encode(&bytes, dst).map_err(|e| StorageError {
            description: e.to_string(),
        })
    }