fix: Encode and decode string data as bytes

String data isn't guaranteed to be UTF-8
pull/24376/head
Carol (Nichols || Goulding) 2020-06-19 15:10:09 -04:00
parent 672d3fe668
commit 1e341a7321
6 changed files with 36 additions and 23 deletions

View File

@ -597,7 +597,7 @@ impl TSMFileConverter {
})?;
// this will create a column of repeated None values.
let col: Vec<Option<String>> = vec![None; col_len];
let col: Vec<Option<Vec<u8>>> = vec![None; col_len];
packed_columns[*idx] = Packers::from(col);
}
@ -650,7 +650,7 @@ impl TSMFileConverter {
packed_columns[*idx] = Packers::from(col);
}
BlockType::Str => {
let col: Vec<Option<String>> = vec![None; col_len];
let col: Vec<Option<Vec<u8>>> = vec![None; col_len];
packed_columns[*idx] = Packers::from(col);
}
BlockType::Unsigned => {

View File

@ -152,13 +152,13 @@ impl std::convert::From<delorean_table_schema::DataType> for Packers {
}
}
impl std::convert::From<Vec<Option<String>>> for Packers {
fn from(values: Vec<Option<String>>) -> Self {
impl std::convert::From<Vec<Option<Vec<u8>>>> for Packers {
fn from(values: Vec<Option<Vec<u8>>>) -> Self {
// TODO(edd): convert this with an iterator?
let mut as_byte_array: Vec<Option<ByteArray>> = Vec::with_capacity(values.len());
for v in values {
match v {
Some(v) => as_byte_array.push(Some(ByteArray::from(v.as_str()))),
Some(v) => as_byte_array.push(Some(ByteArray::from(v))),
None => as_byte_array.push(None),
}
}

View File

@ -9,8 +9,9 @@ const HEADER_LEN: usize = 1;
/// Store `i32::MAX` as a `usize` for comparing with lengths in assertions
const MAX_I32: usize = i32::MAX as usize;
/// Encodes a slice of string slices into a vector of bytes. Currently uses Snappy compression.
pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn Error>> {
/// Encodes a slice of byte slices representing string data into a vector of bytes. Currently uses
/// Snappy compression.
pub fn encode(src: &[&[u8]], dst: &mut Vec<u8>) -> Result<(), Box<dyn Error>> {
dst.truncate(0); // reset buffer
if src.is_empty() {
return Ok(());
@ -21,7 +22,7 @@ pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn
let sum_of_lengths: usize = src
.iter()
.map(|s| {
let len = s.as_ref().len();
let len = s.len();
assert!(len < MAX_I32);
len
})
@ -47,11 +48,10 @@ pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn
let (compressed_data, data) = dst.split_at_mut(compressed_size);
let mut n = 0;
for s in src {
let s = s.as_ref();
let len = s.len();
let len_u64: u64 = len.try_into()?;
n += len_u64.encode_var(&mut data[n..]);
data[n..n + len].copy_from_slice(s.as_bytes());
data[n..n + len].copy_from_slice(s);
n += len;
}
let data = &data[..n];
@ -69,8 +69,9 @@ pub fn encode<T: AsRef<str>>(src: &[T], dst: &mut Vec<u8>) -> Result<(), Box<dyn
Ok(())
}
/// Decodes a slice of bytes representing Snappy-compressed data into a vector of `String`s.
pub fn decode(src: &[u8], dst: &mut Vec<String>) -> Result<(), Box<dyn Error>> {
/// Decodes a slice of bytes representing Snappy-compressed data into a vector of vectors of bytes
/// representing string data, which may or may not be valid UTF-8.
pub fn decode(src: &[u8], dst: &mut Vec<Vec<u8>>) -> Result<(), Box<dyn Error>> {
if src.is_empty() {
return Ok(());
}
@ -104,7 +105,7 @@ pub fn decode(src: &[u8], dst: &mut Vec<String>) -> Result<(), Box<dyn Error>> {
return Err("short buffer".into());
}
dst.push(std::str::from_utf8(&decoded_bytes[lower..upper])?.to_string());
dst.push(decoded_bytes[lower..upper].to_vec());
// The length of this string plus the length of the variable byte encoded length
i += length + num_bytes_read;
@ -119,7 +120,7 @@ mod tests {
#[test]
fn encode_no_values() {
let src: Vec<&str> = vec![];
let src: Vec<&[u8]> = vec![];
let mut dst = vec![];
// check for error
@ -131,7 +132,8 @@ mod tests {
#[test]
fn encode_single() {
let src = vec!["v1"];
let v1_bytes = b"v1";
let src = vec![&v1_bytes[..]];
let mut dst = vec![];
encode(&src, &mut dst).expect("failed to encode src");
@ -140,7 +142,8 @@ mod tests {
#[test]
fn encode_multi_compressed() {
let src: Vec<_> = (0..10).map(|i| format!("value {}", i)).collect();
let src_strings: Vec<_> = (0..10).map(|i| format!("value {}", i)).collect();
let src: Vec<_> = src_strings.iter().map(|s| s.as_bytes()).collect();
let mut dst = vec![];
encode(&src, &mut dst).expect("failed to encode src");
@ -172,7 +175,12 @@ mod tests {
let mut dst = vec![];
decode(&src, &mut dst).expect("failed to decode src");
assert_eq!(dst, vec!["v1"]);
let dst_as_strings: Vec<_> = dst
.iter()
.map(|s| std::str::from_utf8(s).unwrap())
.collect();
assert_eq!(dst_as_strings, vec!["v1"]);
}
#[test]
@ -186,7 +194,11 @@ mod tests {
decode(&src, &mut dst).expect("failed to decode src");
let dst_as_strings: Vec<_> = dst
.iter()
.map(|s| std::str::from_utf8(s).unwrap())
.collect();
let expected: Vec<_> = (0..10).map(|i| format!("value {}", i)).collect();
assert_eq!(dst, expected);
assert_eq!(dst_as_strings, expected);
}
}

View File

@ -129,7 +129,7 @@ pub enum BlockData {
Float { ts: Vec<i64>, values: Vec<f64> },
Integer { ts: Vec<i64>, values: Vec<i64> },
Bool { ts: Vec<i64>, values: Vec<bool> },
Str { ts: Vec<i64>, values: Vec<String> },
Str { ts: Vec<i64>, values: Vec<Vec<u8>> },
Unsigned { ts: Vec<i64>, values: Vec<u64> },
}

View File

@ -199,7 +199,7 @@ pub enum ColumnData {
Float(Vec<Option<f64>>),
Integer(Vec<Option<i64>>),
Bool(Vec<Option<bool>>),
Str(Vec<Option<String>>),
Str(Vec<Option<Vec<u8>>>),
Unsigned(Vec<Option<u64>>),
}
@ -209,7 +209,7 @@ enum ValuePair {
F64((i64, f64)),
I64((i64, i64)),
Bool((i64, bool)),
Str((i64, String)),
Str((i64, Vec<u8>)),
U64((i64, u64)),
}

View File

@ -240,7 +240,8 @@ impl Encoder for Vec<u64> {
impl Encoder for Vec<&str> {
fn encode(&self, dst: &mut Vec<u8>) -> Result<(), StorageError> {
string::encode(&self, dst).map_err(|e| StorageError {
let bytes: Vec<_> = self.iter().map(|s| s.as_bytes()).collect();
string::encode(&bytes, dst).map_err(|e| StorageError {
description: e.to_string(),
})
}