package tsm1

import (
	"encoding/binary"
	"errors"
	"fmt"
	"unsafe"

	"github.com/golang/snappy"
)

var (
	errStringBatchDecodeInvalidStringLength = errors.New("stringArrayDecodeAll: invalid encoded string length")
	errStringBatchDecodeLengthOverflow      = errors.New("stringArrayDecodeAll: length overflow")
	errStringBatchDecodeShortBuffer         = errors.New("stringArrayDecodeAll: short buffer")

	// ErrStringArrayEncodeTooLarge reports that the encoded length of a slice of strings is too large.
	ErrStringArrayEncodeTooLarge = errors.New("StringArrayEncodeAll: source length too large")
)

// StringArrayEncodeAll encodes src into b, returning b and any error encountered.
// The returned slice may be of a different length and capacity to b.
//
// The output is a one-byte header (stringCompressedSnappy in the high nibble)
// followed by a snappy block. The uncompressed payload is, for every string,
// its length as a uvarint followed by its bytes.
//
// If snappy reports that the source is too large to encode, b[:0] and
// ErrStringArrayEncodeTooLarge are returned.
//
// Currently the only string compression scheme used is snappy.
func StringArrayEncodeAll(src []string, b []byte) ([]byte, error) {
	// Worst-case size of the uncompressed payload: a 32-bit varint length
	// prefix per string (strings shouldn't be longer than 64kb) plus the string
	// bytes. The extra 2 bytes guarantee room for the header and the empty
	// snappy block returned when src is empty.
	srcSz := 2 + len(src)*binary.MaxVarintLen32
	for i := range src {
		srcSz += len(src[i])
	}

	// Determine the maximum possible length needed for the buffer, which
	// includes the compressed size.
	compressedSz := 0
	if len(src) > 0 {
		mle := snappy.MaxEncodedLen(srcSz)
		if mle == -1 {
			return b[:0], ErrStringArrayEncodeTooLarge
		}
		compressedSz = mle + 1 /* header */
	}

	totSz := srcSz + compressedSz
	if cap(b) < totSz {
		b = make([]byte, totSz)
	} else {
		b = b[:totSz]
	}

	// Shortcut to snappy encoding nothing.
	if len(src) == 0 {
		b[0] = stringCompressedSnappy << 4
		// The snappy encoding of empty input is a single zero byte (the
		// uvarint decoded length). Write it explicitly: when b is a reused
		// buffer this byte may hold stale data, which would yield a block
		// that does not decode to an empty array.
		b[1] = 0
		return b[:2], nil
	}

	// Write the data to be compressed *after* the space needed for snappy
	// compression. The compressed data is at the start of the allocated buffer,
	// ensuring the entire capacity is returned and available for subsequent use.
	dta := b[compressedSz:]
	n := 0
	for i := range src {
		n += binary.PutUvarint(dta[n:], uint64(len(src[i])))
		n += copy(dta[n:], src[i])
	}
	dta = dta[:n]

	dst := b[:compressedSz]
	dst[0] = stringCompressedSnappy << 4
	res := snappy.Encode(dst[1:], dta)
	return dst[:len(res)+1], nil
}

// StringArrayDecodeAll decodes the block b, as produced by
// StringArrayEncodeAll, into dst and returns the decoded strings and any
// error encountered.
//
// dst is used as scratch space up to its full capacity and grown as required;
// if it has no capacity, a slice of 64 elements is allocated. An empty b
// yields an empty, non-nil slice and a nil error. On any error an empty,
// non-nil slice is returned together with the error.
//
// The returned strings are not copies: they alias the buffer produced by
// snappy decompression.
func StringArrayDecodeAll(b []byte, dst []string) ([]string, error) {
	if len(b) == 0 {
		return []string{}, nil
	}

	// First byte stores the encoding type, only have snappy format
	// currently so ignore for now.
	//
	// It is important to note that `snappy.Decode` is given a nil destination
	// and therefore returns a newly allocated slice, as the final strings
	// reference this slice directly.
	buf, err := snappy.Decode(nil, b[1:])
	if err != nil {
		return []string{}, fmt.Errorf("failed to decode string block: %w", err)
	}

	sz := cap(dst)
	if sz == 0 {
		sz = 64
		dst = make([]string, sz)
	} else {
		dst = dst[:sz]
	}

	i, j := 0, 0
	for i < len(buf) {
		length, n := binary.Uvarint(buf[i:])
		if n <= 0 {
			return []string{}, errStringBatchDecodeInvalidStringLength
		}

		// The string occupies buf[lower:upper], directly after its
		// variable byte encoded length.
		lower := i + n
		upper := lower + int(length)
		if upper < lower {
			// int(length) was negative or the addition wrapped around.
			return []string{}, errStringBatchDecodeLengthOverflow
		}
		if upper > len(buf) {
			return []string{}, errStringBatchDecodeShortBuffer
		}

		// NOTE: this optimization is critical for performance and to reduce
		// allocations. This is just as "safe" as strings.Builder, which
		// returns a string mapped to the original byte slice.
		s := buf[lower:upper]
		val := *(*string)(unsafe.Pointer(&s))
		if j < len(dst) {
			dst[j] = val
		} else {
			dst = append(dst, val) // force a resize
			dst = dst[:cap(dst)]
		}

		i = upper
		j++
	}

	return dst[:j], nil
}