influxdb/tsdb/tsm1/int.go

325 lines
7.9 KiB
Go

package tsm1
// Integer encoding uses two different strategies depending on the range of values in
// the uncompressed data. Values are first delta encoded against the previous value and
// the resulting deltas are then encoded using zig zag encoding.
// This interleaves positive and negative integers across a range of positive integers.
//
// For example, [-2,-1,0,1] becomes [3,1,0,2]. See
// https://developers.google.com/protocol-buffers/docs/encoding?hl=en#signed-integers
// for more information.
//
// If all the zig zag encoded values are less than 1 << 60 - 1, they are compressed using
// simple8b encoding. If any value is larger than 1 << 60 - 1, the values are stored uncompressed.
//
// Each encoded byte slice contains a 1 byte header followed by multiple 8 byte packed integers
// or 8 byte uncompressed integers. The 4 high bits of the first byte indicate the encoding type
// for the remaining bytes.
//
// There are currently three encoding types (uncompressed, simple8b and run-length encoded) that
// can be used with room for 16 total. The additional encoding slots are reserved for future use.
// One improvement to be made is to use a patched encoding such as PFOR if only a small number of
// values exceed the max compressed value range. This should improve compression ratios with very
// large integers near the ends of the int64 range.
import (
"encoding/binary"
"fmt"
"github.com/jwilder/encoding/simple8b"
)
// Encoding type identifiers. The encoders store the identifier in the 4 high
// bits of the first byte of every encoded block, and the decoder reads it back
// from there to pick the matching decode routine. These values are part of the
// encoded format and must not be renumbered.
const (
// intUncompressed is an uncompressed format using 8 bytes per point
intUncompressed = 0
// intCompressedSimple is a bit-packed format using simple8b encoding; the
// first value is written raw in 8 bytes ahead of the packed words
intCompressedSimple = 1
// intCompressedRLE is a run-length encoding format: a starting value, a
// single delta and the number of times that delta repeats
intCompressedRLE = 2
)
// IntegerEncoder encodes int64s into byte slices. Values are buffered by Write
// as zig zag encoded deltas and turned into a single encoded block by Bytes.
type IntegerEncoder struct {
// prev is the last value passed to Write; the next delta is computed against it.
prev int64
// rle remains true while every delta written after the first one equals the
// delta before it, i.e. while the block can be run-length encoded.
rle bool
// values holds the zig zag encoded deltas in write order. values[0] is the
// delta of the first written value against the initial prev of 0.
values []uint64
}
// NewIntegerEncoder returns an IntegerEncoder whose value buffer starts out
// empty with room for sz values. The encoder begins with run-length encoding
// marked as possible; Write clears that once it sees two differing deltas.
func NewIntegerEncoder(sz int) IntegerEncoder {
	var enc IntegerEncoder
	enc.rle = true
	enc.values = make([]uint64, 0, sz)
	return enc
}
// Flush is a no-op: it neither reads nor modifies the encoder.
// NOTE(review): presumably kept so IntegerEncoder matches the method set of
// the other value encoders — confirm against callers.
func (e *IntegerEncoder) Flush() {}
// Reset returns the encoder to the state produced by NewIntegerEncoder while
// keeping the already allocated value buffer so it can be reused.
func (e *IntegerEncoder) Reset() {
	// Truncate rather than reallocate: capacity is retained for the next block.
	e.values = e.values[:0]
	e.prev, e.rle = 0, true
}
// Write appends v to the block being built. The value is stored as the zig zag
// encoding of its delta against the previously written value (or against 0 for
// the first value).
func (e *IntegerEncoder) Write(v int64) {
	// Delta first, zig zag second: the delta may be negative and zig zag maps
	// it onto an unsigned value.
	enc := ZigZagEncode(v - e.prev)
	e.prev = v

	// values[0] is the starting value rather than part of the run, so deltas
	// are only compared once at least two entries are buffered. After a
	// mismatch rle stays false for the rest of the block.
	if n := len(e.values); n > 1 && e.rle {
		e.rle = e.values[n-1] == enc
	}

	e.values = append(e.values, enc)
}
// Bytes encodes the buffered values into a single block and returns it. The
// format is chosen as follows:
//   - run-length encoding when more than two values are buffered and all
//     deltas after the first value are identical;
//   - otherwise the uncompressed format if any buffered value exceeds
//     simple8b.MaxValue;
//   - otherwise the simple8b packed format.
func (e *IntegerEncoder) Bytes() ([]byte, error) {
	// With two or fewer values run-length encoding cannot reduce storage size.
	if len(e.values) > 2 && e.rle {
		return e.encodeRLE()
	}

	for i := range e.values {
		// A single value that does not fit the packed format forces the
		// whole block to be written uncompressed.
		if e.values[i] > simple8b.MaxValue {
			return e.encodeUncompressed()
		}
	}

	return e.encodePacked()
}
// encodeRLE writes the block in run-length form: the header byte, the first
// value as 8 big-endian bytes, the repeated delta as a uvarint and the number
// of repetitions as a uvarint. It expects at least two buffered values; Bytes
// only calls it when more than two are present.
func (e *IntegerEncoder) encodeRLE() ([]byte, error) {
	// 1 header byte + 8 bytes for the first value + two uvarints of at most
	// 10 bytes each fit comfortably in 31 bytes.
	buf := make([]byte, 31)

	// 4 high bits carry the encoding type.
	buf[0] = byte(intCompressedRLE) << 4

	// Starting value, fixed width.
	binary.BigEndian.PutUint64(buf[1:9], e.values[0])
	pos := 9

	// The delta shared by every subsequent value.
	pos += binary.PutUvarint(buf[pos:], e.values[1])

	// How many times that delta is applied.
	pos += binary.PutUvarint(buf[pos:], uint64(len(e.values)-1))

	return buf[:pos], nil
}
// encodePacked writes the block in simple8b form: the header byte, the first
// value as 8 raw big-endian bytes, then every remaining value packed by
// simple8b into 8-byte big-endian words. It returns nil, nil when no values
// are buffered and passes through any error reported by simple8b.EncodeAll.
func (e *IntegerEncoder) encodePacked() ([]byte, error) {
	if len(e.values) == 0 {
		return nil, nil
	}

	// Everything except the first value is packed; the first value is
	// written unencoded using 8 bytes.
	packed, err := simple8b.EncodeAll(e.values[1:])
	if err != nil {
		return nil, err
	}

	// Header byte, raw first value, then one 8-byte word per packed entry.
	out := make([]byte, 1+8+8*len(packed))

	// 4 high bits of the first byte store the encoding type for the block.
	out[0] = byte(intCompressedSimple) << 4

	binary.BigEndian.PutUint64(out[1:9], e.values[0])

	rest := out[9:]
	for _, word := range packed {
		binary.BigEndian.PutUint64(rest[:8], word)
		rest = rest[8:]
	}
	return out, nil
}
// encodeUncompressed writes the block without compression: the header byte
// followed by every buffered value as 8 big-endian bytes. It returns nil, nil
// when no values are buffered.
func (e *IntegerEncoder) encodeUncompressed() ([]byte, error) {
	count := len(e.values)
	if count == 0 {
		return nil, nil
	}

	out := make([]byte, 1+8*count)

	// 4 high bits of the first byte store the encoding type for the block.
	out[0] = byte(intUncompressed) << 4

	dst := out[1:]
	for _, v := range e.values {
		binary.BigEndian.PutUint64(dst, v)
		dst = dst[8:]
	}
	return out, nil
}
// IntegerDecoder decodes a byte slice into int64s. After SetBytes, call Next
// to advance to a value and Read to obtain it; Error reports any failure.
type IntegerDecoder struct {
// values is the scratch buffer holding the zig zag encoded deltas of the
// batch currently being read (not used for run-length encoded blocks).
// 240 is the maximum number of values that can be encoded into a single uint64 using simple8b
values [240]uint64
// bytes is the part of the block payload that has not been decoded yet; the
// header byte is already stripped by SetBytes.
bytes []byte
// i is the index of the current value within the current batch (or within
// the run for run-length encoded blocks).
i int
// n is the number of values available in the current batch.
n int
// prev is the last value returned by Read for non run-length blocks; the
// next delta is added to it.
prev int64
// first is true until decodePacked has consumed the leading raw value of a
// packed block.
first bool
// The first value for a run-length encoded byte slice
rleFirst uint64
// The delta value for a run-length encoded byte slice
rleDelta uint64
// encoding is the encoding type taken from the 4 high bits of the header byte.
encoding byte
// err is the error recorded while decoding; it is returned by Error.
err error
}
// SetBytes points the decoder at a new encoded block and clears all decoding
// state. The encoding type is taken from the 4 high bits of b[0] and the rest
// of b becomes the payload; an empty b selects encoding 0 with no payload.
// The values scratch buffer is left as is, since it is overwritten on decode.
func (d *IntegerDecoder) SetBytes(b []byte) {
	d.encoding, d.bytes = 0, nil
	if len(b) > 0 {
		d.encoding, d.bytes = b[0]>>4, b[1:]
	}

	d.i, d.n = 0, 0
	d.prev = 0
	d.first = true
	d.rleFirst, d.rleDelta = 0, 0
	d.err = nil
}
// Next advances the decoder to the next value and reports whether one is
// available for Read. It returns false when the block is exhausted or when an
// error has been recorded; use Error to tell the two apart.
func (d *IntegerDecoder) Next() bool {
	// Nothing left in the current batch and no payload left to decode.
	if len(d.bytes) == 0 && d.i >= d.n {
		return false
	}

	d.i++

	// Still inside the batch that was decoded earlier.
	if d.i < d.n {
		return d.err == nil
	}

	// The current batch is used up: decode the next one according to the
	// block's encoding type. The decode routines reset d.i and set d.n.
	switch d.encoding {
	case intCompressedRLE:
		d.decodeRLE()
	case intCompressedSimple:
		d.decodePacked()
	case intUncompressed:
		d.decodeUncompressed()
	default:
		d.err = fmt.Errorf("unknown encoding %v", d.encoding)
	}

	return d.err == nil && d.i < d.n
}
// Error returns the last error encountered by the decoder, or nil if none has
// been recorded since the most recent call to SetBytes. Next reports false
// whenever an error is set.
func (d *IntegerDecoder) Error() error {
return d.err
}
// Read returns the value the decoder is currently positioned at. It is meant
// to be called once after each successful Next: for non run-length blocks the
// call adds the current delta to the running value, so calling it again
// without Next applies the same delta a second time.
func (d *IntegerDecoder) Read() int64 {
	if d.encoding == intCompressedRLE {
		// Run-length blocks are computed on demand: start + position * delta.
		return ZigZagDecode(d.rleFirst) + int64(d.i)*ZigZagDecode(d.rleDelta)
	}

	// The stored entry is a delta; adding it to the prior value restores the
	// original, which then becomes the base for the next delta.
	d.prev += ZigZagDecode(d.values[d.i])
	return d.prev
}
// decodeRLE parses a run-length encoded payload: 8 big-endian bytes holding
// the starting value, a uvarint delta and a uvarint repeat count. On success
// the whole payload is consumed and the run is described by rleFirst,
// rleDelta and n; on malformed input err is set and the state is left alone.
// An empty payload is a no-op.
func (d *IntegerDecoder) decodeRLE() {
	if len(d.bytes) == 0 {
		return
	}

	if len(d.bytes) < 8 {
		d.err = fmt.Errorf("integerDecoder: not enough data to decode RLE starting value")
		return
	}

	// First 8 bytes: the starting value.
	start := binary.BigEndian.Uint64(d.bytes[:8])
	rest := d.bytes[8:]

	// Next 1-10 bytes: the delta applied at every step.
	delta, width := binary.Uvarint(rest)
	if width <= 0 {
		d.err = fmt.Errorf("integerDecoder: invalid RLE delta value")
		return
	}
	rest = rest[width:]

	// Last 1-10 bytes: how many times the delta repeats.
	repeat, width := binary.Uvarint(rest)
	if width <= 0 {
		d.err = fmt.Errorf("integerDecoder: invalid RLE repeat value")
		return
	}

	// Only the start and delta are kept; Read derives the value at position
	// d.i from them, so no large values slice is needed.
	d.rleFirst = start
	d.rleDelta = delta

	// The starting value itself counts as one entry on top of the repeats.
	d.n = int(repeat) + 1
	d.i = 0

	// The entire payload has been processed.
	d.bytes = nil
}
// decodePacked consumes the next 8-byte word of a simple8b block and makes its
// contents the current batch. The very first word of a block is the raw
// starting value; every later word is unpacked with simple8b.Decode. An empty
// payload is a no-op and fewer than 8 remaining bytes is reported through err.
func (d *IntegerDecoder) decodePacked() {
	if len(d.bytes) == 0 {
		return
	}

	if len(d.bytes) < 8 {
		d.err = fmt.Errorf("integerDecoder: not enough data to decode packed value")
		return
	}

	// Take one word off the front of the payload and restart the batch index.
	word := binary.BigEndian.Uint64(d.bytes[:8])
	d.bytes = d.bytes[8:]
	d.i = 0

	// The first value is always unencoded.
	if d.first {
		d.first = false
		d.values[0] = word
		d.n = 1
		return
	}

	count, err := simple8b.Decode(&d.values, word)
	if err != nil {
		// Should never happen, only error that could be returned is if the value to be decoded was not
		// actually encoded by simple8b encoder.
		d.err = fmt.Errorf("failed to decode value %v: %v", word, err)
	}
	d.n = count
}
// decodeUncompressed consumes the next 8 big-endian bytes of an uncompressed
// block and makes that single value the current batch. An empty payload is a
// no-op and fewer than 8 remaining bytes is reported through err.
func (d *IntegerDecoder) decodeUncompressed() {
	switch remaining := len(d.bytes); {
	case remaining == 0:
		return
	case remaining < 8:
		d.err = fmt.Errorf("integerDecoder: not enough data to decode uncompressed value")
		return
	}

	// One value per batch: store it in slot 0 and drop its bytes.
	d.values[0], d.bytes = binary.BigEndian.Uint64(d.bytes[:8]), d.bytes[8:]
	d.i, d.n = 0, 1
}