influxdb/tsdb/engine/tsm1/int.go

package tsm1

// Int64 encoding uses two different strategies depending on the range of values in
// the uncompressed data.  Encoded values are first encoding used zig zag encoding.
// This interleaves postiive and negative integers across a range of positive integers.
//
// For example, [-2,-1,0,1] becomes [3,1,0,2]. See
// https://developers.google.com/protocol-buffers/docs/encoding?hl=en#signed-integers
// for more information.
//
// If all the zig zag encoded values less than 1 << 60 - 1, they are compressed using
// simple8b encoding.  If any values is larger than 1 << 60 - 1, the values are stored uncompressed.
//
// Each encoded byte slice, contains a 1 byte header followed by multiple 8 byte packed integers
// or 8 byte uncompressed integers.  The 4 high bits of the first byte indicate the encoding type
// for the remaining bytes.
//
// There are currently two encoding types that can be used with room for 16 total.  These additional
// encoding slots are reserved for future use.  One improvement to be made is to use a patched
// encoding such as PFOR if only a small number of values exceed the max compressed value range.  This
// should improve compression ratios with very large integers near the ends of the int64 range.

import (
	"encoding/binary"
	"fmt"

	"github.com/jwilder/encoding/simple8b"
)

const (
	// intUncompressed is an uncompressed format using 8 bytes per point
	intUncompressed = 0
	// intCompressedSimple is a bit-packed format using simple8b encoding
	intCompressedSimple = 1
)

// Int64Encoder encoders int64 into byte slices
type Int64Encoder interface {
	Write(v int64)
	Bytes() ([]byte, error)
}

// Int64Decoder decodes a byte slice into int64s
type Int64Decoder interface {
	Next() bool
	Read() int64
	Error() error
}

type int64Encoder struct {
	values []uint64
}

func NewInt64Encoder() Int64Encoder {
	return &int64Encoder{}
}

func (e *int64Encoder) Write(v int64) {
	e.values = append(e.values, ZigZagEncode(v))
}

func (e *int64Encoder) Bytes() ([]byte, error) {
	for _, v := range e.values {
		// Value is too large to encode using packed format
		if v > simple8b.MaxValue {
			return e.encodeUncompressed()
		}
	}

	return e.encodePacked()
}

func (e *int64Encoder) encodePacked() ([]byte, error) {
	encoded, err := simple8b.EncodeAll(e.values)
	if err != nil {
		return nil, err
	}

	b := make([]byte, 1+len(encoded)*8)
	// 4 high bits of first byte store the encoding type for the block
	b[0] = byte(intCompressedSimple) << 4

	for i, v := range encoded {
		binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], v)
	}
	return b, nil
}

func (e *int64Encoder) encodeUncompressed() ([]byte, error) {
	b := make([]byte, 1+len(e.values)*8)
	// 4 high bits of first byte store the encoding type for the block
	b[0] = byte(intUncompressed) << 4

	for i, v := range e.values {
		binary.BigEndian.PutUint64(b[1+i*8:1+i*8+8], v)
	}
	return b, nil
}

type int64Decoder struct {
	values []uint64
	bytes  []byte
	i      int
	n      int

	encoding byte
	err      error
}

func NewInt64Decoder(b []byte) Int64Decoder {
	d := &int64Decoder{
		// 240 is the maximum number of values that can be encoded into a single uint64 using simple8b
		values: make([]uint64, 240),
	}

	d.SetBytes(b)
	return d
}

func (d *int64Decoder) SetBytes(b []byte) {
	if len(b) > 0 {
		d.encoding = b[0] >> 4
		d.bytes = b[1:]
	}
	d.i = 0
	d.n = 0
}

func (d *int64Decoder) Next() bool {
	if d.i >= d.n && len(d.bytes) == 0 {
		return false
	}

	d.i += 1

	if d.i >= d.n {
		switch d.encoding {
		case intUncompressed:
			d.decodeUncompressed()
		case intCompressedSimple:
			d.decodePacked()
		default:
			d.err = fmt.Errorf("unknown encoding %v", d.encoding)
		}
	}
	return d.i < d.n
}

func (d *int64Decoder) Error() error {
	return d.err
}

func (d *int64Decoder) Read() int64 {
	return ZigZagDecode(d.values[d.i])
}

func (d *int64Decoder) decodePacked() {
	if len(d.bytes) == 0 {
		return
	}

	v := binary.BigEndian.Uint64(d.bytes[0:8])
	n, err := simple8b.Decode(d.values, v)
	if err != nil {
		// Should never happen, only error that could be returned is if the the value to be decoded was not
		// actually encoded by simple8b encoder.
		d.err = fmt.Errorf("failed to decode value %v: %v", v, err)
	}

	d.n = n
	d.i = 0
	d.bytes = d.bytes[8:]
}

func (d *int64Decoder) decodeUncompressed() {
	d.values[0] = binary.BigEndian.Uint64(d.bytes[0:8])
	d.i = 0
	d.n = 1
	d.bytes = d.bytes[8:]
}