// influxdb/pkg/hll/hll_test.go

package hll

import (
	crand "crypto/rand"
	"encoding/binary"
	"fmt"
	"math"
	"math/rand"
	"reflect"
	"testing"
	"unsafe"

	"github.com/davecgh/go-spew/spew"
)

// nopHash is a pass-through "hash" for tests: instead of hashing, it decodes
// buf as a big-endian uint64, which lets a test dictate the exact hash value.
// It panics when buf is not exactly 8 bytes long.
func nopHash(buf []byte) uint64 {
	if size := len(buf); size != 8 {
		panic(fmt.Sprintf("unexpected size buffer: %d", size))
	}
	return binary.BigEndian.Uint64(buf)
}

// toByte returns the 8-byte big-endian encoding of v in a freshly allocated
// slice.
func toByte(v uint64) []byte {
	encoded := make([]byte, 8)
	binary.BigEndian.PutUint64(encoded, v)
	return encoded
}

// TestPlus_Bytes checks that Bytes reports the size of the Plus struct plus
// the capacity of whichever list is in use: the dense list after toNormal, or
// the sparse list (including its own struct overhead) otherwise.
func TestPlus_Bytes(t *testing.T) {
	type bytesCase struct {
		p      uint8
		normal bool
	}
	cases := []bytesCase{
		{p: 4, normal: false},
		{p: 5, normal: false},
		{p: 4, normal: true},
		{p: 5, normal: true},
	}

	for idx, tc := range cases {
		t.Run(fmt.Sprint(idx), func(t *testing.T) {
			h := NewTestPlus(tc.p)

			structSize := int(unsafe.Sizeof(*h))
			sparseHeaderSize := int(unsafe.Sizeof(*h.sparseList))

			// Either list is expected to have capacity for 2^p one-byte
			// elements.
			registers := int(math.Pow(2, float64(tc.p)))

			want := structSize
			if tc.normal {
				h.toNormal()
				if registers != cap(h.denseList) {
					t.Errorf("denseList capacity: want %d got %d", registers, cap(h.denseList))
				}
				want += registers
			} else {
				if registers != cap(h.sparseList.b) {
					t.Errorf("sparseList capacity: want %d got %d", registers, cap(h.sparseList.b))
				}
				want += registers + sparseHeaderSize
			}

			if want != h.Bytes() {
				t.Errorf("Bytes(): want %d got %d", want, h.Bytes())
			}
		})
	}
}

// TestPlus_Add_NoSparse adds hand-picked hash values to a precision-16 sketch
// that has been switched to the dense representation, and checks the value
// held in the targeted denseList register after every Add.
func TestPlus_Add_NoSparse(t *testing.T) {
	h := NewTestPlus(16)
	h.toNormal()

	// Steps run in order. Each hash's leading 16 bits equal the register
	// index being checked; some registers are hit more than once, so the
	// expected value depends on the earlier steps.
	steps := []struct {
		hash     uint64
		register int
		want     uint8
	}{
		{0x00010fffffffffff, 1, 5},
		{0x0002ffffffffffff, 2, 1},
		{0x0003000000000000, 3, 49},
		{0x0003000000000001, 3, 49},
		{0xff03700000000000, 0xff03, 2},
		{0xff03080000000000, 0xff03, 5},
	}

	for _, step := range steps {
		h.Add(toByte(step.hash))
		if got := h.denseList[step.register]; got != step.want {
			t.Error(got)
		}
	}
}

// TestPlusPrecision_NoSparse repeats the dense-register check with precision
// 4: after each Add, the denseList register named by the hash's leading four
// bits must hold the expected value.
func TestPlusPrecision_NoSparse(t *testing.T) {
	h := NewTestPlus(4)
	h.toNormal()

	steps := []struct {
		hash     uint64
		register int
		want     uint8
	}{
		{0x1fffffffffffffff, 1, 1},
		{0xffffffffffffffff, 0xf, 1},
		{0x00ffffffffffffff, 0, 5},
	}

	for _, step := range steps {
		h.Add(toByte(step.hash))
		if got := h.denseList[step.register]; got != step.want {
			t.Error(got)
		}
	}
}

// TestPlus_toNormal checks that toNormal clears the sparse flag, that a value
// added beforehand is still counted afterwards, and that values added while
// sparse end up in the expected denseList registers once converted.
func TestPlus_toNormal(t *testing.T) {
	h := NewTestPlus(16)
	h.Add(toByte(0x00010fffffffffff))
	h.toNormal()
	if c := h.Count(); c != 1 {
		t.Error(c)
	}
	if h.sparse {
		t.Error("toNormal should convert to normal")
	}

	// Second scenario: several values are added, merged into the sparse list
	// and only then converted to the dense representation.
	h = NewTestPlus(16)
	h.hash = nopHash
	hashes := []uint64{
		0x00010fffffffffff,
		0x0002ffffffffffff,
		0x0003000000000000,
		0x0003000000000001,
		0xff03700000000000,
		0xff03080000000000,
	}
	for _, v := range hashes {
		h.Add(toByte(v))
	}
	h.mergeSparse()
	h.toNormal()

	expected := []struct {
		register int
		want     uint8
	}{
		{1, 5},
		{2, 1},
		{3, 49},
		{0xff03, 5},
	}
	for _, e := range expected {
		if got := h.denseList[e.register]; got != e.want {
			t.Error(got)
		}
	}
}

// TestPlusCount checks Count on an empty sketch, after adding five distinct
// values (one of them twice), on a repeated call with no Add in between, and
// after one further distinct value is added.
func TestPlusCount(t *testing.T) {
	h := NewTestPlus(16)

	if got := h.Count(); got != 0 {
		t.Error(got)
	}

	// The last value is a duplicate and must not raise the count.
	initial := []uint64{
		0x00010fffffffffff,
		0x00020fffffffffff,
		0x00030fffffffffff,
		0x00040fffffffffff,
		0x00050fffffffffff,
		0x00050fffffffffff,
	}
	for _, v := range initial {
		h.Add(toByte(v))
	}

	// Count is called twice with no mutation in between; both calls must
	// report the same result.
	for attempt := 0; attempt < 2; attempt++ {
		if got := h.Count(); got != 5 {
			t.Error(got)
		}
	}

	h.Add(toByte(0x00060fffffffffff))

	// The sketch was mutated, so the count must move.
	if got := h.Count(); got != 6 {
		t.Error(got)
	}
}

// TestPlus_Merge_Error checks that merging two sketches built with different
// precisions (16 and 10) is reported as an error.
func TestPlus_Merge_Error(t *testing.T) {
	dst, src := NewTestPlus(16), NewTestPlus(10)

	if err := dst.Merge(src); err == nil {
		t.Error("different precision should return error")
	}
}

// TestHLL_Merge_Sparse merges a sparse sketch into an empty one and checks
// that the receiver ends up in the normal (dense) representation with the
// right count, that the argument is left sparse, that merging the same sketch
// again does not change the count, and that values added to the source later
// are picked up by a subsequent merge.
func TestHLL_Merge_Sparse(t *testing.T) {
	h := NewTestPlus(16)
	h.Add(toByte(0x00010fffffffffff))
	h.Add(toByte(0x00020fffffffffff))
	h.Add(toByte(0x00030fffffffffff))
	h.Add(toByte(0x00040fffffffffff))
	h.Add(toByte(0x00050fffffffffff))
	h.Add(toByte(0x00050fffffffffff))

	h2 := NewTestPlus(16)
	// Merge returns an error; fail fast instead of letting a failed merge
	// surface later as a misleading count mismatch.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	n := h2.Count()
	if n != 5 {
		t.Error(n)
	}

	if h2.sparse {
		t.Error("Merge should convert to normal")
	}

	if !h.sparse {
		t.Error("Merge should not modify argument")
	}

	// Merging the same data a second time must not change the count.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	n = h2.Count()
	if n != 5 {
		t.Error(n)
	}

	h.Add(toByte(0x00060fffffffffff))
	h.Add(toByte(0x00070fffffffffff))
	h.Add(toByte(0x00080fffffffffff))
	h.Add(toByte(0x00090fffffffffff))
	h.Add(toByte(0x000a0fffffffffff))
	h.Add(toByte(0x000a0fffffffffff))
	n = h.Count()
	if n != 10 {
		t.Error(n)
	}

	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	n = h2.Count()
	if n != 10 {
		t.Error(n)
	}
}

// TestHLL_Merge_Normal merges one sketch into another when both have been
// switched to the normal (dense) representation. It checks the count after
// the first merge, after merging the same sketch again, and after the source
// has gained further values.
func TestHLL_Merge_Normal(t *testing.T) {
	h := NewTestPlus(16)
	h.toNormal()
	h.Add(toByte(0x00010fffffffffff))
	h.Add(toByte(0x00020fffffffffff))
	h.Add(toByte(0x00030fffffffffff))
	h.Add(toByte(0x00040fffffffffff))
	h.Add(toByte(0x00050fffffffffff))
	h.Add(toByte(0x00050fffffffffff))

	h2 := NewTestPlus(16)
	h2.toNormal()
	// Merge returns an error; fail fast instead of letting a failed merge
	// surface later as a misleading count mismatch.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	n := h2.Count()
	if n != 5 {
		t.Error(n)
	}

	// Merging the same data a second time must not change the count.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	n = h2.Count()
	if n != 5 {
		t.Error(n)
	}

	h.Add(toByte(0x00060fffffffffff))
	h.Add(toByte(0x00070fffffffffff))
	h.Add(toByte(0x00080fffffffffff))
	h.Add(toByte(0x00090fffffffffff))
	h.Add(toByte(0x000a0fffffffffff))
	h.Add(toByte(0x000a0fffffffffff))
	n = h.Count()
	if n != 10 {
		t.Error(n)
	}

	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	n = h2.Count()
	if n != 10 {
		t.Error(n)
	}
}

// TestPlus_Merge exercises mergeSparse: added keys must first appear in
// tmpSet, and each mergeSparse call must empty tmpSet and leave sparseList
// with the expected number of entries, iterated in the expected order.
func TestPlus_Merge(t *testing.T) {
	h := NewTestPlus(16)

	// add inserts key and checks that its encoded hash is pending in tmpSet.
	add := func(key uint64) {
		h.Add(toByte(key))
		if !h.tmpSet.has(h.encodeHash(key)) {
			t.Error("key not in hash")
		}
	}

	// flush runs mergeSparse, then checks that tmpSet was emptied and that
	// sparseList reports wantCount entries.
	flush := func(wantCount int) {
		h.mergeSparse()
		if len(h.tmpSet) != 0 {
			t.Error(h.tmpSet)
		}
		if int(h.sparseList.count) != wantCount {
			t.Error(h.sparseList)
		}
	}

	// expectOrder walks sparseList from the start and checks that it yields
	// the encoded hashes of keys, in the order given.
	expectOrder := func(keys ...uint64) {
		iter := h.sparseList.Iter()
		for _, key := range keys {
			if got := iter.Next(); got != h.encodeHash(key) {
				t.Error(got)
			}
		}
	}

	k1 := uint64(0xf000017000000000)
	k2 := uint64(0x000fff8f00000000)
	k3 := uint64(0x0f00017000000000)

	add(k1)
	add(k2)
	if len(h.tmpSet) != 2 {
		t.Error(h.tmpSet)
	}

	flush(2)
	expectOrder(k2, k1)

	add(k3)
	flush(3)
	expectOrder(k2, k3, k1)

	// Adding a key that was already merged must not grow the sparse list.
	add(k1)
	flush(3)
	expectOrder(k2, k3, k1)
}

// TestPlus_EncodeDecode round-trips hash values through encodeHash and
// decodeHash on a precision-8 sketch and checks both decoded results.
func TestPlus_EncodeDecode(t *testing.T) {
	h := NewTestPlus(8)

	cases := []struct {
		hash  uint64
		wantI uint64
		wantR uint64
	}{
		{0xffffff8000000000, 0xff, 1},
		{0xff00000000000000, 0xff, 57},
		{0xff30000000000000, 0xff, 3},
		{0xaa10000000000000, 0xaa, 4},
		{0xaa0f000000000000, 0xaa, 5},
	}

	for _, tc := range cases {
		i, r := h.decodeHash(h.encodeHash(tc.hash))
		if uint64(i) != tc.wantI {
			t.Error(i)
		}
		if uint64(r) != tc.wantR {
			t.Error(r)
		}
	}
}

// TestPlus_Error checks which precisions NewPlus accepts: 3 and 19 must be
// rejected with an error, while 18 must be accepted.
func TestPlus_Error(t *testing.T) {
	if _, err := NewPlus(3); err == nil {
		t.Error("precision 3 should return error")
	}

	if _, err := NewPlus(18); err != nil {
		t.Error(err)
	}

	// The message names the precision actually under test (19).
	if _, err := NewPlus(19); err == nil {
		t.Error("precision 19 should return error")
	}
}

// TestPlus_Marshal_Unmarshal_Sparse round-trips a sketch in the sparse
// representation through MarshalBinary and UnmarshalBinary. It checks that
// the first serialised byte is 2 and that the decoded sketch is deeply equal
// to the original.
func TestPlus_Marshal_Unmarshal_Sparse(t *testing.T) {
	h, err := NewPlus(4)
	if err != nil {
		t.Fatal(err)
	}
	h.sparse = true
	h.tmpSet = map[uint32]struct{}{26: {}, 40: {}}

	// Add a bunch of values to the sparse representation.
	for i := 0; i < 10; i++ {
		h.sparseList.Append(uint32(rand.Int()))
	}

	data, err := h.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	// Peeking at the first byte should reveal the version.
	if got, exp := data[0], byte(2); got != exp {
		t.Fatalf("got byte %v, expected %v", got, exp)
	}

	var res Plus
	if err := res.UnmarshalBinary(data); err != nil {
		t.Fatal(err)
	}

	// reflect.DeepEqual will always return false when comparing non-nil
	// functions, so we'll set them to nil.
	h.hash, res.hash = nil, nil
	if got, exp := &res, h; !reflect.DeepEqual(got, exp) {
		t.Fatalf("got %v, wanted %v", spew.Sdump(got), spew.Sdump(exp))
	}
}

// TestPlus_Marshal_Unmarshal_Dense round-trips a sketch in the dense
// representation through MarshalBinary and UnmarshalBinary. It checks that
// the first serialised byte is 2 and that the decoded sketch is deeply equal
// to the original.
func TestPlus_Marshal_Unmarshal_Dense(t *testing.T) {
	h, err := NewPlus(4)
	if err != nil {
		t.Fatal(err)
	}
	h.sparse = false

	// Add a bunch of values to the dense representation.
	for i := 0; i < 10; i++ {
		h.denseList = append(h.denseList, uint8(rand.Int()))
	}

	data, err := h.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	// Peeking at the first byte should reveal the version.
	if got, exp := data[0], byte(2); got != exp {
		t.Fatalf("got byte %v, expected %v", got, exp)
	}

	var res Plus
	if err := res.UnmarshalBinary(data); err != nil {
		t.Fatal(err)
	}

	// reflect.DeepEqual will always return false when comparing non-nil
	// functions, so we'll set them to nil.
	h.hash, res.hash = nil, nil
	if got, exp := &res, h; !reflect.DeepEqual(got, exp) {
		t.Fatalf("got %v, wanted %v", spew.Sdump(got), spew.Sdump(exp))
	}
}

// TestPlus_Marshal_Unmarshal_Count tests that a sketch can be serialised /
// unserialised and keep an accurate cardinality estimate. It is skipped in
// short mode because it adds two million random values.
func TestPlus_Marshal_Unmarshal_Count(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode")
	}

	const batch = 1000000

	// count tracks the true set of distinct values added so far.
	count := make(map[string]struct{}, batch)
	h, err := NewPlus(16)
	if err != nil {
		t.Fatal(err)
	}

	buf := make([]byte, 8)

	// addBatch adds batch random 8-byte values both to the exact set and to
	// whichever sketch h currently refers to.
	addBatch := func() {
		for i := 0; i < batch; i++ {
			if _, err := crand.Read(buf); err != nil {
				t.Fatal(err)
			}

			count[string(buf)] = struct{}{}

			// Add to the sketch.
			h.Add(buf)
		}
	}

	addBatch()

	gotC := h.Count()
	epsilon := 15000 // 1.5%
	if got, exp := math.Abs(float64(int(gotC)-len(count))), epsilon; int(got) > exp {
		t.Fatalf("error was %v for estimation %d and true cardinality %d", got, gotC, len(count))
	}

	// Serialise the sketch.
	sketch, err := h.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	// Deserialise.
	h = &Plus{}
	if err := h.UnmarshalBinary(sketch); err != nil {
		t.Fatal(err)
	}

	// The count should be the same
	oldC := gotC
	if got, exp := h.Count(), oldC; got != exp {
		t.Fatalf("got %d, expected %d", got, exp)
	}

	// Add some more values.
	addBatch()

	// The sketch should still be working correctly.
	gotC = h.Count()
	epsilon = 30000 // 1.5%
	if got, exp := math.Abs(float64(int(gotC)-len(count))), epsilon; int(got) > exp {
		t.Fatalf("error was %v for estimation %d and true cardinality %d", got, gotC, len(count))
	}
}

// NewTestPlus returns a Plus sketch of precision p whose hash function has
// been replaced with nopHash. It panics if NewPlus returns an error.
func NewTestPlus(p uint8) *Plus {
	sketch, err := NewPlus(p)
	if err != nil {
		panic(err)
	}

	sketch.hash = nopHash
	return sketch
}

// genData returns n independent 8-byte blobs of random data to add to a
// sketch.
//
// Every blob is a distinct window into one shared backing array, so no two
// returned slices alias the same bytes. Each window's capacity is capped at
// its length, so appending to one blob cannot overwrite its neighbour.
// genData panics if the random source fails or returns a short read.
func genData(n int) [][]byte {
	const size = 8

	out := make([][]byte, 0, n)
	backing := make([]byte, n*size)

	for i := 0; i < n; i++ {
		// Fill this blob's own window rather than reusing a single buffer;
		// reusing one buffer would make every element identical.
		blob := backing[i*size : (i+1)*size : (i+1)*size]
		read, err := rand.Read(blob)
		if err != nil {
			panic(err)
		} else if read != size {
			panic(fmt.Errorf("only %d bytes generated", read))
		}

		out = append(out, blob)
	}
	if len(out) != n {
		panic(fmt.Sprintf("wrong size slice: %d", n))
	}
	return out
}

// benchdata memoises the values to be added to a sketch during a benchmark.
// It is keyed by the number of values requested, so benchmarks asking for the
// same size reuse the data generated the first time.
var benchdata = map[int][][]byte{}

// benchmarkPlusAdd times adding n blobs to h, repeated b.N times. The blobs
// are looked up in benchdata and generated with genData on a miss; that setup
// happens before the timer is reset, so it is excluded from the measurement.
func benchmarkPlusAdd(b *testing.B, h *Plus, n int) {
	blobs, cached := benchdata[n]
	if !cached {
		// Generate once and remember for later benchmarks of the same size.
		blobs = genData(n)
		benchdata[n] = blobs
	}

	b.ReportAllocs()
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		for _, blob := range blobs {
			h.Add(blob)
		}
	}
	b.StopTimer()
}

// BenchmarkPlus_Add_100 benchmarks adding 100 values per iteration to a
// precision-16 sketch.
func BenchmarkPlus_Add_100(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 100)
}

// BenchmarkPlus_Add_1000 benchmarks adding 1,000 values per iteration to a
// precision-16 sketch.
func BenchmarkPlus_Add_1000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 1000)
}

// BenchmarkPlus_Add_10000 benchmarks adding 10,000 values per iteration to a
// precision-16 sketch.
func BenchmarkPlus_Add_10000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 10000)
}

// BenchmarkPlus_Add_100000 benchmarks adding 100,000 values per iteration to
// a precision-16 sketch.
func BenchmarkPlus_Add_100000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 100000)
}

// BenchmarkPlus_Add_1000000 benchmarks adding 1,000,000 values per iteration
// to a precision-16 sketch.
func BenchmarkPlus_Add_1000000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 1000000)
}

// BenchmarkPlus_Add_10000000 benchmarks adding 10,000,000 values per
// iteration to a precision-16 sketch.
func BenchmarkPlus_Add_10000000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 10000000)
}

// BenchmarkPlus_Add_100000000 benchmarks adding 100,000,000 values per
// iteration to a precision-16 sketch.
func BenchmarkPlus_Add_100000000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 100000000)
}