influxdb/pkg/hll/hll_test.go

684 lines
13 KiB
Go

package hll
import (
crand "crypto/rand"
"encoding/binary"
"fmt"
"math"
"math/rand"
"reflect"
"testing"
"unsafe"
"github.com/davecgh/go-spew/spew"
)
// nopHash is a pass-through stand-in for a real hash function: it decodes buf
// as a big-endian uint64 and returns that value unchanged. It panics when buf
// is not exactly 8 bytes long.
func nopHash(buf []byte) uint64 {
	if size := len(buf); size != 8 {
		panic(fmt.Sprintf("unexpected size buffer: %d", size))
	}
	return binary.BigEndian.Uint64(buf)
}
// toByte returns the 8-byte big-endian encoding of v in a freshly allocated
// slice. It is the inverse of nopHash.
func toByte(v uint64) []byte {
	encoded := make([]byte, 8)
	binary.BigEndian.PutUint64(encoded, v)
	return encoded
}
// TestPlus_Bytes checks that Bytes() reports the size of the Plus struct plus
// the capacity of whichever list (dense or sparse) the sketch is currently
// using, for a couple of small precisions.
func TestPlus_Bytes(t *testing.T) {
	cases := []struct {
		p      uint8
		normal bool // true: convert to the dense representation first
	}{
		{p: 4, normal: false},
		{p: 5, normal: false},
		{p: 4, normal: true},
		{p: 5, normal: true},
	}

	for idx, tc := range cases {
		// t.Run executes the closure synchronously, so capturing tc is safe.
		t.Run(fmt.Sprint(idx), func(t *testing.T) {
			h := NewTestPlus(tc.p)

			// Either list is expected to have capacity for 2^p one-byte
			// elements.
			listCap := 1 << tc.p
			want := int(unsafe.Sizeof(*h))

			if tc.normal {
				h.toNormal()
				if got := cap(h.denseList); listCap != got {
					t.Errorf("denseList capacity: want %d got %d", listCap, got)
				}
				want += listCap
			} else {
				if got := cap(h.sparseList.b); listCap != got {
					t.Errorf("sparseList capacity: want %d got %d", listCap, got)
				}
				// The sparse representation also pays for the list header.
				want += listCap + int(unsafe.Sizeof(*h.sparseList))
			}

			if got := h.Bytes(); want != got {
				t.Errorf("Bytes(): want %d got %d", want, got)
			}
		})
	}
}
// TestPlus_Add_NoSparse adds hand-picked hash values to a precision-16 sketch
// that has been switched to the dense representation, and checks the dense
// register each value is expected to affect after every Add.
func TestPlus_Add_NoSparse(t *testing.T) {
	h := NewTestPlus(16)
	h.toNormal()

	// Steps run in order against the same sketch; want is the value expected
	// in denseList[register] immediately after hash has been added.
	steps := []struct {
		hash     uint64
		register int
		want     uint8
	}{
		{0x00010fffffffffff, 1, 5},
		{0x0002ffffffffffff, 2, 1},
		{0x0003000000000000, 3, 49},
		{0x0003000000000001, 3, 49}, // same register as the previous step; value stays 49
		{0xff03700000000000, 0xff03, 2},
		{0xff03080000000000, 0xff03, 5},
	}

	for _, step := range steps {
		h.Add(toByte(step.hash))
		if got := h.denseList[step.register]; got != step.want {
			t.Error(got)
		}
	}
}
// TestPlusPrecision_NoSparse repeats the dense-register checks with a
// precision-4 sketch, where the expected register indexes are 0x1, 0xf and 0.
func TestPlusPrecision_NoSparse(t *testing.T) {
	h := NewTestPlus(4)
	h.toNormal()

	// Each step adds hash and then inspects denseList[register].
	steps := []struct {
		hash     uint64
		register int
		want     uint8
	}{
		{0x1fffffffffffffff, 1, 1},
		{0xffffffffffffffff, 0xf, 1},
		{0x00ffffffffffffff, 0, 5},
	}

	for _, step := range steps {
		h.Add(toByte(step.hash))
		if got := h.denseList[step.register]; got != step.want {
			t.Error(got)
		}
	}
}
// TestPlus_toNormal checks that converting a sparse sketch to the dense
// representation clears the sparse flag, keeps the count, and produces the
// expected dense register values.
func TestPlus_toNormal(t *testing.T) {
	// Part 1: a single value added before the conversion is still counted.
	h := NewTestPlus(16)
	h.Add(toByte(0x00010fffffffffff))
	h.toNormal()
	if got := h.Count(); got != 1 {
		t.Error(got)
	}
	if h.sparse {
		t.Error("toNormal should convert to normal")
	}

	// Part 2: several values are added, merged into the sparse list, and
	// then converted; the dense registers are inspected afterwards.
	h = NewTestPlus(16)
	h.hash = nopHash
	for _, v := range []uint64{
		0x00010fffffffffff,
		0x0002ffffffffffff,
		0x0003000000000000,
		0x0003000000000001,
		0xff03700000000000,
		0xff03080000000000,
	} {
		h.Add(toByte(v))
	}
	h.mergeSparse()
	h.toNormal()

	expected := []struct {
		register int
		want     uint8
	}{
		{1, 5},
		{2, 1},
		{3, 49},
		{0xff03, 5},
	}
	for _, e := range expected {
		if got := h.denseList[e.register]; got != e.want {
			t.Error(got)
		}
	}
}
// TestPlusCount checks Count on an empty sketch, after adding values that
// include a duplicate, on a repeated call without changes in between, and
// after one further Add.
func TestPlusCount(t *testing.T) {
	h := NewTestPlus(16)
	if got := h.Count(); got != 0 {
		t.Error(got)
	}

	// Six adds, five distinct values: the last one is a repeat.
	for _, v := range []uint64{
		0x00010fffffffffff,
		0x00020fffffffffff,
		0x00030fffffffffff,
		0x00040fffffffffff,
		0x00050fffffffffff,
		0x00050fffffffffff,
	} {
		h.Add(toByte(v))
	}

	// Ask twice: the second call sees an unmodified sketch and must still
	// return the same count.
	for call := 0; call < 2; call++ {
		if got := h.Count(); got != 5 {
			t.Error(got)
		}
	}

	// After a mutation the count must reflect the new value.
	h.Add(toByte(0x00060fffffffffff))
	if got := h.Count(); got != 6 {
		t.Error(got)
	}
}
// TestPlus_Merge_Error checks that merging two sketches created with
// different precisions (16 and 10) is rejected with an error.
func TestPlus_Merge_Error(t *testing.T) {
	dst, src := NewTestPlus(16), NewTestPlus(10)
	if err := dst.Merge(src); err == nil {
		t.Error("different precision should return error")
	}
}
// TestHLL_Merge_Sparse merges a sparse sketch into another sketch and checks
// the resulting count, that the receiver ends up in the dense representation,
// that the argument stays sparse, and that re-merging the same or a grown
// source yields the expected counts.
func TestHLL_Merge_Sparse(t *testing.T) {
	h := NewTestPlus(16)
	// Six adds, five distinct values.
	for _, v := range []uint64{
		0x00010fffffffffff,
		0x00020fffffffffff,
		0x00030fffffffffff,
		0x00040fffffffffff,
		0x00050fffffffffff,
		0x00050fffffffffff,
	} {
		h.Add(toByte(v))
	}

	h2 := NewTestPlus(16)
	// A failed merge would make every assertion below meaningless, so stop
	// immediately rather than report confusing count mismatches.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	if n := h2.Count(); n != 5 {
		t.Error(n)
	}
	if h2.sparse {
		t.Error("Merge should convert to normal")
	}
	if !h.sparse {
		t.Error("Merge should not modify argument")
	}

	// Merging the same source a second time must not change the count.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	if n := h2.Count(); n != 5 {
		t.Error(n)
	}

	// Grow the source by five more distinct values (plus one repeat).
	for _, v := range []uint64{
		0x00060fffffffffff,
		0x00070fffffffffff,
		0x00080fffffffffff,
		0x00090fffffffffff,
		0x000a0fffffffffff,
		0x000a0fffffffffff,
	} {
		h.Add(toByte(v))
	}
	if n := h.Count(); n != 10 {
		t.Error(n)
	}

	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	if n := h2.Count(); n != 10 {
		t.Error(n)
	}
}
// TestHLL_Merge_Normal merges one dense sketch into another and checks the
// resulting counts, including after re-merging the same source and after the
// source has grown.
func TestHLL_Merge_Normal(t *testing.T) {
	h := NewTestPlus(16)
	h.toNormal()
	// Six adds, five distinct values.
	for _, v := range []uint64{
		0x00010fffffffffff,
		0x00020fffffffffff,
		0x00030fffffffffff,
		0x00040fffffffffff,
		0x00050fffffffffff,
		0x00050fffffffffff,
	} {
		h.Add(toByte(v))
	}

	h2 := NewTestPlus(16)
	h2.toNormal()
	// A failed merge would make every assertion below meaningless, so stop
	// immediately rather than report confusing count mismatches.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	if n := h2.Count(); n != 5 {
		t.Error(n)
	}

	// Merging the same source a second time must not change the count.
	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	if n := h2.Count(); n != 5 {
		t.Error(n)
	}

	// Grow the source by five more distinct values (plus one repeat).
	for _, v := range []uint64{
		0x00060fffffffffff,
		0x00070fffffffffff,
		0x00080fffffffffff,
		0x00090fffffffffff,
		0x000a0fffffffffff,
		0x000a0fffffffffff,
	} {
		h.Add(toByte(v))
	}
	if n := h.Count(); n != 10 {
		t.Error(n)
	}

	if err := h2.Merge(h); err != nil {
		t.Fatal(err)
	}
	if n := h2.Count(); n != 10 {
		t.Error(n)
	}
}
// TestPlus_Merge exercises mergeSparse on a sparse sketch: added keys must
// first show up in tmpSet, mergeSparse must empty tmpSet, and iterating
// sparseList afterwards must yield the encoded hashes in the expected order
// without duplicating a key that is added again.
func TestPlus_Merge(t *testing.T) {
	h := NewTestPlus(16)

	// add inserts key and checks its encoded hash is present in tmpSet.
	add := func(key uint64) {
		h.Add(toByte(key))
		if !h.tmpSet.has(h.encodeHash(key)) {
			t.Error("key not in hash")
		}
	}
	// flush runs mergeSparse and checks that tmpSet was emptied by it.
	flush := func() {
		h.mergeSparse()
		if len(h.tmpSet) != 0 {
			t.Error(h.tmpSet)
		}
	}
	// expectOrder iterates sparseList from the start and checks that it
	// yields the encoded hashes of keys, in the given order.
	expectOrder := func(keys ...uint64) {
		it := h.sparseList.Iter()
		for _, key := range keys {
			if got := it.Next(); got != h.encodeHash(key) {
				t.Error(got)
			}
		}
	}

	k1 := uint64(0xf000017000000000)
	k2 := uint64(0x000fff8f00000000)
	k3 := uint64(0x0f00017000000000)

	// Two fresh keys: both pending, then merged with k2 ahead of k1.
	add(k1)
	add(k2)
	if len(h.tmpSet) != 2 {
		t.Error(h.tmpSet)
	}
	flush()
	if h.sparseList.count != 2 {
		t.Error(h.sparseList)
	}
	expectOrder(k2, k1)

	// A third key is expected between the two existing ones.
	add(k3)
	flush()
	if h.sparseList.count != 3 {
		t.Error(h.sparseList)
	}
	expectOrder(k2, k3, k1)

	// Adding an already merged key again must not grow the list.
	add(k1)
	flush()
	if h.sparseList.count != 3 {
		t.Error(h.sparseList)
	}
	expectOrder(k2, k3, k1)
}
// TestPlus_EncodeDecode round-trips hash values through encodeHash and
// decodeHash on a precision-8 sketch and checks the two decoded values
// against the expected ones.
func TestPlus_EncodeDecode(t *testing.T) {
	h := NewTestPlus(8)

	// Expectations are held as uint64 and compared after widening, so the
	// table does not depend on the exact integer types decodeHash returns.
	cases := []struct {
		hash    uint64
		wantIdx uint64
		wantRho uint64
	}{
		{0xffffff8000000000, 0xff, 1},
		{0xff00000000000000, 0xff, 57},
		{0xff30000000000000, 0xff, 3},
		{0xaa10000000000000, 0xaa, 4},
		{0xaa0f000000000000, 0xaa, 5},
	}

	for _, tc := range cases {
		idx, rho := h.decodeHash(h.encodeHash(tc.hash))
		if uint64(idx) != tc.wantIdx {
			t.Error(idx)
		}
		if uint64(rho) != tc.wantRho {
			t.Error(rho)
		}
	}
}
// TestPlus_Error checks NewPlus's precision validation at the points probed
// here: 3 must be rejected, 18 must be accepted, and 19 must be rejected.
func TestPlus_Error(t *testing.T) {
	if _, err := NewPlus(3); err == nil {
		t.Error("precision 3 should return error")
	}

	if _, err := NewPlus(18); err != nil {
		t.Error(err)
	}

	// The message names the precision actually under test (19).
	if _, err := NewPlus(19); err == nil {
		t.Error("precision 19 should return error")
	}
}
// TestPlus_Marshal_Unmarshal_Sparse serialises a sketch in the sparse
// representation, checks the leading version byte, and verifies that
// unmarshalling yields a sketch deeply equal to the original.
func TestPlus_Marshal_Unmarshal_Sparse(t *testing.T) {
	h, err := NewPlus(4)
	if err != nil {
		t.Fatal(err)
	}
	h.sparse = true
	h.tmpSet = map[uint32]struct{}{26: {}, 40: {}}

	// Add a bunch of values to the sparse representation.
	for i := 0; i < 10; i++ {
		h.sparseList.Append(uint32(rand.Int()))
	}

	data, err := h.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	// Peeking at the first byte should reveal the version.
	if got, exp := data[0], byte(2); got != exp {
		t.Fatalf("got byte %v, expected %v", got, exp)
	}

	var res Plus
	if err := res.UnmarshalBinary(data); err != nil {
		t.Fatal(err)
	}

	// reflect.DeepEqual will always return false when comparing non-nil
	// functions, so we'll set them to nil.
	h.hash, res.hash = nil, nil
	if got, exp := &res, h; !reflect.DeepEqual(got, exp) {
		t.Fatalf("got %v, wanted %v", spew.Sdump(got), spew.Sdump(exp))
	}
}
// TestPlus_Marshal_Unmarshal_Dense serialises a sketch in the dense
// representation, checks the leading version byte, and verifies that
// unmarshalling yields a sketch deeply equal to the original.
func TestPlus_Marshal_Unmarshal_Dense(t *testing.T) {
	h, err := NewPlus(4)
	if err != nil {
		t.Fatal(err)
	}
	h.sparse = false

	// Add a bunch of values to the dense representation.
	for i := 0; i < 10; i++ {
		h.denseList = append(h.denseList, uint8(rand.Int()))
	}

	data, err := h.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	// Peeking at the first byte should reveal the version.
	if got, exp := data[0], byte(2); got != exp {
		t.Fatalf("got byte %v, expected %v", got, exp)
	}

	var res Plus
	if err := res.UnmarshalBinary(data); err != nil {
		t.Fatal(err)
	}

	// reflect.DeepEqual will always return false when comparing non-nil
	// functions, so we'll set them to nil.
	h.hash, res.hash = nil, nil
	if got, exp := &res, h; !reflect.DeepEqual(got, exp) {
		t.Fatalf("got %v, wanted %v", spew.Sdump(got), spew.Sdump(exp))
	}
}
// TestPlus_Marshal_Unmarshal_Count tests that a sketch can be serialised /
// unserialised and keep an accurate cardinality estimate. It is skipped in
// short mode because it adds two million random values.
func TestPlus_Marshal_Unmarshal_Count(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping test in short mode")
	}

	// count tracks the true set of distinct values added so far.
	count := make(map[string]struct{}, 1000000)
	h, err := NewPlus(16)
	if err != nil {
		t.Fatal(err)
	}

	// addRandom adds n random 8-byte values to both the exact set and the
	// sketch currently referenced by h. buf is safely reused: string(buf)
	// copies the bytes for the map key.
	buf := make([]byte, 8)
	addRandom := func(n int) {
		for i := 0; i < n; i++ {
			if _, err := crand.Read(buf); err != nil {
				t.Fatal(err)
			}
			count[string(buf)] = struct{}{}

			// Add to the sketch.
			h.Add(buf)
		}
	}

	addRandom(1000000)

	gotC := h.Count()
	epsilon := 15000 // 1.5%
	if got, exp := math.Abs(float64(int(gotC)-len(count))), epsilon; int(got) > exp {
		t.Fatalf("error was %v for estimation %d and true cardinality %d", got, gotC, len(count))
	}

	// Serialise the sketch.
	sketch, err := h.MarshalBinary()
	if err != nil {
		t.Fatal(err)
	}

	// Deserialise.
	h = &Plus{}
	if err := h.UnmarshalBinary(sketch); err != nil {
		t.Fatal(err)
	}

	// The count should be the same
	oldC := gotC
	if got, exp := h.Count(), oldC; got != exp {
		t.Fatalf("got %d, expected %d", got, exp)
	}

	// Add some more values.
	addRandom(1000000)

	// The sketch should still be working correctly.
	gotC = h.Count()
	epsilon = 30000 // 1.5%
	if got, exp := math.Abs(float64(int(gotC)-len(count))), epsilon; int(got) > exp {
		t.Fatalf("error was %v for estimation %d and true cardinality %d", got, gotC, len(count))
	}
}
// NewTestPlus builds a sketch of precision p for use in tests and replaces
// its hash function with nopHash, so that the 8-byte values passed to Add are
// used as the hash directly. It panics if NewPlus returns an error.
func NewTestPlus(p uint8) *Plus {
	sketch, err := NewPlus(p)
	if err != nil {
		panic(err)
	}

	sketch.hash = nopHash
	return sketch
}
// genData generates n random 8-byte values to add to a sketch. Every returned
// slice has its own backing array. It panics if the random source reports an
// error or produces fewer than 8 bytes.
func genData(n int) [][]byte {
	out := make([][]byte, 0, n)

	for i := 0; i < n; i++ {
		// Allocate a fresh buffer per value: appending one shared buffer
		// would make every entry alias the same bytes, leaving n copies of
		// whatever was generated last.
		buf := make([]byte, 8)

		// generate 8 random bytes
		read, err := rand.Read(buf)
		if err != nil {
			panic(err)
		} else if read != 8 {
			panic(fmt.Errorf("only %d bytes generated", read))
		}

		out = append(out, buf)
	}

	if len(out) != n {
		panic(fmt.Sprintf("wrong size slice: %d", len(out)))
	}
	return out
}
// benchdata memoises the values to be added to a sketch during a benchmark,
// keyed by the number of values, so each size is generated at most once per
// process.
var benchdata = map[int][][]byte{}

// benchmarkPlusAdd times adding n values to h, b.N times over. The values
// come from benchdata and are generated outside the timed region on first
// use. The same sketch h is reused across all b.N iterations.
func benchmarkPlusAdd(b *testing.B, h *Plus, n int) {
	if _, cached := benchdata[n]; !cached {
		// Generate it.
		benchdata[n] = genData(n)
	}
	blobs := benchdata[n]

	b.ReportAllocs()
	b.ResetTimer()

	for iter := 0; iter < b.N; iter++ {
		for _, blob := range blobs {
			h.Add(blob)
		}
	}
	b.StopTimer()
}
// BenchmarkPlus_Add_100 benchmarks adding 100 values to a precision-16
// sketch.
func BenchmarkPlus_Add_100(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 100)
}
// BenchmarkPlus_Add_1000 benchmarks adding 1,000 values to a precision-16
// sketch.
func BenchmarkPlus_Add_1000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 1000)
}
// BenchmarkPlus_Add_10000 benchmarks adding 10,000 values to a precision-16
// sketch.
func BenchmarkPlus_Add_10000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 10000)
}
// BenchmarkPlus_Add_100000 benchmarks adding 100,000 values to a precision-16
// sketch.
func BenchmarkPlus_Add_100000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 100000)
}
// BenchmarkPlus_Add_1000000 benchmarks adding 1,000,000 values to a
// precision-16 sketch.
func BenchmarkPlus_Add_1000000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 1000000)
}
// BenchmarkPlus_Add_10000000 benchmarks adding 10,000,000 values to a
// precision-16 sketch.
func BenchmarkPlus_Add_10000000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 10000000)
}
// BenchmarkPlus_Add_100000000 benchmarks adding 100,000,000 values to a
// precision-16 sketch.
func BenchmarkPlus_Add_100000000(b *testing.B) {
	h, err := NewPlus(16)
	if err != nil {
		// Fail with the constructor's error instead of passing a sketch that
		// may be unusable.
		b.Fatal(err)
	}
	benchmarkPlusAdd(b, h, 100000000)
}