684 lines
13 KiB
Go
684 lines
13 KiB
Go
package hll
|
|
|
|
import (
|
|
crand "crypto/rand"
|
|
"encoding/binary"
|
|
"fmt"
|
|
"math"
|
|
"math/rand"
|
|
"reflect"
|
|
"testing"
|
|
"unsafe"
|
|
|
|
"github.com/davecgh/go-spew/spew"
|
|
)
|
|
|
|
// nopHash is an identity "hash" used by the tests: it decodes the 8-byte
// input as a big-endian uint64 and returns that value. Any other input
// length panics.
func nopHash(buf []byte) uint64 {
	if len(buf) == 8 {
		return binary.BigEndian.Uint64(buf)
	}
	panic(fmt.Sprintf("unexpected size buffer: %d", len(buf)))
}
|
|
|
|
// toByte returns the 8-byte big-endian encoding of v, i.e. the inverse of
// nopHash.
func toByte(v uint64) []byte {
	encoded := make([]byte, 8)
	binary.BigEndian.PutUint64(encoded, v)
	return encoded
}
|
|
|
|
func TestPlus_Bytes(t *testing.T) {
|
|
testCases := []struct {
|
|
p uint8
|
|
normal bool
|
|
}{
|
|
{4, false},
|
|
{5, false},
|
|
{4, true},
|
|
{5, true},
|
|
}
|
|
|
|
for i, testCase := range testCases {
|
|
t.Run(fmt.Sprint(i), func(t *testing.T) {
|
|
h := NewTestPlus(testCase.p)
|
|
|
|
plusStructOverhead := int(unsafe.Sizeof(*h))
|
|
compressedListOverhead := int(unsafe.Sizeof(*h.sparseList))
|
|
|
|
var expectedDenseListCapacity, expectedSparseListCapacity int
|
|
|
|
if testCase.normal {
|
|
h.toNormal()
|
|
// denseList has capacity for 2^p elements, one byte each
|
|
expectedDenseListCapacity = int(math.Pow(2, float64(testCase.p)))
|
|
if expectedDenseListCapacity != cap(h.denseList) {
|
|
t.Errorf("denseList capacity: want %d got %d", expectedDenseListCapacity, cap(h.denseList))
|
|
}
|
|
} else {
|
|
// sparseList has capacity for 2^p elements, one byte each
|
|
expectedSparseListCapacity = int(math.Pow(2, float64(testCase.p)))
|
|
if expectedSparseListCapacity != cap(h.sparseList.b) {
|
|
t.Errorf("sparseList capacity: want %d got %d", expectedSparseListCapacity, cap(h.sparseList.b))
|
|
}
|
|
expectedSparseListCapacity += compressedListOverhead
|
|
}
|
|
|
|
expectedSize := plusStructOverhead + expectedDenseListCapacity + expectedSparseListCapacity
|
|
if expectedSize != h.Bytes() {
|
|
t.Errorf("Bytes(): want %d got %d", expectedSize, h.Bytes())
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestPlus_Add_NoSparse(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
h.toNormal()
|
|
|
|
h.Add(toByte(0x00010fffffffffff))
|
|
n := h.denseList[1]
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x0002ffffffffffff))
|
|
n = h.denseList[2]
|
|
if n != 1 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x0003000000000000))
|
|
n = h.denseList[3]
|
|
if n != 49 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x0003000000000001))
|
|
n = h.denseList[3]
|
|
if n != 49 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0xff03700000000000))
|
|
n = h.denseList[0xff03]
|
|
if n != 2 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0xff03080000000000))
|
|
n = h.denseList[0xff03]
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestPlusPrecision_NoSparse(t *testing.T) {
|
|
h := NewTestPlus(4)
|
|
h.toNormal()
|
|
|
|
h.Add(toByte(0x1fffffffffffffff))
|
|
n := h.denseList[1]
|
|
if n != 1 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0xffffffffffffffff))
|
|
n = h.denseList[0xf]
|
|
if n != 1 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x00ffffffffffffff))
|
|
n = h.denseList[0]
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestPlus_toNormal(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
h.Add(toByte(0x00010fffffffffff))
|
|
h.toNormal()
|
|
c := h.Count()
|
|
if c != 1 {
|
|
t.Error(c)
|
|
}
|
|
|
|
if h.sparse {
|
|
t.Error("toNormal should convert to normal")
|
|
}
|
|
|
|
h = NewTestPlus(16)
|
|
h.hash = nopHash
|
|
h.Add(toByte(0x00010fffffffffff))
|
|
h.Add(toByte(0x0002ffffffffffff))
|
|
h.Add(toByte(0x0003000000000000))
|
|
h.Add(toByte(0x0003000000000001))
|
|
h.Add(toByte(0xff03700000000000))
|
|
h.Add(toByte(0xff03080000000000))
|
|
h.mergeSparse()
|
|
h.toNormal()
|
|
|
|
n := h.denseList[1]
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
n = h.denseList[2]
|
|
if n != 1 {
|
|
t.Error(n)
|
|
}
|
|
n = h.denseList[3]
|
|
if n != 49 {
|
|
t.Error(n)
|
|
}
|
|
n = h.denseList[0xff03]
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestPlusCount(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
|
|
n := h.Count()
|
|
if n != 0 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x00010fffffffffff))
|
|
h.Add(toByte(0x00020fffffffffff))
|
|
h.Add(toByte(0x00030fffffffffff))
|
|
h.Add(toByte(0x00040fffffffffff))
|
|
h.Add(toByte(0x00050fffffffffff))
|
|
h.Add(toByte(0x00050fffffffffff))
|
|
|
|
n = h.Count()
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
// not mutated, still returns correct count
|
|
n = h.Count()
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x00060fffffffffff))
|
|
|
|
// mutated
|
|
n = h.Count()
|
|
if n != 6 {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestPlus_Merge_Error(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
h2 := NewTestPlus(10)
|
|
|
|
err := h.Merge(h2)
|
|
if err == nil {
|
|
t.Error("different precision should return error")
|
|
}
|
|
}
|
|
|
|
func TestHLL_Merge_Sparse(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
h.Add(toByte(0x00010fffffffffff))
|
|
h.Add(toByte(0x00020fffffffffff))
|
|
h.Add(toByte(0x00030fffffffffff))
|
|
h.Add(toByte(0x00040fffffffffff))
|
|
h.Add(toByte(0x00050fffffffffff))
|
|
h.Add(toByte(0x00050fffffffffff))
|
|
|
|
h2 := NewTestPlus(16)
|
|
h2.Merge(h)
|
|
n := h2.Count()
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
if h2.sparse {
|
|
t.Error("Merge should convert to normal")
|
|
}
|
|
|
|
if !h.sparse {
|
|
t.Error("Merge should not modify argument")
|
|
}
|
|
|
|
h2.Merge(h)
|
|
n = h2.Count()
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x00060fffffffffff))
|
|
h.Add(toByte(0x00070fffffffffff))
|
|
h.Add(toByte(0x00080fffffffffff))
|
|
h.Add(toByte(0x00090fffffffffff))
|
|
h.Add(toByte(0x000a0fffffffffff))
|
|
h.Add(toByte(0x000a0fffffffffff))
|
|
n = h.Count()
|
|
if n != 10 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h2.Merge(h)
|
|
n = h2.Count()
|
|
if n != 10 {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestHLL_Merge_Normal(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
h.toNormal()
|
|
h.Add(toByte(0x00010fffffffffff))
|
|
h.Add(toByte(0x00020fffffffffff))
|
|
h.Add(toByte(0x00030fffffffffff))
|
|
h.Add(toByte(0x00040fffffffffff))
|
|
h.Add(toByte(0x00050fffffffffff))
|
|
h.Add(toByte(0x00050fffffffffff))
|
|
|
|
h2 := NewTestPlus(16)
|
|
h2.toNormal()
|
|
h2.Merge(h)
|
|
n := h2.Count()
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h2.Merge(h)
|
|
n = h2.Count()
|
|
if n != 5 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(0x00060fffffffffff))
|
|
h.Add(toByte(0x00070fffffffffff))
|
|
h.Add(toByte(0x00080fffffffffff))
|
|
h.Add(toByte(0x00090fffffffffff))
|
|
h.Add(toByte(0x000a0fffffffffff))
|
|
h.Add(toByte(0x000a0fffffffffff))
|
|
n = h.Count()
|
|
if n != 10 {
|
|
t.Error(n)
|
|
}
|
|
|
|
h2.Merge(h)
|
|
n = h2.Count()
|
|
if n != 10 {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestPlus_Merge(t *testing.T) {
|
|
h := NewTestPlus(16)
|
|
|
|
k1 := uint64(0xf000017000000000)
|
|
h.Add(toByte(k1))
|
|
if !h.tmpSet.has(h.encodeHash(k1)) {
|
|
t.Error("key not in hash")
|
|
}
|
|
|
|
k2 := uint64(0x000fff8f00000000)
|
|
h.Add(toByte(k2))
|
|
if !h.tmpSet.has(h.encodeHash(k2)) {
|
|
t.Error("key not in hash")
|
|
}
|
|
|
|
if len(h.tmpSet) != 2 {
|
|
t.Error(h.tmpSet)
|
|
}
|
|
|
|
h.mergeSparse()
|
|
if len(h.tmpSet) != 0 {
|
|
t.Error(h.tmpSet)
|
|
}
|
|
if h.sparseList.count != 2 {
|
|
t.Error(h.sparseList)
|
|
}
|
|
|
|
iter := h.sparseList.Iter()
|
|
n := iter.Next()
|
|
if n != h.encodeHash(k2) {
|
|
t.Error(n)
|
|
}
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k1) {
|
|
t.Error(n)
|
|
}
|
|
|
|
k3 := uint64(0x0f00017000000000)
|
|
h.Add(toByte(k3))
|
|
if !h.tmpSet.has(h.encodeHash(k3)) {
|
|
t.Error("key not in hash")
|
|
}
|
|
|
|
h.mergeSparse()
|
|
if len(h.tmpSet) != 0 {
|
|
t.Error(h.tmpSet)
|
|
}
|
|
if h.sparseList.count != 3 {
|
|
t.Error(h.sparseList)
|
|
}
|
|
|
|
iter = h.sparseList.Iter()
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k2) {
|
|
t.Error(n)
|
|
}
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k3) {
|
|
t.Error(n)
|
|
}
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k1) {
|
|
t.Error(n)
|
|
}
|
|
|
|
h.Add(toByte(k1))
|
|
if !h.tmpSet.has(h.encodeHash(k1)) {
|
|
t.Error("key not in hash")
|
|
}
|
|
|
|
h.mergeSparse()
|
|
if len(h.tmpSet) != 0 {
|
|
t.Error(h.tmpSet)
|
|
}
|
|
if h.sparseList.count != 3 {
|
|
t.Error(h.sparseList)
|
|
}
|
|
|
|
iter = h.sparseList.Iter()
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k2) {
|
|
t.Error(n)
|
|
}
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k3) {
|
|
t.Error(n)
|
|
}
|
|
n = iter.Next()
|
|
if n != h.encodeHash(k1) {
|
|
t.Error(n)
|
|
}
|
|
}
|
|
|
|
func TestPlus_EncodeDecode(t *testing.T) {
|
|
h := NewTestPlus(8)
|
|
i, r := h.decodeHash(h.encodeHash(0xffffff8000000000))
|
|
if i != 0xff {
|
|
t.Error(i)
|
|
}
|
|
if r != 1 {
|
|
t.Error(r)
|
|
}
|
|
|
|
i, r = h.decodeHash(h.encodeHash(0xff00000000000000))
|
|
if i != 0xff {
|
|
t.Error(i)
|
|
}
|
|
if r != 57 {
|
|
t.Error(r)
|
|
}
|
|
|
|
i, r = h.decodeHash(h.encodeHash(0xff30000000000000))
|
|
if i != 0xff {
|
|
t.Error(i)
|
|
}
|
|
if r != 3 {
|
|
t.Error(r)
|
|
}
|
|
|
|
i, r = h.decodeHash(h.encodeHash(0xaa10000000000000))
|
|
if i != 0xaa {
|
|
t.Error(i)
|
|
}
|
|
if r != 4 {
|
|
t.Error(r)
|
|
}
|
|
|
|
i, r = h.decodeHash(h.encodeHash(0xaa0f000000000000))
|
|
if i != 0xaa {
|
|
t.Error(i)
|
|
}
|
|
if r != 5 {
|
|
t.Error(r)
|
|
}
|
|
}
|
|
|
|
func TestPlus_Error(t *testing.T) {
|
|
_, err := NewPlus(3)
|
|
if err == nil {
|
|
t.Error("precision 3 should return error")
|
|
}
|
|
|
|
_, err = NewPlus(18)
|
|
if err != nil {
|
|
t.Error(err)
|
|
}
|
|
|
|
_, err = NewPlus(19)
|
|
if err == nil {
|
|
t.Error("precision 17 should return error")
|
|
}
|
|
}
|
|
|
|
func TestPlus_Marshal_Unmarshal_Sparse(t *testing.T) {
|
|
h, _ := NewPlus(4)
|
|
h.sparse = true
|
|
h.tmpSet = map[uint32]struct{}{26: struct{}{}, 40: struct{}{}}
|
|
|
|
// Add a bunch of values to the sparse representation.
|
|
for i := 0; i < 10; i++ {
|
|
h.sparseList.Append(uint32(rand.Int()))
|
|
}
|
|
|
|
data, err := h.MarshalBinary()
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Peeking at the first byte should reveal the version.
|
|
if got, exp := data[0], byte(2); got != exp {
|
|
t.Fatalf("got byte %v, expected %v", got, exp)
|
|
}
|
|
|
|
var res Plus
|
|
if err := res.UnmarshalBinary(data); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// reflect.DeepEqual will always return false when comparing non-nil
|
|
// functions, so we'll set them to nil.
|
|
h.hash, res.hash = nil, nil
|
|
if got, exp := &res, h; !reflect.DeepEqual(got, exp) {
|
|
t.Fatalf("got %v, wanted %v", spew.Sdump(got), spew.Sdump(exp))
|
|
}
|
|
}
|
|
|
|
func TestPlus_Marshal_Unmarshal_Dense(t *testing.T) {
|
|
h, _ := NewPlus(4)
|
|
h.sparse = false
|
|
|
|
// Add a bunch of values to the dense representation.
|
|
for i := 0; i < 10; i++ {
|
|
h.denseList = append(h.denseList, uint8(rand.Int()))
|
|
}
|
|
|
|
data, err := h.MarshalBinary()
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Peeking at the first byte should reveal the version.
|
|
if got, exp := data[0], byte(2); got != exp {
|
|
t.Fatalf("got byte %v, expected %v", got, exp)
|
|
}
|
|
|
|
var res Plus
|
|
if err := res.UnmarshalBinary(data); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// reflect.DeepEqual will always return false when comparing non-nil
|
|
// functions, so we'll set them to nil.
|
|
h.hash, res.hash = nil, nil
|
|
if got, exp := &res, h; !reflect.DeepEqual(got, exp) {
|
|
t.Fatalf("got %v, wanted %v", spew.Sdump(got), spew.Sdump(exp))
|
|
}
|
|
}
|
|
|
|
// Tests that a sketch can be serialised / unserialised and keep an accurate
|
|
// cardinality estimate.
|
|
func TestPlus_Marshal_Unmarshal_Count(t *testing.T) {
|
|
if testing.Short() {
|
|
t.Skip("Skipping test in short mode")
|
|
}
|
|
|
|
count := make(map[string]struct{}, 1000000)
|
|
h, _ := NewPlus(16)
|
|
|
|
buf := make([]byte, 8)
|
|
for i := 0; i < 1000000; i++ {
|
|
if _, err := crand.Read(buf); err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
count[string(buf)] = struct{}{}
|
|
|
|
// Add to the sketch.
|
|
h.Add(buf)
|
|
}
|
|
|
|
gotC := h.Count()
|
|
epsilon := 15000 // 1.5%
|
|
if got, exp := math.Abs(float64(int(gotC)-len(count))), epsilon; int(got) > exp {
|
|
t.Fatalf("error was %v for estimation %d and true cardinality %d", got, gotC, len(count))
|
|
}
|
|
|
|
// Serialise the sketch.
|
|
sketch, err := h.MarshalBinary()
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Deserialise.
|
|
h = &Plus{}
|
|
if err := h.UnmarshalBinary(sketch); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// The count should be the same
|
|
oldC := gotC
|
|
if got, exp := h.Count(), oldC; got != exp {
|
|
t.Fatalf("got %d, expected %d", got, exp)
|
|
}
|
|
|
|
// Add some more values.
|
|
for i := 0; i < 1000000; i++ {
|
|
if _, err := crand.Read(buf); err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
count[string(buf)] = struct{}{}
|
|
|
|
// Add to the sketch.
|
|
h.Add(buf)
|
|
}
|
|
|
|
// The sketch should still be working correctly.
|
|
gotC = h.Count()
|
|
epsilon = 30000 // 1.5%
|
|
if got, exp := math.Abs(float64(int(gotC)-len(count))), epsilon; int(got) > exp {
|
|
t.Fatalf("error was %v for estimation %d and true cardinality %d", got, gotC, len(count))
|
|
}
|
|
}
|
|
|
|
func NewTestPlus(p uint8) *Plus {
|
|
h, err := NewPlus(p)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
h.hash = nopHash
|
|
return h
|
|
}
|
|
|
|
// Generate random data to add to the sketch.
//
// genData returns n independent 8-byte random values. Each value is its own
// window into a single backing array; previously the same buffer was appended
// n times, so every entry aliased identical bytes. It panics if the random
// source fails or returns a short read.
func genData(n int) [][]byte {
	const size = 8

	out := make([][]byte, 0, n)
	backing := make([]byte, size*n)

	for i := 0; i < n; i++ {
		// The full slice expression caps each window at 8 bytes so that an
		// append on one value can never spill into its neighbour.
		buf := backing[i*size : (i+1)*size : (i+1)*size]

		// generate 8 random bytes
		read, err := rand.Read(buf)
		if err != nil {
			panic(err)
		} else if read != size {
			panic(fmt.Errorf("only %d bytes generated", read))
		}

		out = append(out, buf)
	}
	if len(out) != n {
		panic(fmt.Sprintf("wrong size slice: %d", len(out)))
	}
	return out
}
|
|
|
|
// benchdata caches the generated input for each benchmark size, keyed by the
// number of values, so the data is only generated once per size.
var benchdata = make(map[int][][]byte)
|
|
|
|
func benchmarkPlusAdd(b *testing.B, h *Plus, n int) {
|
|
blobs, ok := benchdata[n]
|
|
if !ok {
|
|
// Generate it.
|
|
benchdata[n] = genData(n)
|
|
blobs = benchdata[n]
|
|
}
|
|
|
|
b.ReportAllocs()
|
|
b.ResetTimer()
|
|
for i := 0; i < b.N; i++ {
|
|
for j := 0; j < len(blobs); j++ {
|
|
h.Add(blobs[j])
|
|
}
|
|
}
|
|
b.StopTimer()
|
|
}
|
|
|
|
func BenchmarkPlus_Add_100(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 100)
|
|
}
|
|
|
|
func BenchmarkPlus_Add_1000(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 1000)
|
|
}
|
|
|
|
func BenchmarkPlus_Add_10000(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 10000)
|
|
}
|
|
|
|
func BenchmarkPlus_Add_100000(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 100000)
|
|
}
|
|
|
|
func BenchmarkPlus_Add_1000000(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 1000000)
|
|
}
|
|
|
|
func BenchmarkPlus_Add_10000000(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 10000000)
|
|
}
|
|
|
|
func BenchmarkPlus_Add_100000000(b *testing.B) {
|
|
h, _ := NewPlus(16)
|
|
benchmarkPlusAdd(b, h, 100000000)
|
|
}
|