Improve performance of TSI bloom filter

This commit replaces the previous hashing algorithm used by the pkg.Filter with
one based on xxhash. Further, drawing on the hashing literature, we can
represent k hashes with only two hash functions, where previously Filter was using
four.

Further, unlike `murmur3`, `xxhash` is allocation-free, so allocations are
dramatically reduced when inserting and checking for hashes.
pull/8857/head
Edd Robinson 2017-08-31 13:16:53 +01:00
parent fe960b0f3a
commit 5b7fc517fa
3 changed files with 112 additions and 110 deletions

View File

@ -30,6 +30,7 @@
- [#8854](https://github.com/influxdata/influxdb/pull/8854): Report the task status for a query. - [#8854](https://github.com/influxdata/influxdb/pull/8854): Report the task status for a query.
- [#8853](https://github.com/influxdata/influxdb/pull/8853): Reduce allocations, improve `readEntries` performance by simplifying loop - [#8853](https://github.com/influxdata/influxdb/pull/8853): Reduce allocations, improve `readEntries` performance by simplifying loop
- [#8830](https://github.com/influxdata/influxdb/issues/8830): Separate importer log statements to stdout and stderr. - [#8830](https://github.com/influxdata/influxdb/issues/8830): Separate importer log statements to stdout and stderr.
- [#8857](https://github.com/influxdata/influxdb/pull/8857): Improve performance of Bloom Filter in TSI index.
### Bugfixes ### Bugfixes

View File

@ -2,8 +2,9 @@ package bloom
// NOTE: // NOTE:
// This package implements a limited bloom filter implementation based on // This package implements a limited bloom filter implementation based on
// Will Fitzgerald's bloom & bitset packages. It's implemented locally to // Will Fitzgerald's bloom & bitset packages. It uses a zero-allocation xxhash
// support zero-copy memory-mapped slices. // implementation, rather than murmur3. It's implemented locally to support
// zero-copy memory-mapped slices.
// //
// This also optimizes the filter by always using a bitset size with a power of 2. // This also optimizes the filter by always using a bitset size with a power of 2.
@ -11,31 +12,21 @@ import (
"fmt" "fmt"
"math" "math"
"github.com/influxdata/influxdb/pkg/pool" "github.com/cespare/xxhash"
"github.com/spaolacci/murmur3"
) )
// Filter represents a bloom filter. // Filter represents a bloom filter.
type Filter struct { type Filter struct {
k uint64 k uint64
b []byte b []byte
mask uint64 mask uint64
hashPool *pool.Generic
} }
// NewFilter returns a new instance of Filter using m bits and k hash functions. // NewFilter returns a new instance of Filter using m bits and k hash functions.
// If m is not a power of two then it is rounded to the next highest power of 2. // If m is not a power of two then it is rounded to the next highest power of 2.
func NewFilter(m uint64, k uint64) *Filter { func NewFilter(m uint64, k uint64) *Filter {
m = pow2(m) m = pow2(m)
return &Filter{k: k, b: make([]byte, m/8), mask: m - 1}
return &Filter{
k: k,
b: make([]byte, m/8),
mask: m - 1,
hashPool: pool.NewGeneric(16, func(sz int) interface{} {
return murmur3.New128()
}),
}
} }
// NewFilterBuffer returns a new instance of a filter using a backing buffer. // NewFilterBuffer returns a new instance of a filter using a backing buffer.
@ -45,15 +36,7 @@ func NewFilterBuffer(buf []byte, k uint64) (*Filter, error) {
if m != uint64(len(buf))*8 { if m != uint64(len(buf))*8 {
return nil, fmt.Errorf("bloom.Filter: buffer bit count must a power of two: %d/%d", len(buf)*8, m) return nil, fmt.Errorf("bloom.Filter: buffer bit count must a power of two: %d/%d", len(buf)*8, m)
} }
return &Filter{k: k, b: buf, mask: m - 1}, nil
return &Filter{
k: k,
b: buf,
mask: m - 1,
hashPool: pool.NewGeneric(16, func(sz int) interface{} {
return murmur3.New128()
}),
}, nil
} }
// Len returns the number of bits used in the filter. // Len returns the number of bits used in the filter.
@ -67,7 +50,7 @@ func (f *Filter) Bytes() []byte { return f.b }
// Clone returns a copy of f. // Clone returns a copy of f.
func (f *Filter) Clone() *Filter { func (f *Filter) Clone() *Filter {
other := &Filter{k: f.k, b: make([]byte, len(f.b)), mask: f.mask, hashPool: f.hashPool} other := &Filter{k: f.k, b: make([]byte, len(f.b)), mask: f.mask}
copy(other.b, f.b) copy(other.b, f.b)
return other return other
} }
@ -116,21 +99,22 @@ func (f *Filter) Merge(other *Filter) error {
return nil return nil
} }
// location returns the ith hashed location using the four base hash values. // location returns the ith hashed location using two hash values.
func (f *Filter) location(h [4]uint64, i uint64) uint { func (f *Filter) location(h [2]uint64, i uint64) uint {
return uint((h[i%2] + i*h[2+(((i+(i%2))%4)/2)]) & f.mask) return uint((h[0] + h[1]*i) & f.mask)
} }
// hash returns a set of 4 based hashes. // hash returns two 64-bit hashes based on the output of xxhash.
func (f *Filter) hash(data []byte) [4]uint64 { func (f *Filter) hash(data []byte) [2]uint64 {
h := f.hashPool.Get(0).(murmur3.Hash128) v1 := xxhash.Sum64(data)
defer f.hashPool.Put(h) var v2 uint64
h.Reset() if len(data) > 0 {
h.Write(data) b := data[len(data)-1] // We'll put the original byte back.
v1, v2 := h.Sum128() data[len(data)-1] = byte(0)
h.Write([]byte{1}) v2 = xxhash.Sum64(data)
v3, v4 := h.Sum128() data[len(data)-1] = b
return [4]uint64{v1, v2, v3, v4} }
return [2]uint64{v1, v2}
} }
// Estimate returns an estimated bit count and hash count given the element count and false positive rate. // Estimate returns an estimated bit count and hash count given the element count and false positive rate.

View File

@ -1,6 +1,7 @@
package bloom_test package bloom_test
import ( import (
"encoding/binary"
"fmt" "fmt"
"testing" "testing"
@ -9,46 +10,97 @@ import (
// Ensure filter can insert values and verify they exist. // Ensure filter can insert values and verify they exist.
func TestFilter_InsertContains(t *testing.T) { func TestFilter_InsertContains(t *testing.T) {
f := bloom.NewFilter(1000, 4) // Short, less comprehensive test.
testShortFilter_InsertContains(t)
// Insert value and validate. if testing.Short() {
f.Insert([]byte("Bess")) return // Just run the above short test
if !f.Contains([]byte("Bess")) {
t.Fatal("expected true")
} }
// Insert another value and test. // More comprehensive test for the xxhash based Bloom Filter.
f.Insert([]byte("Emma"))
if !f.Contains([]byte("Emma")) { // These parameters will result, for 10M entries, with a bloom filter
t.Fatal("expected true") // with 0.001 false positive rate (1 in 1000 values will be incorrectly
// identified as being present in the set).
filter := bloom.NewFilter(143775876, 10)
v := make([]byte, 4, 4)
for i := 0; i < 10000000; i++ {
binary.BigEndian.PutUint32(v, uint32(i))
filter.Insert(v)
} }
// Validate that a non-existent value doesn't exist. // None of the values inserted should ever be considered "not possibly in
if f.Contains([]byte("Jane")) { // the filter".
t.Fatal("expected false") t.Run("100M", func(t *testing.T) {
} for i := 0; i < 10000000; i++ {
binary.BigEndian.PutUint32(v, uint32(i))
if !filter.Contains(v) {
t.Fatalf("got false for value %q, expected true", v)
}
}
// If we check for 100,000,000 values that we know are not present in the
// filter then we might expect around 100,000 of them to be false positives.
var fp int
for i := 10000000; i < 110000000; i++ {
binary.BigEndian.PutUint32(v, uint32(i))
if filter.Contains(v) {
fp++
}
}
if fp > 1000000 {
// If we're an order of magnitude off, then it's arguable that there
// is a bug in the bloom filter.
t.Fatalf("got %d false positives which is an error rate of %f, expected error rate <=0.001", fp, float64(fp)/100000000)
}
t.Logf("Bloom false positive error rate was %f", float64(fp)/100000000)
})
}
// testShortFilter_InsertContains runs a quick "short" subtest: it inserts two
// names into a small filter, checks that each one is reported as present right
// after its insertion, and then checks that a name which was never inserted is
// reported as absent.
func testShortFilter_InsertContains(t *testing.T) {
	t.Run("short", func(t *testing.T) {
		filter := bloom.NewFilter(1000, 4)

		// Each name is inserted and then immediately looked up, in order.
		for _, name := range []string{"Bess", "Emma"} {
			filter.Insert([]byte(name))
			if present := filter.Contains([]byte(name)); !present {
				t.Fatal("expected true")
			}
		}

		// The test expects a value that was never inserted to be reported as absent.
		if present := filter.Contains([]byte("Jane")); present {
			t.Fatal("expected false")
		}
	})
}
var benchCases = []struct {
m, k uint64
n int
}{
{m: 100, k: 4, n: 1000},
{m: 1000, k: 4, n: 1000},
{m: 10000, k: 4, n: 1000},
{m: 100000, k: 4, n: 1000},
{m: 100, k: 8, n: 1000},
{m: 1000, k: 8, n: 1000},
{m: 10000, k: 8, n: 1000},
{m: 100000, k: 8, n: 1000},
{m: 100, k: 20, n: 1000},
{m: 1000, k: 20, n: 1000},
{m: 10000, k: 20, n: 1000},
{m: 100000, k: 20, n: 1000},
} }
func BenchmarkFilter_Insert(b *testing.B) { func BenchmarkFilter_Insert(b *testing.B) {
cases := []struct { for _, c := range benchCases {
m, k uint64
n int
}{
{m: 100, k: 4, n: 1000},
{m: 1000, k: 4, n: 1000},
{m: 10000, k: 4, n: 1000},
{m: 100000, k: 4, n: 1000},
{m: 100, k: 8, n: 1000},
{m: 1000, k: 8, n: 1000},
{m: 10000, k: 8, n: 1000},
{m: 100000, k: 8, n: 1000},
{m: 100, k: 20, n: 1000},
{m: 1000, k: 20, n: 1000},
{m: 10000, k: 20, n: 1000},
{m: 100000, k: 20, n: 1000},
}
for _, c := range cases {
data := make([][]byte, 0, c.n) data := make([][]byte, 0, c.n)
for i := 0; i < c.n; i++ { for i := 0; i < c.n; i++ {
data = append(data, []byte(fmt.Sprintf("%d", i))) data = append(data, []byte(fmt.Sprintf("%d", i)))
@ -63,31 +115,14 @@ func BenchmarkFilter_Insert(b *testing.B) {
} }
} }
}) })
} }
} }
var okResult bool var okResult bool
func BenchmarkFilter_Contains(b *testing.B) { func BenchmarkFilter_Contains(b *testing.B) {
cases := []struct { for _, c := range benchCases {
m, k uint64
n int
}{
{m: 100, k: 4, n: 1000},
{m: 1000, k: 4, n: 1000},
{m: 10000, k: 4, n: 1000},
{m: 100000, k: 4, n: 1000},
{m: 100, k: 8, n: 1000},
{m: 1000, k: 8, n: 1000},
{m: 10000, k: 8, n: 1000},
{m: 100000, k: 8, n: 1000},
{m: 100, k: 20, n: 1000},
{m: 1000, k: 20, n: 1000},
{m: 10000, k: 20, n: 1000},
{m: 100000, k: 20, n: 1000},
}
for _, c := range cases {
data := make([][]byte, 0, c.n) data := make([][]byte, 0, c.n)
notData := make([][]byte, 0, c.n) notData := make([][]byte, 0, c.n)
for i := 0; i < c.n; i++ { for i := 0; i < c.n; i++ {
@ -120,25 +155,7 @@ func BenchmarkFilter_Contains(b *testing.B) {
} }
func BenchmarkFilter_Merge(b *testing.B) { func BenchmarkFilter_Merge(b *testing.B) {
cases := []struct { for _, c := range benchCases {
m, k uint64
n int
}{
{m: 100, k: 4, n: 1000},
{m: 1000, k: 4, n: 1000},
{m: 10000, k: 4, n: 1000},
{m: 100000, k: 4, n: 1000},
{m: 100, k: 8, n: 1000},
{m: 1000, k: 8, n: 1000},
{m: 10000, k: 8, n: 1000},
{m: 100000, k: 8, n: 1000},
{m: 100, k: 20, n: 1000},
{m: 1000, k: 20, n: 1000},
{m: 10000, k: 20, n: 1000},
{m: 100000, k: 20, n: 1000},
}
for _, c := range cases {
data1 := make([][]byte, 0, c.n) data1 := make([][]byte, 0, c.n)
data2 := make([][]byte, 0, c.n) data2 := make([][]byte, 0, c.n)
for i := 0; i < c.n; i++ { for i := 0; i < c.n; i++ {