137 lines
3.7 KiB
Go
137 lines
3.7 KiB
Go
package bloom
|
|
|
|
// NOTE:
|
|
// This package implements a limited bloom filter based on
|
|
// Will Fitzgerald's bloom & bitset packages. It uses a zero-allocation xxhash
|
|
// implementation, rather than murmur3. It's implemented locally to support
|
|
// zero-copy memory-mapped slices.
|
|
//
|
|
// This also optimizes the filter by always using a bitset size that is a power of 2.
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
|
|
"github.com/cespare/xxhash"
|
|
)
|
|
|
|
// Filter is a bloom filter stored in a byte slice. The constructors in this
// file always set mask to the bit count minus one, with the bit count being a
// power of two.
type Filter struct {
	// k is the number of hash locations set or probed for each value.
	k uint64
	// b is the backing bitset: bit i is stored in b[i>>3] at position i&7.
	b []byte
	// mask reduces a combined hash to a bit index via a bitwise AND.
	mask uint64
}
|
|
|
|
// NewFilter returns a new instance of Filter using m bits and k hash functions.
|
|
// If m is not a power of two then it is rounded to the next highest power of 2.
|
|
func NewFilter(m uint64, k uint64) *Filter {
|
|
m = pow2(m)
|
|
return &Filter{k: k, b: make([]byte, m>>3), mask: m - 1}
|
|
}
|
|
|
|
// NewFilterBuffer returns a new instance of a filter using a backing buffer.
|
|
// The buffer length MUST be a power of 2.
|
|
func NewFilterBuffer(buf []byte, k uint64) (*Filter, error) {
|
|
m := pow2(uint64(len(buf)) * 8)
|
|
if m != uint64(len(buf))*8 {
|
|
return nil, fmt.Errorf("bloom.Filter: buffer bit count must a power of two: %d/%d", len(buf)*8, m)
|
|
}
|
|
return &Filter{k: k, b: buf, mask: m - 1}, nil
|
|
}
|
|
|
|
// Len returns the number of bits used in the filter.
|
|
func (f *Filter) Len() uint { return uint(len(f.b)) }
|
|
|
|
// K returns the number of hash functions used in the filter.
|
|
func (f *Filter) K() uint64 { return f.k }
|
|
|
|
// Bytes returns the underlying backing slice.
|
|
func (f *Filter) Bytes() []byte { return f.b }
|
|
|
|
// Clone returns a copy of f.
|
|
func (f *Filter) Clone() *Filter {
|
|
other := &Filter{k: f.k, b: make([]byte, len(f.b)), mask: f.mask}
|
|
copy(other.b, f.b)
|
|
return other
|
|
}
|
|
|
|
// Insert inserts data to the filter.
|
|
func (f *Filter) Insert(v []byte) {
|
|
h := f.hash(v)
|
|
for i := uint64(0); i < f.k; i++ {
|
|
loc := f.location(h, i)
|
|
f.b[loc>>3] |= 1 << (loc & 7)
|
|
}
|
|
}
|
|
|
|
// Contains returns true if the filter possibly contains v.
|
|
// Returns false if the filter definitely does not contain v.
|
|
func (f *Filter) Contains(v []byte) bool {
|
|
h := f.hash(v)
|
|
for i := uint64(0); i < f.k; i++ {
|
|
loc := f.location(h, i)
|
|
if f.b[loc>>3]&(1<<(loc&7)) == 0 {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// Merge performs an in-place union of other into f.
|
|
// Returns an error if m or k of the filters differs.
|
|
func (f *Filter) Merge(other *Filter) error {
|
|
if other == nil {
|
|
return nil
|
|
}
|
|
|
|
// Ensure m & k fields match.
|
|
if len(f.b) != len(other.b) {
|
|
return fmt.Errorf("bloom.Filter.Merge(): m mismatch: %d <> %d", len(f.b), len(other.b))
|
|
} else if f.k != other.k {
|
|
return fmt.Errorf("bloom.Filter.Merge(): k mismatch: %d <> %d", f.b, other.b)
|
|
}
|
|
|
|
// Perform union of each byte.
|
|
for i := range f.b {
|
|
f.b[i] |= other.b[i]
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// location returns the ith hashed location using two hash values.
|
|
func (f *Filter) location(h [2]uint64, i uint64) uint {
|
|
return uint((h[0] + h[1]*i) & f.mask)
|
|
}
|
|
|
|
// hash returns two 64-bit hashes based on the output of xxhash.
|
|
func (f *Filter) hash(data []byte) [2]uint64 {
|
|
v1 := xxhash.Sum64(data)
|
|
var v2 uint64
|
|
if len(data) > 0 {
|
|
b := data[len(data)-1] // We'll put the original byte back.
|
|
data[len(data)-1] = byte(0)
|
|
v2 = xxhash.Sum64(data)
|
|
data[len(data)-1] = b
|
|
}
|
|
return [2]uint64{v1, v2}
|
|
}
|
|
|
|
// Estimate returns an estimated bit count m and hash count k for a filter
// expected to hold n elements with false positive rate p:
//
//	m = ceil(-n * ln(p) / ln(2)^2)
//	k = ceil(ln(2) * m / n)
//
// If n is zero, or p is NaN or outside the open interval (0, 1), the formulas
// yield NaN, infinite or negative values whose conversion to uint64 is
// implementation-dependent, so Estimate returns (0, 0) for those inputs.
// Estimates too large for uint64 are not guarded against.
func Estimate(n uint64, p float64) (m uint64, k uint64) {
	// Phrased as a negated conjunction so that a NaN p is rejected too.
	if n == 0 || !(p > 0 && p < 1) {
		return 0, 0
	}

	m = uint64(math.Ceil(-1 * float64(n) * math.Log(p) / math.Pow(math.Log(2), 2)))
	k = uint64(math.Ceil(math.Log(2) * float64(m) / float64(n)))
	return m, k
}
|
|
|
|
// pow2 returns the smallest power of two that is greater than or equal to v,
// with a floor of 8: any v up to 8 yields 8, and a v that is already a power
// of two of at least 8 is returned unchanged.
//
// The largest value it can return is 1<<61; it panics for any larger v.
func pow2(v uint64) uint64 {
	// Walk the candidate powers 1<<3 through 1<<61 in increasing order.
	for shift := uint(3); shift < 62; shift++ {
		if candidate := uint64(1) << shift; candidate >= v {
			return candidate
		}
	}
	panic("unreachable")
}
|