318 lines
8.0 KiB
Go
318 lines
8.0 KiB
Go
package tsm1
|
|
|
|
import (
|
|
"fmt"
|
|
"sync"
|
|
"sync/atomic"
|
|
|
|
"github.com/cespare/xxhash"
|
|
"github.com/influxdata/platform/pkg/bytesutil"
|
|
)
|
|
|
|
// partitions is the largest number of partitions a ring may be created with;
// newring rejects any requested partition count above this value.
const partitions = 1 << 4
|
|
|
|
// ring is a structure that maps series keys to entries.
|
|
//
|
|
// ring is implemented as a crude hash ring, in so much that you can have
|
|
// variable numbers of members in the ring, and the appropriate member for a
|
|
// given series key can always consistently be found. Unlike a true hash ring
|
|
// though, this ring is not resizeable—there must be at most 256 members in the
|
|
// ring, and the number of members must always be a power of 2.
|
|
//
|
|
// ring works as follows: Each member of the ring contains a single store, which
|
|
// contains a map of series keys to entries. A ring always has 256 partitions,
|
|
// and a member takes up one or more of these partitions (depending on how many
|
|
// members are specified to be in the ring)
|
|
//
|
|
// To determine the partition that a series key should be added to, the series
|
|
// key is hashed and the first 8 bits are used as an index to the ring.
|
|
//
|
|
type ring struct {
|
|
// Number of keys within the ring. This is used to provide a hint for
|
|
// allocating the return values in keys(). It will not be perfectly accurate
|
|
// since it doesn't consider adding duplicate keys, or trying to remove non-
|
|
// existent keys.
|
|
keysHint int64
|
|
|
|
// The unique set of partitions in the ring.
|
|
// len(partitions) <= len(continuum)
|
|
partitions []*partition
|
|
}
|
|
|
|
// newring returns a new ring initialised with n partitions. n must always be a
|
|
// power of 2, and for performance reasons should be larger than the number of
|
|
// cores on the host. The supported set of values for n is:
|
|
//
|
|
// {1, 2, 4, 8, 16, 32, 64, 128, 256}.
|
|
//
|
|
func newring(n int) (*ring, error) {
|
|
if n <= 0 || n > partitions {
|
|
return nil, fmt.Errorf("invalid number of paritions: %d", n)
|
|
}
|
|
|
|
r := ring{
|
|
partitions: make([]*partition, n), // maximum number of partitions.
|
|
}
|
|
|
|
// The trick here is to map N partitions to all points on the continuum,
|
|
// such that the first eight bits of a given hash will map directly to one
|
|
// of the N partitions.
|
|
for i := 0; i < len(r.partitions); i++ {
|
|
r.partitions[i] = &partition{
|
|
store: make(map[string]*entry),
|
|
}
|
|
}
|
|
return &r, nil
|
|
}
|
|
|
|
// reset resets the ring so it can be reused. Before removing references to entries
|
|
// within each partition it gathers sizing information to provide hints when
|
|
// reallocating entries in partition maps.
|
|
//
|
|
// reset is not safe for use by multiple goroutines.
|
|
func (r *ring) reset() {
|
|
for _, partition := range r.partitions {
|
|
partition.reset()
|
|
}
|
|
r.keysHint = 0
|
|
}
|
|
|
|
// getPartition retrieves the hash ring partition associated with the provided
|
|
// key.
|
|
func (r *ring) getPartition(key []byte) *partition {
|
|
return r.partitions[int(xxhash.Sum64(key)%partitions)]
|
|
}
|
|
|
|
// entry returns the entry for the given key.
|
|
// entry is safe for use by multiple goroutines.
|
|
func (r *ring) entry(key []byte) *entry {
|
|
return r.getPartition(key).entry(key)
|
|
}
|
|
|
|
// write writes values to the entry in the ring's partition associated with key.
|
|
// If no entry exists for the key then one will be created.
|
|
// write is safe for use by multiple goroutines.
|
|
func (r *ring) write(key []byte, values Values) (bool, error) {
|
|
return r.getPartition(key).write(key, values)
|
|
}
|
|
|
|
// add adds an entry to the ring.
|
|
func (r *ring) add(key []byte, entry *entry) {
|
|
r.getPartition(key).add(key, entry)
|
|
atomic.AddInt64(&r.keysHint, 1)
|
|
}
|
|
|
|
// remove deletes the entry for the given key.
|
|
// remove is safe for use by multiple goroutines.
|
|
func (r *ring) remove(key []byte) {
|
|
r.getPartition(key).remove(key)
|
|
if r.keysHint > 0 {
|
|
atomic.AddInt64(&r.keysHint, -1)
|
|
}
|
|
}
|
|
|
|
// keys returns all the keys from all partitions in the hash ring. The returned
|
|
// keys will be in order if sorted is true.
|
|
func (r *ring) keys(sorted bool) [][]byte {
|
|
keys := make([][]byte, 0, atomic.LoadInt64(&r.keysHint))
|
|
for _, p := range r.partitions {
|
|
keys = append(keys, p.keys()...)
|
|
}
|
|
|
|
if sorted {
|
|
bytesutil.Sort(keys)
|
|
}
|
|
return keys
|
|
}
|
|
|
|
func (r *ring) count() int {
|
|
var n int
|
|
for _, p := range r.partitions {
|
|
n += p.count()
|
|
}
|
|
return n
|
|
}
|
|
|
|
// apply applies the provided function to every entry in the ring under a read
|
|
// lock using a separate goroutine for each partition. The provided function
|
|
// will be called with each key and the corresponding entry. The first error
|
|
// encountered will be returned, if any. apply is safe for use by multiple
|
|
// goroutines.
|
|
func (r *ring) apply(f func([]byte, *entry) error) error {
|
|
|
|
var (
|
|
wg sync.WaitGroup
|
|
res = make(chan error, len(r.partitions))
|
|
)
|
|
|
|
for _, p := range r.partitions {
|
|
wg.Add(1)
|
|
|
|
go func(p *partition) {
|
|
defer wg.Done()
|
|
|
|
p.mu.RLock()
|
|
for k, e := range p.store {
|
|
if err := f([]byte(k), e); err != nil {
|
|
res <- err
|
|
p.mu.RUnlock()
|
|
return
|
|
}
|
|
}
|
|
p.mu.RUnlock()
|
|
}(p)
|
|
}
|
|
|
|
go func() {
|
|
wg.Wait()
|
|
close(res)
|
|
}()
|
|
|
|
// Collect results.
|
|
for err := range res {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// applySerial is similar to apply, but invokes f on each partition in the same
|
|
// goroutine.
|
|
// apply is safe for use by multiple goroutines.
|
|
func (r *ring) applySerial(f func([]byte, *entry) error) error {
|
|
for _, p := range r.partitions {
|
|
p.mu.RLock()
|
|
for k, e := range p.store {
|
|
if e.count() == 0 {
|
|
continue
|
|
}
|
|
if err := f([]byte(k), e); err != nil {
|
|
p.mu.RUnlock()
|
|
return err
|
|
}
|
|
}
|
|
p.mu.RUnlock()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (r *ring) split(n int) []storer {
|
|
var keys int
|
|
storers := make([]storer, n)
|
|
for i := 0; i < n; i++ {
|
|
storers[i], _ = newring(len(r.partitions))
|
|
}
|
|
|
|
for i, p := range r.partitions {
|
|
r := storers[i%n].(*ring)
|
|
r.partitions[i] = p
|
|
keys += len(p.store)
|
|
}
|
|
return storers
|
|
}
|
|
|
|
// partition provides safe access to a map of series keys to entries.
|
|
type partition struct {
|
|
mu sync.RWMutex
|
|
store map[string]*entry
|
|
}
|
|
|
|
// entry returns the partition's entry for the provided key.
|
|
// It's safe for use by multiple goroutines.
|
|
func (p *partition) entry(key []byte) *entry {
|
|
p.mu.RLock()
|
|
e := p.store[string(key)]
|
|
p.mu.RUnlock()
|
|
return e
|
|
}
|
|
|
|
// write writes the values to the entry in the partition, creating the entry
|
|
// if it does not exist.
|
|
// write is safe for use by multiple goroutines.
|
|
func (p *partition) write(key []byte, values Values) (bool, error) {
|
|
p.mu.RLock()
|
|
e := p.store[string(key)]
|
|
p.mu.RUnlock()
|
|
if e != nil {
|
|
// Hot path.
|
|
return false, e.add(values)
|
|
}
|
|
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
|
|
// Check again.
|
|
if e = p.store[string(key)]; e != nil {
|
|
return false, e.add(values)
|
|
}
|
|
|
|
// Create a new entry using a preallocated size if we have a hint available.
|
|
e, err := newEntryValues(values)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
p.store[string(key)] = e
|
|
return true, nil
|
|
}
|
|
|
|
// add adds a new entry for key to the partition.
|
|
func (p *partition) add(key []byte, entry *entry) {
|
|
p.mu.Lock()
|
|
p.store[string(key)] = entry
|
|
p.mu.Unlock()
|
|
}
|
|
|
|
// remove deletes the entry associated with the provided key.
|
|
// remove is safe for use by multiple goroutines.
|
|
func (p *partition) remove(key []byte) {
|
|
p.mu.Lock()
|
|
delete(p.store, string(key))
|
|
p.mu.Unlock()
|
|
}
|
|
|
|
// keys returns an unsorted slice of the keys in the partition.
|
|
func (p *partition) keys() [][]byte {
|
|
p.mu.RLock()
|
|
keys := make([][]byte, 0, len(p.store))
|
|
for k, v := range p.store {
|
|
if v.count() == 0 {
|
|
continue
|
|
}
|
|
keys = append(keys, []byte(k))
|
|
}
|
|
p.mu.RUnlock()
|
|
return keys
|
|
}
|
|
|
|
// reset resets the partition by reinitialising the store. reset returns hints
|
|
// about sizes that the entries within the store could be reallocated with.
|
|
func (p *partition) reset() {
|
|
p.mu.RLock()
|
|
sz := len(p.store)
|
|
p.mu.RUnlock()
|
|
|
|
newStore := make(map[string]*entry, sz)
|
|
p.mu.Lock()
|
|
p.store = newStore
|
|
p.mu.Unlock()
|
|
}
|
|
|
|
func (p *partition) count() int {
|
|
var n int
|
|
p.mu.RLock()
|
|
for _, v := range p.store {
|
|
if v.count() > 0 {
|
|
n++
|
|
}
|
|
}
|
|
p.mu.RUnlock()
|
|
return n
|
|
|
|
}
|