influxdb/tsdb/tsm1/ring.go

318 lines
8.0 KiB
Go

package tsm1
import (
"fmt"
"sync"
"sync/atomic"
"github.com/cespare/xxhash"
"github.com/influxdata/platform/pkg/bytesutil"
)
// partitions is the maximum number of partitions a ring may be created with:
// newring rejects any requested partition count that is larger than this
// value (or that is not positive).
const partitions = 16
// ring is a structure that maps series keys to entries.
//
// ring is implemented as a crude hash ring: it owns a fixed slice of
// partitions, and the partition responsible for a given series key is always
// found consistently from the hash of that key. Unlike a true hash ring, this
// ring is not resizeable; the number of partitions is fixed when the ring is
// created, and newring accepts at most `partitions` (16) of them.
//
// Each partition contains a single store, which is a map of series keys to
// entries protected by the partition's own read/write lock.
//
// To determine the partition that a series key belongs to, the key is hashed
// with xxhash and the hash is reduced to an index into the partitions slice.
type ring struct {
// Number of keys within the ring. This is used to provide a hint for
// allocating the return values in keys(). It will not be perfectly accurate
// since it doesn't consider adding duplicate keys, or trying to remove non-
// existent keys; entries created through write() are not counted either.
// It is updated with sync/atomic by add and remove.
// NOTE(review): kept as the first field, presumably so 64-bit atomic
// operations on it stay aligned on 32-bit platforms; confirm before
// reordering fields.
keysHint int64
// The partitions that make up the ring. The slice is allocated once by
// newring and is indexed by the reduced hash of a series key.
partitions []*partition
}
// newring returns a new ring initialised with n empty partitions.
//
// n must lie in the range [1, partitions]; any other value yields a nil ring
// and an error. Every partition has its own lock, so a larger n spreads keys
// across more independently locked stores. Callers are expected to pass a
// power of two, although that is not enforced here.
func newring(n int) (*ring, error) {
	if n <= 0 || n > partitions {
		return nil, fmt.Errorf("invalid number of partitions: %d", n)
	}

	r := &ring{partitions: make([]*partition, n)}

	// Give every slot its own partition with an empty store so that lookups
	// never encounter a nil partition or a nil map.
	for i := range r.partitions {
		r.partitions[i] = &partition{store: make(map[string]*entry)}
	}
	return r, nil
}
// reset resets the ring so it can be reused. Every partition's store is
// replaced with a fresh map (each partition sizes the replacement from the
// number of keys it held), and the key-count hint is set back to zero.
//
// reset is not intended to be called from multiple goroutines at once.
func (r *ring) reset() {
	for _, p := range r.partitions {
		p.reset()
	}
	// keysHint is read and modified through sync/atomic by other methods, so
	// the store here is atomic as well rather than a plain assignment.
	atomic.StoreInt64(&r.keysHint, 0)
}
// getPartition retrieves the hash ring partition associated with the provided
// key.
//
// The key is hashed with xxhash and the hash is reduced modulo the number of
// partitions this ring actually holds. Reducing by the ring's own length
// (rather than by the package-wide maximum) keeps the index in range for
// rings that were created with fewer than the maximum number of partitions;
// for a ring holding the maximum number the selected partition is unchanged.
//
// The ring must contain at least one partition.
func (r *ring) getPartition(key []byte) *partition {
	idx := xxhash.Sum64(key) % uint64(len(r.partitions))
	return r.partitions[int(idx)]
}
// entry looks up key in the partition that owns it and returns whatever that
// partition's entry method returns for the key.
// The lookup is delegated to the partition, which takes its read lock.
func (r *ring) entry(key []byte) *entry {
	owner := r.getPartition(key)
	return owner.entry(key)
}
// write hands values to the partition that owns key and returns that
// partition's result unchanged: the bool and error come straight from the
// partition's write method.
// Locking is handled by the partition.
func (r *ring) write(key []byte, values Values) (bool, error) {
	owner := r.getPartition(key)
	return owner.write(key, values)
}
// add stores entry under key in the partition that owns the key, then bumps
// the ring's key-count hint by one. The hint is incremented unconditionally,
// even when the key was already present.
func (r *ring) add(key []byte, entry *entry) {
	owner := r.getPartition(key)
	owner.add(key, entry)
	atomic.AddInt64(&r.keysHint, 1)
}
// remove deletes the entry for the given key from the partition that owns it
// and decrements the ring's key-count hint, never letting the hint drop below
// zero. The hint is decremented whether or not the key was actually present.
// remove is safe for use by multiple goroutines.
func (r *ring) remove(key []byte) {
	r.getPartition(key).remove(key)

	// Decrement with a compare-and-swap loop so that the "is it positive?"
	// check and the decrement form one atomic step. A separate check followed
	// by an add lets concurrent callers push the hint negative, and keys()
	// uses the hint as a slice capacity, which must not be negative.
	for {
		cur := atomic.LoadInt64(&r.keysHint)
		if cur <= 0 {
			return
		}
		if atomic.CompareAndSwapInt64(&r.keysHint, cur, cur-1) {
			return
		}
	}
}
// keys gathers the keys reported by every partition into a single slice,
// visiting the partitions in slice order. The result is pre-sized from the
// ring's key-count hint. When sorted is true the combined slice is ordered
// with bytesutil.Sort before being returned; otherwise it is returned in
// collection order.
func (r *ring) keys(sorted bool) [][]byte {
	hint := atomic.LoadInt64(&r.keysHint)
	all := make([][]byte, 0, hint)

	for i := range r.partitions {
		all = append(all, r.partitions[i].keys()...)
	}

	if !sorted {
		return all
	}
	bytesutil.Sort(all)
	return all
}
// count returns the sum of the counts reported by every partition in the
// ring.
func (r *ring) count() int {
	total := 0
	for i := range r.partitions {
		total += r.partitions[i].count()
	}
	return total
}
// apply invokes f for every key/entry pair in the ring, using one goroutine
// per partition. Each goroutine holds its partition's read lock while it
// iterates and stops at the first error f returns for that partition.
//
// apply returns the first error it receives from any partition, or nil when
// every partition finished without error. It returns as soon as that first
// error arrives, so goroutines for other partitions may still be running f
// at that point.
func (r *ring) apply(f func([]byte, *entry) error) error {
	// One slot per partition: a worker sends at most one error, so sends
	// never block even when nobody is receiving any more.
	errs := make(chan error, len(r.partitions))

	var pending sync.WaitGroup
	pending.Add(len(r.partitions))

	for i := range r.partitions {
		go func(part *partition) {
			defer pending.Done()

			part.mu.RLock()
			defer part.mu.RUnlock()

			for name, ent := range part.store {
				if err := f([]byte(name), ent); err != nil {
					errs <- err
					return
				}
			}
		}(r.partitions[i])
	}

	// Close the channel once every worker is done so the receive below
	// terminates when no partition reported an error.
	go func() {
		pending.Wait()
		close(errs)
	}()

	// Workers only ever send non-nil errors, so the first value received is
	// the error to report; a closed, empty channel means success.
	if err, ok := <-errs; ok {
		return err
	}
	return nil
}
// applySerial is similar to apply, but visits the partitions one after
// another in the calling goroutine. Entries whose count() is zero are
// skipped. Each partition's read lock is held while its entries are visited,
// and the first error returned by f aborts the walk and is returned.
func (r *ring) applySerial(f func([]byte, *entry) error) error {
	// visit walks a single partition under its read lock.
	visit := func(part *partition) error {
		part.mu.RLock()
		defer part.mu.RUnlock()

		for name, ent := range part.store {
			if ent.count() != 0 {
				if err := f([]byte(name), ent); err != nil {
					return err
				}
			}
		}
		return nil
	}

	for i := range r.partitions {
		if err := visit(r.partitions[i]); err != nil {
			return err
		}
	}
	return nil
}
// split distributes the ring's partitions over n new rings and returns them
// as storers. Partition i of the receiver is placed, at the same index i, in
// ring number i%n. Every returned ring has as many partition slots as the
// receiver, and slots that did not receive one of the receiver's partitions
// keep the empty partition newring created for them.
//
// The partitions are shared, not copied: the returned rings reference the
// same partition values as the receiver. n must be positive.
func (r *ring) split(n int) []storer {
	storers := make([]storer, n)
	for i := range storers {
		// The error is deliberately discarded: newring only rejects sizes
		// outside [1, partitions], which cannot happen for a receiver that
		// was itself built by newring.
		storers[i], _ = newring(len(r.partitions))
	}

	for i, p := range r.partitions {
		dst := storers[i%n].(*ring)
		dst.partitions[i] = p
	}
	return storers
}
// partition provides safe access to a map of series keys to entries.
type partition struct {
// mu guards store: the partition's methods take it as a read lock for
// lookups and iteration, and as a write lock when the map is modified or
// replaced.
mu sync.RWMutex
// store maps a series key (converted to a string) to its entry.
store map[string]*entry
}
// entry returns the value stored for key in the partition, or nil when the
// key is absent. The lookup runs under the partition's read lock.
func (p *partition) entry(key []byte) *entry {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.store[string(key)]
}
// write adds values to the entry stored under key, creating the entry when
// none exists yet.
//
// The returned bool is true only when a new entry was created and stored. An
// error from adding to an existing entry, or from building the new entry, is
// returned as-is; in the latter case nothing is stored.
func (p *partition) write(key []byte, values Values) (bool, error) {
	// Hot path: the entry already exists, so a read-locked lookup suffices
	// and the values are added after the read lock has been released.
	if found := p.entry(key); found != nil {
		return false, found.add(values)
	}

	p.mu.Lock()
	defer p.mu.Unlock()

	// Look again under the write lock: another writer may have stored an
	// entry for this key between the lookup above and acquiring the lock.
	found := p.store[string(key)]
	if found == nil {
		created, err := newEntryValues(values)
		if err != nil {
			return false, err
		}
		p.store[string(key)] = created
		return true, nil
	}
	return false, found.add(values)
}
// add stores entry under key in the partition while holding the write lock,
// replacing any value previously stored for that key.
func (p *partition) add(key []byte, entry *entry) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.store[string(key)] = entry
}
// remove drops key from the partition's store while holding the write lock.
// Removing a key that is not present is a no-op.
func (p *partition) remove(key []byte) {
	p.mu.Lock()
	defer p.mu.Unlock()
	delete(p.store, string(key))
}
// keys returns, in no particular order, the keys of the partition whose
// entries report a non-zero count(). The store is read under the partition's
// read lock, and each key is returned as a fresh byte slice.
func (p *partition) keys() [][]byte {
	p.mu.RLock()
	defer p.mu.RUnlock()

	out := make([][]byte, 0, len(p.store))
	for name, ent := range p.store {
		if ent.count() != 0 {
			out = append(out, []byte(name))
		}
	}
	return out
}
// reset replaces the partition's store with a new, empty map whose initial
// capacity is the number of keys the old store held. It returns nothing.
func (p *partition) reset() {
	// Read the current size under the read lock only.
	p.mu.RLock()
	hint := len(p.store)
	p.mu.RUnlock()

	// The replacement map is allocated while no lock is held.
	fresh := make(map[string]*entry, hint)

	p.mu.Lock()
	defer p.mu.Unlock()
	p.store = fresh
}
// count returns how many entries in the partition report a count() greater
// than zero. The store is read under the partition's read lock.
func (p *partition) count() int {
	p.mu.RLock()
	defer p.mu.RUnlock()

	populated := 0
	for _, ent := range p.store {
		if ent.count() <= 0 {
			continue
		}
		populated++
	}
	return populated
}