package tsm1

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"math"
	"sort"
	"sync"

	"go.uber.org/zap"
)

// TSMIndex represents the index section of a TSM file. The index records all
// blocks, their locations, sizes, and min and max times.
type TSMIndex interface {
	// Delete removes the given keys from the index.
	Delete(keys [][]byte)

	// DeleteRange removes the given keys with data between minTime and maxTime from the index.
	DeleteRange(keys [][]byte, minTime, maxTime int64)

	// DeletePrefix removes keys that begin with the given prefix with data between minTime and
	// maxTime from the index.
	DeletePrefix(prefix []byte, minTime, maxTime int64)

	// ContainsKey returns true if the given key may exist in the index. This function is
	// faster than Contains but may return false positives.
	ContainsKey(key []byte) bool

	// Contains returns true if the given key exists in the index.
	Contains(key []byte) bool

	// ContainsValue returns true if the key and time might exist in this file. This function
	// could return true even though the actual point does not exist. For example, the key may
	// exist in this file, but not have a point exactly at time t.
	ContainsValue(key []byte, timestamp int64) bool

	// ReadEntries reads the index entries for key into entries.
	ReadEntries(key []byte, entries []IndexEntry) ([]IndexEntry, error)

	// Entry returns the index entry for the specified key and timestamp. If no entry
	// matches the key and timestamp, nil is returned.
	Entry(key []byte, timestamp int64) *IndexEntry

	// KeyCount returns the count of unique keys in the index.
	KeyCount() int

	// Iterator returns an iterator over the keys starting at the provided key. You must
	// call Next before calling any of the accessors.
	Iterator([]byte) *TSMIndexIterator

	// OverlapsTimeRange returns true if the time range of the file intersects min and max.
	OverlapsTimeRange(min, max int64) bool

	// OverlapsKeyRange returns true if the min and max keys of the file overlap the arguments min and max.
	OverlapsKeyRange(min, max []byte) bool

	// Size returns the size of the current index in bytes.
	Size() uint32

	// TimeRange returns the min and max time across all keys in the file.
	TimeRange() (int64, int64)

	// TombstoneRange returns ranges of time that are deleted for the given key.
	TombstoneRange(key []byte, buf []TimeRange) []TimeRange

	// KeyRange returns the min and max keys in the file.
	KeyRange() ([]byte, []byte)

	// Type returns the block type of the values stored for the key. Returns one of
	// BlockFloat64, BlockInt64, BlockBool, BlockString. If the key does not exist,
	// an error is returned.
	Type(key []byte) (byte, error)

	// UnmarshalBinary populates an index from an encoded byte slice
	// representation of an index.
	UnmarshalBinary(b []byte) error

	// Close closes the index and releases any resources.
	Close() error
}
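
// A minimal usage sketch (illustrative only, not part of this file): walking
// every key in an index, assuming the TSMIndexIterator exposes Next and Key
// accessors as the comment on Iterator describes.
//
//	func dumpKeys(idx TSMIndex) {
//		iter := idx.Iterator(nil)
//		for iter.Next() {
//			fmt.Printf("%s\n", iter.Key())
//		}
//	}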

// indirectIndex is a TSMIndex that uses a raw byte slice representation of an index. This
// implementation can be used for indexes that may be MMAPed into memory.
type indirectIndex struct {
	mu     sync.RWMutex
	logger *zap.Logger

	// indirectIndex works as follows. Assume we have an index structure in memory as
	// in the diagram below:
	//
	// ┌────────────────────────────────────────────────────────────────────┐
	// │                               Index                                │
	// ├─┬──────────────────────┬──┬───────────────────────┬───┬────────────┘
	// │0│                      │62│                       │145│
	// ├─┴───────┬─────────┬────┼──┴──────┬─────────┬──────┼───┴─────┬──────┐
	// │Key 1 Len│   Key   │... │Key 2 Len│  Key 2  │ ...  │  Key 3  │ ...  │
	// │ 2 bytes │ N bytes │    │ 2 bytes │ N bytes │      │ 2 bytes │      │
	// └─────────┴─────────┴────┴─────────┴─────────┴──────┴─────────┴──────┘
	//
	// We build an `offsets` slice where each element points to the byte location
	// of each key in the index slice.
	//
	// ┌────────────────────────────────────────────────────────────────────┐
	// │                              Offsets                               │
	// ├────┬────┬────┬─────────────────────────────────────────────────────┘
	// │ 0  │ 62 │145 │
	// └────┴────┴────┘
	//
	// Using this offsets slice we can find `Key 2` by doing a binary search
	// over the offsets slice. Instead of comparing the value in the offsets
	// (e.g. `62`) directly, we use it as an index into the underlying index to
	// retrieve the key at position `62` and perform our comparisons with that.
	//
	// When we have identified the correct position in the index for a given
	// key, we could perform another binary search or a linear scan. This
	// should be fast as well, since each index entry is 28 bytes and all are
	// contiguous in memory. The current implementation uses a linear scan since the
	// number of block entries is expected to be < 100 per key.

	// b is the underlying index byte slice. This could be a copy on the heap or an MMAP
	// slice reference.
	b faultBuffer

	// ro contains the positions in b for each key as well as the first bytes of each key
	// to avoid disk seeks.
	ro readerOffsets

	// minKey, maxKey are the minimum and maximum keys (lexicographically sorted) contained
	// in the file.
	minKey, maxKey []byte

	// minTime, maxTime are the minimum and maximum times contained in the file across all
	// series.
	minTime, maxTime int64

	// tombstones contains only the tombstoned keys with a subset of time values deleted. An
	// entry would exist here if a subset of the points for a key were deleted and the file
	// had not been re-compacted to remove the points on disk.
	tombstones map[uint32][]TimeRange

	// prefixTombstones contains the tombstoned keys with a subset of the values deleted that
	// all share the same prefix.
	prefixTombstones *prefixTree
}
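
// A minimal sketch of the indirect binary search described above, assuming a
// standalone slice of offsets rather than the readerOffsets structure this
// file actually uses:
//
//	func search(b []byte, offsets []uint32, target []byte) int {
//		return sort.Search(len(offsets), func(i int) bool {
//			off := offsets[i]
//			n := uint32(binary.BigEndian.Uint16(b[off : off+2])) // 2-byte key length
//			key := b[off+2 : off+2+n]                            // key bytes
//			return bytes.Compare(key, target) >= 0
//		})
//	}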

// NewIndirectIndex returns a new indirect index.
func NewIndirectIndex() *indirectIndex {
	return &indirectIndex{
		tombstones:       make(map[uint32][]TimeRange),
		prefixTombstones: newPrefixTree(),
	}
}

// ContainsKey returns true if the key may exist in this index.
func (d *indirectIndex) ContainsKey(key []byte) bool {
	return bytes.Compare(key, d.minKey) >= 0 && bytes.Compare(key, d.maxKey) <= 0
}

// ReadEntries returns all index entries for a key.
func (d *indirectIndex) ReadEntries(key []byte, entries []IndexEntry) ([]IndexEntry, error) {
	d.mu.RLock()
	defer d.mu.RUnlock()

	iter := d.ro.Iterator()
	exact, _ := iter.Seek(key, &d.b)
	if !exact {
		return nil, nil
	}

	entries, err := readEntries(d.b.access(iter.EntryOffset(&d.b), 0), entries)
	if err != nil {
		return nil, err
	}

	return entries, nil
}

// Entry returns the index entry for the specified key and timestamp. If no entry
// matches the key and timestamp, nil is returned.
func (d *indirectIndex) Entry(key []byte, timestamp int64) *IndexEntry {
	entries, err := d.ReadEntries(key, nil)
	if err != nil {
		d.logger.Error("error reading tsm index key", zap.String("key", fmt.Sprintf("%q", key)))
		return nil
	}
	for _, entry := range entries {
		if entry.Contains(timestamp) {
			return &entry
		}
	}
	return nil
}

// KeyCount returns the count of unique keys in the index.
func (d *indirectIndex) KeyCount() int {
	d.mu.RLock()
	n := len(d.ro.offsets)
	d.mu.RUnlock()
	return n
}

// Iterator returns an iterator over the keys starting at the provided key. You must
// call Next before calling any of the accessors.
func (d *indirectIndex) Iterator(key []byte) *TSMIndexIterator {
	d.mu.RLock()
	iter := d.ro.Iterator()
	_, ok := iter.Seek(key, &d.b)
	ti := &TSMIndexIterator{
		d:     d,
		n:     len(d.ro.offsets),
		b:     &d.b,
		iter:  &iter,
		first: true,
		ok:    ok,
	}
	d.mu.RUnlock()

	return ti
}

// Delete removes the given keys from the index.
func (d *indirectIndex) Delete(keys [][]byte) {
	if len(keys) == 0 {
		return
	}

	d.mu.RLock()
	iter := d.ro.Iterator()
	for _, key := range keys {
		if !iter.Next() || !bytes.Equal(iter.Key(&d.b), key) {
			if exact, _ := iter.Seek(key, &d.b); !exact {
				continue
			}
		}

		delete(d.tombstones, iter.Offset())
		iter.Delete()
	}
	d.mu.RUnlock()

	if !iter.HasDeletes() {
		return
	}

	d.mu.Lock()
	iter.Done()
	d.mu.Unlock()
}

// insertTimeRange adds a time range described by the minTime and maxTime into ts.
func insertTimeRange(ts []TimeRange, minTime, maxTime int64) []TimeRange {
	n := sort.Search(len(ts), func(i int) bool {
		if ts[i].Min == minTime {
			return ts[i].Max >= maxTime
		}
		return ts[i].Min > minTime
	})

	ts = append(ts, TimeRange{})
	copy(ts[n+1:], ts[n:])
	ts[n] = TimeRange{Min: minTime, Max: maxTime}
	return ts
}
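
// For illustration, insertTimeRange keeps the slice sorted by (Min, Max):
//
//	ts := []TimeRange{{Min: 0, Max: 3}, {Min: 7, Max: 9}}
//	ts = insertTimeRange(ts, 5, 10)
//	// ts is now [{0 3} {5 10} {7 9}]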

// pendingTombstone is a type that describes a pending insertion of a tombstone.
type pendingTombstone struct {
	Key         int
	Index       int
	Offset      uint32
	EntryOffset uint32
	Tombstones  int
}

// coversEntries checks if all of the stored tombstones, including one for minTime and maxTime,
// cover all of the index entries. It mutates the entries slice to do the work, so be sure to
// make a copy if you need to keep the original.
func (d *indirectIndex) coversEntries(offset uint32, key []byte, buf []TimeRange,
	entries []IndexEntry, minTime, maxTime int64) ([]TimeRange, bool) {

	// Grab the tombstones from the prefixes. These come out unsorted, so we sort
	// them and place them in the merger field named unsorted.
	buf = d.prefixTombstones.Search(key, buf[:0])
	if len(buf) > 1 {
		sort.Slice(buf, func(i, j int) bool { return buf[i].Less(buf[j]) })
	}

	// Create the merger with the other tombstone entries: the ones for the specific
	// key and the one we have proposed to add.
	merger := timeRangeMerger{
		sorted:   d.tombstones[offset],
		unsorted: buf,
		single:   TimeRange{Min: minTime, Max: maxTime},
		used:     false,
	}

	return buf, timeRangesCoverEntries(merger, entries)
}
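
// For intuition: if a key's entries span [0, 10] and [20, 30], a stored
// tombstone for [0, 10] plus a proposed delete of [15, 35] covers every entry,
// so the caller can drop the key outright instead of recording yet another
// tombstone.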

// DeleteRange removes the given keys with data between minTime and maxTime from the index.
func (d *indirectIndex) DeleteRange(keys [][]byte, minTime, maxTime int64) {
	// If we're deleting everything, we won't need to worry about partial deletes.
	if minTime <= d.minTime && maxTime >= d.maxTime {
		d.Delete(keys)
		return
	}

	// Is the range passed in outside of the time range for the file?
	if minTime > d.maxTime || maxTime < d.minTime {
		return
	}

	// General outline:
	// Under the read lock, determine the set of actions we need to
	// take and on what keys to take them. Then, under the write
	// lock, perform those actions. We keep track of some state
	// during the read lock to make double checking under the
	// write lock cheap.

	d.mu.RLock()
	iter := d.ro.Iterator()
	var (
		ok      bool
		trbuf   []TimeRange
		entries []IndexEntry
		pending []pendingTombstone
		err     error
	)

	for i, key := range keys {
		if !iter.Next() || !bytes.Equal(iter.Key(&d.b), key) {
			if exact, _ := iter.Seek(key, &d.b); !exact {
				continue
			}
		}

		entryOffset := iter.EntryOffset(&d.b)
		entries, err = readEntriesTimes(d.b.access(entryOffset, 0), entries)
		if err != nil {
			// If we have an error reading the entries for a key, we should just pretend
			// the whole key is deleted. Maybe a better idea is to report this up somehow,
			// but that's for another time.
			iter.Delete()
			continue
		}

		// Is the time range passed outside of the time range we have stored for this key?
		min, max := entries[0].MinTime, entries[len(entries)-1].MaxTime
		if minTime > max || maxTime < min {
			continue
		}

		// Does the range passed in cover every value for the key?
		if minTime <= min && maxTime >= max {
			iter.Delete()
			continue
		}

		// Does adding the minTime and maxTime cover the entries?
		offset := iter.Offset()
		trbuf, ok = d.coversEntries(offset, key, trbuf, entries, minTime, maxTime)
		if ok {
			iter.Delete()
			continue
		}

		// Save that we should add a tombstone for this key, and how many tombstones
		// already existed to avoid double checks.
		pending = append(pending, pendingTombstone{
			Key:         i,
			Index:       iter.Index(),
			Offset:      offset,
			EntryOffset: entryOffset,
			Tombstones:  len(d.tombstones[offset]),
		})
	}

	d.mu.RUnlock()

	if len(pending) == 0 && !iter.HasDeletes() {
		return
	}

	d.mu.Lock()
	defer d.mu.Unlock()

	for _, p := range pending {
		// Check the existing tombstones. If the length did not change, then we know
		// that we don't need to double check coverage, since we only ever increase the
		// number of tombstones for a key.
		if trs := d.tombstones[p.Offset]; p.Tombstones == len(trs) {
			d.tombstones[p.Offset] = insertTimeRange(trs, minTime, maxTime)
			continue
		}

		// Since the length changed, we have to do the expensive overlap check again.
		// We re-read the entries under the write lock because this should be rare,
		// happening only during concurrent deletes to the same key. We could make
		// a copy of the entries before getting here, but that penalizes the common
		// case with no concurrency.
		entries, err = readEntriesTimes(d.b.access(p.EntryOffset, 0), entries)
		if err != nil {
			// If we have an error reading the entries for a key, we should just pretend
			// the whole key is deleted. Maybe a better idea is to report this up somehow,
			// but that's for another time.
			delete(d.tombstones, p.Offset)
			iter.SetIndex(p.Index)
			if iter.Offset() == p.Offset {
				iter.Delete()
			}
			continue
		}

		trbuf, ok = d.coversEntries(p.Offset, keys[p.Key], trbuf, entries, minTime, maxTime)
		if ok {
			delete(d.tombstones, p.Offset)
			iter.SetIndex(p.Index)
			if iter.Offset() == p.Offset {
				iter.Delete()
			}
			continue
		}

		// Append the TimeRange into the tombstones.
		trs := d.tombstones[p.Offset]
		d.tombstones[p.Offset] = insertTimeRange(trs, minTime, maxTime)
	}

	iter.Done()
}
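
// A minimal usage sketch (the series key below is purely illustrative):
//
//	keys := [][]byte{[]byte("cpu,host=a#!~#value")}
//	idx.DeleteRange(keys, 0, 1000) // tombstone points in [0, 1000] for the key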

// DeletePrefix removes keys that begin with the given prefix with data between minTime and
// maxTime from the index.
func (d *indirectIndex) DeletePrefix(prefix []byte, minTime, maxTime int64) {
	// If we're deleting everything, we won't need to worry about partial deletes.
	partial := !(minTime <= d.minTime && maxTime >= d.maxTime)

	// Is the range passed in outside of the time range for the file?
	if minTime > d.maxTime || maxTime < d.minTime {
		return
	}

	d.mu.RLock()
	var (
		ok        bool
		trbuf     []TimeRange
		entries   []IndexEntry
		err       error
		mustTrack bool
	)

	// Seek to the earliest key with the prefix, and start iterating. We can't call
	// Next until after we've checked the key, so keep a "first" flag.
	first := true
	iter := d.ro.Iterator()
	for {
		if first {
			if _, ok := iter.Seek(prefix, &d.b); !ok {
				break
			}
		} else if !iter.Next() {
			break
		}
		if !bytes.HasPrefix(iter.Key(&d.b), prefix) {
			break
		}
		first = false

		// If we're not doing a partial delete, we don't need to read the entries and
		// can just delete the key and move on.
		if !partial {
			iter.Delete()
			continue
		}

		entryOffset := iter.EntryOffset(&d.b)
		entries, err = readEntriesTimes(d.b.access(entryOffset, 0), entries)
		if err != nil {
			// If we have an error reading the entries for a key, we should just pretend
			// the whole key is deleted. Maybe a better idea is to report this up somehow,
			// but that's for another time.
			iter.Delete()
			continue
		}

		// Is the time range passed outside the range we have stored for the key?
		min, max := entries[0].MinTime, entries[len(entries)-1].MaxTime
		if minTime > max || maxTime < min {
			continue
		}

		// Does the range passed cover every value for the key?
		if minTime <= min && maxTime >= max {
			iter.Delete()
			continue
		}

		// Does adding the minTime and maxTime cover the entries?
		trbuf, ok = d.coversEntries(iter.Offset(), iter.Key(&d.b), trbuf, entries, minTime, maxTime)
		if ok {
			iter.Delete()
			continue
		}

		// Otherwise, we have to track it in the prefix tombstones list.
		mustTrack = true
	}
	d.mu.RUnlock()

	// Check and abort if nothing needs to be done.
	if !mustTrack && !iter.HasDeletes() {
		return
	}

	d.mu.Lock()
	defer d.mu.Unlock()

	if mustTrack {
		d.prefixTombstones.Append(prefix, TimeRange{Min: minTime, Max: maxTime})
	}

	if iter.HasDeletes() {
		iter.Done()
	}
}
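
// A minimal usage sketch: deleting every point for keys sharing a prefix (the
// prefix below is purely illustrative):
//
//	idx.DeletePrefix([]byte("cpu"), math.MinInt64, math.MaxInt64)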

// TombstoneRange returns ranges of time that are deleted for the given key.
func (d *indirectIndex) TombstoneRange(key []byte, buf []TimeRange) []TimeRange {
	d.mu.RLock()
	rs := d.prefixTombstones.Search(key, buf[:0])
	iter := d.ro.Iterator()
	exact, _ := iter.Seek(key, &d.b)
	if exact {
		rs = append(rs, d.tombstones[iter.Offset()]...)
	}
	d.mu.RUnlock()
	return rs
}

// Contains returns true if the given key exists in the index.
func (d *indirectIndex) Contains(key []byte) bool {
	d.mu.RLock()
	iter := d.ro.Iterator()
	exact, _ := iter.Seek(key, &d.b)
	d.mu.RUnlock()
	return exact
}

// ContainsValue returns true if the key and time might exist in this file.
func (d *indirectIndex) ContainsValue(key []byte, timestamp int64) bool {
	d.mu.RLock()
	defer d.mu.RUnlock()

	iter := d.ro.Iterator()
	exact, _ := iter.Seek(key, &d.b)
	if !exact {
		return false
	}

	for _, t := range d.tombstones[iter.Offset()] {
		if t.Min <= timestamp && timestamp <= t.Max {
			return false
		}
	}

	if d.prefixTombstones.checkOverlap(key, timestamp) {
		return false
	}

	entries, err := d.ReadEntries(key, nil)
	if err != nil {
		d.logger.Error("error reading tsm index key", zap.String("key", fmt.Sprintf("%q", key)))
		return false
	}

	for _, entry := range entries {
		if entry.Contains(timestamp) {
			return true
		}
	}

	return false
}

// Type returns the block type of the values stored for the key.
func (d *indirectIndex) Type(key []byte) (byte, error) {
	d.mu.RLock()
	defer d.mu.RUnlock()

	iter := d.ro.Iterator()
	exact, _ := iter.Seek(key, &d.b)
	if !exact {
		return 0, errors.New("key does not exist")
	}

	return d.b.access(iter.EntryOffset(&d.b), 1)[0], nil
}

// OverlapsTimeRange returns true if the time range of the file intersects min and max.
func (d *indirectIndex) OverlapsTimeRange(min, max int64) bool {
	return d.minTime <= max && d.maxTime >= min
}
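
// The check above is the standard interval-overlap predicate: ranges [a, b]
// and [c, d] intersect iff a <= d && b >= c.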

// OverlapsKeyRange returns true if the min and max keys of the file overlap the arguments min and max.
func (d *indirectIndex) OverlapsKeyRange(min, max []byte) bool {
	return bytes.Compare(d.minKey, max) <= 0 && bytes.Compare(d.maxKey, min) >= 0
}

// KeyRange returns the min and max keys in the index.
func (d *indirectIndex) KeyRange() ([]byte, []byte) {
	return d.minKey, d.maxKey
}

// TimeRange returns the min and max time across all keys in the index.
func (d *indirectIndex) TimeRange() (int64, int64) {
	return d.minTime, d.maxTime
}

// MarshalBinary returns a byte slice encoded version of the index.
func (d *indirectIndex) MarshalBinary() ([]byte, error) {
	d.mu.RLock()
	defer d.mu.RUnlock()

	return d.b.b, nil
}

// UnmarshalBinary populates an index from an encoded byte slice
// representation of an index.
func (d *indirectIndex) UnmarshalBinary(b []byte) error {
	d.mu.Lock()
	defer d.mu.Unlock()

	// Keep a reference to the actual index bytes.
	d.b = faultBuffer{b: b}
	if len(b) == 0 {
		return nil
	}

	// Make sure a uint32 is sufficient to store any offset into the index.
	if uint64(len(b)) != uint64(uint32(len(b))) {
		return fmt.Errorf("indirectIndex: too large to open")
	}

	var minTime, maxTime int64 = math.MaxInt64, math.MinInt64

	// To create our "indirect" index, we need to find the location of all the keys in
	// the raw byte slice. The keys are listed once each (in sorted order). Following
	// each key is a time-ordered list of index entry blocks for that key. The loop below
	// skips across the slice, recording the offset whenever we are at a key field.
	var i uint32
	var ro readerOffsets

	iMax := uint32(len(b))
	if iMax > math.MaxInt32 {
		return fmt.Errorf("indirectIndex: too large to store offsets")
	}

	for i < iMax {
		offset := i // save for when we add to the data structure

		// Skip to the start of the values:
		// key length value (2) + type (1) + length of key
		if i+2 >= iMax {
			return fmt.Errorf("indirectIndex: not enough data for key length value")
		}
		keyLength := uint32(binary.BigEndian.Uint16(b[i : i+2]))
		i += 2

		if i+keyLength+indexTypeSize >= iMax {
			return fmt.Errorf("indirectIndex: not enough data for key and type")
		}
		ro.AddKey(offset, b[i:i+keyLength])
		i += keyLength + indexTypeSize

		// Count of index entries.
		if i+indexCountSize >= iMax {
			return fmt.Errorf("indirectIndex: not enough data for index entries count")
		}
		count := uint32(binary.BigEndian.Uint16(b[i : i+indexCountSize]))
		if count == 0 {
			return fmt.Errorf("indirectIndex: key exists with no entries")
		}
		i += indexCountSize

		// Find the min time for the block.
		if i+8 >= iMax {
			return fmt.Errorf("indirectIndex: not enough data for min time")
		}
		minT := int64(binary.BigEndian.Uint64(b[i : i+8]))
		if minT < minTime {
			minTime = minT
		}

		i += (count - 1) * indexEntrySize

		// Find the max time for the block.
		if i+16 >= iMax {
			return fmt.Errorf("indirectIndex: not enough data for max time")
		}
		maxT := int64(binary.BigEndian.Uint64(b[i+8 : i+16]))
		if maxT > maxTime {
			maxTime = maxT
		}

		i += indexEntrySize
	}

	ro.Done()

	firstOfs := ro.offsets[0]
	key := readKey(b[firstOfs:])
	d.minKey = key

	lastOfs := ro.offsets[len(ro.offsets)-1]
	key = readKey(b[lastOfs:])
	d.maxKey = key

	d.minTime = minTime
	d.maxTime = maxTime
	d.ro = ro

	return nil
}
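
// For orientation, each key's region in the index bytes parsed above looks like
// the following (all integers big-endian; per-entry fields beyond the two
// timestamps are decoded by IndexEntry.UnmarshalBinary):
//
//	┌─────────┬─────────┬────────┬─────────┬───────────────────────────┐
//	│ key len │   key   │  type  │  count  │ count × 28-byte entries   │
//	│ 2 bytes │ N bytes │ 1 byte │ 2 bytes │ minTime 8 B, maxTime 8 B, │
//	│         │         │        │         │ ...                       │
//	└─────────┴─────────┴────────┴─────────┴───────────────────────────┘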

// Size returns the size of the current index in bytes.
func (d *indirectIndex) Size() uint32 {
	d.mu.RLock()
	defer d.mu.RUnlock()

	return d.b.len()
}

// Close closes the index and releases any resources.
func (d *indirectIndex) Close() error {
	return nil
}

// readKey returns the length-prefixed key at the start of b.
func readKey(b []byte) (key []byte) {
	size := binary.BigEndian.Uint16(b[:2])
	return b[2 : 2+size]
}

// readEntries reads the count-prefixed list of index entries at b into entries,
// reusing the capacity of entries when possible.
func readEntries(b []byte, entries []IndexEntry) ([]IndexEntry, error) {
	if len(b) < indexTypeSize+indexCountSize {
		return entries[:0], errors.New("readEntries: data too short for headers")
	}

	count := int(binary.BigEndian.Uint16(b[indexTypeSize : indexTypeSize+indexCountSize]))
	if cap(entries) < count {
		entries = make([]IndexEntry, count)
	} else {
		entries = entries[:count]
	}
	b = b[indexTypeSize+indexCountSize:]

	for i := range entries {
		if err := entries[i].UnmarshalBinary(b); err != nil {
			return entries[:0], err
		}
		b = b[indexEntrySize:]
	}

	return entries, nil
}
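
// Callers can amortize allocations by feeding a previously returned slice back
// in as the entries argument, e.g.:
//
//	var buf []IndexEntry
//	buf, err := readEntries(data, buf) // reuses buf's capacity when possible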

// readEntriesTimes is a helper function that reads the entries at the provided
// buffer, but only decodes the min and max times.
func readEntriesTimes(b []byte, entries []IndexEntry) ([]IndexEntry, error) {
	if len(b) < indexTypeSize+indexCountSize {
		return entries[:0], errors.New("readEntries: data too short for headers")
	}

	count := int(binary.BigEndian.Uint16(b[indexTypeSize : indexTypeSize+indexCountSize]))
	if cap(entries) < count {
		entries = make([]IndexEntry, count)
	} else {
		entries = entries[:count]
	}
	b = b[indexTypeSize+indexCountSize:]

	for i := range entries {
		if len(b) < indexEntrySize {
			return entries[:0], errors.New("readEntries: stream too short for entry")
		}
		entries[i].MinTime = int64(binary.BigEndian.Uint64(b[0:8]))
		entries[i].MaxTime = int64(binary.BigEndian.Uint64(b[8:16]))
		b = b[indexEntrySize:]
	}

	return entries, nil
}