influxdb/tsdb/tsm1/tombstone.go

635 lines
19 KiB
Go

package tsm1
/*
Tombstone file format:
╔═══════════════════════════════════════════Tombstone File════════════════════════════════════════════╗
║ ┌─────────────┐┌──────────────────────────────────────────────────────────────────────────────────┐ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ Header ││ │ ║
║ │ 4 bytes ││ Tombstone Entries │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ │ ││ │ ║
║ └─────────────┘└──────────────────────────────────────────────────────────────────────────────────┘ ║
╚═════════════════════════════════════════════════════════════════════════════════════════════════════╝
╔═══════════════════════════════════════════Tombstone Entry═══════════════════════════════════════════╗
║ ┌──────┐┌───────────────┐┌────────────┐┌────────────────────────┐┌───────────────┐┌───────────────┐ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │Prefix││ Reserved ││ Key Length ││ Key ││ Min Time ││ Max Time │ ║
║ │ Bit ││ 7 bits ││ 24 bits ││ N bytes ││ 8 bytes ││ 8 bytes │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ │ ││ ││ ││ ││ ││ │ ║
║ └──────┘└───────────────┘└────────────┘└────────────────────────┘└───────────────┘└───────────────┘ ║
╚═════════════════════════════════════════════════════════════════════════════════════════════════════╝
NOTE: v1, v2 and v3 tombstone supports have been dropped from 2.x. Only v4 is now
supported.
*/
import (
"bufio"
"compress/gzip"
"encoding/binary"
"errors"
"fmt"
"io"
"math"
"os"
"path/filepath"
"strings"
"sync"
"github.com/influxdata/influxdb/v2/pkg/fs"
)
const (
headerSize = 4
v4header = 0x1504
)
var errIncompatibleV4Version = errors.New("incompatible v4 version")
// Tombstoner records tombstones when entries are deleted.
type Tombstoner struct {
mu sync.RWMutex
// Path is the location of the file to record tombstone. This should be the
// full path to a TSM file.
Path string
FilterFn func(k []byte) bool
// cache of the stats for this tombstone
fileStats []FileStat
// indicates that the stats may be out of sync with what is on disk and they
// should be refreshed.
statsLoaded bool
// Tombstones that have been written but not flushed to disk yet.
tombstones []Tombstone
// These are references used for pending writes that have not been committed. If
// these are nil, then no pending writes are in progress.
gz *gzip.Writer
bw *bufio.Writer
pendingFile *os.File
tmp [8]byte
lastAppliedOffset int64
// Optional observer for when tombstone files are written.
obs FileStoreObserver
}
// NewTombstoner constructs a Tombstoner for the given path. FilterFn can be nil.
func NewTombstoner(path string, filterFn func(k []byte) bool) *Tombstoner {
return &Tombstoner{
Path: path,
FilterFn: filterFn,
obs: noFileStoreObserver{},
}
}
// Tombstone represents an individual deletion.
type Tombstone struct {
// Key is the tombstoned series key.
Key []byte
// Prefix indicates if this tombstone entry is a prefix key, meaning all
// keys with a prefix matching Key should be removed for the [Min, Max] range.
Prefix bool
// Min and Max are the min and max unix nanosecond time ranges of Key that are deleted.
Min, Max int64
// Predicate stores the marshaled form of some predicate for matching keys.
Predicate []byte
}
func (t Tombstone) String() string {
prefix := "Key"
if t.Prefix {
prefix = "Prefix"
}
return fmt.Sprintf("%s: %q, [%d, %d] pred:%v", prefix, t.Key, t.Min, t.Max, len(t.Predicate) > 0)
}
// WithObserver sets a FileStoreObserver for when the tombstone file is written.
func (t *Tombstoner) WithObserver(obs FileStoreObserver) {
if obs == nil {
obs = noFileStoreObserver{}
}
t.obs = obs
}
// AddPrefixRange adds a prefix-based tombstone key with an explicit range.
func (t *Tombstoner) AddPrefixRange(key []byte, min, max int64, predicate []byte) error {
t.mu.Lock()
defer t.mu.Unlock()
// If this TSMFile has not been written (mainly in tests), don't write a
// tombstone because the keys will not be written when it's actually saved.
if t.Path == "" {
return nil
}
t.statsLoaded = false
if err := t.prepareLatest(); err != nil {
return err
}
return t.writeTombstoneV4(t.gz, Tombstone{
Key: key,
Min: min,
Max: max,
Prefix: true,
Predicate: predicate,
})
}
// Add adds the all keys, across all timestamps, to the tombstone.
func (t *Tombstoner) Add(keys [][]byte) error {
return t.AddRange(keys, math.MinInt64, math.MaxInt64)
}
// AddRange adds all keys to the tombstone specifying only the data between min and max to be removed.
func (t *Tombstoner) AddRange(keys [][]byte, min, max int64) error {
for t.FilterFn != nil && len(keys) > 0 && !t.FilterFn(keys[0]) {
keys = keys[1:]
}
if len(keys) == 0 {
return nil
}
t.mu.Lock()
defer t.mu.Unlock()
// If this TSMFile has not been written (mainly in tests), don't write a
// tombstone because the keys will not be written when it's actually saved.
if t.Path == "" {
return nil
}
t.statsLoaded = false
if err := t.prepareLatest(); err != nil {
return err
}
for _, k := range keys {
if t.FilterFn != nil && !t.FilterFn(k) {
continue
}
if err := t.writeTombstoneV4(t.gz, Tombstone{
Key: k,
Min: min,
Max: max,
Prefix: false,
}); err != nil {
return err
}
}
return nil
}
func (t *Tombstoner) Flush() error {
t.mu.Lock()
defer t.mu.Unlock()
if err := t.commit(); err != nil {
// Reset our temp references and clean up.
_ = t.rollback()
return err
}
return nil
}
func (t *Tombstoner) Rollback() error {
t.mu.Lock()
defer t.mu.Unlock()
return t.rollback()
}
// Delete removes all the tombstone files from disk.
func (t *Tombstoner) Delete() error {
t.mu.Lock()
defer t.mu.Unlock()
if err := os.RemoveAll(t.tombstonePath()); err != nil {
return err
}
t.statsLoaded = false
t.lastAppliedOffset = 0
return nil
}
// HasTombstones return true if there are any tombstone entries recorded.
func (t *Tombstoner) HasTombstones() bool {
files := t.TombstoneFiles()
t.mu.RLock()
n := len(t.tombstones)
t.mu.RUnlock()
return len(files) > 0 && files[0].Size > 0 || n > 0
}
// TombstoneFiles returns any tombstone files associated with Tombstoner's TSM file.
func (t *Tombstoner) TombstoneFiles() []FileStat {
t.mu.RLock()
if t.statsLoaded {
stats := t.fileStats
t.mu.RUnlock()
return stats
}
t.mu.RUnlock()
stat, err := os.Stat(t.tombstonePath())
if os.IsNotExist(err) || err != nil {
t.mu.Lock()
// The file doesn't exist so record that we tried to load it so
// we don't continue to keep trying. This is the common case.
t.statsLoaded = os.IsNotExist(err)
t.fileStats = t.fileStats[:0]
t.mu.Unlock()
return nil
}
t.mu.Lock()
t.fileStats = append(t.fileStats[:0], FileStat{
Path: t.tombstonePath(),
LastModified: stat.ModTime().UnixNano(),
Size: uint32(stat.Size()),
})
t.statsLoaded = true
stats := t.fileStats
t.mu.Unlock()
return stats
}
// Walk calls fn for every Tombstone under the Tombstoner.
func (t *Tombstoner) Walk(fn func(t Tombstone) error) error {
t.mu.Lock()
defer t.mu.Unlock()
f, err := os.Open(t.tombstonePath())
if os.IsNotExist(err) {
return nil
} else if err != nil {
return err
}
defer f.Close()
var b [4]byte
if _, err := f.Read(b[:]); err != nil {
return errors.New("unable to read header")
}
if _, err := f.Seek(0, io.SeekStart); err != nil {
return err
}
header := binary.BigEndian.Uint32(b[:])
if header == v4header {
return t.readTombstoneV4(f, fn)
}
return errors.New("invalid tombstone file")
}
func (t *Tombstoner) prepareLatest() error {
if t.pendingFile != nil { // There is already a pending tombstone file open.
return nil
}
tmpPath := fmt.Sprintf("%s.%s", t.tombstonePath(), CompactionTempExtension)
tmp, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_RDWR|os.O_EXCL, 0666)
if err != nil {
return err
}
removeTmp := func() {
tmp.Close()
os.Remove(tmp.Name())
}
// Copy the existing v4 file if it exists
f, err := os.Open(t.tombstonePath())
if err != nil && !os.IsNotExist(err) {
// An unexpected error should be returned
removeTmp()
return err
} else if err == nil {
// No error so load the tombstone file.
defer f.Close()
var b [4]byte
if n, err := f.Read(b[:]); n == 4 && err == nil {
header := binary.BigEndian.Uint32(b[:])
// There is an existing tombstone on disk and it's not a v4.
// We can't support it.
if header != v4header {
removeTmp()
return errIncompatibleV4Version
}
// Seek back to the beginning we copy the header
if _, err := f.Seek(0, io.SeekStart); err != nil {
removeTmp()
return err
}
// Copy the whole file
if _, err := io.Copy(tmp, f); err != nil {
f.Close()
removeTmp()
return err
}
}
}
// Else, the error was that the file does not exist. Create a new one.
var b [8]byte
bw := bufio.NewWriterSize(tmp, 64*1024)
// Write the header only if the file is new
if os.IsNotExist(err) {
binary.BigEndian.PutUint32(b[:4], v4header)
if _, err := bw.Write(b[:4]); err != nil {
removeTmp()
return err
}
}
// Write the tombstones
gz := gzip.NewWriter(bw)
t.pendingFile = tmp
t.gz = gz
t.bw = bw
return nil
}
func (t *Tombstoner) commit() error {
// No pending writes
if t.pendingFile == nil {
return nil
}
if err := t.gz.Close(); err != nil {
return err
}
if err := t.bw.Flush(); err != nil {
return err
}
// fsync the file to flush the write
if err := t.pendingFile.Sync(); err != nil {
return err
}
tmpFilename := t.pendingFile.Name()
t.pendingFile.Close()
if err := t.obs.FileFinishing(tmpFilename); err != nil {
return err
}
if err := fs.RenameFileWithReplacement(tmpFilename, t.tombstonePath()); err != nil {
return err
}
if err := fs.SyncDir(filepath.Dir(t.tombstonePath())); err != nil {
return err
}
t.pendingFile = nil
t.bw = nil
t.gz = nil
return nil
}
func (t *Tombstoner) rollback() error {
if t.pendingFile == nil {
return nil
}
tmpFilename := t.pendingFile.Name()
t.pendingFile.Close()
t.gz = nil
t.bw = nil
t.pendingFile = nil
return os.Remove(tmpFilename)
}
// readTombstoneV4 reads the fourth version of tombstone files that are capable
// of storing multiple v3 files appended together.
func (t *Tombstoner) readTombstoneV4(f *os.File, fn func(t Tombstone) error) error {
// Skip header, already checked earlier
if t.lastAppliedOffset != 0 {
if _, err := f.Seek(t.lastAppliedOffset, io.SeekStart); err != nil {
return err
}
} else {
if _, err := f.Seek(headerSize, io.SeekStart); err != nil {
return err
}
}
const kmask = int64(0xff000000) // Mask for non key-length bits
br := bufio.NewReaderSize(f, 64*1024)
gr, err := gzip.NewReader(br)
if err == io.EOF {
return nil
} else if err != nil {
return err
}
defer gr.Close()
var ( // save these buffers across loop iterations to avoid allocations
keyBuf []byte
predBuf []byte
)
for {
gr.Multistream(false)
if err := func() error {
for {
var buf [8]byte
if _, err = io.ReadFull(gr, buf[:4]); err == io.EOF || err == io.ErrUnexpectedEOF {
return nil
} else if err != nil {
return err
}
keyLen := int64(binary.BigEndian.Uint32(buf[:4]))
prefix := keyLen>>31&1 == 1 // Prefix is set according to whether the highest bit is set.
hasPred := keyLen>>30&1 == 1
// Remove 8 MSB to get correct length.
keyLen &^= kmask
if int64(len(keyBuf)) < keyLen {
keyBuf = make([]byte, keyLen)
}
// cap slice protects against invalid usages of append in callback
key := keyBuf[:keyLen:keyLen]
if _, err := io.ReadFull(gr, key); err != nil {
return err
}
if _, err := io.ReadFull(gr, buf[:8]); err != nil {
return err
}
min := int64(binary.BigEndian.Uint64(buf[:8]))
if _, err := io.ReadFull(gr, buf[:8]); err != nil {
return err
}
max := int64(binary.BigEndian.Uint64(buf[:8]))
var predicate []byte
if hasPred {
if _, err := io.ReadFull(gr, buf[:8]); err != nil {
return err
}
predLen := binary.BigEndian.Uint64(buf[:8])
if uint64(len(predBuf)) < predLen {
predBuf = make([]byte, predLen)
}
// cap slice protects against invalid usages of append in callback
predicate = predBuf[:predLen:predLen]
if _, err := io.ReadFull(gr, predicate); err != nil {
return err
}
}
if err := fn(Tombstone{
Key: key,
Min: min,
Max: max,
Prefix: prefix,
Predicate: predicate,
}); err != nil {
return err
}
}
}(); err != nil {
return err
}
for _, t := range t.tombstones {
if err := fn(t); err != nil {
return err
}
}
err = gr.Reset(br)
if err == io.EOF {
break
}
}
// Save the position of tombstone file so we don't re-apply the same set again if there are
// more deletes.
pos, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
t.lastAppliedOffset = pos
return nil
}
func (t *Tombstoner) tombstonePath() string {
if strings.HasSuffix(t.Path, "tombstone") {
return t.Path
}
// Filename is 0000001.tsm1
filename := filepath.Base(t.Path)
// Strip off the tsm1
ext := filepath.Ext(filename)
if ext != "" {
filename = strings.TrimSuffix(filename, ext)
}
// Append the "tombstone" suffix to create a 0000001.tombstone file
return filepath.Join(filepath.Dir(t.Path), filename+".tombstone")
}
func (t *Tombstoner) writeTombstoneV4(dst io.Writer, ts Tombstone) error {
maxKeyLen := 0x00ffffff // 24 bit key length. Top 8 bits for other information.
// Maximum key length. Leaves 8 spare bits.
if len(ts.Key) > maxKeyLen {
return fmt.Errorf("key has length %d, maximum allowed key length %d", len(ts.Key), maxKeyLen)
}
l := uint32(len(ts.Key))
if ts.Prefix {
// A mask to set the prefix bit on a tombstone.
l |= 1 << 31
}
if len(ts.Predicate) > 0 {
// A mask to set the predicate bit on a tombstone
l |= 1 << 30
}
binary.BigEndian.PutUint32(t.tmp[:4], l)
if _, err := dst.Write(t.tmp[:4]); err != nil {
return err
}
if _, err := dst.Write([]byte(ts.Key)); err != nil {
return err
}
binary.BigEndian.PutUint64(t.tmp[:], uint64(ts.Min))
if _, err := dst.Write(t.tmp[:]); err != nil {
return err
}
binary.BigEndian.PutUint64(t.tmp[:], uint64(ts.Max))
if _, err := dst.Write(t.tmp[:]); err != nil {
return err
}
if len(ts.Predicate) > 0 {
binary.BigEndian.PutUint64(t.tmp[:], uint64(len(ts.Predicate)))
if _, err := dst.Write(t.tmp[:]); err != nil {
return err
}
if _, err := dst.Write(ts.Predicate); err != nil {
return err
}
}
return nil
}