fix: make TSI index compact old and too-large log files (#22334)

* TSI index should compact old or too-large log files * Old tsl files should be compacted without new writes * Add extra logging when disk size test fails Co-authored-by: Sam Arnold <sarnold@influxdata.com>
2021-08-30 18:27:48 -04:00 · 2021-08-30 18:27:48 -04:00 · 12fff64760
parent cc6accf106
commit 12fff64760
7 changed files with 232 additions and 47 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -62,6 +62,7 @@ This release adds an embedded SQLite database for storing metadata required by t
 1. [22235](https://github.com/influxdata/influxdb/pull/22235): Avoid compaction queue stats flutter.
 1. [22272](https://github.com/influxdata/influxdb/pull/22272): Requests to `/api/v2/authorizations` filter correctly on `org` and `user` parameters.
 1. [22311](https://github.com/influxdata/influxdb/pull/22311): Enforce max field size while parsing line protocol.
+1. [22334](https://github.com/influxdata/influxdb/pull/22334): Periodically compact old and large TSI files.

 ## v2.0.8 [2021-08-13]

--- a/cmd/influxd/inspect/build_tsi/build_tsi.go
+++ b/cmd/influxd/inspect/build_tsi/build_tsi.go
@ -492,6 +492,21 @@ func IndexShard(sfile *tsdb.SeriesFile, dataDir, walDir string, maxLogFileSize i
 		return err
 	}

+	log.Debug("Reopening TSI index with max-index-log-file-size=1 to fully compact log files")
+	compactingIndex := tsi1.NewIndex(sfile, "",
+		tsi1.WithPath(tmpPath),
+		tsi1.WithMaximumLogFileSize(1),
+	)
+	if err := compactingIndex.Open(); err != nil {
+		return err
+	}
+	compactingIndex.Compact()
+	compactingIndex.Wait()
+	log.Debug("re-closing tsi index")
+	if err := compactingIndex.Close(); err != nil {
+		return err
+	}
+
 	// Rename TSI to standard path.
 	log.Debug("Moving tsi to permanent location")
 	return os.Rename(tmpPath, indexPath)
--- a/tsdb/index/tsi1/cache_test.go
+++ b/tsdb/index/tsi1/cache_test.go
@ -9,6 +9,28 @@ import (
 	"github.com/influxdata/influxdb/v2/tsdb"
 )

+// This function is used to log the components of disk size when DiskSizeBytes fails
+func (i *Index) LogDiskSize(t *testing.T) {
+	fs, err := i.RetainFileSet()
+	if err != nil {
+		t.Log("could not retain fileset")
+	}
+	defer fs.Release()
+	var size int64
+	// Get MANIFEST sizes from each partition.
+	for count, p := range i.partitions {
+		sz := p.manifestSize
+		t.Logf("Parition %d has size %d", count, sz)
+		size += sz
+	}
+	for _, f := range fs.files {
+		sz := f.Size()
+		t.Logf("Size of file %s is %d", f.Path(), sz)
+		size += sz
+	}
+	t.Logf("Total size is %d", size)
+}
+
 func TestTagValueSeriesIDCache(t *testing.T) {
 	m0k0v0 := tsdb.NewSeriesIDSet(1, 2, 3, 4, 5)
 	m0k0v1 := tsdb.NewSeriesIDSet(10, 20, 30, 40, 50)
--- a/tsdb/index/tsi1/index.go
+++ b/tsdb/index/tsi1/index.go
@ -10,6 +10,7 @@ import (
 	"strconv"
 	"sync"
 	"sync/atomic"
+	"time"
 	"unsafe"

 	"github.com/cespare/xxhash"
@ -42,6 +43,7 @@ func init() {
 		idx := NewIndex(sfile, db,
 			WithPath(path),
 			WithMaximumLogFileSize(int64(opt.Config.MaxIndexLogFileSize)),
+			WithMaximumLogFileAge(time.Duration(opt.Config.CompactFullWriteColdDuration)),
 			WithSeriesIDCacheSize(opt.Config.SeriesIDSetCacheSize),
 		)
 		return idx
@ -88,6 +90,12 @@ var WithMaximumLogFileSize = func(size int64) IndexOption {
 	}
 }

+var WithMaximumLogFileAge = func(dur time.Duration) IndexOption {
+	return func(i *Index) {
+		i.maxLogFileAge = dur
+	}
+}
+
 // DisableFsync disables flushing and syncing of underlying files. Primarily this
 // impacts the LogFiles. This option can be set when working with the index in
 // an offline manner, for cases where a hard failure can be overcome by re-running the tooling.
@ -130,12 +138,13 @@ type Index struct {
 	tagValueCacheSize int

 	// The following may be set when initializing an Index.
-	path               string      // Root directory of the index partitions.
-	disableCompactions bool        // Initially disables compactions on the index.
-	maxLogFileSize     int64       // Maximum size of a LogFile before it's compacted.
-	logfileBufferSize  int         // The size of the buffer used by the LogFile.
-	disableFsync       bool        // Disables flushing buffers and fsyning files. Used when working with indexes offline.
-	logger             *zap.Logger // Index's logger.
+	path               string        // Root directory of the index partitions.
+	disableCompactions bool          // Initially disables compactions on the index.
+	maxLogFileSize     int64         // Maximum size of a LogFile before it's compacted.
+	maxLogFileAge      time.Duration // Maximum age of a LogFile before it's compacted.
+	logfileBufferSize  int           // The size of the buffer used by the LogFile.
+	disableFsync       bool          // Disables flushing buffers and fsyning files. Used when working with indexes offline.
+	logger             *zap.Logger   // Index's logger.

 	// The following must be set when initializing an Index.
 	sfile    *tsdb.SeriesFile // series lookup file
@ -161,6 +170,7 @@ func NewIndex(sfile *tsdb.SeriesFile, database string, options ...IndexOption) *
 	idx := &Index{
 		tagValueCacheSize: tsdb.DefaultSeriesIDSetCacheSize,
 		maxLogFileSize:    tsdb.DefaultMaxIndexLogFileSize,
+		maxLogFileAge:     tsdb.DefaultCompactFullWriteColdDuration,
 		logger:            zap.NewNop(),
 		version:           Version,
 		sfile:             sfile,
@ -193,6 +203,7 @@ func (i *Index) Bytes() int {
 	b += int(unsafe.Sizeof(i.path)) + len(i.path)
 	b += int(unsafe.Sizeof(i.disableCompactions))
 	b += int(unsafe.Sizeof(i.maxLogFileSize))
+	b += int(unsafe.Sizeof(i.maxLogFileAge))
 	b += int(unsafe.Sizeof(i.logger))
 	b += int(unsafe.Sizeof(i.sfile))
 	// Do not count SeriesFile because it belongs to the code that constructed this Index.
@ -259,6 +270,7 @@ func (i *Index) Open() error {
 	for j := 0; j < len(i.partitions); j++ {
 		p := NewPartition(i.sfile, filepath.Join(i.path, fmt.Sprint(j)))
 		p.MaxLogFileSize = i.maxLogFileSize
+		p.MaxLogFileAge = i.maxLogFileAge
 		p.nosync = i.disableFsync
 		p.logbufferSize = i.logfileBufferSize
 		p.logger = i.logger.With(zap.String("tsi1_partition", fmt.Sprint(j+1)))
--- a/tsdb/index/tsi1/index_test.go
+++ b/tsdb/index/tsi1/index_test.go
@ -11,6 +11,7 @@ import (
 	"sort"
 	"sync"
 	"testing"
+	"time"

 	"github.com/influxdata/influxdb/v2/models"
 	"github.com/influxdata/influxdb/v2/tsdb"
@ -222,6 +223,13 @@ func TestIndex_Open(t *testing.T) {
 				t.Fatalf("got index version %d, expected %d", got, exp)
 			}
 		}
+
+		for i := 0; i < int(idx.PartitionN); i++ {
+			p := idx.PartitionAt(i)
+			if got, exp := p.NeedsCompaction(), false; got != exp {
+				t.Fatalf("got needs compaction %v, expected %v", got, exp)
+			}
+		}
 	})

 	// Reopening an open index should return an error.
@ -298,15 +306,25 @@ func TestIndex_DiskSizeBytes(t *testing.T) {
 		t.Fatal(err)
 	}

-	// Verify on disk size is the same in each stage.
-	// Each series stores flag(1) + series(uvarint(2)) + len(name)(1) + len(key)(1) + len(value)(1) + checksum(4).
-	expSize := int64(4 * 9)
+	idx.RunStateAware(t, func(t *testing.T, state int) {
+		// Each MANIFEST file is 419 bytes and there are tsi1.DefaultPartitionN of them
+		expSize := int64(tsi1.DefaultPartitionN * 419)
+		switch state {
+		case Initial:
+			fallthrough
+		case Reopen:
+			// In the log file, each series stores flag(1) + series(uvarint(2)) + len(name)(1) + len(key)(1) + len(value)(1) + checksum(4).
+			expSize += 4 * 9
+		case PostCompaction:
+			fallthrough
+		case PostCompactionReopen:
+			// For TSI files after a compaction, instead of 4*9, we have encoded measurement names, tag names, etc which is larger
+			expSize += 2202
+		}

-	// Each MANIFEST file is 419 bytes and there are tsi1.DefaultPartitionN of them
-	expSize += int64(tsi1.DefaultPartitionN * 419)
-
-	idx.Run(t, func(t *testing.T) {
 		if got, exp := idx.DiskSizeBytes(), expSize; got != exp {
+			// We had some odd errors - if the size is unexpected, log it
+			idx.Index.LogDiskSize(t)
 			t.Fatalf("got %d bytes, expected %d", got, exp)
 		}
 	})
@ -517,7 +535,7 @@ func (idx *Index) Close() error {
 }

 // Reopen closes and opens the index.
-func (idx *Index) Reopen() error {
+func (idx *Index) Reopen(maxLogSize int64) error {
 	if err := idx.Index.Close(); err != nil {
 		return err
 	}
@ -529,11 +547,24 @@ func (idx *Index) Reopen() error {
 	}

 	partitionN := idx.Index.PartitionN // Remember how many partitions to use.
-	idx.Index = tsi1.NewIndex(idx.SeriesFile.SeriesFile, "db0", tsi1.WithPath(idx.Index.Path()))
+	idx.Index = tsi1.NewIndex(idx.SeriesFile.SeriesFile, "db0", tsi1.WithPath(idx.Index.Path()), tsi1.WithMaximumLogFileSize(maxLogSize))
 	idx.Index.PartitionN = partitionN
 	return idx.Open()
 }

+const (
+	Initial = iota
+	Reopen
+	PostCompaction
+	PostCompactionReopen
+)
+
+func curryState(state int, f func(t *testing.T, state int)) func(t *testing.T) {
+	return func(t *testing.T) {
+		f(t, state)
+	}
+}
+
 // Run executes a subtest for each of several different states:
 //
 // - Immediately
@ -544,27 +575,42 @@ func (idx *Index) Reopen() error {
 // The index should always respond in the same fashion regardless of
 // how data is stored. This helper allows the index to be easily tested
 // in all major states.
-func (idx *Index) Run(t *testing.T, fn func(t *testing.T)) {
+func (idx *Index) RunStateAware(t *testing.T, fn func(t *testing.T, state int)) {
 	// Invoke immediately.
-	t.Run("state=initial", fn)
+	t.Run("state=initial", curryState(Initial, fn))

 	// Reopen and invoke again.
-	if err := idx.Reopen(); err != nil {
+	if err := idx.Reopen(tsdb.DefaultMaxIndexLogFileSize); err != nil {
 		t.Fatalf("reopen error: %s", err)
 	}
-	t.Run("state=reopen", fn)
+	t.Run("state=reopen", curryState(Reopen, fn))

-	// TODO: Request a compaction.
-	// if err := idx.Compact(); err != nil {
-	// 	t.Fatalf("compact error: %s", err)
-	// }
-	// t.Run("state=post-compaction", fn)
+	// Reopen requiring a full compaction of the TSL files and invoke again.
+	idx.Reopen(1)
+	for {
+		needsCompaction := false
+		for i := 0; i < int(idx.PartitionN); i++ {
+			needsCompaction = needsCompaction || idx.PartitionAt(i).NeedsCompaction()
+		}
+		if !needsCompaction {
+			break
+		}
+		time.Sleep(10 * time.Millisecond)
+	}
+	t.Run("state=post-compaction", curryState(PostCompaction, fn))

 	// Reopen and invoke again.
-	if err := idx.Reopen(); err != nil {
+	if err := idx.Reopen(tsdb.DefaultMaxIndexLogFileSize); err != nil {
 		t.Fatalf("post-compaction reopen error: %s", err)
 	}
-	t.Run("state=post-compaction-reopen", fn)
+	t.Run("state=post-compaction-reopen", curryState(PostCompactionReopen, fn))
+}
+
+// Run is the same is RunStateAware but for tests that do not depend on compaction state
+func (idx *Index) Run(t *testing.T, fn func(t *testing.T)) {
+	idx.RunStateAware(t, func(t *testing.T, _ int) {
+		fn(t)
+	})
 }

 // CreateSeriesSliceIfNotExists creates multiple series at a time.
--- a/tsdb/index/tsi1/log_file.go
+++ b/tsdb/index/tsi1/log_file.go
@ -87,6 +87,8 @@ func NewLogFile(sfile *tsdb.SeriesFile, path string) *LogFile {

 // bytes estimates the memory footprint of this LogFile, in bytes.
 func (f *LogFile) bytes() int {
+	f.mu.RLock()
+	defer f.mu.RUnlock()
 	var b int
 	b += 24 // mu RWMutex is 24 bytes
 	b += 16 // wg WaitGroup is 16 bytes
@ -263,6 +265,13 @@ func (f *LogFile) Size() int64 {
 	return v
 }

+// ModTime returns the last modified time of the file
+func (f *LogFile) ModTime() time.Time {
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	return f.modTime
+}
+
 // Measurement returns a measurement element.
 func (f *LogFile) Measurement(name []byte) MeasurementElem {
 	f.mu.RLock()
--- a/tsdb/index/tsi1/partition.go
+++ b/tsdb/index/tsi1/partition.go
@ -71,6 +71,7 @@ type Partition struct {

 	// Log file compaction thresholds.
 	MaxLogFileSize int64
+	MaxLogFileAge  time.Duration
 	nosync         bool // when true, flushing and syncing of LogFile will be disabled.
 	logbufferSize  int  // the LogFile's buffer is set to this value.

@ -130,6 +131,7 @@ func (p *Partition) bytes() int {
 	b += int(unsafe.Sizeof(p.path)) + len(p.path)
 	b += int(unsafe.Sizeof(p.id)) + len(p.id)
 	b += int(unsafe.Sizeof(p.MaxLogFileSize))
+	b += int(unsafe.Sizeof(p.MaxLogFileAge))
 	b += int(unsafe.Sizeof(p.compactionInterrupt))
 	b += int(unsafe.Sizeof(p.compactionsDisabled))
 	b += int(unsafe.Sizeof(p.logger))
@ -238,7 +240,7 @@ func (p *Partition) Open() error {
 	p.opened = true

 	// Send a compaction request on start up.
-	p.compact()
+	go p.runPeriodicCompaction()

 	return nil
 }
@ -473,6 +475,7 @@ func (p *Partition) prependActiveLogFile() error {
 	manifestSize, err := p.Manifest().Write()
 	if err != nil {
 		// TODO: Close index if write fails.
+		p.logger.Error("manifest write failed, index is potentially damaged", zap.Error(err))
 		return err
 	}
 	p.manifestSize = manifestSize
@ -892,7 +895,54 @@ func (p *Partition) compactionsEnabled() bool {
 	return p.compactionsDisabled == 0
 }

+func (p *Partition) runPeriodicCompaction() {
+	// kick off an initial compaction at startup without the optimization check
+	p.Compact()
+
+	// Avoid a race when using Reopen in tests
+	p.mu.RLock()
+	closing := p.closing
+	p.mu.RUnlock()
+
+	// check for compactions once an hour (usually not necessary but a nice safety check)
+	t := time.NewTicker(1 * time.Hour)
+	defer t.Stop()
+	for {
+		select {
+		case <-closing:
+			return
+		case <-t.C:
+			if p.NeedsCompaction() {
+				p.Compact()
+			}
+		}
+	}
+}
+
+// NeedsCompaction only requires a read lock and checks if there are files that could be compacted.
+// If compact is updated we should also update NeedsCompaction.
+func (p *Partition) NeedsCompaction() bool {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+	if p.needsLogCompaction() {
+		return true
+	}
+	levelCount := make(map[int]int)
+	maxLevel := len(p.levels) - 2
+	// If we have 2 log files (level 0), or if we have 2 files at the same level, we should do a compaction.
+	for _, f := range p.fileSet.files {
+		level := f.Level()
+		levelCount[level]++
+		if level <= maxLevel && levelCount[level] > 1 && !p.levelCompacting[level] {
+			return true
+		}
+	}
+	return false
+}
+
 // compact compacts continguous groups of files that are not currently compacting.
+//
+// compact requires that mu is write-locked.
 func (p *Partition) compact() {
 	if p.isClosing() {
 		return
@ -904,6 +954,37 @@ func (p *Partition) compact() {
 	fs := p.retainFileSet()
 	defer fs.Release()

+	// check if the current active log file should be rolled
+	if p.needsLogCompaction() {
+		if err := p.prependActiveLogFile(); err != nil {
+			p.logger.Error("failed to retire active log file", zap.Error(err))
+		}
+	}
+
+	// compact any non-active log files first
+	for _, f := range p.fileSet.files {
+		if f.Level() == 0 {
+			logFile := f.(*LogFile) // It is an invariant that a file is level 0 iff it is a log file
+			if logFile == p.activeLogFile {
+				continue
+			}
+			if p.levelCompacting[0] {
+				break
+			}
+			// Mark the level as compacting.
+			p.levelCompacting[0] = true
+			p.currentCompactionN++
+			go func() {
+				p.compactLogFile(logFile)
+				p.mu.Lock()
+				p.currentCompactionN--
+				p.levelCompacting[0] = false
+				p.mu.Unlock()
+				p.Compact()
+			}()
+		}
+	}
+
 	// Iterate over each level we are going to compact.
 	// We skip the first level (0) because it is log files and they are compacted separately.
 	// We skip the last level because the files have no higher level to compact into.
@ -956,6 +1037,11 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch
 	assert(len(files) >= 2, "at least two index files are required for compaction")
 	assert(level > 0, "cannot compact level zero")

+	// Files have already been retained by caller.
+	// Ensure files are released only once.
+	var once sync.Once
+	defer once.Do(func() { IndexFiles(files).Release() })
+
 	// Build a logger for this compaction.
 	log, logEnd := logger.NewOperation(context.TODO(), p.logger, "TSI level compaction", "tsi1_compact_to_level", zap.Int("tsi1_level", level))
 	defer logEnd()
@ -968,11 +1054,6 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch
 	default:
 	}

-	// Files have already been retained by caller.
-	// Ensure files are released only once.
-	var once sync.Once
-	defer once.Do(func() { IndexFiles(files).Release() })
-
 	// Track time to compact.
 	start := time.Now()

@ -1060,13 +1141,22 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch

 func (p *Partition) Rebuild() {}

+// needsLogCompaction returns true if the log file is too big or too old
+// The caller must have at least a read lock on the partition
+func (p *Partition) needsLogCompaction() bool {
+	size := p.activeLogFile.Size()
+	modTime := p.activeLogFile.ModTime()
+	return size >= p.MaxLogFileSize || (size > 0 && modTime.Before(time.Now().Add(-p.MaxLogFileAge)))
+}
+
 func (p *Partition) CheckLogFile() error {
-	// Check log file size under read lock.
-	if size := func() int64 {
+	// Check log file under read lock.
+	needsCompaction := func() bool {
 		p.mu.RLock()
 		defer p.mu.RUnlock()
-		return p.activeLogFile.Size()
-	}(); size < p.MaxLogFileSize {
+		return p.needsLogCompaction()
+	}()
+	if !needsCompaction {
 		return nil
 	}

@ -1077,27 +1167,17 @@ func (p *Partition) CheckLogFile() error {
 }

 func (p *Partition) checkLogFile() error {
-	if p.activeLogFile.Size() < p.MaxLogFileSize {
+	if !p.needsLogCompaction() {
 		return nil
 	}

-	// Swap current log file.
-	logFile := p.activeLogFile
-
 	// Open new log file and insert it into the first position.
 	if err := p.prependActiveLogFile(); err != nil {
 		return err
 	}

 	// Begin compacting in a background goroutine.
-	p.currentCompactionN++
 	go func() {
-		p.compactLogFile(logFile)
-
-		p.mu.Lock()
-		p.currentCompactionN-- // compaction is now complete
-		p.mu.Unlock()
-
 		p.Compact() // check for new compactions
 	}()