From f56bc0853f6282673f6cd7fbda0923c8ad2da9ab Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 31 Oct 2018 17:41:56 +0000 Subject: [PATCH 01/25] Convert TSM compaction stats to Prom metrics This commits converts all the 1.x TSM compaction statistics, which previously were written to an _internal db, to Prometheus metrics. --- cmd/influxd/main.go | 2 + storage/engine.go | 1 + tsdb/engine.go | 126 ++++++++++++++ tsdb/tsm1/engine.go | 368 ++++++++++++++++++++++------------------- tsdb/tsm1/metrics.go | 84 ++++++++++ tsdb/tsm1/scheduler.go | 24 +-- 6 files changed, 419 insertions(+), 186 deletions(-) create mode 100644 tsdb/engine.go create mode 100644 tsdb/tsm1/metrics.go diff --git a/cmd/influxd/main.go b/cmd/influxd/main.go index b4b9f768cf..43f1691d29 100644 --- a/cmd/influxd/main.go +++ b/cmd/influxd/main.go @@ -271,6 +271,8 @@ func (m *Main) run(ctx context.Context) (err error) { m.logger.Error("failed to open engine", zap.Error(err)) return err } + // The Engine's metrics must be registered after it opens. + reg.MustRegister(engine.PrometheusCollectors()...) pointsWriter = m.engine diff --git a/storage/engine.go b/storage/engine.go index 379af0ba81..3722a8c894 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -169,6 +169,7 @@ func (e *Engine) PrometheusCollectors() []prometheus.Collector { // TODO(edd): Get prom metrics for TSM. // TODO(edd): Get prom metrics for index. // TODO(edd): Get prom metrics for series file. + metrics = append(metrics, e.engine.PrometheusCollectors()...) metrics = append(metrics, e.retentionEnforcer.PrometheusCollectors()...) 
return metrics } diff --git a/tsdb/engine.go b/tsdb/engine.go new file mode 100644 index 0000000000..c5e8aeccff --- /dev/null +++ b/tsdb/engine.go @@ -0,0 +1,126 @@ +package tsdb + +import ( + "context" + "errors" + "regexp" + "runtime" + "time" + + "github.com/influxdata/influxql" + "github.com/influxdata/platform/models" + "github.com/influxdata/platform/pkg/limiter" + "go.uber.org/zap" +) + +var ( + // ErrUnknownEngineFormat is returned when the engine format is + // unknown. ErrUnknownEngineFormat is currently returned if a format + // other than tsm1 is encountered. + ErrUnknownEngineFormat = errors.New("unknown engine format") +) + +// Engine represents a swappable storage engine for the shard. +type Engine interface { + Open() error + Close() error + SetEnabled(enabled bool) + SetCompactionsEnabled(enabled bool) + ScheduleFullCompaction() error + + WithLogger(*zap.Logger) + + CreateCursorIterator(ctx context.Context) (CursorIterator, error) + WritePoints(points []models.Point) error + + CreateSeriesIfNotExists(key, name []byte, tags models.Tags, typ models.FieldType) error + CreateSeriesListIfNotExists(collection *SeriesCollection) error + DeleteSeriesRange(itr SeriesIterator, min, max int64) error + DeleteSeriesRangeWithPredicate(itr SeriesIterator, predicate func(name []byte, tags models.Tags) (int64, int64, bool)) error + + SeriesN() int64 + + MeasurementExists(name []byte) (bool, error) + + MeasurementNamesByRegex(re *regexp.Regexp) ([][]byte, error) + ForEachMeasurementName(fn func(name []byte) error) error + DeleteMeasurement(name []byte) error + + HasTagKey(name, key []byte) (bool, error) + MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error) + TagKeyCardinality(name, key []byte) int + + LastModified() time.Time + DiskSize() int64 + IsIdle() bool + Free() error +} + +// SeriesIDSets provides access to the total set of series IDs +type SeriesIDSets interface { + ForEach(f func(ids *SeriesIDSet)) error +} + +// 
EngineOptions represents the options used to initialize the engine. +type EngineOptions struct { + EngineVersion string + ShardID uint64 + + // Limits the concurrent number of TSM files that can be loaded at once. + OpenLimiter limiter.Fixed + + // CompactionDisabled specifies shards should not schedule compactions. + // This option is intended for offline tooling. + CompactionDisabled bool + CompactionPlannerCreator CompactionPlannerCreator + CompactionLimiter limiter.Fixed + CompactionThroughputLimiter limiter.Rate + WALEnabled bool + MonitorDisabled bool + + // DatabaseFilter is a predicate controlling which databases may be opened. + // If no function is set, all databases will be opened. + DatabaseFilter func(database string) bool + + // RetentionPolicyFilter is a predicate controlling which combination of database and retention policy may be opened. + // nil will allow all combinations to pass. + RetentionPolicyFilter func(database, rp string) bool + + // ShardFilter is a predicate controlling which combination of database, retention policy and shard group may be opened. + // nil will allow all combinations to pass. + ShardFilter func(database, rp string, id uint64) bool + + Config Config + SeriesIDSets SeriesIDSets + + OnNewEngine func(Engine) + + FileStoreObserver FileStoreObserver +} + +// NewEngineOptions constructs an EngineOptions object with safe default values. +// This should only be used in tests; production environments should read from a config file. +func NewEngineOptions() EngineOptions { + return EngineOptions{ + EngineVersion: DefaultEngine, + Config: NewConfig(), + OpenLimiter: limiter.NewFixed(runtime.GOMAXPROCS(0)), + CompactionLimiter: limiter.NewFixed(1), + WALEnabled: false, + } +} + +// NewInmemIndex returns a new "inmem" index type. 
+var NewInmemIndex func(name string, sfile *SeriesFile) (interface{}, error) + +type CompactionPlannerCreator func(cfg Config) interface{} + +// FileStoreObserver is passed notifications before the file store adds or deletes files. In this way, it can +// be sure to observe every file that is added or removed even in the presence of process death. +type FileStoreObserver interface { + // FileFinishing is called before a file is renamed to it's final name. + FileFinishing(path string) error + + // FileUnlinking is called before a file is unlinked. + FileUnlinking(path string) error +} diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index 87e1280de1..69df60743a 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -6,6 +6,7 @@ import ( "context" "errors" "fmt" + "io" "io/ioutil" "math" "os" @@ -26,6 +27,7 @@ import ( "github.com/influxdata/platform/query" "github.com/influxdata/platform/tsdb" "github.com/influxdata/platform/tsdb/tsi1" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -35,7 +37,8 @@ import ( //go:generate env GO111MODULE=on go run github.com/benbjohnson/tmpl -data=@compact.gen.go.tmpldata compact.gen.go.tmpl //go:generate env GO111MODULE=on go run github.com/benbjohnson/tmpl -data=@reader.gen.go.tmpldata reader.gen.go.tmpl -var ( // Static objects to prevent small allocs. +var ( + // Static objects to prevent small allocs. keyFieldSeparatorBytes = []byte(keyFieldSeparator) emptyBytes = []byte{} ) @@ -70,44 +73,6 @@ const ( MaxPointsPerBlock = 1000 ) -// Statistics gathered by the engine. 
-const ( - statCacheCompactions = "cacheCompactions" - statCacheCompactionsActive = "cacheCompactionsActive" - statCacheCompactionError = "cacheCompactionErr" - statCacheCompactionDuration = "cacheCompactionDuration" - - statTSMLevel1Compactions = "tsmLevel1Compactions" - statTSMLevel1CompactionsActive = "tsmLevel1CompactionsActive" - statTSMLevel1CompactionError = "tsmLevel1CompactionErr" - statTSMLevel1CompactionDuration = "tsmLevel1CompactionDuration" - statTSMLevel1CompactionQueue = "tsmLevel1CompactionQueue" - - statTSMLevel2Compactions = "tsmLevel2Compactions" - statTSMLevel2CompactionsActive = "tsmLevel2CompactionsActive" - statTSMLevel2CompactionError = "tsmLevel2CompactionErr" - statTSMLevel2CompactionDuration = "tsmLevel2CompactionDuration" - statTSMLevel2CompactionQueue = "tsmLevel2CompactionQueue" - - statTSMLevel3Compactions = "tsmLevel3Compactions" - statTSMLevel3CompactionsActive = "tsmLevel3CompactionsActive" - statTSMLevel3CompactionError = "tsmLevel3CompactionErr" - statTSMLevel3CompactionDuration = "tsmLevel3CompactionDuration" - statTSMLevel3CompactionQueue = "tsmLevel3CompactionQueue" - - statTSMOptimizeCompactions = "tsmOptimizeCompactions" - statTSMOptimizeCompactionsActive = "tsmOptimizeCompactionsActive" - statTSMOptimizeCompactionError = "tsmOptimizeCompactionErr" - statTSMOptimizeCompactionDuration = "tsmOptimizeCompactionDuration" - statTSMOptimizeCompactionQueue = "tsmOptimizeCompactionQueue" - - statTSMFullCompactions = "tsmFullCompactions" - statTSMFullCompactionsActive = "tsmFullCompactionsActive" - statTSMFullCompactionError = "tsmFullCompactionErr" - statTSMFullCompactionDuration = "tsmFullCompactionDuration" - statTSMFullCompactionQueue = "tsmFullCompactionQueue" -) - // An EngineOption is a functional option for changing the configuration of // an Engine. 
type EngineOption func(i *Engine) @@ -190,7 +155,9 @@ type Engine struct { // Controls whether to enabled compactions when the engine is open enableCompactionsOnOpen bool - stats *EngineStatistics + compactionTracker *compactionTracker // Used to track state of compactions. + blockMetrics *blockMetrics // Provides Engine metrics to external systems. + defaultMetricLabels prometheus.Labels // N.B this must not be mutated after Open is called. // Limiter for concurrent compactions. compactionLimiter limiter.Fixed @@ -234,7 +201,6 @@ func NewEngine(path string, idx *tsi1.Index, config Config, options ...EngineOpt } logger := zap.NewNop() - stats := &EngineStatistics{} e := &Engine{ path: path, index: idx, @@ -254,7 +220,6 @@ func NewEngine(path string, idx *tsi1.Index, config Config, options ...EngineOpt CacheFlushWriteColdDuration: time.Duration(config.Cache.SnapshotWriteColdDuration), enableCompactionsOnOpen: true, formatFileName: DefaultFormatFileName, - stats: stats, compactionLimiter: limiter.NewFixed(maxCompactions), scheduler: newScheduler(stats, maxCompactions), } @@ -522,81 +487,6 @@ func (e *Engine) MeasurementStats() (MeasurementStats, error) { return e.FileStore.MeasurementStats() } -// EngineStatistics maintains statistics for the engine. -type EngineStatistics struct { - CacheCompactions int64 // Counter of cache compactions that have ever run. - CacheCompactionsActive int64 // Gauge of cache compactions currently running. - CacheCompactionErrors int64 // Counter of cache compactions that have failed due to error. - CacheCompactionDuration int64 // Counter of number of wall nanoseconds spent in cache compactions. - - TSMCompactions [3]int64 // Counter of TSM compactions (by level) that have ever run. - TSMCompactionsActive [3]int64 // Gauge of TSM compactions (by level) currently running. - TSMCompactionErrors [3]int64 // Counter of TSM compcations (by level) that have failed due to error. 
- TSMCompactionDuration [3]int64 // Counter of number of wall nanoseconds spent in TSM compactions (by level). - TSMCompactionsQueue [3]int64 // Gauge of TSM compactions queues (by level). - - TSMOptimizeCompactions int64 // Counter of optimize compactions that have ever run. - TSMOptimizeCompactionsActive int64 // Gauge of optimize compactions currently running. - TSMOptimizeCompactionErrors int64 // Counter of optimize compactions that have failed due to error. - TSMOptimizeCompactionDuration int64 // Counter of number of wall nanoseconds spent in optimize compactions. - TSMOptimizeCompactionsQueue int64 // Gauge of optimize compactions queue. - - TSMFullCompactions int64 // Counter of full compactions that have ever run. - TSMFullCompactionsActive int64 // Gauge of full compactions currently running. - TSMFullCompactionErrors int64 // Counter of full compactions that have failed due to error. - TSMFullCompactionDuration int64 // Counter of number of wall nanoseconds spent in full compactions. - TSMFullCompactionsQueue int64 // Gauge of full compactions queue. -} - -// Statistics returns statistics for periodic monitoring. 
-func (e *Engine) Statistics(tags map[string]string) []models.Statistic { - statistics := make([]models.Statistic, 0, 4) - statistics = append(statistics, models.Statistic{ - Name: "tsm1_engine", - Tags: tags, - Values: map[string]interface{}{ - statCacheCompactions: atomic.LoadInt64(&e.stats.CacheCompactions), - statCacheCompactionsActive: atomic.LoadInt64(&e.stats.CacheCompactionsActive), - statCacheCompactionError: atomic.LoadInt64(&e.stats.CacheCompactionErrors), - statCacheCompactionDuration: atomic.LoadInt64(&e.stats.CacheCompactionDuration), - - statTSMLevel1Compactions: atomic.LoadInt64(&e.stats.TSMCompactions[0]), - statTSMLevel1CompactionsActive: atomic.LoadInt64(&e.stats.TSMCompactionsActive[0]), - statTSMLevel1CompactionError: atomic.LoadInt64(&e.stats.TSMCompactionErrors[0]), - statTSMLevel1CompactionDuration: atomic.LoadInt64(&e.stats.TSMCompactionDuration[0]), - statTSMLevel1CompactionQueue: atomic.LoadInt64(&e.stats.TSMCompactionsQueue[0]), - - statTSMLevel2Compactions: atomic.LoadInt64(&e.stats.TSMCompactions[1]), - statTSMLevel2CompactionsActive: atomic.LoadInt64(&e.stats.TSMCompactionsActive[1]), - statTSMLevel2CompactionError: atomic.LoadInt64(&e.stats.TSMCompactionErrors[1]), - statTSMLevel2CompactionDuration: atomic.LoadInt64(&e.stats.TSMCompactionDuration[1]), - statTSMLevel2CompactionQueue: atomic.LoadInt64(&e.stats.TSMCompactionsQueue[1]), - - statTSMLevel3Compactions: atomic.LoadInt64(&e.stats.TSMCompactions[2]), - statTSMLevel3CompactionsActive: atomic.LoadInt64(&e.stats.TSMCompactionsActive[2]), - statTSMLevel3CompactionError: atomic.LoadInt64(&e.stats.TSMCompactionErrors[2]), - statTSMLevel3CompactionDuration: atomic.LoadInt64(&e.stats.TSMCompactionDuration[2]), - statTSMLevel3CompactionQueue: atomic.LoadInt64(&e.stats.TSMCompactionsQueue[2]), - - statTSMOptimizeCompactions: atomic.LoadInt64(&e.stats.TSMOptimizeCompactions), - statTSMOptimizeCompactionsActive: atomic.LoadInt64(&e.stats.TSMOptimizeCompactionsActive), - 
statTSMOptimizeCompactionError: atomic.LoadInt64(&e.stats.TSMOptimizeCompactionErrors), - statTSMOptimizeCompactionDuration: atomic.LoadInt64(&e.stats.TSMOptimizeCompactionDuration), - statTSMOptimizeCompactionQueue: atomic.LoadInt64(&e.stats.TSMOptimizeCompactionsQueue), - - statTSMFullCompactions: atomic.LoadInt64(&e.stats.TSMFullCompactions), - statTSMFullCompactionsActive: atomic.LoadInt64(&e.stats.TSMFullCompactionsActive), - statTSMFullCompactionError: atomic.LoadInt64(&e.stats.TSMFullCompactionErrors), - statTSMFullCompactionDuration: atomic.LoadInt64(&e.stats.TSMFullCompactionDuration), - statTSMFullCompactionQueue: atomic.LoadInt64(&e.stats.TSMFullCompactionsQueue), - }, - }) - - statistics = append(statistics, e.Cache.Statistics(tags)...) - statistics = append(statistics, e.FileStore.Statistics(tags)...) - return statistics -} - // DiskSize returns the total size in bytes of all TSM and WAL segments on disk. func (e *Engine) DiskSize() int64 { walDiskSizeBytes := e.WAL.DiskSizeBytes() @@ -605,6 +495,11 @@ func (e *Engine) DiskSize() int64 { // Open opens and initializes the engine. func (e *Engine) Open() error { + // Initialise metrics... + e.blockMetrics = newBlockMetrics(e.defaultMetricLabels) + e.compactionTracker = newCompactionTracker(e.blockMetrics) + e.scheduler.setCompactionTracker(e.compactionTracker) + if err := os.MkdirAll(e.path, 0777); err != nil { return err } @@ -649,6 +544,18 @@ func (e *Engine) Close() error { return e.WAL.Close() } +// PrometheusCollectors returns all the prometheus collectors associated with +// the engine and its components. +func (e *Engine) PrometheusCollectors() []prometheus.Collector { + var metrics []prometheus.Collector + metrics = append(metrics, e.blockMetrics.PrometheusCollectors()...) + + // TODO(edd): Add Filestore metrics + // TODO(edd): Add Cache metrics + // TODO(edd): Add WAL metrics + return metrics +} + // WithLogger sets the logger for the engine. 
func (e *Engine) WithLogger(log *zap.Logger) { e.logger = log.With(zap.String("engine", "tsm1")) @@ -668,15 +575,7 @@ func (e *Engine) WithLogger(log *zap.Logger) { // shard is fully compacted. func (e *Engine) IsIdle() bool { cacheEmpty := e.Cache.Size() == 0 - - runningCompactions := atomic.LoadInt64(&e.stats.CacheCompactionsActive) - runningCompactions += atomic.LoadInt64(&e.stats.TSMCompactionsActive[0]) - runningCompactions += atomic.LoadInt64(&e.stats.TSMCompactionsActive[1]) - runningCompactions += atomic.LoadInt64(&e.stats.TSMCompactionsActive[2]) - runningCompactions += atomic.LoadInt64(&e.stats.TSMFullCompactionsActive) - runningCompactions += atomic.LoadInt64(&e.stats.TSMOptimizeCompactionsActive) - - return cacheEmpty && runningCompactions == 0 && e.CompactionPlan.FullyCompacted() + return cacheEmpty && e.compactionTracker.AllActive() == 0 && e.CompactionPlan.FullyCompacted() } // Free releases any resources held by the engine to free up memory or CPU. @@ -1106,6 +1005,149 @@ func (e *Engine) CreateSeriesListIfNotExists(collection *tsdb.SeriesCollection) return e.index.CreateSeriesListIfNotExists(collection) } +// WriteTo is not implemented. +func (e *Engine) WriteTo(w io.Writer) (n int64, err error) { panic("not implemented") } + +// compactionLevel describes a snapshot or levelled compaction. +type compactionLevel int + +func (l compactionLevel) String() string { + switch l { + case 0: + return "snapshot" + case 1, 2, 3: + return fmt.Sprint(int(l)) + case 4: + return "optimize" + case 5: + return "full" + default: + panic("unsupported compaction level") + } +} + +// compactionTracker tracks compactions and snapshots within the Engine. +// +// As well as being responsible for providing atomic reads and writes to the +// statistics tracking the various compaction operations, compactionTracker also +// mirrors any writes to the prometheus block metrics, which the Engine exposes. +// +// *NOTE* - compactionTracker fields should not be directory modified. 
Doing so +// could result in the Engine exposing inaccurate metrics. +type compactionTracker struct { + metrics *blockMetrics + + // Note: Compactions are levelled as follows: + // 0 – Snapshots + // 1-3 – Levelled compactions + // 4 – Optimize compactions + // 5 – Full compactions + + ok [6]uint64 // Counter of TSM compactions (by level) that have successfully completed. + active [6]uint64 // Gauge of TSM compactions (by level) currently running. + errors [6]uint64 // Counter of TSM compactions (by level) that have failed due to error. + queue [6]uint64 // Gauge of TSM compactions queues (by level). +} + +func newCompactionTracker(blockMetrics *blockMetrics) *compactionTracker { + return &compactionTracker{metrics: blockMetrics} +} + +// Completed returns the total number of compactions for the provided level. +func (t *compactionTracker) Completed(level int) uint64 { return atomic.LoadUint64(&t.ok[level]) } + +// Active returns the number of active snapshots (level 0), +// level 1, 2 or 3 compactions, optimize compactions (level 4), or full +// compactions (level 5). +func (t *compactionTracker) Active(level int) uint64 { + return atomic.LoadUint64(&t.active[level]) +} + +// AllActive returns the number of active snapshots and compactions. +func (t *compactionTracker) AllActive() uint64 { + var total uint64 + for i := 0; i < len(t.active); i++ { + total += atomic.LoadUint64(&t.active[i]) + } + return total +} + +// ActiveOptimise returns the number of active Optimise compactions. +// +// ActiveOptimise is a helper for Active(4). +func (t *compactionTracker) ActiveOptimise() uint64 { return t.Active(4) } + +// ActiveFull returns the number of active Full compactions. +// +// ActiveFull is a helper for Active(5). +func (t *compactionTracker) ActiveFull() uint64 { return t.Active(5) } + +// Errors returns the total number of errors encountered attempting compactions +// for the provided level. 
+func (t *compactionTracker) Errors(level int) uint64 { return atomic.LoadUint64(&t.errors[level]) } + +// IncActive increments the number of active compactions for the provided level. +func (t *compactionTracker) IncActive(level compactionLevel) { + atomic.AddUint64(&t.active[level], 1) + + labels := t.metrics.CompactionLabels(level) + t.metrics.CompactionsActive.With(labels).Inc() +} + +// IncFullActive increments the number of active Full compactions. +func (t *compactionTracker) IncFullActive() { t.IncActive(5) } + +// DecActive decrements the number of active compactions for the provided level. +func (t *compactionTracker) DecActive(level compactionLevel) { + atomic.AddUint64(&t.active[level], ^uint64(0)) + + labels := t.metrics.CompactionLabels(level) + t.metrics.CompactionsActive.With(labels).Dec() +} + +// DecFullActive decrements the number of active Full compactions. +func (t *compactionTracker) DecFullActive() { t.DecActive(5) } + +// Attempted updates the number of compactions attempted for the provided level. +func (t *compactionTracker) Attempted(level compactionLevel, success bool, duration time.Duration) { + if success { + atomic.AddUint64(&t.ok[level], 1) + + labels := t.metrics.CompactionLabels(level) + t.metrics.CompactionDuration.With(labels).Observe(duration.Seconds()) + + labels["status"] = "ok" + t.metrics.Compactions.With(labels).Inc() + return + } + + atomic.AddUint64(&t.errors[level], 1) + + labels := t.metrics.CompactionLabels(level) + labels["status"] = "error" + t.metrics.Compactions.With(labels).Inc() +} + +// SnapshotAttempted updates the number of snapshots attempted. +func (t *compactionTracker) SnapshotAttempted(success bool, duration time.Duration) { + t.Attempted(0, success, duration) +} + +// SetQueue sets the compaction queue depth for the provided level. 
+func (t *compactionTracker) SetQueue(level compactionLevel, length uint64) { + atomic.StoreUint64(&t.queue[level], length) + + labels := t.metrics.CompactionLabels(level) + t.metrics.CompactionQueue.With(labels).Set(float64(length)) +} + +// SetOptimiseQueue sets the queue depth for Optimisation compactions. +func (t *compactionTracker) SetOptimiseQueue(length uint64) { t.SetQueue(4, length) } + +// SetFullQueue sets the queue depth for Full compactions. +func (t *compactionTracker) SetFullQueue(length uint64) { t.SetQueue(5, length) } + + // WriteSnapshot will snapshot the cache and write a new TSM file with its contents, releasing the snapshot when done. func (e *Engine) WriteSnapshot() error { // Lock and grab the cache snapshot along with all the closed WAL @@ -1216,11 +1258,8 @@ func (e *Engine) compactCache() { err := e.WriteSnapshot() if err != nil && err != errCompactionsDisabled { e.logger.Info("Error writing snapshot", zap.Error(err)) - atomic.AddInt64(&e.stats.CacheCompactionErrors, 1) - } else { - atomic.AddInt64(&e.stats.CacheCompactions, 1) } - atomic.AddInt64(&e.stats.CacheCompactionDuration, time.Since(start).Nanoseconds()) + e.compactionTracker.SnapshotAttempted(err == nil || err == errCompactionsDisabled, time.Since(start)) } } } @@ -1262,18 +1301,18 @@ func (e *Engine) compact(wg *sync.WaitGroup) { level2Groups := e.CompactionPlan.PlanLevel(2) level3Groups := e.CompactionPlan.PlanLevel(3) level4Groups := e.CompactionPlan.Plan(e.FileStore.LastModified()) - atomic.StoreInt64(&e.stats.TSMOptimizeCompactionsQueue, int64(len(level4Groups))) + e.compactionTracker.SetOptimiseQueue(uint64(len(level4Groups))) // If no full compactions are need, see if an optimize is needed if len(level4Groups) == 0 { level4Groups = e.CompactionPlan.PlanOptimize() - atomic.StoreInt64(&e.stats.TSMOptimizeCompactionsQueue, int64(len(level4Groups))) + e.compactionTracker.SetOptimiseQueue(uint64(len(level4Groups))) } // 
Update the level plan queue stats - atomic.StoreInt64(&e.stats.TSMCompactionsQueue[0], int64(len(level1Groups))) - atomic.StoreInt64(&e.stats.TSMCompactionsQueue[1], int64(len(level2Groups))) - atomic.StoreInt64(&e.stats.TSMCompactionsQueue[2], int64(len(level3Groups))) + e.compactionTracker.SetQueue(1, uint64(len(level1Groups))) + e.compactionTracker.SetQueue(2, uint64(len(level2Groups))) + e.compactionTracker.SetQueue(3, uint64(len(level3Groups))) // Set the queue depths on the scheduler e.scheduler.setDepth(1, len(level1Groups)) @@ -1314,7 +1353,7 @@ func (e *Engine) compact(wg *sync.WaitGroup) { // compactHiPriorityLevel kicks off compactions using the high priority policy. It returns // true if the compaction was started -func (e *Engine) compactHiPriorityLevel(grp CompactionGroup, level int, fast bool, wg *sync.WaitGroup) bool { +func (e *Engine) compactHiPriorityLevel(grp CompactionGroup, level compactionLevel, fast bool, wg *sync.WaitGroup) bool { s := e.levelCompactionStrategy(grp, fast, level) if s == nil { return false @@ -1322,13 +1361,12 @@ func (e *Engine) compactHiPriorityLevel(grp CompactionGroup, level int, fast boo // Try hi priority limiter, otherwise steal a little from the low priority if we can. if e.compactionLimiter.TryTake() { - atomic.AddInt64(&e.stats.TSMCompactionsActive[level-1], 1) + e.compactionTracker.IncActive(level) wg.Add(1) go func() { defer wg.Done() - defer atomic.AddInt64(&e.stats.TSMCompactionsActive[level-1], -1) - + defer e.compactionTracker.DecActive(level) defer e.compactionLimiter.Release() s.Apply() // Release the files in the compaction plan @@ -1343,7 +1381,7 @@ func (e *Engine) compactHiPriorityLevel(grp CompactionGroup, level int, fast boo // compactLoPriorityLevel kicks off compactions using the lo priority policy. 
It returns // the plans that were not able to be started -func (e *Engine) compactLoPriorityLevel(grp CompactionGroup, level int, fast bool, wg *sync.WaitGroup) bool { +func (e *Engine) compactLoPriorityLevel(grp CompactionGroup, level compactionLevel, fast bool, wg *sync.WaitGroup) bool { s := e.levelCompactionStrategy(grp, fast, level) if s == nil { return false @@ -1351,11 +1389,11 @@ func (e *Engine) compactLoPriorityLevel(grp CompactionGroup, level int, fast boo // Try the lo priority limiter, otherwise steal a little from the high priority if we can. if e.compactionLimiter.TryTake() { - atomic.AddInt64(&e.stats.TSMCompactionsActive[level-1], 1) + e.compactionTracker.IncActive(level) wg.Add(1) go func() { defer wg.Done() - defer atomic.AddInt64(&e.stats.TSMCompactionsActive[level-1], -1) + defer e.compactionTracker.DecActive(level) defer e.compactionLimiter.Release() s.Apply() // Release the files in the compaction plan @@ -1376,11 +1414,11 @@ func (e *Engine) compactFull(grp CompactionGroup, wg *sync.WaitGroup) bool { // Try the lo priority limiter, otherwise steal a little from the high priority if we can. if e.compactionLimiter.TryTake() { - atomic.AddInt64(&e.stats.TSMFullCompactionsActive, 1) + e.compactionTracker.IncFullActive() wg.Add(1) go func() { defer wg.Done() - defer atomic.AddInt64(&e.stats.TSMFullCompactionsActive, -1) + defer e.compactionTracker.DecFullActive() defer e.compactionLimiter.Release() s.Apply() // Release the files in the compaction plan @@ -1396,12 +1434,9 @@ type compactionStrategy struct { group CompactionGroup fast bool - level int + level compactionLevel - durationStat *int64 - activeStat *int64 - successStat *int64 - errorStat *int64 + tracker *compactionTracker logger *zap.Logger compactor *Compactor @@ -1412,13 +1447,12 @@ type compactionStrategy struct { // Apply concurrently compacts all the groups in a compaction strategy. 
func (s *compactionStrategy) Apply() { - start := time.Now() s.compactGroup() - atomic.AddInt64(s.durationStat, time.Since(start).Nanoseconds()) } // compactGroup executes the compaction strategy against a single CompactionGroup. func (s *compactionStrategy) compactGroup() { + now := time.Now() group := s.group log, logEnd := logger.NewOperation(s.logger, "TSM compaction", "tsm1_compact_group") defer logEnd() @@ -1451,14 +1485,14 @@ func (s *compactionStrategy) compactGroup() { } log.Info("Error compacting TSM files", zap.Error(err)) - atomic.AddInt64(s.errorStat, 1) + s.tracker.Attempted(s.level, false, 0) time.Sleep(time.Second) return } if err := s.fileStore.ReplaceWithCallback(group, files, nil); err != nil { log.Info("Error replacing new TSM files", zap.Error(err)) - atomic.AddInt64(s.errorStat, 1) + s.tracker.Attempted(s.level, false, 0) time.Sleep(time.Second) return } @@ -1466,27 +1500,22 @@ func (s *compactionStrategy) compactGroup() { for i, f := range files { log.Info("Compacted file", zap.Int("tsm1_index", i), zap.String("tsm1_file", f)) } - log.Info("Finished compacting files", - zap.Int("tsm1_files_n", len(files))) - atomic.AddInt64(s.successStat, 1) + log.Info("Finished compacting files", zap.Int("tsm1_files_n", len(files))) + s.tracker.Attempted(s.level, true, time.Since(now)) } // levelCompactionStrategy returns a compactionStrategy for the given level. // It returns nil if there are no TSM files to compact. 
-func (e *Engine) levelCompactionStrategy(group CompactionGroup, fast bool, level int) *compactionStrategy { +func (e *Engine) levelCompactionStrategy(group CompactionGroup, fast bool, level compactionLevel) *compactionStrategy { return &compactionStrategy{ group: group, - logger: e.logger.With(zap.Int("tsm1_level", level), zap.String("tsm1_strategy", "level")), + logger: e.logger.With(zap.Int("tsm1_level", int(level)), zap.String("tsm1_strategy", "level")), fileStore: e.FileStore, compactor: e.Compactor, fast: fast, engine: e, level: level, - - activeStat: &e.stats.TSMCompactionsActive[level-1], - successStat: &e.stats.TSMCompactions[level-1], - errorStat: &e.stats.TSMCompactionErrors[level-1], - durationStat: &e.stats.TSMCompactionDuration[level-1], + tracker: e.compactionTracker, } } @@ -1500,21 +1529,12 @@ func (e *Engine) fullCompactionStrategy(group CompactionGroup, optimize bool) *c compactor: e.Compactor, fast: optimize, engine: e, - level: 4, + level: 5, } if optimize { - s.activeStat = &e.stats.TSMOptimizeCompactionsActive - s.successStat = &e.stats.TSMOptimizeCompactions - s.errorStat = &e.stats.TSMOptimizeCompactionErrors - s.durationStat = &e.stats.TSMOptimizeCompactionDuration - } else { - s.activeStat = &e.stats.TSMFullCompactionsActive - s.successStat = &e.stats.TSMFullCompactions - s.errorStat = &e.stats.TSMFullCompactionErrors - s.durationStat = &e.stats.TSMFullCompactionDuration + s.level = 4 } - return s } diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go new file mode 100644 index 0000000000..6f64ce95cb --- /dev/null +++ b/tsdb/tsm1/metrics.go @@ -0,0 +1,84 @@ +package tsm1 + +import ( + "fmt" + "sort" + + "github.com/prometheus/client_golang/prometheus" +) + +// namespace is the leading part of all published metrics for the Storage service. +const namespace = "storage" + +const blockSubsystem = "block" // sub-system associated with metrics for block storage. 
+ +// blockMetrics are a set of metrics concerned with tracking data about block storage. +type blockMetrics struct { + labels prometheus.Labels // Read only. + + Compactions *prometheus.CounterVec + CompactionsActive *prometheus.GaugeVec + CompactionDuration *prometheus.HistogramVec + CompactionQueue *prometheus.GaugeVec +} + +// newBlockMetrics initialises the prometheus metrics for the block subsystem. +func newBlockMetrics(labels prometheus.Labels) *blockMetrics { + compactionNames := []string{"level"} // All compaction metrics have a `level` label. + for k := range labels { + compactionNames = append(compactionNames, k) + } + sort.Strings(compactionNames) + totalCompactionsNames := append(compactionNames, "status") + sort.Strings(totalCompactionsNames) + + return &blockMetrics{ + labels: labels, + Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: blockSubsystem, + Name: "compactions_total", + Help: "Number of times cache snapshotted or TSM compaction attempted.", + }, totalCompactionsNames), + CompactionsActive: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: blockSubsystem, + Name: "compactions_active", + Help: "Number of active compactions.", + }, compactionNames), + CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: blockSubsystem, + Name: "compaction_duration_seconds", + Help: "Time taken for a successful compaction or snapshot.", + // 30 buckets spaced exponentially between 5s and ~53 minutes. + Buckets: prometheus.ExponentialBuckets(5.0, 1.25, 30), + }, compactionNames), + CompactionQueue: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: blockSubsystem, + Name: "compactions_queued", + Help: "Number of queued compactions.", + }, compactionNames), + } +} + +// CompactionLabels returns a copy of labels for use with compaction metrics. 
+func (b *blockMetrics) CompactionLabels(level compactionLevel) prometheus.Labels { + l := make(map[string]string, len(b.labels)) + for k, v := range b.labels { + l[k] = v + } + l["level"] = fmt.Sprint(level) + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (b *blockMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + b.Compactions, + b.CompactionsActive, + b.CompactionDuration, + b.CompactionQueue, + } +} diff --git a/tsdb/tsm1/scheduler.go b/tsdb/tsm1/scheduler.go index d360afc3e7..833f98c817 100644 --- a/tsdb/tsm1/scheduler.go +++ b/tsdb/tsm1/scheduler.go @@ -1,28 +1,28 @@ package tsm1 -import ( - "sync/atomic" -) - var defaultWeights = [4]float64{0.4, 0.3, 0.2, 0.1} type scheduler struct { - maxConcurrency int - stats *EngineStatistics + maxConcurrency int + compactionTracker *compactionTracker // queues is the depth of work pending for each compaction level queues [4]int weights [4]float64 } -func newScheduler(stats *EngineStatistics, maxConcurrency int) *scheduler { +func newScheduler(maxConcurrency int) *scheduler { return &scheduler{ - stats: stats, maxConcurrency: maxConcurrency, weights: defaultWeights, } } +// setCompactionTracker sets the metrics on the scheduler. It must be called before next. 
+func (s *scheduler) setCompactionTracker(tracker *compactionTracker) { + s.compactionTracker = tracker +} + func (s *scheduler) setDepth(level, depth int) { level = level - 1 if level < 0 || level > len(s.queues) { @@ -33,10 +33,10 @@ func (s *scheduler) setDepth(level, depth int) { } func (s *scheduler) next() (int, bool) { - level1Running := int(atomic.LoadInt64(&s.stats.TSMCompactionsActive[0])) - level2Running := int(atomic.LoadInt64(&s.stats.TSMCompactionsActive[1])) - level3Running := int(atomic.LoadInt64(&s.stats.TSMCompactionsActive[2])) - level4Running := int(atomic.LoadInt64(&s.stats.TSMFullCompactionsActive) + atomic.LoadInt64(&s.stats.TSMOptimizeCompactionsActive)) + level1Running := int(s.compactionTracker.Active(1)) + level2Running := int(s.compactionTracker.Active(2)) + level3Running := int(s.compactionTracker.Active(3)) + level4Running := int(s.compactionTracker.ActiveFull() + s.compactionTracker.ActiveOptimise()) if level1Running+level2Running+level3Running+level4Running >= s.maxConcurrency { return 0, false From d61b9f16458d64d7876551801dd4a4e4bb2b42ca Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 31 Oct 2018 18:36:22 +0000 Subject: [PATCH 02/25] Convert Filestore stats --- tsdb/engine.go | 126 ---------------------------------------- tsdb/tsm1/engine.go | 25 ++++---- tsdb/tsm1/file_store.go | 93 ++++++++++++++++++----------- tsdb/tsm1/metrics.go | 117 ++++++++++++++++++++++++++++++------- 4 files changed, 168 insertions(+), 193 deletions(-) delete mode 100644 tsdb/engine.go diff --git a/tsdb/engine.go b/tsdb/engine.go deleted file mode 100644 index c5e8aeccff..0000000000 --- a/tsdb/engine.go +++ /dev/null @@ -1,126 +0,0 @@ -package tsdb - -import ( - "context" - "errors" - "regexp" - "runtime" - "time" - - "github.com/influxdata/influxql" - "github.com/influxdata/platform/models" - "github.com/influxdata/platform/pkg/limiter" - "go.uber.org/zap" -) - -var ( - // ErrUnknownEngineFormat is returned when the engine format is - // unknown. 
ErrUnknownEngineFormat is currently returned if a format - // other than tsm1 is encountered. - ErrUnknownEngineFormat = errors.New("unknown engine format") -) - -// Engine represents a swappable storage engine for the shard. -type Engine interface { - Open() error - Close() error - SetEnabled(enabled bool) - SetCompactionsEnabled(enabled bool) - ScheduleFullCompaction() error - - WithLogger(*zap.Logger) - - CreateCursorIterator(ctx context.Context) (CursorIterator, error) - WritePoints(points []models.Point) error - - CreateSeriesIfNotExists(key, name []byte, tags models.Tags, typ models.FieldType) error - CreateSeriesListIfNotExists(collection *SeriesCollection) error - DeleteSeriesRange(itr SeriesIterator, min, max int64) error - DeleteSeriesRangeWithPredicate(itr SeriesIterator, predicate func(name []byte, tags models.Tags) (int64, int64, bool)) error - - SeriesN() int64 - - MeasurementExists(name []byte) (bool, error) - - MeasurementNamesByRegex(re *regexp.Regexp) ([][]byte, error) - ForEachMeasurementName(fn func(name []byte) error) error - DeleteMeasurement(name []byte) error - - HasTagKey(name, key []byte) (bool, error) - MeasurementTagKeysByExpr(name []byte, expr influxql.Expr) (map[string]struct{}, error) - TagKeyCardinality(name, key []byte) int - - LastModified() time.Time - DiskSize() int64 - IsIdle() bool - Free() error -} - -// SeriesIDSets provides access to the total set of series IDs -type SeriesIDSets interface { - ForEach(f func(ids *SeriesIDSet)) error -} - -// EngineOptions represents the options used to initialize the engine. -type EngineOptions struct { - EngineVersion string - ShardID uint64 - - // Limits the concurrent number of TSM files that can be loaded at once. - OpenLimiter limiter.Fixed - - // CompactionDisabled specifies shards should not schedule compactions. - // This option is intended for offline tooling. 
- CompactionDisabled bool - CompactionPlannerCreator CompactionPlannerCreator - CompactionLimiter limiter.Fixed - CompactionThroughputLimiter limiter.Rate - WALEnabled bool - MonitorDisabled bool - - // DatabaseFilter is a predicate controlling which databases may be opened. - // If no function is set, all databases will be opened. - DatabaseFilter func(database string) bool - - // RetentionPolicyFilter is a predicate controlling which combination of database and retention policy may be opened. - // nil will allow all combinations to pass. - RetentionPolicyFilter func(database, rp string) bool - - // ShardFilter is a predicate controlling which combination of database, retention policy and shard group may be opened. - // nil will allow all combinations to pass. - ShardFilter func(database, rp string, id uint64) bool - - Config Config - SeriesIDSets SeriesIDSets - - OnNewEngine func(Engine) - - FileStoreObserver FileStoreObserver -} - -// NewEngineOptions constructs an EngineOptions object with safe default values. -// This should only be used in tests; production environments should read from a config file. -func NewEngineOptions() EngineOptions { - return EngineOptions{ - EngineVersion: DefaultEngine, - Config: NewConfig(), - OpenLimiter: limiter.NewFixed(runtime.GOMAXPROCS(0)), - CompactionLimiter: limiter.NewFixed(1), - WALEnabled: false, - } -} - -// NewInmemIndex returns a new "inmem" index type. -var NewInmemIndex func(name string, sfile *SeriesFile) (interface{}, error) - -type CompactionPlannerCreator func(cfg Config) interface{} - -// FileStoreObserver is passed notifications before the file store adds or deletes files. In this way, it can -// be sure to observe every file that is added or removed even in the presence of process death. -type FileStoreObserver interface { - // FileFinishing is called before a file is renamed to it's final name. - FileFinishing(path string) error - - // FileUnlinking is called before a file is unlinked. 
- FileUnlinking(path string) error -} diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index 69df60743a..50096ca0a5 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -221,7 +221,7 @@ func NewEngine(path string, idx *tsi1.Index, config Config, options ...EngineOpt enableCompactionsOnOpen: true, formatFileName: DefaultFormatFileName, compactionLimiter: limiter.NewFixed(maxCompactions), - scheduler: newScheduler(stats, maxCompactions), + scheduler: newScheduler(maxCompactions), } for _, option := range options { @@ -497,7 +497,11 @@ func (e *Engine) DiskSize() int64 { func (e *Engine) Open() error { // Initialise metrics... e.blockMetrics = newBlockMetrics(e.defaultMetricLabels) - e.compactionTracker = newCompactionTracker(e.blockMetrics) + + // Propagate prometheus metrics down into trackers. + e.compactionTracker = newCompactionTracker(e.blockMetrics.compactionMetrics) + e.FileStore.fileTracker = newFileTracker(e.blockMetrics.fileMetrics) + e.scheduler.setCompactionTracker(e.compactionTracker) if err := os.MkdirAll(e.path, 0777); err != nil { @@ -550,7 +554,6 @@ func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector metrics = append(metrics, e.blockMetrics.PrometheusCollectors()...) - // TODO(edd): Add Filestore metrics // TODO(edd): Add Cache metrics // TODO(edd): Add WAL metrics return metrics @@ -1035,7 +1038,7 @@ func (l compactionLevel) String() string { // *NOTE* - compactionTracker fields should not be directory modified. Doing so // could result in the Engine exposing inaccurate metrics. type compactionTracker struct { - metrics *blockMetrics + metrics *compactionMetrics // Note: Compactions are levelled as follows: // 0 – Snapshots @@ -1049,8 +1052,8 @@ type compactionTracker struct { queue [6]uint64 // Gauge of TSM compactions queues (by level). 
} -func newCompactionTracker(blockMetrics *blockMetrics) *compactionTracker { - return &compactionTracker{metrics: blockMetrics} +func newCompactionTracker(metrics *compactionMetrics) *compactionTracker { + return &compactionTracker{metrics: metrics} } // Completed returns the total number of compactions for the provided level. @@ -1090,7 +1093,7 @@ func (t *compactionTracker) Errors(level int) uint64 { return atomic.LoadUint64( func (t *compactionTracker) IncActive(level compactionLevel) { atomic.AddUint64(&t.active[level], 1) - labels := t.metrics.CompactionLabels(level) + labels := t.metrics.Labels(level) t.metrics.CompactionsActive.With(labels).Inc() } @@ -1101,7 +1104,7 @@ func (t *compactionTracker) IncFullActive() { t.IncActive(5) } func (t *compactionTracker) DecActive(level compactionLevel) { atomic.AddUint64(&t.active[level], ^uint64(0)) - labels := t.metrics.CompactionLabels(level) + labels := t.metrics.Labels(level) t.metrics.CompactionsActive.With(labels).Dec() } @@ -1113,7 +1116,7 @@ func (t *compactionTracker) Attempted(level compactionLevel, success bool, durat if success { atomic.AddUint64(&t.ok[level], 1) - labels := t.metrics.CompactionLabels(level) + labels := t.metrics.Labels(level) t.metrics.CompactionDuration.With(labels).Observe(duration.Seconds()) labels["status"] = "ok" @@ -1123,7 +1126,7 @@ func (t *compactionTracker) Attempted(level compactionLevel, success bool, durat atomic.AddUint64(&t.errors[level], 1) - labels := t.metrics.CompactionLabels(level) + labels := t.metrics.Labels(level) labels["status"] = "error" t.metrics.Compactions.With(labels).Inc() } @@ -1137,7 +1140,7 @@ func (t *compactionTracker) SnapshotAttempted(success bool, duration time.Durati func (t *compactionTracker) SetQueue(level compactionLevel, length uint64) { atomic.StoreUint64(&t.queue[level], length) - labels := t.metrics.CompactionLabels(level) + labels := t.metrics.Labels(level) t.metrics.CompactionQueue.With(labels).Set(float64(length)) } diff --git 
a/tsdb/tsm1/file_store.go b/tsdb/tsm1/file_store.go index 8934949731..e25afba80e 100644 --- a/tsdb/tsm1/file_store.go +++ b/tsdb/tsm1/file_store.go @@ -17,7 +17,6 @@ import ( "sync/atomic" "time" - "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/file" "github.com/influxdata/platform/pkg/limiter" "github.com/influxdata/platform/pkg/metrics" @@ -160,12 +159,6 @@ type FileStoreObserver interface { FileUnlinking(path string) error } -// Statistics gathered by the FileStore. -const ( - statFileStoreBytes = "diskBytes" - statFileStoreCount = "numFiles" -) - var ( floatBlocksDecodedCounter = metrics.MustRegisterCounter("float_blocks_decoded", metrics.WithGroup(tsmGroup)) floatBlocksSizeCounter = metrics.MustRegisterCounter("float_blocks_size_bytes", metrics.WithGroup(tsmGroup)) @@ -198,8 +191,8 @@ type FileStore struct { traceLogger *zap.Logger // Logger to be used when trace-logging is on. traceLogging bool - stats *FileStoreStatistics - purger *purger + fileTracker *fileTracker + purger *purger currentTempDirID int @@ -242,7 +235,6 @@ func NewFileStore(dir string) *FileStore { logger: logger, traceLogger: logger, openLimiter: limiter.NewFixed(runtime.GOMAXPROCS(0)), - stats: &FileStoreStatistics{}, purger: &purger{ files: map[string]TSMFile{}, logger: logger, @@ -290,20 +282,53 @@ func (f *FileStore) WithLogger(log *zap.Logger) { // FileStoreStatistics keeps statistics about the file store. type FileStoreStatistics struct { - DiskBytes int64 - FileCount int64 + SDiskBytes int64 + SFileCount int64 } -// Statistics returns statistics for periodic monitoring. -func (f *FileStore) Statistics(tags map[string]string) []models.Statistic { - return []models.Statistic{{ - Name: "tsm1_filestore", - Tags: tags, - Values: map[string]interface{}{ - statFileStoreBytes: atomic.LoadInt64(&f.stats.DiskBytes), - statFileStoreCount: atomic.LoadInt64(&f.stats.FileCount), - }, - }} +// fileTracker tracks file counts and sizes within the FileStore. 
+// +// As well as being responsible for providing atomic reads and writes to the +// statistics, fileTracker also mirrors any changes to the external prometheus +// metrics, which the Engine exposes. +// +// *NOTE* - fileTracker fields should not be directly modified. Doing so +// could result in the Engine exposing inaccurate metrics. +type fileTracker struct { + metrics *fileMetrics + diskBytes uint64 + fileCount uint64 +} + +func newFileTracker(metrics *fileMetrics) *fileTracker { + return &fileTracker{metrics: metrics} +} + +// Bytes returns the number of bytes in use on disk. +func (t *fileTracker) Bytes() uint64 { return atomic.LoadUint64(&t.diskBytes) } + +// SetBytes sets the number of bytes in use on disk. +func (t *fileTracker) SetBytes(bytes uint64) { + atomic.StoreUint64(&t.diskBytes, bytes) + + labels := t.metrics.Labels() + t.metrics.DiskSize.With(labels).Set(float64(bytes)) +} + +// AddBytes increases the number of bytes. +func (t *fileTracker) AddBytes(bytes uint64) { + atomic.AddUint64(&t.diskBytes, bytes) + + labels := t.metrics.Labels() + t.metrics.DiskSize.With(labels).Add(float64(bytes)) +} + +// SetFileCount sets the number of files in the FileStore. +func (t *fileTracker) SetFileCount(files uint64) { + atomic.StoreUint64(&t.fileCount, files) + + labels := t.metrics.Labels() + t.metrics.Files.With(labels).Set(float64(files)) } // Count returns the number of TSM files currently loaded.
@@ -581,10 +606,11 @@ func (f *FileStore) Open() error { f.files = append(f.files, res.r) // Accumulate file store size stats - atomic.AddInt64(&f.stats.DiskBytes, int64(res.r.Size())) + totalSize := uint64(res.r.Size()) for _, ts := range res.r.TombstoneFiles() { - atomic.AddInt64(&f.stats.DiskBytes, int64(ts.Size)) + totalSize += uint64(ts.Size) } + f.fileTracker.AddBytes(totalSize) // Re-initialize the lastModified time for the file store if res.r.LastModified() > lm { @@ -596,7 +622,7 @@ func (f *FileStore) Open() error { close(readerC) sort.Sort(tsmReaders(f.files)) - atomic.StoreInt64(&f.stats.FileCount, int64(len(f.files))) + f.fileTracker.SetFileCount(uint64(len(f.files))) return nil } @@ -609,7 +635,7 @@ func (f *FileStore) Close() error { f.lastFileStats = nil f.files = nil - atomic.StoreInt64(&f.stats.FileCount, 0) + f.fileTracker.SetFileCount(uint64(0)) // Let other methods access this closed object while we do the actual closing. f.mu.Unlock() @@ -624,9 +650,8 @@ func (f *FileStore) Close() error { return nil } -func (f *FileStore) DiskSizeBytes() int64 { - return atomic.LoadInt64(&f.stats.DiskBytes) -} +// DiskSizeBytes returns the total number of bytes consumed by the files in the FileStore. +func (f *FileStore) DiskSizeBytes() int64 { return int64(f.fileTracker.Bytes()) } // Read returns the slice of values for the given key and the given timestamp, // if any file matches those constraints. 
@@ -878,18 +903,18 @@ func (f *FileStore) replace(oldFiles, newFiles []string, updatedFn func(r []TSMF f.lastFileStats = nil f.files = active sort.Sort(tsmReaders(f.files)) - atomic.StoreInt64(&f.stats.FileCount, int64(len(f.files))) + f.fileTracker.SetFileCount(uint64(len(f.files))) // Recalculate the disk size stat - var totalSize int64 + var totalSize uint64 for _, file := range f.files { - totalSize += int64(file.Size()) + totalSize += uint64(file.Size()) for _, ts := range file.TombstoneFiles() { - totalSize += int64(ts.Size) + totalSize += uint64(ts.Size) } } - atomic.StoreInt64(&f.stats.DiskBytes, totalSize) + f.fileTracker.SetBytes(totalSize) return nil } diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index 6f64ce95cb..6a90c9990c 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -10,20 +10,44 @@ import ( // namespace is the leading part of all published metrics for the Storage service. const namespace = "storage" -const blockSubsystem = "block" // sub-system associated with metrics for block storage. +const compactionSubsystem = "compactions" // sub-system associated with metrics for compactions +const fileStoreSubsystem = "tsm_files" // sub-system associated with metrics for compactions // blockMetrics are a set of metrics concerned with tracking data about block storage. type blockMetrics struct { - labels prometheus.Labels // Read only. + labels prometheus.Labels + *compactionMetrics + *fileMetrics +} +// newBlockMetrics initialises the prometheus metrics for the block subsystem. +func newBlockMetrics(labels prometheus.Labels) *blockMetrics { + return &blockMetrics{ + labels: labels, + compactionMetrics: newCompactionMetrics(labels), + fileMetrics: newFileMetrics(labels), + } +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. 
+func (m *blockMetrics) PrometheusCollectors() []prometheus.Collector { + var metrics []prometheus.Collector + metrics = append(metrics, m.compactionMetrics.PrometheusCollectors()...) + metrics = append(metrics, m.fileMetrics.PrometheusCollectors()...) + return metrics +} + +// compactionMetrics are a set of metrics concerned with tracking data about compactions. +type compactionMetrics struct { + labels prometheus.Labels // Read Only Compactions *prometheus.CounterVec CompactionsActive *prometheus.GaugeVec CompactionDuration *prometheus.HistogramVec CompactionQueue *prometheus.GaugeVec } -// newBlockMetrics initialises the prometheus metrics for the block subsystem. -func newBlockMetrics(labels prometheus.Labels) *blockMetrics { +// newCompactionMetrics initialises the prometheus metrics for compactions. +func newCompactionMetrics(labels prometheus.Labels) *compactionMetrics { compactionNames := []string{"level"} // All compaction metrics have a `level` label. for k := range labels { compactionNames = append(compactionNames, k) @@ -32,41 +56,41 @@ func newBlockMetrics(labels prometheus.Labels) *blockMetrics { totalCompactionsNames := append(compactionNames, "status") sort.Strings(totalCompactionsNames) - return &blockMetrics{ + return &compactionMetrics{ labels: labels, Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, - Subsystem: blockSubsystem, - Name: "compactions_total", + Subsystem: compactionSubsystem, + Name: "total", Help: "Number of times cache snapshotted or TSM compaction attempted.", }, totalCompactionsNames), CompactionsActive: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, - Subsystem: blockSubsystem, - Name: "compactions_active", + Subsystem: compactionSubsystem, + Name: "active", Help: "Number of active compactions.", }, compactionNames), CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: namespace, - Subsystem: blockSubsystem, - Name: 
"compaction_duration_seconds", + Subsystem: compactionSubsystem, + Name: "duration_seconds", Help: "Time taken for a successful compaction or snapshot.", // 30 buckets spaced exponentially between 5s and ~53 minutes. Buckets: prometheus.ExponentialBuckets(5.0, 1.25, 30), }, compactionNames), CompactionQueue: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, - Subsystem: blockSubsystem, - Name: "compactions_queued", + Subsystem: compactionSubsystem, + Name: "queued", Help: "Number of queued compactions.", }, compactionNames), } } -// CompactionLabels returns a copy of labels for use with compaction metrics. -func (b *blockMetrics) CompactionLabels(level compactionLevel) prometheus.Labels { - l := make(map[string]string, len(b.labels)) - for k, v := range b.labels { +// Labels returns a copy of labels for use with compaction metrics. +func (m *compactionMetrics) Labels(level compactionLevel) prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { l[k] = v } l["level"] = fmt.Sprint(level) @@ -74,11 +98,60 @@ func (b *blockMetrics) CompactionLabels(level compactionLevel) prometheus.Labels } // PrometheusCollectors satisfies the prom.PrometheusCollector interface. -func (b *blockMetrics) PrometheusCollectors() []prometheus.Collector { +func (m *compactionMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ - b.Compactions, - b.CompactionsActive, - b.CompactionDuration, - b.CompactionQueue, + m.Compactions, + m.CompactionsActive, + m.CompactionDuration, + m.CompactionQueue, + } +} + +// fileMetrics are a set of metrics concerned with tracking data about compactions. +type fileMetrics struct { + labels prometheus.Labels + DiskSize *prometheus.GaugeVec + Files *prometheus.GaugeVec +} + +// newFileMetrics initialises the prometheus metrics for tracking files on disk. 
+func newFileMetrics(labels prometheus.Labels) *fileMetrics { + var names []string + for k := range labels { + names = append(names, k) + } + sort.Strings(names) + + return &fileMetrics{ + labels: labels, + DiskSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: fileStoreSubsystem, + Name: "disk_bytes", + Help: "Number of bytes TSM files using on disk.", + }, names), + Files: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: fileStoreSubsystem, + Name: "total", + Help: "Number of files.", + }, names), + } +} + +// Labels returns a copy of labels for use with file metrics. +func (m *fileMetrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (m *fileMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + m.DiskSize, + m.Files, } } From 3b980ed7e35e7a8985fcb862156901dbbd89fe40 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 1 Nov 2018 18:58:56 +0000 Subject: [PATCH 03/25] Convert Cache statistics --- tsdb/tsm1/cache.go | 330 ++++++++++++++++++++++-------------- tsdb/tsm1/cache_test.go | 14 +- tsdb/tsm1/engine.go | 3 +- tsdb/tsm1/file_store.go | 1 + tsdb/tsm1/metrics.go | 110 +++++++++++- tsdb/tsm1/scheduler.go | 5 +- tsdb/tsm1/scheduler_test.go | 19 +-- 7 files changed, 325 insertions(+), 157 deletions(-) diff --git a/tsdb/tsm1/cache.go b/tsdb/tsm1/cache.go index a075ed21e8..455c38d7b6 100644 --- a/tsdb/tsm1/cache.go +++ b/tsdb/tsm1/cache.go @@ -143,25 +143,6 @@ func (e *entry) InfluxQLType() (influxql.DataType, error) { return e.values.InfluxQLType() } -// Statistics gathered by the Cache. 
-const ( - // levels - point in time measures - - statCacheMemoryBytes = "memBytes" // level: Size of in-memory cache in bytes - statCacheDiskBytes = "diskBytes" // level: Size of on-disk snapshots in bytes - statSnapshots = "snapshotCount" // level: Number of active snapshots. - statCacheAgeMs = "cacheAgeMs" // level: Number of milliseconds since cache was last snapshoted at sample time - - // counters - accumulative measures - - statCachedBytes = "cachedBytes" // counter: Total number of bytes written into snapshots. - statWALCompactionTimeMs = "WALCompactionTimeMs" // counter: Total number of milliseconds spent compacting snapshots - - statCacheWriteOK = "writeOk" - statCacheWriteErr = "writeErr" - statCacheWriteDropped = "writeDropped" -) - // storer is the interface that descibes a cache's store. type storer interface { entry(key []byte) *entry // Get an entry by its key. @@ -178,12 +159,7 @@ type storer interface { // Cache maintains an in-memory store of Values for a set of keys. type Cache struct { - // Due to a bug in atomic size needs to be the first word in the struct, as - // that's the only place where you're guaranteed to be 64-bit aligned on a - // 32 bit system. See: https://golang.org/pkg/sync/atomic/#pkg-note-BUG - size uint64 - snapshotSize uint64 - + _ uint64 // Padding for 32 bit struct alignment mu sync.RWMutex store storer maxSize uint64 @@ -194,10 +170,7 @@ type Cache struct { snapshot *Cache snapshotting bool - // This number is the number of pending or failed WriteSnaphot attempts since the last successful one. 
- snapshotAttempts int - - stats *CacheStatistics + cacheTracker *cacheTracker lastSnapshot time.Time lastWriteTime time.Time @@ -213,50 +186,13 @@ func NewCache(maxSize uint64) *Cache { c := &Cache{ maxSize: maxSize, store: emptyStore{}, - stats: &CacheStatistics{}, lastSnapshot: time.Now(), + cacheTracker: newCacheTracker(newCacheMetrics(nil)), } c.initialize.Store(&sync.Once{}) - c.UpdateAge() - c.UpdateCompactTime(0) - c.updateCachedBytes(0) - c.updateMemSize(0) - c.updateSnapshots() return c } -// CacheStatistics hold statistics related to the cache. -type CacheStatistics struct { - MemSizeBytes int64 - DiskSizeBytes int64 - SnapshotCount int64 - CacheAgeMs int64 - CachedBytes int64 - WALCompactionTimeMs int64 - WriteOK int64 - WriteErr int64 - WriteDropped int64 -} - -// Statistics returns statistics for periodic monitoring. -func (c *Cache) Statistics(tags map[string]string) []models.Statistic { - return []models.Statistic{{ - Name: "tsm1_cache", - Tags: tags, - Values: map[string]interface{}{ - statCacheMemoryBytes: atomic.LoadInt64(&c.stats.MemSizeBytes), - statCacheDiskBytes: atomic.LoadInt64(&c.stats.DiskSizeBytes), - statSnapshots: atomic.LoadInt64(&c.stats.SnapshotCount), - statCacheAgeMs: atomic.LoadInt64(&c.stats.CacheAgeMs), - statCachedBytes: atomic.LoadInt64(&c.stats.CachedBytes), - statWALCompactionTimeMs: atomic.LoadInt64(&c.stats.WALCompactionTimeMs), - statCacheWriteOK: atomic.LoadInt64(&c.stats.WriteOK), - statCacheWriteErr: atomic.LoadInt64(&c.stats.WriteErr), - statCacheWriteDropped: atomic.LoadInt64(&c.stats.WriteDropped), - }, - }} -} - // init initializes the cache and allocates the underlying store. Once initialized, // the store re-used until Freed. 
func (c *Cache) init() { @@ -291,13 +227,15 @@ func (c *Cache) Write(key []byte, values []Value) error { n := c.Size() + addedSize if limit > 0 && n > limit { - atomic.AddInt64(&c.stats.WriteErr, 1) + c.cacheTracker.IncWritesErr() + c.cacheTracker.AddWrittenBytesDrop(uint64(addedSize)) return ErrCacheMemorySizeLimitExceeded(n, limit) } newKey, err := c.store.write(key, values) if err != nil { - atomic.AddInt64(&c.stats.WriteErr, 1) + c.cacheTracker.IncWritesErr() + c.cacheTracker.AddWrittenBytesErr(uint64(addedSize)) return err } @@ -305,9 +243,10 @@ func (c *Cache) Write(key []byte, values []Value) error { addedSize += uint64(len(key)) } // Update the cache size and the memory size stat. - c.increaseSize(addedSize) - c.updateMemSize(int64(addedSize)) - atomic.AddInt64(&c.stats.WriteOK, 1) + c.cacheTracker.IncCacheSize(addedSize) + c.cacheTracker.AddMemBytes(addedSize) + c.cacheTracker.AddWrittenBytesOK(uint64(addedSize)) + c.cacheTracker.IncWritesOK() return nil } @@ -328,7 +267,8 @@ func (c *Cache) WriteMulti(values map[string][]Value) error { limit := c.maxSize // maxSize is safe for reading without a lock. n := c.Size() + addedSize if limit > 0 && n > limit { - atomic.AddInt64(&c.stats.WriteErr, 1) + c.cacheTracker.IncWritesErr() + c.cacheTracker.AddWrittenBytesDrop(uint64(addedSize)) return ErrCacheMemorySizeLimitExceeded(n, limit) } @@ -337,32 +277,36 @@ func (c *Cache) WriteMulti(values map[string][]Value) error { store := c.store c.mu.RUnlock() - // We'll optimistially set size here, and then decrement it for write errors. - c.increaseSize(addedSize) + var bytesWrittenErr uint64 + + // We'll optimistically set size here, and then decrement it for write errors. for k, v := range values { newKey, err := store.write([]byte(k), v) if err != nil { // The write failed, hold onto the error and adjust the size delta. 
werr = err addedSize -= uint64(Values(v).Size()) - c.decreaseSize(uint64(Values(v).Size())) + bytesWrittenErr += uint64(Values(v).Size()) } + if newKey { addedSize += uint64(len(k)) - c.increaseSize(uint64(len(k))) } } // Some points in the batch were dropped. An error is returned so // error stat is incremented as well. if werr != nil { - atomic.AddInt64(&c.stats.WriteDropped, 1) - atomic.AddInt64(&c.stats.WriteErr, 1) + c.cacheTracker.IncWritesErr() + c.cacheTracker.IncWritesDrop() + c.cacheTracker.AddWrittenBytesErr(bytesWrittenErr) } // Update the memory size stat - c.updateMemSize(int64(addedSize)) - atomic.AddInt64(&c.stats.WriteOK, 1) + c.cacheTracker.IncCacheSize(addedSize) + c.cacheTracker.AddMemBytes(addedSize) + c.cacheTracker.IncWritesOK() + c.cacheTracker.AddWrittenBytesOK(addedSize) c.mu.Lock() c.lastWriteTime = time.Now() @@ -384,7 +328,7 @@ func (c *Cache) Snapshot() (*Cache, error) { } c.snapshotting = true - c.snapshotAttempts++ // increment the number of times we tried to do this + c.cacheTracker.IncSnapshotsActive() // increment the number of times we tried to do this // If no snapshot exists, create a new one, otherwise update the existing snapshot if c.snapshot == nil { @@ -393,8 +337,10 @@ func (c *Cache) Snapshot() (*Cache, error) { return nil, err } + newMetrics := newCacheMetrics(c.cacheTracker.metrics.Labels()) c.snapshot = &Cache{ - store: store, + store: store, + cacheTracker: newCacheTracker(newMetrics), } } @@ -407,18 +353,17 @@ func (c *Cache) Snapshot() (*Cache, error) { c.snapshot.store, c.store = c.store, c.snapshot.store snapshotSize := c.Size() - // Save the size of the snapshot on the snapshot cache - atomic.StoreUint64(&c.snapshot.size, snapshotSize) - // Save the size of the snapshot on the live cache - atomic.StoreUint64(&c.snapshotSize, snapshotSize) + c.snapshot.cacheTracker.SetSnapshotSize(snapshotSize) // Save the size of the snapshot on the snapshot cache + c.cacheTracker.SetSnapshotSize(snapshotSize) // Save the size 
of the snapshot on the live cache // Reset the cache's store. c.store.reset() - atomic.StoreUint64(&c.size, 0) + c.cacheTracker.SetCacheSize(0) c.lastSnapshot = time.Now() - c.updateCachedBytes(snapshotSize) // increment the number of bytes added to the snapshot - c.updateSnapshots() + c.cacheTracker.AddSnapshottedBytes(snapshotSize) // increment the number of bytes added to the snapshot + c.cacheTracker.SetDiskBytes(0) + c.cacheTracker.SetSnapshotsActive(0) return c.snapshot, nil } @@ -455,33 +400,26 @@ func (c *Cache) ClearSnapshot(success bool) { c.snapshotting = false if success { - c.snapshotAttempts = 0 - c.updateMemSize(-int64(atomic.LoadUint64(&c.snapshotSize))) // decrement the number of bytes in cache + snapshotSize := c.cacheTracker.SnapshotSize() + c.cacheTracker.SetSnapshotsActive(0) + c.cacheTracker.SubMemBytes(snapshotSize) // decrement the number of bytes in cache // Reset the snapshot to a fresh Cache. + newMetrics := newCacheMetrics(c.cacheTracker.metrics.Labels()) c.snapshot = &Cache{ - store: c.snapshot.store, + store: c.snapshot.store, + cacheTracker: newCacheTracker(newMetrics), } - atomic.StoreUint64(&c.snapshotSize, 0) - c.updateSnapshots() + c.cacheTracker.SetSnapshotSize(0) + c.cacheTracker.SetDiskBytes(0) + c.cacheTracker.SetSnapshotsActive(0) } } // Size returns the number of point-calcuated bytes the cache currently uses. func (c *Cache) Size() uint64 { - return atomic.LoadUint64(&c.size) + atomic.LoadUint64(&c.snapshotSize) -} - -// increaseSize increases size by delta. -func (c *Cache) increaseSize(delta uint64) { - atomic.AddUint64(&c.size, delta) -} - -// decreaseSize decreases size by delta. -func (c *Cache) decreaseSize(delta uint64) { - // Per sync/atomic docs, bit-flip delta minus one to perform subtraction within AddUint64. - atomic.AddUint64(&c.size, ^(delta - 1)) + return c.cacheTracker.CacheSize() + c.cacheTracker.SnapshotSize() } // MaxSize returns the maximum number of bytes the cache may consume. 
@@ -623,6 +561,7 @@ func (c *Cache) DeleteRange(keys [][]byte, min, max int64) { c.mu.Lock() defer c.mu.Unlock() + var total uint64 for _, k := range keys { // Make sure key exist in the cache, skip if it does not e := c.store.entry(k) @@ -630,23 +569,28 @@ func (c *Cache) DeleteRange(keys [][]byte, min, max int64) { continue } - origSize := uint64(e.size()) + total += uint64(e.size()) + // Everything is being deleted. if min == math.MinInt64 && max == math.MaxInt64 { - c.decreaseSize(origSize + uint64(len(k))) + total += uint64(len(k)) // all entries and the key. c.store.remove(k) continue } + // Filter what to delete by time range. e.filter(min, max) if e.count() == 0 { + // Nothing left in cache for that key + total += uint64(len(k)) // all entries and the key. c.store.remove(k) - c.decreaseSize(origSize + uint64(len(k))) continue } - c.decreaseSize(origSize - uint64(e.size())) + // Just update what is being deleted by the size of the filtered entries. + total -= uint64(e.size()) } - atomic.StoreInt64(&c.stats.MemSizeBytes, int64(c.Size())) + c.cacheTracker.DecCacheSize(total) // Decrease the live cache size. + c.cacheTracker.SetMemBytes(uint64(c.Size())) } // SetMaxSize updates the memory limit of the cache. @@ -777,23 +721,156 @@ func (c *Cache) LastWriteTime() time.Time { func (c *Cache) UpdateAge() { c.mu.RLock() defer c.mu.RUnlock() - ageStat := int64(time.Since(c.lastSnapshot) / time.Millisecond) - atomic.StoreInt64(&c.stats.CacheAgeMs, ageStat) + c.cacheTracker.SetAge(time.Since(c.lastSnapshot)) } -// UpdateCompactTime updates WAL compaction time statistic based on d. -func (c *Cache) UpdateCompactTime(d time.Duration) { - atomic.AddInt64(&c.stats.WALCompactionTimeMs, int64(d/time.Millisecond)) +// cacheTracker tracks writes to the cache and snapshots. +// +// As well as being responsible for providing atomic reads and writes to the +// statistics, cacheTracker also mirrors any changes to the external prometheus +// metrics, which the Engine exposes. 
+// +// *NOTE* - cacheTracker fields should not be directory modified. Doing so +// could result in the Engine exposing inaccurate metrics. +type cacheTracker struct { + metrics *cacheMetrics + snapshotsActive uint64 + snapshotSize uint64 + cacheSize uint64 + + // Used in testing. + memSizeBytes uint64 + snapshottedBytes uint64 + writesDropped uint64 + writesErr uint64 } -// updateCachedBytes increases the cachedBytes counter by b. -func (c *Cache) updateCachedBytes(b uint64) { - atomic.AddInt64(&c.stats.CachedBytes, int64(b)) +func newCacheTracker(metrics *cacheMetrics) *cacheTracker { + return &cacheTracker{metrics: metrics} } -// updateMemSize updates the memSize level by b. -func (c *Cache) updateMemSize(b int64) { - atomic.AddInt64(&c.stats.MemSizeBytes, b) +// AddMemBytes increases the number of in-memory cache bytes. +func (t *cacheTracker) AddMemBytes(bytes uint64) { + atomic.AddUint64(&t.memSizeBytes, bytes) + + labels := t.metrics.Labels() + t.metrics.MemSize.With(labels).Add(float64(bytes)) +} + +// SubMemBytes decreases the number of in-memory cache bytes. +func (t *cacheTracker) SubMemBytes(bytes uint64) { + atomic.AddUint64(&t.memSizeBytes, ^(bytes - 1)) + + labels := t.metrics.Labels() + t.metrics.MemSize.With(labels).Sub(float64(bytes)) +} + +// SetMemBytes sets the number of in-memory cache bytes. +func (t *cacheTracker) SetMemBytes(bytes uint64) { + atomic.StoreUint64(&t.memSizeBytes, bytes) + + labels := t.metrics.Labels() + t.metrics.MemSize.With(labels).Set(float64(bytes)) +} + +// AddBytesWritten increases the number of bytes written to the cache. +func (t *cacheTracker) AddBytesWritten(bytes uint64) { + labels := t.metrics.Labels() + t.metrics.MemSize.With(labels).Add(float64(bytes)) +} + +// AddSnapshottedBytes increases the number of bytes snapshotted. 
+func (t *cacheTracker) AddSnapshottedBytes(bytes uint64) { + atomic.AddUint64(&t.snapshottedBytes, bytes) + + labels := t.metrics.Labels() + t.metrics.SnapshottedBytes.With(labels).Add(float64(bytes)) +} + +// SetDiskBytes sets the number of bytes on disk used by snapshot data. +func (t *cacheTracker) SetDiskBytes(bytes uint64) { + labels := t.metrics.Labels() + t.metrics.DiskSize.With(labels).Set(float64(bytes)) +} + +// IncSnapshotsActive increases the number of active snapshots. +func (t *cacheTracker) IncSnapshotsActive() { + atomic.AddUint64(&t.snapshotsActive, 1) + + labels := t.metrics.Labels() + t.metrics.SnapshotsActive.With(labels).Inc() +} + +// SetSnapshotsActive sets the number of bytes on disk used by snapshot data. +func (t *cacheTracker) SetSnapshotsActive(n uint64) { + atomic.StoreUint64(&t.snapshotsActive, n) + + labels := t.metrics.Labels() + t.metrics.SnapshotsActive.With(labels).Set(float64(n)) +} + +// AddWrittenBytes increases the number of bytes written to the cache, with a required status. +func (t *cacheTracker) AddWrittenBytes(status string, bytes uint64) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.WrittenBytes.With(labels).Add(float64(bytes)) +} + +// AddWrittenBytesOK increments the number of successful writes. +func (t *cacheTracker) AddWrittenBytesOK(bytes uint64) { t.AddWrittenBytes("ok", bytes) } + +// AddWrittenBytesError increments the number of writes that encountered an error. +func (t *cacheTracker) AddWrittenBytesErr(bytes uint64) { t.AddWrittenBytes("error", bytes) } + +// AddWrittenBytesDrop increments the number of writes that were dropped. +func (t *cacheTracker) AddWrittenBytesDrop(bytes uint64) { t.AddWrittenBytes("dropped", bytes) } + +// IncWrites increments the number of writes to the cache, with a required status. 
+func (t *cacheTracker) IncWrites(status string) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.Writes.With(labels).Inc() +} + +// IncWritesOK increments the number of successful writes. +func (t *cacheTracker) IncWritesOK() { t.IncWrites("ok") } + +// IncWritesError increments the number of writes that encountered an error. +func (t *cacheTracker) IncWritesErr() { + atomic.AddUint64(&t.writesErr, 1) + + t.IncWrites("error") +} + +// IncWritesDrop increments the number of writes that were dropped. +func (t *cacheTracker) IncWritesDrop() { + atomic.AddUint64(&t.writesDropped, 1) + + t.IncWrites("dropped") +} + +// CacheSize returns the live cache size. +func (t *cacheTracker) CacheSize() uint64 { return atomic.LoadUint64(&t.cacheSize) } + +// IncCacheSize increases the live cache size by sz bytes. +func (t *cacheTracker) IncCacheSize(sz uint64) { atomic.AddUint64(&t.cacheSize, sz) } + +// DecCacheSize decreases the live cache size by sz bytes. +func (t *cacheTracker) DecCacheSize(sz uint64) { atomic.AddUint64(&t.cacheSize, ^(sz - 1)) } + +// SetCacheSize sets the live cache size to sz. +func (t *cacheTracker) SetCacheSize(sz uint64) { atomic.StoreUint64(&t.cacheSize, sz) } + +// SetSnapshotSize sets the last successful snapshot size. +func (t *cacheTracker) SetSnapshotSize(sz uint64) { atomic.StoreUint64(&t.snapshotSize, sz) } + +// SnapshotSize returns the last successful snapshot size. +func (t *cacheTracker) SnapshotSize() uint64 { return atomic.LoadUint64(&t.snapshotSize) } + +// SetAge sets the time since the last successful snapshot +func (t *cacheTracker) SetAge(d time.Duration) { + labels := t.metrics.Labels() + t.metrics.Age.With(labels).Set(d.Seconds()) } func valueType(v Value) byte { @@ -811,13 +888,6 @@ func valueType(v Value) byte { } } -// updateSnapshots updates the snapshotsCount and the diskSize levels. 
-func (c *Cache) updateSnapshots() { - // Update disk stats - atomic.StoreInt64(&c.stats.DiskSizeBytes, int64(atomic.LoadUint64(&c.snapshotSize))) - atomic.StoreInt64(&c.stats.SnapshotCount, int64(c.snapshotAttempts)) -} - type emptyStore struct{} func (e emptyStore) entry(key []byte) *entry { return nil } diff --git a/tsdb/tsm1/cache_test.go b/tsdb/tsm1/cache_test.go index 0f0dff9673..38f243ea92 100644 --- a/tsdb/tsm1/cache_test.go +++ b/tsdb/tsm1/cache_test.go @@ -138,9 +138,9 @@ func TestCache_WriteMulti_Stats(t *testing.T) { } // Write stats updated - if got, exp := c.stats.WriteDropped, int64(1); got != exp { + if got, exp := atomic.LoadUint64(&c.cacheTracker.writesDropped), uint64(1); got != exp { t.Fatalf("got %v, expected %v", got, exp) - } else if got, exp := c.stats.WriteErr, int64(1); got != exp { + } else if got, exp := atomic.LoadUint64(&c.cacheTracker.writesErr), uint64(1); got != exp { t.Fatalf("got %v, expected %v", got, exp) } } @@ -190,11 +190,11 @@ func TestCache_Cache_DeleteRange(t *testing.T) { c.DeleteRange([][]byte{[]byte("bar")}, 2, math.MaxInt64) if exp, keys := [][]byte{[]byte("bar"), []byte("foo")}, c.Keys(); !reflect.DeepEqual(keys, exp) { - t.Fatalf("cache keys incorrect after 2 writes, exp %v, got %v", exp, keys) + t.Fatalf("cache keys incorrect after delete, exp %v, got %v", exp, keys) } if got, exp := c.Size(), valuesSize+uint64(v0.Size())+6; exp != got { - t.Fatalf("cache size incorrect after 2 writes, exp %d, got %d", exp, got) + t.Fatalf("cache size incorrect after delete, exp %d, got %d", exp, got) } if got, exp := len(c.Values([]byte("bar"))), 1; got != exp { @@ -479,7 +479,7 @@ func TestCache_Snapshot_Stats(t *testing.T) { t.Fatal(err) } - if got, exp := c.stats.MemSizeBytes, int64(16)+3; got != exp { + if got, exp := atomic.LoadUint64(&c.cacheTracker.memSizeBytes), uint64(16)+3; got != exp { t.Fatalf("got %v, expected %v", got, exp) } @@ -494,11 +494,11 @@ func TestCache_Snapshot_Stats(t *testing.T) { } // Cached bytes should 
have been increased. - if got, exp := c.stats.CachedBytes, int64(16)+3; got != exp { + if got, exp := atomic.LoadUint64(&c.cacheTracker.snapshottedBytes), uint64(16)+3; got != exp { t.Fatalf("got %v, expected %v", got, exp) } - if got, exp := c.stats.MemSizeBytes, int64(16)+3; got != exp { + if got, exp := atomic.LoadUint64(&c.cacheTracker.memSizeBytes), uint64(16)+3; got != exp { t.Fatalf("got %v, expected %v", got, exp) } } diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index 50096ca0a5..1f99609d24 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -501,6 +501,7 @@ func (e *Engine) Open() error { // Propagate prometheus metrics down into trackers. e.compactionTracker = newCompactionTracker(e.blockMetrics.compactionMetrics) e.FileStore.fileTracker = newFileTracker(e.blockMetrics.fileMetrics) + e.Cache.cacheTracker = newCacheTracker(e.blockMetrics.cacheMetrics) e.scheduler.setCompactionTracker(e.compactionTracker) @@ -554,7 +555,6 @@ func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector metrics = append(metrics, e.blockMetrics.PrometheusCollectors()...) 
- // TODO(edd): Add Cache metrics // TODO(edd): Add WAL metrics return metrics } @@ -1161,7 +1161,6 @@ func (e *Engine) WriteSnapshot() error { log, logEnd := logger.NewOperation(e.logger, "Cache snapshot", "tsm1_cache_snapshot") defer func() { elapsed := time.Since(started) - e.Cache.UpdateCompactTime(elapsed) log.Info("Snapshot for path written", zap.String("path", e.path), zap.Duration("duration", elapsed)) diff --git a/tsdb/tsm1/file_store.go b/tsdb/tsm1/file_store.go index e25afba80e..733c2dc0c3 100644 --- a/tsdb/tsm1/file_store.go +++ b/tsdb/tsm1/file_store.go @@ -241,6 +241,7 @@ func NewFileStore(dir string) *FileStore { }, obs: noFileStoreObserver{}, parseFileName: DefaultParseFileName, + fileTracker: newFileTracker(newFileMetrics(nil)), } fs.purger.fileStore = fs return fs diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index 6a90c9990c..7d832a6bfd 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -10,14 +10,16 @@ import ( // namespace is the leading part of all published metrics for the Storage service. const namespace = "storage" -const compactionSubsystem = "compactions" // sub-system associated with metrics for compactions -const fileStoreSubsystem = "tsm_files" // sub-system associated with metrics for compactions +const compactionSubsystem = "compactions" // sub-system associated with metrics for compactions. +const fileStoreSubsystem = "tsm_files" // sub-system associated with metrics for TSM files. +const cacheSubsystem = "cache" // sub-system associated with metrics for the cache. // blockMetrics are a set of metrics concerned with tracking data about block storage. type blockMetrics struct { labels prometheus.Labels *compactionMetrics *fileMetrics + *cacheMetrics } // newBlockMetrics initialises the prometheus metrics for the block subsystem. 
@@ -26,6 +28,7 @@ func newBlockMetrics(labels prometheus.Labels) *blockMetrics { labels: labels, compactionMetrics: newCompactionMetrics(labels), fileMetrics: newFileMetrics(labels), + cacheMetrics: newCacheMetrics(labels), } } @@ -34,16 +37,20 @@ func (m *blockMetrics) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector metrics = append(metrics, m.compactionMetrics.PrometheusCollectors()...) metrics = append(metrics, m.fileMetrics.PrometheusCollectors()...) + metrics = append(metrics, m.cacheMetrics.PrometheusCollectors()...) return metrics } // compactionMetrics are a set of metrics concerned with tracking data about compactions. type compactionMetrics struct { - labels prometheus.Labels // Read Only - Compactions *prometheus.CounterVec + labels prometheus.Labels // Read Only + CompactionsActive *prometheus.GaugeVec CompactionDuration *prometheus.HistogramVec CompactionQueue *prometheus.GaugeVec + + // The following metrics include a ``"status" = {ok, error, dropped}` label + Compactions *prometheus.CounterVec } // newCompactionMetrics initialises the prometheus metrics for compactions. @@ -155,3 +162,98 @@ func (m *fileMetrics) PrometheusCollectors() []prometheus.Collector { m.Files, } } + +// cacheMetrics are a set of metrics concerned with tracking data about the TSM Cache. +type cacheMetrics struct { + labels prometheus.Labels // Read Only + + MemSize *prometheus.GaugeVec + DiskSize *prometheus.GaugeVec + SnapshotsActive *prometheus.GaugeVec + Age *prometheus.GaugeVec + SnapshottedBytes *prometheus.CounterVec + + // The following metrics include a ``"status" = {ok, error, dropped}` label + WrittenBytes *prometheus.CounterVec + Writes *prometheus.CounterVec +} + +// newCacheMetrics initialises the prometheus metrics for compactions. 
+func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { + var names []string + for k := range labels { + names = append(names, k) + } + sort.Strings(names) + + writeNames := append(names, "status") + sort.Strings(writeNames) + + return &cacheMetrics{ + labels: labels, + MemSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "inuse_bytes", + Help: "In-memory size of cache.", + }, names), + DiskSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "disk_bytes", + Help: "Number of bytes on disk used by snapshot data.", + }, names), + SnapshotsActive: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "snapshots_active", + Help: "Number of active concurrent snapshots (>1 when splitting the cache).", + }, names), + Age: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "age", + Help: "Age of the current cache (time since last snapshot or initialisation).", + }, names), + SnapshottedBytes: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "snapshot_bytes", + Help: "Number of bytes snapshotted.", + }, names), + WrittenBytes: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "written_bytes", + Help: "Number of bytes successfully written to the Cache.", + }, writeNames), + Writes: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "writes", + Help: "Number of writes to the Cache.", + }, writeNames), + } +} + +// Labels returns a copy of labels for use with cache metrics. 
+func (m *cacheMetrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + m.MemSize, + m.DiskSize, + m.SnapshotsActive, + m.Age, + m.SnapshottedBytes, + m.WrittenBytes, + m.Writes, + } +} diff --git a/tsdb/tsm1/scheduler.go b/tsdb/tsm1/scheduler.go index 833f98c817..c4beba403c 100644 --- a/tsdb/tsm1/scheduler.go +++ b/tsdb/tsm1/scheduler.go @@ -13,8 +13,9 @@ type scheduler struct { func newScheduler(maxConcurrency int) *scheduler { return &scheduler{ - maxConcurrency: maxConcurrency, - weights: defaultWeights, + maxConcurrency: maxConcurrency, + weights: defaultWeights, + compactionTracker: newCompactionTracker(newCompactionMetrics(nil)), } } diff --git a/tsdb/tsm1/scheduler_test.go b/tsdb/tsm1/scheduler_test.go index 9ff40b0e5f..97871def85 100644 --- a/tsdb/tsm1/scheduler_test.go +++ b/tsdb/tsm1/scheduler_test.go @@ -3,7 +3,7 @@ package tsm1 import "testing" func TestScheduler_Runnable_Empty(t *testing.T) { - s := newScheduler(&EngineStatistics{}, 1) + s := newScheduler(1) for i := 1; i < 5; i++ { s.setDepth(i, 1) @@ -20,11 +20,10 @@ func TestScheduler_Runnable_Empty(t *testing.T) { } func TestScheduler_Runnable_MaxConcurrency(t *testing.T) { - s := newScheduler(&EngineStatistics{}, 1) + s := newScheduler(1) // level 1 - s.stats = &EngineStatistics{} - s.stats.TSMCompactionsActive[0] = 1 + s.compactionTracker.active[1] = 1 for i := 0; i <= 4; i++ { _, runnable := s.next() if exp, got := false, runnable; exp != got { @@ -33,8 +32,7 @@ func TestScheduler_Runnable_MaxConcurrency(t *testing.T) { } // level 2 - s.stats = &EngineStatistics{} - s.stats.TSMCompactionsActive[1] = 1 + s.compactionTracker.active[2] = 1 for i := 0; i <= 4; i++ { _, runnable := s.next() if exp, got := false, runnable; exp != got 
{ @@ -43,8 +41,7 @@ func TestScheduler_Runnable_MaxConcurrency(t *testing.T) { } // level 3 - s.stats = &EngineStatistics{} - s.stats.TSMCompactionsActive[2] = 1 + s.compactionTracker.active[3] = 1 for i := 0; i <= 4; i++ { _, runnable := s.next() if exp, got := false, runnable; exp != got { @@ -53,8 +50,7 @@ func TestScheduler_Runnable_MaxConcurrency(t *testing.T) { } // optimize - s.stats = &EngineStatistics{} - s.stats.TSMOptimizeCompactionsActive++ + s.compactionTracker.active[4] = 1 for i := 0; i <= 4; i++ { _, runnable := s.next() if exp, got := false, runnable; exp != got { @@ -63,8 +59,7 @@ func TestScheduler_Runnable_MaxConcurrency(t *testing.T) { } // full - s.stats = &EngineStatistics{} - s.stats.TSMFullCompactionsActive++ + s.compactionTracker.active[5] = 1 for i := 0; i <= 4; i++ { _, runnable := s.next() if exp, got := false, runnable; exp != got { From 44e5fbae0af1179632e924d3fb5300ab0d17fa5e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 7 Nov 2018 16:24:47 +0000 Subject: [PATCH 04/25] Convert WAL stats --- storage/engine.go | 1 - tsdb/tsm1/engine.go | 5 ++ tsdb/tsm1/metrics.go | 73 +++++++++++++++++++++++ tsdb/tsm1/wal.go | 134 ++++++++++++++++++++++++++++--------------- 4 files changed, 166 insertions(+), 47 deletions(-) diff --git a/storage/engine.go b/storage/engine.go index 3722a8c894..54eda7b8ae 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -166,7 +166,6 @@ func (e *Engine) WithLogger(log *zap.Logger) { // the engine and its components. func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector - // TODO(edd): Get prom metrics for TSM. // TODO(edd): Get prom metrics for index. // TODO(edd): Get prom metrics for series file. metrics = append(metrics, e.engine.PrometheusCollectors()...) 
diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index 1f99609d24..c9a23a1e29 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -503,6 +503,11 @@ func (e *Engine) Open() error { e.FileStore.fileTracker = newFileTracker(e.blockMetrics.fileMetrics) e.Cache.cacheTracker = newCacheTracker(e.blockMetrics.cacheMetrics) + // Set default metrics on WAL if enabled. + if wal, ok := e.WAL.(*WAL); ok { + wal.tracker = newWALTracker(e.blockMetrics.walMetrics) + } + e.scheduler.setCompactionTracker(e.compactionTracker) if err := os.MkdirAll(e.path, 0777); err != nil { diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index 7d832a6bfd..dc3e5bd8cf 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -13,6 +13,7 @@ const namespace = "storage" const compactionSubsystem = "compactions" // sub-system associated with metrics for compactions. const fileStoreSubsystem = "tsm_files" // sub-system associated with metrics for TSM files. const cacheSubsystem = "cache" // sub-system associated with metrics for the cache. +const walSubsystem = "wal" // sub-system associated with metrics for the WAL. // blockMetrics are a set of metrics concerned with tracking data about block storage. type blockMetrics struct { @@ -20,6 +21,7 @@ type blockMetrics struct { *compactionMetrics *fileMetrics *cacheMetrics + *walMetrics } // newBlockMetrics initialises the prometheus metrics for the block subsystem. @@ -29,6 +31,7 @@ func newBlockMetrics(labels prometheus.Labels) *blockMetrics { compactionMetrics: newCompactionMetrics(labels), fileMetrics: newFileMetrics(labels), cacheMetrics: newCacheMetrics(labels), + walMetrics: newWALMetrics(labels), } } @@ -38,6 +41,7 @@ func (m *blockMetrics) PrometheusCollectors() []prometheus.Collector { metrics = append(metrics, m.compactionMetrics.PrometheusCollectors()...) metrics = append(metrics, m.fileMetrics.PrometheusCollectors()...) metrics = append(metrics, m.cacheMetrics.PrometheusCollectors()...) 
+ metrics = append(metrics, m.walMetrics.PrometheusCollectors()...) return metrics } @@ -100,6 +104,7 @@ func (m *compactionMetrics) Labels(level compactionLevel) prometheus.Labels { for k, v := range m.labels { l[k] = v } + // N.B all compaction metrics include level. So it's included here. l["level"] = fmt.Sprint(level) return l } @@ -257,3 +262,71 @@ func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { m.Writes, } } + +// walMetrics are a set of metrics concerned with tracking data about compactions. +type walMetrics struct { + labels prometheus.Labels + OldSegmentBytes *prometheus.GaugeVec + CurrentSegmentBytes *prometheus.GaugeVec + Segments *prometheus.GaugeVec + Writes *prometheus.CounterVec +} + +// newWALMetrics initialises the prometheus metrics for tracking the WAL. +func newWALMetrics(labels prometheus.Labels) *walMetrics { + var names []string + for k := range labels { + names = append(names, k) + } + sort.Strings(names) + + writeNames := append(names, "status") + sort.Strings(writeNames) + + return &walMetrics{ + labels: labels, + OldSegmentBytes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: walSubsystem, + Name: "old_segment_bytes", + Help: "Number of bytes old WAL segments using on disk.", + }, names), + CurrentSegmentBytes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: walSubsystem, + Name: "current_segment_bytes", + Help: "Number of bytes TSM files using on disk.", + }, names), + Segments: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: walSubsystem, + Name: "segments_total", + Help: "Number of WAL segment files on disk.", + }, names), + Writes: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: walSubsystem, + Name: "writes", + Help: "Number of writes to the WAL.", + }, writeNames), + } +} + +// Labels returns a copy of labels for use with file metrics. 
+func (m *walMetrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (m *walMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + m.OldSegmentBytes, + m.CurrentSegmentBytes, + m.Segments, + m.Writes, + } +} diff --git a/tsdb/tsm1/wal.go b/tsdb/tsm1/wal.go index 7d0ae9c7fc..54e50609aa 100644 --- a/tsdb/tsm1/wal.go +++ b/tsdb/tsm1/wal.go @@ -18,7 +18,6 @@ import ( "time" "github.com/golang/snappy" - "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/limiter" "github.com/influxdata/platform/pkg/pool" "go.uber.org/zap" @@ -88,14 +87,6 @@ var ( bytesPool = pool.NewLimitedBytes(256, walEncodeBufSize*2) ) -// Statistics gathered by the WAL. -const ( - statWALOldBytes = "oldSegmentsDiskBytes" - statWALCurrentBytes = "currentSegmentDiskBytes" - statWriteOk = "writeOk" - statWriteErr = "writeErr" -) - // WAL represents the write-ahead log used for writing TSM files. type WAL struct { // goroutines waiting for the next fsync @@ -128,8 +119,7 @@ type WAL struct { // SegmentSize is the file size at which a segment file will be rotated SegmentSize int - // statistics for the WAL - stats *WALStatistics + tracker *walTracker limiter limiter.Fixed } @@ -143,10 +133,10 @@ func NewWAL(path string) *WAL { SegmentSize: DefaultSegmentSize, closing: make(chan struct{}), syncWaiters: make(chan chan error, 1024), - stats: &WALStatistics{}, limiter: limiter.NewFixed(defaultWaitingWALWrites), logger: logger, traceLogger: logger, + tracker: newWALTracker(newWALMetrics(nil)), } } @@ -172,28 +162,6 @@ func (l *WAL) WithLogger(log *zap.Logger) { } } -// WALStatistics maintains statistics about the WAL. -type WALStatistics struct { - OldBytes int64 - CurrentBytes int64 - WriteOK int64 - WriteErr int64 -} - -// Statistics returns statistics for periodic monitoring. 
-func (l *WAL) Statistics(tags map[string]string) []models.Statistic { - return []models.Statistic{{ - Name: "tsm1_wal", - Tags: tags, - Values: map[string]interface{}{ - statWALOldBytes: atomic.LoadInt64(&l.stats.OldBytes), - statWALCurrentBytes: atomic.LoadInt64(&l.stats.CurrentBytes), - statWriteOk: atomic.LoadInt64(&l.stats.WriteOK), - statWriteErr: atomic.LoadInt64(&l.stats.WriteErr), - }, - }} -} - // Path returns the directory the log was initialized with. func (l *WAL) Path() string { l.mu.RLock() @@ -217,6 +185,7 @@ func (l *WAL) Open() error { if err != nil { return err } + l.tracker.SetSegments(uint64(len(segments))) if len(segments) > 0 { lastSegment := segments[len(segments)-1] @@ -234,6 +203,7 @@ func (l *WAL) Open() error { if stat.Size() == 0 { os.Remove(lastSegment) segments = segments[:len(segments)-1] + l.tracker.DecSegments() } else { fd, err := os.OpenFile(lastSegment, os.O_RDWR, 0666) if err != nil { @@ -245,7 +215,7 @@ func (l *WAL) Open() error { l.currentSegmentWriter = NewWALSegmentWriter(fd) // Reset the current segment size stat - atomic.StoreInt64(&l.stats.CurrentBytes, stat.Size()) + l.tracker.SetCurrentSegmentSize(uint64(stat.Size())) } } @@ -263,7 +233,7 @@ func (l *WAL) Open() error { } } } - atomic.StoreInt64(&l.stats.OldBytes, totalOldDiskSize) + l.tracker.SetOldSegmentSize(uint64(totalOldDiskSize)) l.closing = make(chan struct{}) @@ -336,10 +306,10 @@ func (l *WAL) WriteMulti(values map[string][]Value) (int, error) { id, err := l.writeToLog(entry) if err != nil { - atomic.AddInt64(&l.stats.WriteErr, 1) + l.tracker.IncWritesErr() return -1, err } - atomic.AddInt64(&l.stats.WriteOK, 1) + l.tracker.IncWritesOK() return id, nil } @@ -390,6 +360,7 @@ func (l *WAL) Remove(files []string) error { if err != nil { return err } + l.tracker.SetSegments(uint64(len(segments))) var totalOldDiskSize int64 for _, seg := range segments { @@ -400,8 +371,7 @@ func (l *WAL) Remove(files []string) error { totalOldDiskSize += stat.Size() } - 
atomic.StoreInt64(&l.stats.OldBytes, totalOldDiskSize) - + l.tracker.SetOldSegmentSize(uint64(totalOldDiskSize)) return nil } @@ -412,8 +382,9 @@ func (l *WAL) LastWriteTime() time.Time { return l.lastWriteTime } +// DiskSizeBytes returns the on-disk size of the WAL. func (l *WAL) DiskSizeBytes() int64 { - return atomic.LoadInt64(&l.stats.OldBytes) + atomic.LoadInt64(&l.stats.CurrentBytes) + return int64(l.tracker.OldSegmentSize() + l.tracker.CurrentSegmentSize()) } func (l *WAL) writeToLog(entry WALEntry) (int, error) { @@ -464,8 +435,7 @@ func (l *WAL) writeToLog(entry WALEntry) (int, error) { l.scheduleSync() // Update stats for current segment size - atomic.StoreInt64(&l.stats.CurrentBytes, int64(l.currentSegmentWriter.size)) - + l.tracker.SetCurrentSegmentSize(uint64(l.currentSegmentWriter.size)) l.lastWriteTime = time.Now().UTC() return l.currentSegmentID, nil @@ -586,7 +556,7 @@ func (l *WAL) newSegmentFile() error { if err := l.currentSegmentWriter.close(); err != nil { return err } - atomic.StoreInt64(&l.stats.OldBytes, int64(l.currentSegmentWriter.size)) + l.tracker.SetOldSegmentSize(uint64(l.currentSegmentWriter.size)) } fileName := filepath.Join(l.path, fmt.Sprintf("%s%05d.%s", WALFilePrefix, l.currentSegmentID, WALFileExtension)) @@ -595,13 +565,85 @@ func (l *WAL) newSegmentFile() error { return err } l.currentSegmentWriter = NewWALSegmentWriter(fd) + l.tracker.IncSegments() // Reset the current segment size stat - atomic.StoreInt64(&l.stats.CurrentBytes, 0) - + l.tracker.SetCurrentSegmentSize(0) return nil } +// walTracker tracks writes to the WAL. +// +// As well as being responsible for providing atomic reads and writes to the +// statistics, walTracker also mirrors any changes to the external prometheus +// metrics, which the Engine exposes. +// +// *NOTE* - walTracker fields should not be directory modified. Doing so +// could result in the Engine exposing inaccurate metrics. 
+type walTracker struct {
+	metrics *walMetrics
+
+	oldSegmentBytes     uint64
+	currentSegmentBytes uint64
+}
+
+func newWALTracker(metrics *walMetrics) *walTracker {
+	return &walTracker{metrics: metrics}
+}
+
+// IncWrites increments the number of writes to the WAL, with a required status.
+func (t *walTracker) IncWrites(status string) {
+	labels := t.metrics.Labels()
+	labels["status"] = status
+	t.metrics.Writes.With(labels).Inc()
+}
+
+// IncWritesOK increments the number of successful writes.
+func (t *walTracker) IncWritesOK() { t.IncWrites("ok") }
+
+// IncWritesError increments the number of writes that encountered an error.
+func (t *walTracker) IncWritesErr() { t.IncWrites("error") }
+
+// SetOldSegmentSize sets the size of all old segments on disk.
+func (t *walTracker) SetOldSegmentSize(sz uint64) {
+	atomic.StoreUint64(&t.oldSegmentBytes, sz)
+
+	labels := t.metrics.Labels()
+	t.metrics.OldSegmentBytes.With(labels).Set(float64(sz))
+}
+
+// OldSegmentSize returns the on-disk size of all old segments.
+func (t *walTracker) OldSegmentSize() uint64 { return atomic.LoadUint64(&t.oldSegmentBytes) }
+
+// SetCurrentSegmentSize sets the size of the current segment on disk.
+func (t *walTracker) SetCurrentSegmentSize(sz uint64) {
+	atomic.StoreUint64(&t.currentSegmentBytes, sz) // was oldSegmentBytes: clobbered the old-segment total and left currentSegmentBytes unused
+
+	labels := t.metrics.Labels()
+	t.metrics.CurrentSegmentBytes.With(labels).Set(float64(sz))
+}
+
+// CurrentSegmentSize returns the on-disk size of the current segment.
+func (t *walTracker) CurrentSegmentSize() uint64 { return atomic.LoadUint64(&t.currentSegmentBytes) }
+
+// SetSegments sets the number of segments files on disk.
+func (t *walTracker) SetSegments(sz uint64) {
+	labels := t.metrics.Labels()
+	t.metrics.Segments.With(labels).Set(float64(sz))
+}
+
+// IncSegments increases the number of segments files by one.
+func (t *walTracker) IncSegments() {
+	labels := t.metrics.Labels()
+	t.metrics.Segments.With(labels).Inc()
+}
+
+// DecSegments decreases the number of segments files by one.
+func (t *walTracker) DecSegments() { + labels := t.metrics.Labels() + t.metrics.Segments.With(labels).Dec() +} + // WALEntry is record stored in each WAL segment. Each entry has a type // and an opaque, type dependent byte slice data attribute. type WALEntry interface { From 6c5dec8f8822e3d4dc6c58196d4844230ecf836a Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 7 Nov 2018 16:28:34 +0000 Subject: [PATCH 05/25] Refactor tracker names --- tsdb/tsm1/cache.go | 84 ++++++++++++++++++++--------------------- tsdb/tsm1/cache_test.go | 10 ++--- tsdb/tsm1/engine.go | 4 +- tsdb/tsm1/file_store.go | 18 ++++----- 4 files changed, 58 insertions(+), 58 deletions(-) diff --git a/tsdb/tsm1/cache.go b/tsdb/tsm1/cache.go index 455c38d7b6..70599277ca 100644 --- a/tsdb/tsm1/cache.go +++ b/tsdb/tsm1/cache.go @@ -170,7 +170,7 @@ type Cache struct { snapshot *Cache snapshotting bool - cacheTracker *cacheTracker + tracker *cacheTracker lastSnapshot time.Time lastWriteTime time.Time @@ -187,7 +187,7 @@ func NewCache(maxSize uint64) *Cache { maxSize: maxSize, store: emptyStore{}, lastSnapshot: time.Now(), - cacheTracker: newCacheTracker(newCacheMetrics(nil)), + tracker: newCacheTracker(newCacheMetrics(nil)), } c.initialize.Store(&sync.Once{}) return c @@ -227,15 +227,15 @@ func (c *Cache) Write(key []byte, values []Value) error { n := c.Size() + addedSize if limit > 0 && n > limit { - c.cacheTracker.IncWritesErr() - c.cacheTracker.AddWrittenBytesDrop(uint64(addedSize)) + c.tracker.IncWritesErr() + c.tracker.AddWrittenBytesDrop(uint64(addedSize)) return ErrCacheMemorySizeLimitExceeded(n, limit) } newKey, err := c.store.write(key, values) if err != nil { - c.cacheTracker.IncWritesErr() - c.cacheTracker.AddWrittenBytesErr(uint64(addedSize)) + c.tracker.IncWritesErr() + c.tracker.AddWrittenBytesErr(uint64(addedSize)) return err } @@ -243,10 +243,10 @@ func (c *Cache) Write(key []byte, values []Value) error { addedSize += uint64(len(key)) } // Update the cache size and the memory size 
stat. - c.cacheTracker.IncCacheSize(addedSize) - c.cacheTracker.AddMemBytes(addedSize) - c.cacheTracker.AddWrittenBytesOK(uint64(addedSize)) - c.cacheTracker.IncWritesOK() + c.tracker.IncCacheSize(addedSize) + c.tracker.AddMemBytes(addedSize) + c.tracker.AddWrittenBytesOK(uint64(addedSize)) + c.tracker.IncWritesOK() return nil } @@ -267,8 +267,8 @@ func (c *Cache) WriteMulti(values map[string][]Value) error { limit := c.maxSize // maxSize is safe for reading without a lock. n := c.Size() + addedSize if limit > 0 && n > limit { - c.cacheTracker.IncWritesErr() - c.cacheTracker.AddWrittenBytesDrop(uint64(addedSize)) + c.tracker.IncWritesErr() + c.tracker.AddWrittenBytesDrop(uint64(addedSize)) return ErrCacheMemorySizeLimitExceeded(n, limit) } @@ -297,16 +297,16 @@ func (c *Cache) WriteMulti(values map[string][]Value) error { // Some points in the batch were dropped. An error is returned so // error stat is incremented as well. if werr != nil { - c.cacheTracker.IncWritesErr() - c.cacheTracker.IncWritesDrop() - c.cacheTracker.AddWrittenBytesErr(bytesWrittenErr) + c.tracker.IncWritesErr() + c.tracker.IncWritesDrop() + c.tracker.AddWrittenBytesErr(bytesWrittenErr) } // Update the memory size stat - c.cacheTracker.IncCacheSize(addedSize) - c.cacheTracker.AddMemBytes(addedSize) - c.cacheTracker.IncWritesOK() - c.cacheTracker.AddWrittenBytesOK(addedSize) + c.tracker.IncCacheSize(addedSize) + c.tracker.AddMemBytes(addedSize) + c.tracker.IncWritesOK() + c.tracker.AddWrittenBytesOK(addedSize) c.mu.Lock() c.lastWriteTime = time.Now() @@ -328,7 +328,7 @@ func (c *Cache) Snapshot() (*Cache, error) { } c.snapshotting = true - c.cacheTracker.IncSnapshotsActive() // increment the number of times we tried to do this + c.tracker.IncSnapshotsActive() // increment the number of times we tried to do this // If no snapshot exists, create a new one, otherwise update the existing snapshot if c.snapshot == nil { @@ -337,10 +337,10 @@ func (c *Cache) Snapshot() (*Cache, error) { return nil, 
err } - newMetrics := newCacheMetrics(c.cacheTracker.metrics.Labels()) + newMetrics := newCacheMetrics(c.tracker.metrics.Labels()) c.snapshot = &Cache{ - store: store, - cacheTracker: newCacheTracker(newMetrics), + store: store, + tracker: newCacheTracker(newMetrics), } } @@ -353,17 +353,17 @@ func (c *Cache) Snapshot() (*Cache, error) { c.snapshot.store, c.store = c.store, c.snapshot.store snapshotSize := c.Size() - c.snapshot.cacheTracker.SetSnapshotSize(snapshotSize) // Save the size of the snapshot on the snapshot cache - c.cacheTracker.SetSnapshotSize(snapshotSize) // Save the size of the snapshot on the live cache + c.snapshot.tracker.SetSnapshotSize(snapshotSize) // Save the size of the snapshot on the snapshot cache + c.tracker.SetSnapshotSize(snapshotSize) // Save the size of the snapshot on the live cache // Reset the cache's store. c.store.reset() - c.cacheTracker.SetCacheSize(0) + c.tracker.SetCacheSize(0) c.lastSnapshot = time.Now() - c.cacheTracker.AddSnapshottedBytes(snapshotSize) // increment the number of bytes added to the snapshot - c.cacheTracker.SetDiskBytes(0) - c.cacheTracker.SetSnapshotsActive(0) + c.tracker.AddSnapshottedBytes(snapshotSize) // increment the number of bytes added to the snapshot + c.tracker.SetDiskBytes(0) + c.tracker.SetSnapshotsActive(0) return c.snapshot, nil } @@ -400,26 +400,26 @@ func (c *Cache) ClearSnapshot(success bool) { c.snapshotting = false if success { - snapshotSize := c.cacheTracker.SnapshotSize() - c.cacheTracker.SetSnapshotsActive(0) - c.cacheTracker.SubMemBytes(snapshotSize) // decrement the number of bytes in cache + snapshotSize := c.tracker.SnapshotSize() + c.tracker.SetSnapshotsActive(0) + c.tracker.SubMemBytes(snapshotSize) // decrement the number of bytes in cache // Reset the snapshot to a fresh Cache. 
- newMetrics := newCacheMetrics(c.cacheTracker.metrics.Labels()) + newMetrics := newCacheMetrics(c.tracker.metrics.Labels()) c.snapshot = &Cache{ - store: c.snapshot.store, - cacheTracker: newCacheTracker(newMetrics), + store: c.snapshot.store, + tracker: newCacheTracker(newMetrics), } - c.cacheTracker.SetSnapshotSize(0) - c.cacheTracker.SetDiskBytes(0) - c.cacheTracker.SetSnapshotsActive(0) + c.tracker.SetSnapshotSize(0) + c.tracker.SetDiskBytes(0) + c.tracker.SetSnapshotsActive(0) } } // Size returns the number of point-calcuated bytes the cache currently uses. func (c *Cache) Size() uint64 { - return c.cacheTracker.CacheSize() + c.cacheTracker.SnapshotSize() + return c.tracker.CacheSize() + c.tracker.SnapshotSize() } // MaxSize returns the maximum number of bytes the cache may consume. @@ -589,8 +589,8 @@ func (c *Cache) DeleteRange(keys [][]byte, min, max int64) { // Just update what is being deleted by the size of the filtered entries. total -= uint64(e.size()) } - c.cacheTracker.DecCacheSize(total) // Decrease the live cache size. - c.cacheTracker.SetMemBytes(uint64(c.Size())) + c.tracker.DecCacheSize(total) // Decrease the live cache size. + c.tracker.SetMemBytes(uint64(c.Size())) } // SetMaxSize updates the memory limit of the cache. @@ -721,7 +721,7 @@ func (c *Cache) LastWriteTime() time.Time { func (c *Cache) UpdateAge() { c.mu.RLock() defer c.mu.RUnlock() - c.cacheTracker.SetAge(time.Since(c.lastSnapshot)) + c.tracker.SetAge(time.Since(c.lastSnapshot)) } // cacheTracker tracks writes to the cache and snapshots. 
diff --git a/tsdb/tsm1/cache_test.go b/tsdb/tsm1/cache_test.go index 38f243ea92..a5f107529c 100644 --- a/tsdb/tsm1/cache_test.go +++ b/tsdb/tsm1/cache_test.go @@ -138,9 +138,9 @@ func TestCache_WriteMulti_Stats(t *testing.T) { } // Write stats updated - if got, exp := atomic.LoadUint64(&c.cacheTracker.writesDropped), uint64(1); got != exp { + if got, exp := atomic.LoadUint64(&c.tracker.writesDropped), uint64(1); got != exp { t.Fatalf("got %v, expected %v", got, exp) - } else if got, exp := atomic.LoadUint64(&c.cacheTracker.writesErr), uint64(1); got != exp { + } else if got, exp := atomic.LoadUint64(&c.tracker.writesErr), uint64(1); got != exp { t.Fatalf("got %v, expected %v", got, exp) } } @@ -479,7 +479,7 @@ func TestCache_Snapshot_Stats(t *testing.T) { t.Fatal(err) } - if got, exp := atomic.LoadUint64(&c.cacheTracker.memSizeBytes), uint64(16)+3; got != exp { + if got, exp := atomic.LoadUint64(&c.tracker.memSizeBytes), uint64(16)+3; got != exp { t.Fatalf("got %v, expected %v", got, exp) } @@ -494,11 +494,11 @@ func TestCache_Snapshot_Stats(t *testing.T) { } // Cached bytes should have been increased. - if got, exp := atomic.LoadUint64(&c.cacheTracker.snapshottedBytes), uint64(16)+3; got != exp { + if got, exp := atomic.LoadUint64(&c.tracker.snapshottedBytes), uint64(16)+3; got != exp { t.Fatalf("got %v, expected %v", got, exp) } - if got, exp := atomic.LoadUint64(&c.cacheTracker.memSizeBytes), uint64(16)+3; got != exp { + if got, exp := atomic.LoadUint64(&c.tracker.memSizeBytes), uint64(16)+3; got != exp { t.Fatalf("got %v, expected %v", got, exp) } } diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index c9a23a1e29..3ab21c437c 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -500,8 +500,8 @@ func (e *Engine) Open() error { // Propagate prometheus metrics down into trackers. 
e.compactionTracker = newCompactionTracker(e.blockMetrics.compactionMetrics) - e.FileStore.fileTracker = newFileTracker(e.blockMetrics.fileMetrics) - e.Cache.cacheTracker = newCacheTracker(e.blockMetrics.cacheMetrics) + e.FileStore.tracker = newFileTracker(e.blockMetrics.fileMetrics) + e.Cache.tracker = newCacheTracker(e.blockMetrics.cacheMetrics) // Set default metrics on WAL if enabled. if wal, ok := e.WAL.(*WAL); ok { diff --git a/tsdb/tsm1/file_store.go b/tsdb/tsm1/file_store.go index 733c2dc0c3..1657bd0bc8 100644 --- a/tsdb/tsm1/file_store.go +++ b/tsdb/tsm1/file_store.go @@ -191,8 +191,8 @@ type FileStore struct { traceLogger *zap.Logger // Logger to be used when trace-logging is on. traceLogging bool - fileTracker *fileTracker - purger *purger + tracker *fileTracker + purger *purger currentTempDirID int @@ -241,7 +241,7 @@ func NewFileStore(dir string) *FileStore { }, obs: noFileStoreObserver{}, parseFileName: DefaultParseFileName, - fileTracker: newFileTracker(newFileMetrics(nil)), + tracker: newFileTracker(newFileMetrics(nil)), } fs.purger.fileStore = fs return fs @@ -611,7 +611,7 @@ func (f *FileStore) Open() error { for _, ts := range res.r.TombstoneFiles() { totalSize += uint64(ts.Size) } - f.fileTracker.AddBytes(totalSize) + f.tracker.AddBytes(totalSize) // Re-initialize the lastModified time for the file store if res.r.LastModified() > lm { @@ -623,7 +623,7 @@ func (f *FileStore) Open() error { close(readerC) sort.Sort(tsmReaders(f.files)) - f.fileTracker.SetFileCount(uint64(len(f.files))) + f.tracker.SetFileCount(uint64(len(f.files))) return nil } @@ -636,7 +636,7 @@ func (f *FileStore) Close() error { f.lastFileStats = nil f.files = nil - f.fileTracker.SetFileCount(uint64(0)) + f.tracker.SetFileCount(uint64(0)) // Let other methods access this closed object while we do the actual closing. f.mu.Unlock() @@ -652,7 +652,7 @@ func (f *FileStore) Close() error { } // DiskSizeBytes returns the total number of bytes consumed by the files in the FileStore. 
-func (f *FileStore) DiskSizeBytes() int64 { return int64(f.fileTracker.Bytes()) } +func (f *FileStore) DiskSizeBytes() int64 { return int64(f.tracker.Bytes()) } // Read returns the slice of values for the given key and the given timestamp, // if any file matches those constraints. @@ -904,7 +904,7 @@ func (f *FileStore) replace(oldFiles, newFiles []string, updatedFn func(r []TSMF f.lastFileStats = nil f.files = active sort.Sort(tsmReaders(f.files)) - f.fileTracker.SetFileCount(uint64(len(f.files))) + f.tracker.SetFileCount(uint64(len(f.files))) // Recalculate the disk size stat var totalSize uint64 @@ -915,7 +915,7 @@ func (f *FileStore) replace(oldFiles, newFiles []string, updatedFn func(r []TSMF } } - f.fileTracker.SetBytes(totalSize) + f.tracker.SetBytes(totalSize) return nil } From 8ca637bd80e9a29660793aed60e36503e0b7148e Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 7 Nov 2018 16:44:25 +0000 Subject: [PATCH 06/25] Refactor default labels and retention metrics --- storage/engine.go | 32 ++++++++++++++++++-------------- storage/metrics.go | 18 +++++++++++++++++- storage/retention.go | 41 +++++++++++++++-------------------------- tsdb/tsm1/engine.go | 6 ++++++ 4 files changed, 56 insertions(+), 41 deletions(-) diff --git a/storage/engine.go b/storage/engine.go index 54eda7b8ae..d664945f4c 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -39,6 +39,8 @@ type Engine struct { wal *tsm1.WAL retentionEnforcer *retentionEnforcer + defaultMetricLabels prometheus.Labels + // Tracks all goroutines started by the Engine. 
wg sync.WaitGroup @@ -61,6 +63,7 @@ func WithTSMFilenameFormatter(fn tsm1.FormatFileNameFunc) Option { func WithEngineID(id int) Option { return func(e *Engine) { e.engineID = &id + e.defaultMetricLabels["engine_id"] = fmt.Sprint(*e.engineID) } } @@ -69,6 +72,7 @@ func WithEngineID(id int) Option { func WithNodeID(id int) Option { return func(e *Engine) { e.nodeID = &id + e.defaultMetricLabels["node_id"] = fmt.Sprint(*e.nodeID) } } @@ -78,17 +82,6 @@ func WithNodeID(id int) Option { func WithRetentionEnforcer(finder BucketFinder) Option { return func(e *Engine) { e.retentionEnforcer = newRetentionEnforcer(e, finder) - - if e.engineID != nil { - e.retentionEnforcer.defaultMetricLabels["engine_id"] = fmt.Sprint(*e.engineID) - } - - if e.nodeID != nil { - e.retentionEnforcer.defaultMetricLabels["node_id"] = fmt.Sprint(*e.nodeID) - } - - // As new labels may have been set, set the new metrics on the enforcer. - e.retentionEnforcer.retentionMetrics = newRetentionMetrics(e.retentionEnforcer.defaultMetricLabels) } } @@ -110,9 +103,11 @@ func WithCompactionPlanner(planner tsm1.CompactionPlanner) Option { // TSM engine. func NewEngine(path string, c Config, options ...Option) *Engine { e := &Engine{ - config: c, - path: path, - logger: zap.NewNop(), + config: c, + path: path, + sfile: tsdb.NewSeriesFile(c.GetSeriesFilePath(path)), + defaultMetricLabels: prometheus.Labels{}, + logger: zap.NewNop(), } // Initialize series file. @@ -140,6 +135,9 @@ func NewEngine(path string, c Config, options ...Option) *Engine { for _, option := range options { option(e) } + // Set default metrics labels. + e.engine.WithDefaultMetricLabels(e.defaultMetricLabels) + return e } @@ -197,6 +195,7 @@ func (e *Engine) Open() error { e.engine.SetCompactionsEnabled(true) // TODO(edd):is this needed? e.closing = make(chan struct{}) + // TODO(edd) background tasks will be run in priority order via a scheduler. // For now we will just run on an interval as we only have the retention // policy enforcer. 
@@ -221,6 +220,11 @@ func (e *Engine) runRetentionEnforcer() { return } + if e.retentionEnforcer != nil { + // Set default metric labels on retention enforcer. + e.retentionEnforcer.metrics = newRetentionMetrics(e.defaultMetricLabels) + } + l := e.logger.With(zap.String("component", "retention_enforcer"), logger.DurationLiteral("check_interval", interval)) l.Info("Starting") diff --git a/storage/metrics.go b/storage/metrics.go index 6bbe690a0a..8833f11b2b 100644 --- a/storage/metrics.go +++ b/storage/metrics.go @@ -1,6 +1,10 @@ package storage -import "github.com/prometheus/client_golang/prometheus" +import ( + "sort" + + "github.com/prometheus/client_golang/prometheus" +) // namespace is the leading part of all published metrics for the Storage service. const namespace = "storage" @@ -9,6 +13,7 @@ const retentionSubsystem = "retention" // sub-system associated with metrics for // retentionMetrics is a set of metrics concerned with tracking data about retention policies. type retentionMetrics struct { + labels prometheus.Labels Checks *prometheus.CounterVec CheckDuration *prometheus.HistogramVec Unprocessable *prometheus.CounterVec @@ -20,6 +25,8 @@ func newRetentionMetrics(labels prometheus.Labels) *retentionMetrics { for k := range labels { names = append(names, k) } + names = append(names, "status") // All metrics include status + sort.Strings(names) return &retentionMetrics{ Checks: prometheus.NewCounterVec(prometheus.CounterOpts{ @@ -54,6 +61,15 @@ func newRetentionMetrics(labels prometheus.Labels) *retentionMetrics { } } +// Labels returns a copy of labels for use with retention metrics. +func (m *retentionMetrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + return l +} + // PrometheusCollectors satisfies the prom.PrometheusCollector interface. 
func (rm *retentionMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ diff --git a/storage/retention.go b/storage/retention.go index 6bae09b3b4..8b11e855b1 100644 --- a/storage/retention.go +++ b/storage/retention.go @@ -48,8 +48,7 @@ type retentionEnforcer struct { logger *zap.Logger - retentionMetrics *retentionMetrics - defaultMetricLabels prometheus.Labels // N.B this must not be mutated after Open is called. + metrics *retentionMetrics } // newRetentionEnforcer returns a new enforcer that ensures expired data is @@ -57,24 +56,14 @@ type retentionEnforcer struct { // disabling the service. func newRetentionEnforcer(engine Deleter, bucketService BucketFinder) *retentionEnforcer { s := &retentionEnforcer{ - Engine: engine, - BucketService: bucketService, - logger: zap.NewNop(), - defaultMetricLabels: prometheus.Labels{"status": ""}, + Engine: engine, + BucketService: bucketService, + logger: zap.NewNop(), } - s.retentionMetrics = newRetentionMetrics(s.defaultMetricLabels) + s.metrics = newRetentionMetrics(nil) return s } -// metricLabels returns a new copy of the default metric labels. -func (s *retentionEnforcer) metricLabels() prometheus.Labels { - labels := make(map[string]string, len(s.defaultMetricLabels)) - for k, v := range s.defaultMetricLabels { - labels[k] = v - } - return labels -} - // WithLogger sets the logger l on the service. It must be called before Open. 
func (s *retentionEnforcer) WithLogger(l *zap.Logger) { if s == nil { @@ -96,15 +85,15 @@ func (s *retentionEnforcer) run() { } now := time.Now().UTC() - labels := s.metricLabels() + labels := s.metrics.Labels() labels["status"] = "ok" if err := s.expireData(rpByBucketID, now); err != nil { log.Error("Deletion not successful", zap.Error(err)) labels["status"] = "error" } - s.retentionMetrics.CheckDuration.With(labels).Observe(time.Since(now).Seconds()) - s.retentionMetrics.Checks.With(labels).Inc() + s.metrics.CheckDuration.With(labels).Observe(time.Since(now).Seconds()) + s.metrics.Checks.With(labels).Inc() } // expireData runs a delete operation on the storage engine. @@ -162,21 +151,21 @@ func (s *retentionEnforcer) expireData(rpByBucketID map[platform.ID]time.Duratio } defer func() { - if s.retentionMetrics == nil { + if s.metrics == nil { return } - labels := s.metricLabels() + labels := s.metrics.Labels() labels["status"] = "bad_measurement" - s.retentionMetrics.Unprocessable.With(labels).Add(float64(len(badMSketch))) + s.metrics.Unprocessable.With(labels).Add(float64(len(badMSketch))) labels["status"] = "missing_bucket" - s.retentionMetrics.Unprocessable.With(labels).Add(float64(len(missingBSketch))) + s.metrics.Unprocessable.With(labels).Add(float64(len(missingBSketch))) labels["status"] = "ok" - s.retentionMetrics.Series.With(labels).Add(float64(atomic.LoadUint64(&seriesDeleted))) + s.metrics.Series.With(labels).Add(float64(atomic.LoadUint64(&seriesDeleted))) labels["status"] = "skipped" - s.retentionMetrics.Series.With(labels).Add(float64(atomic.LoadUint64(&seriesSkipped))) + s.metrics.Series.With(labels).Add(float64(atomic.LoadUint64(&seriesSkipped))) }() return s.Engine.DeleteSeriesRangeWithPredicate(newSeriesIteratorAdapter(cur), fn) @@ -200,7 +189,7 @@ func (s *retentionEnforcer) getRetentionPeriodPerBucket() (map[platform.ID]time. // PrometheusCollectors satisfies the prom.PrometheusCollector interface. 
func (s *retentionEnforcer) PrometheusCollectors() []prometheus.Collector { - return s.retentionMetrics.PrometheusCollectors() + return s.metrics.PrometheusCollectors() } // A BucketService is an platform.BucketService that the retentionEnforcer can open, diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index 3ab21c437c..a4d9f8bda3 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -250,6 +250,12 @@ func (e *Engine) WithCompactionPlanner(planner CompactionPlanner) { e.CompactionPlan = planner } +// WithDefaultMetricLabels sets the default labels for metrics on the engine. +// It must be called before the Engine is opened. +func (e *Engine) WithDefaultMetricLabels(labels prometheus.Labels) { + e.defaultMetricLabels = labels +} + // SetEnabled sets whether the engine is enabled. func (e *Engine) SetEnabled(enabled bool) { e.enableCompactionsOnOpen = enabled From 4e67e37ca68fe0fc295c9e8d486732a41eaf8e9c Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 7 Nov 2018 17:30:12 +0000 Subject: [PATCH 07/25] Fix bug with engineID --- storage/engine.go | 2 +- storage/metrics.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/storage/engine.go b/storage/engine.go index d664945f4c..a58f68b6b6 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -149,7 +149,7 @@ func (e *Engine) WithLogger(log *zap.Logger) { } if e.engineID != nil { - fields = append(fields, zap.Int("engine_id", *e.nodeID)) + fields = append(fields, zap.Int("engine_id", *e.engineID)) } fields = append(fields, zap.String("service", "storage-engine")) diff --git a/storage/metrics.go b/storage/metrics.go index 8833f11b2b..277386ab40 100644 --- a/storage/metrics.go +++ b/storage/metrics.go @@ -29,6 +29,7 @@ func newRetentionMetrics(labels prometheus.Labels) *retentionMetrics { sort.Strings(names) return &retentionMetrics{ + labels: labels, Checks: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: retentionSubsystem, From 
d1fe2bc188df9b0c0329fc205dfa37d4db85de62 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 9 Nov 2018 10:16:34 +0000 Subject: [PATCH 08/25] Add series file metrics --- pkg/rhh/metrics.go | 38 ++++++++++ pkg/rhh/rhh.go | 17 +++++ storage/engine.go | 9 +-- tsdb/metrics.go | 115 +++++++++++++++++++++++++++++ tsdb/series_file.go | 26 +++++++ tsdb/series_index.go | 10 +++ tsdb/series_partition.go | 151 ++++++++++++++++++++++++++++++++++++--- tsdb/tsm1/engine.go | 4 +- tsdb/tsm1/metrics.go | 4 +- 9 files changed, 355 insertions(+), 19 deletions(-) create mode 100644 pkg/rhh/metrics.go create mode 100644 tsdb/metrics.go diff --git a/pkg/rhh/metrics.go b/pkg/rhh/metrics.go new file mode 100644 index 0000000000..477d834d6c --- /dev/null +++ b/pkg/rhh/metrics.go @@ -0,0 +1,38 @@ +package rhh + +import ( + "sort" + + "github.com/prometheus/client_golang/prometheus" +) + +type rhhMetrics struct { + labels prometheus.Labels +} + +// newRHHMetrics initialises prometheus metrics for tracking an RHH hashmap. +func newRHHMetrics(namespace, subsystem string, labels prometheus.Labels) *rhhMetrics { + var names []string + for k := range labels { + names = append(names, k) + } + sort.Strings(names) + + return &rhhMetrics{ + labels: labels, + } +} + +// Labels returns a copy of labels for use with RHH metrics. +func (m *rhhMetrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (m *rhhMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{} +} diff --git a/pkg/rhh/rhh.go b/pkg/rhh/rhh.go index bb8db4be7c..3b58eb1cad 100644 --- a/pkg/rhh/rhh.go +++ b/pkg/rhh/rhh.go @@ -6,6 +6,7 @@ import ( "sort" "github.com/cespare/xxhash" + "github.com/prometheus/client_golang/prometheus" ) // HashMap represents a hash map that implements Robin Hood Hashing. 
@@ -21,12 +22,15 @@ type HashMap struct { loadFactor int tmpKey []byte + + tracker *rhhTracker } func NewHashMap(opt Options) *HashMap { m := &HashMap{ capacity: pow2(opt.Capacity), // Limited to 2^64. loadFactor: opt.LoadFactor, + tracker: newRHHTracker(newRHHMetrics("", "", nil)), } m.alloc() return m @@ -203,6 +207,19 @@ func (m *HashMap) Keys() [][]byte { return a } +// PrometheusCollectors returns the metrics associated with this hashmap. +func (m *HashMap) PrometheusCollectors() []prometheus.Collector { + return m.tracker.metrics.PrometheusCollectors() +} + +type rhhTracker struct { + metrics *rhhMetrics +} + +func newRHHTracker(metrics *rhhMetrics) *rhhTracker { + return &rhhTracker{metrics: metrics} +} + type hashElem struct { key []byte value interface{} diff --git a/storage/engine.go b/storage/engine.go index a58f68b6b6..51eaf88dce 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -1,10 +1,10 @@ package storage import ( + "fmt" "bytes" "context" "errors" - "fmt" "sync" "time" @@ -136,7 +136,8 @@ func NewEngine(path string, c Config, options ...Option) *Engine { option(e) } // Set default metrics labels. - e.engine.WithDefaultMetricLabels(e.defaultMetricLabels) + e.engine.SetDefaultMetricLabels(e.defaultMetricLabels) + e.sfile.SetDefaultMetricLabels(e.defaultMetricLabels) return e } @@ -165,7 +166,7 @@ func (e *Engine) WithLogger(log *zap.Logger) { func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector // TODO(edd): Get prom metrics for index. - // TODO(edd): Get prom metrics for series file. + metrics = append(metrics, e.sfile.PrometheusCollectors()...) metrics = append(metrics, e.engine.PrometheusCollectors()...) metrics = append(metrics, e.retentionEnforcer.PrometheusCollectors()...) return metrics @@ -200,7 +201,7 @@ func (e *Engine) Open() error { // For now we will just run on an interval as we only have the retention // policy enforcer. 
e.runRetentionEnforcer() - + return nil } diff --git a/tsdb/metrics.go b/tsdb/metrics.go new file mode 100644 index 0000000000..fe70e9c612 --- /dev/null +++ b/tsdb/metrics.go @@ -0,0 +1,115 @@ +package tsdb + +import ( + "fmt" + "sort" + + "github.com/prometheus/client_golang/prometheus" +) + +// namespace is the leading part of all published metrics for the Storage service. +const namespace = "storage" + +const seriesFileSubsystem = "series_file" // sub-system associated with metrics for the Series File. + +type seriesFileMetrics struct { + labels prometheus.Labels + SeriesCreated *prometheus.CounterVec // Number of series created in Series File. + Series *prometheus.GaugeVec // Number of series. + DiskSize *prometheus.GaugeVec // Size occupied on disk. + Segments *prometheus.GaugeVec // Number of segment files. + + CompactionsActive *prometheus.GaugeVec // Number of active compactions. + CompactionDuration *prometheus.HistogramVec // Duration of compactions. + // The following metrics include a ``"status" = {ok, error}` label + Compactions *prometheus.CounterVec // Total number of compactions. +} + +// newSeriesFileMetrics initialises the prometheus metrics for tracking the Series File. 
+func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics {
+	var names []string
+	for k := range labels {
+		names = append(names, k)
+	}
+	names = append(names, "partition_id") // All metrics have a partition_id label
+	sort.Strings(names)
+
+	totalCompactions := append(append([]string(nil), names...), "status") // copy names: plain append would alias its backing array
+	sort.Strings(totalCompactions)
+
+	durationCompaction := append(append([]string(nil), names...), "component") // copy names: second aliased append would clobber "status" above
+	sort.Strings(durationCompaction)
+
+	return &seriesFileMetrics{
+		labels: labels,
+		SeriesCreated: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "series_created",
+			Help:      "Number of series created in Series File.",
+		}, names),
+		Series: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "series_total",
+			Help:      "Number of series in Series File.",
+		}, names),
+		DiskSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "disk_bytes",
+			Help:      "Number of bytes Series File is using on disk.",
+		}, names),
+		Segments: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "segments",
+			Help:      "Number of segment files in Series File.",
+		}, names),
+		CompactionsActive: prometheus.NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "index_compactions_active",
+			Help:      "Number of active compactions.",
+		}, names),
+		CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "index_compactions_duration_seconds",
+			Help:      "Time taken for a successful compaction of index.",
+			// 30 buckets spaced exponentially between 5s and ~53 minutes.
+			Buckets: prometheus.ExponentialBuckets(5.0, 1.25, 30),
+		}, durationCompaction),
+		Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: seriesFileSubsystem,
+			Name:      "compactions",
+			Help:      "Number of compactions.",
+		}, totalCompactions),
+	}
+}
+
+// Labels returns a copy of labels for use with Series File metrics.
+func (m *seriesFileMetrics) Labels(partition int) prometheus.Labels {
+	l := make(map[string]string, len(m.labels))
+	for k, v := range m.labels {
+		l[k] = v
+	}
+
+	// N.B all series file metrics include the partition. So it's included here.
+	l["partition_id"] = fmt.Sprint(partition)
+	return l
+}
+
+// PrometheusCollectors satisfies the prom.PrometheusCollector interface.
+func (m *seriesFileMetrics) PrometheusCollectors() []prometheus.Collector {
+	return []prometheus.Collector{
+		m.SeriesCreated,
+		m.Series,
+		m.DiskSize,
+		m.Segments,
+		m.CompactionsActive,
+		m.CompactionDuration,
+		m.Compactions,
+	}
+}
diff --git a/tsdb/series_file.go b/tsdb/series_file.go
index eeb384128f..985286ddd6 100644
--- a/tsdb/series_file.go
+++ b/tsdb/series_file.go
@@ -13,6 +13,7 @@ import (
 	"github.com/cespare/xxhash"
 	"github.com/influxdata/platform/models"
 	"github.com/influxdata/platform/pkg/binaryutil"
+	"github.com/prometheus/client_golang/prometheus"
 	"go.uber.org/zap"
 	"golang.org/x/sync/errgroup"
 )
@@ -35,6 +36,9 @@ type SeriesFile struct {
 	path       string
 	partitions []*SeriesPartition
 
+	defaultMetricLabels prometheus.Labels
+	metrics             *seriesFileMetrics
+
 	refs sync.RWMutex // RWMutex to track references to the SeriesFile that are in use.
 
 	Logger *zap.Logger
@@ -53,6 +57,12 @@ func (f *SeriesFile) WithLogger(log *zap.Logger) {
 	f.Logger = log.With(zap.String("service", "series-file"))
 }
 
+// SetDefaultMetricLabels sets the default labels for metrics on the Series File.
+func (f *SeriesFile) SetDefaultMetricLabels(labels prometheus.Labels) { + f.defaultMetricLabels = labels +} + // Open memory maps the data file at the file's path. func (f *SeriesFile) Open() error { // Wait for all references to be released and prevent new ones from being acquired. @@ -64,12 +74,18 @@ func (f *SeriesFile) Open() error { return err } + f.metrics = newSeriesFileMetrics(f.defaultMetricLabels) // All partitions must share the same metrics. + // Open partitions. f.partitions = make([]*SeriesPartition, 0, SeriesFilePartitionN) for i := 0; i < SeriesFilePartitionN; i++ { // TODO(edd): These partition initialisation should be moved up to NewSeriesFile. p := NewSeriesPartition(i, f.SeriesPartitionPath(i)) p.Logger = f.Logger.With(zap.Int("partition", p.ID())) + + // Set the metric tracker on the partition with any injected default labels. + p.tracker = newSeriesPartitionTracker(f.metrics, p.ID()) + if err := p.Open(); err != nil { f.Close() return err @@ -300,6 +316,16 @@ func (f *SeriesFile) SeriesKeyPartition(key []byte) *SeriesPartition { return f.partitions[partitionID] } +// PrometheusCollectors returns all the prometheus metrics associated with the series file. +func (f *SeriesFile) PrometheusCollectors() []prometheus.Collector { + collectors := f.metrics.PrometheusCollectors() // Shared per-partition metrics. + + for _, p := range f.partitions { + collectors = append(collectors, p.PrometheusCollectors()...) + } + return collectors +} + // AppendSeriesKey serializes name and tags to a byte slice. // The total length is prepended as a uvarint. func AppendSeriesKey(dst []byte, name []byte, tags models.Tags) []byte { diff --git a/tsdb/series_index.go b/tsdb/series_index.go index 1dab0ac095..8346934cdc 100644 --- a/tsdb/series_index.go +++ b/tsdb/series_index.go @@ -144,6 +144,16 @@ func (idx *SeriesIndex) OnDiskCount() uint64 { return idx.count } // InMemCount returns the number of series in the in-memory index. 
func (idx *SeriesIndex) InMemCount() uint64 { return uint64(len(idx.idOffsetMap)) } +// OnDiskSize returns the on-disk size of the index in bytes. +func (idx *SeriesIndex) OnDiskSize() uint64 { return uint64(len(idx.data)) } + +// InMemSize returns the heap size of the index in bytes. The returned value is +// an estimation and does not include all allocated memory. +func (idx *SeriesIndex) InMemSize() uint64 { + n := len(idx.idOffsetMap) + return uint64(2*8*n) + uint64(len(idx.tombstones)*8) +} + func (idx *SeriesIndex) Insert(key []byte, id SeriesIDTyped, offset int64) { idx.execEntry(SeriesEntryInsertFlag, id, offset, key) } diff --git a/tsdb/series_partition.go b/tsdb/series_partition.go index abde072f85..4f2e6888f3 100644 --- a/tsdb/series_partition.go +++ b/tsdb/series_partition.go @@ -8,10 +8,12 @@ import ( "os" "path/filepath" "sync" + "time" "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/rhh" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -44,7 +46,8 @@ type SeriesPartition struct { CompactThreshold int - Logger *zap.Logger + tracker *seriesPartitionTracker + Logger *zap.Logger } // NewSeriesPartition returns a new instance of SeriesPartition. @@ -54,6 +57,7 @@ func NewSeriesPartition(id int, path string) *SeriesPartition { path: path, closing: make(chan struct{}), CompactThreshold: DefaultSeriesPartitionCompactThreshold, + tracker: newSeriesPartitionTracker(newSeriesFileMetrics(nil), id), Logger: zap.NewNop(), seq: uint64(id) + 1, } } @@ -75,7 +79,6 @@ func (p *SeriesPartition) Open() error { if err := p.openSegments(); err != nil { return err } - // Init last segment for writes.
if err := p.activeSegment().InitForWrite(); err != nil { return err @@ -87,13 +90,14 @@ func (p *SeriesPartition) Open() error { } else if p.index.Recover(p.segments); err != nil { return err } - return nil }(); err != nil { p.Close() return err } + p.tracker.SetSeries(p.index.Count()) // Set series count metric. + p.tracker.SetDiskSize(p.DiskSize()) // Set on-disk size metric. return nil } @@ -134,6 +138,7 @@ func (p *SeriesPartition) openSegments() error { p.segments = append(p.segments, segment) } + p.tracker.SetSegments(uint64(len(p.segments))) return nil } @@ -170,9 +175,17 @@ func (p *SeriesPartition) ID() int { return p.id } // Path returns the path to the partition. func (p *SeriesPartition) Path() string { return p.path } -// Path returns the path to the series index. +// IndexPath returns the path to the series index. func (p *SeriesPartition) IndexPath() string { return filepath.Join(p.path, "index") } +// PrometheusCollectors returns the collectors associated with the partition. +func (p *SeriesPartition) PrometheusCollectors() []prometheus.Collector { + // SeriesFile metrics + p.mu.RLock() + defer p.mu.RUnlock() + return p.index.keyIDMap.PrometheusCollectors() // Metrics for RHH. +} + // CreateSeriesListIfNotExists creates a list of series in bulk if they don't exist. // The ids parameter is modified to contain series IDs for all keys belonging to this partition. // If the type does not match the existing type for the key, a zero id is stored. @@ -283,6 +296,8 @@ func (p *SeriesPartition) CreateSeriesListIfNotExists(collection *SeriesCollecti for _, keyRange := range newKeyRanges { p.index.Insert(p.seriesKeyByOffset(keyRange.offset), keyRange.id, keyRange.offset) } + p.tracker.AddSeriesCreated(uint64(len(newKeyRanges))) // Track new series in metric. + p.tracker.AddSeries(uint64(len(newKeyRanges))) // Check if we've crossed the compaction threshold. 
if p.compactionsEnabled() && !p.compacting && p.CompactThreshold != 0 && p.index.InMemCount() >= uint64(p.CompactThreshold) { @@ -290,13 +305,18 @@ func (p *SeriesPartition) CreateSeriesListIfNotExists(collection *SeriesCollecti log, logEnd := logger.NewOperation(p.Logger, "Series partition compaction", "series_partition_compaction", zap.String("path", p.path)) p.wg.Add(1) + p.tracker.IncCompactionsActive() go func() { defer p.wg.Done() compactor := NewSeriesPartitionCompactor() compactor.cancel = p.closing - if err := compactor.Compact(p); err != nil { + duration, err := compactor.Compact(p) + if err != nil { + p.tracker.IncCompactionErr() log.Error("series partition compaction failed", zap.Error(err)) + } else { + p.tracker.IncCompactionOK(duration) } logEnd() @@ -305,6 +325,10 @@ func (p *SeriesPartition) CreateSeriesListIfNotExists(collection *SeriesCollecti p.mu.Lock() p.compacting = false p.mu.Unlock() + p.tracker.DecCompactionsActive() + + // Disk size may have changed due to compaction. + p.tracker.SetDiskSize(p.DiskSize()) }() } @@ -348,7 +372,7 @@ func (p *SeriesPartition) DeleteSeriesID(id SeriesID) error { // Mark tombstone in memory. p.index.Delete(id) - + p.tracker.SubSeries(1) return nil } @@ -417,6 +441,21 @@ func (p *SeriesPartition) SeriesCount() uint64 { return n } +// DiskSize returns the number of bytes taken up on disk by the partition. 
+func (p *SeriesPartition) DiskSize() uint64 { + p.mu.RLock() + defer p.mu.RUnlock() + return p.diskSize() +} + +func (p *SeriesPartition) diskSize() uint64 { + totalSize := p.index.OnDiskSize() + for _, segment := range p.segments { + totalSize += uint64(len(segment.Data())) + } + return totalSize +} + func (p *SeriesPartition) DisableCompactions() { p.mu.Lock() defer p.mu.Unlock() @@ -503,7 +542,8 @@ func (p *SeriesPartition) createSegment() (*SeriesSegment, error) { if err := segment.InitForWrite(); err != nil { return nil, err } - + p.tracker.SetSegments(uint64(len(p.segments))) + p.tracker.SetDiskSize(p.diskSize()) // Disk size will change with new segment. return segment, nil } @@ -525,6 +565,92 @@ func (p *SeriesPartition) seriesKeyByOffset(offset int64) []byte { return nil } +type seriesPartitionTracker struct { + metrics *seriesFileMetrics + id int // ID of partition. +} + +func newSeriesPartitionTracker(metrics *seriesFileMetrics, partition int) *seriesPartitionTracker { + return &seriesPartitionTracker{ + metrics: metrics, + id: partition, + } +} + +// AddSeriesCreated increases the number of series created in the partition by n. +func (t *seriesPartitionTracker) AddSeriesCreated(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.SeriesCreated.With(labels).Add(float64(n)) +} + +// SetSeries sets the number of series in the partition. +func (t *seriesPartitionTracker) SetSeries(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Series.With(labels).Set(float64(n)) +} + +// AddSeries increases the number of series in the partition by n. +func (t *seriesPartitionTracker) AddSeries(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Series.With(labels).Add(float64(n)) +} + +// SubSeries decreases the number of series in the partition by n. 
+func (t *seriesPartitionTracker) SubSeries(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Series.With(labels).Sub(float64(n)) +} + +// SetDiskSize sets the number of bytes used by files for in partition. +func (t *seriesPartitionTracker) SetDiskSize(sz uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.DiskSize.With(labels).Set(float64(sz)) +} + +// SetSegments sets the number of segments files for the partition. +func (t *seriesPartitionTracker) SetSegments(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Segments.With(labels).Set(float64(n)) +} + +// IncCompactionsActive increments the number of active compactions for the +// components of a partition (index and segments). +func (t *seriesPartitionTracker) IncCompactionsActive() { + labels := t.metrics.Labels(t.id) + labels["component"] = "index" // TODO(edd): when we add segment compactions we will add a new label value. + t.metrics.CompactionsActive.With(labels).Inc() +} + +// DecCompactionsActive decrements the number of active compactions for the +// components of a partition (index and segments). +func (t *seriesPartitionTracker) DecCompactionsActive() { + labels := t.metrics.Labels(t.id) + labels["component"] = "index" // TODO(edd): when we add segment compactions we will add a new label value. + t.metrics.CompactionsActive.With(labels).Dec() +} + +// incCompactions increments the number of compactions for the partition. +// Callers should use IncCompactionOK and IncCompactionErr. +func (t *seriesPartitionTracker) incCompactions(status string, duration time.Duration) { + if duration > 0 { + labels := t.metrics.Labels(t.id) + labels["component"] = "index" + t.metrics.CompactionDuration.With(labels).Observe(duration.Seconds()) + } + + labels := t.metrics.Labels(t.id) + labels["status"] = status + t.metrics.Compactions.With(labels).Inc() +} + +// IncCompactionOK increments the number of successful compactions for the partition. 
+func (t *seriesPartitionTracker) IncCompactionOK(duration time.Duration) { + t.incCompactions("ok", duration) +} + +// IncCompactionErr increments the number of failed compactions for the partition. +func (t *seriesPartitionTracker) IncCompactionErr() { t.incCompactions("error", 0) } + // SeriesPartitionCompactor represents an object reindexes a series partition and optionally compacts segments. type SeriesPartitionCompactor struct { cancel <-chan struct{} @@ -536,7 +662,7 @@ func NewSeriesPartitionCompactor() *SeriesPartitionCompactor { } // Compact rebuilds the series partition index. -func (c *SeriesPartitionCompactor) Compact(p *SeriesPartition) error { +func (c *SeriesPartitionCompactor) Compact(p *SeriesPartition) (time.Duration, error) { // Snapshot the partitions and index so we can check tombstones and replay at the end under lock. p.mu.RLock() segments := CloneSeriesSegments(p.segments) @@ -544,11 +670,14 @@ func (c *SeriesPartitionCompactor) Compact(p *SeriesPartition) error { seriesN := p.index.Count() p.mu.RUnlock() + now := time.Now() + // Compact index to a temporary location. indexPath := index.path + ".compacting" if err := c.compactIndexTo(index, seriesN, segments, indexPath); err != nil { - return err + return 0, err } + duration := time.Since(now) // Swap compacted index under lock & replay since compaction. 
if err := func() error { @@ -570,10 +699,10 @@ func (c *SeriesPartitionCompactor) Compact(p *SeriesPartition) error { } return nil }(); err != nil { - return err + return 0, err } - return nil + return duration, nil } func (c *SeriesPartitionCompactor) compactIndexTo(index *SeriesIndex, seriesN uint64, segments []*SeriesSegment, path string) error { diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index a4d9f8bda3..f4dce6edfa 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -250,9 +250,9 @@ func (e *Engine) WithCompactionPlanner(planner CompactionPlanner) { e.CompactionPlan = planner } -// WithDefaultMetricLabels sets the default labels for metrics on the engine. +// SetDefaultMetricLabels sets the default labels for metrics on the engine. // It must be called before the Engine is opened. -func (e *Engine) WithDefaultMetricLabels(labels prometheus.Labels) { +func (e *Engine) SetDefaultMetricLabels(labels prometheus.Labels) { e.defaultMetricLabels = labels } diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index dc3e5bd8cf..7f3865f962 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -53,7 +53,7 @@ type compactionMetrics struct { CompactionDuration *prometheus.HistogramVec CompactionQueue *prometheus.GaugeVec - // The following metrics include a ``"status" = {ok, error, dropped}` label + // The following metrics include a ``"status" = {ok, error}` label Compactions *prometheus.CounterVec } @@ -312,7 +312,7 @@ func newWALMetrics(labels prometheus.Labels) *walMetrics { } } -// Labels returns a copy of labels for use with file metrics. +// Labels returns a copy of labels for use with WAL metrics. 
func (m *walMetrics) Labels() prometheus.Labels { l := make(map[string]string, len(m.labels)) for k, v := range m.labels { From 55caa0fe549d81b21c45f194dad8bc15cea44f04 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 9 Nov 2018 13:43:44 +0000 Subject: [PATCH 09/25] Add RHH metrics --- pkg/rhh/metrics.go | 113 +++++++++++++++++++++++++++++++++---- pkg/rhh/rhh.go | 117 +++++++++++++++++++++++++++++++++++++-- tsdb/metrics.go | 3 +- tsdb/series_file.go | 28 +++++++--- tsdb/series_index.go | 23 +++++++- tsdb/series_partition.go | 14 +---- 6 files changed, 257 insertions(+), 41 deletions(-) diff --git a/pkg/rhh/metrics.go b/pkg/rhh/metrics.go index 477d834d6c..3d7f8840d0 100644 --- a/pkg/rhh/metrics.go +++ b/pkg/rhh/metrics.go @@ -6,33 +6,124 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -type rhhMetrics struct { - labels prometheus.Labels +type Metrics struct { + Lab prometheus.Labels + LoadFactor *prometheus.GaugeVec // Load factor of the hashmap. + Size *prometheus.GaugeVec // Number of items in hashmap. + GetDuration *prometheus.HistogramVec // Sample of get times. + LastGetDuration *prometheus.GaugeVec // Sample of most recent get time. + InsertDuration *prometheus.HistogramVec // Sample of insertion times. + LastInsertDuration *prometheus.GaugeVec // Sample of most recent insertion time. + LastGrowDuration *prometheus.GaugeVec // Most recent growth time. + MeanProbeCount *prometheus.GaugeVec // Average number of probes for each element. + + // These metrics have an extra label status = {"hit", "miss"} + Gets *prometheus.CounterVec // Number of times item retrieved. + Puts *prometheus.CounterVec // Number of times item retrieved. } -// newRHHMetrics initialises prometheus metrics for tracking an RHH hashmap. -func newRHHMetrics(namespace, subsystem string, labels prometheus.Labels) *rhhMetrics { +// NewMetrics initialises prometheus metrics for tracking an RHH hashmap. 
+func NewMetrics(namespace, subsystem string, labels prometheus.Labels) *Metrics { var names []string for k := range labels { names = append(names, k) } sort.Strings(names) - return &rhhMetrics{ - labels: labels, + getPutNames := append(names, "status") + sort.Strings(getPutNames) + + return &Metrics{ + Lab: labels, + LoadFactor: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "load_percent", + Help: "Load factor of the hashmap.", + }, names), + Size: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "size", + Help: "Number of items in the hashmap.", + }, names), + GetDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "get_duration_ns", + Help: "Times taken to retrieve elements in nanoseconds (sampled every 10% of retrievals).", + // 15 buckets spaced exponentially between 100 and ~30,000. + Buckets: prometheus.ExponentialBuckets(100., 1.5, 15), + }, names), + LastGetDuration: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "get_duration_last_ns", + Help: "Last retrieval duration in nanoseconds (sampled every 10% of retrievals)", + }, names), + InsertDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "put_duration_ns", + Help: "Times taken to insert elements in nanoseconds (sampled every 10% of insertions).", + // 15 buckets spaced exponentially between 100 and ~30,000. 
+ Buckets: prometheus.ExponentialBuckets(100., 1.5, 15), + }, names), + LastInsertDuration: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "put_duration_last_ns", + Help: "Last insertion duration in nanoseconds (sampled every 10% of insertions)", + }, names), + LastGrowDuration: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "grow_duration_s", + Help: "Time in seconds to last grow the hashmap.", + }, names), + MeanProbeCount: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "mean_probes", + Help: "Average probe count of all elements (sampled every 0.5% of insertions).", + }, names), + + Gets: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "get_total", + Help: "Number of times elements retrieved.", + }, getPutNames), + Puts: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "put_total", + Help: "Number of times elements inserted.", + }, getPutNames), } } // Labels returns a copy of labels for use with RHH metrics. -func (m *rhhMetrics) Labels() prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { +func (m *Metrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.Lab)) + for k, v := range m.Lab { l[k] = v } return l } // PrometheusCollectors satisfies the prom.PrometheusCollector interface. 
-func (m *rhhMetrics) PrometheusCollectors() []prometheus.Collector { - return []prometheus.Collector{} +func (m *Metrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + m.LoadFactor, + m.Size, + m.GetDuration, + m.LastGetDuration, + m.InsertDuration, + m.LastInsertDuration, + m.LastGrowDuration, + m.MeanProbeCount, + m.Gets, + m.Puts, + } } diff --git a/pkg/rhh/rhh.go b/pkg/rhh/rhh.go index 3b58eb1cad..0e51a4639b 100644 --- a/pkg/rhh/rhh.go +++ b/pkg/rhh/rhh.go @@ -3,7 +3,9 @@ package rhh import ( "bytes" "encoding/binary" + "math/rand" "sort" + "time" "github.com/cespare/xxhash" "github.com/prometheus/client_golang/prometheus" @@ -26,11 +28,16 @@ type HashMap struct { tracker *rhhTracker } +// NewHashMap initialises a new Hashmap with the provided options. func NewHashMap(opt Options) *HashMap { + if opt.Metrics == nil { + opt.Metrics = NewMetrics("", "", nil) + } + m := &HashMap{ capacity: pow2(opt.Capacity), // Limited to 2^64. loadFactor: opt.LoadFactor, - tracker: newRHHTracker(newRHHMetrics("", "", nil)), + tracker: newRHHTracker(opt.Metrics), } m.alloc() return m @@ -43,17 +50,41 @@ func (m *HashMap) Reset() { m.elems[i].reset() } m.n = 0 + m.tracker.SetSize(0) } +// Get returns the value for a key from the Hashmap, or nil if no key exists. func (m *HashMap) Get(key []byte) interface{} { + var now time.Time + var sample bool + if rand.Float64() < 0.1 { + now = time.Now() + sample = true + } + i := m.index(key) + + if sample { + m.tracker.ObserveGet(time.Since(now)) + } + if i == -1 { + m.tracker.IncGetMiss() return nil } + m.tracker.IncGetHit() return m.elems[i].value } -func (m *HashMap) Put(key []byte, val interface{}) { +func (m *HashMap) put(key []byte, val interface{}, instrument bool) { + var now time.Time + var samplePut bool + + if instrument && rand.Float64() < 0.1 { + now = time.Now() + samplePut = true + } + // Grow the map if we've run out of slots. 
m.n++ if m.n > m.threshold { @@ -62,11 +93,35 @@ func (m *HashMap) Put(key []byte, val interface{}) { // If the key was overwritten then decrement the size. overwritten := m.insert(HashKey(key), key, val) + if instrument && samplePut { + m.tracker.ObservePut(time.Since(now)) + } + if overwritten { m.n-- + if instrument { + m.tracker.IncPutHit() + } + } else if instrument { + m.tracker.SetSize(uint64(m.n)) + m.tracker.SetLoadFactor(float64(m.n) / float64(m.capacity) * 100.0) + m.tracker.IncPutMiss() } } +// Put stores the value at key in the Hashmap, overwriting an existing value if +// one exists. If the maximum load of the Hashmap is reached, the Hashmap will +// first resize itself. +func (m *HashMap) Put(key []byte, val interface{}) { + m.put(key, val, true) +} + +// PutQuiet is equivalent to Put, but no instrumentation code is executed. It can +// be faster when many keys are being inserted into the Hashmap. +func (m *HashMap) PutQuiet(key []byte, val interface{}) { + m.put(key, val, false) +} + func (m *HashMap) insert(hash int64, key []byte, val interface{}) (overwritten bool) { pos := hash & m.mask var dist int64 @@ -190,7 +245,7 @@ func (m *HashMap) AverageProbeCount() float64 { } sum += float64(Dist(hash, i, m.capacity)) } - return sum/float64(m.n) + 1.0 + return sum / (float64(m.n) + 1.0) } // Keys returns a list of sorted keys. 
@@ -213,13 +268,64 @@ func (m *HashMap) PrometheusCollectors() []prometheus.Collector { } type rhhTracker struct { - metrics *rhhMetrics + metrics *Metrics } -func newRHHTracker(metrics *rhhMetrics) *rhhTracker { +func newRHHTracker(metrics *Metrics) *rhhTracker { return &rhhTracker{metrics: metrics} } +func (t *rhhTracker) SetLoadFactor(load float64) { + labels := t.metrics.Labels() + t.metrics.LoadFactor.With(labels).Set(load) +} + +func (t *rhhTracker) SetSize(sz uint64) { + labels := t.metrics.Labels() + t.metrics.Size.With(labels).Set(float64(sz)) +} + +func (t *rhhTracker) ObserveGet(d time.Duration) { + labels := t.metrics.Labels() + t.metrics.GetDuration.With(labels).Observe(float64(d.Nanoseconds())) + t.metrics.LastGetDuration.With(labels).Set(float64(d.Nanoseconds())) +} + +func (t *rhhTracker) ObservePut(d time.Duration) { + labels := t.metrics.Labels() + t.metrics.InsertDuration.With(labels).Observe(float64(d.Nanoseconds())) + t.metrics.LastInsertDuration.With(labels).Set(float64(d.Nanoseconds())) +} + +func (t *rhhTracker) SetGrowDuration(d time.Duration) { + labels := t.metrics.Labels() + t.metrics.LastGrowDuration.With(labels).Set(d.Seconds()) +} + +// TODO(edd): currently no safe way to calculate this concurrently. 
+func (t *rhhTracker) SetProbeCount(length float64) { + labels := t.metrics.Labels() + t.metrics.MeanProbeCount.With(labels).Set(length) +} + +func (t *rhhTracker) incGet(status string) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.Gets.With(labels).Inc() +} + +func (t *rhhTracker) IncGetHit() { t.incGet("hit") } +func (t *rhhTracker) IncGetMiss() { t.incGet("miss") } + +func (t *rhhTracker) incPut(status string) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.Puts.With(labels).Inc() +} + +func (t *rhhTracker) IncPutHit() { t.incPut("hit") } +func (t *rhhTracker) IncPutMiss() { t.incPut("miss") } + type hashElem struct { key []byte value interface{} @@ -242,6 +348,7 @@ func (e *hashElem) setKey(v []byte) { type Options struct { Capacity int64 LoadFactor int + Metrics *Metrics } // DefaultOptions represents a default set of options to pass to NewHashMap(). diff --git a/tsdb/metrics.go b/tsdb/metrics.go index fe70e9c612..845893c1fc 100644 --- a/tsdb/metrics.go +++ b/tsdb/metrics.go @@ -31,7 +31,6 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { for k := range labels { names = append(names, k) } - names = append(names, "partition_id") // All metrics have a partition_id label sort.Strings(names) totalCompactions := append(names, "status") @@ -70,7 +69,7 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { Namespace: namespace, Subsystem: seriesFileSubsystem, Name: "index_compactions_active", - Help: "Number of active compactions.", + Help: "Number of active index compactions.", }, names), CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: namespace, diff --git a/tsdb/series_file.go b/tsdb/series_file.go index 985286ddd6..722e135d8a 100644 --- a/tsdb/series_file.go +++ b/tsdb/series_file.go @@ -5,6 +5,8 @@ import ( "encoding/binary" "errors" "fmt" + "github.com/influxdata/platform/logger" + "github.com/influxdata/platform/pkg/rhh" "os" 
"path/filepath" "sort" @@ -36,8 +38,13 @@ type SeriesFile struct { path string partitions []*SeriesPartition + // N.B we have many partitions, but they must share the same metrics, so the + // metrics are managed in a single location (here in the SeriesFile), and + // each partition decorates the same metric measurements with different + // partition id label values. defaultMetricLabels prometheus.Labels - metrics *seriesFileMetrics + partitionMetrics *seriesFileMetrics // Metrics for each partition. + indexMetrics *rhh.Metrics // Metrics for each partition's index Hashmap. refs sync.RWMutex // RWMutex to track references to the SeriesFile that are in use. @@ -61,10 +68,14 @@ func (f *SeriesFile) WithLogger(log *zap.Logger) { // It must be called before the SeriesFile is opened. func (f *SeriesFile) SetDefaultMetricLabels(labels prometheus.Labels) { f.defaultMetricLabels = labels + f.defaultMetricLabels["partition_id"] = "" // All metrics have partition_id as a label. } // Open memory maps the data file at the file's path. func (f *SeriesFile) Open() error { + _, logEnd := logger.NewOperation(f.Logger, "Opening Series File", "series_file_open", zap.String("path", f.path)) + defer logEnd() + // Wait for all references to be released and prevent new ones from being acquired. f.refs.Lock() defer f.refs.Unlock() @@ -74,7 +85,8 @@ func (f *SeriesFile) Open() error { return err } - f.metrics = newSeriesFileMetrics(f.defaultMetricLabels) // All partitions must share the same metrics. + f.partitionMetrics = newSeriesFileMetrics(f.defaultMetricLabels) // All partitions must share the same metrics. + f.indexMetrics = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", f.defaultMetricLabels) // Open partitions. 
f.partitions = make([]*SeriesPartition, 0, SeriesFilePartitionN) @@ -83,8 +95,9 @@ func (f *SeriesFile) Open() error { p := NewSeriesPartition(i, f.SeriesPartitionPath(i)) p.Logger = f.Logger.With(zap.Int("partition", p.ID())) - // Set the metric tracker on the partition with any injected default labels. - p.tracker = newSeriesPartitionTracker(f.metrics, p.ID()) + // Set the metric trackers on the partition with any injected default labels. + p.tracker = newSeriesPartitionTracker(f.partitionMetrics, p.ID()) + p.index.setMetrics(f.indexMetrics, p.ID()) if err := p.Open(); err != nil { f.Close() @@ -318,11 +331,8 @@ func (f *SeriesFile) SeriesKeyPartition(key []byte) *SeriesPartition { // PrometheusCollectors returns all the prometheus metrics associated with the series file. func (f *SeriesFile) PrometheusCollectors() []prometheus.Collector { - collectors := f.metrics.PrometheusCollectors() // Shared per-partition metrics. - - for _, p := range f.partitions { - collectors = append(collectors, p.PrometheusCollectors()...) - } + collectors := f.partitionMetrics.PrometheusCollectors() // Shared per-partition metrics. + collectors = append(collectors, f.indexMetrics.PrometheusCollectors()...) return collectors } diff --git a/tsdb/series_index.go b/tsdb/series_index.go index 8346934cdc..558aa960ff 100644 --- a/tsdb/series_index.go +++ b/tsdb/series_index.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/binary" "errors" + "fmt" "io" "os" @@ -43,6 +44,10 @@ type SeriesIndex struct { maxSeriesID SeriesID maxOffset int64 + // metrics stores a shard instance of some Prometheus metrics. metrics + // must be set before Open is called. 
+ metrics *rhh.Metrics + data []byte // mmap data keyIDData []byte // key/id mmap data idOffsetData []byte // id/offset mmap data @@ -59,6 +64,13 @@ func NewSeriesIndex(path string) *SeriesIndex { } } +// setMetrics sets the metrics +func (idx *SeriesIndex) setMetrics(metrics *rhh.Metrics, id int) { + idx.metrics = metrics + idx.metrics.Lab = idx.metrics.Labels() // Copy labels + idx.metrics.Lab["partition_id"] = fmt.Sprint(id) // N.B., This MUST be the same as the other series file metric labels +} + // Open memory-maps the index file. func (idx *SeriesIndex) Open() (err error) { // Map data file, if it exists. @@ -86,7 +98,10 @@ func (idx *SeriesIndex) Open() (err error) { return err } - idx.keyIDMap = rhh.NewHashMap(rhh.DefaultOptions) + options := rhh.DefaultOptions + options.Metrics = idx.metrics + + idx.keyIDMap = rhh.NewHashMap(options) idx.idOffsetMap = make(map[SeriesID]int64) idx.tombstones = make(map[SeriesID]struct{}) return nil @@ -109,7 +124,9 @@ func (idx *SeriesIndex) Close() (err error) { // Recover rebuilds the in-memory index for all new entries. func (idx *SeriesIndex) Recover(segments []*SeriesSegment) error { // Allocate new in-memory maps. 
- idx.keyIDMap = rhh.NewHashMap(rhh.DefaultOptions) + options := rhh.DefaultOptions + options.Metrics = idx.metrics + idx.keyIDMap = rhh.NewHashMap(options) idx.idOffsetMap = make(map[SeriesID]int64) idx.tombstones = make(map[SeriesID]struct{}) @@ -176,7 +193,7 @@ func (idx *SeriesIndex) execEntry(flag uint8, id SeriesIDTyped, offset int64, ke untypedID := id.SeriesID() switch flag { case SeriesEntryInsertFlag: - idx.keyIDMap.Put(key, id) + idx.keyIDMap.PutQuiet(key, id) idx.idOffsetMap[untypedID] = offset if untypedID.Greater(idx.maxSeriesID) { diff --git a/tsdb/series_partition.go b/tsdb/series_partition.go index 4f2e6888f3..f10bbc6662 100644 --- a/tsdb/series_partition.go +++ b/tsdb/series_partition.go @@ -13,7 +13,6 @@ import ( "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/rhh" - "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -52,7 +51,7 @@ type SeriesPartition struct { // NewSeriesPartition returns a new instance of SeriesPartition. func NewSeriesPartition(id int, path string) *SeriesPartition { - return &SeriesPartition{ + p := &SeriesPartition{ id: id, path: path, closing: make(chan struct{}), @@ -61,6 +60,8 @@ func NewSeriesPartition(id int, path string) *SeriesPartition { Logger: zap.NewNop(), seq: uint64(id) + 1, } + p.index = NewSeriesIndex(p.IndexPath()) + return p } // Open memory maps the data file at the partition's path. @@ -84,7 +85,6 @@ func (p *SeriesPartition) Open() error { return err } - p.index = NewSeriesIndex(p.IndexPath()) if err := p.index.Open(); err != nil { return err } else if p.index.Recover(p.segments); err != nil { @@ -178,14 +178,6 @@ func (p *SeriesPartition) Path() string { return p.path } // IndexPath returns the path to the series index. func (p *SeriesPartition) IndexPath() string { return filepath.Join(p.path, "index") } -// PrometheusCollectors returns the collectors associated with the partition. 
-func (p *SeriesPartition) PrometheusCollectors() []prometheus.Collector { - // SeriesFile metrics - p.mu.RLock() - defer p.mu.RUnlock() - return p.index.keyIDMap.PrometheusCollectors() // Metrics for RHH. -} - // CreateSeriesListIfNotExists creates a list of series in bulk if they don't exist. // The ids parameter is modified to contain series IDs for all keys belonging to this partition. // If the type does not match the existing type for the key, a zero id is stored. From c76626accf2045fa71b5d1624cd76052935a37ca Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 12:45:50 +0000 Subject: [PATCH 10/25] Fix rebase --- cmd/influxd/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/influxd/main.go b/cmd/influxd/main.go index 43f1691d29..5f3af73119 100644 --- a/cmd/influxd/main.go +++ b/cmd/influxd/main.go @@ -272,7 +272,7 @@ func (m *Main) run(ctx context.Context) (err error) { return err } // The Engine's metrics must be registered after it opens. - reg.MustRegister(engine.PrometheusCollectors()...) + reg.MustRegister(m.engine.PrometheusCollectors()...) 
pointsWriter = m.engine From 7960ccc3209bf411083d155acb3c091b4b78c40c Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 14:13:16 +0000 Subject: [PATCH 11/25] Add TSI index metrics --- pkg/rhh/metrics.go | 2 +- storage/engine.go | 3 +- tsdb/metrics.go | 2 +- tsdb/series_file.go | 10 +- tsdb/tsi1/cache.go | 66 +++++++++++- tsdb/tsi1/index.go | 25 +++++ tsdb/tsi1/metrics.go | 229 +++++++++++++++++++++++++++++++++++++++++ tsdb/tsi1/partition.go | 186 +++++++++++++++++++++++++++++++-- 8 files changed, 505 insertions(+), 18 deletions(-) create mode 100644 tsdb/tsi1/metrics.go diff --git a/pkg/rhh/metrics.go b/pkg/rhh/metrics.go index 3d7f8840d0..e314339d8d 100644 --- a/pkg/rhh/metrics.go +++ b/pkg/rhh/metrics.go @@ -19,7 +19,7 @@ type Metrics struct { // These metrics have an extra label status = {"hit", "miss"} Gets *prometheus.CounterVec // Number of times item retrieved. - Puts *prometheus.CounterVec // Number of times item retrieved. + Puts *prometheus.CounterVec // Number of times item inserted. } // NewMetrics initialises prometheus metrics for tracking an RHH hashmap. diff --git a/storage/engine.go b/storage/engine.go index 51eaf88dce..dcc3e0659f 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -138,6 +138,7 @@ func NewEngine(path string, c Config, options ...Option) *Engine { // Set default metrics labels. e.engine.SetDefaultMetricLabels(e.defaultMetricLabels) e.sfile.SetDefaultMetricLabels(e.defaultMetricLabels) + e.index.SetDefaultMetricLabels(e.defaultMetricLabels) return e } @@ -165,8 +166,8 @@ func (e *Engine) WithLogger(log *zap.Logger) { // the engine and its components. func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector - // TODO(edd): Get prom metrics for index. metrics = append(metrics, e.sfile.PrometheusCollectors()...) + metrics = append(metrics, e.index.PrometheusCollectors()...) metrics = append(metrics, e.engine.PrometheusCollectors()...) 
metrics = append(metrics, e.retentionEnforcer.PrometheusCollectors()...) return metrics diff --git a/tsdb/metrics.go b/tsdb/metrics.go index 845893c1fc..dbaa2f6218 100644 --- a/tsdb/metrics.go +++ b/tsdb/metrics.go @@ -70,7 +70,7 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { Subsystem: seriesFileSubsystem, Name: "index_compactions_active", Help: "Number of active index compactions.", - }, names), + }, durationCompaction), CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: namespace, Subsystem: seriesFileSubsystem, diff --git a/tsdb/series_file.go b/tsdb/series_file.go index 722e135d8a..95f75a26dc 100644 --- a/tsdb/series_file.go +++ b/tsdb/series_file.go @@ -5,13 +5,14 @@ import ( "encoding/binary" "errors" "fmt" - "github.com/influxdata/platform/logger" - "github.com/influxdata/platform/pkg/rhh" "os" "path/filepath" "sort" "sync" + "github.com/influxdata/platform/logger" + "github.com/influxdata/platform/pkg/rhh" + "github.com/cespare/xxhash" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/binaryutil" @@ -67,7 +68,10 @@ func (f *SeriesFile) WithLogger(log *zap.Logger) { // SetDefaultMetricLabels sets the default labels for metrics on the Series File. // It must be called before the SeriesFile is opened. func (f *SeriesFile) SetDefaultMetricLabels(labels prometheus.Labels) { - f.defaultMetricLabels = labels + f.defaultMetricLabels = make(prometheus.Labels, len(labels)) + for k, v := range labels { + f.defaultMetricLabels[k] = v + } f.defaultMetricLabels["partition_id"] = "" // All metrics have partition_id as a label. 
} diff --git a/tsdb/tsi1/cache.go b/tsdb/tsi1/cache.go index 5a2cb88401..1ee7616f82 100644 --- a/tsdb/tsi1/cache.go +++ b/tsdb/tsi1/cache.go @@ -5,6 +5,7 @@ import ( "sync" "github.com/influxdata/platform/tsdb" + "github.com/prometheus/client_golang/prometheus" ) // TagValueSeriesIDCache is an LRU cache for series id sets associated with @@ -24,6 +25,7 @@ type TagValueSeriesIDCache struct { cache map[string]map[string]map[string]*list.Element evictor *list.List + tracker *cacheTracker capacity int } @@ -32,6 +34,7 @@ func NewTagValueSeriesIDCache(c int) *TagValueSeriesIDCache { return &TagValueSeriesIDCache{ cache: map[string]map[string]map[string]*list.Element{}, evictor: list.New(), + tracker: newCacheTracker(newCacheMetrics(nil)), capacity: c, } } @@ -48,11 +51,13 @@ func (c *TagValueSeriesIDCache) get(name, key, value []byte) *tsdb.SeriesIDSet { if mmap, ok := c.cache[string(name)]; ok { if tkmap, ok := mmap[string(key)]; ok { if ele, ok := tkmap[string(value)]; ok { + c.tracker.IncGetHit() c.evictor.MoveToFront(ele) // This now becomes most recently used. return ele.Value.(*seriesIDCacheElement).SeriesIDSet } } } + c.tracker.IncGetMiss() return nil } @@ -100,6 +105,7 @@ func (c *TagValueSeriesIDCache) Put(name, key, value []byte, ss *tsdb.SeriesIDSe // Check under the write lock if the relevant item is now in the cache. if c.exists(name, key, value) { c.Unlock() + c.tracker.IncPutHit() return } defer c.Unlock() @@ -136,6 +142,7 @@ func (c *TagValueSeriesIDCache) Put(name, key, value []byte, ss *tsdb.SeriesIDSe EVICT: c.checkEviction() + c.tracker.IncPutMiss() } // Delete removes x from the tuple {name, key, value} if it exists. 
@@ -153,16 +160,21 @@ func (c *TagValueSeriesIDCache) delete(name, key, value []byte, x tsdb.SeriesID) if ele, ok := tkmap[string(value)]; ok { if ss := ele.Value.(*seriesIDCacheElement).SeriesIDSet; ss != nil { ele.Value.(*seriesIDCacheElement).SeriesIDSet.Remove(x) + c.tracker.IncDeletesHit() + return } } } } + c.tracker.IncDeletesMiss() } // checkEviction checks if the cache is too big, and evicts the least recently used // item if it is. func (c *TagValueSeriesIDCache) checkEviction() { - if c.evictor.Len() <= c.capacity { + l := c.evictor.Len() + c.tracker.SetSize(uint64(l)) + if l <= c.capacity { return } @@ -184,6 +196,13 @@ func (c *TagValueSeriesIDCache) checkEviction() { if len(c.cache[string(name)]) == 0 { delete(c.cache, string(name)) } + c.tracker.IncEvictions() +} + +func (c *TagValueSeriesIDCache) PrometheusCollectors() []prometheus.Collector { + var collectors []prometheus.Collector + collectors = append(collectors, c.tracker.metrics.PrometheusCollectors()...) + return collectors } // seriesIDCacheElement is an item stored within a cache. 
@@ -193,3 +212,48 @@ type seriesIDCacheElement struct { value []byte SeriesIDSet *tsdb.SeriesIDSet } + +type cacheTracker struct { + metrics *cacheMetrics +} + +func newCacheTracker(metrics *cacheMetrics) *cacheTracker { + return &cacheTracker{metrics: metrics} +} + +func (t *cacheTracker) SetSize(sz uint64) { + labels := t.metrics.Labels() + t.metrics.Size.With(labels).Set(float64(sz)) +} + +func (t *cacheTracker) incGet(status string) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.Gets.With(labels).Inc() +} + +func (t *cacheTracker) IncGetHit() { t.incGet("hit") } +func (t *cacheTracker) IncGetMiss() { t.incGet("miss") } + +func (t *cacheTracker) incPut(status string) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.Puts.With(labels).Inc() +} + +func (t *cacheTracker) IncPutHit() { t.incPut("hit") } +func (t *cacheTracker) IncPutMiss() { t.incPut("miss") } + +func (t *cacheTracker) incDeletes(status string) { + labels := t.metrics.Labels() + labels["status"] = status + t.metrics.Deletes.With(labels).Inc() +} + +func (t *cacheTracker) IncDeletesHit() { t.incDeletes("hit") } +func (t *cacheTracker) IncDeletesMiss() { t.incDeletes("miss") } + +func (t *cacheTracker) IncEvictions() { + labels := t.metrics.Labels() + t.metrics.Evictions.With(labels).Inc() +} diff --git a/tsdb/tsi1/index.go b/tsdb/tsi1/index.go index 7411e04c31..3f71b329a7 100644 --- a/tsdb/tsi1/index.go +++ b/tsdb/tsi1/index.go @@ -13,6 +13,8 @@ import ( "sync/atomic" "unsafe" + "github.com/prometheus/client_golang/prometheus" + "bytes" "sort" @@ -109,7 +111,10 @@ type Index struct { partitions []*Partition opened bool + defaultLabels prometheus.Labels + tagValueCache *TagValueSeriesIDCache + partitionMetrics *partitionMetrics // Maintain a single set of partition metrics to be shared by partition. // The following may be set when initializing an Index. path string // Root directory of the index partitions. 
@@ -137,6 +142,7 @@ func (i *Index) UniqueReferenceID() uintptr { func NewIndex(sfile *tsdb.SeriesFile, c Config, options ...IndexOption) *Index { idx := &Index{ tagValueCache: NewTagValueSeriesIDCache(DefaultSeriesIDSetCacheSize), + partitionMetrics: newPartitionMetrics(nil), maxLogFileSize: int64(c.MaxIndexLogFileSize), logger: zap.NewNop(), version: Version, @@ -151,6 +157,16 @@ func NewIndex(sfile *tsdb.SeriesFile, c Config, options ...IndexOption) *Index { return idx } +// SetDefaultMetricLabels sets the default labels on the trackers. +func (i *Index) SetDefaultMetricLabels(labels prometheus.Labels) { + i.defaultLabels = make(prometheus.Labels, len(labels)) + for k, v := range labels { + i.defaultLabels[k] = v + } + i.tagValueCache.tracker = newCacheTracker(newCacheMetrics(labels)) + i.partitionMetrics = newPartitionMetrics(labels) +} + // Bytes estimates the memory footprint of this Index, in bytes. func (i *Index) Bytes() int { var b int @@ -218,6 +234,7 @@ func (i *Index) Open() error { p.nosync = i.disableFsync p.logbufferSize = i.logfileBufferSize p.logger = i.logger.With(zap.String("tsi1_partition", fmt.Sprint(j+1))) + p.tracker = newPartitionTracker(i.partitionMetrics, j) i.partitions[j] = p } @@ -1517,6 +1534,14 @@ func (i *Index) matchTagValueNotEqualNotEmptySeriesIDIterator(name, key []byte, return tsdb.DifferenceSeriesIDIterators(mitr, tsdb.MergeSeriesIDIterators(itrs...)), nil } +// PrometheusCollectors returns all of the metrics for the index. +func (i *Index) PrometheusCollectors() []prometheus.Collector { + var collectors []prometheus.Collector + collectors = append(collectors, i.tagValueCache.PrometheusCollectors()...) + collectors = append(collectors, i.partitionMetrics.PrometheusCollectors()...) + return collectors +} + // IsIndexDir returns true if directory contains at least one partition directory. 
func IsIndexDir(path string) (bool, error) { fis, err := ioutil.ReadDir(path) diff --git a/tsdb/tsi1/metrics.go b/tsdb/tsi1/metrics.go new file mode 100644 index 0000000000..f81aa6b59a --- /dev/null +++ b/tsdb/tsi1/metrics.go @@ -0,0 +1,229 @@ +package tsi1 + +import ( + "fmt" + "sort" + + "github.com/prometheus/client_golang/prometheus" +) + +// namespace is the leading part of all published metrics for the Storage service. +const namespace = "storage" + +const cacheSubsystem = "tsi_cache" // sub-system associated with TSI index cache. +const partitionSubsystem = "tsi_index" // sub-system associated with the TSI index. + +type cacheMetrics struct { + labels prometheus.Labels + Size *prometheus.GaugeVec // Size of the cache. + + // These metrics have an extra label status = {"hit", "miss"} + Gets *prometheus.CounterVec // Number of times item retrieved. + Puts *prometheus.CounterVec // Number of times item inserted. + Deletes *prometheus.CounterVec // Number of times item deleted. + Evictions *prometheus.CounterVec // Number of times item evicted. +} + +// newCacheMetrics initialises the prometheus metrics for tracking the TSI cache. 
+func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { + var names []string + for k := range labels { + names = append(names, k) + } + sort.Strings(names) + + statusNames := append(names, "status") + sort.Strings(statusNames) + + return &cacheMetrics{ + labels: labels, + Size: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "size", + Help: "Number of items residing in the cache.", + }, names), + Gets: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "get_total", + Help: "Total number of gets on cache.", + }, statusNames), + Puts: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "put_total", + Help: "Total number of insertions in cache.", + }, statusNames), + Deletes: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "deletes_total", + Help: "Total number of deletions in cache.", + }, statusNames), + Evictions: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: cacheSubsystem, + Name: "evictions_total", + Help: "Total number of cache evictions.", + }, names), + } +} + +// Labels returns a copy of labels for use with cache metrics. +func (m *cacheMetrics) Labels() prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + m.Size, + m.Gets, + m.Puts, + m.Deletes, + m.Evictions, + } +} + +type partitionMetrics struct { + labels prometheus.Labels + SeriesCreated *prometheus.CounterVec // Number of series created in the partition. + SeriesCreatedDuration *prometheus.HistogramVec // Distribution of time to insert series. 
+ SeriesDropped *prometheus.CounterVec // Number of series removed from index. + Series *prometheus.GaugeVec // Number of series. + Measurements *prometheus.GaugeVec // Number of measurements. + DiskSize *prometheus.GaugeVec // Size occupied on disk. + + // This metric has a "type" label = {index, log} + FilesTotal *prometheus.GaugeVec // files on disk. + + // This metric has a "level" label. + CompactionsActive *prometheus.GaugeVec // Number of active compactions. + + // These metrics have a "level" label. + // The following metrics include a "status" = {ok, error} label + CompactionDuration *prometheus.HistogramVec // Duration of compactions. + Compactions *prometheus.CounterVec // Total number of compactions. +} + +// newPartitionMetrics initialises the prometheus metrics for tracking the TSI partitions. +func newPartitionMetrics(labels prometheus.Labels) *partitionMetrics { + names := []string{"partition_id"} // All metrics have a partition_id label. + for k := range labels { + names = append(names, k) + } + sort.Strings(names) + + // type = {"index", "log"} + fileNames := append(names, "type") + sort.Strings(fileNames) + + // level = [0, 7] + compactionNames := append(names, "level") + sort.Strings(compactionNames) + + // status = {"ok", "error"} + attemptedCompactionNames := append(compactionNames, "status") + sort.Strings(attemptedCompactionNames) + + return &partitionMetrics{ + labels: labels, + SeriesCreated: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "series_created", + Help: "Number of series created in the partition.", + }, names), + SeriesCreatedDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "series_created_duration_ns", + Help: "Time taken in nanosecond to create single series.", + // 30 buckets spaced exponentially between 100ns and ~19 us. 
+ Buckets: prometheus.ExponentialBuckets(100.0, 1.2, 30), + }, names), + SeriesDropped: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "series_dropped", + Help: "Number of series dropped from the partition.", + }, names), + Series: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "series_total", + Help: "Number of series in the partition.", + }, names), + Measurements: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "measurements_total", + Help: "Number of measurements in the partition.", + }, names), + FilesTotal: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "files_total", + Help: "Number of files in the partition.", + }, fileNames), + DiskSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "disk_bytes", + Help: "Number of bytes TSI partition is using on disk.", + }, names), + CompactionsActive: prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "compactions_active", + Help: "Number of active partition compactions.", + }, compactionNames), + CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "compactions_duration_seconds", + Help: "Time taken for a successful compaction of partition.", + // 30 buckets spaced exponentially between 1s and ~10 minutes. + Buckets: prometheus.ExponentialBuckets(1.0, 1.25, 30), + }, compactionNames), + Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: partitionSubsystem, + Name: "compactions", + Help: "Number of compactions.", + }, attemptedCompactionNames), + } +} + +// Labels returns a copy of labels for use with TSI partition metrics. 
+func (m *partitionMetrics) Labels(partition int) prometheus.Labels { + l := make(map[string]string, len(m.labels)) + for k, v := range m.labels { + l[k] = v + } + + // N.B all series file metrics include the partition. So it's included here. + l["partition_id"] = fmt.Sprint(partition) + return l +} + +// PrometheusCollectors satisfies the prom.PrometheusCollector interface. +func (m *partitionMetrics) PrometheusCollectors() []prometheus.Collector { + return []prometheus.Collector{ + m.SeriesCreated, + m.SeriesCreatedDuration, + m.SeriesDropped, + m.Series, + m.Measurements, + m.FilesTotal, + m.DiskSize, + m.CompactionsActive, + m.CompactionDuration, + m.Compactions, + } +} diff --git a/tsdb/tsi1/partition.go b/tsdb/tsi1/partition.go index fbfe1b7e8e..84d8eb08b2 100644 --- a/tsdb/tsi1/partition.go +++ b/tsdb/tsi1/partition.go @@ -54,6 +54,8 @@ type Partition struct { // Measurement stats stats MeasurementCardinalityStats + tracker *partitionTracker + // Fast series lookup of series IDs in the series file that have been present // in this partition. This set tracks both insertions and deletions of a series. seriesIDSet *tsdb.SeriesIDSet @@ -92,7 +94,7 @@ type Partition struct { // NewPartition returns a new instance of Partition. func NewPartition(sfile *tsdb.SeriesFile, path string) *Partition { - return &Partition{ + partition := &Partition{ closing: make(chan struct{}), path: path, sfile: sfile, @@ -106,6 +108,11 @@ func NewPartition(sfile *tsdb.SeriesFile, path string) *Partition { logger: zap.NewNop(), version: Version, } + + base := filepath.Base(path) + id, _ := strconv.Atoi(base) // Ignore error because we will re-check during Open. + partition.tracker = newPartitionTracker(newPartitionMetrics(nil), id) + return partition } // bytes estimates the memory footprint of this Partition, in bytes. 
@@ -244,6 +251,10 @@ func (p *Partition) Open() error { if err := p.buildSeriesSet(); err != nil { return err } + p.tracker.SetSeries(p.seriesIDSet.Cardinality()) + p.tracker.SetFiles(uint64(len(p.fileSet.IndexFiles())), "index") + p.tracker.SetFiles(uint64(len(p.fileSet.LogFiles())), "log") + p.tracker.SetDiskSize(uint64(p.fileSet.Size())) // Mark opened. p.opened = true @@ -472,6 +483,11 @@ func (p *Partition) prependActiveLogFile() error { if err := p.writeStatsFile(); err != nil { return err } + + // Set the file metrics again. + p.tracker.SetFiles(uint64(len(p.fileSet.IndexFiles())), "index") + p.tracker.SetFiles(uint64(len(p.fileSet.LogFiles())), "log") + p.tracker.SetDiskSize(uint64(p.fileSet.Size())) return nil } @@ -663,6 +679,7 @@ func (p *Partition) createSeriesListIfNotExists(collection *tsdb.SeriesCollectio defer fs.Release() // Ensure fileset cannot change during insert. + now := time.Now() p.mu.RLock() // Insert series into log file. ids, err := p.activeLogFile.AddSeriesList(p.seriesIDSet, collection) @@ -675,9 +692,26 @@ func (p *Partition) createSeriesListIfNotExists(collection *tsdb.SeriesCollectio if err := p.CheckLogFile(); err != nil { return nil, err } + + // NOTE(edd): if this becomes expensive then we can move the count into the + // log file. + var totalNew uint64 + for _, id := range ids { + if !id.IsZero() { + totalNew++ + } + } + if totalNew > 0 { + p.tracker.AddSeriesCreated(totalNew, time.Since(now)) + p.tracker.AddSeries(totalNew) + p.tracker.SetDiskSize(uint64(p.fileSet.Size())) + } return ids, nil } +// DropSeries removes the provided series id from the index. +// +// TODO(edd): We should support a bulk drop here. func (p *Partition) DropSeries(seriesID tsdb.SeriesID) error { // Ignore if the series is already deleted. if !p.seriesIDSet.Contains(seriesID) { @@ -691,6 +725,8 @@ func (p *Partition) DropSeries(seriesID tsdb.SeriesID) error { // Update series set. 
p.seriesIDSet.Remove(seriesID) + p.tracker.AddSeriesDropped(1) + p.tracker.SubSeries(1) // Swap log file, if necessary. return p.CheckLogFile() @@ -923,7 +959,22 @@ func (p *Partition) compact() { func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-chan struct{}) { assert(len(files) >= 2, "at least two index files are required for compaction") assert(level > 0, "cannot compact level zero") + + var err error + var start time.Time + p.tracker.IncActiveCompaction(level) + // Set the relevant metrics at the end of any compaction. + defer func() { + p.tracker.SetFiles(uint64(len(p.fileSet.IndexFiles())), "index") + p.tracker.SetFiles(uint64(len(p.fileSet.LogFiles())), "log") + p.tracker.SetDiskSize(uint64(p.fileSet.Size())) + p.tracker.DecActiveCompaction(level) + + success := err == nil + p.tracker.CompactionAttempted(level, success, time.Since(start)) + }() + // Build a logger for this compaction. log, logEnd := logger.NewOperation(p.logger, "TSI level compaction", "tsi1_compact_to_level", zap.Int("tsi1_level", level)) defer logEnd() @@ -942,12 +993,12 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch defer once.Do(func() { IndexFiles(files).Release() }) // Track time to compact. - start := time.Now() + start = time.Now() // Create new index file. path := filepath.Join(p.path, FormatIndexFileName(p.NextSequence(), level)) - f, err := os.Create(path) - if err != nil { + var f *os.File + if f, err = os.Create(path);err != nil { log.Error("Cannot create compaction files", zap.Error(err)) return } @@ -960,14 +1011,14 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch // Compact all index files to new index file. 
lvl := p.levels[level] - n, err := IndexFiles(files).CompactTo(f, p.sfile, lvl.M, lvl.K, interrupt) - if err != nil { + var n int64 + if n, err = IndexFiles(files).CompactTo(f, p.sfile, lvl.M, lvl.K, interrupt); err != nil { log.Error("Cannot compact index files", zap.Error(err)) return } // Close file. - if err := f.Close(); err != nil { + if err = f.Close(); err != nil { log.Error("Error closing index file", zap.Error(err)) return } @@ -975,13 +1026,13 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch // Reopen as an index file. file := NewIndexFile(p.sfile) file.SetPath(path) - if err := file.Open(); err != nil { + if err = file.Open(); err != nil { log.Error("Cannot open new index file", zap.Error(err)) return } // Obtain lock to swap in index file and write manifest. - if err := func() error { + if err = func() error { p.mu.Lock() defer p.mu.Unlock() @@ -1021,10 +1072,10 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch for _, f := range files { log.Info("Removing index file", zap.String("path", f.Path())) - if err := f.Close(); err != nil { + if err = f.Close(); err != nil { log.Error("Cannot close index file", zap.Error(err)) return - } else if err := os.Remove(f.Path()); err != nil { + } else if err = os.Remove(f.Path()); err != nil { log.Error("Cannot remove index file", zap.Error(err)) return } @@ -1228,6 +1279,119 @@ func (p *Partition) MeasurementCardinalityStats() MeasurementCardinalityStats { return stats } +type partitionTracker struct { + metrics *partitionMetrics + id int // ID of partition. +} + +func newPartitionTracker(metrics *partitionMetrics, partition int) *partitionTracker { + return &partitionTracker{ + metrics: metrics, + id: partition, + } +} + +// AddSeriesCreated increases the number of series created in the partition by n +// and sets a sample of the time taken to create a series. 
+func (t *partitionTracker) AddSeriesCreated(n uint64, d time.Duration) { + labels := t.metrics.Labels(t.id) + t.metrics.SeriesCreated.With(labels).Add(float64(n)) + + if n == 0 { + return // Nothing to record + } + + perseries := d.Seconds() / float64(n) + t.metrics.SeriesCreatedDuration.With(labels).Observe(perseries) +} + +// AddSeriesDropped increases the number of series dropped in the partition by n. +func (t *partitionTracker) AddSeriesDropped(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.SeriesDropped.With(labels).Add(float64(n)) +} + +// SetSeries sets the number of series in the partition. +func (t *partitionTracker) SetSeries(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Series.With(labels).Set(float64(n)) +} + +// AddSeries increases the number of series in the partition by n. +func (t *partitionTracker) AddSeries(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Series.With(labels).Add(float64(n)) +} + +// SubSeries decreases the number of series in the partition by n. +func (t *partitionTracker) SubSeries(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Series.With(labels).Sub(float64(n)) +} + +// SetMeasurements sets the number of measurements in the partition. +func (t *partitionTracker) SetMeasurements(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Measurements.With(labels).Set(float64(n)) +} + +// AddMeasurements increases the number of measurements in the partition by n. +func (t *partitionTracker) AddMeasurements(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Measurements.With(labels).Add(float64(n)) +} + +// SubMeasurements decreases the number of measurements in the partition by n. +func (t *partitionTracker) SubMeasurements(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.Measurements.With(labels).Sub(float64(n)) +} + +// SetFiles sets the number of files in the partition. 
+func (t *partitionTracker) SetFiles(n uint64, typ string) { + labels := t.metrics.Labels(t.id) + labels["type"] = typ + t.metrics.FilesTotal.With(labels).Set(float64(n)) +} + +// SetDiskSize sets the size of files in the partition. +func (t *partitionTracker) SetDiskSize(n uint64) { + labels := t.metrics.Labels(t.id) + t.metrics.DiskSize.With(labels).Set(float64(n)) +} + +// IncActiveCompaction increments the number of active compactions for the provided level. +func (t *partitionTracker) IncActiveCompaction(level int) { + labels := t.metrics.Labels(t.id) + labels["level"] = fmt.Sprint(level) + + t.metrics.CompactionsActive.With(labels).Inc() +} + +// DecActiveCompaction decrements the number of active compactions for the provided level. +func (t *partitionTracker) DecActiveCompaction(level int) { + labels := t.metrics.Labels(t.id) + labels["level"] = fmt.Sprint(level) + + t.metrics.CompactionsActive.With(labels).Dec() +} + +// CompactionAttempted updates the number of compactions attempted for the provided level. 
+func (t *partitionTracker) CompactionAttempted(level int, success bool, d time.Duration) { + labels := t.metrics.Labels(t.id) + labels["level"] = fmt.Sprint(level) + if success { + t.metrics.CompactionDuration.With(labels).Observe(d.Seconds()) + + labels["status"] = "ok" + t.metrics.Compactions.With(labels).Inc() + return + } + + labels["status"] = "error" + t.metrics.Compactions.With(labels).Inc() +} + // unionStringSets returns the union of two sets func unionStringSets(a, b map[string]struct{}) map[string]struct{} { other := make(map[string]struct{}) From e0c10227d05b73364960e7b67d4ed5a2fd40bfdf Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 14:58:32 +0000 Subject: [PATCH 12/25] Fix metric issue in series file --- pkg/rhh/rhh.go | 2 ++ tsdb/metrics.go | 2 +- tsdb/series_file.go | 15 ++++++++++----- tsdb/series_file_test.go | 4 ++-- tsdb/series_index.go | 3 ++- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/pkg/rhh/rhh.go b/pkg/rhh/rhh.go index 0e51a4639b..2ed05c9ac6 100644 --- a/pkg/rhh/rhh.go +++ b/pkg/rhh/rhh.go @@ -6,6 +6,7 @@ import ( "math/rand" "sort" "time" + "fmt" "github.com/cespare/xxhash" "github.com/prometheus/client_golang/prometheus" @@ -311,6 +312,7 @@ func (t *rhhTracker) SetProbeCount(length float64) { func (t *rhhTracker) incGet(status string) { labels := t.metrics.Labels() labels["status"] = status + fmt.Println(labels) t.metrics.Gets.With(labels).Inc() } diff --git a/tsdb/metrics.go b/tsdb/metrics.go index dbaa2f6218..9e80506264 100644 --- a/tsdb/metrics.go +++ b/tsdb/metrics.go @@ -27,7 +27,7 @@ type seriesFileMetrics struct { // newSeriesFileMetrics initialises the prometheus metrics for tracking the Series File. func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { - var names []string + names := []string{"partition_id"} // All metrics have this label. 
for k := range labels { names = append(names, k) } diff --git a/tsdb/series_file.go b/tsdb/series_file.go index 95f75a26dc..f1c5fa6e1b 100644 --- a/tsdb/series_file.go +++ b/tsdb/series_file.go @@ -55,8 +55,10 @@ type SeriesFile struct { // NewSeriesFile returns a new instance of SeriesFile. func NewSeriesFile(path string) *SeriesFile { return &SeriesFile{ - path: path, - Logger: zap.NewNop(), + path: path, + partitionMetrics: newSeriesFileMetrics(nil), + indexMetrics: rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", nil), + Logger: zap.NewNop(), } } @@ -72,7 +74,8 @@ func (f *SeriesFile) SetDefaultMetricLabels(labels prometheus.Labels) { for k, v := range labels { f.defaultMetricLabels[k] = v } - f.defaultMetricLabels["partition_id"] = "" // All metrics have partition_id as a label. + f.partitionMetrics = newSeriesFileMetrics(labels) + f.indexMetrics = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", labels) } // Open memory maps the data file at the file's path. @@ -89,9 +92,11 @@ func (f *SeriesFile) Open() error { return err } - f.partitionMetrics = newSeriesFileMetrics(f.defaultMetricLabels) // All partitions must share the same metrics. - f.indexMetrics = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", f.defaultMetricLabels) + // Ensure the that RHH metrics have the correct partition label. + newLabels := f.indexMetrics.Labels() + newLabels["partition_id"] = "" // Each partition index will set this when setMetrics is called. + f.indexMetrics = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", newLabels) // Open partitions. f.partitions = make([]*SeriesPartition, 0, SeriesFilePartitionN) for i := 0; i < SeriesFilePartitionN; i++ { diff --git a/tsdb/series_file_test.go b/tsdb/series_file_test.go index 8dff5df24e..14fa6620d4 100644 --- a/tsdb/series_file_test.go +++ b/tsdb/series_file_test.go @@ -119,7 +119,7 @@ func TestSeriesFileCompactor(t *testing.T) { // Compact in-place for each partition. 
for _, p := range sfile.Partitions() { compactor := tsdb.NewSeriesPartitionCompactor() - if err := compactor.Compact(p); err != nil { + if _, err := compactor.Compact(p); err != nil { t.Fatal(err) } } @@ -267,7 +267,7 @@ func (f *SeriesFile) Reopen() error { // ForceCompact executes an immediate compaction across all partitions. func (f *SeriesFile) ForceCompact() error { for _, p := range f.Partitions() { - if err := tsdb.NewSeriesPartitionCompactor().Compact(p); err != nil { + if _, err := tsdb.NewSeriesPartitionCompactor().Compact(p); err != nil { return err } } diff --git a/tsdb/series_index.go b/tsdb/series_index.go index 558aa960ff..de45b89ec2 100644 --- a/tsdb/series_index.go +++ b/tsdb/series_index.go @@ -64,7 +64,8 @@ func NewSeriesIndex(path string) *SeriesIndex { } } -// setMetrics sets the metrics +// setMetrics sets the metrics for this index. The partition id has to be injected +// into the RHH metric labels. func (idx *SeriesIndex) setMetrics(metrics *rhh.Metrics, id int) { idx.metrics = metrics idx.metrics.Lab = idx.metrics.Labels() // Copy labels From f9a2f7a01784aea7f5f7d69184d3612805f087d9 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 15:15:20 +0000 Subject: [PATCH 13/25] go fmt --- pkg/rhh/rhh.go | 2 -- storage/engine.go | 4 ++-- tsdb/tsi1/index.go | 14 +++++++------- tsdb/tsi1/partition.go | 6 +++--- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/pkg/rhh/rhh.go b/pkg/rhh/rhh.go index 2ed05c9ac6..0e51a4639b 100644 --- a/pkg/rhh/rhh.go +++ b/pkg/rhh/rhh.go @@ -6,7 +6,6 @@ import ( "math/rand" "sort" "time" - "fmt" "github.com/cespare/xxhash" "github.com/prometheus/client_golang/prometheus" @@ -312,7 +311,6 @@ func (t *rhhTracker) SetProbeCount(length float64) { func (t *rhhTracker) incGet(status string) { labels := t.metrics.Labels() labels["status"] = status - fmt.Println(labels) t.metrics.Gets.With(labels).Inc() } diff --git a/storage/engine.go b/storage/engine.go index dcc3e0659f..b86306013a 100644 --- 
a/storage/engine.go +++ b/storage/engine.go @@ -1,10 +1,10 @@ package storage import ( - "fmt" "bytes" "context" "errors" + "fmt" "sync" "time" @@ -202,7 +202,7 @@ func (e *Engine) Open() error { // For now we will just run on an interval as we only have the retention // policy enforcer. e.runRetentionEnforcer() - + return nil } diff --git a/tsdb/tsi1/index.go b/tsdb/tsi1/index.go index 3f71b329a7..45fa290cd3 100644 --- a/tsdb/tsi1/index.go +++ b/tsdb/tsi1/index.go @@ -113,7 +113,7 @@ type Index struct { defaultLabels prometheus.Labels - tagValueCache *TagValueSeriesIDCache + tagValueCache *TagValueSeriesIDCache partitionMetrics *partitionMetrics // Maintain a single set of partition metrics to be shared by partition. // The following may be set when initializing an Index. @@ -141,13 +141,13 @@ func (i *Index) UniqueReferenceID() uintptr { // NewIndex returns a new instance of Index. func NewIndex(sfile *tsdb.SeriesFile, c Config, options ...IndexOption) *Index { idx := &Index{ - tagValueCache: NewTagValueSeriesIDCache(DefaultSeriesIDSetCacheSize), + tagValueCache: NewTagValueSeriesIDCache(DefaultSeriesIDSetCacheSize), partitionMetrics: newPartitionMetrics(nil), - maxLogFileSize: int64(c.MaxIndexLogFileSize), - logger: zap.NewNop(), - version: Version, - sfile: sfile, - PartitionN: DefaultPartitionN, + maxLogFileSize: int64(c.MaxIndexLogFileSize), + logger: zap.NewNop(), + version: Version, + sfile: sfile, + PartitionN: DefaultPartitionN, } for _, option := range options { diff --git a/tsdb/tsi1/partition.go b/tsdb/tsi1/partition.go index 84d8eb08b2..fa47af4358 100644 --- a/tsdb/tsi1/partition.go +++ b/tsdb/tsi1/partition.go @@ -959,7 +959,7 @@ func (p *Partition) compact() { func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-chan struct{}) { assert(len(files) >= 2, "at least two index files are required for compaction") assert(level > 0, "cannot compact level zero") - + var err error var start time.Time @@ -974,7 +974,7 @@ func (p 
*Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch success := err == nil p.tracker.CompactionAttempted(level, success, time.Since(start)) }() - + // Build a logger for this compaction. log, logEnd := logger.NewOperation(p.logger, "TSI level compaction", "tsi1_compact_to_level", zap.Int("tsi1_level", level)) defer logEnd() @@ -998,7 +998,7 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch // Create new index file. path := filepath.Join(p.path, FormatIndexFileName(p.NextSequence(), level)) var f *os.File - if f, err = os.Create(path);err != nil { + if f, err = os.Create(path); err != nil { log.Error("Cannot create compaction files", zap.Error(err)) return } From a1804d27be3c16b20a1146d39c6c93a7ae862786 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 16:04:52 +0000 Subject: [PATCH 14/25] Fix race --- tsdb/tsi1/partition.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tsdb/tsi1/partition.go b/tsdb/tsi1/partition.go index fa47af4358..6287682f4d 100644 --- a/tsdb/tsi1/partition.go +++ b/tsdb/tsi1/partition.go @@ -704,7 +704,9 @@ func (p *Partition) createSeriesListIfNotExists(collection *tsdb.SeriesCollectio if totalNew > 0 { p.tracker.AddSeriesCreated(totalNew, time.Since(now)) p.tracker.AddSeries(totalNew) + p.mu.RLock() p.tracker.SetDiskSize(uint64(p.fileSet.Size())) + p.mu.RUnlock() } return ids, nil } From 2bb558a9d1319892b125e2303daee80dbc845348 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 16:09:38 +0000 Subject: [PATCH 15/25] Ensure fileset protected by lock --- tsdb/tsi1/partition.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tsdb/tsi1/partition.go b/tsdb/tsi1/partition.go index 6287682f4d..476a3269bb 100644 --- a/tsdb/tsi1/partition.go +++ b/tsdb/tsi1/partition.go @@ -968,6 +968,8 @@ func (p *Partition) compactToLevel(files []*IndexFile, level int, interrupt <-ch p.tracker.IncActiveCompaction(level) // Set the relevant metrics at the end of 
any compaction. defer func() { + p.mu.RLock() + defer p.mu.RUnlock() p.tracker.SetFiles(uint64(len(p.fileSet.IndexFiles())), "index") p.tracker.SetFiles(uint64(len(p.fileSet.LogFiles())), "log") p.tracker.SetDiskSize(uint64(p.fileSet.Size())) @@ -1134,6 +1136,14 @@ func (p *Partition) compactLogFile(logFile *LogFile) { return } + defer func() { + p.mu.RLock() + defer p.mu.RUnlock() + p.tracker.SetFiles(uint64(len(p.fileSet.IndexFiles())), "index") + p.tracker.SetFiles(uint64(len(p.fileSet.LogFiles())), "log") + p.tracker.SetDiskSize(uint64(p.fileSet.Size())) + }() + p.mu.Lock() interrupt := p.compactionInterrupt p.mu.Unlock() From a8b6827c9e2d8297d62ecdf0afa5a0503c897383 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 16:26:13 +0000 Subject: [PATCH 16/25] megacheck --- tsdb/tsm1/wal.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tsdb/tsm1/wal.go b/tsdb/tsm1/wal.go index 54e50609aa..13d97fbc1b 100644 --- a/tsdb/tsm1/wal.go +++ b/tsdb/tsm1/wal.go @@ -584,7 +584,6 @@ type walTracker struct { metrics *walMetrics oldSegmentBytes uint64 - currentSegmentBytes uint64 } func newWALTracker(metrics *walMetrics) *walTracker { From 93892c20abf9a4e76858f3dee9826627c2873928 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 29 Nov 2018 16:58:20 +0000 Subject: [PATCH 17/25] Fix test --- cmd/influxd/main.go | 1 - tsdb/tsm1/wal.go | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cmd/influxd/main.go b/cmd/influxd/main.go index 5f3af73119..5fe525e0e3 100644 --- a/cmd/influxd/main.go +++ b/cmd/influxd/main.go @@ -265,7 +265,6 @@ func (m *Main) run(ctx context.Context) (err error) { { m.engine = storage.NewEngine(m.enginePath, storage.NewConfig(), storage.WithRetentionEnforcer(bucketSvc)) m.engine.WithLogger(m.logger) - reg.MustRegister(m.engine.PrometheusCollectors()...) 
if err := m.engine.Open(); err != nil { m.logger.Error("failed to open engine", zap.Error(err)) diff --git a/tsdb/tsm1/wal.go b/tsdb/tsm1/wal.go index 13d97fbc1b..dc650de0ae 100644 --- a/tsdb/tsm1/wal.go +++ b/tsdb/tsm1/wal.go @@ -583,7 +583,7 @@ func (l *WAL) newSegmentFile() error { type walTracker struct { metrics *walMetrics - oldSegmentBytes uint64 + oldSegmentBytes uint64 } func newWALTracker(metrics *walMetrics) *walTracker { From de491968baa39e00bce2f5408d2206ecd31015e1 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 30 Nov 2018 12:31:49 +0000 Subject: [PATCH 18/25] Fix rebase --- tsdb/tsm1/engine.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index f4dce6edfa..e2740a73e9 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -1161,7 +1161,6 @@ func (t *compactionTracker) SetOptimiseQueue(length uint64) { t.SetQueue(4, leng // SetFullQueue sets the queue depth for Full compactions. func (t *compactionTracker) SetFullQueue(length uint64) { t.SetQueue(5, length) } ->>>>>>> Convert TSM compaction stats to Prom metrics // WriteSnapshot will snapshot the cache and write a new TSM file with its contents, releasing the snapshot when done. 
func (e *Engine) WriteSnapshot() error { // Lock and grab the cache snapshot along with all the closed WAL From 79b108d174e73064796800f26d216bda8c150dd1 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 30 Nov 2018 16:04:40 +0000 Subject: [PATCH 19/25] Fix bug with slice reuse --- pkg/rhh/metrics.go | 2 +- tsdb/metrics.go | 4 ++-- tsdb/tsi1/metrics.go | 8 ++++---- tsdb/tsm1/metrics.go | 19 ++++++++++--------- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/pkg/rhh/metrics.go b/pkg/rhh/metrics.go index e314339d8d..40d71008c2 100644 --- a/pkg/rhh/metrics.go +++ b/pkg/rhh/metrics.go @@ -30,7 +30,7 @@ func NewMetrics(namespace, subsystem string, labels prometheus.Labels) *Metrics } sort.Strings(names) - getPutNames := append(names, "status") + getPutNames := append(append([]string(nil), names...), "status") sort.Strings(getPutNames) return &Metrics{ diff --git a/tsdb/metrics.go b/tsdb/metrics.go index 9e80506264..4c100aa953 100644 --- a/tsdb/metrics.go +++ b/tsdb/metrics.go @@ -33,10 +33,10 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { } sort.Strings(names) - totalCompactions := append(names, "status") + totalCompactions := append(append([]string(nil), names...), "status") sort.Strings(totalCompactions) - durationCompaction := append(names, "component") + durationCompaction := append(append([]string(nil), names...), "component") sort.Strings(durationCompaction) return &seriesFileMetrics{ diff --git a/tsdb/tsi1/metrics.go b/tsdb/tsi1/metrics.go index f81aa6b59a..11ac35da6b 100644 --- a/tsdb/tsi1/metrics.go +++ b/tsdb/tsi1/metrics.go @@ -32,7 +32,7 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { } sort.Strings(names) - statusNames := append(names, "status") + statusNames := append(append([]string(nil), names...), "status") sort.Strings(statusNames) return &cacheMetrics{ @@ -120,15 +120,15 @@ func newPartitionMetrics(labels prometheus.Labels) *partitionMetrics { sort.Strings(names) // type = {"index", 
"log"} - fileNames := append(names, "type") + fileNames := append(append([]string(nil), names...), "type") sort.Strings(fileNames) // level = [0, 7] - compactionNames := append(names, "level") + compactionNames := append(append([]string(nil), names...), "level") sort.Strings(compactionNames) // status = {"ok", "error"} - attemptedCompactionNames := append(compactionNames, "status") + attemptedCompactionNames := append(append([]string(nil), compactionNames...), "status") sort.Strings(attemptedCompactionNames) return &partitionMetrics{ diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index 7f3865f962..e260f0042c 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -59,12 +59,13 @@ type compactionMetrics struct { // newCompactionMetrics initialises the prometheus metrics for compactions. func newCompactionMetrics(labels prometheus.Labels) *compactionMetrics { - compactionNames := []string{"level"} // All compaction metrics have a `level` label. + names := []string{"level"} // All compaction metrics have a `level` label. for k := range labels { - compactionNames = append(compactionNames, k) + names = append(names, k) } - sort.Strings(compactionNames) - totalCompactionsNames := append(compactionNames, "status") + sort.Strings(names) + + totalCompactionsNames := append(append([]string(nil), names...), "status") sort.Strings(totalCompactionsNames) return &compactionMetrics{ @@ -80,7 +81,7 @@ func newCompactionMetrics(labels prometheus.Labels) *compactionMetrics { Subsystem: compactionSubsystem, Name: "active", Help: "Number of active compactions.", - }, compactionNames), + }, names), CompactionDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: namespace, Subsystem: compactionSubsystem, @@ -88,13 +89,13 @@ func newCompactionMetrics(labels prometheus.Labels) *compactionMetrics { Help: "Time taken for a successful compaction or snapshot.", // 30 buckets spaced exponentially between 5s and ~53 minutes. 
Buckets: prometheus.ExponentialBuckets(5.0, 1.25, 30), - }, compactionNames), + }, names), CompactionQueue: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: compactionSubsystem, Name: "queued", Help: "Number of queued compactions.", - }, compactionNames), + }, names), } } @@ -191,7 +192,7 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { } sort.Strings(names) - writeNames := append(names, "status") + writeNames := append(append([]string(nil), names...), "status") sort.Strings(writeNames) return &cacheMetrics{ @@ -280,7 +281,7 @@ func newWALMetrics(labels prometheus.Labels) *walMetrics { } sort.Strings(names) - writeNames := append(names, "status") + writeNames := append(append([]string(nil), names...), "status") sort.Strings(writeNames) return &walMetrics{ From d94f898c8b91750753109055ad83b9608a25e7ad Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Tue, 4 Dec 2018 21:40:28 +0000 Subject: [PATCH 20/25] WIP --- tsdb/tsm1/engine.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index e2740a73e9..8f438cba9d 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -499,11 +499,18 @@ func (e *Engine) DiskSize() int64 { return e.FileStore.DiskSizeBytes() + walDiskSizeBytes } +var _blockMetrics *blockMetrics +var _mu sync.RWMutex + // Open opens and initializes the engine. func (e *Engine) Open() error { - // Initialise metrics... - e.blockMetrics = newBlockMetrics(e.defaultMetricLabels) + // Initialise metrics if an engine has not done so already. + _mu.Lock() + if _blockMetrics == nil { + _blockMetrics = newBlockMetrics(e.defaultMetricLabels) + } + e.blockMetrics = _blockMetrics // Propagate prometheus metrics down into trackers. 
e.compactionTracker = newCompactionTracker(e.blockMetrics.compactionMetrics) e.FileStore.tracker = newFileTracker(e.blockMetrics.fileMetrics) @@ -513,6 +520,7 @@ func (e *Engine) Open() error { if wal, ok := e.WAL.(*WAL); ok { wal.tracker = newWALTracker(e.blockMetrics.walMetrics) } + _mu.Unlock() e.scheduler.setCompactionTracker(e.compactionTracker) From aa936df1381dc0ac7820dad4824867789922f8ce Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 5 Dec 2018 16:41:00 +0000 Subject: [PATCH 21/25] Ensure all tsm1 metrics support multiple instances --- storage/engine.go | 2 +- tsdb/tsm1/bit_reader_test.go | 2 +- tsdb/tsm1/cache.go | 46 +++--- tsdb/tsm1/engine.go | 72 ++++----- tsdb/tsm1/file_store.go | 18 ++- tsdb/tsm1/float.go | 2 +- tsdb/tsm1/metrics.go | 73 +++------ tsdb/tsm1/metrics_test.go | 282 +++++++++++++++++++++++++++++++++++ tsdb/tsm1/scheduler.go | 2 +- tsdb/tsm1/wal.go | 33 ++-- 10 files changed, 410 insertions(+), 122 deletions(-) create mode 100644 tsdb/tsm1/metrics_test.go diff --git a/storage/engine.go b/storage/engine.go index b86306013a..01bc6a59ba 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -168,7 +168,7 @@ func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector metrics = append(metrics, e.sfile.PrometheusCollectors()...) metrics = append(metrics, e.index.PrometheusCollectors()...) - metrics = append(metrics, e.engine.PrometheusCollectors()...) + metrics = append(metrics, tsm1.PrometheusCollectors()...) metrics = append(metrics, e.retentionEnforcer.PrometheusCollectors()...) 
return metrics } diff --git a/tsdb/tsm1/bit_reader_test.go b/tsdb/tsm1/bit_reader_test.go index 0360a13531..f5d3150f1e 100644 --- a/tsdb/tsm1/bit_reader_test.go +++ b/tsdb/tsm1/bit_reader_test.go @@ -9,7 +9,7 @@ import ( "testing" "testing/quick" - "github.com/dgryski/go-bitstream" + bitstream "github.com/dgryski/go-bitstream" "github.com/influxdata/platform/tsdb/tsm1" ) diff --git a/tsdb/tsm1/cache.go b/tsdb/tsm1/cache.go index 70599277ca..1302b5b3a5 100644 --- a/tsdb/tsm1/cache.go +++ b/tsdb/tsm1/cache.go @@ -11,6 +11,7 @@ import ( "github.com/influxdata/influxql" "github.com/influxdata/platform/models" "github.com/influxdata/platform/tsdb" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -187,7 +188,7 @@ func NewCache(maxSize uint64) *Cache { maxSize: maxSize, store: emptyStore{}, lastSnapshot: time.Now(), - tracker: newCacheTracker(newCacheMetrics(nil)), + tracker: newCacheTracker(newCacheMetrics(nil), nil), } c.initialize.Store(&sync.Once{}) return c @@ -337,10 +338,9 @@ func (c *Cache) Snapshot() (*Cache, error) { return nil, err } - newMetrics := newCacheMetrics(c.tracker.metrics.Labels()) c.snapshot = &Cache{ store: store, - tracker: newCacheTracker(newMetrics), + tracker: newCacheTracker(c.tracker.metrics, c.tracker.labels), } } @@ -405,10 +405,9 @@ func (c *Cache) ClearSnapshot(success bool) { c.tracker.SubMemBytes(snapshotSize) // decrement the number of bytes in cache // Reset the snapshot to a fresh Cache. - newMetrics := newCacheMetrics(c.tracker.metrics.Labels()) c.snapshot = &Cache{ store: c.snapshot.store, - tracker: newCacheTracker(newMetrics), + tracker: newCacheTracker(c.tracker.metrics, c.tracker.labels), } c.tracker.SetSnapshotSize(0) @@ -734,6 +733,7 @@ func (c *Cache) UpdateAge() { // could result in the Engine exposing inaccurate metrics. 
type cacheTracker struct { metrics *cacheMetrics + labels prometheus.Labels snapshotsActive uint64 snapshotSize uint64 cacheSize uint64 @@ -745,15 +745,25 @@ type cacheTracker struct { writesErr uint64 } -func newCacheTracker(metrics *cacheMetrics) *cacheTracker { - return &cacheTracker{metrics: metrics} +func newCacheTracker(metrics *cacheMetrics, defaultLabels prometheus.Labels) *cacheTracker { + return &cacheTracker{metrics: metrics, labels: defaultLabels} +} + +// Labels returns a copy of the default labels used by the tracker's metrics. +// The returned map is safe for modification. +func (t *cacheTracker) Labels() prometheus.Labels { + labels := make(prometheus.Labels, len(t.labels)) + for k, v := range t.labels { + labels[k] = v + } + return labels } // AddMemBytes increases the number of in-memory cache bytes. func (t *cacheTracker) AddMemBytes(bytes uint64) { atomic.AddUint64(&t.memSizeBytes, bytes) - labels := t.metrics.Labels() + labels := t.labels t.metrics.MemSize.With(labels).Add(float64(bytes)) } @@ -761,7 +771,7 @@ func (t *cacheTracker) AddMemBytes(bytes uint64) { func (t *cacheTracker) SubMemBytes(bytes uint64) { atomic.AddUint64(&t.memSizeBytes, ^(bytes - 1)) - labels := t.metrics.Labels() + labels := t.labels t.metrics.MemSize.With(labels).Sub(float64(bytes)) } @@ -769,13 +779,13 @@ func (t *cacheTracker) SubMemBytes(bytes uint64) { func (t *cacheTracker) SetMemBytes(bytes uint64) { atomic.StoreUint64(&t.memSizeBytes, bytes) - labels := t.metrics.Labels() + labels := t.labels t.metrics.MemSize.With(labels).Set(float64(bytes)) } // AddBytesWritten increases the number of bytes written to the cache. 
func (t *cacheTracker) AddBytesWritten(bytes uint64) { - labels := t.metrics.Labels() + labels := t.labels t.metrics.MemSize.With(labels).Add(float64(bytes)) } @@ -783,13 +793,13 @@ func (t *cacheTracker) AddBytesWritten(bytes uint64) { func (t *cacheTracker) AddSnapshottedBytes(bytes uint64) { atomic.AddUint64(&t.snapshottedBytes, bytes) - labels := t.metrics.Labels() + labels := t.labels t.metrics.SnapshottedBytes.With(labels).Add(float64(bytes)) } // SetDiskBytes sets the number of bytes on disk used by snapshot data. func (t *cacheTracker) SetDiskBytes(bytes uint64) { - labels := t.metrics.Labels() + labels := t.labels t.metrics.DiskSize.With(labels).Set(float64(bytes)) } @@ -797,7 +807,7 @@ func (t *cacheTracker) SetDiskBytes(bytes uint64) { func (t *cacheTracker) IncSnapshotsActive() { atomic.AddUint64(&t.snapshotsActive, 1) - labels := t.metrics.Labels() + labels := t.labels t.metrics.SnapshotsActive.With(labels).Inc() } @@ -805,13 +815,13 @@ func (t *cacheTracker) IncSnapshotsActive() { func (t *cacheTracker) SetSnapshotsActive(n uint64) { atomic.StoreUint64(&t.snapshotsActive, n) - labels := t.metrics.Labels() + labels := t.labels t.metrics.SnapshotsActive.With(labels).Set(float64(n)) } // AddWrittenBytes increases the number of bytes written to the cache, with a required status. func (t *cacheTracker) AddWrittenBytes(status string, bytes uint64) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.WrittenBytes.With(labels).Add(float64(bytes)) } @@ -827,7 +837,7 @@ func (t *cacheTracker) AddWrittenBytesDrop(bytes uint64) { t.AddWrittenBytes("dr // IncWrites increments the number of writes to the cache, with a required status. 
func (t *cacheTracker) IncWrites(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.Writes.With(labels).Inc() } @@ -869,7 +879,7 @@ func (t *cacheTracker) SnapshotSize() uint64 { return atomic.LoadUint64(&t.snaps // SetAge sets the time since the last successful snapshot func (t *cacheTracker) SetAge(d time.Duration) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.Age.With(labels).Set(d.Seconds()) } diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index 8f438cba9d..cef63dd471 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -156,7 +156,6 @@ type Engine struct { enableCompactionsOnOpen bool compactionTracker *compactionTracker // Used to track state of compactions. - blockMetrics *blockMetrics // Provides Engine metrics to external systems. defaultMetricLabels prometheus.Labels // N.B this must not be mutated after Open is called. // Limiter for concurrent compactions. @@ -499,30 +498,30 @@ func (e *Engine) DiskSize() int64 { return e.FileStore.DiskSizeBytes() + walDiskSizeBytes } -var _blockMetrics *blockMetrics -var _mu sync.RWMutex +func (e *Engine) initTrackers() { + mmu.Lock() + defer mmu.Unlock() -// Open opens and initializes the engine. -func (e *Engine) Open() error { - // Initialise metrics if an engine has not done so already. - _mu.Lock() - if _blockMetrics == nil { - _blockMetrics = newBlockMetrics(e.defaultMetricLabels) + if bms == nil { + // Initialise metrics if an engine has not done so already. + bms = newBlockMetrics(e.defaultMetricLabels) } - e.blockMetrics = _blockMetrics // Propagate prometheus metrics down into trackers. 
- e.compactionTracker = newCompactionTracker(e.blockMetrics.compactionMetrics) - e.FileStore.tracker = newFileTracker(e.blockMetrics.fileMetrics) - e.Cache.tracker = newCacheTracker(e.blockMetrics.cacheMetrics) + e.compactionTracker = newCompactionTracker(bms.compactionMetrics, e.defaultMetricLabels) + e.FileStore.tracker = newFileTracker(bms.fileMetrics, e.defaultMetricLabels) + e.Cache.tracker = newCacheTracker(bms.cacheMetrics, e.defaultMetricLabels) // Set default metrics on WAL if enabled. if wal, ok := e.WAL.(*WAL); ok { - wal.tracker = newWALTracker(e.blockMetrics.walMetrics) + wal.tracker = newWALTracker(bms.walMetrics, e.defaultMetricLabels) } - _mu.Unlock() - e.scheduler.setCompactionTracker(e.compactionTracker) +} + +// Open opens and initializes the engine. +func (e *Engine) Open() error { + e.initTrackers() if err := os.MkdirAll(e.path, 0777); err != nil { return err @@ -568,16 +567,6 @@ func (e *Engine) Close() error { return e.WAL.Close() } -// PrometheusCollectors returns all the prometheus collectors associated with -// the engine and its components. -func (e *Engine) PrometheusCollectors() []prometheus.Collector { - var metrics []prometheus.Collector - metrics = append(metrics, e.blockMetrics.PrometheusCollectors()...) - - // TODO(edd): Add WAL metrics - return metrics -} - // WithLogger sets the logger for the engine. func (e *Engine) WithLogger(log *zap.Logger) { e.logger = log.With(zap.String("engine", "tsm1")) @@ -1058,7 +1047,7 @@ func (l compactionLevel) String() string { // could result in the Engine exposing inaccurate metrics. type compactionTracker struct { metrics *compactionMetrics - + labels prometheus.Labels // Note: Compactions are levelled as follows: // 0 – Snapshots // 1-3 – Levelled compactions @@ -1071,8 +1060,21 @@ type compactionTracker struct { queue [6]uint64 // Gauge of TSM compactions queues (by level). 
} -func newCompactionTracker(metrics *compactionMetrics) *compactionTracker { - return &compactionTracker{metrics: metrics} +func newCompactionTracker(metrics *compactionMetrics, defaultLables prometheus.Labels) *compactionTracker { + return &compactionTracker{metrics: metrics, labels: defaultLables} +} + +// Labels returns a copy of the default labels used by the tracker's metrics. +// The returned map is safe for modification. +func (t *compactionTracker) Labels(level compactionLevel) prometheus.Labels { + labels := make(prometheus.Labels, len(t.labels)) + for k, v := range t.labels { + labels[k] = v + } + + // All metrics have a level label. + labels["level"] = fmt.Sprint(level) + return labels } // Completed returns the total number of compactions for the provided level. @@ -1112,7 +1114,7 @@ func (t *compactionTracker) Errors(level int) uint64 { return atomic.LoadUint64( func (t *compactionTracker) IncActive(level compactionLevel) { atomic.AddUint64(&t.active[level], 1) - labels := t.metrics.Labels(level) + labels := t.Labels(level) t.metrics.CompactionsActive.With(labels).Inc() } @@ -1123,7 +1125,7 @@ func (t *compactionTracker) IncFullActive() { t.IncActive(5) } func (t *compactionTracker) DecActive(level compactionLevel) { atomic.AddUint64(&t.active[level], ^uint64(0)) - labels := t.metrics.Labels(level) + labels := t.Labels(level) t.metrics.CompactionsActive.With(labels).Dec() } @@ -1135,17 +1137,19 @@ func (t *compactionTracker) Attempted(level compactionLevel, success bool, durat if success { atomic.AddUint64(&t.ok[level], 1) - labels := t.metrics.Labels(level) + labels := t.Labels(level) + t.metrics.CompactionDuration.With(labels).Observe(duration.Seconds()) labels["status"] = "ok" t.metrics.Compactions.With(labels).Inc() + return } atomic.AddUint64(&t.errors[level], 1) - labels := t.metrics.Labels(level) + labels := t.Labels(level) labels["status"] = "error" t.metrics.Compactions.With(labels).Inc() } @@ -1159,7 +1163,7 @@ func (t *compactionTracker) 
SnapshotAttempted(success bool, duration time.Durati func (t *compactionTracker) SetQueue(level compactionLevel, length uint64) { atomic.StoreUint64(&t.queue[level], length) - labels := t.metrics.Labels(level) + labels := t.Labels(level) t.metrics.CompactionQueue.With(labels).Set(float64(length)) } diff --git a/tsdb/tsm1/file_store.go b/tsdb/tsm1/file_store.go index 1657bd0bc8..eb81c93ab8 100644 --- a/tsdb/tsm1/file_store.go +++ b/tsdb/tsm1/file_store.go @@ -22,6 +22,7 @@ import ( "github.com/influxdata/platform/pkg/metrics" "github.com/influxdata/platform/query" "github.com/influxdata/platform/tsdb" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -241,7 +242,7 @@ func NewFileStore(dir string) *FileStore { }, obs: noFileStoreObserver{}, parseFileName: DefaultParseFileName, - tracker: newFileTracker(newFileMetrics(nil)), + tracker: newFileTracker(newFileMetrics(nil), nil), } fs.purger.fileStore = fs return fs @@ -297,12 +298,17 @@ type FileStoreStatistics struct { // could result in the Engine exposing inaccurate metrics. type fileTracker struct { metrics *fileMetrics + labels prometheus.Labels diskBytes uint64 fileCount uint64 } -func newFileTracker(metrics *fileMetrics) *fileTracker { - return &fileTracker{metrics: metrics} +func newFileTracker(metrics *fileMetrics, defaultLabels prometheus.Labels) *fileTracker { + return &fileTracker{metrics: metrics, labels: defaultLabels} +} + +func (t *fileTracker) Labels() prometheus.Labels { + return t.labels } // Bytes returns the number of bytes in use on disk. 
@@ -312,7 +318,7 @@ func (t *fileTracker) Bytes() uint64 { return atomic.LoadUint64(&t.diskBytes) } func (t *fileTracker) SetBytes(bytes uint64) { atomic.StoreUint64(&t.diskBytes, bytes) - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.DiskSize.With(labels).Set(float64(bytes)) } @@ -320,7 +326,7 @@ func (t *fileTracker) SetBytes(bytes uint64) { func (t *fileTracker) AddBytes(bytes uint64) { atomic.AddUint64(&t.diskBytes, bytes) - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.DiskSize.With(labels).Add(float64(bytes)) } @@ -328,7 +334,7 @@ func (t *fileTracker) AddBytes(bytes uint64) { func (t *fileTracker) SetFileCount(files uint64) { atomic.StoreUint64(&t.fileCount, files) - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.Files.With(labels).Set(float64(files)) } diff --git a/tsdb/tsm1/float.go b/tsdb/tsm1/float.go index bf1e65447d..ad8f43b7d5 100644 --- a/tsdb/tsm1/float.go +++ b/tsdb/tsm1/float.go @@ -15,7 +15,7 @@ import ( "math" "math/bits" - "github.com/dgryski/go-bitstream" + bitstream "github.com/dgryski/go-bitstream" ) // Note: an uncompressed format is not yet implemented. diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index e260f0042c..940d83eb26 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -1,12 +1,35 @@ package tsm1 import ( - "fmt" "sort" + "sync" "github.com/prometheus/client_golang/prometheus" ) +// The following package variables act as singletons, to be shared by all Engine +// instantiations. This allows multiple Engines to be instantiated within the +// same process. +var ( + bms *blockMetrics + mmu sync.RWMutex +) + +// PrometheusCollectors returns all prometheus metrics for the tsm1 package. +func PrometheusCollectors() []prometheus.Collector { + mmu.RLock() + defer mmu.RUnlock() + + var collectors []prometheus.Collector + if bms != nil { + collectors = append(collectors, bms.compactionMetrics.PrometheusCollectors()...) 
+ collectors = append(collectors, bms.fileMetrics.PrometheusCollectors()...) + collectors = append(collectors, bms.cacheMetrics.PrometheusCollectors()...) + collectors = append(collectors, bms.walMetrics.PrometheusCollectors()...) + } + return collectors +} + // namespace is the leading part of all published metrics for the Storage service. const namespace = "storage" @@ -47,8 +70,6 @@ func (m *blockMetrics) PrometheusCollectors() []prometheus.Collector { // compactionMetrics are a set of metrics concerned with tracking data about compactions. type compactionMetrics struct { - labels prometheus.Labels // Read Only - CompactionsActive *prometheus.GaugeVec CompactionDuration *prometheus.HistogramVec CompactionQueue *prometheus.GaugeVec @@ -69,7 +90,6 @@ func newCompactionMetrics(labels prometheus.Labels) *compactionMetrics { sort.Strings(totalCompactionsNames) return &compactionMetrics{ - labels: labels, Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: compactionSubsystem, @@ -99,17 +119,6 @@ func newCompactionMetrics(labels prometheus.Labels) *compactionMetrics { } } -// Labels returns a copy of labels for use with compaction metrics. -func (m *compactionMetrics) Labels(level compactionLevel) prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - // N.B all compaction metrics include level. So it's included here. - l["level"] = fmt.Sprint(level) - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. func (m *compactionMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ @@ -122,7 +131,6 @@ func (m *compactionMetrics) PrometheusCollectors() []prometheus.Collector { // fileMetrics are a set of metrics concerned with tracking data about compactions. 
type fileMetrics struct { - labels prometheus.Labels DiskSize *prometheus.GaugeVec Files *prometheus.GaugeVec } @@ -136,7 +144,6 @@ func newFileMetrics(labels prometheus.Labels) *fileMetrics { sort.Strings(names) return &fileMetrics{ - labels: labels, DiskSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: fileStoreSubsystem, @@ -152,15 +159,6 @@ func newFileMetrics(labels prometheus.Labels) *fileMetrics { } } -// Labels returns a copy of labels for use with file metrics. -func (m *fileMetrics) Labels() prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. func (m *fileMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ @@ -171,8 +169,6 @@ func (m *fileMetrics) PrometheusCollectors() []prometheus.Collector { // cacheMetrics are a set of metrics concerned with tracking data about the TSM Cache. type cacheMetrics struct { - labels prometheus.Labels // Read Only - MemSize *prometheus.GaugeVec DiskSize *prometheus.GaugeVec SnapshotsActive *prometheus.GaugeVec @@ -196,7 +192,6 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { sort.Strings(writeNames) return &cacheMetrics{ - labels: labels, MemSize: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: cacheSubsystem, @@ -242,15 +237,6 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { } } -// Labels returns a copy of labels for use with cache metrics. -func (m *cacheMetrics) Labels() prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. 
func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ @@ -266,7 +252,6 @@ func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { // walMetrics are a set of metrics concerned with tracking data about compactions. type walMetrics struct { - labels prometheus.Labels OldSegmentBytes *prometheus.GaugeVec CurrentSegmentBytes *prometheus.GaugeVec Segments *prometheus.GaugeVec @@ -285,7 +270,6 @@ func newWALMetrics(labels prometheus.Labels) *walMetrics { sort.Strings(writeNames) return &walMetrics{ - labels: labels, OldSegmentBytes: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: walSubsystem, @@ -313,15 +297,6 @@ func newWALMetrics(labels prometheus.Labels) *walMetrics { } } -// Labels returns a copy of labels for use with WAL metrics. -func (m *walMetrics) Labels() prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. func (m *walMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ diff --git a/tsdb/tsm1/metrics_test.go b/tsdb/tsm1/metrics_test.go new file mode 100644 index 0000000000..12513e66ca --- /dev/null +++ b/tsdb/tsm1/metrics_test.go @@ -0,0 +1,282 @@ +package tsm1 + +import ( + "testing" + + "github.com/influxdata/platform/kit/prom/promtest" + "github.com/prometheus/client_golang/prometheus" +) + +func TestMetrics_Filestore(t *testing.T) { + // metrics to be shared by multiple file stores. + metrics := newFileMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newFileTracker(metrics, prometheus.Labels{"engine_id": "0", "node_id": "0"}) + t2 := newFileTracker(metrics, prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + // Generate some measurements. 
+ t1.AddBytes(100) + t1.SetFileCount(3) + + t2.AddBytes(200) + t2.SetFileCount(4) + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + base := namespace + "_" + fileStoreSubsystem + "_" + m1Bytes := promtest.MustFindMetric(t, mfs, base+"disk_bytes", prometheus.Labels{"engine_id": "0", "node_id": "0"}) + m2Bytes := promtest.MustFindMetric(t, mfs, base+"disk_bytes", prometheus.Labels{"engine_id": "1", "node_id": "0"}) + m1Files := promtest.MustFindMetric(t, mfs, base+"total", prometheus.Labels{"engine_id": "0", "node_id": "0"}) + m2Files := promtest.MustFindMetric(t, mfs, base+"total", prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + if m, got, exp := m1Bytes, m1Bytes.GetGauge().GetValue(), 100.0; got != exp { + t.Errorf("[%s] got %v, expected %v", m, got, exp) + } + + if m, got, exp := m1Files, m1Files.GetGauge().GetValue(), 3.0; got != exp { + t.Errorf("[%s] got %v, expected %v", m, got, exp) + } + + if m, got, exp := m2Bytes, m2Bytes.GetGauge().GetValue(), 200.0; got != exp { + t.Errorf("[%s] got %v, expected %v", m, got, exp) + } + + if m, got, exp := m2Files, m2Files.GetGauge().GetValue(), 4.0; got != exp { + t.Errorf("[%s] got %v, expected %v", m, got, exp) + } + +} + +func TestMetrics_Cache(t *testing.T) { + // metrics to be shared by multiple file stores. + metrics := newCacheMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newCacheTracker(metrics, prometheus.Labels{"engine_id": "0", "node_id": "0"}) + t2 := newCacheTracker(metrics, prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) 
+ + base := namespace + "_" + cacheSubsystem + "_" + + // All the metric names + gauges := []string{ + base + "inuse_bytes", + base + "disk_bytes", + base + "age", + base + "snapshots_active", + } + + counters := []string{ + base + "snapshot_bytes", + base + "written_bytes", + base + "writes", + } + + // Generate some measurements. + for i, tracker := range []*cacheTracker{t1, t2} { + tracker.SetMemBytes(uint64(i + len(gauges[0]))) + tracker.SetDiskBytes(uint64(i + len(gauges[1]))) + tracker.metrics.Age.With(tracker.Labels()).Set(float64(i + len(gauges[2]))) + tracker.SetSnapshotsActive(uint64(i + len(gauges[3]))) + + tracker.AddSnapshottedBytes(uint64(i + len(counters[0]))) + tracker.AddWrittenBytesOK(uint64(i + len(counters[1]))) + + labels := tracker.Labels() + labels["status"] = "ok" + tracker.metrics.Writes.With(labels).Add(float64(i + len(counters[2]))) + } + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. + labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "node_id": "0"}, + } + + for i, labels := range labelVariants { + for _, name := range gauges { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range counters { + exp := float64(i + len(name)) + + if name == counters[1] || name == counters[2] { + labels["status"] = "ok" + } + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + } +} + +func TestMetrics_WAL(t *testing.T) { + // metrics to be shared by multiple file stores. 
+ metrics := newWALMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newWALTracker(metrics, prometheus.Labels{"engine_id": "0", "node_id": "0"}) + t2 := newWALTracker(metrics, prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + base := namespace + "_" + walSubsystem + "_" + + // All the metric names + gauges := []string{ + base + "old_segment_bytes", + base + "current_segment_bytes", + base + "segments_total", + } + + counters := []string{ + base + "writes", + } + + // Generate some measurements. + for i, tracker := range []*walTracker{t1, t2} { + tracker.SetOldSegmentSize(uint64(i + len(gauges[0]))) + tracker.SetCurrentSegmentSize(uint64(i + len(gauges[1]))) + tracker.SetSegments(uint64(i + len(gauges[2]))) + + labels := tracker.Labels() + labels["status"] = "ok" + tracker.metrics.Writes.With(labels).Add(float64(i + len(counters[0]))) + } + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. + labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "node_id": "0"}, + } + + for i, labels := range labelVariants { + for _, name := range gauges { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range counters { + exp := float64(i + len(name)) + + labels["status"] = "ok" + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + } +} + +func TestMetrics_Compactions(t *testing.T) { + // metrics to be shared by multiple file stores. 
+ metrics := newCompactionMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newCompactionTracker(metrics, prometheus.Labels{"engine_id": "0", "node_id": "0"}) + t2 := newCompactionTracker(metrics, prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + base := namespace + "_" + compactionSubsystem + "_" + + // All the metric names + gauges := []string{ + base + "active", + base + "queued", + } + + counters := []string{base + "total"} + histograms := []string{base + "duration_seconds"} + + // Generate some measurements. + for i, tracker := range []*compactionTracker{t1, t2} { + labels := tracker.Labels(2) + tracker.metrics.CompactionsActive.With(labels).Add(float64(i + len(gauges[0]))) + tracker.SetQueue(2, uint64(i+len(gauges[1]))) + + labels = tracker.Labels(2) + labels["status"] = "ok" + tracker.metrics.Compactions.With(labels).Add(float64(i + len(counters[0]))) + + labels = tracker.Labels(2) + tracker.metrics.CompactionDuration.With(labels).Observe(float64(i + len(histograms[0]))) + } + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. 
+ labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "node_id": "0"}, + } + + for i, labels := range labelVariants { + labels["level"] = "2" + + for _, name := range gauges { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range counters { + exp := float64(i + len(name)) + + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["status"] = "ok" + + metric := promtest.MustFindMetric(t, mfs, name, l) + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range histograms { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetHistogram().GetSampleSum(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + } +} diff --git a/tsdb/tsm1/scheduler.go b/tsdb/tsm1/scheduler.go index c4beba403c..141077a8dd 100644 --- a/tsdb/tsm1/scheduler.go +++ b/tsdb/tsm1/scheduler.go @@ -15,7 +15,7 @@ func newScheduler(maxConcurrency int) *scheduler { return &scheduler{ maxConcurrency: maxConcurrency, weights: defaultWeights, - compactionTracker: newCompactionTracker(newCompactionMetrics(nil)), + compactionTracker: newCompactionTracker(newCompactionMetrics(nil), nil), } } diff --git a/tsdb/tsm1/wal.go b/tsdb/tsm1/wal.go index dc650de0ae..04be15c2fb 100644 --- a/tsdb/tsm1/wal.go +++ b/tsdb/tsm1/wal.go @@ -20,6 +20,7 @@ import ( "github.com/golang/snappy" "github.com/influxdata/platform/pkg/limiter" "github.com/influxdata/platform/pkg/pool" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -136,7 +137,7 @@ func NewWAL(path string) *WAL { limiter: 
limiter.NewFixed(defaultWaitingWALWrites), logger: logger, traceLogger: logger, - tracker: newWALTracker(newWALMetrics(nil)), + tracker: newWALTracker(newWALMetrics(nil), nil), } } @@ -581,18 +582,28 @@ func (l *WAL) newSegmentFile() error { // *NOTE* - walTracker fields should not be directory modified. Doing so // could result in the Engine exposing inaccurate metrics. type walTracker struct { - metrics *walMetrics - + metrics *walMetrics + labels prometheus.Labels oldSegmentBytes uint64 } -func newWALTracker(metrics *walMetrics) *walTracker { - return &walTracker{metrics: metrics} +func newWALTracker(metrics *walMetrics, defaultLabels prometheus.Labels) *walTracker { + return &walTracker{metrics: metrics, labels: defaultLabels} +} + +// Labels returns a copy of the default labels used by the tracker's metrics. +// The returned map is safe for modification. +func (t *walTracker) Labels() prometheus.Labels { + labels := make(prometheus.Labels, len(t.labels)) + for k, v := range t.labels { + labels[k] = v + } + return labels } // IncWrites increments the number of writes to the cache, with a required status. 
func (t *walTracker) IncWrites(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.Writes.With(labels).Inc() } @@ -607,7 +618,7 @@ func (t *walTracker) IncWritesErr() { t.IncWrites("error") } func (t *walTracker) SetOldSegmentSize(sz uint64) { atomic.StoreUint64(&t.oldSegmentBytes, sz) - labels := t.metrics.Labels() + labels := t.labels t.metrics.OldSegmentBytes.With(labels).Set(float64(sz)) } @@ -618,7 +629,7 @@ func (t *walTracker) OldSegmentSize() uint64 { return atomic.LoadUint64(&t.oldSe func (t *walTracker) SetCurrentSegmentSize(sz uint64) { atomic.StoreUint64(&t.oldSegmentBytes, sz) - labels := t.metrics.Labels() + labels := t.labels t.metrics.CurrentSegmentBytes.With(labels).Set(float64(sz)) } @@ -627,19 +638,19 @@ func (t *walTracker) CurrentSegmentSize() uint64 { return atomic.LoadUint64(&t.o // SetSegments sets the number of segments files on disk. func (t *walTracker) SetSegments(sz uint64) { - labels := t.metrics.Labels() + labels := t.labels t.metrics.Segments.With(labels).Set(float64(sz)) } // IncSegments increases the number of segments files by one. func (t *walTracker) IncSegments() { - labels := t.metrics.Labels() + labels := t.labels t.metrics.Segments.With(labels).Inc() } // DecSegments decreases the number of segments files by one. 
func (t *walTracker) DecSegments() { - labels := t.metrics.Labels() + labels := t.labels t.metrics.Segments.With(labels).Dec() } From 170aaafa469847eb9fdbf51b2bc8b7dd189d9be2 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Wed, 5 Dec 2018 18:36:19 +0000 Subject: [PATCH 22/25] Add more verbosity to Go tests in CI --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 78bffc3b8d..006273cac4 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ GO_ARGS=-tags '$(GO_TAGS)' # Test vars can be used by all recursive Makefiles export GOOS=$(shell go env GOOS) export GO_BUILD=env GO111MODULE=on go build $(GO_ARGS) -export GO_TEST=env GO111MODULE=on go test $(GO_ARGS) +export GO_TEST=env GOTRACEBACK=all GO111MODULE=on go test $(GO_ARGS) # Do not add GO111MODULE=on to the call to go generate so it doesn't pollute the environment. export GO_GENERATE=go generate $(GO_ARGS) export GO_VET=env GO111MODULE=on go vet $(GO_ARGS) @@ -120,7 +120,7 @@ test-integration: test: test-go test-js test-go-race: - $(GO_TEST) -race -count=1 ./... + $(GO_TEST) -v -race -count=1 ./... vet: $(GO_VET) -v ./... 
From bff655786f3d417b2eea0cdd36d0b82ce66fa261 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Thu, 6 Dec 2018 15:39:06 +0000 Subject: [PATCH 23/25] Ensure tsdb metrics properly registered --- pkg/rhh/metrics.go | 13 +--- pkg/rhh/metrics_test.go | 108 ++++++++++++++++++++++++++++++++ pkg/rhh/rhh.go | 36 +++++++---- storage/engine.go | 2 +- tsdb/metrics.go | 45 ++++++++----- tsdb/metrics_test.go | 132 +++++++++++++++++++++++++++++++++++++++ tsdb/series_file.go | 55 +++++++++------- tsdb/series_index.go | 20 +++--- tsdb/series_partition.go | 38 ++++++----- 9 files changed, 362 insertions(+), 87 deletions(-) create mode 100644 pkg/rhh/metrics_test.go create mode 100644 tsdb/metrics_test.go diff --git a/pkg/rhh/metrics.go b/pkg/rhh/metrics.go index 40d71008c2..24c50bed0f 100644 --- a/pkg/rhh/metrics.go +++ b/pkg/rhh/metrics.go @@ -1,13 +1,13 @@ package rhh import ( + "fmt" "sort" "github.com/prometheus/client_golang/prometheus" ) type Metrics struct { - Lab prometheus.Labels LoadFactor *prometheus.GaugeVec // Load factor of the hashmap. Size *prometheus.GaugeVec // Number of items in hashmap. GetDuration *prometheus.HistogramVec // Sample of get times. @@ -33,8 +33,8 @@ func NewMetrics(namespace, subsystem string, labels prometheus.Labels) *Metrics getPutNames := append(append([]string(nil), names...), "status") sort.Strings(getPutNames) + fmt.Println("getputname", getPutNames) return &Metrics{ - Lab: labels, LoadFactor: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: subsystem, @@ -103,15 +103,6 @@ func NewMetrics(namespace, subsystem string, labels prometheus.Labels) *Metrics } } -// Labels returns a copy of labels for use with RHH metrics. -func (m *Metrics) Labels() prometheus.Labels { - l := make(map[string]string, len(m.Lab)) - for k, v := range m.Lab { - l[k] = v - } - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. 
func (m *Metrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ diff --git a/pkg/rhh/metrics_test.go b/pkg/rhh/metrics_test.go new file mode 100644 index 0000000000..263534d50b --- /dev/null +++ b/pkg/rhh/metrics_test.go @@ -0,0 +1,108 @@ +package rhh + +import ( + "testing" + + "github.com/influxdata/platform/kit/prom/promtest" + "github.com/prometheus/client_golang/prometheus" +) + +func TestMetrics_Metrics(t *testing.T) { + // metrics to be shared by multiple file stores. + metrics := NewMetrics("test", "sub", prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newRHHTracker(metrics, prometheus.Labels{"engine_id": "0", "node_id": "0"}) + t2 := newRHHTracker(metrics, prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + base := "test_sub_" + + // All the metric names + gauges := []string{ + base + "load_percent", + base + "size", + base + "get_duration_last_ns", + base + "put_duration_last_ns", + base + "grow_duration_s", + base + "mean_probes", + } + + counters := []string{ + base + "get_total", + base + "put_total", + } + + histograms := []string{ + base + "get_duration_ns", + base + "put_duration_ns", + } + + // Generate some measurements. 
+ for i, tracker := range []*rhhTracker{t1, t2} { + tracker.SetLoadFactor(float64(i + len(gauges[0]))) + tracker.SetSize(uint64(i + len(gauges[1]))) + + labels := tracker.Labels() + tracker.metrics.LastGetDuration.With(labels).Set(float64(i + len(gauges[2]))) + tracker.metrics.LastInsertDuration.With(labels).Set(float64(i + len(gauges[3]))) + tracker.metrics.LastGrowDuration.With(labels).Set(float64(i + len(gauges[4]))) + tracker.SetProbeCount(float64(i + len(gauges[5]))) + + labels = tracker.Labels() + labels["status"] = "ok" + tracker.metrics.Gets.With(labels).Add(float64(i + len(counters[0]))) + tracker.metrics.Puts.With(labels).Add(float64(i + len(counters[1]))) + + labels = tracker.Labels() + tracker.metrics.GetDuration.With(labels).Observe(float64(i + len(histograms[0]))) + tracker.metrics.InsertDuration.With(labels).Observe(float64(i + len(histograms[1]))) + } + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. 
+ labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "node_id": "0"}, + } + + for i, labels := range labelVariants { + for _, name := range gauges { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range counters { + exp := float64(i + len(name)) + + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["status"] = "ok" + + metric := promtest.MustFindMetric(t, mfs, name, l) + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range histograms { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetHistogram().GetSampleSum(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + } +} diff --git a/pkg/rhh/rhh.go b/pkg/rhh/rhh.go index 0e51a4639b..207c416e5d 100644 --- a/pkg/rhh/rhh.go +++ b/pkg/rhh/rhh.go @@ -3,6 +3,7 @@ package rhh import ( "bytes" "encoding/binary" + "fmt" "math/rand" "sort" "time" @@ -37,7 +38,7 @@ func NewHashMap(opt Options) *HashMap { m := &HashMap{ capacity: pow2(opt.Capacity), // Limited to 2^64. loadFactor: opt.LoadFactor, - tracker: newRHHTracker(opt.Metrics), + tracker: newRHHTracker(opt.Metrics, opt.Labels), } m.alloc() return m @@ -269,48 +270,60 @@ func (m *HashMap) PrometheusCollectors() []prometheus.Collector { type rhhTracker struct { metrics *Metrics + labels prometheus.Labels } -func newRHHTracker(metrics *Metrics) *rhhTracker { - return &rhhTracker{metrics: metrics} +// Labels returns a copy of the default labels used by the tracker's metrics. +// The returned map is safe for modification. 
+func (t *rhhTracker) Labels() prometheus.Labels { + labels := make(prometheus.Labels, len(t.labels)) + for k, v := range t.labels { + labels[k] = v + } + return labels +} + +func newRHHTracker(metrics *Metrics, defaultLabels prometheus.Labels) *rhhTracker { + return &rhhTracker{metrics: metrics, labels: defaultLabels} } func (t *rhhTracker) SetLoadFactor(load float64) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.LoadFactor.With(labels).Set(load) } func (t *rhhTracker) SetSize(sz uint64) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.Size.With(labels).Set(float64(sz)) } func (t *rhhTracker) ObserveGet(d time.Duration) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.GetDuration.With(labels).Observe(float64(d.Nanoseconds())) t.metrics.LastGetDuration.With(labels).Set(float64(d.Nanoseconds())) } func (t *rhhTracker) ObservePut(d time.Duration) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.InsertDuration.With(labels).Observe(float64(d.Nanoseconds())) t.metrics.LastInsertDuration.With(labels).Set(float64(d.Nanoseconds())) } func (t *rhhTracker) SetGrowDuration(d time.Duration) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.LastGrowDuration.With(labels).Set(d.Seconds()) } // TODO(edd): currently no safe way to calculate this concurrently. 
func (t *rhhTracker) SetProbeCount(length float64) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.MeanProbeCount.With(labels).Set(length) } func (t *rhhTracker) incGet(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status + fmt.Println("inc get", labels) t.metrics.Gets.With(labels).Inc() } @@ -318,7 +331,7 @@ func (t *rhhTracker) IncGetHit() { t.incGet("hit") } func (t *rhhTracker) IncGetMiss() { t.incGet("miss") } func (t *rhhTracker) incPut(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.Puts.With(labels).Inc() } @@ -349,6 +362,7 @@ type Options struct { Capacity int64 LoadFactor int Metrics *Metrics + Labels prometheus.Labels } // DefaultOptions represents a default set of options to pass to NewHashMap(). diff --git a/storage/engine.go b/storage/engine.go index 01bc6a59ba..cd61208294 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -166,7 +166,7 @@ func (e *Engine) WithLogger(log *zap.Logger) { // the engine and its components. func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector - metrics = append(metrics, e.sfile.PrometheusCollectors()...) + metrics = append(metrics, tsdb.PrometheusCollectors()...) metrics = append(metrics, e.index.PrometheusCollectors()...) metrics = append(metrics, tsm1.PrometheusCollectors()...) metrics = append(metrics, e.retentionEnforcer.PrometheusCollectors()...) diff --git a/tsdb/metrics.go b/tsdb/metrics.go index 4c100aa953..1704a00424 100644 --- a/tsdb/metrics.go +++ b/tsdb/metrics.go @@ -1,19 +1,45 @@ package tsdb import ( - "fmt" "sort" + "sync" + + "github.com/influxdata/platform/pkg/rhh" "github.com/prometheus/client_golang/prometheus" ) +// The following package variables act as singletons, to be shared by all +// storage.Engine instantiations. This allows multiple Series Files to be +// monitored within the same process. 
+var ( + sms *seriesFileMetrics // main metrics + ims *rhh.Metrics // hashmap specific metrics + mmu sync.RWMutex +) + +// PrometheusCollectors returns all the metrics associated with the tsdb package. +func PrometheusCollectors() []prometheus.Collector { + mmu.RLock() + defer mmu.RUnlock() + + var collectors []prometheus.Collector + if sms != nil { + collectors = append(collectors, sms.PrometheusCollectors()...) + } + + if ims != nil { + collectors = append(collectors, ims.PrometheusCollectors()...) + } + return collectors +} + // namespace is the leading part of all published metrics for the Storage service. const namespace = "storage" const seriesFileSubsystem = "series_file" // sub-system associated with metrics for the Series File. type seriesFileMetrics struct { - labels prometheus.Labels SeriesCreated *prometheus.CounterVec // Number of series created in Series File. Series *prometheus.GaugeVec // Number of series. DiskSize *prometheus.GaugeVec // Size occupied on disk. @@ -27,7 +53,7 @@ type seriesFileMetrics struct { // newSeriesFileMetrics initialises the prometheus metrics for tracking the Series File. func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { - names := []string{"partition_id"} // All metrics have this label. + names := []string{"series_file_partition"} // All metrics have this label. for k := range labels { names = append(names, k) } @@ -40,7 +66,6 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { sort.Strings(durationCompaction) return &seriesFileMetrics{ - labels: labels, SeriesCreated: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: seriesFileSubsystem, @@ -88,18 +113,6 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { } } -// Labels returns a copy of labels for use with Series File metrics. 
-func (m *seriesFileMetrics) Labels(partition int) prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - - // N.B all series file metrics include the partition. So it's included here. - l["partition_id"] = fmt.Sprint(partition) - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. func (m *seriesFileMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ diff --git a/tsdb/metrics_test.go b/tsdb/metrics_test.go new file mode 100644 index 0000000000..c6ebb9cee7 --- /dev/null +++ b/tsdb/metrics_test.go @@ -0,0 +1,132 @@ +package tsdb + +import ( + "testing" + + "github.com/influxdata/platform/kit/prom/promtest" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" +) + +func TestMetrics_SeriesPartition(t *testing.T) { + // metrics to be shared by multiple file stores. + metrics := newSeriesFileMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newSeriesPartitionTracker(metrics, prometheus.Labels{"series_file_partition": "0", "engine_id": "0", "node_id": "0"}) + t2 := newSeriesPartitionTracker(metrics, prometheus.Labels{"series_file_partition": "0", "engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + base := namespace + "_" + seriesFileSubsystem + "_" + + // All the metric names + gauges := []string{ + base + "series_total", + base + "disk_bytes", + base + "segments", + base + "index_compactions_active", + } + + counters := []string{ + base + "series_created", + base + "compactions", + } + + histograms := []string{ + base + "index_compactions_duration_seconds", + } + + // Generate some measurements. 
+ for i, tracker := range []*seriesPartitionTracker{t1, t2} { + tracker.SetSeries(uint64(i + len(gauges[0]))) + tracker.SetDiskSize(uint64(i + len(gauges[1]))) + tracker.SetSegments(uint64(i + len(gauges[2]))) + + labels := tracker.Labels() + labels["component"] = "index" + tracker.metrics.CompactionsActive.With(labels).Add(float64(i + len(gauges[3]))) + + tracker.AddSeriesCreated(uint64(i + len(counters[0]))) + labels = tracker.Labels() + labels["status"] = "ok" + tracker.metrics.Compactions.With(labels).Add(float64(i + len(counters[1]))) + + labels = tracker.Labels() + labels["component"] = "index" + tracker.metrics.CompactionDuration.With(labels).Observe(float64(i + len(histograms[0]))) + } + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. + labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "node_id": "0"}, + } + + for i, labels := range labelVariants { + labels["series_file_partition"] = "0" + var metric *dto.Metric + + for _, name := range gauges { + exp := float64(i + len(name)) + + if name == base+"index_compactions_active" { + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["component"] = "index" + metric = promtest.MustFindMetric(t, mfs, name, l) + } else { + metric = promtest.MustFindMetric(t, mfs, name, labels) + } + + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range counters { + exp := float64(i + len(name)) + + if name == base+"compactions" { + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["status"] = "ok" + + metric = promtest.MustFindMetric(t, mfs, name, l) + } else { + metric = 
promtest.MustFindMetric(t, mfs, name, labels) + } + + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for _, name := range histograms { + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["component"] = "index" + + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, l) + if got := metric.GetHistogram().GetSampleSum(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + } +} diff --git a/tsdb/series_file.go b/tsdb/series_file.go index f1c5fa6e1b..7ed8703935 100644 --- a/tsdb/series_file.go +++ b/tsdb/series_file.go @@ -40,12 +40,10 @@ type SeriesFile struct { partitions []*SeriesPartition // N.B we have many partitions, but they must share the same metrics, so the - // metrics are managed in a single location (here in the SeriesFile), and + // metrics are managed in a single shared package variable and // each partition decorates the same metric measurements with different // partition id label values. defaultMetricLabels prometheus.Labels - partitionMetrics *seriesFileMetrics // Metrics for each partition. - indexMetrics *rhh.Metrics // Metrics for each partition's index Hashmap. refs sync.RWMutex // RWMutex to track references to the SeriesFile that are in use. @@ -55,10 +53,10 @@ type SeriesFile struct { // NewSeriesFile returns a new instance of SeriesFile. 
func NewSeriesFile(path string) *SeriesFile { return &SeriesFile{ - path: path, - partitionMetrics: newSeriesFileMetrics(nil), - indexMetrics: rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", nil), - Logger: zap.NewNop(), + path: path, + // partitionMetrics: newSeriesFileMetrics(nil), + // indexMetrics: rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", nil), + Logger: zap.NewNop(), } } @@ -74,8 +72,6 @@ func (f *SeriesFile) SetDefaultMetricLabels(labels prometheus.Labels) { for k, v := range labels { f.defaultMetricLabels[k] = v } - f.partitionMetrics = newSeriesFileMetrics(labels) - f.indexMetrics = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", labels) } // Open memory maps the data file at the file's path. @@ -92,11 +88,22 @@ func (f *SeriesFile) Open() error { return err } - // Ensure the that RHH metrics have the correct partition label. - newLabels := f.indexMetrics.Labels() - newLabels["partition_id"] = "" // Each partition index will set this when setMetrics is called. + // Initialise metrics for trackers. + mmu.Lock() + if sms == nil { + sms = newSeriesFileMetrics(f.defaultMetricLabels) + } + if ims == nil { + // Make a copy of the default labels so that another label can be provided. + labels := make(prometheus.Labels, len(f.defaultMetricLabels)) + for k, v := range f.defaultMetricLabels { + labels[k] = v + } + labels["series_file_partition"] = "" // All partitions have this label. + ims = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", labels) + } + mmu.Unlock() - f.indexMetrics = rhh.NewMetrics(namespace, seriesFileSubsystem+"_index", newLabels) // Open partitions. f.partitions = make([]*SeriesPartition, 0, SeriesFilePartitionN) for i := 0; i < SeriesFilePartitionN; i++ { @@ -104,9 +111,20 @@ func (f *SeriesFile) Open() error { p := NewSeriesPartition(i, f.SeriesPartitionPath(i)) p.Logger = f.Logger.With(zap.Int("partition", p.ID())) + // For each series file index, rhh trackers are used to track the RHH Hashmap. 
+ // Each of the trackers needs to be given slightly different default + // labels to ensure the correct partition_ids are set as labels. + labels := make(prometheus.Labels, len(f.defaultMetricLabels)) + for k, v := range f.defaultMetricLabels { + labels[k] = v + } + labels["series_file_partition"] = fmt.Sprint(p.ID()) + + p.index.rhhMetrics = ims + p.index.rhhLabels = labels + // Set the metric trackers on the partition with any injected default labels. - p.tracker = newSeriesPartitionTracker(f.partitionMetrics, p.ID()) - p.index.setMetrics(f.indexMetrics, p.ID()) + p.tracker = newSeriesPartitionTracker(sms, labels) if err := p.Open(); err != nil { f.Close() @@ -338,13 +356,6 @@ func (f *SeriesFile) SeriesKeyPartition(key []byte) *SeriesPartition { return f.partitions[partitionID] } -// PrometheusCollectors returns all the prometheus metrics associated with the series file. -func (f *SeriesFile) PrometheusCollectors() []prometheus.Collector { - collectors := f.partitionMetrics.PrometheusCollectors() // Shared per-partition metrics. - collectors = append(collectors, f.indexMetrics.PrometheusCollectors()...) - return collectors -} - // AppendSeriesKey serializes name and tags to a byte slice. // The total length is prepended as a uvarint. func AppendSeriesKey(dst []byte, name []byte, tags models.Tags) []byte { diff --git a/tsdb/series_index.go b/tsdb/series_index.go index de45b89ec2..170d0cc01c 100644 --- a/tsdb/series_index.go +++ b/tsdb/series_index.go @@ -4,13 +4,13 @@ import ( "bytes" "encoding/binary" "errors" - "fmt" "io" "os" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/mmap" "github.com/influxdata/platform/pkg/rhh" + "github.com/prometheus/client_golang/prometheus" ) const ( @@ -46,7 +46,8 @@ type SeriesIndex struct { // metrics stores a shard instance of some Prometheus metrics. metrics // must be set before Open is called. 
- metrics *rhh.Metrics + rhhMetrics *rhh.Metrics + rhhLabels prometheus.Labels data []byte // mmap data keyIDData []byte // key/id mmap data @@ -64,14 +65,6 @@ func NewSeriesIndex(path string) *SeriesIndex { } } -// setMetrics sets the metrics for this index. The partition id has to be injected -// into the RHH metric labels. -func (idx *SeriesIndex) setMetrics(metrics *rhh.Metrics, id int) { - idx.metrics = metrics - idx.metrics.Lab = idx.metrics.Labels() // Copy labels - idx.metrics.Lab["partition_id"] = fmt.Sprint(id) // N.B., This MUST be the same as the other series file metric labels -} - // Open memory-maps the index file. func (idx *SeriesIndex) Open() (err error) { // Map data file, if it exists. @@ -100,7 +93,8 @@ func (idx *SeriesIndex) Open() (err error) { } options := rhh.DefaultOptions - options.Metrics = idx.metrics + options.Metrics = idx.rhhMetrics + options.Labels = idx.rhhLabels idx.keyIDMap = rhh.NewHashMap(options) idx.idOffsetMap = make(map[SeriesID]int64) @@ -126,7 +120,9 @@ func (idx *SeriesIndex) Close() (err error) { func (idx *SeriesIndex) Recover(segments []*SeriesSegment) error { // Allocate new in-memory maps. 
options := rhh.DefaultOptions - options.Metrics = idx.metrics + options.Metrics = idx.rhhMetrics + options.Labels = idx.rhhLabels + idx.keyIDMap = rhh.NewHashMap(options) idx.idOffsetMap = make(map[SeriesID]int64) idx.tombstones = make(map[SeriesID]struct{}) diff --git a/tsdb/series_partition.go b/tsdb/series_partition.go index f10bbc6662..4ef1178be0 100644 --- a/tsdb/series_partition.go +++ b/tsdb/series_partition.go @@ -13,6 +13,7 @@ import ( "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/rhh" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -56,7 +57,7 @@ func NewSeriesPartition(id int, path string) *SeriesPartition { path: path, closing: make(chan struct{}), CompactThreshold: DefaultSeriesPartitionCompactThreshold, - tracker: newSeriesPartitionTracker(newSeriesFileMetrics(nil), id), + tracker: newSeriesPartitionTracker(newSeriesFileMetrics(nil), nil), Logger: zap.NewNop(), seq: uint64(id) + 1, } @@ -559,56 +560,65 @@ func (p *SeriesPartition) seriesKeyByOffset(offset int64) []byte { type seriesPartitionTracker struct { metrics *seriesFileMetrics - id int // ID of partition. + labels prometheus.Labels } -func newSeriesPartitionTracker(metrics *seriesFileMetrics, partition int) *seriesPartitionTracker { +func newSeriesPartitionTracker(metrics *seriesFileMetrics, defaultLabels prometheus.Labels) *seriesPartitionTracker { return &seriesPartitionTracker{ metrics: metrics, - id: partition, + labels: defaultLabels, } } +// Labels returns a copy of labels for use with Series File metrics. +func (t *seriesPartitionTracker) Labels() prometheus.Labels { + l := make(map[string]string, len(t.labels)) + for k, v := range t.labels { + l[k] = v + } + return l +} + // AddSeriesCreated increases the number of series created in the partition by n. 
func (t *seriesPartitionTracker) AddSeriesCreated(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.SeriesCreated.With(labels).Add(float64(n)) } // SetSeries sets the number of series in the partition. func (t *seriesPartitionTracker) SetSeries(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Series.With(labels).Set(float64(n)) } // AddSeries increases the number of series in the partition by n. func (t *seriesPartitionTracker) AddSeries(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Series.With(labels).Add(float64(n)) } // SubSeries decreases the number of series in the partition by n. func (t *seriesPartitionTracker) SubSeries(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Series.With(labels).Sub(float64(n)) } // SetDiskSize sets the number of bytes used by files for in partition. func (t *seriesPartitionTracker) SetDiskSize(sz uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.DiskSize.With(labels).Set(float64(sz)) } // SetSegments sets the number of segments files for the partition. func (t *seriesPartitionTracker) SetSegments(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Segments.With(labels).Set(float64(n)) } // IncCompactionsActive increments the number of active compactions for the // components of a partition (index and segments). func (t *seriesPartitionTracker) IncCompactionsActive() { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["component"] = "index" // TODO(edd): when we add segment compactions we will add a new label value. t.metrics.CompactionsActive.With(labels).Inc() } @@ -616,7 +626,7 @@ func (t *seriesPartitionTracker) IncCompactionsActive() { // DecCompactionsActive decrements the number of active compactions for the // components of a partition (index and segments). 
func (t *seriesPartitionTracker) DecCompactionsActive() { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["component"] = "index" // TODO(edd): when we add segment compactions we will add a new label value. t.metrics.CompactionsActive.With(labels).Dec() } @@ -625,12 +635,12 @@ func (t *seriesPartitionTracker) DecCompactionsActive() { // Callers should use IncCompactionOK and IncCompactionErr. func (t *seriesPartitionTracker) incCompactions(status string, duration time.Duration) { if duration > 0 { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["component"] = "index" t.metrics.CompactionDuration.With(labels).Observe(duration.Seconds()) } - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["status"] = status t.metrics.Compactions.With(labels).Inc() } From b015757c06f9897e4a4ca8f74c862bf17b92b3a2 Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 7 Dec 2018 13:48:43 +0000 Subject: [PATCH 24/25] Ensure all tsi1 metrics support multiple instances --- pkg/rhh/metrics.go | 2 - pkg/rhh/rhh.go | 2 - storage/engine.go | 2 +- tsdb/tsi1/cache.go | 26 +++-- tsdb/tsi1/index.go | 32 ++++-- tsdb/tsi1/metrics.go | 55 +++++---- tsdb/tsi1/metrics_test.go | 232 ++++++++++++++++++++++++++++++++++++++ tsdb/tsi1/partition.go | 47 ++++---- 8 files changed, 327 insertions(+), 71 deletions(-) create mode 100644 tsdb/tsi1/metrics_test.go diff --git a/pkg/rhh/metrics.go b/pkg/rhh/metrics.go index 24c50bed0f..947743d996 100644 --- a/pkg/rhh/metrics.go +++ b/pkg/rhh/metrics.go @@ -1,7 +1,6 @@ package rhh import ( - "fmt" "sort" "github.com/prometheus/client_golang/prometheus" @@ -33,7 +32,6 @@ func NewMetrics(namespace, subsystem string, labels prometheus.Labels) *Metrics getPutNames := append(append([]string(nil), names...), "status") sort.Strings(getPutNames) - fmt.Println("getputname", getPutNames) return &Metrics{ LoadFactor: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, diff --git a/pkg/rhh/rhh.go b/pkg/rhh/rhh.go 
index 207c416e5d..bd986e86ee 100644 --- a/pkg/rhh/rhh.go +++ b/pkg/rhh/rhh.go @@ -3,7 +3,6 @@ package rhh import ( "bytes" "encoding/binary" - "fmt" "math/rand" "sort" "time" @@ -323,7 +322,6 @@ func (t *rhhTracker) SetProbeCount(length float64) { func (t *rhhTracker) incGet(status string) { labels := t.Labels() labels["status"] = status - fmt.Println("inc get", labels) t.metrics.Gets.With(labels).Inc() } diff --git a/storage/engine.go b/storage/engine.go index cd61208294..535a177c17 100644 --- a/storage/engine.go +++ b/storage/engine.go @@ -167,7 +167,7 @@ func (e *Engine) WithLogger(log *zap.Logger) { func (e *Engine) PrometheusCollectors() []prometheus.Collector { var metrics []prometheus.Collector metrics = append(metrics, tsdb.PrometheusCollectors()...) - metrics = append(metrics, e.index.PrometheusCollectors()...) + metrics = append(metrics, tsi1.PrometheusCollectors()...) metrics = append(metrics, tsm1.PrometheusCollectors()...) metrics = append(metrics, e.retentionEnforcer.PrometheusCollectors()...) return metrics diff --git a/tsdb/tsi1/cache.go b/tsdb/tsi1/cache.go index 1ee7616f82..f995cac9e5 100644 --- a/tsdb/tsi1/cache.go +++ b/tsdb/tsi1/cache.go @@ -34,7 +34,7 @@ func NewTagValueSeriesIDCache(c int) *TagValueSeriesIDCache { return &TagValueSeriesIDCache{ cache: map[string]map[string]map[string]*list.Element{}, evictor: list.New(), - tracker: newCacheTracker(newCacheMetrics(nil)), + tracker: newCacheTracker(newCacheMetrics(nil), nil), capacity: c, } } @@ -215,19 +215,29 @@ type seriesIDCacheElement struct { type cacheTracker struct { metrics *cacheMetrics + labels prometheus.Labels } -func newCacheTracker(metrics *cacheMetrics) *cacheTracker { - return &cacheTracker{metrics: metrics} +func newCacheTracker(metrics *cacheMetrics, defaultLabels prometheus.Labels) *cacheTracker { + return &cacheTracker{metrics: metrics, labels: defaultLabels} +} + +// Labels returns a copy of labels for use with index cache metrics. 
+func (t *cacheTracker) Labels() prometheus.Labels { + l := make(map[string]string, len(t.labels)) + for k, v := range t.labels { + l[k] = v + } + return l } func (t *cacheTracker) SetSize(sz uint64) { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.Size.With(labels).Set(float64(sz)) } func (t *cacheTracker) incGet(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.Gets.With(labels).Inc() } @@ -236,7 +246,7 @@ func (t *cacheTracker) IncGetHit() { t.incGet("hit") } func (t *cacheTracker) IncGetMiss() { t.incGet("miss") } func (t *cacheTracker) incPut(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.Puts.With(labels).Inc() } @@ -245,7 +255,7 @@ func (t *cacheTracker) IncPutHit() { t.incPut("hit") } func (t *cacheTracker) IncPutMiss() { t.incPut("miss") } func (t *cacheTracker) incDeletes(status string) { - labels := t.metrics.Labels() + labels := t.Labels() labels["status"] = status t.metrics.Deletes.With(labels).Inc() } @@ -254,6 +264,6 @@ func (t *cacheTracker) IncDeletesHit() { t.incDeletes("hit") } func (t *cacheTracker) IncDeletesMiss() { t.incDeletes("miss") } func (t *cacheTracker) IncEvictions() { - labels := t.metrics.Labels() + labels := t.Labels() t.metrics.Evictions.With(labels).Inc() } diff --git a/tsdb/tsi1/index.go b/tsdb/tsi1/index.go index 45fa290cd3..3baf428a82 100644 --- a/tsdb/tsi1/index.go +++ b/tsdb/tsi1/index.go @@ -163,8 +163,6 @@ func (i *Index) SetDefaultMetricLabels(labels prometheus.Labels) { for k, v := range labels { i.defaultLabels[k] = v } - i.tagValueCache.tracker = newCacheTracker(newCacheMetrics(labels)) - i.partitionMetrics = newPartitionMetrics(labels) } // Bytes estimates the memory footprint of this Index, in bytes. 
@@ -226,6 +224,18 @@ func (i *Index) Open() error { return err } + mmu.Lock() + if cms == nil { + cms = newCacheMetrics(i.defaultLabels) + } + if pms == nil { + pms = newPartitionMetrics(i.defaultLabels) + } + mmu.Unlock() + + // Set the correct shared metrics on the cache + i.tagValueCache.tracker = newCacheTracker(cms, i.defaultLabels) + // Initialize index partitions. i.partitions = make([]*Partition, i.PartitionN) for j := 0; j < len(i.partitions); j++ { @@ -234,7 +244,15 @@ func (i *Index) Open() error { p.nosync = i.disableFsync p.logbufferSize = i.logfileBufferSize p.logger = i.logger.With(zap.String("tsi1_partition", fmt.Sprint(j+1))) - p.tracker = newPartitionTracker(i.partitionMetrics, j) + + // Each of the trackers needs to be given slightly different default + // labels to ensure the correct partition ids are set as labels. + labels := make(prometheus.Labels, len(i.defaultLabels)) + for k, v := range i.defaultLabels { + labels[k] = v + } + labels["index_partition"] = fmt.Sprint(j) + p.tracker = newPartitionTracker(pms, labels) i.partitions[j] = p } @@ -1534,14 +1552,6 @@ func (i *Index) matchTagValueNotEqualNotEmptySeriesIDIterator(name, key []byte, return tsdb.DifferenceSeriesIDIterators(mitr, tsdb.MergeSeriesIDIterators(itrs...)), nil } -// PrometheusCollectors returns all of the metrics for the index. -func (i *Index) PrometheusCollectors() []prometheus.Collector { - var collectors []prometheus.Collector - collectors = append(collectors, i.tagValueCache.PrometheusCollectors()...) - collectors = append(collectors, i.partitionMetrics.PrometheusCollectors()...) - return collectors -} - // IsIndexDir returns true if directory contains at least one partition directory. 
func IsIndexDir(path string) (bool, error) { fis, err := ioutil.ReadDir(path) diff --git a/tsdb/tsi1/metrics.go b/tsdb/tsi1/metrics.go index 11ac35da6b..7e888687c2 100644 --- a/tsdb/tsi1/metrics.go +++ b/tsdb/tsi1/metrics.go @@ -1,12 +1,36 @@ package tsi1 import ( - "fmt" "sort" + "sync" "github.com/prometheus/client_golang/prometheus" ) +// The following package variables act as singletons, to be shared by all +// storage.Engine instantiations. This allows multiple TSI indexes to be +// monitored within the same process. +var ( + cms *cacheMetrics // TSI index cache metrics + pms *partitionMetrics // TSI partition metrics + mmu sync.RWMutex +) + +// PrometheusCollectors returns all prometheus metrics for the tsm1 package. +func PrometheusCollectors() []prometheus.Collector { + mmu.RLock() + defer mmu.RUnlock() + + var collectors []prometheus.Collector + if cms != nil { + collectors = append(collectors, cms.PrometheusCollectors()...) + } + if pms != nil { + collectors = append(collectors, pms.PrometheusCollectors()...) + } + return collectors +} + // namespace is the leading part of all published metrics for the Storage service. const namespace = "storage" @@ -14,8 +38,7 @@ const cacheSubsystem = "tsi_cache" // sub-system associated with TSI index c const partitionSubsystem = "tsi_index" // sub-system associated with the TSI index. type cacheMetrics struct { - labels prometheus.Labels - Size *prometheus.GaugeVec // Size of the cache. + Size *prometheus.GaugeVec // Size of the cache. // These metrics have an extra label status = {"hit", "miss"} Gets *prometheus.CounterVec // Number of times item retrieved. 
@@ -36,7 +59,6 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { sort.Strings(statusNames) return &cacheMetrics{ - labels: labels, Size: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: cacheSubsystem, @@ -70,15 +92,6 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { } } -// Labels returns a copy of labels for use with RHH metrics. -func (m *cacheMetrics) Labels() prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ @@ -91,7 +104,6 @@ func (m *cacheMetrics) PrometheusCollectors() []prometheus.Collector { } type partitionMetrics struct { - labels prometheus.Labels SeriesCreated *prometheus.CounterVec // Number of series created in Series File. SeriesCreatedDuration *prometheus.HistogramVec // Distribution of time to insert series. SeriesDropped *prometheus.CounterVec // Number of series removed from index. @@ -113,7 +125,7 @@ type partitionMetrics struct { // newPartitionMetrics initialises the prometheus metrics for tracking the TSI partitions. func newPartitionMetrics(labels prometheus.Labels) *partitionMetrics { - names := []string{"partition_id"} // All metrics have a partition + names := []string{"index_partition"} // All metrics have a partition for k := range labels { names = append(names, k) } @@ -132,7 +144,6 @@ func newPartitionMetrics(labels prometheus.Labels) *partitionMetrics { sort.Strings(attemptedCompactionNames) return &partitionMetrics{ - labels: labels, SeriesCreated: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: partitionSubsystem, @@ -200,18 +211,6 @@ func newPartitionMetrics(labels prometheus.Labels) *partitionMetrics { } } -// Labels returns a copy of labels for use with TSI partition metrics. 
-func (m *partitionMetrics) Labels(partition int) prometheus.Labels { - l := make(map[string]string, len(m.labels)) - for k, v := range m.labels { - l[k] = v - } - - // N.B all series file metrics include the partition. So it's included here. - l["partition_id"] = fmt.Sprint(partition) - return l -} - // PrometheusCollectors satisfies the prom.PrometheusCollector interface. func (m *partitionMetrics) PrometheusCollectors() []prometheus.Collector { return []prometheus.Collector{ diff --git a/tsdb/tsi1/metrics_test.go b/tsdb/tsi1/metrics_test.go new file mode 100644 index 0000000000..b59a1b6431 --- /dev/null +++ b/tsdb/tsi1/metrics_test.go @@ -0,0 +1,232 @@ +package tsi1 + +import ( + "testing" + + "github.com/influxdata/platform/kit/prom/promtest" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" +) + +func TestMetrics_Cache(t *testing.T) { + // metrics to be shared by multiple file stores. + metrics := newCacheMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newCacheTracker(metrics, prometheus.Labels{"engine_id": "0", "node_id": "0"}) + t2 := newCacheTracker(metrics, prometheus.Labels{"engine_id": "1", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + base := namespace + "_" + cacheSubsystem + "_" + + // All the metric names + gauges := []string{base + "size"} + + counters := []string{ + base + "get_total", + base + "put_total", + base + "deletes_total", + base + "evictions_total", + } + + // Generate some measurements. 
+ for i, tracker := range []*cacheTracker{t1, t2} { + tracker.SetSize(uint64(i + len(gauges[0]))) + + labels := tracker.Labels() + labels["status"] = "hit" + tracker.metrics.Gets.With(labels).Add(float64(i + len(counters[0]))) + tracker.metrics.Puts.With(labels).Add(float64(i + len(counters[1]))) + tracker.metrics.Deletes.With(labels).Add(float64(i + len(counters[2]))) + + tracker.metrics.Evictions.With(tracker.Labels()).Add(float64(i + len(counters[3]))) + } + + // Test that all the correct metrics are present. + mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. + labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "node_id": "0"}, + } + + for i, labels := range labelVariants { + for _, name := range gauges { + exp := float64(i + len(name)) + metric := promtest.MustFindMetric(t, mfs, name, labels) + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + var metric *dto.Metric + for _, name := range counters { + exp := float64(i + len(name)) + + if name != counters[3] { + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["status"] = "hit" + + metric = promtest.MustFindMetric(t, mfs, name, l) + } else { + metric = promtest.MustFindMetric(t, mfs, name, labels) + } + + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + } +} + +func TestMetrics_Partition(t *testing.T) { + // metrics to be shared by multiple file stores. 
+ metrics := newPartitionMetrics(prometheus.Labels{"engine_id": "", "node_id": ""}) + + t1 := newPartitionTracker(metrics, prometheus.Labels{"engine_id": "0", "index_partition": "0", "node_id": "0"}) + t2 := newPartitionTracker(metrics, prometheus.Labels{"engine_id": "1", "index_partition": "0", "node_id": "0"}) + + reg := prometheus.NewRegistry() + reg.MustRegister(metrics.PrometheusCollectors()...) + + base := namespace + "_" + partitionSubsystem + "_" + + // All the metric names + gauges := []string{ + base + "series_total", + base + "measurements_total", + base + "files_total", + base + "disk_bytes", + base + "compactions_active", + } + + counters := []string{ + base + "series_created", + base + "series_dropped", + base + "compactions", + } + + histograms := []string{ + base + "series_created_duration_ns", + base + "compactions_duration_seconds", + } + + // Generate some measurements. + for i, tracker := range []*partitionTracker{t1, t2} { + tracker.SetSeries(uint64(i + len(gauges[0]))) + tracker.SetMeasurements(uint64(i + len(gauges[1]))) + labels := tracker.Labels() + labels["type"] = "index" + tracker.metrics.FilesTotal.With(labels).Add(float64(i + len(gauges[2]))) + tracker.SetDiskSize(uint64(i + len(gauges[3]))) + labels = tracker.Labels() + labels["level"] = "2" + tracker.metrics.CompactionsActive.With(labels).Add(float64(i + len(gauges[4]))) + + tracker.metrics.SeriesCreated.With(tracker.Labels()).Add(float64(i + len(counters[0]))) + tracker.AddSeriesDropped(uint64(i + len(counters[1]))) + labels = tracker.Labels() + labels["level"] = "2" + labels["status"] = "ok" + tracker.metrics.Compactions.With(labels).Add(float64(i + len(counters[2]))) + + tracker.metrics.SeriesCreatedDuration.With(tracker.Labels()).Observe(float64(i + len(histograms[0]))) + labels = tracker.Labels() + labels["level"] = "2" + tracker.metrics.CompactionDuration.With(labels).Observe(float64(i + len(histograms[1]))) + } + + // Test that all the correct metrics are present. 
+ mfs, err := reg.Gather() + if err != nil { + t.Fatal(err) + } + + // The label variants for the two caches. + labelVariants := []prometheus.Labels{ + prometheus.Labels{"engine_id": "0", "index_partition": "0", "node_id": "0"}, + prometheus.Labels{"engine_id": "1", "index_partition": "0", "node_id": "0"}, + } + + for j, labels := range labelVariants { + var metric *dto.Metric + + for i, name := range gauges { + exp := float64(j + len(name)) + + if i == 2 { + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["type"] = "index" + metric = promtest.MustFindMetric(t, mfs, name, l) + } else if i == 4 { + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["level"] = "2" + metric = promtest.MustFindMetric(t, mfs, name, l) + } else { + metric = promtest.MustFindMetric(t, mfs, name, labels) + } + + if got := metric.GetGauge().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for i, name := range counters { + exp := float64(j + len(name)) + + if i == 2 { + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["status"] = "ok" + l["level"] = "2" + + metric = promtest.MustFindMetric(t, mfs, name, l) + } else { + metric = promtest.MustFindMetric(t, mfs, name, labels) + } + + if got := metric.GetCounter().GetValue(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, exp) + } + } + + for i, name := range histograms { + exp := float64(j + len(name)) + + if i == 1 { + // Make a copy since we need to add a label + l := make(prometheus.Labels, len(labels)) + for k, v := range labels { + l[k] = v + } + l["level"] = "2" + + metric = promtest.MustFindMetric(t, mfs, name, l) + } else { + metric = promtest.MustFindMetric(t, mfs, name, labels) + } + + if got := metric.GetHistogram().GetSampleSum(); got != exp { + t.Errorf("[%s %d] got %v, expected %v", name, i, got, 
exp) + } + } + } +} diff --git a/tsdb/tsi1/partition.go b/tsdb/tsi1/partition.go index 476a3269bb..57e3809b7c 100644 --- a/tsdb/tsi1/partition.go +++ b/tsdb/tsi1/partition.go @@ -19,6 +19,7 @@ import ( "github.com/influxdata/platform/logger" "github.com/influxdata/platform/pkg/bytesutil" "github.com/influxdata/platform/tsdb" + "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -109,9 +110,8 @@ func NewPartition(sfile *tsdb.SeriesFile, path string) *Partition { version: Version, } - base := filepath.Base(path) - id, _ := strconv.Atoi(base) // Ignore error because we will re-check during Open. - partition.tracker = newPartitionTracker(newPartitionMetrics(nil), id) + defaultLabels := prometheus.Labels{"index_partition": ""} + partition.tracker = newPartitionTracker(newPartitionMetrics(nil), defaultLabels) return partition } @@ -1293,20 +1293,29 @@ func (p *Partition) MeasurementCardinalityStats() MeasurementCardinalityStats { type partitionTracker struct { metrics *partitionMetrics - id int // ID of partition. + labels prometheus.Labels } -func newPartitionTracker(metrics *partitionMetrics, partition int) *partitionTracker { +func newPartitionTracker(metrics *partitionMetrics, defaultLabels prometheus.Labels) *partitionTracker { return &partitionTracker{ metrics: metrics, - id: partition, + labels: defaultLabels, } } +// Labels returns a copy of labels for use with index partition metrics. +func (t *partitionTracker) Labels() prometheus.Labels { + l := make(map[string]string, len(t.labels)) + for k, v := range t.labels { + l[k] = v + } + return l +} + // AddSeriesCreated increases the number of series created in the partition by n // and sets a sample of the time taken to create a series. 
func (t *partitionTracker) AddSeriesCreated(n uint64, d time.Duration) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.SeriesCreated.With(labels).Add(float64(n)) if n == 0 { @@ -1319,62 +1328,62 @@ func (t *partitionTracker) AddSeriesCreated(n uint64, d time.Duration) { // AddSeriesDropped increases the number of series dropped in the partition by n. func (t *partitionTracker) AddSeriesDropped(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.SeriesDropped.With(labels).Add(float64(n)) } // SetSeries sets the number of series in the partition. func (t *partitionTracker) SetSeries(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Series.With(labels).Set(float64(n)) } // AddSeries increases the number of series in the partition by n. func (t *partitionTracker) AddSeries(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Series.With(labels).Add(float64(n)) } // SubSeries decreases the number of series in the partition by n. func (t *partitionTracker) SubSeries(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Series.With(labels).Sub(float64(n)) } // SetMeasurements sets the number of measurements in the partition. func (t *partitionTracker) SetMeasurements(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Measurements.With(labels).Set(float64(n)) } // AddMeasurements increases the number of measurements in the partition by n. func (t *partitionTracker) AddMeasurements(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Measurements.With(labels).Add(float64(n)) } // SubMeasurements decreases the number of measurements in the partition by n. func (t *partitionTracker) SubMeasurements(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.Measurements.With(labels).Sub(float64(n)) } // SetFiles sets the number of files in the partition. 
func (t *partitionTracker) SetFiles(n uint64, typ string) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["type"] = typ t.metrics.FilesTotal.With(labels).Set(float64(n)) } // SetDiskSize sets the size of files in the partition. func (t *partitionTracker) SetDiskSize(n uint64) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() t.metrics.DiskSize.With(labels).Set(float64(n)) } // IncActiveCompaction increments the number of active compactions for the provided level. func (t *partitionTracker) IncActiveCompaction(level int) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["level"] = fmt.Sprint(level) t.metrics.CompactionsActive.With(labels).Inc() @@ -1382,7 +1391,7 @@ func (t *partitionTracker) IncActiveCompaction(level int) { // DecActiveCompaction decrements the number of active compactions for the provided level. func (t *partitionTracker) DecActiveCompaction(level int) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["level"] = fmt.Sprint(level) t.metrics.CompactionsActive.With(labels).Dec() @@ -1390,7 +1399,7 @@ func (t *partitionTracker) DecActiveCompaction(level int) { // CompactionAttempted updates the number of compactions attempted for the provided level. 
func (t *partitionTracker) CompactionAttempted(level int, success bool, d time.Duration) { - labels := t.metrics.Labels(t.id) + labels := t.Labels() labels["level"] = fmt.Sprint(level) if success { t.metrics.CompactionDuration.With(labels).Observe(d.Seconds()) From e13309ebbe0bc8c5eddb58f217d997ad62b25c2b Mon Sep 17 00:00:00 2001 From: Edd Robinson Date: Fri, 7 Dec 2018 16:37:17 +0000 Subject: [PATCH 25/25] Fix metric names --- tsdb/metrics.go | 4 ++-- tsdb/metrics_test.go | 6 +++--- tsdb/tsi1/metrics.go | 2 +- tsdb/tsi1/metrics_test.go | 2 +- tsdb/tsm1/metrics.go | 8 ++++---- tsdb/tsm1/metrics_test.go | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tsdb/metrics.go b/tsdb/metrics.go index 1704a00424..47429ab94b 100644 --- a/tsdb/metrics.go +++ b/tsdb/metrics.go @@ -87,7 +87,7 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { Segments: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: seriesFileSubsystem, - Name: "segments", + Name: "segments_total", Help: "Number of segment files in Series File.", }, names), CompactionsActive: prometheus.NewGaugeVec(prometheus.GaugeOpts{ @@ -107,7 +107,7 @@ func newSeriesFileMetrics(labels prometheus.Labels) *seriesFileMetrics { Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: seriesFileSubsystem, - Name: "compactions", + Name: "compactions_total", Help: "Number of compactions.", }, totalCompactions), } diff --git a/tsdb/metrics_test.go b/tsdb/metrics_test.go index c6ebb9cee7..8f84099466 100644 --- a/tsdb/metrics_test.go +++ b/tsdb/metrics_test.go @@ -24,13 +24,13 @@ func TestMetrics_SeriesPartition(t *testing.T) { gauges := []string{ base + "series_total", base + "disk_bytes", - base + "segments", + base + "segments_total", base + "index_compactions_active", } counters := []string{ base + "series_created", - base + "compactions", + base + "compactions_total", } histograms := []string{ @@ -96,7 +96,7 @@ 
func TestMetrics_SeriesPartition(t *testing.T) { for _, name := range counters { exp := float64(i + len(name)) - if name == base+"compactions" { + if name == base+"compactions_total" { // Make a copy since we need to add a label l := make(prometheus.Labels, len(labels)) for k, v := range labels { diff --git a/tsdb/tsi1/metrics.go b/tsdb/tsi1/metrics.go index 7e888687c2..84ac5aab56 100644 --- a/tsdb/tsi1/metrics.go +++ b/tsdb/tsi1/metrics.go @@ -205,7 +205,7 @@ func newPartitionMetrics(labels prometheus.Labels) *partitionMetrics { Compactions: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: partitionSubsystem, - Name: "compactions", + Name: "compactions_total", Help: "Number of compactions.", }, attemptedCompactionNames), } diff --git a/tsdb/tsi1/metrics_test.go b/tsdb/tsi1/metrics_test.go index b59a1b6431..2f054a74fd 100644 --- a/tsdb/tsi1/metrics_test.go +++ b/tsdb/tsi1/metrics_test.go @@ -112,7 +112,7 @@ func TestMetrics_Partition(t *testing.T) { counters := []string{ base + "series_created", base + "series_dropped", - base + "compactions", + base + "compactions_total", } histograms := []string{ diff --git a/tsdb/tsm1/metrics.go b/tsdb/tsm1/metrics.go index 940d83eb26..0d2299ffd6 100644 --- a/tsdb/tsm1/metrics.go +++ b/tsdb/tsm1/metrics.go @@ -213,8 +213,8 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { Age: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: cacheSubsystem, - Name: "age", - Help: "Age of the current cache (time since last snapshot or initialisation).", + Name: "age_seconds", + Help: "Age in seconds of the current cache (time since last snapshot or initialisation).", }, names), SnapshottedBytes: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, @@ -231,7 +231,7 @@ func newCacheMetrics(labels prometheus.Labels) *cacheMetrics { Writes: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: cacheSubsystem, - Name: "writes", + 
Name: "writes_total", Help: "Number of writes to the Cache.", }, writeNames), } @@ -291,7 +291,7 @@ func newWALMetrics(labels prometheus.Labels) *walMetrics { Writes: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: namespace, Subsystem: walSubsystem, - Name: "writes", + Name: "writes_total", Help: "Number of writes to the WAL.", }, writeNames), } diff --git a/tsdb/tsm1/metrics_test.go b/tsdb/tsm1/metrics_test.go index 12513e66ca..8aafb1aa7a 100644 --- a/tsdb/tsm1/metrics_test.go +++ b/tsdb/tsm1/metrics_test.go @@ -70,14 +70,14 @@ func TestMetrics_Cache(t *testing.T) { gauges := []string{ base + "inuse_bytes", base + "disk_bytes", - base + "age", + base + "age_seconds", base + "snapshots_active", } counters := []string{ base + "snapshot_bytes", base + "written_bytes", - base + "writes", + base + "writes_total", } // Generate some measurements. @@ -150,7 +150,7 @@ func TestMetrics_WAL(t *testing.T) { } counters := []string{ - base + "writes", + base + "writes_total", } // Generate some measurements.