influxdb/storage/retention.go

package storage

import (
	"context"
	"errors"
	"math"
	"time"

	"github.com/influxdata/influxdb/v2"
	"github.com/influxdata/influxdb/v2/kit/tracing"
	"github.com/influxdata/influxdb/v2/logger"
	"github.com/influxdata/influxdb/v2/tsdb/tsm1"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

const (
	bucketAPITimeout = 10 * time.Second
)

// A Deleter implementation is capable of deleting data from a storage engine.
type Deleter interface {
	DeleteBucketRange(ctx context.Context, orgID, bucketID influxdb.ID, min, max int64) error
}

// A Snapshotter implementation can take snapshots of the entire engine.
type Snapshotter interface {
	WriteSnapshot(ctx context.Context, status tsm1.CacheStatus) error
}

// A BucketFinder is responsible for providing access to buckets via a filter.
type BucketFinder interface {
	FindBuckets(context.Context, influxdb.BucketFilter, ...influxdb.FindOptions) ([]*influxdb.Bucket, int, error)
}
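
// The no-op implementations below are a hypothetical sketch, not part of the
// original file. They show the minimal surface a caller has to provide (an
// engine that can delete ranges and snapshot, plus a bucket lookup), for
// example when exercising the enforcer in a test. All names are illustrative.
type nopEngine struct{}

func (nopEngine) DeleteBucketRange(context.Context, influxdb.ID, influxdb.ID, int64, int64) error {
	return nil // a real engine would remove series data within the given time range.
}

func (nopEngine) WriteSnapshot(context.Context, tsm1.CacheStatus) error {
	return nil // a real engine would persist its cache before deletions run.
}

// staticBucketFinder returns a fixed set of buckets (hypothetical, for illustration only).
type staticBucketFinder struct{ buckets []*influxdb.Bucket }

func (f staticBucketFinder) FindBuckets(context.Context, influxdb.BucketFilter, ...influxdb.FindOptions) ([]*influxdb.Bucket, int, error) {
	return f.buckets, len(f.buckets), nil
}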

// ErrServiceClosed is returned when the service is unavailable.
var ErrServiceClosed = errors.New("service is currently closed")

// The retentionEnforcer periodically removes data that is outside of the retention
// period of the bucket associated with the data.
type retentionEnforcer struct {
	// Engine provides access to data stored on the engine.
	Engine      Deleter
	Snapshotter Snapshotter

	// BucketService provides an API for retrieving buckets associated with
	// organisations.
	BucketService BucketFinder

	logger  *zap.Logger
	tracker *retentionTracker
}

// newRetentionEnforcer returns a new enforcer that ensures expired data is
// deleted. The enforcer does no scheduling of its own: the caller is expected
// to invoke run at a regular interval, and never invoking it is equivalent to
// disabling the service.
func newRetentionEnforcer(engine Deleter, snapshotter Snapshotter, bucketService BucketFinder) *retentionEnforcer {
	return &retentionEnforcer{
		Engine:        engine,
		Snapshotter:   snapshotter,
		BucketService: bucketService,
		logger:        zap.NewNop(),
		tracker:       newRetentionTracker(newRetentionMetrics(nil), nil),
	}
}
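
// exampleRetentionWiring is a hypothetical sketch, not part of the original
// file. It shows how the enforcer is composed: a storage engine typically
// satisfies both Deleter and Snapshotter, so the same value is passed twice,
// and the caller is responsible for invoking run on its own schedule.
func exampleRetentionWiring(log *zap.Logger, buckets []*influxdb.Bucket) {
	enforcer := newRetentionEnforcer(nopEngine{}, nopEngine{}, staticBucketFinder{buckets: buckets})
	enforcer.WithLogger(log)

	// One full retention pass: look up buckets, then delete data older than
	// each bucket's retention period.
	enforcer.run()
}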

// SetDefaultMetricLabels sets the default labels for the retention metrics.
func (s *retentionEnforcer) SetDefaultMetricLabels(defaultLabels prometheus.Labels) {
	if s == nil {
		return // Not initialized
	}

	mmu.Lock()
	if rms == nil {
		rms = newRetentionMetrics(defaultLabels)
	}
	mmu.Unlock()

	s.tracker = newRetentionTracker(rms, defaultLabels)
}

// WithLogger sets the logger l on the service. It must be called before any run calls.
func (s *retentionEnforcer) WithLogger(l *zap.Logger) {
	if s == nil {
		return // Not initialized
	}
	s.logger = l.With(zap.String("component", "retention_enforcer"))
}

// run periodically expires (deletes) all data that's fallen outside of the
// retention period for the associated bucket.
func (s *retentionEnforcer) run() {
	if s == nil {
		return // Not initialized
	}

	span, ctx := tracing.StartSpanFromContext(context.Background())
	defer span.Finish()

	log, logEnd := logger.NewOperation(ctx, s.logger, "Data retention check", "data_retention_check")
	defer logEnd()

	now := time.Now().UTC()
	buckets, err := s.getBucketInformation(ctx)
	if err != nil {
		log.Error("Unable to determine bucket information", zap.Error(err))
	} else {
		s.expireData(ctx, buckets, now)
	}
	s.tracker.CheckDuration(time.Since(now), err == nil)
}

// expireData runs a delete operation on the storage engine.
//
// Any series data that (1) belongs to a bucket in the provided list and
// (2) falls outside the bucket's indicated retention period will be deleted.
func (s *retentionEnforcer) expireData(ctx context.Context, buckets []*influxdb.Bucket, now time.Time) {
	logger, logEnd := logger.NewOperation(ctx, s.logger, "Data deletion", "data_deletion",
		zap.Int("buckets", len(buckets)))
	defer logEnd()

	// Snapshot to clear the cache to reduce write contention.
	if err := s.Snapshotter.WriteSnapshot(ctx, tsm1.CacheStatusRetention); err != nil && err != tsm1.ErrSnapshotInProgress {
		logger.Warn("Unable to snapshot cache before retention", zap.Error(err))
	}

	var skipInf, skipInvalid int
	for _, b := range buckets {
		bucketFields := []zapcore.Field{
			zap.String("org_id", b.OrgID.String()),
			zap.String("bucket_id", b.ID.String()),
			zap.Duration("retention_period", b.RetentionPeriod),
			zap.String("system_type", b.Type.String()),
		}

		if b.RetentionPeriod == 0 {
			logger.Debug("Skipping bucket with infinite retention", bucketFields...)
			skipInf++
			continue
		} else if !b.OrgID.Valid() || !b.ID.Valid() {
			skipInvalid++
			logger.Warn("Skipping bucket with invalid fields", bucketFields...)
			continue
		}

		// Delete everything from the beginning of time up to the bucket's
		// retention cutoff (now minus the retention period), in Unix nanoseconds.
		min := int64(math.MinInt64)
		max := now.Add(-b.RetentionPeriod).UnixNano()

		span, ctx := tracing.StartSpanFromContext(ctx)
		span.LogKV(
			"bucket_id", b.ID,
			"org_id", b.OrgID,
			"system_type", b.Type,
			"retention_period", b.RetentionPeriod,
			"retention_policy", b.RetentionPolicyName,
			"from", time.Unix(0, min).UTC(),
			"to", time.Unix(0, max).UTC(),
		)

		err := s.Engine.DeleteBucketRange(ctx, b.OrgID, b.ID, min, max)
		if err != nil {
			logger.Info("Unable to delete bucket range",
				append(bucketFields, zap.Time("min", time.Unix(0, min)), zap.Time("max", time.Unix(0, max)), zap.Error(err))...)
			tracing.LogError(span, err)
		}
		s.tracker.IncChecks(err == nil)
		span.Finish()
	}

	if skipInf > 0 || skipInvalid > 0 {
		logger.Info("Skipped buckets", zap.Int("infinite_retention_total", skipInf), zap.Int("invalid_total", skipInvalid))
	}
}
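
// retentionWindow is a hypothetical helper, not part of the original file; it
// simply restates the deletion window used in expireData. Everything from the
// beginning of time up to now minus the bucket's retention period is eligible
// for deletion, e.g. with a 168h (7 day) retention period the cutoff is exactly
// one week before now, expressed in Unix nanoseconds.
func retentionWindow(now time.Time, retentionPeriod time.Duration) (min, max int64) {
	return math.MinInt64, now.Add(-retentionPeriod).UnixNano()
}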

// getBucketInformation returns a slice of buckets to run retention on.
func (s *retentionEnforcer) getBucketInformation(ctx context.Context) ([]*influxdb.Bucket, error) {
	ctx, cancel := context.WithTimeout(ctx, bucketAPITimeout)
	defer cancel()

	buckets, _, err := s.BucketService.FindBuckets(ctx, influxdb.BucketFilter{})
	return buckets, err
}

//
// metrics tracker
//

type retentionTracker struct {
	metrics *retentionMetrics
	labels  prometheus.Labels
}

func newRetentionTracker(metrics *retentionMetrics, defaultLabels prometheus.Labels) *retentionTracker {
	return &retentionTracker{metrics: metrics, labels: defaultLabels}
}

// Labels returns a copy of the default labels for use with retention metrics.
func (t *retentionTracker) Labels() prometheus.Labels {
	l := make(map[string]string, len(t.labels))
	for k, v := range t.labels {
		l[k] = v
	}
	return l
}

// IncChecks signals that a check happened for some bucket.
func (t *retentionTracker) IncChecks(success bool) {
	labels := t.Labels()
	if success {
		labels["status"] = "ok"
	} else {
		labels["status"] = "error"
	}
	t.metrics.Checks.With(labels).Inc()
}

// CheckDuration records the overall duration of a full retention check.
func (t *retentionTracker) CheckDuration(dur time.Duration, success bool) {
	labels := t.Labels()
	if success {
		labels["status"] = "ok"
	} else {
		labels["status"] = "error"
	}
	t.metrics.CheckDuration.With(labels).Observe(dur.Seconds())
}