influxdb/v1/services/retention/service_test.go
package retention_test

import (
	"context"
	"fmt"
	"reflect"
	"sync"
	"testing"
	"time"

	"github.com/influxdata/influxdb/v2/internal"
	"github.com/influxdata/influxdb/v2/toml"
	"github.com/influxdata/influxdb/v2/v1/services/meta"
	"github.com/influxdata/influxdb/v2/v1/services/retention"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"go.uber.org/zap/zaptest/observer"
)

func TestService_OpenDisabled(t *testing.T) {
	// Opening a disabled service should be a no-op.
	c := retention.NewConfig()
	c.Enabled = false
	s := NewService(t, c)

	if err := s.Open(context.Background()); err != nil {
		t.Fatal(err)
	}

	if s.LogBuf.Len() > 0 {
		t.Fatalf("service logged %q, didn't expect any logging", s.LogBuf.All())
	}
}

func TestService_OpenClose(t *testing.T) {
	// Opening and closing the service should succeed, and repeating either
	// operation should be a no-op.
	s := NewService(t, retention.NewConfig())

	ctx := context.Background()
	if err := s.Open(ctx); err != nil {
		t.Fatal(err)
	}

	if s.LogBuf.Len() == 0 {
		t.Fatal("service didn't log anything on open")
	}

	// Reopening is a no-op
	if err := s.Open(ctx); err != nil {
		t.Fatal(err)
	}

	if err := s.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-closing is a no-op
	if err := s.Close(); err != nil {
		t.Fatal(err)
	}
}

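// TestService_CheckShards verifies that the retention service deletes expired
// shard groups from the meta store and removes their shards from the local
// TSDB store, while leaving unexpired shard groups alone.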
func TestService_CheckShards(t *testing.T) {
	now := time.Now()
	// Account for any time difference that could cause some of the logic in
	// this test to fail due to a race condition. If we are at the very end of
	// the hour, we could build the shard group times below from one "now" and
	// then run the retention service in the next hour. If we're in that
	// situation, wait 100 milliseconds until we're in the next hour and
	// refresh now so the fixture is based on the hour the service runs in.
	if got, want := now.Add(100*time.Millisecond).Truncate(time.Hour), now.Truncate(time.Hour); !got.Equal(want) {
		time.Sleep(100 * time.Millisecond)
		now = time.Now()
	}
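
	// db0.rp0 keeps data for one hour. Shard group 1 has already expired,
	// group 4 covers the previous hour, and group 7 covers the current hour,
	// so only group 1 (shards 2 and 3) should be removed.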
	data := []meta.DatabaseInfo{
		{
			Name: "db0",
			DefaultRetentionPolicy: "rp0",
			RetentionPolicies: []meta.RetentionPolicyInfo{
				{
					Name: "rp0",
					ReplicaN: 1,
					Duration: time.Hour,
					ShardGroupDuration: time.Hour,
					ShardGroups: []meta.ShardGroupInfo{
						{
							ID: 1,
							StartTime: now.Truncate(time.Hour).Add(-2 * time.Hour),
							EndTime: now.Truncate(time.Hour).Add(-1 * time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 2},
								{ID: 3},
							},
						},
						{
							ID: 4,
							StartTime: now.Truncate(time.Hour).Add(-1 * time.Hour),
							EndTime: now.Truncate(time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 5},
								{ID: 6},
							},
						},
						{
							ID: 7,
							StartTime: now.Truncate(time.Hour),
							EndTime: now.Truncate(time.Hour).Add(time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 8},
								{ID: 9},
							},
						},
					},
				},
			},
		},
	}
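
	// A very short check interval makes the retention sweep run soon after
	// the service is opened.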
	config := retention.NewConfig()
	config.CheckInterval = toml.Duration(10 * time.Millisecond)
	s := NewService(t, config)
	s.MetaClient.DatabasesFn = func() []meta.DatabaseInfo {
		return data
	}
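
	// Record every shard group the service deletes. Once exactly the expected
	// group (db0.rp0.1) has been deleted, close done; deleting anything else
	// is a failure.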
	done := make(chan struct{})
	deletedShardGroups := make(map[string]struct{})
	s.MetaClient.DeleteShardGroupFn = func(database, policy string, id uint64) error {
		for _, dbi := range data {
			if dbi.Name == database {
				for _, rpi := range dbi.RetentionPolicies {
					if rpi.Name == policy {
						for i, sg := range rpi.ShardGroups {
							if sg.ID == id {
								rpi.ShardGroups[i].DeletedAt = time.Now().UTC()
							}
						}
					}
				}
			}
		}

		deletedShardGroups[fmt.Sprintf("%s.%s.%d", database, policy, id)] = struct{}{}
		if got, want := deletedShardGroups, map[string]struct{}{
			"db0.rp0.1": struct{}{},
		}; reflect.DeepEqual(got, want) {
			close(done)
		} else if len(got) > 1 {
			t.Errorf("deleted too many shard groups")
		}
		return nil
	}
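
	// The service deletes local shards before pruning shard groups from the
	// meta store, so once done has been closed a PruneShardGroups call means
	// the shards for group 1 should already be gone; signal that via closing.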
	pruned := false
	closing := make(chan struct{})
	s.MetaClient.PruneShardGroupsFn = func() error {
		select {
		case <-done:
			if !pruned {
				close(closing)
				pruned = true
			}
		default:
		}
		return nil
	}
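
	// The local TSDB store reports shards 2, 3, 5, and 6 and records any
	// deletions so they can be checked once the sweep has finished.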
	deletedShards := make(map[uint64]struct{})
	s.TSDBStore.ShardIDsFn = func() []uint64 {
		return []uint64{2, 3, 5, 6}
	}
	s.TSDBStore.DeleteShardFn = func(shardID uint64) error {
		deletedShards[shardID] = struct{}{}
		return nil
	}

	if err := s.Open(context.Background()); err != nil {
		t.Fatalf("unexpected open error: %s", err)
	}
	defer func() {
		if err := s.Close(); err != nil {
			t.Fatalf("unexpected close error: %s", err)
		}
	}()
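
	// Wait (with a timeout) first for the expected shard group deletion and
	// then for the corresponding local shards to have been deleted.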
	timer := time.NewTimer(100 * time.Millisecond)
	select {
	case <-done:
		timer.Stop()
	case <-timer.C:
		t.Errorf("timeout waiting for shard groups to be deleted")
		return
	}

	timer = time.NewTimer(100 * time.Millisecond)
	select {
	case <-closing:
		timer.Stop()
	case <-timer.C:
		t.Errorf("timeout waiting for shards to be deleted")
		return
	}

	if got, want := deletedShards, map[uint64]struct{}{
		2: struct{}{},
		3: struct{}{},
	}; !reflect.DeepEqual(got, want) {
		t.Errorf("unexpected deleted shards: got=%#v want=%#v", got, want)
	}
}

// This reproduces https://github.com/influxdata/influxdb/issues/8819
func TestService_8819_repro(t *testing.T) {
	for i := 0; i < 1000; i++ {
		s, errC, done := testService_8819_repro(t)

		if err := s.Open(context.Background()); err != nil {
			t.Fatal(err)
		}

		// Wait for service to run one sweep of all dbs/rps/shards.
		if err := <-errC; err != nil {
			t.Fatalf("%dth iteration: %v", i, err)
		}
		// Mark that we do not expect more errors in case it runs one more time.
		close(done)

		if err := s.Close(); err != nil {
			t.Fatal(err)
		}
	}
}

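// testService_8819_repro builds a retention service whose mocks send an error
// on errC if shard groups are pruned from the meta store before their shards
// have been deleted locally, and send nil once a full sweep completes cleanly.
// Closing done tells the mocks to stop reporting.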
func testService_8819_repro(t *testing.T) (*Service, chan error, chan struct{}) {
	c := retention.NewConfig()
	c.CheckInterval = toml.Duration(time.Millisecond)
	s := NewService(t, c)
	errC := make(chan error, 1) // Buffered channel is important to prevent deadlock.
	done := make(chan struct{})

	// A database and a bunch of shards
	var mu sync.Mutex
	shards := []uint64{3, 5, 8, 9, 11, 12}
	localShards := []uint64{3, 5, 8, 9, 11, 12}
	databases := []meta.DatabaseInfo{
		{
			Name: "db0",
			RetentionPolicies: []meta.RetentionPolicyInfo{
				{
					Name: "autogen",
					Duration: 24 * time.Hour,
					ShardGroupDuration: 24 * time.Hour,
					ShardGroups: []meta.ShardGroupInfo{
						{
							ID: 1,
							StartTime: time.Date(1980, 1, 1, 0, 0, 0, 0, time.UTC),
							EndTime: time.Date(1981, 1, 1, 0, 0, 0, 0, time.UTC),
							Shards: []meta.ShardInfo{
								{ID: 3}, {ID: 9},
							},
						},
						{
							ID: 2,
							StartTime: time.Now().Add(-1 * time.Hour),
							EndTime: time.Now(),
							DeletedAt: time.Now(),
							Shards: []meta.ShardInfo{
								{ID: 11}, {ID: 12},
							},
						},
					},
				},
			},
		},
	}
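
	// sendError reports a failure (or success, with a nil error) back to the
	// test without blocking once the test has signalled completion via done.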
	sendError := func(err error) {
		select {
		case errC <- err:
		case <-done:
		}
	}

	s.MetaClient.DatabasesFn = func() []meta.DatabaseInfo {
		mu.Lock()
		defer mu.Unlock()
		return databases
	}
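
	// Only shard group 1 in db0/autogen should ever be deleted; any other
	// database, retention policy or shard group ID is reported as an error.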
	s.MetaClient.DeleteShardGroupFn = func(database string, policy string, id uint64) error {
		if database != "db0" {
			sendError(fmt.Errorf("wrong db name: %s", database))
			return nil
		} else if policy != "autogen" {
			sendError(fmt.Errorf("wrong rp name: %s", policy))
			return nil
		} else if id != 1 {
			sendError(fmt.Errorf("wrong shard group id: %d", id))
			return nil
		}

		// remove the associated shards (3 and 9) from the shards slice...
		mu.Lock()
		newShards := make([]uint64, 0, len(shards))
		for _, sid := range shards {
			if sid != 3 && sid != 9 {
				newShards = append(newShards, sid)
			}
		}
		shards = newShards
		databases[0].RetentionPolicies[0].ShardGroups[0].DeletedAt = time.Now().UTC()
		mu.Unlock()
		return nil
	}
	s.MetaClient.PruneShardGroupsFn = func() error {
		// When this is called, all shards that have been deleted from the meta
		// store (expired) should also have been deleted from disk. If they
		// haven't, shards could be pruned from the meta store while a race
		// leaves them on the local disk and in the indexes, which affects, for
		// example, the max series per database limit.
		mu.Lock()
		defer mu.Unlock()

		for _, lid := range localShards {
			var found bool
			for _, mid := range shards {
				if lid == mid {
					found = true
					break
				}
			}

			if !found {
				sendError(fmt.Errorf("local shard %d present, yet it's missing from meta store. %v -- %v", lid, shards, localShards))
				return nil
			}
		}

		// We should have removed shards 3 and 9
		if !reflect.DeepEqual(localShards, []uint64{5, 8}) {
			sendError(fmt.Errorf("removed shards still present locally: %v", localShards))
			return nil
		}
		sendError(nil)
		return nil
	}
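
	// The TSDB store mock mirrors deletions in localShards so that
	// PruneShardGroupsFn above can compare the local view with the meta store.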
	s.TSDBStore.ShardIDsFn = func() []uint64 {
		mu.Lock()
		defer mu.Unlock()
		return localShards
	}

	s.TSDBStore.DeleteShardFn = func(id uint64) error {
		var found bool

		mu.Lock()
		newShards := make([]uint64, 0, len(localShards))
		for _, sid := range localShards {
			if sid != id {
				newShards = append(newShards, sid)
			} else {
				found = true
			}
		}
		localShards = newShards
		mu.Unlock()

		if !found {
			return fmt.Errorf("shard %d not found locally", id)
		}
		return nil
	}
	return s, errC, done
}

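// Service wraps retention.Service, replacing its meta client and TSDB store
// with mocks and capturing its log output for assertions.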
type Service struct {
	MetaClient *internal.MetaClientMock
	TSDBStore *internal.TSDBStoreMock
	LogBuf *observer.ObservedLogs
	*retention.Service
}

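// NewService returns a Service backed by mock meta and TSDB stores, with its
// logger wired to an in-memory observer so tests can inspect what was logged.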
func NewService(tb testing.TB, c retention.Config) *Service {
	tb.Helper()

	s := &Service{
		MetaClient: &internal.MetaClientMock{},
		TSDBStore: &internal.TSDBStoreMock{},
		Service: retention.NewService(c),
	}

	logcore, logbuf := observer.New(zapcore.InfoLevel)
	log := zap.New(logcore)
	s.LogBuf = logbuf
	s.WithLogger(log)

	s.Service.MetaClient = s.MetaClient
	s.Service.TSDBStore = s.TSDBStore
	return s
}