influxdb/v1/services/retention/service_test.go
package retention_test

import (
	"context"
	"fmt"
	"reflect"
	"sync"
	"testing"
	"time"

	"github.com/influxdata/influxdb/v2/internal"
	"github.com/influxdata/influxdb/v2/toml"
	"github.com/influxdata/influxdb/v2/v1/services/meta"
	"github.com/influxdata/influxdb/v2/v1/services/retention"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"go.uber.org/zap/zaptest/observer"
)

func TestService_OpenDisabled(t *testing.T) {
	// Opening a disabled service should be a no-op.
	c := retention.NewConfig()
	c.Enabled = false
	s := NewService(t, c)

	if err := s.Open(context.Background()); err != nil {
		t.Fatal(err)
	}

	if s.LogBuf.Len() > 0 {
		t.Fatalf("service logged %q, didn't expect any logging", s.LogBuf.All())
	}
}

func TestService_OpenClose(t *testing.T) {
	// Opening and closing the service should succeed, and repeating either
	// operation should be a no-op.
	s := NewService(t, retention.NewConfig())

	ctx := context.Background()
	if err := s.Open(ctx); err != nil {
		t.Fatal(err)
	}

	if s.LogBuf.Len() == 0 {
		t.Fatal("service didn't log anything on open")
	}

	// Reopening is a no-op
	if err := s.Open(ctx); err != nil {
		t.Fatal(err)
	}

	if err := s.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-closing is a no-op
	if err := s.Close(); err != nil {
		t.Fatal(err)
	}
}

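// TestService_CheckShards verifies that the retention service deletes expired
// shard groups from the meta store and removes their shards from the local
// TSDB store, while leaving unexpired shard groups alone.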
func TestService_CheckShards(t *testing.T) {
	now := time.Now()
	// Account for any time difference that could cause some of the logic in
	// this test to fail due to a race condition. If we are at the very end of
	// the hour, we could build the shard group times below from one "now" and
	// then run the retention service in the next hour. If we're in that
	// situation, wait 100 milliseconds until we're in the next hour and
	// refresh now so the fixture is based on the hour the service runs in.
	if got, want := now.Add(100*time.Millisecond).Truncate(time.Hour), now.Truncate(time.Hour); !got.Equal(want) {
		time.Sleep(100 * time.Millisecond)
		now = time.Now()
	}
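
	// db0.rp0 keeps data for one hour. Shard group 1 has already expired,
	// group 4 covers the previous hour, and group 7 covers the current hour,
	// so only group 1 (shards 2 and 3) should be removed.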
	data := []meta.DatabaseInfo{
		{
			Name: "db0",
			DefaultRetentionPolicy: "rp0",
			RetentionPolicies: []meta.RetentionPolicyInfo{
				{
					Name: "rp0",
					ReplicaN: 1,
					Duration: time.Hour,
					ShardGroupDuration: time.Hour,
					ShardGroups: []meta.ShardGroupInfo{
						{
							ID: 1,
							StartTime: now.Truncate(time.Hour).Add(-2 * time.Hour),
							EndTime: now.Truncate(time.Hour).Add(-1 * time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 2},
								{ID: 3},
							},
						},
						{
							ID: 4,
							StartTime: now.Truncate(time.Hour).Add(-1 * time.Hour),
							EndTime: now.Truncate(time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 5},
								{ID: 6},
							},
						},
						{
							ID: 7,
							StartTime: now.Truncate(time.Hour),
							EndTime: now.Truncate(time.Hour).Add(time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 8},
								{ID: 9},
							},
						},
					},
				},
			},
		},
	}
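
	// A very short check interval makes the retention sweep run soon after
	// the service is opened.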
	config := retention.NewConfig()
	config.CheckInterval = toml.Duration(10 * time.Millisecond)
	s := NewService(t, config)
	s.MetaClient.DatabasesFn = func() []meta.DatabaseInfo {
		return data
	}
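
	// Record every shard group the service deletes. Once exactly the expected
	// group (db0.rp0.1) has been deleted, close done; deleting anything else
	// is a failure.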
	done := make(chan struct{})
	deletedShardGroups := make(map[string]struct{})
	s.MetaClient.DeleteShardGroupFn = func(database, policy string, id uint64) error {
		for _, dbi := range data {
			if dbi.Name == database {
				for _, rpi := range dbi.RetentionPolicies {
					if rpi.Name == policy {
						for i, sg := range rpi.ShardGroups {
							if sg.ID == id {
								rpi.ShardGroups[i].DeletedAt = time.Now().UTC()
							}
						}
					}
				}
			}
		}

		deletedShardGroups[fmt.Sprintf("%s.%s.%d", database, policy, id)] = struct{}{}
		if got, want := deletedShardGroups, map[string]struct{}{
			"db0.rp0.1": struct{}{},
		}; reflect.DeepEqual(got, want) {
			close(done)
		} else if len(got) > 1 {
			t.Errorf("deleted too many shard groups")
		}
		return nil
	}
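
	// The service deletes local shards before pruning shard groups from the
	// meta store, so once done has been closed a PruneShardGroups call means
	// the shards for group 1 should already be gone; signal that via closing.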
	pruned := false
	closing := make(chan struct{})
	s.MetaClient.PruneShardGroupsFn = func() error {
		select {
		case <-done:
			if !pruned {
				close(closing)
				pruned = true
			}
		default:
		}
		return nil
	}
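
	// The local TSDB store reports shards 2, 3, 5, and 6 and records any
	// deletions so they can be checked once the sweep has finished.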
	deletedShards := make(map[uint64]struct{})
	s.TSDBStore.ShardIDsFn = func() []uint64 {
		return []uint64{2, 3, 5, 6}
	}
	s.TSDBStore.DeleteShardFn = func(shardID uint64) error {
		deletedShards[shardID] = struct{}{}
		return nil
	}

	if err := s.Open(context.Background()); err != nil {
		t.Fatalf("unexpected open error: %s", err)
	}
	defer func() {
		if err := s.Close(); err != nil {
			t.Fatalf("unexpected close error: %s", err)
		}
	}()
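
	// Wait (with a timeout) first for the expected shard group deletion and
	// then for the corresponding local shards to have been deleted.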
	timer := time.NewTimer(100 * time.Millisecond)
	select {
	case <-done:
		timer.Stop()
	case <-timer.C:
		t.Errorf("timeout waiting for shard groups to be deleted")
		return
	}

	timer = time.NewTimer(100 * time.Millisecond)
	select {
	case <-closing:
		timer.Stop()
	case <-timer.C:
		t.Errorf("timeout waiting for shards to be deleted")
		return
	}

	if got, want := deletedShards, map[uint64]struct{}{
		2: struct{}{},
		3: struct{}{},
	}; !reflect.DeepEqual(got, want) {
		t.Errorf("unexpected deleted shards: got=%#v want=%#v", got, want)
	}
}

// This reproduces https://github.com/influxdata/influxdb/issues/8819
func TestService_8819_repro(t *testing.T) {
	for i := 0; i < 1000; i++ {
		s, errC, done := testService_8819_repro(t)

		if err := s.Open(context.Background()); err != nil {
			t.Fatal(err)
		}

		// Wait for service to run one sweep of all dbs/rps/shards.
		if err := <-errC; err != nil {
			t.Fatalf("%dth iteration: %v", i, err)
		}
		// Mark that we do not expect more errors in case it runs one more time.
		close(done)

		if err := s.Close(); err != nil {
			t.Fatal(err)
		}
	}
}

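// testService_8819_repro builds a retention service whose mocks send an error
// on errC if shard groups are pruned from the meta store before their shards
// have been deleted locally, and send nil once a full sweep completes cleanly.
// Closing done tells the mocks to stop reporting.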
func testService_8819_repro(t *testing.T) (*Service, chan error, chan struct{}) {
	c := retention.NewConfig()
	c.CheckInterval = toml.Duration(time.Millisecond)
	s := NewService(t, c)
	errC := make(chan error, 1) // Buffered channel is important to prevent deadlock.
	done := make(chan struct{})

	// A database and a bunch of shards
	var mu sync.Mutex
	shards := []uint64{3, 5, 8, 9, 11, 12}
	localShards := []uint64{3, 5, 8, 9, 11, 12}
	databases := []meta.DatabaseInfo{
		{
			Name: "db0",
			RetentionPolicies: []meta.RetentionPolicyInfo{
				{
					Name: "autogen",
					Duration: 24 * time.Hour,
					ShardGroupDuration: 24 * time.Hour,
					ShardGroups: []meta.ShardGroupInfo{
						{
							ID: 1,
							StartTime: time.Date(1980, 1, 1, 0, 0, 0, 0, time.UTC),
							EndTime: time.Date(1981, 1, 1, 0, 0, 0, 0, time.UTC),
							Shards: []meta.ShardInfo{
								{ID: 3}, {ID: 9},
							},
						},
						{
							ID: 2,
							StartTime: time.Now().Add(-1 * time.Hour),
							EndTime: time.Now(),
							DeletedAt: time.Now(),
							Shards: []meta.ShardInfo{
								{ID: 11}, {ID: 12},
							},
						},
					},
				},
			},
		},
	}
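
	// sendError reports a failure (or success, with a nil error) back to the
	// test without blocking once the test has signalled completion via done.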
	sendError := func(err error) {
		select {
		case errC <- err:
		case <-done:
		}
	}

	s.MetaClient.DatabasesFn = func() []meta.DatabaseInfo {
		mu.Lock()
		defer mu.Unlock()
		return databases
	}
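
	// Only shard group 1 in db0/autogen should ever be deleted; any other
	// database, retention policy or shard group ID is reported as an error.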
	s.MetaClient.DeleteShardGroupFn = func(database string, policy string, id uint64) error {
		if database != "db0" {
			sendError(fmt.Errorf("wrong db name: %s", database))
			return nil
		} else if policy != "autogen" {
			sendError(fmt.Errorf("wrong rp name: %s", policy))
			return nil
		} else if id != 1 {
			sendError(fmt.Errorf("wrong shard group id: %d", id))
			return nil
		}

		// remove the associated shards (3 and 9) from the shards slice...
		mu.Lock()
		newShards := make([]uint64, 0, len(shards))
		for _, sid := range shards {
			if sid != 3 && sid != 9 {
				newShards = append(newShards, sid)
			}
		}
		shards = newShards
		databases[0].RetentionPolicies[0].ShardGroups[0].DeletedAt = time.Now().UTC()
		mu.Unlock()
		return nil
	}
	s.MetaClient.PruneShardGroupsFn = func() error {
		// When this is called, all shards that have been deleted from the meta
		// store (expired) should also have been deleted from disk. If they
		// haven't, shards could be pruned from the meta store while a race
		// leaves them on the local disk and in the indexes, which affects, for
		// example, the max series per database limit.
		mu.Lock()
		defer mu.Unlock()

		for _, lid := range localShards {
			var found bool
			for _, mid := range shards {
				if lid == mid {
					found = true
					break
				}
			}

			if !found {
				sendError(fmt.Errorf("local shard %d present, yet it's missing from meta store. %v -- %v", lid, shards, localShards))
				return nil
			}
		}

		// We should have removed shards 3 and 9
		if !reflect.DeepEqual(localShards, []uint64{5, 8}) {
			sendError(fmt.Errorf("removed shards still present locally: %v", localShards))
			return nil
		}
		sendError(nil)
		return nil
	}
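
	// The TSDB store mock mirrors deletions in localShards so that
	// PruneShardGroupsFn above can compare the local view with the meta store.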
	s.TSDBStore.ShardIDsFn = func() []uint64 {
		mu.Lock()
		defer mu.Unlock()
		return localShards
	}

	s.TSDBStore.DeleteShardFn = func(id uint64) error {
		var found bool

		mu.Lock()
		newShards := make([]uint64, 0, len(localShards))
		for _, sid := range localShards {
			if sid != id {
				newShards = append(newShards, sid)
			} else {
				found = true
			}
		}
		localShards = newShards
		mu.Unlock()

		if !found {
			return fmt.Errorf("shard %d not found locally", id)
		}
		return nil
	}
	return s, errC, done
}

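// Service wraps retention.Service, replacing its meta client and TSDB store
// with mocks and capturing its log output for assertions.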
type Service struct {
	MetaClient *internal.MetaClientMock
	TSDBStore *internal.TSDBStoreMock
	LogBuf *observer.ObservedLogs
	*retention.Service
}

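// NewService returns a Service backed by mock meta and TSDB stores, with its
// logger wired to an in-memory observer so tests can inspect what was logged.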
func NewService(tb testing.TB, c retention.Config) *Service {
	tb.Helper()

	s := &Service{
		MetaClient: &internal.MetaClientMock{},
		TSDBStore: &internal.TSDBStoreMock{},
		Service: retention.NewService(c),
	}

	logcore, logbuf := observer.New(zapcore.InfoLevel)
	log := zap.New(logcore)
	s.LogBuf = logbuf
	s.WithLogger(log)

	s.Service.MetaClient = s.MetaClient
	s.Service.TSDBStore = s.TSDBStore
	return s
}