package retention_test

import (
	"context"
	"fmt"
	"reflect"
	"sync"
	"testing"
	"time"

	"github.com/influxdata/influxdb/v2/internal"
	"github.com/influxdata/influxdb/v2/toml"
	"github.com/influxdata/influxdb/v2/v1/services/meta"
	"github.com/influxdata/influxdb/v2/v1/services/retention"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"go.uber.org/zap/zaptest/observer"
)

func TestService_OpenDisabled(t *testing.T) {
	// Opening a disabled service should be a no-op.
	c := retention.NewConfig()
	c.Enabled = false
	s := NewService(t, c)

	if err := s.Open(context.Background()); err != nil {
		t.Fatal(err)
	}

	if s.LogBuf.Len() > 0 {
		t.Fatalf("service logged %q, didn't expect any logging", s.LogBuf.All())
	}
}

func TestService_OpenClose(t *testing.T) {
	// Opening and closing an enabled service should succeed, and opening should log.
	s := NewService(t, retention.NewConfig())

	ctx := context.Background()
	if err := s.Open(ctx); err != nil {
		t.Fatal(err)
	}

	if s.LogBuf.Len() == 0 {
		t.Fatal("service didn't log anything on open")
	}

	// Reopening is a no-op.
	if err := s.Open(ctx); err != nil {
		t.Fatal(err)
	}

	if err := s.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-closing is a no-op.
	if err := s.Close(); err != nil {
		t.Fatal(err)
	}
}

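// TestService_CheckShards verifies that the retention service removes expired
// shard groups from the meta store and deletes their shards from the TSDB store.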
func TestService_CheckShards(t *testing.T) {
	now := time.Now()
	// Account for any time difference that could cause some of the logic in
	// this test to fail due to a race condition. If we are at the very end of
	// the hour, we can choose a time interval based on one "now" time and then
	// run the retention service in the next hour. If we're in one of those
	// situations, wait 100 milliseconds until we're in the next hour.
	if got, want := now.Add(100*time.Millisecond).Truncate(time.Hour), now.Truncate(time.Hour); !got.Equal(want) {
		time.Sleep(100 * time.Millisecond)
	}

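	// db0/rp0 has a 1h retention period and three hour-long shard groups:
	// group 1 ([-2h, -1h]) has fallen out of the retention window and should
	// be deleted, while groups 4 ([-1h, now]) and 7 (the current hour) should
	// be left untouched.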
	data := []meta.DatabaseInfo{
		{
			Name:                   "db0",
			DefaultRetentionPolicy: "rp0",
			RetentionPolicies: []meta.RetentionPolicyInfo{
				{
					Name:               "rp0",
					ReplicaN:           1,
					Duration:           time.Hour,
					ShardGroupDuration: time.Hour,
					ShardGroups: []meta.ShardGroupInfo{
						{
							ID:        1,
							StartTime: now.Truncate(time.Hour).Add(-2 * time.Hour),
							EndTime:   now.Truncate(time.Hour).Add(-1 * time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 2},
								{ID: 3},
							},
						},
						{
							ID:        4,
							StartTime: now.Truncate(time.Hour).Add(-1 * time.Hour),
							EndTime:   now.Truncate(time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 5},
								{ID: 6},
							},
						},
						{
							ID:        7,
							StartTime: now.Truncate(time.Hour),
							EndTime:   now.Truncate(time.Hour).Add(time.Hour),
							Shards: []meta.ShardInfo{
								{ID: 8},
								{ID: 9},
							},
						},
					},
				},
			},
		},
	}

	config := retention.NewConfig()
	config.CheckInterval = toml.Duration(10 * time.Millisecond)
	s := NewService(t, config)
	s.MetaClient.DatabasesFn = func() []meta.DatabaseInfo {
		return data
	}

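	// DeleteShardGroupFn marks the matching shard group as deleted in the test
	// metadata and records the call; "done" is closed once exactly the expected
	// group (db0.rp0.1) has been removed.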
	done := make(chan struct{})
	deletedShardGroups := make(map[string]struct{})
	s.MetaClient.DeleteShardGroupFn = func(database, policy string, id uint64) error {
		for _, dbi := range data {
			if dbi.Name == database {
				for _, rpi := range dbi.RetentionPolicies {
					if rpi.Name == policy {
						for i, sg := range rpi.ShardGroups {
							if sg.ID == id {
								rpi.ShardGroups[i].DeletedAt = time.Now().UTC()
							}
						}
					}
				}
			}
		}

		deletedShardGroups[fmt.Sprintf("%s.%s.%d", database, policy, id)] = struct{}{}
		if got, want := deletedShardGroups, map[string]struct{}{
			"db0.rp0.1": struct{}{},
		}; reflect.DeepEqual(got, want) {
			close(done)
		} else if len(got) > 1 {
			t.Errorf("deleted too many shard groups")
		}
		return nil
	}

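	// Once the expected shard group deletion has been observed ("done" closed),
	// the next prune of the meta store closes "closing", signalling that local
	// shard deletions can be verified.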
	pruned := false
	closing := make(chan struct{})
	s.MetaClient.PruneShardGroupsFn = func() error {
		select {
		case <-done:
			if !pruned {
				close(closing)
				pruned = true
			}
		default:
		}
		return nil
	}

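	// The local TSDB store reports shards 2, 3, 5 and 6; only 2 and 3 (the
	// shards of the expired group) should end up deleted.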
	deletedShards := make(map[uint64]struct{})
	s.TSDBStore.ShardIDsFn = func() []uint64 {
		return []uint64{2, 3, 5, 6}
	}
	s.TSDBStore.DeleteShardFn = func(shardID uint64) error {
		deletedShards[shardID] = struct{}{}
		return nil
	}

	if err := s.Open(context.Background()); err != nil {
		t.Fatalf("unexpected open error: %s", err)
	}
	defer func() {
		if err := s.Close(); err != nil {
			t.Fatalf("unexpected close error: %s", err)
		}
	}()

	timer := time.NewTimer(100 * time.Millisecond)
	select {
	case <-done:
		timer.Stop()
	case <-timer.C:
		t.Errorf("timeout waiting for shard groups to be deleted")
		return
	}

	timer = time.NewTimer(100 * time.Millisecond)
	select {
	case <-closing:
		timer.Stop()
	case <-timer.C:
		t.Errorf("timeout waiting for shards to be deleted")
		return
	}

	if got, want := deletedShards, map[uint64]struct{}{
		2: struct{}{},
		3: struct{}{},
	}; !reflect.DeepEqual(got, want) {
		t.Errorf("unexpected deleted shards: got=%#v want=%#v", got, want)
	}
}

// This reproduces https://github.com/influxdata/influxdb/issues/8819
func TestService_8819_repro(t *testing.T) {
	for i := 0; i < 1000; i++ {
		s, errC, done := testService_8819_repro(t)

		if err := s.Open(context.Background()); err != nil {
			t.Fatal(err)
		}

		// Wait for service to run one sweep of all dbs/rps/shards.
		if err := <-errC; err != nil {
			t.Fatalf("%dth iteration: %v", i, err)
		}
		// Mark that we do not expect more errors in case it runs one more time.
		close(done)

		if err := s.Close(); err != nil {
			t.Fatal(err)
		}
	}
}

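// testService_8819_repro returns a Service whose mocks report any assertion
// failure they detect on the returned error channel, and send nil once a
// clean sweep has been observed. Closing the returned done channel stops
// further reporting.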
func testService_8819_repro(t *testing.T) (*Service, chan error, chan struct{}) {
	c := retention.NewConfig()
	c.CheckInterval = toml.Duration(time.Millisecond)
	s := NewService(t, c)
	errC := make(chan error, 1) // Buffered to prevent a deadlock.
	done := make(chan struct{})

	// A database and a bunch of shards: "shards" mirrors the meta store's view,
	// "localShards" the shards present on disk.
	var mu sync.Mutex
	shards := []uint64{3, 5, 8, 9, 11, 12}
	localShards := []uint64{3, 5, 8, 9, 11, 12}
	databases := []meta.DatabaseInfo{
		{
			Name: "db0",
			RetentionPolicies: []meta.RetentionPolicyInfo{
				{
					Name:               "autogen",
					Duration:           24 * time.Hour,
					ShardGroupDuration: 24 * time.Hour,
					ShardGroups: []meta.ShardGroupInfo{
						{
							ID:        1,
							StartTime: time.Date(1980, 1, 1, 0, 0, 0, 0, time.UTC),
							EndTime:   time.Date(1981, 1, 1, 0, 0, 0, 0, time.UTC),
							Shards: []meta.ShardInfo{
								{ID: 3}, {ID: 9},
							},
						},
						{
							ID:        2,
							StartTime: time.Now().Add(-1 * time.Hour),
							EndTime:   time.Now(),
							DeletedAt: time.Now(),
							Shards: []meta.ShardInfo{
								{ID: 11}, {ID: 12},
							},
						},
					},
				},
			},
		},
	}

	sendError := func(err error) {
		select {
		case errC <- err:
		case <-done:
		}
	}

	s.MetaClient.DatabasesFn = func() []meta.DatabaseInfo {
		mu.Lock()
		defer mu.Unlock()
		return databases
	}

	s.MetaClient.DeleteShardGroupFn = func(database string, policy string, id uint64) error {
		if database != "db0" {
			sendError(fmt.Errorf("wrong db name: %s", database))
			return nil
		} else if policy != "autogen" {
			sendError(fmt.Errorf("wrong rp name: %s", policy))
			return nil
		} else if id != 1 {
			sendError(fmt.Errorf("wrong shard group id: %d", id))
			return nil
		}

		// Remove the associated shards (3 and 9) from the meta store's view and
		// mark the shard group as deleted.
		mu.Lock()
		newShards := make([]uint64, 0, len(shards))
		for _, sid := range shards {
			if sid != 3 && sid != 9 {
				newShards = append(newShards, sid)
			}
		}
		shards = newShards
		databases[0].RetentionPolicies[0].ShardGroups[0].DeletedAt = time.Now().UTC()
		mu.Unlock()
		return nil
	}

	s.MetaClient.PruneShardGroupsFn = func() error {
		// When this is called, all shards that have been deleted from the meta
		// store (expired) should also have been deleted from disk. If they
		// haven't, then shards can be removed from the meta store while they
		// still exist on the local disk and in the indexes, which has an impact
		// on, for example, the max series per database limit.

		mu.Lock()
		defer mu.Unlock()
		for _, lid := range localShards {
			var found bool
			for _, mid := range shards {
				if lid == mid {
					found = true
					break
				}
			}

			if !found {
				sendError(fmt.Errorf("local shard %d present, yet it's missing from meta store. %v -- %v", lid, shards, localShards))
				return nil
			}
		}

		// Shards 3 and 9 should have been removed, along with 11 and 12 from
		// the shard group that was already marked deleted, leaving only 5 and 8
		// locally.
		if !reflect.DeepEqual(localShards, []uint64{5, 8}) {
			sendError(fmt.Errorf("removed shards still present locally: %v", localShards))
			return nil
		}
		sendError(nil)
		return nil
	}

	s.TSDBStore.ShardIDsFn = func() []uint64 {
		mu.Lock()
		defer mu.Unlock()
		return localShards
	}

	s.TSDBStore.DeleteShardFn = func(id uint64) error {
		var found bool
		mu.Lock()
		newShards := make([]uint64, 0, len(localShards))
		for _, sid := range localShards {
			if sid != id {
				newShards = append(newShards, sid)
			} else {
				found = true
			}
		}
		localShards = newShards
		mu.Unlock()

		if !found {
			return fmt.Errorf("shard %d not found locally", id)
		}
		return nil
	}

	return s, errC, done
}

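// Service wraps a retention.Service together with mocked meta client and TSDB
// store dependencies and an observed logger, so tests can drive the service
// and inspect what it logged.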
type Service struct {
	MetaClient *internal.MetaClientMock
	TSDBStore  *internal.TSDBStoreMock

	LogBuf *observer.ObservedLogs
	*retention.Service
}

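// NewService constructs a Service from the given config, wiring in the mocks
// and a zap logger whose output is captured in LogBuf.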
func NewService(tb testing.TB, c retention.Config) *Service {
	tb.Helper()

	s := &Service{
		MetaClient: &internal.MetaClientMock{},
		TSDBStore:  &internal.TSDBStoreMock{},
		Service:    retention.NewService(c),
	}

	logcore, logbuf := observer.New(zapcore.InfoLevel)
	log := zap.New(logcore)

	s.LogBuf = logbuf
	s.WithLogger(log)

	s.Service.MetaClient = s.MetaClient
	s.Service.TSDBStore = s.TSDBStore
	return s
}