diff --git a/pkg/cluster/bootstrap.go b/pkg/cluster/bootstrap.go index e36295fa2fc..b20837fd3df 100644 --- a/pkg/cluster/bootstrap.go +++ b/pkg/cluster/bootstrap.go @@ -564,7 +564,7 @@ func (c *Cluster) reconcileEtcd(ctx context.Context) error { } for { - if err := e.Test(reconcileCtx); err != nil && !errors.Is(err, etcd.ErrNotMember) { + if err := e.Test(reconcileCtx, true); err != nil && !errors.Is(err, etcd.ErrNotMember) { logrus.Infof("Failed to test temporary data store connection: %v", err) } else { logrus.Info(e.EndpointName() + " temporary data store connection OK") diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 8e9c8478180..7226fdac498 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -45,15 +45,17 @@ func (c *Cluster) ListenAndServe(ctx context.Context) error { // Start handles writing/reading bootstrap data. If embedded etcd is in use, // a secondary call to Cluster.save is made. func (c *Cluster) Start(ctx context.Context, wg *sync.WaitGroup) error { - if c.config.DisableETCD || c.managedDB == nil { - // if etcd is disabled or we're using kine, perform a no-op start of etcd - // to close the etcd ready channel. When etcd is in use, this is handled by - // c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD() - executor.ETCD(ctx, wg, nil, nil, func(context.Context) error { return nil }) + // if etcd is disabled or we're using kine, perform a no-op start of etcd + // to close the etcd ready channel. 
When etcd is in use, this is handled by + // c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD() + if c.config.DisableETCD { + return executor.ETCD(ctx, wg, nil, nil, func(ctx context.Context, _ bool) error { return c.managedDB.Test(ctx, false) }) } - if c.config.DisableETCD { - return nil + if c.managedDB == nil { + if err := executor.ETCD(ctx, wg, nil, nil, func(context.Context, bool) error { return nil }); err != nil { + return err + } } // start managed etcd database; when kine is in use this is a no-op. diff --git a/pkg/cluster/managed/drivers.go b/pkg/cluster/managed/drivers.go index 174db18dc65..b170fdc0263 100644 --- a/pkg/cluster/managed/drivers.go +++ b/pkg/cluster/managed/drivers.go @@ -27,6 +27,7 @@ type Driver interface { ReconcileSnapshotData(ctx context.Context) error GetMembersClientURLs(ctx context.Context) ([]string, error) RemoveSelf(ctx context.Context) error + Test(ctx context.Context, enableMaintenance bool) error } func RegisterDriver(d Driver) { diff --git a/pkg/daemons/executor/etcd.go b/pkg/daemons/executor/etcd.go index 547e5a8663f..c63890d080f 100644 --- a/pkg/daemons/executor/etcd.go +++ b/pkg/daemons/executor/etcd.go @@ -30,7 +30,7 @@ func (e *Embedded) ETCD(ctx context.Context, wg *sync.WaitGroup, args *ETCDConfi if e.etcdReady != nil { go func() { for { - if err := test(ctx); err != nil { + if err := test(ctx, true); err != nil { logrus.Infof("Failed to test etcd connection: %v", err) } else { logrus.Info("Connection to etcd is ready") diff --git a/pkg/daemons/executor/executor.go b/pkg/daemons/executor/executor.go index 7b3454b16fb..3c516af99c7 100644 --- a/pkg/daemons/executor/executor.go +++ b/pkg/daemons/executor/executor.go @@ -21,8 +21,9 @@ var ( executor Executor ) -// TestFunc is the signature of a function that returns nil error when the component is ready -type TestFunc func(context.Context) error +// TestFunc is the signature of a function that returns nil error when the component is ready. 
+// The enableMaintenance flag enables attempts to perform corrective maintenance during the test process. +type TestFunc func(ctx context.Context, enableMaintenance bool) error type Executor interface { Bootstrap(ctx context.Context, nodeConfig *daemonconfig.Node, cfg cmds.Agent) error diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index 01a2ff25153..46e0e6c7d2e 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -204,8 +204,9 @@ func (e *ETCD) SetControlConfig(config *config.Control) error { // Test ensures that the local node is a voting member of the target cluster, // and that the datastore is defragmented and not in maintenance mode due to alarms. // If it is still a learner or not a part of the cluster, an error is raised. +// If enableMaintenance is true, an attempt will be made to defragment the datastore and clear alarms. // If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised. -func (e *ETCD) Test(ctx context.Context) error { +func (e *ETCD) Test(ctx context.Context, enableMaintenance bool) error { if e.config == nil { return errors.New("control config not set") } @@ -223,10 +224,15 @@ func (e *ETCD) Test(ctx context.Context) error { } logrus.Infof("Connected to etcd v%s - datastore using %d of %d bytes", status.Version, status.DbSizeInUse, status.DbSize) + if len(status.Errors) > 0 { logrus.Warnf("Errors present on etcd cluster: %s", strings.Join(status.Errors, ",")) } + if !enableMaintenance { + return nil + } + // defrag this node to reclaim freed space from compacted revisions if err := e.defragment(ctx); err != nil { return pkgerrors.WithMessage(err, "failed to defragment etcd database") @@ -349,7 +355,7 @@ func (e *ETCD) Reset(ctx context.Context, wg *sync.WaitGroup, rebootstrap func() <-executor.CRIReadyChan() } wait.PollUntilContextCancel(ctx, time.Second*5, true, func(ctx context.Context) (bool, error) { - if err := e.Test(ctx); err == nil { + if err := e.Test(ctx, true); err == nil { // reset the
apiaddresses to nil since we are doing a restoration if _, err := e.client.Put(ctx, AddressKey, ""); err != nil { logrus.Warnf("failed to reset api addresses key in etcd: %v", err) diff --git a/pkg/etcd/etcd_linux_test.go b/pkg/etcd/etcd_linux_test.go index b3bdc340651..cea07e4c733 100644 --- a/pkg/etcd/etcd_linux_test.go +++ b/pkg/etcd/etcd_linux_test.go @@ -660,7 +660,7 @@ func Test_UnitETCD_Test(t *testing.T) { return } start := time.Now() - err := e.Test(tt.fields.context.ctx) + err := e.Test(tt.fields.context.ctx, true) duration := time.Now().Sub(start) t.Logf("ETCD.Test() %q completed in %v with err=%v", tt.name, duration, err) if (err != nil) != tt.wantErr {