Fix apiserver starting before remote etcd is up

Fixes issue where the apiserver on control-plane-only nodes does not
actually wait for a connection to etcd to be available before starting.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/13204/head
Brad Davidson 2025-11-07 08:10:21 +00:00 committed by Brad Davidson
parent 8f781acff4
commit 7146e2000e
7 changed files with 24 additions and 14 deletions

View File

@ -564,7 +564,7 @@ func (c *Cluster) reconcileEtcd(ctx context.Context) error {
}
for {
if err := e.Test(reconcileCtx); err != nil && !errors.Is(err, etcd.ErrNotMember) {
if err := e.Test(reconcileCtx, true); err != nil && !errors.Is(err, etcd.ErrNotMember) {
logrus.Infof("Failed to test temporary data store connection: %v", err)
} else {
logrus.Info(e.EndpointName() + " temporary data store connection OK")

View File

@ -45,15 +45,17 @@ func (c *Cluster) ListenAndServe(ctx context.Context) error {
// Start handles writing/reading bootstrap data. If embedded etcd is in use,
// a secondary call to Cluster.save is made.
func (c *Cluster) Start(ctx context.Context, wg *sync.WaitGroup) error {
if c.config.DisableETCD || c.managedDB == nil {
// if etcd is disabled or we're using kine, perform a no-op start of etcd
// to close the etcd ready channel. When etcd is in use, this is handled by
// c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD()
executor.ETCD(ctx, wg, nil, nil, func(context.Context) error { return nil })
// if etcd is disabled or we're using kine, perform a no-op start of etcd
// to close the etcd ready channel. When etcd is in use, this is handled by
// c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD()
if c.config.DisableETCD {
return executor.ETCD(ctx, wg, nil, nil, func(ctx context.Context, _ bool) error { return c.managedDB.Test(ctx, false) })
}
if c.config.DisableETCD {
return nil
if c.managedDB == nil {
if err := executor.ETCD(ctx, wg, nil, nil, func(context.Context, bool) error { return nil }); err != nil {
return err
}
}
// start managed etcd database; when kine is in use this is a no-op.

View File

@ -27,6 +27,7 @@ type Driver interface {
ReconcileSnapshotData(ctx context.Context) error
GetMembersClientURLs(ctx context.Context) ([]string, error)
RemoveSelf(ctx context.Context) error
Test(ctx context.Context, enableMaintenance bool) error
}
func RegisterDriver(d Driver) {

View File

@ -30,7 +30,7 @@ func (e *Embedded) ETCD(ctx context.Context, wg *sync.WaitGroup, args *ETCDConfi
if e.etcdReady != nil {
go func() {
for {
if err := test(ctx); err != nil {
if err := test(ctx, true); err != nil {
logrus.Infof("Failed to test etcd connection: %v", err)
} else {
logrus.Info("Connection to etcd is ready")

View File

@ -21,8 +21,9 @@ var (
executor Executor
)
// TestFunc is the signature of a function that returns nil error when the component is ready
type TestFunc func(context.Context) error
// TestFunc is the signature of a function that returns nil error when the component is ready.
// The enableMaintenance flag enables attempts to perform corrective maintenance during the test process.
type TestFunc func(ctx context.Context, enableMaintenance bool) error
type Executor interface {
Bootstrap(ctx context.Context, nodeConfig *daemonconfig.Node, cfg cmds.Agent) error

View File

@ -204,8 +204,9 @@ func (e *ETCD) SetControlConfig(config *config.Control) error {
// Test ensures that the local node is a voting member of the target cluster,
// and that the datastore is defragmented and not in maintenance mode due to alarms.
// If it is still a learner or not a part of the cluster, an error is raised.
// If enableMaintenance is true, an attempt will be made to defragment the datastore and clear alarms.
// If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised.
func (e *ETCD) Test(ctx context.Context) error {
func (e *ETCD) Test(ctx context.Context, enableMaintenance bool) error {
if e.config == nil {
return errors.New("control config not set")
}
@ -223,10 +224,15 @@ func (e *ETCD) Test(ctx context.Context) error {
}
logrus.Infof("Connected to etcd v%s - datastore using %d of %d bytes", status.Version, status.DbSizeInUse, status.DbSize)
if len(status.Errors) > 0 {
logrus.Warnf("Errors present on etcd cluster: %s", strings.Join(status.Errors, ","))
}
if !enableMaintenance {
return nil
}
// defrag this node to reclaim freed space from compacted revisions
if err := e.defragment(ctx); err != nil {
return pkgerrors.WithMessage(err, "failed to defragment etcd database")
@ -349,7 +355,7 @@ func (e *ETCD) Reset(ctx context.Context, wg *sync.WaitGroup, rebootstrap func()
<-executor.CRIReadyChan()
}
wait.PollUntilContextCancel(ctx, time.Second*5, true, func(ctx context.Context) (bool, error) {
if err := e.Test(ctx); err == nil {
if err := e.Test(ctx, true); err == nil {
// reset the apiaddresses to nil since we are doing a restoration
if _, err := e.client.Put(ctx, AddressKey, ""); err != nil {
logrus.Warnf("failed to reset api addresses key in etcd: %v", err)

View File

@ -660,7 +660,7 @@ func Test_UnitETCD_Test(t *testing.T) {
return
}
start := time.Now()
err := e.Test(tt.fields.context.ctx)
err := e.Test(tt.fields.context.ctx, true)
duration := time.Now().Sub(start)
t.Logf("ETCD.Test() %q completed in %v with err=%v", tt.name, duration, err)
if (err != nil) != tt.wantErr {