Fix apiserver starting before remote etcd is up

Fixes issue where the apiserver on control-plane-only nodes does not
actually wait for a connection to etcd to be available before starting.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/13204/head
Brad Davidson 2025-11-07 08:10:21 +00:00 committed by Brad Davidson
parent 8f781acff4
commit 7146e2000e
7 changed files with 24 additions and 14 deletions

View File

@ -564,7 +564,7 @@ func (c *Cluster) reconcileEtcd(ctx context.Context) error {
}
for {
if err := e.Test(reconcileCtx); err != nil && !errors.Is(err, etcd.ErrNotMember) {
if err := e.Test(reconcileCtx, true); err != nil && !errors.Is(err, etcd.ErrNotMember) {
logrus.Infof("Failed to test temporary data store connection: %v", err)
} else {
logrus.Info(e.EndpointName() + " temporary data store connection OK")

View File

@ -45,15 +45,17 @@ func (c *Cluster) ListenAndServe(ctx context.Context) error {
// Start handles writing/reading bootstrap data. If embedded etcd is in use,
// a secondary call to Cluster.save is made.
func (c *Cluster) Start(ctx context.Context, wg *sync.WaitGroup) error {
if c.config.DisableETCD || c.managedDB == nil {
// if etcd is disabled or we're using kine, perform a no-op start of etcd
// to close the etcd ready channel. When etcd is in use, this is handled by
// c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD()
executor.ETCD(ctx, wg, nil, nil, func(context.Context) error { return nil })
// if etcd is disabled or we're using kine, perform a no-op start of etcd
// to close the etcd ready channel. When etcd is in use, this is handled by
// c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD()
if c.config.DisableETCD {
return executor.ETCD(ctx, wg, nil, nil, func(ctx context.Context, _ bool) error { return c.managedDB.Test(ctx, false) })
}
if c.config.DisableETCD {
return nil
if c.managedDB == nil {
if err := executor.ETCD(ctx, wg, nil, nil, func(context.Context, bool) error { return nil }); err != nil {
return err
}
}
// start managed etcd database; when kine is in use this is a no-op.

View File

@ -27,6 +27,7 @@ type Driver interface {
ReconcileSnapshotData(ctx context.Context) error
GetMembersClientURLs(ctx context.Context) ([]string, error)
RemoveSelf(ctx context.Context) error
Test(ctx context.Context, enableMaintenance bool) error
}
func RegisterDriver(d Driver) {

View File

@ -30,7 +30,7 @@ func (e *Embedded) ETCD(ctx context.Context, wg *sync.WaitGroup, args *ETCDConfi
if e.etcdReady != nil {
go func() {
for {
if err := test(ctx); err != nil {
if err := test(ctx, true); err != nil {
logrus.Infof("Failed to test etcd connection: %v", err)
} else {
logrus.Info("Connection to etcd is ready")

View File

@ -21,8 +21,9 @@ var (
executor Executor
)
// TestFunc is the signature of a function that returns nil error when the component is ready
type TestFunc func(context.Context) error
// TestFunc is the signature of a function that returns nil error when the component is ready.
// The enableMaintenance flag enables attempts to perform corrective maintenance during the test process.
type TestFunc func(ctx context.Context, enableMaintenance bool) error
type Executor interface {
Bootstrap(ctx context.Context, nodeConfig *daemonconfig.Node, cfg cmds.Agent) error

View File

@ -204,8 +204,9 @@ func (e *ETCD) SetControlConfig(config *config.Control) error {
// Test ensures that the local node is a voting member of the target cluster,
// and that the datastore is defragmented and not in maintenance mode due to alarms.
// If it is still a learner or not a part of the cluster, an error is raised.
// If enableMaintenance is true, an attempt will be made to defragment the datastore and clear alarms.
// If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised.
func (e *ETCD) Test(ctx context.Context) error {
func (e *ETCD) Test(ctx context.Context, enableMaintenance bool) error {
if e.config == nil {
return errors.New("control config not set")
}
@ -223,10 +224,15 @@ func (e *ETCD) Test(ctx context.Context) error {
}
logrus.Infof("Connected to etcd v%s - datastore using %d of %d bytes", status.Version, status.DbSizeInUse, status.DbSize)
if len(status.Errors) > 0 {
logrus.Warnf("Errors present on etcd cluster: %s", strings.Join(status.Errors, ","))
}
if !enableMaintenance {
return nil
}
// defrag this node to reclaim freed space from compacted revisions
if err := e.defragment(ctx); err != nil {
return pkgerrors.WithMessage(err, "failed to defragment etcd database")
@ -349,7 +355,7 @@ func (e *ETCD) Reset(ctx context.Context, wg *sync.WaitGroup, rebootstrap func()
<-executor.CRIReadyChan()
}
wait.PollUntilContextCancel(ctx, time.Second*5, true, func(ctx context.Context) (bool, error) {
if err := e.Test(ctx); err == nil {
if err := e.Test(ctx, true); err == nil {
// reset the apiaddresses to nil since we are doing a restoration
if _, err := e.client.Put(ctx, AddressKey, ""); err != nil {
logrus.Warnf("failed to reset api addresses key in etcd: %v", err)

View File

@ -660,7 +660,7 @@ func Test_UnitETCD_Test(t *testing.T) {
return
}
start := time.Now()
err := e.Test(tt.fields.context.ctx)
err := e.Test(tt.fields.context.ctx, true)
duration := time.Now().Sub(start)
t.Logf("ETCD.Test() %q completed in %v with err=%v", tt.name, duration, err)
if (err != nil) != tt.wantErr {