mirror of https://github.com/k3s-io/k3s.git
Fix apiserver starting before remote etcd is up
Fixes an issue where the apiserver on control-plane-only nodes does not actually wait for a connection to etcd to be available before starting.
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/13204/head
parent
8f781acff4
commit
7146e2000e
|
|
@ -564,7 +564,7 @@ func (c *Cluster) reconcileEtcd(ctx context.Context) error {
|
|||
}
|
||||
|
||||
for {
|
||||
if err := e.Test(reconcileCtx); err != nil && !errors.Is(err, etcd.ErrNotMember) {
|
||||
if err := e.Test(reconcileCtx, true); err != nil && !errors.Is(err, etcd.ErrNotMember) {
|
||||
logrus.Infof("Failed to test temporary data store connection: %v", err)
|
||||
} else {
|
||||
logrus.Info(e.EndpointName() + " temporary data store connection OK")
|
||||
|
|
|
|||
|
|
@ -45,15 +45,17 @@ func (c *Cluster) ListenAndServe(ctx context.Context) error {
|
|||
// Start handles writing/reading bootstrap data. If embedded etcd is in use,
|
||||
// a secondary call to Cluster.save is made.
|
||||
func (c *Cluster) Start(ctx context.Context, wg *sync.WaitGroup) error {
|
||||
if c.config.DisableETCD || c.managedDB == nil {
|
||||
// if etcd is disabled or we're using kine, perform a no-op start of etcd
|
||||
// to close the etcd ready channel. When etcd is in use, this is handled by
|
||||
// c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD()
|
||||
executor.ETCD(ctx, wg, nil, nil, func(context.Context) error { return nil })
|
||||
// if etcd is disabled or we're using kine, perform a no-op start of etcd
|
||||
// to close the etcd ready channel. When etcd is in use, this is handled by
|
||||
// c.start() -> c.managedDB.Start() -> etcd.Start() -> executor.ETCD()
|
||||
if c.config.DisableETCD {
|
||||
return executor.ETCD(ctx, wg, nil, nil, func(ctx context.Context, _ bool) error { return c.managedDB.Test(ctx, false) })
|
||||
}
|
||||
|
||||
if c.config.DisableETCD {
|
||||
return nil
|
||||
if c.managedDB == nil {
|
||||
if err := executor.ETCD(ctx, wg, nil, nil, func(context.Context, bool) error { return nil }); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// start managed etcd database; when kine is in use this is a no-op.
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ type Driver interface {
|
|||
ReconcileSnapshotData(ctx context.Context) error
|
||||
GetMembersClientURLs(ctx context.Context) ([]string, error)
|
||||
RemoveSelf(ctx context.Context) error
|
||||
Test(ctx context.Context, enableMaintenance bool) error
|
||||
}
|
||||
|
||||
func RegisterDriver(d Driver) {
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ func (e *Embedded) ETCD(ctx context.Context, wg *sync.WaitGroup, args *ETCDConfi
|
|||
if e.etcdReady != nil {
|
||||
go func() {
|
||||
for {
|
||||
if err := test(ctx); err != nil {
|
||||
if err := test(ctx, true); err != nil {
|
||||
logrus.Infof("Failed to test etcd connection: %v", err)
|
||||
} else {
|
||||
logrus.Info("Connection to etcd is ready")
|
||||
|
|
|
|||
|
|
@ -21,8 +21,9 @@ var (
|
|||
executor Executor
|
||||
)
|
||||
|
||||
// TestFunc is the signature of a function that returns nil error when the component is ready
|
||||
type TestFunc func(context.Context) error
|
||||
// TestFunc is the signature of a function that returns nil error when the component is ready.
|
||||
// The enableMaintenance flag enables attempts to perform corrective maintenance during the test process.
|
||||
type TestFunc func(ctx context.Context, enableMaintenance bool) error
|
||||
|
||||
type Executor interface {
|
||||
Bootstrap(ctx context.Context, nodeConfig *daemonconfig.Node, cfg cmds.Agent) error
|
||||
|
|
|
|||
|
|
@ -204,8 +204,9 @@ func (e *ETCD) SetControlConfig(config *config.Control) error {
|
|||
// Test ensures that the local node is a voting member of the target cluster,
|
||||
// and that the datastore is defragmented and not in maintenance mode due to alarms.
|
||||
// If it is still a learner or not a part of the cluster, an error is raised.
|
||||
// If enableMaintenance is true, an attempt will be made to defragment the datastore and clear alarms.
|
||||
// If it cannot be defragmented or has any alarms that cannot be disarmed, an error is raised.
|
||||
func (e *ETCD) Test(ctx context.Context) error {
|
||||
func (e *ETCD) Test(ctx context.Context, enableMaintenance bool) error {
|
||||
if e.config == nil {
|
||||
return errors.New("control config not set")
|
||||
}
|
||||
|
|
@ -223,10 +224,15 @@ func (e *ETCD) Test(ctx context.Context) error {
|
|||
}
|
||||
|
||||
logrus.Infof("Connected to etcd v%s - datastore using %d of %d bytes", status.Version, status.DbSizeInUse, status.DbSize)
|
||||
|
||||
if len(status.Errors) > 0 {
|
||||
logrus.Warnf("Errors present on etcd cluster: %s", strings.Join(status.Errors, ","))
|
||||
}
|
||||
|
||||
if !enableMaintenance {
|
||||
return nil
|
||||
}
|
||||
|
||||
// defrag this node to reclaim freed space from compacted revisions
|
||||
if err := e.defragment(ctx); err != nil {
|
||||
return pkgerrors.WithMessage(err, "failed to defragment etcd database")
|
||||
|
|
@ -349,7 +355,7 @@ func (e *ETCD) Reset(ctx context.Context, wg *sync.WaitGroup, rebootstrap func()
|
|||
<-executor.CRIReadyChan()
|
||||
}
|
||||
wait.PollUntilContextCancel(ctx, time.Second*5, true, func(ctx context.Context) (bool, error) {
|
||||
if err := e.Test(ctx); err == nil {
|
||||
if err := e.Test(ctx, true); err == nil {
|
||||
// reset the apiaddresses to nil since we are doing a restoration
|
||||
if _, err := e.client.Put(ctx, AddressKey, ""); err != nil {
|
||||
logrus.Warnf("failed to reset api addresses key in etcd: %v", err)
|
||||
|
|
|
|||
|
|
@ -660,7 +660,7 @@ func Test_UnitETCD_Test(t *testing.T) {
|
|||
return
|
||||
}
|
||||
start := time.Now()
|
||||
err := e.Test(tt.fields.context.ctx)
|
||||
err := e.Test(tt.fields.context.ctx, true)
|
||||
duration := time.Now().Sub(start)
|
||||
t.Logf("ETCD.Test() %q completed in %v with err=%v", tt.name, duration, err)
|
||||
if (err != nil) != tt.wantErr {
|
||||
|
|
|
|||
Loading…
Reference in New Issue