From a3bbd58f37939c45b4eca31df508274ec6671a8d Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Wed, 23 Sep 2020 22:22:15 -0700 Subject: [PATCH] Fix managed etcd cold startup deadlock issue #2249 We should ignore --token and --server if the managed database is initialized, just like we ignore --cluster-init. If the user wants to join a new cluster, or rejoin a cluster after --cluster-reset, they need to delete the database. This a cleaner way to prevent deadlocking on quorum loss, and removes the requirement that the target of the --server argument must be online before already joined nodes can start. Signed-off-by: Brad Davidson --- pkg/cluster/bootstrap.go | 64 ++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/pkg/cluster/bootstrap.go b/pkg/cluster/bootstrap.go index 848cf44e66..9acf9cfe25 100644 --- a/pkg/cluster/bootstrap.go +++ b/pkg/cluster/bootstrap.go @@ -3,7 +3,7 @@ package cluster import ( "bytes" "context" - "fmt" + "errors" "os" "path/filepath" @@ -18,7 +18,7 @@ func (c *Cluster) Bootstrap(ctx context.Context) error { return err } - runBootstrap, err := c.shouldBootstrapLoad() + runBootstrap, err := c.shouldBootstrapLoad(ctx) if err != nil { return err } @@ -33,35 +33,58 @@ func (c *Cluster) Bootstrap(ctx context.Context) error { return nil } -func (c *Cluster) shouldBootstrapLoad() (bool, error) { +// shouldBootstrapLoad returns true if we need to load ControlRuntimeBootstrap data again. +// This is controlled by a stamp file on disk that records successful bootstrap using a hash of the join token. +func (c *Cluster) shouldBootstrapLoad(ctx context.Context) (bool, error) { + // Non-nil managedDB indicates that the database is either initialized, initializing, or joining if c.managedDB != nil { c.runtime.HTTPBootstrap = true - if c.config.JoinURL == "" { + + isInitialized, err := c.managedDB.IsInitialized(ctx, c.config) + if err != nil { + return false, err + } + + if isInitialized { + // If the database is initialized we skip bootstrapping; if the user wants to rejoin a + // cluster they need to delete the database. + logrus.Infof("Managed %s cluster bootstrap already complete and initialized", c.managedDB.EndpointName()) return false, nil - } + } else if c.config.JoinURL == "" { + // Not initialized, not joining - must be initializing (cluster-init) + logrus.Infof("Managed %s cluster initializing", c.managedDB.EndpointName()) + return false, nil + } else { + // Not initialized, but have a Join URL - fail if there's no token; if there is then validate it. + if c.config.Token == "" { + return false, errors.New(version.ProgramUpper + "_TOKEN is required to join a cluster") + } - token, err := clientaccess.NormalizeAndValidateTokenForUser(c.config.JoinURL, c.config.Token, "server") - if err != nil { - return false, err - } + token, err := clientaccess.NormalizeAndValidateTokenForUser(c.config.JoinURL, c.config.Token, "server") + if err != nil { + return false, err + } - info, err := clientaccess.ParseAndValidateToken(c.config.JoinURL, token) - if err != nil { - return false, err + info, err := clientaccess.ParseAndValidateToken(c.config.JoinURL, token) + if err != nil { + return false, err + } + + logrus.Infof("Managed %s cluster not yet initialized", c.managedDB.EndpointName()) + c.clientAccessInfo = info } - c.clientAccessInfo = info } + // Check the stamp file to see if we have successfully bootstrapped using this token. + // NOTE: The fact that we use a hash of the token to generate the stamp + // means that it is unsafe to use the same token for multiple clusters. stamp := c.bootstrapStamp() if _, err := os.Stat(stamp); err == nil { logrus.Info("Cluster bootstrap already complete") return false, nil } - if c.managedDB != nil && c.config.Token == "" { - return false, fmt.Errorf("K3S_TOKEN is required to join a cluster") - } - + // No errors and no bootstrap stamp, need to bootstrap. return true, nil } @@ -98,11 +121,8 @@ func (c *Cluster) bootstrap(ctx context.Context) error { return c.httpBootstrap() } - if err := c.storageBootstrap(ctx); err != nil { - return err - } - - return nil + // Bootstrap directly from datastore + return c.storageBootstrap(ctx) } func (c *Cluster) bootstrapStamp() string {