From fadc5a8057c244df11757cd47cc50cc4a4cf5887 Mon Sep 17 00:00:00 2001 From: Hussein Galal Date: Mon, 7 Dec 2020 22:30:44 +0200 Subject: [PATCH] Add tombstone file to etcd and catch errc etcd channel (#2592) * Add tombstone file to embedded etcd Signed-off-by: galal-hussein * go mod update Signed-off-by: galal-hussein * fixes Signed-off-by: galal-hussein * more fixes Signed-off-by: galal-hussein * more changes Signed-off-by: galal-hussein * gofmt and goimports Signed-off-by: galal-hussein * go mod update Signed-off-by: galal-hussein * go lint Signed-off-by: galal-hussein * go lint Signed-off-by: galal-hussein * go mod tidy Signed-off-by: galal-hussein --- go.mod | 2 +- go.sum | 4 +- pkg/daemons/executor/etcd.go | 18 ++++++++- pkg/etcd/etcd.go | 43 ++++++++++++++++++++- vendor/go.etcd.io/etcd/etcdserver/errors.go | 1 + vendor/go.etcd.io/etcd/etcdserver/server.go | 4 +- vendor/modules.txt | 4 +- 7 files changed, 68 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index a4073bf5cf..8ea8890dfa 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,7 @@ replace ( github.com/matryer/moq => github.com/rancher/moq v0.0.0-20190404221404-ee5226d43009 github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc92 github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6 - go.etcd.io/etcd => github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008 // v3.4.13-k3s1 + go.etcd.io/etcd => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8 golang.org/x/crypto => golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 golang.org/x/net => golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7 golang.org/x/sys => golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456 diff --git a/go.sum b/go.sum index 83903b07cc..542af462d0 100644 --- a/go.sum +++ b/go.sum @@ -472,8 +472,8 @@ github.com/k3s-io/cri v1.4.0-k3s.2 h1:HiJLH0P7k6sSJwbzjPwIN0CeY0iA6bKlb7OyThMiaE github.com/k3s-io/cri v1.4.0-k3s.2/go.mod 
h1:fGPUUHMKQik/vIegSe05DtX/m4miovdtvVLqRUFAkK0= github.com/k3s-io/cri-tools v1.19.0-k3s1 h1:FQ1iURavoP3rE/GqND/f3aIL1X59IpFQCRnDhiwzcZ8= github.com/k3s-io/cri-tools v1.19.0-k3s1/go.mod h1:bitvtZRi5F7t505Yw3zPzp22LOao1lqJKHfx6x0hnpw= -github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008 h1:PlAf/spqR2ZVFeWORItuvYk0YNDsjTlmq+e+7TQbtrI= -github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008/go.mod h1:yVHk9ub3CSBatqGNg7GRmsnfLWtoW60w4eDYfh7vHDg= +github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8 h1:S+MCp8UM5sS1bpxedfr3Qb907ig0dF1bARZ+UqeM4vk= +github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8/go.mod h1:yVHk9ub3CSBatqGNg7GRmsnfLWtoW60w4eDYfh7vHDg= github.com/k3s-io/flannel v0.12.0-k3s2 h1:KEfj4fe+P0qINcYZxo5/C0cga2XBEfeV4nhKrUGDyCg= github.com/k3s-io/flannel v0.12.0-k3s2/go.mod h1:2tCkIFWhim43MkRsOcPOxY8/Bcpai9uZLJaywN7ciNg= github.com/k3s-io/go-powershell v0.0.0-20200701182037-6845e6fcfa79 h1:9naOL3iAREsJh9mbf9C6Qqu9xuFv7/jIGBFCWvZMg4E= diff --git a/pkg/daemons/executor/etcd.go b/pkg/daemons/executor/etcd.go index f6471b0309..0cb9dbb274 100644 --- a/pkg/daemons/executor/etcd.go +++ b/pkg/daemons/executor/etcd.go @@ -3,8 +3,14 @@ package executor import ( + "io/ioutil" + "path/filepath" + "strings" + + "github.com/rancher/k3s/pkg/version" "github.com/sirupsen/logrus" "go.etcd.io/etcd/embed" + "go.etcd.io/etcd/etcdserver" ) func (e Embedded) CurrentETCDOptions() (InitialOptions, error) { @@ -27,8 +33,18 @@ func (e Embedded) ETCD(args ETCDConfig) error { go func() { select { + case err := <-etcd.Server.ErrNotify(): + if strings.Contains(err.Error(), etcdserver.ErrMemberRemoved.Error()) { + tombstoneFile := filepath.Join(args.DataDir, "tombstone") + if err := ioutil.WriteFile(tombstoneFile, []byte{}, 0600); err != nil { + logrus.Fatalf("failed to write tombstone file to %s", tombstoneFile) + } + logrus.Infof("this node has been removed from the cluster please restart %s to rejoin the cluster", version.Program) + return + } + case 
<-etcd.Server.StopNotify():
-			logrus.Fatalf("etcd stopped - if this node was removed from the cluster, you must backup and delete %s before rejoining", args.DataDir)
+			logrus.Fatalf("etcd stopped")
 		case err := <-etcd.Err():
 			logrus.Fatalf("etcd exited: %v", err)
 		}
diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go
index 4ddf162b76..20857c0740 100644
--- a/pkg/etcd/etcd.go
+++ b/pkg/etcd/etcd.go
@@ -72,6 +72,8 @@ const (
 	// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
 	defaultKeepAliveTime    = 30 * time.Second
 	defaultKeepAliveTimeout = 10 * time.Second
+
+	maxBackupRetention = 5
 )
 
 // Members contains a slice that holds all
@@ -323,6 +325,13 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
 		return nil, err
 	}
 
+	tombstoneFile := filepath.Join(etcdDBDir(e.config), "tombstone")
+	if _, err := os.Stat(tombstoneFile); err == nil {
+		logrus.Infof("tombstone file has been detected, removing data dir to rejoin the cluster")
+		if _, err := backupDirWithRetention(etcdDBDir(e.config), maxBackupRetention); err != nil {
+			return nil, err
+		}
+	}
 	return e.handler(handler), err
 }
 
@@ -512,7 +521,7 @@ func (e *ETCD) removePeer(ctx context.Context, id, address string) error {
 			}
 			if u.Hostname() == address {
 				if e.address == address {
-					logrus.Fatalf("node has been delete from the cluster. Backup and delete ${datadir}/server/db if you like to rejoin the node")
+					return errors.New("node has been deleted from the cluster")
 				}
 				logrus.Infof("Removing name=%s id=%d address=%s from etcd", member.Name, member.ID, address)
 				_, err := e.client.MemberRemove(ctx, member.ID)
@@ -802,3 +811,48 @@ func snapshotRetention(retention int, snapshotDir string) error {
 	})
 	return os.Remove(filepath.Join(snapshotDir, snapshotFiles[0].Name()))
 }
+
+// backupDirWithRetention moves dir to a timestamped sibling directory named
+// "<dir>-backup-<unix seconds>" and then prunes older backups so that at most
+// maxBackupRetention backup directories (including the new one) remain.
+// It returns the path of the new backup, or an empty path and a nil error when
+// dir does not exist and there is nothing to back up.
+func backupDirWithRetention(dir string, maxBackupRetention int) (string, error) {
+	if _, err := os.Stat(dir); err != nil {
+		if os.IsNotExist(err) {
+			return "", nil
+		}
+		// Any other stat failure (e.g. permissions) must not be mistaken for a missing dir.
+		return "", err
+	}
+	// FormatInt avoids truncating the timestamp where int is 32 bits wide.
+	backupDir := dir + "-backup-" + strconv.FormatInt(time.Now().Unix(), 10)
+	// Move the data out of the way first so that no existing backup is deleted
+	// unless the new one has actually been created.
+	if err := os.Rename(dir, backupDir); err != nil {
+		return "", err
+	}
+	parent, prefix := filepath.Dir(dir), filepath.Base(dir)+"-backup-"
+	files, err := ioutil.ReadDir(parent)
+	if err != nil {
+		return "", err
+	}
+	// Newest first, so the oldest backups are the ones that get pruned.
+	sort.Slice(files, func(i, j int) bool {
+		return files[i].ModTime().After(files[j].ModTime())
+	})
+	// The backup just created always counts as one of the retained dirs.
+	count := 1
+	for _, f := range files {
+		if !f.IsDir() || f.Name() == filepath.Base(backupDir) || !strings.HasPrefix(f.Name(), prefix) {
+			continue
+		}
+		count++
+		if count > maxBackupRetention {
+			if err := os.RemoveAll(filepath.Join(parent, f.Name())); err != nil {
+				return "", err
+			}
+		}
+	}
+	return backupDir, nil
+}
diff --git a/vendor/go.etcd.io/etcd/etcdserver/errors.go b/vendor/go.etcd.io/etcd/etcdserver/errors.go
index d0fe28970d..c9b2679fd6 100644
--- a/vendor/go.etcd.io/etcd/etcdserver/errors.go
+++ b/vendor/go.etcd.io/etcd/etcdserver/errors.go
@@ -39,6 +39,7 @@ var (
 	ErrKeyNotFound                = errors.New("etcdserver: key not found")
 	ErrCorrupt                    = errors.New("etcdserver: corrupt cluster")
 	ErrBadLeaderTransferee        = errors.New("etcdserver: bad leader transferee")
+	ErrMemberRemoved              = errors.New("etcdserver: the member has been permanently removed from the cluster")
 )
 
 type DiscoveryError struct {
diff --git a/vendor/go.etcd.io/etcd/etcdserver/server.go b/vendor/go.etcd.io/etcd/etcdserver/server.go
index a341625dcc..56a45dfdcd 100644
--- a/vendor/go.etcd.io/etcd/etcdserver/server.go
+++ b/vendor/go.etcd.io/etcd/etcdserver/server.go
@@ -1388,7 +1388,7 @@ func (s *EtcdServer) applyEntries(ep *etcdProgress, apply *apply) {
 	}
 	var shouldstop bool
 	if ep.appliedt, ep.appliedi, shouldstop = s.apply(ents, &ep.confState); shouldstop {
-		go s.stopWithDelay(10*100*time.Millisecond, fmt.Errorf("the member has been permanently removed from the cluster"))
+		go
s.stopWithDelay(10*100*time.Millisecond, ErrMemberRemoved) } } @@ -1551,6 +1551,8 @@ func (s *EtcdServer) stopWithDelay(d time.Duration, err error) { // when the server is stopped. func (s *EtcdServer) StopNotify() <-chan struct{} { return s.done } +func (s *EtcdServer) ErrNotify() <-chan error { return s.errorc } + func (s *EtcdServer) SelfStats() []byte { return s.stats.JSON() } func (s *EtcdServer) LeaderStats() []byte { diff --git a/vendor/modules.txt b/vendor/modules.txt index c3dba82995..676da256c7 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1007,7 +1007,7 @@ github.com/willf/bitset github.com/xiang90/probing # go.etcd.io/bbolt v1.3.5 go.etcd.io/bbolt -# go.etcd.io/etcd v0.5.0-alpha.5.0.20200819165624-17cef6e3e9d5 => github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008 +# go.etcd.io/etcd v0.5.0-alpha.5.0.20200819165624-17cef6e3e9d5 => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8 ## explicit go.etcd.io/etcd/auth go.etcd.io/etcd/auth/authpb @@ -2949,7 +2949,7 @@ vbom.ml/util/sortorder # github.com/matryer/moq => github.com/rancher/moq v0.0.0-20190404221404-ee5226d43009 # github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc92 # github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6 -# go.etcd.io/etcd => github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008 +# go.etcd.io/etcd => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8 # golang.org/x/crypto => golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 # golang.org/x/net => golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7 # golang.org/x/sys => golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456