periodically check for stale restic repo locks (#1708)
* periodically check for stale restic repo locks Signed-off-by: Steve Kriss <krisss@vmware.com> * changelog Signed-off-by: Steve Kriss <krisss@vmware.com> * only try to init a restic repo if it doesn't already exist Signed-off-by: Steve Kriss <krisss@vmware.com> * reword comment Signed-off-by: Steve Kriss <krisss@vmware.com>pull/1721/head
parent
f2d06bc5e9
commit
b24a603711
|
@ -0,0 +1 @@
|
|||
remove any stale locks from restic repositories every 5m
|
|
@ -18,6 +18,7 @@ package controller
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
jsonpatch "github.com/evanphx/json-patch"
|
||||
|
@ -120,9 +121,19 @@ func (c *resticRepositoryController) processQueueItem(key string) error {
|
|||
// Don't mutate the shared cache
|
||||
reqCopy := req.DeepCopy()
|
||||
|
||||
switch req.Status.Phase {
|
||||
case "", v1.ResticRepositoryPhaseNew:
|
||||
if req.Status.Phase == "" || req.Status.Phase == v1.ResticRepositoryPhaseNew {
|
||||
return c.initializeRepo(reqCopy, log)
|
||||
}
|
||||
|
||||
// If the repository is ready or not-ready, check it for stale locks, but if
|
||||
// this fails for any reason, it's non-critical so we still continue on to the
|
||||
// rest of the "process" logic.
|
||||
log.Debug("Checking repository for stale locks")
|
||||
if err := c.repositoryManager.UnlockRepo(reqCopy); err != nil {
|
||||
log.WithError(err).Error("Error checking repository for stale locks")
|
||||
}
|
||||
|
||||
switch req.Status.Phase {
|
||||
case v1.ResticRepositoryPhaseReady:
|
||||
return c.runMaintenanceIfDue(reqCopy, log)
|
||||
case v1.ResticRepositoryPhaseNotReady:
|
||||
|
@ -162,14 +173,23 @@ func (c *resticRepositoryController) initializeRepo(req *v1.ResticRepository, lo
|
|||
})
|
||||
}
|
||||
|
||||
// ensureRepo first tries to connect to the repo, and returns if it succeeds. If it fails,
|
||||
// it attempts to init the repo, and returns the result.
|
||||
// ensureRepo checks to see if a repository exists, and attempts to initialize it if
|
||||
// it does not exist. An error is returned if the repository can't be connected to
|
||||
// or initialized.
|
||||
func ensureRepo(repo *v1.ResticRepository, repoManager restic.RepositoryManager) error {
|
||||
if repoManager.ConnectToRepo(repo) == nil {
|
||||
return nil
|
||||
if err := repoManager.ConnectToRepo(repo); err != nil {
|
||||
// If the repository has not yet been initialized, the error message will always include
|
||||
// the following string. This is the only scenario where we should try to initialize it.
|
||||
// Other errors (e.g. "already locked") should be returned as-is since the repository
|
||||
// does already exist, but it can't be connected to.
|
||||
if strings.Contains(err.Error(), "Is there a repository at the following location?") {
|
||||
return repoManager.InitRepo(repo)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
return repoManager.InitRepo(repo)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *resticRepositoryController) runMaintenanceIfDue(req *v1.ResticRepository, log logrus.FieldLogger) error {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright 2018 the Velero contributors.
|
||||
Copyright 2018, 2019 the Velero contributors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
@ -112,3 +112,10 @@ func ForgetCommand(repoIdentifier, snapshotID string) *Command {
|
|||
Args: []string{snapshotID},
|
||||
}
|
||||
}
|
||||
|
||||
func UnlockCommand(repoIdentifier string) *Command {
|
||||
return &Command{
|
||||
Command: "unlock",
|
||||
RepoIdentifier: repoIdentifier,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
Copyright 2018 the Velero contributors.
|
||||
Copyright 2018, 2019 the Velero contributors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
@ -55,6 +55,9 @@ type RepositoryManager interface {
|
|||
// PruneRepo deletes unused data from a repo.
|
||||
PruneRepo(repo *velerov1api.ResticRepository) error
|
||||
|
||||
// UnlockRepo removes stale locks from a repo.
|
||||
UnlockRepo(repo *velerov1api.ResticRepository) error
|
||||
|
||||
// Forget removes a snapshot from the list of
|
||||
// available snapshots in a repo.
|
||||
Forget(context.Context, SnapshotIdentifier) error
|
||||
|
@ -213,6 +216,14 @@ func (rm *repositoryManager) PruneRepo(repo *velerov1api.ResticRepository) error
|
|||
return rm.exec(PruneCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation)
|
||||
}
|
||||
|
||||
func (rm *repositoryManager) UnlockRepo(repo *velerov1api.ResticRepository) error {
|
||||
// restic unlock requires a non-exclusive lock
|
||||
rm.repoLocker.Lock(repo.Name)
|
||||
defer rm.repoLocker.Unlock(repo.Name)
|
||||
|
||||
return rm.exec(UnlockCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation)
|
||||
}
|
||||
|
||||
func (rm *repositoryManager) Forget(ctx context.Context, snapshot SnapshotIdentifier) error {
|
||||
// We can't wait for this in the constructor, because this informer is coming
|
||||
// from the shared informer factory, which isn't started until *after* the repo
|
||||
|
|
Loading…
Reference in New Issue