periodically check for stale restic repo locks (#1708)

* periodically check for stale restic repo locks

Signed-off-by: Steve Kriss <krisss@vmware.com>

* changelog

Signed-off-by: Steve Kriss <krisss@vmware.com>

* only try to init a restic repo if it doesn't already exist

Signed-off-by: Steve Kriss <krisss@vmware.com>

* reword comment

Signed-off-by: Steve Kriss <krisss@vmware.com>
pull/1721/head
Steve Kriss 2019-07-31 11:52:23 -06:00 committed by Adnan Abdulhussein
parent f2d06bc5e9
commit b24a603711
4 changed files with 48 additions and 9 deletions

View File

@ -0,0 +1 @@
remove any stale locks from restic repositories every 5m

View File

@ -18,6 +18,7 @@ package controller
import (
"encoding/json"
"strings"
"time"
jsonpatch "github.com/evanphx/json-patch"
@ -120,9 +121,19 @@ func (c *resticRepositoryController) processQueueItem(key string) error {
// Don't mutate the shared cache
reqCopy := req.DeepCopy()
switch req.Status.Phase {
case "", v1.ResticRepositoryPhaseNew:
if req.Status.Phase == "" || req.Status.Phase == v1.ResticRepositoryPhaseNew {
return c.initializeRepo(reqCopy, log)
}
// If the repository is ready or not-ready, check it for stale locks, but if
// this fails for any reason, it's non-critical so we still continue on to the
// rest of the "process" logic.
log.Debug("Checking repository for stale locks")
if err := c.repositoryManager.UnlockRepo(reqCopy); err != nil {
log.WithError(err).Error("Error checking repository for stale locks")
}
switch req.Status.Phase {
case v1.ResticRepositoryPhaseReady:
return c.runMaintenanceIfDue(reqCopy, log)
case v1.ResticRepositoryPhaseNotReady:
@ -162,14 +173,23 @@ func (c *resticRepositoryController) initializeRepo(req *v1.ResticRepository, lo
})
}
// ensureRepo first tries to connect to the repo, and returns if it succeeds. If it fails,
// it attempts to init the repo, and returns the result.
// ensureRepo checks to see if a repository exists, and attempts to initialize it if
// it does not exist. An error is returned if the repository can't be connected to
// or initialized.
func ensureRepo(repo *v1.ResticRepository, repoManager restic.RepositoryManager) error {
if repoManager.ConnectToRepo(repo) == nil {
return nil
if err := repoManager.ConnectToRepo(repo); err != nil {
// If the repository has not yet been initialized, the error message will always include
// the following string. This is the only scenario where we should try to initialize it.
// Other errors (e.g. "already locked") should be returned as-is since the repository
// does already exist, but it can't be connected to.
if strings.Contains(err.Error(), "Is there a repository at the following location?") {
return repoManager.InitRepo(repo)
}
return err
}
return repoManager.InitRepo(repo)
return nil
}
func (c *resticRepositoryController) runMaintenanceIfDue(req *v1.ResticRepository, log logrus.FieldLogger) error {

View File

@ -1,5 +1,5 @@
/*
Copyright 2018 the Velero contributors.
Copyright 2018, 2019 the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -112,3 +112,10 @@ func ForgetCommand(repoIdentifier, snapshotID string) *Command {
Args: []string{snapshotID},
}
}
// UnlockCommand returns a Command that runs "unlock" against the
// repository identified by repoIdentifier. No extra arguments or
// flags are set on the returned Command.
func UnlockCommand(repoIdentifier string) *Command {
	cmd := new(Command)
	cmd.Command = "unlock"
	cmd.RepoIdentifier = repoIdentifier
	return cmd
}

View File

@ -1,5 +1,5 @@
/*
Copyright 2018 the Velero contributors.
Copyright 2018, 2019 the Velero contributors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -55,6 +55,9 @@ type RepositoryManager interface {
// PruneRepo deletes unused data from a repo.
PruneRepo(repo *velerov1api.ResticRepository) error
// UnlockRepo removes stale locks from a repo.
UnlockRepo(repo *velerov1api.ResticRepository) error
// Forget removes a snapshot from the list of
// available snapshots in a repo.
Forget(context.Context, SnapshotIdentifier) error
@ -213,6 +216,14 @@ func (rm *repositoryManager) PruneRepo(repo *velerov1api.ResticRepository) error
return rm.exec(PruneCommand(repo.Spec.ResticIdentifier), repo.Spec.BackupStorageLocation)
}
// UnlockRepo removes stale locks from a repo by executing the unlock
// command against the repo's restic identifier, using the repo's backup
// storage location. The error from executing the command, if any, is
// returned to the caller unchanged.
func (rm *repositoryManager) UnlockRepo(repo *velerov1api.ResticRepository) error {
	repoName := repo.Name

	// restic unlock requires a non-exclusive lock, which is held until the
	// command has finished executing.
	rm.repoLocker.Lock(repoName)
	defer rm.repoLocker.Unlock(repoName)

	unlockCmd := UnlockCommand(repo.Spec.ResticIdentifier)
	return rm.exec(unlockCmd, repo.Spec.BackupStorageLocation)
}
func (rm *repositoryManager) Forget(ctx context.Context, snapshot SnapshotIdentifier) error {
// We can't wait for this in the constructor, because this informer is coming
// from the shared informer factory, which isn't started until *after* the repo