Add restore attempt and success/failure counters
Signed-off-by: Nolan Brubaker <nolan@heptio.com>
pull/607/head
parent
39c03008be
commit
7cebfe2df0
|
@ -725,6 +725,7 @@ func (s *server) runControllers(config *api.Config) error {
|
|||
s.snapshotService != nil,
|
||||
s.logger,
|
||||
s.pluginManager,
|
||||
s.metrics,
|
||||
)
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
|
|
|
@ -45,6 +45,7 @@ import (
|
|||
arkv1client "github.com/heptio/ark/pkg/generated/clientset/versioned/typed/ark/v1"
|
||||
informers "github.com/heptio/ark/pkg/generated/informers/externalversions/ark/v1"
|
||||
listers "github.com/heptio/ark/pkg/generated/listers/ark/v1"
|
||||
"github.com/heptio/ark/pkg/metrics"
|
||||
"github.com/heptio/ark/pkg/plugin"
|
||||
"github.com/heptio/ark/pkg/restore"
|
||||
"github.com/heptio/ark/pkg/util/boolptr"
|
||||
|
@ -84,6 +85,7 @@ type restoreController struct {
|
|||
queue workqueue.RateLimitingInterface
|
||||
logger logrus.FieldLogger
|
||||
pluginManager plugin.Manager
|
||||
metrics *metrics.ServerMetrics
|
||||
}
|
||||
|
||||
func NewRestoreController(
|
||||
|
@ -98,6 +100,7 @@ func NewRestoreController(
|
|||
pvProviderExists bool,
|
||||
logger logrus.FieldLogger,
|
||||
pluginManager plugin.Manager,
|
||||
metrics *metrics.ServerMetrics,
|
||||
) Interface {
|
||||
c := &restoreController{
|
||||
namespace: namespace,
|
||||
|
@ -114,6 +117,7 @@ func NewRestoreController(
|
|||
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "restore"),
|
||||
logger: logger,
|
||||
pluginManager: pluginManager,
|
||||
metrics: metrics,
|
||||
}
|
||||
|
||||
c.syncHandler = c.processRestore
|
||||
|
@ -255,8 +259,23 @@ func (c *restoreController) processRestore(key string) error {
|
|||
// don't modify items in the cache
|
||||
restore = restore.DeepCopy()
|
||||
|
||||
// complete & validate restore
|
||||
if restore.Status.ValidationErrors = c.completeAndValidate(restore); len(restore.Status.ValidationErrors) > 0 {
|
||||
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
|
||||
for _, nonrestorable := range nonRestorableResources {
|
||||
if !excludedResources.Has(nonrestorable) {
|
||||
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
|
||||
}
|
||||
}
|
||||
|
||||
backup, fetchErr := c.fetchBackup(c.bucket, restore.Spec.BackupName)
|
||||
backupScheduleName := ""
|
||||
if backup != nil {
|
||||
backupScheduleName = backup.GetLabels()["ark-schedule"]
|
||||
}
|
||||
// Register attempts before we do validation so we can get better tracking
|
||||
c.metrics.RegisterRestoreAttempt(backupScheduleName)
|
||||
|
||||
// validation
|
||||
if restore.Status.ValidationErrors = c.completeAndValidate(restore, fetchErr); len(restore.Status.ValidationErrors) > 0 {
|
||||
restore.Status.Phase = api.RestorePhaseFailedValidation
|
||||
} else {
|
||||
restore.Status.Phase = api.RestorePhaseInProgress
|
||||
|
@ -272,12 +291,12 @@ func (c *restoreController) processRestore(key string) error {
|
|||
restore = updatedRestore.DeepCopy()
|
||||
|
||||
if restore.Status.Phase == api.RestorePhaseFailedValidation {
|
||||
c.metrics.RegisterRestoreValidationFailed(backupScheduleName)
|
||||
return nil
|
||||
}
|
||||
|
||||
logContext.Debug("Running restore")
|
||||
// execution & upload of restore
|
||||
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket)
|
||||
restoreWarnings, restoreErrors := c.runRestore(restore, c.bucket, backup)
|
||||
|
||||
restore.Status.Warnings = len(restoreWarnings.Ark) + len(restoreWarnings.Cluster)
|
||||
for _, w := range restoreWarnings.Namespaces {
|
||||
|
@ -288,6 +307,11 @@ func (c *restoreController) processRestore(key string) error {
|
|||
for _, e := range restoreErrors.Namespaces {
|
||||
restore.Status.Errors += len(e)
|
||||
}
|
||||
if restore.Status.Errors > 0 {
|
||||
c.metrics.RegisterRestoreIncomplete(backupScheduleName)
|
||||
} else {
|
||||
c.metrics.RegisterRestoreSuccess(backupScheduleName)
|
||||
}
|
||||
|
||||
logContext.Debug("restore completed")
|
||||
restore.Status.Phase = api.RestorePhaseCompleted
|
||||
|
@ -300,7 +324,7 @@ func (c *restoreController) processRestore(key string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *restoreController) completeAndValidate(restore *api.Restore) []string {
|
||||
func (c *restoreController) completeAndValidate(restore *api.Restore, fetchErr error) []string {
|
||||
// add non-restorable resources to restore's excluded resources
|
||||
excludedResources := sets.NewString(restore.Spec.ExcludedResources...)
|
||||
for _, nonrestorable := range nonRestorableResources {
|
||||
|
@ -308,9 +332,14 @@ func (c *restoreController) completeAndValidate(restore *api.Restore) []string {
|
|||
restore.Spec.ExcludedResources = append(restore.Spec.ExcludedResources, nonrestorable)
|
||||
}
|
||||
}
|
||||
|
||||
var validationErrors []string
|
||||
|
||||
if restore.Spec.BackupName == "" {
|
||||
validationErrors = append(validationErrors, "BackupName must be non-empty and correspond to the name of a backup in object storage.")
|
||||
} else if fetchErr != nil {
|
||||
validationErrors = append(validationErrors, fmt.Sprintf("Error retrieving backup: %v", fetchErr))
|
||||
}
|
||||
|
||||
// validate that included resources don't contain any non-restorable resources
|
||||
includedResources := sets.NewString(restore.Spec.IncludedResources...)
|
||||
for _, nonRestorableResource := range nonRestorableResources {
|
||||
|
@ -433,7 +462,7 @@ func (c *restoreController) fetchBackup(bucket, name string) (*api.Backup, error
|
|||
return backup, nil
|
||||
}
|
||||
|
||||
func (c *restoreController) runRestore(restore *api.Restore, bucket string) (restoreWarnings, restoreErrors api.RestoreResult) {
|
||||
func (c *restoreController) runRestore(restore *api.Restore, bucket string, backup *api.Backup) (restoreWarnings, restoreErrors api.RestoreResult) {
|
||||
logContext := c.logger.WithFields(
|
||||
logrus.Fields{
|
||||
"restore": kubeutil.NamespaceAndName(restore),
|
||||
|
|
|
@ -37,6 +37,7 @@ import (
|
|||
api "github.com/heptio/ark/pkg/apis/ark/v1"
|
||||
"github.com/heptio/ark/pkg/generated/clientset/versioned/fake"
|
||||
informers "github.com/heptio/ark/pkg/generated/informers/externalversions"
|
||||
"github.com/heptio/ark/pkg/metrics"
|
||||
"github.com/heptio/ark/pkg/restore"
|
||||
"github.com/heptio/ark/pkg/util/collections"
|
||||
arktest "github.com/heptio/ark/pkg/util/test"
|
||||
|
@ -95,6 +96,7 @@ func TestFetchBackup(t *testing.T) {
|
|||
false,
|
||||
logger,
|
||||
pluginManager,
|
||||
metrics.NewServerMetrics(),
|
||||
).(*restoreController)
|
||||
|
||||
for _, itm := range test.informerBackups {
|
||||
|
@ -326,6 +328,7 @@ func TestProcessRestore(t *testing.T) {
|
|||
test.allowRestoreSnapshots,
|
||||
logger,
|
||||
pluginManager,
|
||||
metrics.NewServerMetrics(),
|
||||
).(*restoreController)
|
||||
|
||||
if test.restore != nil {
|
||||
|
@ -410,7 +413,6 @@ func TestProcessRestore(t *testing.T) {
|
|||
restorer.AssertExpectations(t)
|
||||
|
||||
assert.Equal(t, test.expectedErr, err != nil, "got error %v", err)
|
||||
|
||||
actions := client.Actions()
|
||||
|
||||
if test.expectedPhase == "" {
|
||||
|
|
|
@ -30,12 +30,18 @@ type ServerMetrics struct {
|
|||
// Metric namespace, metric name strings, and label names used by ServerMetrics.
// NOTE(review): backupAttemptCount through backupDurationSeconds and
// scheduleLabel are each listed twice below; this looks like the removed and
// re-added (realigned) lines of a diff rendered together — confirm and keep a
// single declaration of each.
const (
|
||||
// Passed as the Namespace of the CounterOpts for the restore counters.
metricNamespace = "ark"
|
||||
backupTarballSizeBytesGauge = "backup_tarball_size_bytes"
|
||||
backupAttemptCount = "backup_attempt_total"
|
||||
backupSuccessCount = "backup_success_total"
|
||||
backupFailureCount = "backup_failure_total"
|
||||
backupDurationSeconds = "backup_duration_seconds"
|
||||
// TODO: Rename the *Count constants to *Total so each identifier matches its
// "_total" metric name string, as the restore* constants below already do.
|
||||
backupAttemptCount = "backup_attempt_total"
|
||||
backupSuccessCount = "backup_success_total"
|
||||
backupFailureCount = "backup_failure_total"
|
||||
backupDurationSeconds = "backup_duration_seconds"
|
||||
// Restore counters; NewServerMetrics registers each one as a CounterVec
// labeled by scheduleLabel.
restoreAttemptTotal = "restore_attempt_total"
|
||||
restoreValidationFailedTotal = "restore_validation_failed_total"
|
||||
restoreSuccessTotal = "restore_success_total"
|
||||
restoreIncompleteTotal = "restore_incomplete_total"
|
||||
|
||||
scheduleLabel = "schedule"
|
||||
// Label name under which the restore counter vectors are partitioned.
scheduleLabel = "schedule"
|
||||
backupNameLabel = "backupName"
|
||||
|
||||
secondsInMinute = 60.0
|
||||
)
|
||||
|
@ -95,6 +101,38 @@ func NewServerMetrics() *ServerMetrics {
|
|||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
restoreAttemptTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricNamespace,
|
||||
Name: restoreAttemptTotal,
|
||||
Help: "Total number of attempted restores",
|
||||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
restoreSuccessTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricNamespace,
|
||||
Name: restoreSuccessTotal,
|
||||
Help: "Total number of successful restores",
|
||||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
restoreIncompleteTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricNamespace,
|
||||
Name: restoreIncompleteTotal,
|
||||
Help: "Total number of incomplete restores",
|
||||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
restoreValidationFailedTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricNamespace,
|
||||
Name: restoreValidationFailedTotal,
|
||||
Help: "Total number of failed restore validations",
|
||||
},
|
||||
[]string{scheduleLabel},
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
@ -158,3 +196,31 @@ func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds fl
|
|||
// toSeconds converts d into a floating-point number of seconds.
//
// The fractional part is preserved: 1500ms yields 1.5, not 1.
func toSeconds(d time.Duration) float64 {
	// Duration.Seconds performs the conversion in floating point. The earlier
	// form, float64(d / time.Second), divided as integers first and therefore
	// truncated every duration down to a whole number of seconds.
	return d.Seconds()
}
|
||||
|
||||
// RegisterRestoreAttempt records an attempt to restore a backup.
|
||||
func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterRestoreSuccess records a successful completion of a restore.
|
||||
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterRestoreIncomplete records a restore that finished with errors.
|
||||
func (m *ServerMetrics) RegisterRestoreIncomplete(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreIncompleteTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterRestoreValidationFailed records a failed restore.
|
||||
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
|
||||
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
||||
c.WithLabelValues(backupSchedule).Inc()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue