/*
Copyright 2018 the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
|
|
|
|
package metrics
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
// ServerMetrics contains Prometheus metrics for the Velero server.
|
|
type ServerMetrics struct {
|
|
metrics map[string]prometheus.Collector
|
|
}
|
|
|
|
// Namespace passed to every metric's options in this file.
const metricNamespace = "velero"

// Backup metric names.
const (
	backupTarballSizeBytesGauge   = "backup_tarball_size_bytes"
	backupTotal                   = "backup_total"
	backupAttemptTotal            = "backup_attempt_total"
	backupSuccessTotal            = "backup_success_total"
	backupPartialFailureTotal     = "backup_partial_failure_total"
	backupFailureTotal            = "backup_failure_total"
	backupDurationSeconds         = "backup_duration_seconds"
	backupDeletionAttemptTotal    = "backup_deletion_attempt_total"
	backupDeletionSuccessTotal    = "backup_deletion_success_total"
	backupDeletionFailureTotal    = "backup_deletion_failure_total"
	backupLastSuccessfulTimestamp = "backup_last_successful_timestamp"
)

// Restore metric names.
const (
	restoreTotal                 = "restore_total"
	restoreAttemptTotal          = "restore_attempt_total"
	restoreValidationFailedTotal = "restore_validation_failed_total"
	restoreSuccessTotal          = "restore_success_total"
	restorePartialFailureTotal   = "restore_partial_failure_total"
	restoreFailedTotal           = "restore_failed_total"
)

// Volume snapshot metric names.
const (
	volumeSnapshotAttemptTotal = "volume_snapshot_attempt_total"
	volumeSnapshotSuccessTotal = "volume_snapshot_success_total"
	volumeSnapshotFailureTotal = "volume_snapshot_failure_total"
)

// Label names.
const (
	scheduleLabel   = "schedule"
	backupNameLabel = "backupName"
)

// secondsInMinute is the number of seconds in one minute.
const secondsInMinute = 60.0
|
|
|
|
// NewServerMetrics returns new ServerMetrics
|
|
func NewServerMetrics() *ServerMetrics {
|
|
return &ServerMetrics{
|
|
metrics: map[string]prometheus.Collector{
|
|
backupTarballSizeBytesGauge: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupTarballSizeBytesGauge,
|
|
Help: "Size, in bytes, of a backup",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupLastSuccessfulTimestamp: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupLastSuccessfulTimestamp,
|
|
Help: "Last time a backup ran successfully, Unix timestamp in seconds",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupTotal: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupTotal,
|
|
Help: "Current number of existent backups",
|
|
},
|
|
),
|
|
backupAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupAttemptTotal,
|
|
Help: "Total number of attempted backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupSuccessTotal,
|
|
Help: "Total number of successful backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupPartialFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupPartialFailureTotal,
|
|
Help: "Total number of partially failed backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupFailureTotal,
|
|
Help: "Total number of failed backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDeletionAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDeletionAttemptTotal,
|
|
Help: "Total number of attempted backup deletions",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDeletionSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDeletionSuccessTotal,
|
|
Help: "Total number of successful backup deletions",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDeletionFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDeletionFailureTotal,
|
|
Help: "Total number of failed backup deletions",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDurationSeconds: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDurationSeconds,
|
|
Help: "Time taken to complete backup, in seconds",
|
|
Buckets: []float64{
|
|
toSeconds(1 * time.Minute),
|
|
toSeconds(5 * time.Minute),
|
|
toSeconds(10 * time.Minute),
|
|
toSeconds(15 * time.Minute),
|
|
toSeconds(30 * time.Minute),
|
|
toSeconds(1 * time.Hour),
|
|
toSeconds(2 * time.Hour),
|
|
toSeconds(3 * time.Hour),
|
|
toSeconds(4 * time.Hour),
|
|
},
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreTotal: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreTotal,
|
|
Help: "Current number of existent restores",
|
|
},
|
|
),
|
|
restoreAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreAttemptTotal,
|
|
Help: "Total number of attempted restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreSuccessTotal,
|
|
Help: "Total number of successful restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restorePartialFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restorePartialFailureTotal,
|
|
Help: "Total number of partially failed restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreFailedTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreFailedTotal,
|
|
Help: "Total number of failed restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreValidationFailedTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreValidationFailedTotal,
|
|
Help: "Total number of failed restores failing validations",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
volumeSnapshotAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: volumeSnapshotAttemptTotal,
|
|
Help: "Total number of attempted volume snapshots",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
volumeSnapshotSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: volumeSnapshotSuccessTotal,
|
|
Help: "Total number of successful volume snapshots",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
volumeSnapshotFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: volumeSnapshotFailureTotal,
|
|
Help: "Total number of failed volume snapshots",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
},
|
|
}
|
|
}
|
|
|
|
// RegisterAllMetrics registers all prometheus metrics.
|
|
func (m *ServerMetrics) RegisterAllMetrics() {
|
|
for _, pm := range m.metrics {
|
|
prometheus.MustRegister(pm)
|
|
}
|
|
}
|
|
|
|
// InitSchedule initializes counter metrics of a schedule.
|
|
func (m *ServerMetrics) InitSchedule(scheduleName string) {
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(0)
|
|
}
|
|
}
|
|
|
|
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
|
|
func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, size int64) {
|
|
if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {
|
|
g.WithLabelValues(backupSchedule).Set(float64(size))
|
|
}
|
|
}
|
|
|
|
// SetBackupLastSuccessfulTimestamp records the last time a backup ran successfully, Unix timestamp in seconds
|
|
func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string) {
|
|
if g, ok := m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec); ok {
|
|
g.WithLabelValues(backupSchedule).Set(float64(time.Now().Unix()))
|
|
}
|
|
}
|
|
|
|
// SetBackupTotal records the current number of existent backups.
|
|
func (m *ServerMetrics) SetBackupTotal(numberOfBackups int64) {
|
|
if g, ok := m.metrics[backupTotal].(prometheus.Gauge); ok {
|
|
g.Set(float64(numberOfBackups))
|
|
}
|
|
}
|
|
|
|
// RegisterBackupAttempt records an backup attempt.
|
|
func (m *ServerMetrics) RegisterBackupAttempt(backupSchedule string) {
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupSuccess records a successful completion of a backup.
|
|
func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
m.SetBackupLastSuccessfulTimestamp(backupSchedule)
|
|
}
|
|
|
|
// RegisterBackupPartialFailure records a partially failed backup.
|
|
func (m *ServerMetrics) RegisterBackupPartialFailure(backupSchedule string) {
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupFailed records a failed backup.
|
|
func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDuration records the number of seconds a backup took.
|
|
func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds float64) {
|
|
if c, ok := m.metrics[backupDurationSeconds].(*prometheus.HistogramVec); ok {
|
|
c.WithLabelValues(backupSchedule).Observe(seconds)
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDeletionAttempt records the number of attempted backup deletions
|
|
func (m *ServerMetrics) RegisterBackupDeletionAttempt(backupSchedule string) {
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDeletionFailed records the number of failed backup deletions
|
|
func (m *ServerMetrics) RegisterBackupDeletionFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDeletionSuccess records the number of successful backup deletions
|
|
func (m *ServerMetrics) RegisterBackupDeletionSuccess(backupSchedule string) {
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// toSeconds translates a time.Duration value into a float64
// representing the number of seconds in that duration, including any
// fractional part (e.g. 1500ms yields 1.5).
func toSeconds(d time.Duration) float64 {
	// Duration.Seconds keeps sub-second precision; dividing by time.Second
	// first would be integer division and drop the fraction.
	return d.Seconds()
}
|
|
|
|
// SetRestoreTotal records the current number of existent restores.
|
|
func (m *ServerMetrics) SetRestoreTotal(numberOfRestores int64) {
|
|
if g, ok := m.metrics[restoreTotal].(prometheus.Gauge); ok {
|
|
g.Set(float64(numberOfRestores))
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreAttempt records an attempt to restore a backup.
|
|
func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore.
|
|
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestorePartialFailure records a restore that partially failed.
|
|
func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreFailed records a restore that failed.
|
|
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreValidationFailed records a restore that failed validation.
|
|
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterVolumeSnapshotAttempts records an attempt to snapshot a volume.
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotAttempts(backupSchedule string, volumeSnapshotsAttempted int) {
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsAttempted))
|
|
}
|
|
}
|
|
|
|
// RegisterVolumeSnapshotSuccesses records a completed volume snapshot.
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotSuccesses(backupSchedule string, volumeSnapshotsCompleted int) {
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsCompleted))
|
|
}
|
|
}
|
|
|
|
// RegisterVolumeSnapshotFailures records a failed volume snapshot.
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotFailures(backupSchedule string, volumeSnapshotsFailed int) {
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsFailed))
|
|
}
|
|
}
|