/*
Copyright 2018 the Velero contributors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
|
|
|
|
|
|
|
|
package metrics
|
|
|
|
|
|
|
|
import (
|
2018-06-20 18:08:07 +00:00
|
|
|
"time"
|
|
|
|
|
2018-06-06 21:35:06 +00:00
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
)
|
|
|
|
|
2019-01-25 03:33:07 +00:00
|
|
|
// ServerMetrics contains Prometheus metrics for the Velero server.
|
2018-06-06 21:35:06 +00:00
|
|
|
type ServerMetrics struct {
|
|
|
|
metrics map[string]prometheus.Collector
|
|
|
|
}
|
|
|
|
|
|
|
|
// Prometheus namespaces used for the metrics declared in this file.
const (
	metricNamespace        = "velero"
	resticMetricsNamespace = "restic"
)

// Names of the Velero metrics.
const (
	backupTarballSizeBytesGauge   = "backup_tarball_size_bytes"
	backupTotal                   = "backup_total"
	backupAttemptTotal            = "backup_attempt_total"
	backupSuccessTotal            = "backup_success_total"
	backupPartialFailureTotal     = "backup_partial_failure_total"
	backupFailureTotal            = "backup_failure_total"
	backupValidationFailureTotal  = "backup_validation_failure_total"
	backupDurationSeconds         = "backup_duration_seconds"
	backupDeletionAttemptTotal    = "backup_deletion_attempt_total"
	backupDeletionSuccessTotal    = "backup_deletion_success_total"
	backupDeletionFailureTotal    = "backup_deletion_failure_total"
	backupLastSuccessfulTimestamp = "backup_last_successful_timestamp"
	restoreTotal                  = "restore_total"
	restoreAttemptTotal           = "restore_attempt_total"
	restoreValidationFailedTotal  = "restore_validation_failed_total"
	restoreSuccessTotal           = "restore_success_total"
	restorePartialFailureTotal    = "restore_partial_failure_total"
	restoreFailedTotal            = "restore_failed_total"
	volumeSnapshotAttemptTotal    = "volume_snapshot_attempt_total"
	volumeSnapshotSuccessTotal    = "volume_snapshot_success_total"
	volumeSnapshotFailureTotal    = "volume_snapshot_failure_total"
)

// Names of the restic metrics.
const (
	podVolumeBackupEnqueueTotal        = "pod_volume_backup_enqueue_count"
	podVolumeBackupDequeueTotal        = "pod_volume_backup_dequeue_count"
	resticOperationLatencySeconds      = "restic_operation_latency_seconds"
	resticOperationLatencyGaugeSeconds = "restic_operation_latency_seconds_gauge"
)

// Label keys attached to the metric vectors.
const (
	nodeMetricLabel      = "node"
	resticOperationLabel = "operation"
	pvbNameLabel         = "pod_volume_backup"
	scheduleLabel        = "schedule"
	backupNameLabel      = "backupName"
)

// secondsInMinute is the number of seconds in one minute.
const secondsInMinute = 60.0
|
|
|
|
|
|
|
|
// NewServerMetrics returns new ServerMetrics
|
|
|
|
func NewServerMetrics() *ServerMetrics {
|
|
|
|
return &ServerMetrics{
|
|
|
|
metrics: map[string]prometheus.Collector{
|
|
|
|
backupTarballSizeBytesGauge: prometheus.NewGaugeVec(
|
|
|
|
prometheus.GaugeOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupTarballSizeBytesGauge,
|
|
|
|
Help: "Size, in bytes, of a backup",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2019-05-08 13:00:26 +00:00
|
|
|
backupLastSuccessfulTimestamp: prometheus.NewGaugeVec(
|
|
|
|
prometheus.GaugeOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupLastSuccessfulTimestamp,
|
|
|
|
Help: "Last time a backup ran successfully, Unix timestamp in seconds",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2019-04-05 02:11:53 +00:00
|
|
|
backupTotal: prometheus.NewGauge(
|
|
|
|
prometheus.GaugeOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupTotal,
|
|
|
|
Help: "Current number of existent backups",
|
|
|
|
},
|
|
|
|
),
|
2018-10-23 21:04:45 +00:00
|
|
|
backupAttemptTotal: prometheus.NewCounterVec(
|
2018-06-06 21:35:06 +00:00
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
2018-10-23 21:04:45 +00:00
|
|
|
Name: backupAttemptTotal,
|
2018-06-06 21:35:06 +00:00
|
|
|
Help: "Total number of attempted backups",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2018-10-23 21:04:45 +00:00
|
|
|
backupSuccessTotal: prometheus.NewCounterVec(
|
2018-06-06 21:35:06 +00:00
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
2018-10-23 21:04:45 +00:00
|
|
|
Name: backupSuccessTotal,
|
2018-06-06 21:35:06 +00:00
|
|
|
Help: "Total number of successful backups",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2019-04-26 16:14:26 +00:00
|
|
|
backupPartialFailureTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupPartialFailureTotal,
|
|
|
|
Help: "Total number of partially failed backups",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2018-10-23 21:04:45 +00:00
|
|
|
backupFailureTotal: prometheus.NewCounterVec(
|
2018-06-06 21:35:06 +00:00
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
2018-10-23 21:04:45 +00:00
|
|
|
Name: backupFailureTotal,
|
2018-06-06 21:35:06 +00:00
|
|
|
Help: "Total number of failed backups",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2020-07-16 17:13:17 +00:00
|
|
|
backupValidationFailureTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupValidationFailureTotal,
|
|
|
|
Help: "Total number of validation failed backups",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2019-04-04 18:25:59 +00:00
|
|
|
backupDeletionAttemptTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupDeletionAttemptTotal,
|
|
|
|
Help: "Total number of attempted backup deletions",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
|
|
|
backupDeletionSuccessTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupDeletionSuccessTotal,
|
|
|
|
Help: "Total number of successful backup deletions",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
|
|
|
backupDeletionFailureTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupDeletionFailureTotal,
|
|
|
|
Help: "Total number of failed backup deletions",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2018-06-20 18:08:07 +00:00
|
|
|
backupDurationSeconds: prometheus.NewHistogramVec(
|
|
|
|
prometheus.HistogramOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: backupDurationSeconds,
|
|
|
|
Help: "Time taken to complete backup, in seconds",
|
|
|
|
Buckets: []float64{
|
|
|
|
toSeconds(1 * time.Minute),
|
|
|
|
toSeconds(5 * time.Minute),
|
|
|
|
toSeconds(10 * time.Minute),
|
|
|
|
toSeconds(15 * time.Minute),
|
|
|
|
toSeconds(30 * time.Minute),
|
|
|
|
toSeconds(1 * time.Hour),
|
|
|
|
toSeconds(2 * time.Hour),
|
|
|
|
toSeconds(3 * time.Hour),
|
|
|
|
toSeconds(4 * time.Hour),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2019-04-05 02:11:53 +00:00
|
|
|
restoreTotal: prometheus.NewGauge(
|
|
|
|
prometheus.GaugeOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: restoreTotal,
|
|
|
|
Help: "Current number of existent restores",
|
|
|
|
},
|
|
|
|
),
|
2018-06-25 18:15:46 +00:00
|
|
|
restoreAttemptTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: restoreAttemptTotal,
|
|
|
|
Help: "Total number of attempted restores",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
|
|
|
restoreSuccessTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: restoreSuccessTotal,
|
|
|
|
Help: "Total number of successful restores",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2019-04-23 22:26:16 +00:00
|
|
|
restorePartialFailureTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: restorePartialFailureTotal,
|
|
|
|
Help: "Total number of partially failed restores",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2018-07-05 20:49:47 +00:00
|
|
|
restoreFailedTotal: prometheus.NewCounterVec(
|
2018-06-25 18:15:46 +00:00
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
2018-07-05 20:49:47 +00:00
|
|
|
Name: restoreFailedTotal,
|
|
|
|
Help: "Total number of failed restores",
|
2018-06-25 18:15:46 +00:00
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
|
|
|
restoreValidationFailedTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: restoreValidationFailedTotal,
|
2018-07-05 20:49:47 +00:00
|
|
|
Help: "Total number of failed restores failing validations",
|
2018-06-25 18:15:46 +00:00
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2018-10-23 21:04:45 +00:00
|
|
|
volumeSnapshotAttemptTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: volumeSnapshotAttemptTotal,
|
|
|
|
Help: "Total number of attempted volume snapshots",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
|
|
|
volumeSnapshotSuccessTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: volumeSnapshotSuccessTotal,
|
|
|
|
Help: "Total number of successful volume snapshots",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
|
|
|
volumeSnapshotFailureTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: metricNamespace,
|
|
|
|
Name: volumeSnapshotFailureTotal,
|
|
|
|
Help: "Total number of failed volume snapshots",
|
|
|
|
},
|
|
|
|
[]string{scheduleLabel},
|
|
|
|
),
|
2018-06-06 21:35:06 +00:00
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-22 19:07:52 +00:00
|
|
|
func NewResticServerMetrics() *ServerMetrics {
|
|
|
|
return &ServerMetrics{
|
|
|
|
metrics: map[string]prometheus.Collector{
|
|
|
|
podVolumeBackupEnqueueTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: resticMetricsNamespace,
|
|
|
|
Name: podVolumeBackupEnqueueTotal,
|
|
|
|
Help: "Total number of pod_volume_backup objects enqueued",
|
|
|
|
},
|
|
|
|
[]string{nodeMetricLabel},
|
|
|
|
),
|
|
|
|
podVolumeBackupDequeueTotal: prometheus.NewCounterVec(
|
|
|
|
prometheus.CounterOpts{
|
|
|
|
Namespace: resticMetricsNamespace,
|
|
|
|
Name: podVolumeBackupDequeueTotal,
|
|
|
|
Help: "Total number of pod_volume_backup objects dequeued",
|
|
|
|
},
|
|
|
|
[]string{nodeMetricLabel},
|
|
|
|
),
|
|
|
|
resticOperationLatencyGaugeSeconds: prometheus.NewGaugeVec(
|
|
|
|
prometheus.GaugeOpts{
|
|
|
|
Namespace: resticMetricsNamespace,
|
|
|
|
Name: resticOperationLatencyGaugeSeconds,
|
|
|
|
Help: "Gauge metric indicating time taken, in seconds, to perform restic operations",
|
|
|
|
},
|
|
|
|
[]string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel},
|
|
|
|
),
|
|
|
|
resticOperationLatencySeconds: prometheus.NewHistogramVec(
|
|
|
|
prometheus.HistogramOpts{
|
|
|
|
Namespace: resticMetricsNamespace,
|
|
|
|
Name: resticOperationLatencySeconds,
|
|
|
|
Help: "Time taken to complete restic operations, in seconds",
|
|
|
|
Buckets: []float64{
|
|
|
|
toSeconds(1 * time.Minute),
|
|
|
|
toSeconds(5 * time.Minute),
|
|
|
|
toSeconds(10 * time.Minute),
|
|
|
|
toSeconds(15 * time.Minute),
|
|
|
|
toSeconds(30 * time.Minute),
|
|
|
|
toSeconds(1 * time.Hour),
|
|
|
|
toSeconds(2 * time.Hour),
|
|
|
|
toSeconds(3 * time.Hour),
|
|
|
|
toSeconds(4 * time.Hour),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
[]string{nodeMetricLabel, resticOperationLabel, backupNameLabel, pvbNameLabel},
|
|
|
|
),
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// RegisterAllMetrics registers all prometheus metrics.
|
2018-06-06 21:35:06 +00:00
|
|
|
func (m *ServerMetrics) RegisterAllMetrics() {
|
|
|
|
for _, pm := range m.metrics {
|
|
|
|
prometheus.MustRegister(pm)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-23 21:04:45 +00:00
|
|
|
// InitSchedule initializes counter metrics of a schedule.
|
2018-07-20 13:03:44 +00:00
|
|
|
func (m *ServerMetrics) InitSchedule(scheduleName string) {
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-20 13:03:44 +00:00
|
|
|
}
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-20 13:03:44 +00:00
|
|
|
}
|
2019-04-26 16:14:26 +00:00
|
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2019-04-26 16:14:26 +00:00
|
|
|
}
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-20 13:03:44 +00:00
|
|
|
}
|
2020-07-16 17:13:17 +00:00
|
|
|
if c, ok := m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
|
|
}
|
2019-04-04 18:25:59 +00:00
|
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2019-04-04 18:25:59 +00:00
|
|
|
}
|
|
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2019-04-04 18:25:59 +00:00
|
|
|
}
|
|
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2019-04-04 18:25:59 +00:00
|
|
|
}
|
2018-07-05 20:49:47 +00:00
|
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-05 20:49:47 +00:00
|
|
|
}
|
2019-04-23 22:26:16 +00:00
|
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2019-04-23 22:26:16 +00:00
|
|
|
}
|
2018-07-05 20:49:47 +00:00
|
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-05 20:49:47 +00:00
|
|
|
}
|
|
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-05 20:49:47 +00:00
|
|
|
}
|
|
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-07-05 20:49:47 +00:00
|
|
|
}
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-10-23 21:04:45 +00:00
|
|
|
}
|
|
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-10-23 21:04:45 +00:00
|
|
|
}
|
|
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
2020-01-14 16:29:37 +00:00
|
|
|
c.WithLabelValues(scheduleName).Add(0)
|
2018-10-23 21:04:45 +00:00
|
|
|
}
|
2018-07-20 13:03:44 +00:00
|
|
|
}
|
|
|
|
|
2020-07-22 19:07:52 +00:00
|
|
|
// InitSchedule initializes counter metrics for a node.
|
|
|
|
func (m *ServerMetrics) InitResticMetricsForNode(node string) {
|
|
|
|
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(node).Add(0)
|
|
|
|
}
|
|
|
|
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(node).Add(0)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterPodVolumeBackupEnqueue records enqueuing of a PodVolumeBackup object.
|
|
|
|
func (m *ServerMetrics) RegisterPodVolumeBackupEnqueue(node string) {
|
|
|
|
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(node).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterPodVolumeBackupDequeue records dequeuing of a PodVolumeBackup object.
|
|
|
|
func (m *ServerMetrics) RegisterPodVolumeBackupDequeue(node string) {
|
|
|
|
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(node).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-11-23 16:28:34 +00:00
|
|
|
// ObserveResticOpLatency records the number of seconds a restic operation took.
|
|
|
|
func (m *ServerMetrics) ObserveResticOpLatency(node, pvbName, opName, backupName string, seconds float64) {
|
2020-07-22 19:07:52 +00:00
|
|
|
if h, ok := m.metrics[resticOperationLatencySeconds].(*prometheus.HistogramVec); ok {
|
|
|
|
h.WithLabelValues(node, opName, backupName, pvbName).Observe(seconds)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterResticOpLatencyGauge registers the restic operation latency as a gauge metric.
|
|
|
|
func (m *ServerMetrics) RegisterResticOpLatencyGauge(node, pvbName, opName, backupName string, seconds float64) {
|
|
|
|
if g, ok := m.metrics[resticOperationLatencyGaugeSeconds].(*prometheus.GaugeVec); ok {
|
|
|
|
g.WithLabelValues(node, opName, backupName, pvbName).Set(seconds)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
|
2018-06-06 21:35:06 +00:00
|
|
|
func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, size int64) {
|
|
|
|
if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {
|
|
|
|
g.WithLabelValues(backupSchedule).Set(float64(size))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-08 13:00:26 +00:00
|
|
|
// SetBackupLastSuccessfulTimestamp records the last time a backup ran successfully, Unix timestamp in seconds
|
2020-01-14 21:11:21 +00:00
|
|
|
func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string, time time.Time) {
|
2019-05-08 13:00:26 +00:00
|
|
|
if g, ok := m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec); ok {
|
2020-01-14 21:11:21 +00:00
|
|
|
g.WithLabelValues(backupSchedule).Set(float64(time.Unix()))
|
2019-05-08 13:00:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-04-05 02:11:53 +00:00
|
|
|
// SetBackupTotal records the current number of existent backups.
|
|
|
|
func (m *ServerMetrics) SetBackupTotal(numberOfBackups int64) {
|
|
|
|
if g, ok := m.metrics[backupTotal].(prometheus.Gauge); ok {
|
|
|
|
g.Set(float64(numberOfBackups))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// RegisterBackupAttempt records an backup attempt.
|
2018-06-06 21:35:06 +00:00
|
|
|
func (m *ServerMetrics) RegisterBackupAttempt(backupSchedule string) {
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
2018-06-06 21:35:06 +00:00
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// RegisterBackupSuccess records a successful completion of a backup.
|
2018-06-06 21:35:06 +00:00
|
|
|
func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
2018-06-06 21:35:06 +00:00
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
2020-01-14 21:11:21 +00:00
|
|
|
m.SetBackupLastSuccessfulTimestamp(backupSchedule, time.Now())
|
2018-06-06 21:35:06 +00:00
|
|
|
}
|
|
|
|
|
2019-04-26 16:14:26 +00:00
|
|
|
// RegisterBackupPartialFailure records a partially failed backup.
|
|
|
|
func (m *ServerMetrics) RegisterBackupPartialFailure(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// RegisterBackupFailed records a failed backup.
|
2018-06-06 21:35:06 +00:00
|
|
|
func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) {
|
2018-10-23 21:04:45 +00:00
|
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
2018-06-06 21:35:06 +00:00
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
2018-06-20 18:08:07 +00:00
|
|
|
|
2020-07-16 17:13:17 +00:00
|
|
|
// RegisterBackupValidationFailure records a validation failed backup.
|
|
|
|
func (m *ServerMetrics) RegisterBackupValidationFailure(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// RegisterBackupDuration records the number of seconds a backup took.
|
|
|
|
func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds float64) {
|
|
|
|
if c, ok := m.metrics[backupDurationSeconds].(*prometheus.HistogramVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Observe(seconds)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-04-04 18:25:59 +00:00
|
|
|
// RegisterBackupDeletionAttempt records the number of attempted backup deletions
|
|
|
|
func (m *ServerMetrics) RegisterBackupDeletionAttempt(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterBackupDeletionFailed records the number of failed backup deletions
|
|
|
|
func (m *ServerMetrics) RegisterBackupDeletionFailed(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterBackupDeletionSuccess records the number of successful backup deletions
|
|
|
|
func (m *ServerMetrics) RegisterBackupDeletionSuccess(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-20 18:08:07 +00:00
|
|
|
// toSeconds translates a time.Duration value into a float64
// representing the number of seconds in that duration.
//
// The fractional part is preserved (1500ms yields 1.5). Dividing the duration
// by time.Second before converting would perform integer division and
// silently drop any sub-second remainder.
func toSeconds(d time.Duration) float64 {
	return d.Seconds()
}
|
2018-06-25 18:15:46 +00:00
|
|
|
|
2019-04-05 02:11:53 +00:00
|
|
|
// SetRestoreTotal records the current number of existent restores.
|
|
|
|
func (m *ServerMetrics) SetRestoreTotal(numberOfRestores int64) {
|
|
|
|
if g, ok := m.metrics[restoreTotal].(prometheus.Gauge); ok {
|
|
|
|
g.Set(float64(numberOfRestores))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-25 18:15:46 +00:00
|
|
|
// RegisterRestoreAttempt records an attempt to restore a backup.
|
|
|
|
func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-05 20:49:47 +00:00
|
|
|
// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore.
|
2018-06-25 18:15:46 +00:00
|
|
|
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-04-23 22:26:16 +00:00
|
|
|
// RegisterRestorePartialFailure records a restore that partially failed.
|
|
|
|
func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-05 20:49:47 +00:00
|
|
|
// RegisterRestoreFailed records a restore that failed.
|
|
|
|
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
2018-06-25 18:15:46 +00:00
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-05 20:49:47 +00:00
|
|
|
// RegisterRestoreValidationFailed records a restore that failed validation.
|
2018-06-25 18:15:46 +00:00
|
|
|
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
|
|
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
|
|
}
|
|
|
|
}
|
2018-10-23 21:04:45 +00:00
|
|
|
|
|
|
|
// RegisterVolumeSnapshotAttempts records an attempt to snapshot a volume.
|
|
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotAttempts(backupSchedule string, volumeSnapshotsAttempted int) {
|
|
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsAttempted))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterVolumeSnapshotSuccesses records a completed volume snapshot.
|
|
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotSuccesses(backupSchedule string, volumeSnapshotsCompleted int) {
|
|
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsCompleted))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// RegisterVolumeSnapshotFailures records a failed volume snapshot.
|
|
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotFailures(backupSchedule string, volumeSnapshotsFailed int) {
|
|
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsFailed))
|
|
|
|
}
|
|
|
|
}
|