891 lines
33 KiB
Go
891 lines
33 KiB
Go
/*
|
|
Copyright 2018 the Velero contributors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package metrics
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
// ServerMetrics contains Prometheus metrics for the Velero server.
|
|
type ServerMetrics struct {
|
|
metrics map[string]prometheus.Collector
|
|
}
|
|
|
|
// Namespaces prefixed to the metric names.
const (
	metricNamespace           = "velero"
	podVolumeMetricsNamespace = "podVolume"
)

// Names of the Velero metrics created by NewServerMetrics.
const (
	backupTarballSizeBytesGauge   = "backup_tarball_size_bytes"
	backupTotal                   = "backup_total"
	backupAttemptTotal            = "backup_attempt_total"
	backupSuccessTotal            = "backup_success_total"
	backupPartialFailureTotal     = "backup_partial_failure_total"
	backupFailureTotal            = "backup_failure_total"
	backupValidationFailureTotal  = "backup_validation_failure_total"
	backupDurationSeconds         = "backup_duration_seconds"
	backupDeletionAttemptTotal    = "backup_deletion_attempt_total"
	backupDeletionSuccessTotal    = "backup_deletion_success_total"
	backupDeletionFailureTotal    = "backup_deletion_failure_total"
	backupLastSuccessfulTimestamp = "backup_last_successful_timestamp"
	backupItemsTotalGauge         = "backup_items_total"
	backupItemsErrorsGauge        = "backup_items_errors"
	backupWarningTotal            = "backup_warning_total"
	backupLastStatus              = "backup_last_status"
	restoreTotal                  = "restore_total"
	restoreAttemptTotal           = "restore_attempt_total"
	restoreValidationFailedTotal  = "restore_validation_failed_total"
	restoreSuccessTotal           = "restore_success_total"
	restorePartialFailureTotal    = "restore_partial_failure_total"
	restoreFailedTotal            = "restore_failed_total"
	volumeSnapshotAttemptTotal    = "volume_snapshot_attempt_total"
	volumeSnapshotSuccessTotal    = "volume_snapshot_success_total"
	volumeSnapshotFailureTotal    = "volume_snapshot_failure_total"
	csiSnapshotAttemptTotal       = "csi_snapshot_attempt_total"
	csiSnapshotSuccessTotal       = "csi_snapshot_success_total"
	csiSnapshotFailureTotal       = "csi_snapshot_failure_total"
)

// Names of the pod volume metrics created by NewNodeMetrics.
const (
	podVolumeBackupEnqueueTotal           = "pod_volume_backup_enqueue_count"
	podVolumeBackupDequeueTotal           = "pod_volume_backup_dequeue_count"
	podVolumeOperationLatencySeconds      = "pod_volume_operation_latency_seconds"
	podVolumeOperationLatencyGaugeSeconds = "pod_volume_operation_latency_seconds_gauge"
)

// Names of the data mover metrics created by NewNodeMetrics.
const (
	DataUploadSuccessTotal   = "data_upload_success_total"
	DataUploadFailureTotal   = "data_upload_failure_total"
	DataUploadCancelTotal    = "data_upload_cancel_total"
	DataDownloadSuccessTotal = "data_download_success_total"
	DataDownloadFailureTotal = "data_download_failure_total"
	DataDownloadCancelTotal  = "data_download_cancel_total"
)

// Label names attached to the metrics.
const (
	nodeMetricLabel         = "node"
	podVolumeOperationLabel = "operation"
	pvbNameLabel            = "pod_volume_backup"
	scheduleLabel           = "schedule"
	backupNameLabel         = "backupName"
)

// Values reported through the backup_last_status gauge: 1 for success and
// 0 for failure.
const (
	BackupLastStatusSucc    int64 = 1
	BackupLastStatusFailure int64 = 0
)
|
|
|
|
// NewServerMetrics returns new ServerMetrics
|
|
func NewServerMetrics() *ServerMetrics {
|
|
return &ServerMetrics{
|
|
metrics: map[string]prometheus.Collector{
|
|
backupTarballSizeBytesGauge: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupTarballSizeBytesGauge,
|
|
Help: "Size, in bytes, of a backup",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupLastSuccessfulTimestamp: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupLastSuccessfulTimestamp,
|
|
Help: "Last time a backup ran successfully, Unix timestamp in seconds",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupTotal: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupTotal,
|
|
Help: "Current number of existent backups",
|
|
},
|
|
),
|
|
backupAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupAttemptTotal,
|
|
Help: "Total number of attempted backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupSuccessTotal,
|
|
Help: "Total number of successful backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupPartialFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupPartialFailureTotal,
|
|
Help: "Total number of partially failed backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupFailureTotal,
|
|
Help: "Total number of failed backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupValidationFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupValidationFailureTotal,
|
|
Help: "Total number of validation failed backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDeletionAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDeletionAttemptTotal,
|
|
Help: "Total number of attempted backup deletions",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDeletionSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDeletionSuccessTotal,
|
|
Help: "Total number of successful backup deletions",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDeletionFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDeletionFailureTotal,
|
|
Help: "Total number of failed backup deletions",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupDurationSeconds: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupDurationSeconds,
|
|
Help: "Time taken to complete backup, in seconds",
|
|
Buckets: []float64{
|
|
toSeconds(1 * time.Minute),
|
|
toSeconds(5 * time.Minute),
|
|
toSeconds(10 * time.Minute),
|
|
toSeconds(15 * time.Minute),
|
|
toSeconds(30 * time.Minute),
|
|
toSeconds(1 * time.Hour),
|
|
toSeconds(2 * time.Hour),
|
|
toSeconds(3 * time.Hour),
|
|
toSeconds(4 * time.Hour),
|
|
},
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupItemsTotalGauge: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupItemsTotalGauge,
|
|
Help: "Total number of items backed up",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupItemsErrorsGauge: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupItemsErrorsGauge,
|
|
Help: "Total number of errors encountered during backup",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupWarningTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupWarningTotal,
|
|
Help: "Total number of warned backups",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
backupLastStatus: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: backupLastStatus,
|
|
Help: "Last status of the backup. A value of 1 is success, 0 is failure",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreTotal: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreTotal,
|
|
Help: "Current number of existent restores",
|
|
},
|
|
),
|
|
restoreAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreAttemptTotal,
|
|
Help: "Total number of attempted restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreSuccessTotal,
|
|
Help: "Total number of successful restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restorePartialFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restorePartialFailureTotal,
|
|
Help: "Total number of partially failed restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreFailedTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreFailedTotal,
|
|
Help: "Total number of failed restores",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
restoreValidationFailedTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: restoreValidationFailedTotal,
|
|
Help: "Total number of failed restores failing validations",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
volumeSnapshotAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: volumeSnapshotAttemptTotal,
|
|
Help: "Total number of attempted volume snapshots",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
volumeSnapshotSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: volumeSnapshotSuccessTotal,
|
|
Help: "Total number of successful volume snapshots",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
volumeSnapshotFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: volumeSnapshotFailureTotal,
|
|
Help: "Total number of failed volume snapshots",
|
|
},
|
|
[]string{scheduleLabel},
|
|
),
|
|
csiSnapshotAttemptTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: csiSnapshotAttemptTotal,
|
|
Help: "Total number of CSI attempted volume snapshots",
|
|
},
|
|
[]string{scheduleLabel, backupNameLabel},
|
|
),
|
|
csiSnapshotSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: csiSnapshotSuccessTotal,
|
|
Help: "Total number of CSI successful volume snapshots",
|
|
},
|
|
[]string{scheduleLabel, backupNameLabel},
|
|
),
|
|
csiSnapshotFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: metricNamespace,
|
|
Name: csiSnapshotFailureTotal,
|
|
Help: "Total number of CSI failed volume snapshots",
|
|
},
|
|
[]string{scheduleLabel, backupNameLabel},
|
|
),
|
|
},
|
|
}
|
|
}
|
|
|
|
func NewNodeMetrics() *ServerMetrics {
|
|
return &ServerMetrics{
|
|
metrics: map[string]prometheus.Collector{
|
|
podVolumeBackupEnqueueTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: podVolumeBackupEnqueueTotal,
|
|
Help: "Total number of pod_volume_backup objects enqueued",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
podVolumeBackupDequeueTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: podVolumeBackupDequeueTotal,
|
|
Help: "Total number of pod_volume_backup objects dequeued",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
podVolumeOperationLatencyGaugeSeconds: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: podVolumeOperationLatencyGaugeSeconds,
|
|
Help: "Gauge metric indicating time taken, in seconds, to perform pod volume operations",
|
|
},
|
|
[]string{nodeMetricLabel, podVolumeOperationLabel, backupNameLabel, pvbNameLabel},
|
|
),
|
|
podVolumeOperationLatencySeconds: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: podVolumeOperationLatencySeconds,
|
|
Help: "Time taken to complete pod volume operations, in seconds",
|
|
Buckets: []float64{
|
|
toSeconds(1 * time.Minute),
|
|
toSeconds(5 * time.Minute),
|
|
toSeconds(10 * time.Minute),
|
|
toSeconds(15 * time.Minute),
|
|
toSeconds(30 * time.Minute),
|
|
toSeconds(1 * time.Hour),
|
|
toSeconds(2 * time.Hour),
|
|
toSeconds(3 * time.Hour),
|
|
toSeconds(4 * time.Hour),
|
|
},
|
|
},
|
|
[]string{nodeMetricLabel, podVolumeOperationLabel, backupNameLabel, pvbNameLabel},
|
|
),
|
|
DataUploadSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: DataUploadSuccessTotal,
|
|
Help: "Total number of successful uploaded snapshots",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
DataUploadFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: DataUploadFailureTotal,
|
|
Help: "Total number of failed uploaded snapshots",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
DataUploadCancelTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: DataUploadCancelTotal,
|
|
Help: "Total number of canceled uploaded snapshots",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
DataDownloadSuccessTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: DataDownloadSuccessTotal,
|
|
Help: "Total number of successful downloaded snapshots",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
DataDownloadFailureTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: DataDownloadFailureTotal,
|
|
Help: "Total number of failed downloaded snapshots",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
DataDownloadCancelTotal: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: podVolumeMetricsNamespace,
|
|
Name: DataDownloadCancelTotal,
|
|
Help: "Total number of canceled downloaded snapshots",
|
|
},
|
|
[]string{nodeMetricLabel},
|
|
),
|
|
},
|
|
}
|
|
}
|
|
|
|
// RegisterAllMetrics registers all prometheus metrics.
|
|
func (m *ServerMetrics) RegisterAllMetrics() {
|
|
for _, pm := range m.metrics {
|
|
prometheus.MustRegister(pm)
|
|
}
|
|
}
|
|
|
|
// InitSchedule initializes counter metrics of a schedule.
|
|
func (m *ServerMetrics) InitSchedule(scheduleName string) {
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupItemsTotalGauge].(*prometheus.GaugeVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok {
|
|
c.WithLabelValues(scheduleName).Set(float64(1))
|
|
}
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName).Add(0)
|
|
}
|
|
if c, ok := m.metrics[csiSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName, "").Add(0)
|
|
}
|
|
if c, ok := m.metrics[csiSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName, "").Add(0)
|
|
}
|
|
if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(scheduleName, "").Add(0)
|
|
}
|
|
}
|
|
|
|
// RemoveSchedule removes metrics associated with a specified schedule.
|
|
func (m *ServerMetrics) RemoveSchedule(scheduleName string) {
|
|
if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {
|
|
g.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if h, ok := m.metrics[backupDurationSeconds].(*prometheus.HistogramVec); ok {
|
|
h.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if g, ok := m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec); ok {
|
|
g.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupItemsTotalGauge].(*prometheus.GaugeVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName)
|
|
}
|
|
if c, ok := m.metrics[csiSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName, "")
|
|
}
|
|
if c, ok := m.metrics[csiSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName, "")
|
|
}
|
|
if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.DeleteLabelValues(scheduleName, "")
|
|
}
|
|
}
|
|
|
|
// InitMetricsForNode initializes counter metrics for a node.
|
|
func (m *ServerMetrics) InitMetricsForNode(node string) {
|
|
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[DataUploadSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[DataUploadFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[DataUploadCancelTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[DataDownloadSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[DataDownloadFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
if c, ok := m.metrics[DataDownloadCancelTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Add(0)
|
|
}
|
|
}
|
|
|
|
// RegisterPodVolumeBackupEnqueue records enqueuing of a PodVolumeBackup object.
|
|
func (m *ServerMetrics) RegisterPodVolumeBackupEnqueue(node string) {
|
|
if c, ok := m.metrics[podVolumeBackupEnqueueTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterPodVolumeBackupDequeue records dequeuing of a PodVolumeBackup object.
|
|
func (m *ServerMetrics) RegisterPodVolumeBackupDequeue(node string) {
|
|
if c, ok := m.metrics[podVolumeBackupDequeueTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterDataUploadSuccess records successful uploaded snapshots.
|
|
func (m *ServerMetrics) RegisterDataUploadSuccess(node string) {
|
|
if c, ok := m.metrics[DataUploadSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterDataUploadFailure records failed uploaded snapshots.
|
|
func (m *ServerMetrics) RegisterDataUploadFailure(node string) {
|
|
if c, ok := m.metrics[DataUploadFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterDataUploadCancel records canceled uploaded snapshots.
|
|
func (m *ServerMetrics) RegisterDataUploadCancel(node string) {
|
|
if c, ok := m.metrics[DataUploadCancelTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterDataDownloadSuccess records successful downloaded snapshots.
|
|
func (m *ServerMetrics) RegisterDataDownloadSuccess(node string) {
|
|
if c, ok := m.metrics[DataDownloadSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterDataDownloadFailure records failed downloaded snapshots.
|
|
func (m *ServerMetrics) RegisterDataDownloadFailure(node string) {
|
|
if c, ok := m.metrics[DataDownloadFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterDataDownloadCancel records canceled downloaded snapshots.
|
|
func (m *ServerMetrics) RegisterDataDownloadCancel(node string) {
|
|
if c, ok := m.metrics[DataDownloadCancelTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(node).Inc()
|
|
}
|
|
}
|
|
|
|
// ObservePodVolumeOpLatency records the number of seconds a pod volume operation took.
|
|
func (m *ServerMetrics) ObservePodVolumeOpLatency(node, pvbName, opName, backupName string, seconds float64) {
|
|
if h, ok := m.metrics[podVolumeOperationLatencySeconds].(*prometheus.HistogramVec); ok {
|
|
h.WithLabelValues(node, opName, backupName, pvbName).Observe(seconds)
|
|
}
|
|
}
|
|
|
|
// RegisterPodVolumeOpLatencyGauge registers the pod volume operation latency as a gauge metric.
|
|
func (m *ServerMetrics) RegisterPodVolumeOpLatencyGauge(node, pvbName, opName, backupName string, seconds float64) {
|
|
if g, ok := m.metrics[podVolumeOperationLatencyGaugeSeconds].(*prometheus.GaugeVec); ok {
|
|
g.WithLabelValues(node, opName, backupName, pvbName).Set(seconds)
|
|
}
|
|
}
|
|
|
|
// SetBackupTarballSizeBytesGauge records the size, in bytes, of a backup tarball.
|
|
func (m *ServerMetrics) SetBackupTarballSizeBytesGauge(backupSchedule string, size int64) {
|
|
if g, ok := m.metrics[backupTarballSizeBytesGauge].(*prometheus.GaugeVec); ok {
|
|
g.WithLabelValues(backupSchedule).Set(float64(size))
|
|
}
|
|
}
|
|
|
|
// SetBackupLastSuccessfulTimestamp records the last time a backup ran successfully, Unix timestamp in seconds
|
|
func (m *ServerMetrics) SetBackupLastSuccessfulTimestamp(backupSchedule string, time time.Time) {
|
|
if g, ok := m.metrics[backupLastSuccessfulTimestamp].(*prometheus.GaugeVec); ok {
|
|
g.WithLabelValues(backupSchedule).Set(float64(time.Unix()))
|
|
}
|
|
}
|
|
|
|
// SetBackupTotal records the current number of existent backups.
|
|
func (m *ServerMetrics) SetBackupTotal(numberOfBackups int64) {
|
|
if g, ok := m.metrics[backupTotal].(prometheus.Gauge); ok {
|
|
g.Set(float64(numberOfBackups))
|
|
}
|
|
}
|
|
|
|
// RegisterBackupAttempt records an backup attempt.
|
|
func (m *ServerMetrics) RegisterBackupAttempt(backupSchedule string) {
|
|
if c, ok := m.metrics[backupAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupSuccess records a successful completion of a backup.
|
|
func (m *ServerMetrics) RegisterBackupSuccess(backupSchedule string) {
|
|
if c, ok := m.metrics[backupSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
m.SetBackupLastSuccessfulTimestamp(backupSchedule, time.Now())
|
|
}
|
|
|
|
// RegisterBackupPartialFailure records a partially failed backup.
|
|
func (m *ServerMetrics) RegisterBackupPartialFailure(backupSchedule string) {
|
|
if c, ok := m.metrics[backupPartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupFailed records a failed backup.
|
|
func (m *ServerMetrics) RegisterBackupFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[backupFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupValidationFailure records a validation failed backup.
|
|
func (m *ServerMetrics) RegisterBackupValidationFailure(backupSchedule string) {
|
|
if c, ok := m.metrics[backupValidationFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDuration records the number of seconds a backup took.
|
|
func (m *ServerMetrics) RegisterBackupDuration(backupSchedule string, seconds float64) {
|
|
if c, ok := m.metrics[backupDurationSeconds].(*prometheus.HistogramVec); ok {
|
|
c.WithLabelValues(backupSchedule).Observe(seconds)
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDeletionAttempt records the number of attempted backup deletions
|
|
func (m *ServerMetrics) RegisterBackupDeletionAttempt(backupSchedule string) {
|
|
if c, ok := m.metrics[backupDeletionAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDeletionFailed records the number of failed backup deletions
|
|
func (m *ServerMetrics) RegisterBackupDeletionFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[backupDeletionFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupDeletionSuccess records the number of successful backup deletions
|
|
func (m *ServerMetrics) RegisterBackupDeletionSuccess(backupSchedule string) {
|
|
if c, ok := m.metrics[backupDeletionSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupItemsTotalGauge records the number of items to be backed up.
|
|
func (m *ServerMetrics) RegisterBackupItemsTotalGauge(backupSchedule string, items int) {
|
|
if c, ok := m.metrics[backupItemsTotalGauge].(*prometheus.GaugeVec); ok {
|
|
c.WithLabelValues(backupSchedule).Set(float64(items))
|
|
}
|
|
}
|
|
|
|
// RegisterBackupItemsErrorsGauge records the number of all error messages that were generated during
|
|
// execution of the backup.
|
|
func (m *ServerMetrics) RegisterBackupItemsErrorsGauge(backupSchedule string, items int) {
|
|
if c, ok := m.metrics[backupItemsErrorsGauge].(*prometheus.GaugeVec); ok {
|
|
c.WithLabelValues(backupSchedule).Set(float64(items))
|
|
}
|
|
}
|
|
|
|
// RegisterBackupWarning records a warned backup.
|
|
func (m *ServerMetrics) RegisterBackupWarning(backupSchedule string) {
|
|
if c, ok := m.metrics[backupWarningTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterBackupLastStatus records the last status of the backup.
|
|
func (m *ServerMetrics) RegisterBackupLastStatus(backupSchedule string, lastStatus int64) {
|
|
if g, ok := m.metrics[backupLastStatus].(*prometheus.GaugeVec); ok {
|
|
g.WithLabelValues(backupSchedule).Set(float64(lastStatus))
|
|
}
|
|
}
|
|
|
|
// toSeconds translates a time.Duration value into a float64 representing the
// number of seconds in that duration, including any fractional part (for
// example, 1500ms yields 1.5).
//
// The previous form, float64(d / time.Second), divided as integers first and
// so truncated sub-second precision toward zero. Results for whole-second
// durations are unchanged.
func toSeconds(d time.Duration) float64 {
	return d.Seconds()
}
|
|
|
|
// SetRestoreTotal records the current number of existent restores.
|
|
func (m *ServerMetrics) SetRestoreTotal(numberOfRestores int64) {
|
|
if g, ok := m.metrics[restoreTotal].(prometheus.Gauge); ok {
|
|
g.Set(float64(numberOfRestores))
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreAttempt records an attempt to restore a backup.
|
|
func (m *ServerMetrics) RegisterRestoreAttempt(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreSuccess records a successful (maybe partial) completion of a restore.
|
|
func (m *ServerMetrics) RegisterRestoreSuccess(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestorePartialFailure records a restore that partially failed.
|
|
func (m *ServerMetrics) RegisterRestorePartialFailure(backupSchedule string) {
|
|
if c, ok := m.metrics[restorePartialFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreFailed records a restore that failed.
|
|
func (m *ServerMetrics) RegisterRestoreFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterRestoreValidationFailed records a restore that failed validation.
|
|
func (m *ServerMetrics) RegisterRestoreValidationFailed(backupSchedule string) {
|
|
if c, ok := m.metrics[restoreValidationFailedTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Inc()
|
|
}
|
|
}
|
|
|
|
// RegisterVolumeSnapshotAttempts records an attempt to snapshot a volume.
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotAttempts(backupSchedule string, volumeSnapshotsAttempted int) {
|
|
if c, ok := m.metrics[volumeSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsAttempted))
|
|
}
|
|
}
|
|
|
|
// RegisterVolumeSnapshotSuccesses records a completed volume snapshot.
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotSuccesses(backupSchedule string, volumeSnapshotsCompleted int) {
|
|
if c, ok := m.metrics[volumeSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsCompleted))
|
|
}
|
|
}
|
|
|
|
// RegisterVolumeSnapshotFailures records a failed volume snapshot.
|
|
func (m *ServerMetrics) RegisterVolumeSnapshotFailures(backupSchedule string, volumeSnapshotsFailed int) {
|
|
if c, ok := m.metrics[volumeSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule).Add(float64(volumeSnapshotsFailed))
|
|
}
|
|
}
|
|
|
|
// RegisterCSISnapshotAttempts records an attempt to snapshot a volume by CSI plugin.
|
|
func (m *ServerMetrics) RegisterCSISnapshotAttempts(backupSchedule, backupName string, csiSnapshotsAttempted int) {
|
|
if c, ok := m.metrics[csiSnapshotAttemptTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule, backupName).Add(float64(csiSnapshotsAttempted))
|
|
}
|
|
}
|
|
|
|
// RegisterCSISnapshotSuccesses records a completed volume snapshot by CSI plugin.
|
|
func (m *ServerMetrics) RegisterCSISnapshotSuccesses(backupSchedule, backupName string, csiSnapshotCompleted int) {
|
|
if c, ok := m.metrics[csiSnapshotSuccessTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule, backupName).Add(float64(csiSnapshotCompleted))
|
|
}
|
|
}
|
|
|
|
// RegisterCSISnapshotFailures records a failed volume snapshot by CSI plugin.
|
|
func (m *ServerMetrics) RegisterCSISnapshotFailures(backupSchedule, backupName string, csiSnapshotsFailed int) {
|
|
if c, ok := m.metrics[csiSnapshotFailureTotal].(*prometheus.CounterVec); ok {
|
|
c.WithLabelValues(backupSchedule, backupName).Add(float64(csiSnapshotsFailed))
|
|
}
|
|
}
|