influxdb/task/backend/executor/executor_metrics.go

151 lines
4.7 KiB
Go

package executor
import (
"time"
"github.com/influxdata/influxdb"
"github.com/influxdata/influxdb/task/backend"
"github.com/prometheus/client_golang/prometheus"
)
type ExecutorMetrics struct {
totalRunsComplete *prometheus.CounterVec
activeRuns prometheus.Collector
queueDelta prometheus.Summary
runDuration prometheus.Summary
errorsCounter prometheus.Counter
manualRunsCounter *prometheus.CounterVec
resumeRunsCounter *prometheus.CounterVec
}
type runCollector struct {
totalRunsActive *prometheus.Desc
workersBusy *prometheus.Desc
promiseQueueUsage *prometheus.Desc
te *TaskExecutor
}
func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
const namespace = "task"
const subsystem = "executor"
return &ExecutorMetrics{
totalRunsComplete: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "total_runs_complete",
Help: "Total number of runs completed across all tasks, split out by success or failure.",
}, []string{"status"}),
activeRuns: NewRunCollector(te),
queueDelta: prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "run_queue_delta",
Help: "The duration in seconds between a run being due to start and actually starting.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}),
runDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "run_duration",
Help: "The duration in seconds between a run starting and finishing.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}),
errorsCounter: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "errors_counter",
Help: "The number of errors thrown by the executor.",
}),
manualRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "manual_runs_counter",
Help: "Total number of manual runs scheduled to run by task ID",
}, []string{"taskID"}),
resumeRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "resume_runs_counter",
Help: "Total number of runs resumed by task ID",
}, []string{"taskID"}),
}
}
// NewRunCollector returns a collector which exports influxdb process metrics.
func NewRunCollector(te *TaskExecutor) prometheus.Collector {
return &runCollector{
workersBusy: prometheus.NewDesc(
"task_executor_workers_busy",
"Percent of total available workers that are currently busy",
nil,
prometheus.Labels{},
),
totalRunsActive: prometheus.NewDesc(
"task_executor_total_runs_active",
"Total number of workers currently running tasks",
nil,
prometheus.Labels{},
),
promiseQueueUsage: prometheus.NewDesc(
"task_executor_promise_queue_usage",
"Percent of the promise queue that is currently full",
nil,
prometheus.Labels{},
),
te: te,
}
}
// PrometheusCollectors satisfies the prom.PrometheusCollector interface.
func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
return []prometheus.Collector{
em.totalRunsComplete,
em.activeRuns,
em.queueDelta,
em.errorsCounter,
em.runDuration,
em.manualRunsCounter,
em.resumeRunsCounter,
}
}
// StartRun store the delta time between when a run is due to start and actually starting.
func (em *ExecutorMetrics) StartRun(taskID influxdb.ID, queueDelta time.Duration) {
em.queueDelta.Observe(queueDelta.Seconds())
}
// FinishRun adjusts the metrics to indicate a run is no longer in progress for the given task ID.
func (em *ExecutorMetrics) FinishRun(taskID influxdb.ID, status backend.RunStatus, runDuration time.Duration) {
em.totalRunsComplete.WithLabelValues(status.String()).Inc()
em.runDuration.Observe(runDuration.Seconds())
}
// LogError increments the count of errors.
func (em *ExecutorMetrics) LogError() {
em.errorsCounter.Inc()
}
// Describe returns all descriptions associated with the run collector.
func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- r.workersBusy
ch <- r.promiseQueueUsage
ch <- r.totalRunsActive
}
// Collect returns the current state of all metrics of the run collector.
func (r *runCollector) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(r.workersBusy, prometheus.GaugeValue, r.te.WorkersBusy())
ch <- prometheus.MustNewConstMetric(r.promiseQueueUsage, prometheus.GaugeValue, r.te.PromiseQueueUsage())
ch <- prometheus.MustNewConstMetric(r.totalRunsActive, prometheus.GaugeValue, float64(r.te.RunsActive()))
}