influxdb/task/backend/executor/executor_metrics.go

179 lines
6.3 KiB
Go

package executor
import (
"time"
"github.com/influxdata/influxdb"
"github.com/influxdata/influxdb/task/backend"
"github.com/prometheus/client_golang/prometheus"
)
// ExecutorMetrics holds the Prometheus collectors used to report on task
// executor activity. Build one with NewExecutorMetrics.
type ExecutorMetrics struct {
	// totalRunsComplete counts finished runs, labelled by task type and status.
	totalRunsComplete *prometheus.CounterVec

	// activeRuns is the collector returned by NewRunCollector.
	activeRuns prometheus.Collector

	// queueDelta and runDuration summarize durations observed in seconds:
	// the delay before a run starts, and how long a run takes.
	queueDelta, runDuration *prometheus.SummaryVec

	// errorsCounter counts executor errors by task type and error code.
	errorsCounter *prometheus.CounterVec

	// manualRunsCounter and resumeRunsCounter are labelled by task ID.
	manualRunsCounter, resumeRunsCounter *prometheus.CounterVec

	// unrecoverableCounter counts unrecoverable errors by task ID and error code.
	unrecoverableCounter *prometheus.CounterVec
}
// runCollector is a prometheus.Collector that reports gauges read from a
// TaskExecutor each time it is collected.
type runCollector struct {
	// Descriptors for the three gauges emitted by Collect.
	totalRunsActive, workersBusy, promiseQueueUsage *prometheus.Desc

	// te is the executor whose state is sampled on every Collect call.
	te *TaskExecutor
}
// NewExecutorMetrics constructs the complete set of executor metrics. Every
// counter and summary is created under the "task" namespace and "executor"
// subsystem; te is handed to NewRunCollector so the active-run gauges can
// read the executor's state.
//
// The returned collectors are only constructed here; nothing in this function
// registers them.
func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
	const (
		namespace = "task"
		subsystem = "executor"
	)

	// counterVec builds a CounterVec with the shared namespace and subsystem.
	counterVec := func(name, help string, labels ...string) *prometheus.CounterVec {
		opts := prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewCounterVec(opts, labels)
	}

	// summaryVec builds a SummaryVec with the shared namespace and subsystem.
	// Each call gets its own Objectives map so no state is shared between
	// summaries.
	summaryVec := func(name, help string, labels ...string) *prometheus.SummaryVec {
		opts := prometheus.SummaryOpts{
			Namespace:  namespace,
			Subsystem:  subsystem,
			Name:       name,
			Help:       help,
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		}
		return prometheus.NewSummaryVec(opts, labels)
	}

	return &ExecutorMetrics{
		totalRunsComplete: counterVec(
			"total_runs_complete",
			"Total number of runs completed across all tasks, split out by success or failure.",
			"task_type", "status",
		),
		activeRuns: NewRunCollector(te),
		queueDelta: summaryVec(
			"run_queue_delta",
			"The duration in seconds between a run being due to start and actually starting.",
			"task_type", "taskID",
		),
		runDuration: summaryVec(
			"run_duration",
			"The duration in seconds between a run starting and finishing.",
			"task_type", "taskID",
		),
		errorsCounter: counterVec(
			"errors_counter",
			"The number of errors thrown by the executor with the type of error (ex. Invalid, Internal, etc.)",
			"task_type", "errorType",
		),
		unrecoverableCounter: counterVec(
			"unrecoverable_counter",
			"The number of errors by taskID that must be manually resolved or have the task deactivated.",
			"taskID", "errorType",
		),
		manualRunsCounter: counterVec(
			"manual_runs_counter",
			"Total number of manual runs scheduled to run by task ID",
			"taskID",
		),
		resumeRunsCounter: counterVec(
			"resume_runs_counter",
			"Total number of runs resumed by task ID",
			"taskID",
		),
	}
}
// NewRunCollector returns a collector that exports three task executor
// gauges: busy workers, active runs, and promise queue usage. The values are
// read from te whenever the collector is collected.
func NewRunCollector(te *TaskExecutor) prometheus.Collector {
	// describe builds a descriptor with no variable labels and an empty set
	// of constant labels.
	describe := func(fqName, help string) *prometheus.Desc {
		return prometheus.NewDesc(fqName, help, nil, prometheus.Labels{})
	}

	rc := &runCollector{te: te}
	rc.workersBusy = describe(
		"task_executor_workers_busy",
		"Percent of total available workers that are currently busy",
	)
	rc.totalRunsActive = describe(
		"task_executor_total_runs_active",
		"Total number of workers currently running tasks",
	)
	rc.promiseQueueUsage = describe(
		"task_executor_promise_queue_usage",
		"Percent of the promise queue that is currently full",
	)
	return rc
}
// PrometheusCollectors satisfies the prom.PrometheusCollector interface by
// returning every collector held by em.
func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
	collectors := make([]prometheus.Collector, 0, 8)
	collectors = append(collectors,
		em.totalRunsComplete,
		em.activeRuns,
		em.queueDelta,
		em.errorsCounter,
		em.runDuration,
		em.manualRunsCounter,
		em.resumeRunsCounter,
		em.unrecoverableCounter,
	)
	return collectors
}
// StartRun stores the delay between when a run was due to start and when it
// actually started. The delay is observed twice, in seconds: once under the
// task's type with the taskID label set to "all", and once under the task's
// own ID with an empty task_type label.
func (em *ExecutorMetrics) StartRun(task *influxdb.Task, queueDelta time.Duration) {
	seconds := queueDelta.Seconds()

	byType := em.queueDelta.WithLabelValues(task.Type, "all")
	byType.Observe(seconds)

	byTask := em.queueDelta.WithLabelValues("", task.ID.String())
	byTask.Observe(seconds)
}
// FinishRun records that a run of the given task has finished. It increments
// the completed-runs counter for the task's type and status, then observes
// runDuration in seconds twice: once under the task's type with the taskID
// label set to "all", and once under the task's own ID with an empty
// task_type label.
func (em *ExecutorMetrics) FinishRun(task *influxdb.Task, status backend.RunStatus, runDuration time.Duration) {
	completed := em.totalRunsComplete.WithLabelValues(task.Type, status.String())
	completed.Inc()

	seconds := runDuration.Seconds()

	byType := em.runDuration.WithLabelValues(task.Type, "all")
	byType.Observe(seconds)

	byTask := em.runDuration.WithLabelValues("", task.ID.String())
	byTask.Observe(seconds)
}
// LogError increments the executor error counter for taskType, labelled by
// error code.
//
// The errorType label is err's Code when err is a non-nil *influxdb.Error.
// For any other value, including a nil error, the label is "unknown".
func (em *ExecutorMetrics) LogError(taskType string, err error) {
	errorType := "unknown"

	// A nil *influxdb.Error stored in an error interface still satisfies the
	// type assertion, so check the pointer before reading Code; otherwise the
	// metrics path would panic on a nil dereference.
	if e, ok := err.(*influxdb.Error); ok && e != nil {
		errorType = e.Code
	}

	em.errorsCounter.WithLabelValues(taskType, errorType).Inc()
}
// LogUnrecoverableError increments the count of unrecoverable errors for
// taskID. These errors require admin intervention to resolve, or the task
// must be deactivated.
//
// This count is kept separate from the errors count so that the errors metric
// can be used to identify only internal, rather than user, errors, and so
// that unrecoverable errors can be quickly identified for deactivation.
//
// The errorType label is err's Code when err is a non-nil *influxdb.Error.
// For any other value, including a nil error, the label is "unknown".
func (em *ExecutorMetrics) LogUnrecoverableError(taskID influxdb.ID, err error) {
	errorType := "unknown"

	// A nil *influxdb.Error stored in an error interface still satisfies the
	// type assertion, so check the pointer before reading Code; otherwise the
	// metrics path would panic on a nil dereference.
	if e, ok := err.(*influxdb.Error); ok && e != nil {
		errorType = e.Code
	}

	em.unrecoverableCounter.WithLabelValues(taskID.String(), errorType).Inc()
}
// Describe sends the descriptor of every gauge exported by the run collector
// to ch, in the order: workers busy, promise queue usage, total runs active.
func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		r.workersBusy,
		r.promiseQueueUsage,
		r.totalRunsActive,
	}
	for _, d := range descs {
		ch <- d
	}
}
// Collect samples the task executor and sends one gauge per descriptor to ch.
// Each value is read from the executor immediately before its metric is sent.
func (r *runCollector) Collect(ch chan<- prometheus.Metric) {
	// gauge wraps value in a constant gauge metric for desc and sends it.
	gauge := func(desc *prometheus.Desc, value float64) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value)
	}

	gauge(r.workersBusy, r.te.WorkersBusy())
	gauge(r.promiseQueueUsage, r.te.PromiseQueueUsage())
	gauge(r.totalRunsActive, float64(r.te.RunsActive()))
}