179 lines
6.3 KiB
Go
179 lines
6.3 KiB
Go
package executor
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/influxdata/influxdb"
|
|
"github.com/influxdata/influxdb/task/backend"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
type ExecutorMetrics struct {
|
|
totalRunsComplete *prometheus.CounterVec
|
|
activeRuns prometheus.Collector
|
|
queueDelta *prometheus.SummaryVec
|
|
runDuration *prometheus.SummaryVec
|
|
errorsCounter *prometheus.CounterVec
|
|
manualRunsCounter *prometheus.CounterVec
|
|
resumeRunsCounter *prometheus.CounterVec
|
|
unrecoverableCounter *prometheus.CounterVec
|
|
}
|
|
|
|
type runCollector struct {
|
|
totalRunsActive *prometheus.Desc
|
|
workersBusy *prometheus.Desc
|
|
promiseQueueUsage *prometheus.Desc
|
|
te *TaskExecutor
|
|
}
|
|
|
|
func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
|
|
const namespace = "task"
|
|
const subsystem = "executor"
|
|
|
|
return &ExecutorMetrics{
|
|
totalRunsComplete: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "total_runs_complete",
|
|
Help: "Total number of runs completed across all tasks, split out by success or failure.",
|
|
}, []string{"task_type", "status"}),
|
|
|
|
activeRuns: NewRunCollector(te),
|
|
|
|
queueDelta: prometheus.NewSummaryVec(prometheus.SummaryOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "run_queue_delta",
|
|
Help: "The duration in seconds between a run being due to start and actually starting.",
|
|
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
|
}, []string{"task_type", "taskID"}),
|
|
|
|
runDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "run_duration",
|
|
Help: "The duration in seconds between a run starting and finishing.",
|
|
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
|
}, []string{"task_type", "taskID"}),
|
|
|
|
errorsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "errors_counter",
|
|
Help: "The number of errors thrown by the executor with the type of error (ex. Invalid, Internal, etc.)",
|
|
}, []string{"task_type", "errorType"}),
|
|
|
|
unrecoverableCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "unrecoverable_counter",
|
|
Help: "The number of errors by taskID that must be manually resolved or have the task deactivated.",
|
|
}, []string{"taskID", "errorType"}),
|
|
|
|
manualRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "manual_runs_counter",
|
|
Help: "Total number of manual runs scheduled to run by task ID",
|
|
}, []string{"taskID"}),
|
|
|
|
resumeRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "resume_runs_counter",
|
|
Help: "Total number of runs resumed by task ID",
|
|
}, []string{"taskID"}),
|
|
}
|
|
}
|
|
|
|
// NewRunCollector returns a collector which exports influxdb process metrics.
|
|
func NewRunCollector(te *TaskExecutor) prometheus.Collector {
|
|
return &runCollector{
|
|
workersBusy: prometheus.NewDesc(
|
|
"task_executor_workers_busy",
|
|
"Percent of total available workers that are currently busy",
|
|
nil,
|
|
prometheus.Labels{},
|
|
),
|
|
totalRunsActive: prometheus.NewDesc(
|
|
"task_executor_total_runs_active",
|
|
"Total number of workers currently running tasks",
|
|
nil,
|
|
prometheus.Labels{},
|
|
),
|
|
promiseQueueUsage: prometheus.NewDesc(
|
|
"task_executor_promise_queue_usage",
|
|
"Percent of the promise queue that is currently full",
|
|
nil,
|
|
prometheus.Labels{},
|
|
),
|
|
te: te,
|
|
}
|
|
}
|
|
|
|
// PrometheusCollectors satisfies the prom.PrometheusCollector interface.
|
|
func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
|
|
return []prometheus.Collector{
|
|
em.totalRunsComplete,
|
|
em.activeRuns,
|
|
em.queueDelta,
|
|
em.errorsCounter,
|
|
em.runDuration,
|
|
em.manualRunsCounter,
|
|
em.resumeRunsCounter,
|
|
em.unrecoverableCounter,
|
|
}
|
|
}
|
|
|
|
// StartRun store the delta time between when a run is due to start and actually starting.
|
|
func (em *ExecutorMetrics) StartRun(task *influxdb.Task, queueDelta time.Duration) {
|
|
em.queueDelta.WithLabelValues(task.Type, "all").Observe(queueDelta.Seconds())
|
|
em.queueDelta.WithLabelValues("", task.ID.String()).Observe(queueDelta.Seconds())
|
|
}
|
|
|
|
// FinishRun adjusts the metrics to indicate a run is no longer in progress for the given task ID.
|
|
func (em *ExecutorMetrics) FinishRun(task *influxdb.Task, status backend.RunStatus, runDuration time.Duration) {
|
|
em.totalRunsComplete.WithLabelValues(task.Type, status.String()).Inc()
|
|
|
|
em.runDuration.WithLabelValues(task.Type, "all").Observe(runDuration.Seconds())
|
|
em.runDuration.WithLabelValues("", task.ID.String()).Observe(runDuration.Seconds())
|
|
}
|
|
|
|
// LogError increments the count of errors by error code.
|
|
func (em *ExecutorMetrics) LogError(taskType string, err error) {
|
|
switch e := err.(type) {
|
|
case *influxdb.Error:
|
|
em.errorsCounter.WithLabelValues(taskType, e.Code).Inc()
|
|
default:
|
|
em.errorsCounter.WithLabelValues(taskType, "unknown").Inc()
|
|
}
|
|
}
|
|
|
|
// LogUnrecoverableError increments the count of unrecoverable errors, which require admin intervention to resolve or deactivate
|
|
// This count is separate from the errors count so that the errors metric can be used to identify only internal, rather than user errors
|
|
// and so that unrecoverable errors can be quickly identified for deactivation
|
|
func (em *ExecutorMetrics) LogUnrecoverableError(taskID influxdb.ID, err error) {
|
|
switch e := err.(type) {
|
|
case *influxdb.Error:
|
|
em.unrecoverableCounter.WithLabelValues(taskID.String(), e.Code).Inc()
|
|
default:
|
|
em.unrecoverableCounter.WithLabelValues(taskID.String(), "unknown").Inc()
|
|
}
|
|
}
|
|
|
|
// Describe returns all descriptions associated with the run collector.
|
|
func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- r.workersBusy
|
|
ch <- r.promiseQueueUsage
|
|
ch <- r.totalRunsActive
|
|
}
|
|
|
|
// Collect returns the current state of all metrics of the run collector.
|
|
func (r *runCollector) Collect(ch chan<- prometheus.Metric) {
|
|
ch <- prometheus.MustNewConstMetric(r.workersBusy, prometheus.GaugeValue, r.te.WorkersBusy())
|
|
|
|
ch <- prometheus.MustNewConstMetric(r.promiseQueueUsage, prometheus.GaugeValue, r.te.PromiseQueueUsage())
|
|
|
|
ch <- prometheus.MustNewConstMetric(r.totalRunsActive, prometheus.GaugeValue, float64(r.te.RunsActive()))
|
|
}
|