influxdb/task/backend/executor/executor_metrics.go

package executor

import (
	"time"

	"github.com/influxdata/influxdb"
	"github.com/influxdata/influxdb/task/backend"
	"github.com/prometheus/client_golang/prometheus"
)

// ExecutorMetrics groups the Prometheus collectors used to instrument the task
// executor. All of them are handed out together by PrometheusCollectors.
type ExecutorMetrics struct {
	// Completed-run counts and run timing summaries.
	totalRunsComplete *prometheus.CounterVec
	queueDelta        *prometheus.SummaryVec
	runDuration       *prometheus.SummaryVec

	// Counts of manually scheduled and resumed runs.
	manualRunsCounter *prometheus.CounterVec
	resumeRunsCounter *prometheus.CounterVec

	// Error counts; unrecoverable errors are tracked separately from the
	// general error counter.
	errorsCounter        *prometheus.CounterVec
	unrecoverableCounter *prometheus.CounterVec

	// activeRuns is held as the generic Collector interface rather than as a
	// concrete vector type.
	activeRuns prometheus.Collector
}

type runCollector struct {
	totalRunsActive   *prometheus.Desc
	workersBusy       *prometheus.Desc
	promiseQueueUsage *prometheus.Desc
	te                *TaskExecutor
}

// NewExecutorMetrics builds the complete set of executor collectors.
//
// Every counter and summary vector created here lives under the "task"
// namespace and "executor" subsystem. te is passed to NewRunCollector, whose
// result is stored as the activeRuns collector.
func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
	const (
		namespace = "task"
		subsystem = "executor"
	)

	// counterVec and summaryVec fill in the options shared by every vector so
	// that each metric below only states its name, help text and labels.
	counterVec := func(name, help string, labels ...string) *prometheus.CounterVec {
		opts := prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewCounterVec(opts, labels)
	}
	summaryVec := func(name, help string, labels ...string) *prometheus.SummaryVec {
		opts := prometheus.SummaryOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
			// A fresh map per summary; both summaries use the same quantiles.
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		}
		return prometheus.NewSummaryVec(opts, labels)
	}

	metrics := &ExecutorMetrics{}

	metrics.totalRunsComplete = counterVec(
		"total_runs_complete",
		"Total number of runs completed across all tasks, split out by success or failure.",
		"task_type", "status",
	)

	metrics.activeRuns = NewRunCollector(te)

	metrics.queueDelta = summaryVec(
		"run_queue_delta",
		"The duration in seconds between a run being due to start and actually starting.",
		"task_type", "taskID",
	)

	metrics.runDuration = summaryVec(
		"run_duration",
		"The duration in seconds between a run starting and finishing.",
		"task_type", "taskID",
	)

	metrics.errorsCounter = counterVec(
		"errors_counter",
		"The number of errors thrown by the executor with the type of error (ex. Invalid, Internal, etc.)",
		"task_type", "errorType",
	)

	metrics.unrecoverableCounter = counterVec(
		"unrecoverable_counter",
		"The number of errors by taskID that must be manually resolved or have the task deactivated.",
		"taskID", "errorType",
	)

	metrics.manualRunsCounter = counterVec(
		"manual_runs_counter",
		"Total number of manual runs scheduled to run by task ID",
		"taskID",
	)

	metrics.resumeRunsCounter = counterVec(
		"resume_runs_counter",
		"Total number of runs resumed by task ID",
		"taskID",
	)

	return metrics
}

// NewRunCollector returns a collector bound to te that describes three
// executor metrics: the share of workers that are busy, the number of active
// runs, and the promise queue usage. None of them carries variable labels.
func NewRunCollector(te *TaskExecutor) prometheus.Collector {
	// newDesc builds a descriptor with no variable labels and an empty set of
	// constant labels.
	newDesc := func(fqName, help string) *prometheus.Desc {
		return prometheus.NewDesc(fqName, help, nil, prometheus.Labels{})
	}

	collector := &runCollector{te: te}
	collector.workersBusy = newDesc(
		"task_executor_workers_busy",
		"Percent of total available workers that are currently busy",
	)
	collector.totalRunsActive = newDesc(
		"task_executor_total_runs_active",
		"Total number of workers currently running tasks",
	)
	collector.promiseQueueUsage = newDesc(
		"task_executor_promise_queue_usage",
		"Percent of the promise queue that is currently full",
	)
	return collector
}

// PrometheusCollectors returns every collector owned by em, which satisfies
// the prom.PrometheusCollector interface.
func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
	collectors := make([]prometheus.Collector, 0, 8)

	// The order below is kept stable: run totals and timing first, then the
	// trigger counters, then unrecoverable errors.
	collectors = append(collectors,
		em.totalRunsComplete,
		em.activeRuns,
		em.queueDelta,
		em.errorsCounter,
		em.runDuration,
	)
	collectors = append(collectors,
		em.manualRunsCounter,
		em.resumeRunsCounter,
		em.unrecoverableCounter,
	)

	return collectors
}

// StartRun records the delay, in seconds, between when a run was due to start
// and when it actually started. The delay is observed twice: once under the
// task's type with the task label set to "all", and once under the task's own
// ID with an empty type label.
func (em *ExecutorMetrics) StartRun(task *influxdb.Task, queueDelta time.Duration) {
	perType := em.queueDelta.WithLabelValues(task.Type, "all")
	perType.Observe(queueDelta.Seconds())

	perTask := em.queueDelta.WithLabelValues("", task.ID.String())
	perTask.Observe(queueDelta.Seconds())
}

// FinishRun records the outcome of a finished run. It increments the
// completed-run counter for the task's type and the given status, then
// observes the run duration in seconds twice: once under the task's type with
// the task label set to "all", and once under the task's own ID with an empty
// type label.
func (em *ExecutorMetrics) FinishRun(task *influxdb.Task, status backend.RunStatus, runDuration time.Duration) {
	completed := em.totalRunsComplete.WithLabelValues(task.Type, status.String())
	completed.Inc()

	perType := em.runDuration.WithLabelValues(task.Type, "all")
	perType.Observe(runDuration.Seconds())

	perTask := em.runDuration.WithLabelValues("", task.ID.String())
	perTask.Observe(runDuration.Seconds())
}

// LogError increments the error counter for taskType, labelled with the error
// code. Errors that are not a *influxdb.Error (including a nil err) are
// counted under the "unknown" error type.
func (em *ExecutorMetrics) LogError(taskType string, err error) {
	errorType := "unknown"

	// A nil *influxdb.Error stored in the error interface still satisfies the
	// type assertion, so the pointer is checked before Code is read; without
	// the check that case would panic instead of being counted.
	if e, ok := err.(*influxdb.Error); ok && e != nil {
		errorType = e.Code
	}

	em.errorsCounter.WithLabelValues(taskType, errorType).Inc()
}

// LogUnrecoverableError increments the count of unrecoverable errors for
// taskID, labelled with the error code. These are errors that require admin
// intervention to resolve, or the task to be deactivated.
//
// This count is kept separate from the errors count so that the errors metric
// can be used to identify only internal, rather than user, errors, and so that
// unrecoverable errors can be quickly identified for deactivation.
//
// Errors that are not a *influxdb.Error (including a nil err) are counted
// under the "unknown" error type.
func (em *ExecutorMetrics) LogUnrecoverableError(taskID influxdb.ID, err error) {
	errorType := "unknown"

	// A nil *influxdb.Error stored in the error interface still satisfies the
	// type assertion, so the pointer is checked before Code is read; without
	// the check that case would panic instead of being counted.
	if e, ok := err.(*influxdb.Error); ok && e != nil {
		errorType = e.Code
	}

	em.unrecoverableCounter.WithLabelValues(taskID.String(), errorType).Inc()
}

// Describe sends the descriptor of each metric exposed by the run collector
// on ch.
func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := [...]*prometheus.Desc{
		r.workersBusy,
		r.promiseQueueUsage,
		r.totalRunsActive,
	}
	for _, desc := range descs {
		ch <- desc
	}
}

// Collect reads the current values from the task executor and sends them on
// ch as gauge metrics: busy workers, promise queue usage, and active runs.
func (r *runCollector) Collect(ch chan<- prometheus.Metric) {
	// emit wraps a single reading in a constant gauge metric and sends it.
	emit := func(desc *prometheus.Desc, value float64) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value)
	}

	// Each value is read immediately before it is sent.
	emit(r.workersBusy, r.te.WorkersBusy())
	emit(r.promiseQueueUsage, r.te.PromiseQueueUsage())
	emit(r.totalRunsActive, float64(r.te.RunsActive()))
}