fix(tasks): replace deactivation of unrecoverable errors with metric (#15430)
parent 6f8977218a
commit f096605327
@@ -22,5 +22,10 @@ func IsUnrecoverable(err error) bool {
 		return true
 	}
 
+	// Flux script uses an API that attempts to read the filesystem
+	if strings.Contains(errString, "filesystem service uninitialized") {
+		return true
+	}
+
 	return false
 }

@@ -9,13 +9,14 @@ import (
 )
 
 type ExecutorMetrics struct {
-	totalRunsComplete *prometheus.CounterVec
-	activeRuns        prometheus.Collector
-	queueDelta        *prometheus.SummaryVec
-	runDuration       *prometheus.SummaryVec
-	errorsCounter     *prometheus.CounterVec
-	manualRunsCounter *prometheus.CounterVec
-	resumeRunsCounter *prometheus.CounterVec
+	totalRunsComplete    *prometheus.CounterVec
+	activeRuns           prometheus.Collector
+	queueDelta           *prometheus.SummaryVec
+	runDuration          *prometheus.SummaryVec
+	errorsCounter        *prometheus.CounterVec
+	manualRunsCounter    *prometheus.CounterVec
+	resumeRunsCounter    *prometheus.CounterVec
+	unrecoverableCounter *prometheus.CounterVec
 }
 
 type runCollector struct {
@@ -62,6 +63,13 @@ func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
 			Help:      "The number of errors thrown by the executor with the type of error (ex. Invalid, Internal, etc.)",
 		}, []string{"task_type", "errorType"}),
 
+		unrecoverableCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "unrecoverable_counter",
+			Help:      "The number of errors by taskID that must be manually resolved or have the task deactivated.",
+		}, []string{"taskID", "errorType"}),
+
 		manualRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -113,6 +121,7 @@ func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
 		em.runDuration,
 		em.manualRunsCounter,
 		em.resumeRunsCounter,
+		em.unrecoverableCounter,
 	}
 }
 
@@ -130,7 +139,7 @@ func (em *ExecutorMetrics) FinishRun(task *influxdb.Task, status backend.RunStat
 	em.runDuration.WithLabelValues("", task.ID.String()).Observe(runDuration.Seconds())
 }
 
-// LogError increments the count of errors.
+// LogError increments the count of errors by error code.
 func (em *ExecutorMetrics) LogError(taskType string, err error) {
 	switch e := err.(type) {
 	case *influxdb.Error:
@@ -140,6 +149,18 @@ func (em *ExecutorMetrics) LogError(taskType string, err error) {
 	}
 }
 
+// LogUnrecoverableError increments the count of unrecoverable errors, which require admin intervention to resolve or deactivate
+// This count is separate from the errors count so that the errors metric can be used to identify only internal, rather than user errors
+// and so that unrecoverable errors can be quickly identified for deactivation
+func (em *ExecutorMetrics) LogUnrecoverableError(taskID influxdb.ID, err error) {
+	switch e := err.(type) {
+	case *influxdb.Error:
+		em.unrecoverableCounter.WithLabelValues(taskID.String(), e.Code).Inc()
+	default:
+		em.unrecoverableCounter.WithLabelValues(taskID.String(), "unknown").Inc()
+	}
+}
+
 // Describe returns all descriptions associated with the run collector.
 func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
 	ch <- r.workersBusy

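For reference, here is a minimal, self-contained sketch of the series the new counter produces, using only the upstream prometheus client. The "task" namespace and "executor" subsystem are assumptions inferred from the metric name the test below expects (task_executor_unrecoverable_counter), and the task ID is a made-up placeholder.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Illustrative stand-in for em.unrecoverableCounter as built in
	// NewExecutorMetrics above; namespace/subsystem values are assumed.
	unrecoverable := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "task",
		Subsystem: "executor",
		Name:      "unrecoverable_counter",
		Help:      "The number of errors by taskID that must be manually resolved or have the task deactivated.",
	}, []string{"taskID", "errorType"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(unrecoverable)

	// Roughly what LogUnrecoverableError(taskID, err) does when err is an
	// *influxdb.Error whose Code is "internal error".
	unrecoverable.WithLabelValues("0000000000000001", "internal error").Inc()

	// Gather and print each family name and its sample count, similar in
	// spirit to promtest.MustGather in the test below.
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		// Prints: task_executor_unrecoverable_counter 1
		fmt.Println(mf.GetName(), len(mf.GetMetric()))
	}
}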
@@ -335,11 +335,15 @@ func (w *worker) finish(p *promise, rs backend.RunStatus, err error) {
 		w.te.metrics.LogError(p.task.Type, err)
 
 		if backend.IsUnrecoverable(err) {
+			// TODO (al): once user notification system is put in place, this code should be uncommented
 			// if we get an error that requires user intervention to fix, deactivate the task and alert the user
-			inactive := string(backend.TaskInactive)
-			w.te.ts.UpdateTask(p.ctx, p.task.ID, influxdb.TaskUpdate{Status: &inactive})
+			// inactive := string(backend.TaskInactive)
+			// w.te.ts.UpdateTask(p.ctx, p.task.ID, influxdb.TaskUpdate{Status: &inactive})
+
 			// and add to run logs
-			w.te.tcs.AddRunLog(p.ctx, p.task.ID, p.run.ID, time.Now(), fmt.Sprintf("Task deactivated after encountering unrecoverable error: %v", err.Error()))
+			w.te.tcs.AddRunLog(p.ctx, p.task.ID, p.run.ID, time.Now(), fmt.Sprintf("Task encountered unrecoverable error, requires admin action: %v", err.Error()))
+			// add to metrics
+			w.te.metrics.LogUnrecoverableError(p.task.ID, err)
 		}
 
 		p.err = err

@@ -421,6 +421,10 @@ func testErrorHandling(t *testing.T) {
 	t.Parallel()
 	tes := taskExecutorSystem(t)
 
+	metrics := tes.metrics
+	reg := prom.NewRegistry()
+	reg.MustRegister(metrics.PrometheusCollectors()...)
+
 	script := fmt.Sprintf(fmtTestScript, t.Name())
 	ctx := icontext.SetAuthorizer(context.Background(), tes.tc.Auth)
 	task, err := tes.i.CreateTask(ctx, influxdb.TaskCreate{OrganizationID: tes.tc.OrgID, OwnerID: tes.tc.Auth.GetUserID(), Flux: script, Status: "active"})
@@ -428,7 +432,7 @@ func testErrorHandling(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// encountering a bucket not found error should deactivate the task
+	// encountering a bucket not found error should log an unrecoverable error in the metrics
	forcedErr := errors.New("could not find bucket")
 	tes.svc.FailNextQuery(forcedErr)
 
@@ -439,12 +443,23 @@ func testErrorHandling(t *testing.T) {
 
 	<-promise.Done()
 
-	inactive, err := tes.i.FindTaskByID(context.Background(), task.ID)
-	if err != nil {
-		t.Fatal(err)
+	mg := promtest.MustGather(t, reg)
+
+	m := promtest.MustFindMetric(t, mg, "task_executor_unrecoverable_counter", map[string]string{"taskID": task.ID.String(), "errorType": "internal error"})
+	if got := *m.Counter.Value; got != 1 {
+		t.Fatalf("expected 1 unrecoverable error, got %v", got)
 	}
 
-	if inactive.Status != "inactive" {
-		t.Fatal("expected task to be deactivated after permanent error")
-	}
+	// TODO (al): once user notification system is put in place, this code should be uncommented
+	// encountering a bucket not found error should deactivate the task
+	/*
+		inactive, err := tes.i.FindTaskByID(context.Background(), task.ID)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if inactive.Status != "inactive" {
+			t.Fatal("expected task to be deactivated after permanent error")
+		}
+	*/
 }

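As a follow-up, the same assertion could be made without the repo's promtest helpers. The sketch below is hypothetical and not part of this change: it uses client_golang's testutil package against a locally built counter that mirrors the name and labels added in this commit.

package executor_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestUnrecoverableCounterSketch makes the same kind of assertion the test
// above makes with promtest, but against a stand-in CounterVec.
func TestUnrecoverableCounterSketch(t *testing.T) {
	unrecoverable := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "task",
		Subsystem: "executor",
		Name:      "unrecoverable_counter",
		Help:      "The number of errors by taskID that must be manually resolved or have the task deactivated.",
	}, []string{"taskID", "errorType"})

	// Simulate one unrecoverable error for a placeholder task ID.
	unrecoverable.WithLabelValues("0000000000000001", "internal error").Inc()

	// testutil.ToFloat64 reads the current value of a single child series.
	if got := testutil.ToFloat64(unrecoverable.WithLabelValues("0000000000000001", "internal error")); got != 1 {
		t.Fatalf("expected 1 unrecoverable error, got %v", got)
	}
}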