fix(tasks): replace deactivation of unrecoverable errors with metric (#15430)
parent 6f8977218a
commit f096605327
@@ -22,5 +22,10 @@ func IsUnrecoverable(err error) bool {
 		return true
 	}
 
+	// Flux script uses an API that attempts to read the filesystem
+	if strings.Contains(errString, "filesystem service uninitialized") {
+		return true
+	}
+
 	return false
 }

@@ -9,13 +9,14 @@ import (
 )
 
 type ExecutorMetrics struct {
-	totalRunsComplete *prometheus.CounterVec
-	activeRuns        prometheus.Collector
-	queueDelta        *prometheus.SummaryVec
-	runDuration       *prometheus.SummaryVec
-	errorsCounter     *prometheus.CounterVec
-	manualRunsCounter *prometheus.CounterVec
-	resumeRunsCounter *prometheus.CounterVec
+	totalRunsComplete    *prometheus.CounterVec
+	activeRuns           prometheus.Collector
+	queueDelta           *prometheus.SummaryVec
+	runDuration          *prometheus.SummaryVec
+	errorsCounter        *prometheus.CounterVec
+	manualRunsCounter    *prometheus.CounterVec
+	resumeRunsCounter    *prometheus.CounterVec
+	unrecoverableCounter *prometheus.CounterVec
 }
 
 type runCollector struct {
@@ -62,6 +63,13 @@ func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
 			Help:      "The number of errors thrown by the executor with the type of error (ex. Invalid, Internal, etc.)",
 		}, []string{"task_type", "errorType"}),
 
+		unrecoverableCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "unrecoverable_counter",
+			Help:      "The number of errors by taskID that must be manually resolved or have the task deactivated.",
+		}, []string{"taskID", "errorType"}),
+
 		manualRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -113,6 +121,7 @@ func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
 		em.runDuration,
 		em.manualRunsCounter,
 		em.resumeRunsCounter,
+		em.unrecoverableCounter,
 	}
 }
 
@@ -130,7 +139,7 @@ func (em *ExecutorMetrics) FinishRun(task *influxdb.Task, status backend.RunStat
 	em.runDuration.WithLabelValues("", task.ID.String()).Observe(runDuration.Seconds())
 }
 
-// LogError increments the count of errors.
+// LogError increments the count of errors by error code.
 func (em *ExecutorMetrics) LogError(taskType string, err error) {
 	switch e := err.(type) {
 	case *influxdb.Error:
@@ -140,6 +149,18 @@ func (em *ExecutorMetrics) LogError(taskType string, err error) {
 	}
 }
 
+// LogUnrecoverableError increments the count of unrecoverable errors, which require admin intervention to resolve or deactivate
+// This count is separate from the errors count so that the errors metric can be used to identify only internal, rather than user errors
+// and so that unrecoverable errors can be quickly identified for deactivation
+func (em *ExecutorMetrics) LogUnrecoverableError(taskID influxdb.ID, err error) {
+	switch e := err.(type) {
+	case *influxdb.Error:
+		em.unrecoverableCounter.WithLabelValues(taskID.String(), e.Code).Inc()
+	default:
+		em.unrecoverableCounter.WithLabelValues(taskID.String(), "unknown").Inc()
+	}
+}
+
 // Describe returns all descriptions associated with the run collector.
 func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
 	ch <- r.workersBusy

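For reference, here is a minimal, self-contained sketch of the series the new counter produces, using only the upstream prometheus client. The "task" namespace and "executor" subsystem are assumptions inferred from the metric name the test below expects (task_executor_unrecoverable_counter), and the task ID is a made-up placeholder.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Illustrative stand-in for em.unrecoverableCounter as built in
	// NewExecutorMetrics above; namespace/subsystem values are assumed.
	unrecoverable := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "task",
		Subsystem: "executor",
		Name:      "unrecoverable_counter",
		Help:      "The number of errors by taskID that must be manually resolved or have the task deactivated.",
	}, []string{"taskID", "errorType"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(unrecoverable)

	// Roughly what LogUnrecoverableError(taskID, err) does when err is an
	// *influxdb.Error whose Code is "internal error".
	unrecoverable.WithLabelValues("0000000000000001", "internal error").Inc()

	// Gather and print each family name and its sample count, similar in
	// spirit to promtest.MustGather in the test below.
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		// Prints: task_executor_unrecoverable_counter 1
		fmt.Println(mf.GetName(), len(mf.GetMetric()))
	}
}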
@@ -335,11 +335,15 @@ func (w *worker) finish(p *promise, rs backend.RunStatus, err error) {
 		w.te.metrics.LogError(p.task.Type, err)
 
 		if backend.IsUnrecoverable(err) {
+			// TODO (al): once user notification system is put in place, this code should be uncommented
 			// if we get an error that requires user intervention to fix, deactivate the task and alert the user
-			inactive := string(backend.TaskInactive)
-			w.te.ts.UpdateTask(p.ctx, p.task.ID, influxdb.TaskUpdate{Status: &inactive})
+			// inactive := string(backend.TaskInactive)
+			// w.te.ts.UpdateTask(p.ctx, p.task.ID, influxdb.TaskUpdate{Status: &inactive})
+
 			// and add to run logs
-			w.te.tcs.AddRunLog(p.ctx, p.task.ID, p.run.ID, time.Now(), fmt.Sprintf("Task deactivated after encountering unrecoverable error: %v", err.Error()))
+			w.te.tcs.AddRunLog(p.ctx, p.task.ID, p.run.ID, time.Now(), fmt.Sprintf("Task encountered unrecoverable error, requires admin action: %v", err.Error()))
+			// add to metrics
+			w.te.metrics.LogUnrecoverableError(p.task.ID, err)
 		}
 
 		p.err = err

@@ -421,6 +421,10 @@ func testErrorHandling(t *testing.T) {
 	t.Parallel()
 	tes := taskExecutorSystem(t)
 
+	metrics := tes.metrics
+	reg := prom.NewRegistry()
+	reg.MustRegister(metrics.PrometheusCollectors()...)
+
 	script := fmt.Sprintf(fmtTestScript, t.Name())
 	ctx := icontext.SetAuthorizer(context.Background(), tes.tc.Auth)
 	task, err := tes.i.CreateTask(ctx, influxdb.TaskCreate{OrganizationID: tes.tc.OrgID, OwnerID: tes.tc.Auth.GetUserID(), Flux: script, Status: "active"})
@@ -428,7 +432,7 @@ func testErrorHandling(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// encountering a bucket not found error should deactivate the task
+	// encountering a bucket not found error should log an unrecoverable error in the metrics
	forcedErr := errors.New("could not find bucket")
 	tes.svc.FailNextQuery(forcedErr)
 
@@ -439,12 +443,23 @@ func testErrorHandling(t *testing.T) {
 
 	<-promise.Done()
 
-	inactive, err := tes.i.FindTaskByID(context.Background(), task.ID)
-	if err != nil {
-		t.Fatal(err)
+	mg := promtest.MustGather(t, reg)
+
+	m := promtest.MustFindMetric(t, mg, "task_executor_unrecoverable_counter", map[string]string{"taskID": task.ID.String(), "errorType": "internal error"})
+	if got := *m.Counter.Value; got != 1 {
+		t.Fatalf("expected 1 unrecoverable error, got %v", got)
 	}
 
-	if inactive.Status != "inactive" {
-		t.Fatal("expected task to be deactivated after permanent error")
-	}
+	// TODO (al): once user notification system is put in place, this code should be uncommented
+	// encountering a bucket not found error should deactivate the task
+	/*
+		inactive, err := tes.i.FindTaskByID(context.Background(), task.ID)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if inactive.Status != "inactive" {
+			t.Fatal("expected task to be deactivated after permanent error")
+		}
+	*/
 }

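As a follow-up, the same assertion could be made without the repo's promtest helpers. The sketch below is hypothetical and not part of this change: it uses client_golang's testutil package against a locally built counter that mirrors the name and labels added in this commit.

package executor_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// TestUnrecoverableCounterSketch makes the same kind of assertion the test
// above makes with promtest, but against a stand-in CounterVec.
func TestUnrecoverableCounterSketch(t *testing.T) {
	unrecoverable := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "task",
		Subsystem: "executor",
		Name:      "unrecoverable_counter",
		Help:      "The number of errors by taskID that must be manually resolved or have the task deactivated.",
	}, []string{"taskID", "errorType"})

	// Simulate one unrecoverable error for a placeholder task ID.
	unrecoverable.WithLabelValues("0000000000000001", "internal error").Inc()

	// testutil.ToFloat64 reads the current value of a single child series.
	if got := testutil.ToFloat64(unrecoverable.WithLabelValues("0000000000000001", "internal error")); got != 1 {
		t.Fatalf("expected 1 unrecoverable error, got %v", got)
	}
}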