fix(tasks): replace deactivation of unrecoverable errors with metric (#15430)

pull/15428/head^2
Alirie Gray 2019-10-16 16:00:58 -07:00 committed by GitHub
parent 6f8977218a
commit f096605327
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 63 additions and 18 deletions

View File

@ -22,5 +22,10 @@ func IsUnrecoverable(err error) bool {
return true
}
// Flux script uses an API that attempts to read the filesystem
if strings.Contains(errString, "filesystem service uninitialized") {
return true
}
return false
}

View File

@ -9,13 +9,14 @@ import (
)
type ExecutorMetrics struct {
totalRunsComplete *prometheus.CounterVec
activeRuns prometheus.Collector
queueDelta *prometheus.SummaryVec
runDuration *prometheus.SummaryVec
errorsCounter *prometheus.CounterVec
manualRunsCounter *prometheus.CounterVec
resumeRunsCounter *prometheus.CounterVec
totalRunsComplete *prometheus.CounterVec
activeRuns prometheus.Collector
queueDelta *prometheus.SummaryVec
runDuration *prometheus.SummaryVec
errorsCounter *prometheus.CounterVec
manualRunsCounter *prometheus.CounterVec
resumeRunsCounter *prometheus.CounterVec
unrecoverableCounter *prometheus.CounterVec
}
type runCollector struct {
@ -62,6 +63,13 @@ func NewExecutorMetrics(te *TaskExecutor) *ExecutorMetrics {
Help: "The number of errors thrown by the executor with the type of error (ex. Invalid, Internal, etc.)",
}, []string{"task_type", "errorType"}),
unrecoverableCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "unrecoverable_counter",
Help: "The number of errors by taskID that must be manually resolved or have the task deactivated.",
}, []string{"taskID", "errorType"}),
manualRunsCounter: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
@ -113,6 +121,7 @@ func (em *ExecutorMetrics) PrometheusCollectors() []prometheus.Collector {
em.runDuration,
em.manualRunsCounter,
em.resumeRunsCounter,
em.unrecoverableCounter,
}
}
@ -130,7 +139,7 @@ func (em *ExecutorMetrics) FinishRun(task *influxdb.Task, status backend.RunStat
em.runDuration.WithLabelValues("", task.ID.String()).Observe(runDuration.Seconds())
}
// LogError increments the count of errors.
// LogError increments the count of errors by error code.
func (em *ExecutorMetrics) LogError(taskType string, err error) {
switch e := err.(type) {
case *influxdb.Error:
@ -140,6 +149,18 @@ func (em *ExecutorMetrics) LogError(taskType string, err error) {
}
}
// LogUnrecoverableError increments the count of unrecoverable errors, which require admin intervention to resolve or deactivate
// This count is separate from the errors count so that the errors metric can be used to identify only internal, rather than user errors
// and so that unrecoverable errors can be quickly identified for deactivation
func (em *ExecutorMetrics) LogUnrecoverableError(taskID influxdb.ID, err error) {
switch e := err.(type) {
case *influxdb.Error:
em.unrecoverableCounter.WithLabelValues(taskID.String(), e.Code).Inc()
default:
em.unrecoverableCounter.WithLabelValues(taskID.String(), "unknown").Inc()
}
}
// Describe returns all descriptions associated with the run collector.
func (r *runCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- r.workersBusy

View File

@ -335,11 +335,15 @@ func (w *worker) finish(p *promise, rs backend.RunStatus, err error) {
w.te.metrics.LogError(p.task.Type, err)
if backend.IsUnrecoverable(err) {
// TODO (al): once user notification system is put in place, this code should be uncommented
// if we get an error that requires user intervention to fix, deactivate the task and alert the user
inactive := string(backend.TaskInactive)
w.te.ts.UpdateTask(p.ctx, p.task.ID, influxdb.TaskUpdate{Status: &inactive})
// inactive := string(backend.TaskInactive)
// w.te.ts.UpdateTask(p.ctx, p.task.ID, influxdb.TaskUpdate{Status: &inactive})
// and add to run logs
w.te.tcs.AddRunLog(p.ctx, p.task.ID, p.run.ID, time.Now(), fmt.Sprintf("Task deactivated after encountering unrecoverable error: %v", err.Error()))
w.te.tcs.AddRunLog(p.ctx, p.task.ID, p.run.ID, time.Now(), fmt.Sprintf("Task encountered unrecoverable error, requires admin action: %v", err.Error()))
// add to metrics
w.te.metrics.LogUnrecoverableError(p.task.ID, err)
}
p.err = err

View File

@ -421,6 +421,10 @@ func testErrorHandling(t *testing.T) {
t.Parallel()
tes := taskExecutorSystem(t)
metrics := tes.metrics
reg := prom.NewRegistry()
reg.MustRegister(metrics.PrometheusCollectors()...)
script := fmt.Sprintf(fmtTestScript, t.Name())
ctx := icontext.SetAuthorizer(context.Background(), tes.tc.Auth)
task, err := tes.i.CreateTask(ctx, influxdb.TaskCreate{OrganizationID: tes.tc.OrgID, OwnerID: tes.tc.Auth.GetUserID(), Flux: script, Status: "active"})
@ -428,7 +432,7 @@ func testErrorHandling(t *testing.T) {
t.Fatal(err)
}
// encountering a bucket not found error should deactivate the task
// encountering a bucket not found error should log an unrecoverable error in the metrics
forcedErr := errors.New("could not find bucket")
tes.svc.FailNextQuery(forcedErr)
@ -439,12 +443,23 @@ func testErrorHandling(t *testing.T) {
<-promise.Done()
inactive, err := tes.i.FindTaskByID(context.Background(), task.ID)
if err != nil {
t.Fatal(err)
mg := promtest.MustGather(t, reg)
m := promtest.MustFindMetric(t, mg, "task_executor_unrecoverable_counter", map[string]string{"taskID": task.ID.String(), "errorType": "internal error"})
if got := *m.Counter.Value; got != 1 {
t.Fatalf("expected 1 unrecoverable error, got %v", got)
}
if inactive.Status != "inactive" {
t.Fatal("expected task to be deactivated after permanent error")
}
// TODO (al): once user notification system is put in place, this code should be uncommented
// encountering a bucket not found error should deactivate the task
/*
inactive, err := tes.i.FindTaskByID(context.Background(), task.ID)
if err != nil {
t.Fatal(err)
}
if inactive.Status != "inactive" {
t.Fatal("expected task to be deactivated after permanent error")
}
*/
}