Merge pull request #8550 from influxdata/er-8548-panic
Allow panic recovery mechanism to be disabled (pull/8570/head)
commit
7374e4e8a4
|
@ -9,6 +9,7 @@
|
|||
### Features
|
||||
|
||||
- [#8426](https://github.com/influxdata/influxdb/issues/8426): Add `parse-multivalue-plugin` to allow users to choose how multivalue plugins should be handled by the collectd service.
|
||||
- [#8548](https://github.com/influxdata/influxdb/issues/8548): Allow panic recovery to be disabled when investigating server issues.
|
||||
|
||||
### Bugfixes
|
||||
|
||||
|
|
|
@ -3,7 +3,9 @@ package influxql
|
|||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
@ -36,10 +38,15 @@ var (
|
|||
|
||||
// Statistics for the QueryExecutor
|
||||
const (
|
||||
statQueriesActive = "queriesActive" // Number of queries currently being executed
|
||||
statQueriesActive = "queriesActive" // Number of queries currently being executed.
|
||||
statQueriesExecuted = "queriesExecuted" // Number of queries that have been executed (started).
|
||||
statQueriesFinished = "queriesFinished" // Number of queries that have finished.
|
||||
statQueryExecutionDuration = "queryDurationNs" // Total (wall) time spent executing queries
|
||||
statQueryExecutionDuration = "queryDurationNs" // Total (wall) time spent executing queries.
|
||||
statRecoveredPanics = "recoveredPanics" // Number of panics recovered by Query Executor.
|
||||
|
||||
// PanicCrashEnv is the name of the environment variable that disables panic
|
||||
// recovery. Its value is parsed with strconv.ParseBool; when it parses as
// true, a panic is logged and the process exits instead of continuing.
|
||||
PanicCrashEnv = "INFLUXDB_PANIC_CRASH"
|
||||
)
|
||||
|
||||
// ErrDatabaseNotFound returns a database not found error for the given database name.
|
||||
|
@ -208,6 +215,7 @@ type QueryStatistics struct {
|
|||
ExecutedQueries int64
|
||||
FinishedQueries int64
|
||||
QueryExecutionDuration int64
|
||||
RecoveredPanics int64
|
||||
}
|
||||
|
||||
// Statistics returns statistics for periodic monitoring.
|
||||
|
@ -220,6 +228,7 @@ func (e *QueryExecutor) Statistics(tags map[string]string) []models.Statistic {
|
|||
statQueriesExecuted: atomic.LoadInt64(&e.stats.ExecutedQueries),
|
||||
statQueriesFinished: atomic.LoadInt64(&e.stats.FinishedQueries),
|
||||
statQueryExecutionDuration: atomic.LoadInt64(&e.stats.QueryExecutionDuration),
|
||||
statRecoveredPanics: atomic.LoadInt64(&e.stats.RecoveredPanics),
|
||||
},
|
||||
}}
|
||||
}
|
||||
|
@ -392,13 +401,32 @@ LOOP:
|
|||
}
|
||||
}
|
||||
|
||||
// Determines if the QueryExecutor will recover any panics or let them crash
|
||||
// the server.
|
||||
var willCrash bool
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
if willCrash, err = strconv.ParseBool(os.Getenv(PanicCrashEnv)); err != nil {
|
||||
willCrash = false
|
||||
}
|
||||
}
|
||||
|
||||
func (e *QueryExecutor) recover(query *Query, results chan *Result) {
|
||||
if err := recover(); err != nil {
|
||||
atomic.AddInt64(&e.stats.RecoveredPanics, 1) // Capture the panic in _internal stats.
|
||||
e.Logger.Error(fmt.Sprintf("%s [panic:%s] %s", query.String(), err, debug.Stack()))
|
||||
results <- &Result{
|
||||
StatementID: -1,
|
||||
Err: fmt.Errorf("%s [panic:%s]", query.String(), err),
|
||||
}
|
||||
|
||||
if willCrash {
|
||||
e.Logger.Error(fmt.Sprintf("\n\n=====\nAll goroutines now follow:"))
|
||||
buf := debug.Stack()
|
||||
e.Logger.Error(fmt.Sprintf("%s", buf))
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -180,6 +180,7 @@ type Statistics struct {
|
|||
ActiveWriteRequests int64
|
||||
ClientErrors int64
|
||||
ServerErrors int64
|
||||
RecoveredPanics int64
|
||||
}
|
||||
|
||||
// Statistics returns statistics for periodic monitoring.
|
||||
|
@ -206,6 +207,7 @@ func (h *Handler) Statistics(tags map[string]string) []models.Statistic {
|
|||
statWriteRequestsActive: atomic.LoadInt64(&h.stats.ActiveWriteRequests),
|
||||
statClientError: atomic.LoadInt64(&h.stats.ClientErrors),
|
||||
statServerError: atomic.LoadInt64(&h.stats.ServerErrors),
|
||||
statRecoveredPanics: atomic.LoadInt64(&h.stats.RecoveredPanics),
|
||||
},
|
||||
}}
|
||||
}
|
||||
|
@ -1185,6 +1187,17 @@ func (h *Handler) responseWriter(inner http.Handler) http.Handler {
|
|||
})
|
||||
}
|
||||
|
||||
// if the env var is set, and the value is truthy, then we will *not*
|
||||
// recover from a panic.
|
||||
var willCrash bool
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
if willCrash, err = strconv.ParseBool(os.Getenv(influxql.PanicCrashEnv)); err != nil {
|
||||
willCrash = false
|
||||
}
|
||||
}
|
||||
|
||||
func (h *Handler) recovery(inner http.Handler, name string) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
start := time.Now()
|
||||
|
@ -1196,6 +1209,14 @@ func (h *Handler) recovery(inner http.Handler, name string) http.Handler {
|
|||
logLine = fmt.Sprintf("%s [panic:%s] %s", logLine, err, debug.Stack())
|
||||
h.CLFLogger.Println(logLine)
|
||||
http.Error(w, http.StatusText(http.StatusInternalServerError), 500)
|
||||
atomic.AddInt64(&h.stats.RecoveredPanics, 1) // Capture the panic in _internal stats.
|
||||
|
||||
if willCrash {
|
||||
h.CLFLogger.Println("\n\n=====\nAll goroutines now follow:")
|
||||
buf := debug.Stack()
|
||||
h.CLFLogger.Printf("%s\n", buf)
|
||||
os.Exit(1) // If we panic then the Go server will recover.
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
|
|
|
@ -19,24 +19,25 @@ import (
|
|||
|
||||
// statistics gathered by the httpd package.
|
||||
const (
|
||||
statRequest = "req" // Number of HTTP requests served
|
||||
statQueryRequest = "queryReq" // Number of query requests served
|
||||
statWriteRequest = "writeReq" // Number of write requests served
|
||||
statPingRequest = "pingReq" // Number of ping requests served
|
||||
statStatusRequest = "statusReq" // Number of status requests served
|
||||
statWriteRequestBytesReceived = "writeReqBytes" // Sum of all bytes in write requests
|
||||
statQueryRequestBytesTransmitted = "queryRespBytes" // Sum of all bytes returned in query responses
|
||||
statPointsWrittenOK = "pointsWrittenOK" // Number of points written OK
|
||||
statPointsWrittenDropped = "pointsWrittenDropped" // Number of points dropped by the storage engine
|
||||
statPointsWrittenFail = "pointsWrittenFail" // Number of points that failed to be written
|
||||
statAuthFail = "authFail" // Number of authentication failures
|
||||
statRequestDuration = "reqDurationNs" // Number of (wall-time) nanoseconds spent inside requests
|
||||
statQueryRequestDuration = "queryReqDurationNs" // Number of (wall-time) nanoseconds spent inside query requests
|
||||
statWriteRequestDuration = "writeReqDurationNs" // Number of (wall-time) nanoseconds spent inside write requests
|
||||
statRequestsActive = "reqActive" // Number of currently active requests
|
||||
statWriteRequestsActive = "writeReqActive" // Number of currently active write requests
|
||||
statClientError = "clientError" // Number of HTTP responses due to client error
|
||||
statServerError = "serverError" // Number of HTTP responses due to server error
|
||||
statRequest = "req" // Number of HTTP requests served.
|
||||
statQueryRequest = "queryReq" // Number of query requests served.
|
||||
statWriteRequest = "writeReq" // Number of write requests served.
|
||||
statPingRequest = "pingReq" // Number of ping requests served.
|
||||
statStatusRequest = "statusReq" // Number of status requests served.
|
||||
statWriteRequestBytesReceived = "writeReqBytes" // Sum of all bytes in write requests.
|
||||
statQueryRequestBytesTransmitted = "queryRespBytes" // Sum of all bytes returned in query responses.
|
||||
statPointsWrittenOK = "pointsWrittenOK" // Number of points written OK.
|
||||
statPointsWrittenDropped = "pointsWrittenDropped" // Number of points dropped by the storage engine.
|
||||
statPointsWrittenFail = "pointsWrittenFail" // Number of points that failed to be written.
|
||||
statAuthFail = "authFail" // Number of authentication failures.
|
||||
statRequestDuration = "reqDurationNs" // Number of (wall-time) nanoseconds spent inside requests.
|
||||
statQueryRequestDuration = "queryReqDurationNs" // Number of (wall-time) nanoseconds spent inside query requests.
|
||||
statWriteRequestDuration = "writeReqDurationNs" // Number of (wall-time) nanoseconds spent inside write requests.
|
||||
statRequestsActive = "reqActive" // Number of currently active requests.
|
||||
statWriteRequestsActive = "writeReqActive" // Number of currently active write requests.
|
||||
statClientError = "clientError" // Number of HTTP responses due to client error.
|
||||
statServerError = "serverError" // Number of HTTP responses due to server error.
|
||||
statRecoveredPanics = "recoveredPanics" // Number of panics recovered by HTTP handler.
|
||||
)
|
||||
|
||||
// Service manages the listener and handler for an HTTP endpoint.
|
||||
|
|
Loading…
Reference in New Issue