Merge pull request #8550 from influxdata/er-8548-panic

Allow panic recovery mechanism to be disabled
pull/8570/head
Edd Robinson 2017-07-05 22:09:09 +01:00 committed by GitHub
commit 7374e4e8a4
4 changed files with 71 additions and 20 deletions

View File

@ -9,6 +9,7 @@
### Features
- [#8426](https://github.com/influxdata/influxdb/issues/8426): Add `parse-multivalue-plugin` to allow users to choose how multivalue plugins should be handled by the collectd service.
- [#8548](https://github.com/influxdata/influxdb/issues/8548): Allow panic recovery to be disabled when investigating server issues.
### Bugfixes

View File

@ -3,7 +3,9 @@ package influxql
import (
"errors"
"fmt"
"os"
"runtime"
"runtime/debug"
"strconv"
"sync"
"sync/atomic"
"time"
@ -36,10 +38,15 @@ var (
// Statistics for the QueryExecutor.
const (
statQueriesActive = "queriesActive" // Number of queries currently being executed
statQueriesActive = "queriesActive" // Number of queries currently being executed.
statQueriesExecuted = "queriesExecuted" // Number of queries that have been executed (started).
statQueriesFinished = "queriesFinished" // Number of queries that have finished.
statQueryExecutionDuration = "queryDurationNs" // Total (wall) time spent executing queries
statQueryExecutionDuration = "queryDurationNs" // Total (wall) time spent executing queries.
statRecoveredPanics = "recoveredPanics" // Number of panics recovered by Query Executor.
// PanicCrashEnv is the name of the environment variable that controls what
// happens after a panic has been caught. When its value parses as true via
// strconv.ParseBool, the panic is still logged and counted, but the process
// then exits instead of continuing to run. An unset or unparseable value
// leaves the normal recovery behaviour in place.
PanicCrashEnv = "INFLUXDB_PANIC_CRASH"
)
// ErrDatabaseNotFound returns a database not found error for the given database name.
@ -208,6 +215,7 @@ type QueryStatistics struct {
ExecutedQueries int64
FinishedQueries int64
QueryExecutionDuration int64
RecoveredPanics int64
}
// Statistics returns statistics for periodic monitoring.
@ -220,6 +228,7 @@ func (e *QueryExecutor) Statistics(tags map[string]string) []models.Statistic {
statQueriesExecuted: atomic.LoadInt64(&e.stats.ExecutedQueries),
statQueriesFinished: atomic.LoadInt64(&e.stats.FinishedQueries),
statQueryExecutionDuration: atomic.LoadInt64(&e.stats.QueryExecutionDuration),
statRecoveredPanics: atomic.LoadInt64(&e.stats.RecoveredPanics),
},
}}
}
@ -392,13 +401,32 @@ LOOP:
}
}
// willCrash is true when the process should terminate after the
// QueryExecutor catches a panic, rather than carry on serving. It is set
// once, at package initialisation, from the PanicCrashEnv environment
// variable.
var willCrash bool

func init() {
	// An unset or malformed value is treated the same as an explicit false.
	enabled, err := strconv.ParseBool(os.Getenv(PanicCrashEnv))
	willCrash = err == nil && enabled
}
// recover turns a panic raised while executing query into a logged error and
// an error Result sent on results. The built-in recover only has an effect
// when called directly by a deferred function, so this method must itself be
// the deferred call.
//
// Every recovered panic increments the RecoveredPanics statistic. When
// willCrash is true, the stacks of all goroutines are logged and the process
// exits with status 1 once the error Result has been sent.
func (e *QueryExecutor) recover(query *Query, results chan *Result) {
	err := recover()
	if err == nil {
		return
	}

	atomic.AddInt64(&e.stats.RecoveredPanics, 1) // Capture the panic in _internal stats.

	// %v rather than %s: a panic value can be of any type, and %s renders
	// anything that is not a string, error or Stringer as "%!s(...)".
	q := query.String()
	e.Logger.Error(fmt.Sprintf("%s [panic:%v] %s", q, err, debug.Stack()))
	results <- &Result{
		StatementID: -1,
		Err:         fmt.Errorf("%s [panic:%v]", q, err),
	}

	if !willCrash {
		return
	}

	// debug.Stack only covers the calling goroutine, which has already been
	// logged above; dump every goroutine so the banner is accurate.
	e.Logger.Error("\n\n=====\nAll goroutines now follow:")
	e.Logger.Error(string(allGoroutineStacks()))
	os.Exit(1)
}

// allGoroutineStacks returns the formatted stack traces of every goroutine,
// growing the buffer until the whole dump fits.
func allGoroutineStacks() []byte {
	buf := make([]byte, 1<<16)
	for {
		n := runtime.Stack(buf, true)
		if n < len(buf) {
			return buf[:n]
		}
		buf = make([]byte, 2*len(buf))
	}
}

View File

@ -180,6 +180,7 @@ type Statistics struct {
ActiveWriteRequests int64
ClientErrors int64
ServerErrors int64
RecoveredPanics int64
}
// Statistics returns statistics for periodic monitoring.
@ -206,6 +207,7 @@ func (h *Handler) Statistics(tags map[string]string) []models.Statistic {
statWriteRequestsActive: atomic.LoadInt64(&h.stats.ActiveWriteRequests),
statClientError: atomic.LoadInt64(&h.stats.ClientErrors),
statServerError: atomic.LoadInt64(&h.stats.ServerErrors),
statRecoveredPanics: atomic.LoadInt64(&h.stats.RecoveredPanics),
},
}}
}
@ -1185,6 +1187,17 @@ func (h *Handler) responseWriter(inner http.Handler) http.Handler {
})
}
// willCrash is true when the environment variable named by
// influxql.PanicCrashEnv holds a truthy value, in which case the handler
// exits the process after a panic instead of continuing to serve. It is set
// once, at package initialisation.
var willCrash bool

func init() {
	// An unset or malformed value is treated the same as an explicit false.
	enabled, err := strconv.ParseBool(os.Getenv(influxql.PanicCrashEnv))
	willCrash = err == nil && enabled
}
func (h *Handler) recovery(inner http.Handler, name string) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
start := time.Now()
@ -1196,6 +1209,14 @@ func (h *Handler) recovery(inner http.Handler, name string) http.Handler {
logLine = fmt.Sprintf("%s [panic:%s] %s", logLine, err, debug.Stack())
h.CLFLogger.Println(logLine)
http.Error(w, http.StatusText(http.StatusInternalServerError), 500)
atomic.AddInt64(&h.stats.RecoveredPanics, 1) // Capture the panic in _internal stats.
if willCrash {
h.CLFLogger.Println("\n\n=====\nAll goroutines now follow:")
buf := debug.Stack()
h.CLFLogger.Printf("%s\n", buf)
os.Exit(1) // If we panic then the Go server will recover.
}
}
}()

View File

@ -19,24 +19,25 @@ import (
// statistics gathered by the httpd package.
const (
statRequest = "req" // Number of HTTP requests served
statQueryRequest = "queryReq" // Number of query requests served
statWriteRequest = "writeReq" // Number of write requests served
statPingRequest = "pingReq" // Number of ping requests served
statStatusRequest = "statusReq" // Number of status requests served
statWriteRequestBytesReceived = "writeReqBytes" // Sum of all bytes in write requests
statQueryRequestBytesTransmitted = "queryRespBytes" // Sum of all bytes returned in query responses
statPointsWrittenOK = "pointsWrittenOK" // Number of points written OK
statPointsWrittenDropped = "pointsWrittenDropped" // Number of points dropped by the storage engine
statPointsWrittenFail = "pointsWrittenFail" // Number of points that failed to be written
statAuthFail = "authFail" // Number of authentication failures
statRequestDuration = "reqDurationNs" // Number of (wall-time) nanoseconds spent inside requests
statQueryRequestDuration = "queryReqDurationNs" // Number of (wall-time) nanoseconds spent inside query requests
statWriteRequestDuration = "writeReqDurationNs" // Number of (wall-time) nanoseconds spent inside write requests
statRequestsActive = "reqActive" // Number of currently active requests
statWriteRequestsActive = "writeReqActive" // Number of currently active write requests
statClientError = "clientError" // Number of HTTP responses due to client error
statServerError = "serverError" // Number of HTTP responses due to server error
statRequest = "req" // Number of HTTP requests served.
statQueryRequest = "queryReq" // Number of query requests served.
statWriteRequest = "writeReq" // Number of write requests served.
statPingRequest = "pingReq" // Number of ping requests served.
statStatusRequest = "statusReq" // Number of status requests served.
statWriteRequestBytesReceived = "writeReqBytes" // Sum of all bytes in write requests.
statQueryRequestBytesTransmitted = "queryRespBytes" // Sum of all bytes returned in query responses.
statPointsWrittenOK = "pointsWrittenOK" // Number of points written OK.
statPointsWrittenDropped = "pointsWrittenDropped" // Number of points dropped by the storage engine.
statPointsWrittenFail = "pointsWrittenFail" // Number of points that failed to be written.
statAuthFail = "authFail" // Number of authentication failures.
statRequestDuration = "reqDurationNs" // Number of (wall-time) nanoseconds spent inside requests.
statQueryRequestDuration = "queryReqDurationNs" // Number of (wall-time) nanoseconds spent inside query requests.
statWriteRequestDuration = "writeReqDurationNs" // Number of (wall-time) nanoseconds spent inside write requests.
statRequestsActive = "reqActive" // Number of currently active requests.
statWriteRequestsActive = "writeReqActive" // Number of currently active write requests.
statClientError = "clientError" // Number of HTTP responses due to client error.
statServerError = "serverError" // Number of HTTP responses due to server error.
statRecoveredPanics = "recoveredPanics" // Number of panics recovered by HTTP handler.
)
// Service manages the listener and handler for an HTTP endpoint.