diff --git a/cmd/influxd/main.go b/cmd/influxd/main.go index e09565cdbf..10e7acebc0 100644 --- a/cmd/influxd/main.go +++ b/cmd/influxd/main.go @@ -15,13 +15,13 @@ import ( "github.com/influxdata/flux/control" "github.com/influxdata/flux/execute" - influxlogger "github.com/influxdata/influxdb/logger" "github.com/influxdata/platform" "github.com/influxdata/platform/bolt" "github.com/influxdata/platform/chronograf/server" "github.com/influxdata/platform/gather" "github.com/influxdata/platform/http" "github.com/influxdata/platform/kit/prom" + influxlogger "github.com/influxdata/platform/logger" "github.com/influxdata/platform/nats" "github.com/influxdata/platform/query" _ "github.com/influxdata/platform/query/builtin" diff --git a/gather/scheduler_test.go b/gather/scheduler_test.go index e7a62b1121..b26e4987ab 100644 --- a/gather/scheduler_test.go +++ b/gather/scheduler_test.go @@ -8,8 +8,8 @@ import ( "time" "github.com/google/go-cmp/cmp" - influxlogger "github.com/influxdata/influxdb/logger" "github.com/influxdata/platform" + influxlogger "github.com/influxdata/platform/logger" "github.com/influxdata/platform/mock" ) diff --git a/go.mod b/go.mod index 994553939e..61b77bc322 100644 --- a/go.mod +++ b/go.mod @@ -48,11 +48,12 @@ require ( github.com/influxdata/roaring v0.4.12 // indirect github.com/influxdata/usage-client v0.0.0-20160829180054-6d3895376368 github.com/jessevdk/go-flags v1.4.0 - github.com/jsternberg/zap-logfmt v1.2.0 // indirect + github.com/jsternberg/zap-logfmt v1.2.0 github.com/jtolds/gls v4.2.1+incompatible // indirect github.com/julienschmidt/httprouter v0.0.0-20180715161854-348b672cd90d github.com/jwilder/encoding v0.0.0-20170811194829-b4e1701a28ef github.com/kevinburke/go-bindata v3.11.0+incompatible + github.com/mattn/go-isatty v0.0.4 github.com/mattn/go-zglob v0.0.0-20180803001819-2ea3427bfa53 // indirect github.com/matttproud/golang_protobuf_extensions v1.0.1 github.com/mitchellh/go-homedir v1.0.0 // indirect diff --git a/http/server.go b/http/server.go index 1679b8e4a0..89699523be 100644 --- a/http/server.go +++ b/http/server.go @@ -10,7 +10,7 @@ import ( "syscall" "time" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "go.uber.org/zap" ) diff --git a/logger/config.go b/logger/config.go new file mode 100644 index 0000000000..210ebc1ce7 --- /dev/null +++ b/logger/config.go @@ -0,0 +1,18 @@ +package logger + +import ( + "go.uber.org/zap/zapcore" +) + +type Config struct { + Format string `toml:"format"` + Level zapcore.Level `toml:"level"` + SuppressLogo bool `toml:"suppress-logo"` +} + +// NewConfig returns a new instance of Config with defaults. +func NewConfig() Config { + return Config{ + Format: "auto", + } +} diff --git a/logger/context.go b/logger/context.go new file mode 100644 index 0000000000..3b4b775e64 --- /dev/null +++ b/logger/context.go @@ -0,0 +1,24 @@ +package logger + +import ( + "context" + + "go.uber.org/zap" +) + +type key int + +const ( + loggerKey key = iota +) + +// NewContextWithLogger returns a new context with log added. +func NewContextWithLogger(ctx context.Context, log *zap.Logger) context.Context { + return context.WithValue(ctx, loggerKey, log) +} + +// LoggerFromContext returns the zap.Logger associated with ctx or nil if no logger has been assigned. +func LoggerFromContext(ctx context.Context) *zap.Logger { + l, _ := ctx.Value(loggerKey).(*zap.Logger) + return l +} diff --git a/logger/fields.go b/logger/fields.go new file mode 100644 index 0000000000..ddc49571e0 --- /dev/null +++ b/logger/fields.go @@ -0,0 +1,111 @@ +package logger + +import ( + "time" + + "github.com/influxdata/platform/pkg/snowflake" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +const ( + // TraceIDKey is the logging context key used for identifying unique traces. + TraceIDKey = "trace_id" + + // OperationNameKey is the logging context key used for identifying name of an operation. + OperationNameKey = "op_name" + + // OperationEventKey is the logging context key used for identifying a notable + // event during the course of an operation. + OperationEventKey = "op_event" + + // OperationElapsedKey is the logging context key used for identifying time elapsed to finish an operation. + OperationElapsedKey = "op_elapsed" + + // DBInstanceKey is the logging context key used for identifying name of the relevant database. + DBInstanceKey = "db_instance" + + // DBRetentionKey is the logging context key used for identifying name of the relevant retention policy. + DBRetentionKey = "db_rp" + + // DBShardGroupKey is the logging context key used for identifying relevant shard group. + DBShardGroupKey = "db_shard_group" + + // DBShardIDKey is the logging context key used for identifying name of the relevant shard number. + DBShardIDKey = "db_shard_id" +) +const ( + eventStart = "start" + eventEnd = "end" +) + +var ( + gen = snowflake.New(0) +) + +func nextID() string { + return gen.NextString() +} + +// TraceID returns a field for tracking the trace identifier. +func TraceID(id string) zapcore.Field { + return zap.String(TraceIDKey, id) +} + +// OperationName returns a field for tracking the name of an operation. +func OperationName(name string) zapcore.Field { + return zap.String(OperationNameKey, name) +} + +// OperationElapsed returns a field for tracking the duration of an operation. +func OperationElapsed(d time.Duration) zapcore.Field { + return zap.Duration(OperationElapsedKey, d) +} + +// OperationEventStart returns a field for tracking the start of an operation. +func OperationEventStart() zapcore.Field { + return zap.String(OperationEventKey, eventStart) +} + +// OperationEventFinish returns a field for tracking the end of an operation. +func OperationEventEnd() zapcore.Field { + return zap.String(OperationEventKey, eventEnd) +} + +// Database returns a field for tracking the name of a database. +func Database(name string) zapcore.Field { + return zap.String(DBInstanceKey, name) +} + +// Database returns a field for tracking the name of a database. +func RetentionPolicy(name string) zapcore.Field { + return zap.String(DBRetentionKey, name) +} + +// ShardGroup returns a field for tracking the shard group identifier. +func ShardGroup(id uint64) zapcore.Field { + return zap.Uint64(DBShardGroupKey, id) +} + +// Shard returns a field for tracking the shard identifier. +func Shard(id uint64) zapcore.Field { + return zap.Uint64(DBShardIDKey, id) +} + +// NewOperation uses the exiting log to create a new logger with context +// containing a trace id and the operation. Prior to returning, a standardized message +// is logged indicating the operation has started. The returned function should be +// called when the operation concludes in order to log a corresponding message which +// includes an elapsed time and that the operation has ended. +func NewOperation(log *zap.Logger, msg, name string, fields ...zapcore.Field) (*zap.Logger, func()) { + f := []zapcore.Field{TraceID(nextID()), OperationName(name)} + if len(fields) > 0 { + f = append(f, fields...) + } + + now := time.Now() + log = log.With(f...) + log.Info(msg+" (start)", OperationEventStart()) + + return log, func() { log.Info(msg+" (end)", OperationEventEnd(), OperationElapsed(time.Since(now))) } +} diff --git a/logger/logger.go b/logger/logger.go new file mode 100644 index 0000000000..44dc39cfbf --- /dev/null +++ b/logger/logger.go @@ -0,0 +1,127 @@ +package logger + +import ( + "fmt" + "io" + "time" + + "github.com/jsternberg/zap-logfmt" + isatty "github.com/mattn/go-isatty" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +const TimeFormat = "2006-01-02T15:04:05.000000Z07:00" + +func New(w io.Writer) *zap.Logger { + config := NewConfig() + l, _ := config.New(w) + return l +} + +func (c *Config) New(defaultOutput io.Writer) (*zap.Logger, error) { + w := defaultOutput + format := c.Format + if format == "console" { + // Disallow the console logger if the output is not a terminal. + return nil, fmt.Errorf("unknown logging format: %s", format) + } + + // If the format is empty or auto, then set the format depending + // on whether or not a terminal is present. + if format == "" || format == "auto" { + if IsTerminal(w) { + format = "console" + } else { + format = "logfmt" + } + } + + encoder, err := newEncoder(format) + if err != nil { + return nil, err + } + return zap.New(zapcore.NewCore( + encoder, + zapcore.Lock(zapcore.AddSync(w)), + c.Level, + ), zap.Fields(zap.String("log_id", nextID()))), nil +} + +func newEncoder(format string) (zapcore.Encoder, error) { + config := newEncoderConfig() + switch format { + case "json": + return zapcore.NewJSONEncoder(config), nil + case "console": + return zapcore.NewConsoleEncoder(config), nil + case "logfmt": + return zaplogfmt.NewEncoder(config), nil + default: + return nil, fmt.Errorf("unknown logging format: %s", format) + } +} + +func newEncoderConfig() zapcore.EncoderConfig { + config := zap.NewProductionEncoderConfig() + config.EncodeTime = func(ts time.Time, encoder zapcore.PrimitiveArrayEncoder) { + encoder.AppendString(ts.UTC().Format(TimeFormat)) + } + config.EncodeDuration = func(d time.Duration, encoder zapcore.PrimitiveArrayEncoder) { + val := float64(d) / float64(time.Millisecond) + encoder.AppendString(fmt.Sprintf("%.3fms", val)) + } + config.LevelKey = "lvl" + return config +} + +// IsTerminal checks if w is a file and whether it is an interactive terminal session. +func IsTerminal(w io.Writer) bool { + if f, ok := w.(interface { + Fd() uintptr + }); ok { + return isatty.IsTerminal(f.Fd()) + } + return false +} + +const ( + year = 365 * 24 * time.Hour + week = 7 * 24 * time.Hour + day = 24 * time.Hour +) + +func DurationLiteral(key string, val time.Duration) zapcore.Field { + if val == 0 { + return zap.String(key, "0s") + } + + var ( + value int + unit string + ) + switch { + case val%year == 0: + value = int(val / year) + unit = "y" + case val%week == 0: + value = int(val / week) + unit = "w" + case val%day == 0: + value = int(val / day) + unit = "d" + case val%time.Hour == 0: + value = int(val / time.Hour) + unit = "h" + case val%time.Minute == 0: + value = int(val / time.Minute) + unit = "m" + case val%time.Second == 0: + value = int(val / time.Second) + unit = "s" + default: + value = int(val / time.Millisecond) + unit = "ms" + } + return zap.String(key, fmt.Sprintf("%d%s", value, unit)) +} diff --git a/logger/style_guide.md b/logger/style_guide.md new file mode 100644 index 0000000000..7003fd3f3b --- /dev/null +++ b/logger/style_guide.md @@ -0,0 +1,192 @@ +# Logging Style Guide + +The intention of logging is to give insight to the administrator of how +the server is running and also notify the administrator of any problems +or potential problems with the system. + +At the moment, log level filtering is the only option to configure +logging in InfluxDB. Adding a logging message and choosing its level +should be done according to the guidelines in this document for +operational clarity. The available log levels are: + +* Error +* Warn +* Info +* Debug + +InfluxDB uses structured logging. Structured logging is when you log +messages and attach context to those messages with more easily read data +regarding the state of the system. A structured log message is composed +of: + +* Time +* Level +* Message +* (Optionally) Additional context + +## Guidelines + +**Log messages** should be simple statements or phrases that begin with +a capital letter, but have no punctuation at the end. The message should be a +constant so that every time it is logged it is easily identified and can +be filtered by without regular expressions. + +Any **dynamic content** should be expressed by context. The key should +be a constant and the value is the dynamic content. + +Do not log messages in tight loops or other high performance locations. +It will likely create a performance problem. + +## Naming Conventions + +If the log encoding format uses keys for the time, message, or level, +the key names should be `ts` for time, `msg` for the message, and +`lvl` for the level. + +If the log encoding format does not use keys for the time, message, or +level and instead outputs them in some other method, this guideline can +be ignored. The output formats logfmt and json both use keys when +encoding these values. + +### Context Key Names + +The key for the dynamic content in the context should be formatted in +`snake_case`. The key should be completely lower case. + +## Levels + +As a reminder, levels are usually the only way to configure what is +logged. There are four available logging levels. + +* Error +* Warn +* Info +* Debug + +It is important to get the right logging level to ensure the log +messages are useful for end users to act on. + +In general, when considering which log level to use, you should use +**info**. If you are considering using another level, read the below +expanded descriptions to determine which level your message belongs in. + +### Error + +The **error** level is intended to communicate that there is a serious +problem with the server. **An error should be emitted only when an +on-call engineer can take some action to remedy the situation _and_ the +system cannot continue operating properly without remedying the +situation.** + +An example of what may qualify as an error level message is the creation +of the internal storage for the monitor service. For that system to +function at all, a database must be created. If no database is created, +the service itself cannot function. The error has a clear actionable +solution. Figure out why the database isn't being created and create it. + +An example of what does not qualify as an error is failing to parse a +query or a socket closing prematurely. Both of these usually indicate +some kind of user error rather than system error. Both are ephemeral +errors and they would not be clearly actionable to an administrator who +was paged at 3 AM. Both of these are examples of logging messages that +should be emitted at the info level with an error key rather than being +logged at the error level. + +Logged errors **must not propagate**. Propagating the error risks +logging it in multiple locations and confusing users when the same error +is reported multiple times. In general, if you are returning an error, +never log at any level. By returning the error, you are telling the +parent function to handle the error. Logging a message at any level is +handling the error. + +This logging message should be used very rarely and any messages that +use this logging level should not repeat frequently. Assume that +anything that is logged with error will page someone in the middle of +the night. + +### Warn + +The **warn** level is intended to communicate that there is likely to be +a serious problem with the server if it not addressed. **A warning +should be emitted only when a support engineer can take some action to +remedy the situation _and_ the system may not continue operating +properly in the near future without remedying the situation.** + +An example of what may qualify as a warning is the `max-values-per-tag` +setting. If the server starts to approach the maximum number of values, +the server may stop being able to function properly when it reaches the +maximum number. + +An example of what does not qualify as a warning is the +`log-queries-after` setting. While the message is "warning" that a query +was running for a long period of time, it is not clearly actionable and +does not indicate that the server will fail in the near future. This +should be logged at the info level instead. + +This logging message should be used very rarely and any messages that +use this logging level should not repeat frequently. Assume that +anything that is logged with warn will page someone in the middle of the +night and potentially ignored until normal working hours. + +### Info + +The **info** level should be used for almost anything. If you are not +sure which logging level to use, use info. Temporary or user errors +should be logged at the info level and any informational messages for +administrators should be logged at this level. Info level messages +should be safe for an administrator to discard if they really want to, +but most people will run the system at the info level. + +### Debug + +The **debug** level exists to log messages that are useful only for +debugging a bad running instance. + +This level should be rarely used if ever. If you intend to use this +level, please have a rationale ready. Most messages that could be +considered debug either shouldn't exist or should be logged at the info +level. Debug messages will be suppressed by default. + +## Value Formatting + +Formatting for strings, integers, and other standard values are usually +determined by the log format itself and those will be kept ambiguous. +The following specific formatting choices are for data types that could +be output in multiple ways. + +### Time + +Time values should be encoded using RFC3339 with microsecond precision. +The size of the string should be normalized to the same number of digits +every time to ensure that it is easier to read the time as a column. + +### Duration + +Duration values that denote a period of time should be output in +milliseconds with microsecond precision. The microseconds should be in +decimal form with three decimal points. Durations that denote a static +period of time should be output with a single number and a suffix with +the largest possible unit that doesn't cause the value to be a decimal. + +There are two types of durations. + +* Tracks a (usually small) period of time and is meant for timing how + long something take. The content is dynamic and may be graphed. +* Duration literal where the content is dynamic, is unlikely to be + graphed, and usually comes from some type of configuration. + +If the content is dynamic, the duration should be printed as a number of +milliseconds with a decimal indicating the number of microseconds. Any +duration lower than microseconds should be truncated. The decimal section +should always print exactly 3 points after the decimal point. + +If the content is static, the duration should be printed with a single +number and a suffix indicating the unit in years (`y`), weeks (`w`), +days (`d`), hours (`h`), minutes (`m`), seconds (`s`), or +milliseconds (`ms`). The suffix should be the greatest unit that can be +used without truncating the value. As an example, if the duration is +60 minutes, then `1h` should be used. If the duration is 61 minutes, +then `61m` should be used. + +For anything lower than milliseconds that is static, the duration should +be truncated. A value of zero should be shown as `0s`. diff --git a/storage/reads/group_resultset.go b/storage/reads/group_resultset.go index 68d6e83a85..45370659a6 100644 --- a/storage/reads/group_resultset.go +++ b/storage/reads/group_resultset.go @@ -7,7 +7,7 @@ import ( "sort" "strings" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/storage/reads/datatypes" "github.com/influxdata/platform/tsdb/cursors" diff --git a/storage/retention.go b/storage/retention.go index 4c1fe92baa..ba5288b92b 100644 --- a/storage/retention.go +++ b/storage/retention.go @@ -8,9 +8,9 @@ import ( "sync/atomic" "time" - "github.com/influxdata/influxdb/logger" "github.com/influxdata/influxql" "github.com/influxdata/platform" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/tsdb" "github.com/prometheus/client_golang/prometheus" diff --git a/task/backend/executor/executor.go b/task/backend/executor/executor.go index 553eb34941..b4be704c1a 100644 --- a/task/backend/executor/executor.go +++ b/task/backend/executor/executor.go @@ -9,7 +9,7 @@ import ( "github.com/influxdata/flux" "github.com/influxdata/flux/lang" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/query" "github.com/influxdata/platform/task/backend" "go.uber.org/zap" diff --git a/tsdb/index_test.go b/tsdb/index_test.go index 184132ddd6..32216d2d53 100644 --- a/tsdb/index_test.go +++ b/tsdb/index_test.go @@ -10,9 +10,9 @@ import ( "sync" "testing" - "github.com/influxdata/influxdb/logger" "github.com/influxdata/influxdb/query" "github.com/influxdata/influxql" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/slices" "github.com/influxdata/platform/tsdb" diff --git a/tsdb/series_file_test.go b/tsdb/series_file_test.go index 25e575ce20..8dff5df24e 100644 --- a/tsdb/series_file_test.go +++ b/tsdb/series_file_test.go @@ -7,7 +7,7 @@ import ( "os" "testing" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/tsdb" ) diff --git a/tsdb/series_partition.go b/tsdb/series_partition.go index 367dc73e64..40f1d7445e 100644 --- a/tsdb/series_partition.go +++ b/tsdb/series_partition.go @@ -9,7 +9,7 @@ import ( "path/filepath" "sync" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/rhh" "go.uber.org/zap" diff --git a/tsdb/shard_internal_test.go b/tsdb/shard_internal_test.go index 2b25b8cdbd..ea683acc33 100644 --- a/tsdb/shard_internal_test.go +++ b/tsdb/shard_internal_test.go @@ -7,7 +7,7 @@ import ( "strings" "time" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" ) diff --git a/tsdb/tsi1/partition.go b/tsdb/tsi1/partition.go index cd6ee1670b..5e3634488d 100644 --- a/tsdb/tsi1/partition.go +++ b/tsdb/tsi1/partition.go @@ -14,8 +14,8 @@ import ( "time" "unsafe" - "github.com/influxdata/influxdb/logger" "github.com/influxdata/influxql" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/pkg/bytesutil" "github.com/influxdata/platform/pkg/estimator" "github.com/influxdata/platform/tsdb" diff --git a/tsdb/tsm1/engine.go b/tsdb/tsm1/engine.go index bc0583383d..f96784df84 100644 --- a/tsdb/tsm1/engine.go +++ b/tsdb/tsm1/engine.go @@ -20,12 +20,12 @@ import ( "sync/atomic" "time" - "github.com/influxdata/influxdb/logger" "github.com/influxdata/influxdb/pkg/metrics" intar "github.com/influxdata/influxdb/pkg/tar" "github.com/influxdata/influxdb/pkg/tracing" "github.com/influxdata/influxdb/query" "github.com/influxdata/influxql" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/bytesutil" "github.com/influxdata/platform/pkg/estimator" diff --git a/tsdb/tsm1/engine_test.go b/tsdb/tsm1/engine_test.go index 263e2ac91f..2ae6694485 100644 --- a/tsdb/tsm1/engine_test.go +++ b/tsdb/tsm1/engine_test.go @@ -20,9 +20,9 @@ import ( "time" "github.com/google/go-cmp/cmp" - "github.com/influxdata/influxdb/logger" "github.com/influxdata/influxdb/query" "github.com/influxdata/influxql" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/models" "github.com/influxdata/platform/pkg/deep" "github.com/influxdata/platform/tsdb" diff --git a/tsdb/tsm1/file_store_test.go b/tsdb/tsm1/file_store_test.go index 92407729ed..ef62b6e9c6 100644 --- a/tsdb/tsm1/file_store_test.go +++ b/tsdb/tsm1/file_store_test.go @@ -12,7 +12,7 @@ import ( "testing" "time" - "github.com/influxdata/influxdb/logger" + "github.com/influxdata/platform/logger" "github.com/influxdata/platform/tsdb/tsm1" ) diff --git a/tsdb/tsm1/iterator_test.go b/tsdb/tsm1/iterator_test.go index 6327a01fa4..562ab13ed5 100644 --- a/tsdb/tsm1/iterator_test.go +++ b/tsdb/tsm1/iterator_test.go @@ -6,9 +6,9 @@ import ( "testing" "time" - "github.com/influxdata/influxdb/logger" "github.com/influxdata/influxdb/query" "github.com/influxdata/influxql" + "github.com/influxdata/platform/logger" ) func BenchmarkIntegerIterator_Next(b *testing.B) {