diff --git a/content/telegraf/v1/data_formats/input/csv.md b/content/telegraf/v1/data_formats/input/csv.md
index 435f37ca2..97217ee67 100644
--- a/content/telegraf/v1/data_formats/input/csv.md
+++ b/content/telegraf/v1/data_formats/input/csv.md
@@ -18,146 +18,248 @@ Use the `csv` input data format to parse comma-separated values into Telegraf me
 [[inputs.file]]
   files = ["example"]

-  ## Data format to consume.
-  ## Each data format has its own unique set of configuration options, read
-  ## more about them here:
-  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
+  ## The data format to consume.
+  ## Type: string
+  ## Each data format has its own unique set of configuration options.
+  ## For more information about input data formats and options,
+  ## see https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
   data_format = "csv"

-  ## Indicates how many rows to treat as a header. By default, the parser assumes
-  ## there is no header and will parse the first row as data. If set to anything more
-  ## than 1, column names will be concatenated with the name listed in the next header row.
-  ## If `csv_column_names` is specified, the column names in header will be overridden.
+  ## Specifies the number of rows to treat as the header.
+  ## Type: integer
+  ## Default: 0
+  ## The value can be 0 or greater.
+  ## If `0`, doesn't use a header; the parser treats all rows as data and uses the names specified in `csv_column_names`.
+  ## If `1`, uses the first row as the header.
+  ## If greater than `1`, concatenates that number of header values for each column name.
+  ## Values specified in `csv_column_names` override column names in the header.
   csv_header_row_count = 0

-  ## For assigning custom names to columns
-  ## If this is specified, all columns should have a name
-  ## Unnamed columns will be ignored by the parser.
-  ## If `csv_header_row_count` is set to 0, this config must be used
+  ## Specifies custom names for columns.
+  ## Type: []string
+  ## Default: []
+  ## Specify names in order by column; unnamed columns are ignored by the parser.
+  ## Required if `csv_header_row_count` is set to `0`.
   csv_column_names = []

-  ## For assigning explicit data types to columns.
-  ## Supported types: "int", "float", "bool", "string".
-  ## Specify types in order by column (e.g. `["string", "int", "float"]`)
-  ## If this is not specified, type conversion will be done on the types above.
+  ## Specifies data types for columns.
+  ## Type: []string
+  ## Default: Tries to convert each column to one of the possible types, in the following order: "int", "float", "bool", "string".
+  ## Possible values: "int", "float", "bool", "string".
+  ## Specify types in order by column (for example, `["string", "int", "float"]`).
   csv_column_types = []

-  ## Indicates the number of rows to skip before looking for metadata and header information.
+  ## Specifies the number of rows to skip before looking for metadata and header information.
+  ## Type: integer
+  ## Default: 0
   csv_skip_rows = 0

-  ## Indicates the number of rows to parse as metadata before looking for header information.
-  ## By default, the parser assumes there are no metadata rows to parse.
-  ## If set, the parser would use the provided separators in the csv_metadata_separators to look for metadata.
-  ## Please note that by default, the (key, value) pairs will be added as tags.
-  ## If fields are required, use the converter processor.
+  ## Specifies the number of rows to parse as metadata (before looking for header information).
+  ## Type: integer
+  ## Default: 0; no metadata rows to parse.
+  ## If set, parses the rows using the characters specified in `csv_metadata_separators`, and then adds the
+  ## parsed key-value pairs as tags in the data.
+  ## To convert the tags to fields, use the converter processor.
   csv_metadata_rows = 0

-  ## A list of metadata separators. If csv_metadata_rows is set,
-  ## csv_metadata_separators must contain at least one separator.
-  ## Please note that separators are case sensitive and the sequence of the separators are respected.
+  ## Specifies metadata separators, in order of precedence, for parsing metadata rows.
+  ## Type: []string
+  ## At least one separator is required if `csv_metadata_rows` is set.
+  ## The specified values set the order of precedence for separators used to parse `csv_metadata_rows` into key-value pairs.
+  ## Separators are case-sensitive.
   csv_metadata_separators = [":", "="]

-  ## A set of metadata trim characters.
-  ## If csv_metadata_trim_set is not set, no trimming is performed.
-  ## Please note that the trim cutset is case sensitive.
+  ## Specifies a set of characters to trim from metadata rows.
+  ## Type: string
+  ## Default: empty; the parser doesn't trim metadata rows.
+  ## Trim characters are case-sensitive.
   csv_metadata_trim_set = ""

-  ## Indicates the number of columns to skip before looking for data to parse.
-  ## These columns will be skipped in the header as well.
+  ## Specifies the number of columns to skip in header and data rows.
+  ## Type: integer
+  ## Default: 0; no columns are skipped.
   csv_skip_columns = 0

-  ## The separator between csv fields
-  ## By default, the parser assumes a comma (",")
-  ## Please note that if you use invalid delimiters (e.g. "\u0000"), commas
-  ## will be changed to "\ufffd", the invalid delimiters changed to a comma
-  ## during parsing, and afterwards the invalid characters and commas are
-  ## returned to their original values.
+  ## Specifies the separator for columns in the CSV.
+  ## Type: string
+  ## Default: a comma (`,`)
+  ## If you specify an invalid delimiter (for example, `"\u0000"`),
+  ## the parser converts commas to `"\ufffd"` and converts invalid delimiters
+  ## to commas, parses the data, and then reverts invalid characters and commas
+  ## to their original values.
   csv_delimiter = ","

-  ## The character reserved for marking a row as a comment row
-  ## Commented rows are skipped and not parsed
+  ## Specifies the character used to indicate a comment row.
+  ## Type: string
+  ## Default: empty; no rows are treated as comments.
+  ## The parser skips rows that begin with the specified character.
   csv_comment = ""

-  ## If set to true, the parser will remove leading whitespace from fields
-  ## By default, this is false
+  ## Specifies whether to remove leading whitespace from fields.
+  ## Type: boolean
+  ## Default: false
   csv_trim_space = false

-  ## Columns listed here will be added as tags. Any other columns
-  ## will be added as fields.
+  ## Specifies columns (by name) to use as tags.
+  ## Type: []string
+  ## Default: empty
+  ## Columns not specified as tags or as the measurement name are considered fields.
   csv_tag_columns = []

-  ## Set to true to let the column tags overwrite the metadata and default tags.
+  ## Specifies whether column tags overwrite metadata and default tags.
+  ## Type: boolean
+  ## Default: false
+  ## If true, the column tag value takes precedence over metadata
+  ## or default tags that have the same name.
   csv_tag_overwrite = false

-  ## The column to extract the name of the metric from. Will not be
-  ## included as field in metric.
+  ## Specifies the CSV column to use for the measurement name.
+  ## Type: string
+  ## Default: empty; uses the input plugin name for the measurement name.
+  ## If set, the measurement name is extracted from values in the specified
+  ## column and the column isn't included as a field.
   csv_measurement_column = ""

-  ## The column to extract time information for the metric
-  ## `csv_timestamp_format` must be specified if this is used.
-  ## Will not be included as field in metric.
+  ## Specifies the CSV column to use for the timestamp.
+  ## Type: string
+  ## Default: empty; uses the current system time as the timestamp in metrics.
+  ## If set, the parser extracts time values from the specified column
+  ## to use as timestamps in metrics, and the column isn't included
+  ## as a field in metrics.
+  ## If set, you must also specify a value for `csv_timestamp_format`.
+  ## For more information, see [timestamps](/telegraf/v1/data_formats/input/csv/#timestamps).
   csv_timestamp_column = ""

-  ## The format of time data extracted from `csv_timestamp_column`
-  ## this must be specified if `csv_timestamp_column` is specified
+  ## Specifies the timestamp format for values extracted from `csv_timestamp_column`.
+  ## Type: string
+  ## Possible values: "unix", "unix_ms", "unix_us", "unix_ns", the Go reference time in one of the predefined layouts
+  ## Default: empty
+  ## Required if `csv_timestamp_column` is specified.
+  ## For more information, see [timestamps](/telegraf/v1/data_formats/input/csv/#timestamps).
   csv_timestamp_format = ""

-  ## The timezone of time data extracted from `csv_timestamp_column`
-  ## in case of there is no timezone information.
-  ## It follows the IANA Time Zone database.
+  ## Specifies the time zone to use and outputs location-specific timestamps in metrics.
+  ## Only used if `csv_timestamp_format` is the Go reference time in one of the
+  ## predefined layouts; unix formats are in UTC.
+  ## Type: string
+  ## Default: empty
+  ## Possible values: a time zone name in TZ syntax. For a list of names, see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List.
+  ## For more information, see [timestamps](/telegraf/v1/data_formats/input/csv/#timestamps).
   csv_timezone = ""

-  ## Indicates values to skip, such as an empty string value "".
-  ## The field will be skipped entirely where it matches any values inserted here.
+  ## Specifies values to skip--for example, an empty string (`""`).
+  ## Type: []string
+  ## Default: empty
+  ## The parser skips field values that match any of the specified values.
   csv_skip_values = []

-  ## If set to true, the parser will skip csv lines that cannot be parsed.
-  ## By default, this is false
+  ## Specifies whether to skip CSV lines that can't be parsed.
+  ## Type: boolean
+  ## Default: false
   csv_skip_errors = false

-  ## Reset the parser on given conditions.
-  ## This option can be used to reset the parser's state e.g. when always reading a
-  ## full CSV structure including header etc. Available modes are
-  ## "none" -- do not reset the parser (default)
-  ## "always" -- reset the parser with each call (ignored in line-wise parsing)
-  ## Helpful when e.g. reading whole files in each gather-cycle.
-  # csv_reset_mode = "none"
+  ## Specifies whether to reset the parser after each call.
+  ## Type: string
+  ## Default: "none"
+  ## Possible values:
+  ## - "none": Do not reset the parser.
+  ## - "always": Reset the parser's state after reading each file in the gather
+  ##   cycle. If parsing by line, the setting is ignored.
+  ## Resetting the parser state after parsing each file is helpful when reading
+  ## full CSV structures that include headers or metadata.
+  csv_reset_mode = "none"
 ```

-### csv_timestamp_column, csv_timestamp_format
-
-By default, the current time will be used for all created metrics, to set the
-time using the JSON document you can use the `csv_timestamp_column` and
-`csv_timestamp_format` options together to set the time to a value in the parsed
-document.
-
-The `csv_timestamp_column` option specifies the key containing the time value
-and `csv_timestamp_format` must be set to `unix`, `unix_ms`, `unix_us`,
-`unix_ns`, or a format string in using the Go "reference time" which is defined
-to be the **specific time**: `Mon Jan 2 15:04:05 MST 2006`.
-
-Consult the Go [time][time parse] package for details and additional examples
-on how to set the time format.
-
 ## Metrics

-One metric is created for each row with the columns added as fields. The type
-of the field is automatically determined based on the contents of the value.
+With the default configuration, the CSV data format parser creates one metric
+for each CSV row, and adds CSV columns as fields in the metric.
+A field's data type is automatically determined from its value (unless explicitly defined with `csv_column_types`).

-In addition to the options above, you can use [metric filtering][] to skip over
-columns and rows.
+Data format configuration options let you customize how the parser handles
+specific CSV rows, columns, and data types.
+
+[Metric filtering](/telegraf/v1/configuration/#metric-filtering) and [aggregator and processor plugins](/telegraf/v1/configure_plugins/aggregator_processor/) provide additional data transformation options--for example:
+
+- Use metric filtering to skip columns and rows.
+- Use the [converter processor](https://github.com/influxdata/telegraf/tree/master/plugins/processors/converter/) to convert parsed metadata from tags to fields.
+
+## Timestamps
+
+Every metric has a timestamp--a date and time associated with the fields.
+The default timestamp for created metrics is the _current time_ in UTC.
+
+To use extracted values from the CSV as timestamps for metrics, specify
+the `csv_timestamp_column` and `csv_timestamp_format` options.
+
+### csv_timestamp_column
+
+The `csv_timestamp_column` option specifies the key (column name) in the CSV data
+that contains the time value to extract and use as the timestamp in metrics.
+
+A unix time value may be one of the following data types:
+
+- int64
+- float64
+- string
+
+If you specify a [Go format](https://go.dev/src/time/format.go) for `csv_timestamp_format`,
+values in your timestamp column must be strings.
+
+When using the [`"unix"` format](#csv_timestamp_format), an optional fractional component is allowed.
+Other unix time formats, such as `"unix_ms"`, cannot have a fractional component.
+
+### csv_timestamp_format
+
+If specifying `csv_timestamp_column`, you must also specify the format of timestamps in the column.
+To specify the format, set `csv_timestamp_format` to one of the following values:
+
+- `"unix"`
+- `"unix_ms"`
+- `"unix_us"`
+- `"unix_ns"`
+- a predefined layout from Go [`time` constants](https://pkg.go.dev/time#pkg-constants) using the
+  Go _reference time_--for example, `"Mon Jan _2 15:04:05 MST 2006"` (the `UnixDate` format string).
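+
+For example, the following minimal configuration sketch uses the `"unix"`
+format (the `example` file name and the `time` column are illustrative
+assumptions, not requirements of the format):
+
+```toml
+[[inputs.file]]
+  files = ["example"]
+  data_format = "csv"
+  csv_header_row_count = 1
+  ## The time column contains Unix epoch values--for example,
+  ## `1536843808` or, with an optional fractional component, `1536843808.123`.
+  csv_timestamp_column = "time"
+  csv_timestamp_format = "unix"
+```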
+
+For more information about time formats, see the following:
+
+- [Unix time](https://en.wikipedia.org/wiki/Unix_time) documentation
+- Go [time][time parse] package documentation
+
+### Time zone
+
+Telegraf outputs timestamps in UTC.
+
+To parse location-aware timestamps in your data,
+specify a [`csv_timestamp_format`](#csv_timestamp_format)
+that contains time zone information.
+
+If timestamps in the `csv_timestamp_column` contain a time zone offset, the parser uses the offset to calculate the timestamp in UTC.
+
+If `csv_timestamp_format` and your timestamp data contain a time zone abbreviation, then the parser tries to resolve the abbreviation to a location in the [IANA Time Zone Database](https://www.iana.org/time-zones) and return a UTC offset for that location.
+To set the location that the parser should use when resolving time zone abbreviations, specify a value for `csv_timezone`, following the TZ syntax in the [Internet Assigned Numbers Authority time zone database](https://en.wikipedia.org/wiki/List_of_tz_database_time_zones#List).
+
+{{% warn %}}
+Prior to Telegraf v1.27, the Telegraf parser ignored abbreviated time zones (for example, "EST") in parsed time values, and used UTC for the timestamp location.
+{{% /warn %}}

 ## Examples

-Config:
+### Extract timestamps from a time column using RFC3339 format
+
+Configuration:

 ```toml
+[agent]
+  omit_hostname = true
 [[inputs.file]]
   files = ["example"]
   data_format = "csv"
   csv_header_row_count = 1
+  csv_measurement_column = "measurement"
   csv_timestamp_column = "time"
   csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
+[[outputs.file]]
+  files = ["metrics.out"]
+  influx_sort_fields = true
 ```

 Input:

@@ -167,25 +269,135 @@ measurement,cpu,time_user,time_system,time_idle,time
 cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
 ```

-Output:
-
-```text
-cpu cpu=cpu0,time_user=42,time_system=42,time_idle=42 1536869008000000000
-```
-
-Config:
-
-```toml
+
+Output:
+
+```
+cpu cpu="cpu0",time_idle=42i,time_system=42i,time_user=42i 1536843808000000000
+```
+
+### Parse time zone abbreviations in timestamps
+
+The following example specifies `csv_timezone` for resolving an abbreviated time zone (`EST`) in the input data.
+
+Configuration:
+
+```toml
+[agent]
+  omit_hostname = true
+[[inputs.file]]
+  files = ["example"]
+  data_format = "csv"
+  csv_header_row_count = 1
+  csv_measurement_column = "measurement"
+  csv_timestamp_column = "time"
+  csv_timestamp_format = "Mon, 02 Jan 2006 15:04:05 MST"
+  csv_timezone = "America/New_York"
+[[outputs.file]]
+  files = ["metrics.out"]
+  influx_sort_fields = true
+```
+
+Input:
+
+```csv
+measurement,cpu,time_user,time_system,time_idle,time
+cpu,cpu1,42,42,42,"Mon, 02 Jan 2006 15:04:05 EST"
+cpu,cpu1,42,42,42,"Mon, 02 Jan 2006 15:04:05 GMT"
+```
+
+The parser resolves the `EST` and `GMT` abbreviations and outputs the following:
+
+```
+cpu cpu="cpu1",time_idle=42i,time_system=42i,time_user=42i 1136232245000000000
+cpu cpu="cpu1",time_idle=42i,time_system=42i,time_user=42i 1136214245000000000
+```
+
+The timestamps represent the following dates, respectively:
+
+```text
+2006-01-02 20:04:05
+2006-01-02 15:04:05
+```
+
+### Parse metadata into tags
+
+Configuration:
+
+```toml
+[agent]
+  omit_hostname = true
+[[inputs.file]]
+  files = ["example"]
+  data_format = "csv"
+  csv_measurement_column = "measurement"
   csv_metadata_rows = 2
   csv_metadata_separators = [":", "="]
-  csv_metadata_trim_set = " #"
+  csv_metadata_trim_set = "# "
   csv_header_row_count = 1
   csv_tag_columns = ["Version","cpu"]
   csv_timestamp_column = "time"
   csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
+[[outputs.file]]
+  files = ["metrics.out"]
+  influx_sort_fields = true
 ```

 Input:

@@ -197,18 +409,57 @@ Version,measurement,cpu,time_user,time_system,time_idle,time
 1.2,cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
 ```

-Output:
-
-```text
-cpu,cpu=cpu0,File\ Created=2021-11-17T07:02:45+10:00,Version=1.1 time_user=42,time_system=42,time_idle=42 1536869008000000000
-```
-
-Config:
-
-```toml
+
+Output:
+
+```
+cpu,File\ Created=2021-11-17T07:02:45+10:00,Version=1.1,cpu=cpu0 time_idle=42i,time_system=42i,time_user=42i 1536843808000000000
+```
+
+### Allow tag column values to overwrite parsed metadata
+
+Configuration:
+
+```toml
+[agent]
+  omit_hostname = true
+[[inputs.file]]
+  files = ["example"]
+  data_format = "csv"
+  csv_measurement_column = "measurement"
   csv_metadata_rows = 2
   csv_metadata_separators = [":", "="]
   csv_metadata_trim_set = " #"
@@ -217,6 +468,9 @@ Config:
   csv_tag_overwrite = true
   csv_timestamp_column = "time"
   csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
+[[outputs.file]]
+  files = ["metrics.out"]
+  influx_sort_fields = true
 ```

 Input:

@@ -228,10 +482,117 @@ Version,measurement,cpu,time_user,time_system,time_idle,time
 1.2,cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
 ```

+
 Output:

-```text
-cpu,cpu=cpu0,File\ Created=2021-11-17T07:02:45+10:00,Version=1.2 time_user=42,time_system=42,time_idle=42 1536869008000000000
+
+```
+cpu,File\ Created=2021-11-17T07:02:45+10:00,Version=1.2,cpu=cpu0 time_idle=42i,time_system=42i,time_user=42i 1536843808000000000
+```
+
+### Combine multiple header rows
+
+Configuration:
+
+```toml
+[agent]
+  omit_hostname = true
+[[inputs.file]]
+  files = ["example"]
+  data_format = "csv"
+  csv_comment = "#"
+  csv_header_row_count = 2
+  csv_measurement_column = "measurement"
+  csv_timestamp_column = "time"
+  csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
+[[outputs.file]]
+  ## Files to write to.
+  files = ["metrics.out"]
+  ## Use deterministic ordering.
+  influx_sort_fields = true
+```
+
+Input:
+
+```csv
+# Version=1.1
+# File Created: 2021-11-17T07:02:45+10:00
+Version,measurement,cpu,time,time,time,time
+_system,,,_user,_system,_idle,
+1.2,cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
+```
+
+Output:
+
+```
+cpu Version_system=1.2,cpu="cpu0",time_idle=42i,time_system=42i,time_user=42i 1536843808000000000
 ```

 [time parse]: https://pkg.go.dev/time#Parse
diff --git a/test/run-tests.sh b/test/run-tests.sh
index 622279d27..a7f30d15a 100644
--- a/test/run-tests.sh
+++ b/test/run-tests.sh
@@ -96,11 +96,16 @@ gpg -q --batch --yes --delete-key D8FF8E1F7DF8B07E > /dev/null 2>&1
 # Run test commands with options provided in the CMD of the Dockerfile.
 # pytest rootdir is the directory where pytest.ini is located (/test).
 if [ -d ./content/influxdb/cloud-dedicated/ ]; then
-echo "Running cloud-dedicated tests..."
+echo "Running content/influxdb/cloud-dedicated tests..."
 pytest --codeblocks --envfile $BASE_DIR/.env.dedicated ./content/influxdb/cloud-dedicated/ $@
 fi

 if [ -d ./content/influxdb/cloud-serverless/ ]; then
-echo "Running cloud-serverless tests..."
+echo "Running content/influxdb/cloud-serverless tests..."
 pytest --codeblocks --envfile $BASE_DIR/.env.serverless ./content/influxdb/cloud-serverless/ $@
 fi
+
+if [ -d ./content/telegraf/ ]; then
+echo "Running content/telegraf tests..."
+pytest --codeblocks --envfile $BASE_DIR/.env.telegraf ./content/telegraf/ $@
+fi