influxdb/write/batcher.go

package write

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"time"

	platform "github.com/influxdata/influxdb"
)

const (
	// DefaultMaxBytes is 500KB; this is typically 250 to 500 lines.
	// It is the flush threshold used when Batcher.MaxFlushBytes is left at zero.
	DefaultMaxBytes = 500000
	// DefaultInterval will flush every 10 seconds.
	// It is the flush interval used when Batcher.MaxFlushInterval is left at zero.
	DefaultInterval = 10 * time.Second
)

// Compile-time assertion that *Batcher satisfies platform.WriteService, so a
// Batcher can itself be used as a write service in front of another one.
var _ platform.WriteService = (*Batcher)(nil)

// Batcher batches line protocol for sends to output.
//
// Lines read by Write are accumulated in memory and forwarded to Service
// whenever the buffered size reaches MaxFlushBytes, the flush timer of
// MaxFlushInterval expires with data buffered, or the input is exhausted.
// A zero MaxFlushBytes or MaxFlushInterval selects DefaultMaxBytes or
// DefaultInterval respectively. Service must be set; Write returns an error
// when it is nil.
type Batcher struct {
	MaxFlushBytes    int                   // MaxFlushBytes is the number of buffered bytes that triggers a flush.
	MaxFlushInterval time.Duration         // MaxFlushInterval is the maximum amount of time to wait before flushing buffered data.
	Service          platform.WriteService // Service receives batches flushed from Batcher.
}

// Write splits r into lines, groups them into batches, and forwards every
// batch to b.Service for the given org and bucket.
//
// It starts one goroutine that reads lines from r and one that batches and
// writes them, then waits for both to report. The first non-nil error from
// either goroutine, or the context's error if ctx finishes first, is returned
// immediately; nil is returned only after both have reported success. An
// error is returned without starting any work when b.Service is nil.
func (b *Batcher) Write(ctx context.Context, org, bucket platform.ID, r io.Reader) error {
	ctx, cancel := context.WithCancel(ctx)
	// Cancelling on return tells whichever goroutine is still running to stop.
	defer cancel()

	if b.Service == nil {
		return fmt.Errorf("destination write service required")
	}

	// Each goroutine reports exactly once. The capacity of two lets both
	// deliver their result even when Write has already returned early.
	results := make(chan error, 2)
	lineC := make(chan []byte)

	go b.write(ctx, org, bucket, lineC, results)
	go b.read(ctx, r, lineC, results)

	// Wait for both reports: a clean exit of the reader alone is not enough,
	// the writer may still be flushing the final batch.
	for pending := 2; pending > 0; pending-- {
		select {
		case err := <-results:
			// Only an actual error ends the wait early.
			if err != nil {
				return err
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	return nil
}

// read scans r line by line (split with ScanLines) and sends each line on the
// lines channel.
//
// Exactly one result is sent on errC: the context's error if ctx is done
// while a line is waiting to be handed over, otherwise scanner.Err() once the
// input is exhausted (nil on a clean EOF). The lines channel is closed when
// read returns, which is after the result has been sent on errC.
//
// The scanner uses bufio's default buffer, so a single line longer than
// bufio.MaxScanTokenSize stops the scan and is reported as bufio.ErrTooLong.
//
// It is possible for an io.Reader to block forever inside Read; Write's
// context can be used to cancel, but cancellation is only observed between
// lines, so a blocked reader can leave a dangling read goroutine.
func (b *Batcher) read(ctx context.Context, r io.Reader, lines chan<- []byte, errC chan<- error) {
	defer close(lines)
	scanner := bufio.NewScanner(r)
	scanner.Split(ScanLines)
	for scanner.Scan() {
		// scanner.Bytes() points into the scanner's internal buffer, which the
		// next call to Scan may overwrite. Copy it once so the receiver owns
		// the data; going through scanner.Text() would copy it twice.
		line := append([]byte(nil), scanner.Bytes()...)

		// exit early if the context is done
		select {
		case lines <- line:
		case <-ctx.Done():
			errC <- ctx.Err()
			return
		}
	}
	errC <- scanner.Err()
}

// write accumulates the lines received on the lines channel and forwards them
// to b.Service in batches.
//
// A batch is flushed when the buffer reaches the byte threshold, when the
// flush timer expires while data is buffered, or when the lines channel is
// closed with data still buffered. Non-positive values of b.MaxFlushInterval
// and b.MaxFlushBytes select DefaultInterval and DefaultMaxBytes.
//
// write finishes when the lines channel is closed or the context is done, and
// sends exactly one value on errC before returning: the error returned by the
// write service, the context's error, or nil after a clean shutdown.
func (b *Batcher) write(ctx context.Context, org, bucket platform.ID, lines <-chan []byte, errC chan<- error) {
	flushInterval := b.MaxFlushInterval
	if flushInterval <= 0 {
		// A negative interval would make the timer fire continuously.
		flushInterval = DefaultInterval
	}

	maxBytes := b.MaxFlushBytes
	if maxBytes <= 0 {
		// A negative capacity would panic in make below.
		maxBytes = DefaultMaxBytes
	}

	timer := time.NewTimer(flushInterval)
	defer func() { _ = timer.Stop() }()

	buf := make([]byte, 0, maxBytes)
	r := bytes.NewReader(buf)

	// flush hands the buffered bytes to the destination service and, on
	// success, empties the buffer while keeping its capacity for reuse.
	flush := func() error {
		r.Reset(buf)
		if err := b.Service.Write(ctx, org, bucket, r); err != nil {
			return err
		}
		buf = buf[:0]
		return nil
	}

	var line []byte
	more := true
	// if read closes the channel normally, exit the loop
	for more {
		select {
		case line, more = <-lines:
			if more {
				buf = append(buf, line...)
			}
			// write if we reach the max bytes OR the read routine has finished
			if len(buf) >= maxBytes || (!more && len(buf) > 0) {
				// The timer may already have fired without being received.
				// Stop it and discard any pending tick before re-arming, so a
				// stale tick cannot flush the next batch prematurely.
				if !timer.Stop() {
					select {
					case <-timer.C:
					default:
					}
				}
				timer.Reset(flushInterval)
				if err := flush(); err != nil {
					errC <- err
					return
				}
			}
		case <-timer.C:
			// Always re-arm, even when there is nothing to flush; otherwise the
			// interval would never fire again for data that arrives later.
			timer.Reset(flushInterval)
			if len(buf) > 0 {
				if err := flush(); err != nil {
					errC <- err
					return
				}
			}
		case <-ctx.Done():
			errC <- ctx.Err()
			return
		}
	}

	errC <- nil
}

// ScanLines is a bufio.SplitFunc for line protocol. Unlike bufio.ScanLines,
// each returned token keeps its trailing '\n'.
//
// When data contains a newline, the token is everything up to and including
// the first one. When there is no newline and atEOF is true, the remaining
// non-empty data is returned as a final, unterminated line. In every other
// case it returns (0, nil, nil), which asks the scanner for more data or, at
// EOF with nothing left, ends the scan. The returned error is always nil.
func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	switch nl := bytes.IndexByte(data, '\n'); {
	case nl >= 0:
		// A complete, newline-terminated line is available.
		end := nl + 1
		return end, data[:end], nil
	case atEOF && len(data) > 0:
		// Last line of the input, without a terminating newline.
		return len(data), data, nil
	default:
		// Nothing usable yet (or nothing left at EOF).
		return 0, nil, nil
	}
}
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`package write`

			`import (`
			`"bufio"`
			`"bytes"`
			`"context"`
			`"fmt"`
			`"io"`
			`"time"`

chore: rename imports from platform to influxdb I did this with a dumb editor macro, so some comments changed too. Also rename root package from platform to influxdb. In interest of minimizing risk, anyone importing the root package has now aliased it to "platform" so that no changes beyond imports were necessary in those files. Lastly, replace the old platform module to local path /dev/null so that nobody can accidentally reintroduce a platform dependency while migrating platform code to influxdb. 2019-01-08 00:37:16 +00:00			`platform "github.com/influxdata/influxdb"`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`)`

			`const (`
			`// DefaultMaxBytes is 500KB; this is typically 250 to 500 lines.`
			`DefaultMaxBytes = 500000`
			`// DefaultInterval will flush every 10 seconds.`
			`DefaultInterval = 10 * time.Second`
			`)`

			`// batcher is a write service that batches for another write service.`
			`var _ platform.WriteService = (*Batcher)(nil)`

			`// Batcher batches line protocol for sends to output.`
			`type Batcher struct {`
			`MaxFlushBytes int // MaxFlushBytes is the maximum number of bytes to buffer before flushing`
			`MaxFlushInterval time.Duration // MaxFlushInterval is the maximum amount of time to wait before flushing`
			`Service platform.WriteService // Service receives batches flushed from Batcher.`
			`}`

			`// Write reads r in batches and sends to the output.`
			`func (b *Batcher) Write(ctx context.Context, org, bucket platform.ID, r io.Reader) error {`
test(write): add batcher tests 2018-10-25 17:31:22 +00:00			`ctx, cancel := context.WithCancel(ctx)`
			`defer cancel()`

feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`if b.Service == nil {`
			`return fmt.Errorf("destination write service required")`
			`}`

			`lines := make(chan []byte)`

fix(write): fix close logic to avoid race from read error In the case that there is a read error, we would close the lines channel before sending the error into the read error channel. closing lines then allows the write goroutine to possibly send in a nil error before read is able to, causing the main function driving both to return a nil error. Additionally, it is possible for both reads and writes to race sending errors into their channels, and the main goroutine will only read from one, causing the other goroutine to leak. To fix this, we close lines only after we have sent an error into the channel, we ensure we read from both errors to make sure that both have exited, and we unify the channels and add a buffer of size two to the channel. It is possible for write to exit leaving read blocked forever, but write only exits with a nil error when read has exited, so this only happens during an actual write error, just like before. Channels are hard. 2018-12-29 22:08:09 +00:00			`errC := make(chan error, 2)`
			`go b.write(ctx, org, bucket, lines, errC)`
			`go b.read(ctx, r, lines, errC)`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00
fix(write): fix close logic to avoid race from read error In the case that there is a read error, we would close the lines channel before sending the error into the read error channel. closing lines then allows the write goroutine to possibly send in a nil error before read is able to, causing the main function driving both to return a nil error. Additionally, it is possible for both reads and writes to race sending errors into their channels, and the main goroutine will only read from one, causing the other goroutine to leak. To fix this, we close lines only after we have sent an error into the channel, we ensure we read from both errors to make sure that both have exited, and we unify the channels and add a buffer of size two to the channel. It is possible for write to exit leaving read blocked forever, but write only exits with a nil error when read has exited, so this only happens during an actual write error, just like before. Channels are hard. 2018-12-29 22:08:09 +00:00			`// we loop twice to check if both read and write have an error. if read exits`
			`// cleanly, then we still want to wait for write.`
			`for i := 0; i < 2; i++ {`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`select {`
			`case <-ctx.Done():`
			`return ctx.Err()`
fix(write): fix close logic to avoid race from read error In the case that there is a read error, we would close the lines channel before sending the error into the read error channel. closing lines then allows the write goroutine to possibly send in a nil error before read is able to, causing the main function driving both to return a nil error. Additionally, it is possible for both reads and writes to race sending errors into their channels, and the main goroutine will only read from one, causing the other goroutine to leak. To fix this, we close lines only after we have sent an error into the channel, we ensure we read from both errors to make sure that both have exited, and we unify the channels and add a buffer of size two to the channel. It is possible for write to exit leaving read blocked forever, but write only exits with a nil error when read has exited, so this only happens during an actual write error, just like before. Channels are hard. 2018-12-29 22:08:09 +00:00			`case err := <-errC:`
			`// onky if there is any error, exit immediately.`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`if err != nil {`
			`return err`
			`}`
			`}`
			`}`
fix(write): fix close logic to avoid race from read error In the case that there is a read error, we would close the lines channel before sending the error into the read error channel. closing lines then allows the write goroutine to possibly send in a nil error before read is able to, causing the main function driving both to return a nil error. Additionally, it is possible for both reads and writes to race sending errors into their channels, and the main goroutine will only read from one, causing the other goroutine to leak. To fix this, we close lines only after we have sent an error into the channel, we ensure we read from both errors to make sure that both have exited, and we unify the channels and add a buffer of size two to the channel. It is possible for write to exit leaving read blocked forever, but write only exits with a nil error when read has exited, so this only happens during an actual write error, just like before. Channels are hard. 2018-12-29 22:08:09 +00:00			`return nil`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`}`

			`// read will close the line channel when there is no more data, or an error occurs.`
			`// it is possible for an io.Reader to block forever; Write's context can be`
			`// used to cancel, but, it's possible there will be dangling read go routines.`
			`func (b *Batcher) read(ctx context.Context, r io.Reader, lines chan<- []byte, errC chan<- error) {`
fix(write): fix close logic to avoid race from read error In the case that there is a read error, we would close the lines channel before sending the error into the read error channel. closing lines then allows the write goroutine to possibly send in a nil error before read is able to, causing the main function driving both to return a nil error. Additionally, it is possible for both reads and writes to race sending errors into their channels, and the main goroutine will only read from one, causing the other goroutine to leak. To fix this, we close lines only after we have sent an error into the channel, we ensure we read from both errors to make sure that both have exited, and we unify the channels and add a buffer of size two to the channel. It is possible for write to exit leaving read blocked forever, but write only exits with a nil error when read has exited, so this only happens during an actual write error, just like before. Channels are hard. 2018-12-29 22:08:09 +00:00			`defer close(lines)`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`scanner := bufio.NewScanner(r)`
			`scanner.Split(ScanLines)`
			`for scanner.Scan() {`
			`// exit early if the context is done`
			`select {`
fix(cmd/write): remove shared buffer access that caused race condition on large imports 2020-02-14 17:47:11 +00:00			`case lines <- []byte(scanner.Text()):`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`case <-ctx.Done():`
			`errC <- ctx.Err()`
			`return`
			`}`
			`}`
			`errC <- scanner.Err()`
			`}`

			`// finishes when the lines channel is closed or context is done.`
			`// if an error occurs while writing data to the write service, the error is send in the`
			`// errC channel and the function returns.`
			`func (b *Batcher) write(ctx context.Context, org, bucket platform.ID, lines <-chan []byte, errC chan<- error) {`
			`flushInterval := b.MaxFlushInterval`
			`if flushInterval == 0 {`
			`flushInterval = DefaultInterval`
			`}`

			`maxBytes := b.MaxFlushBytes`
			`if maxBytes == 0 {`
			`maxBytes = DefaultMaxBytes`
			`}`

refactor(write): reuse timers 2018-10-26 00:10:06 +00:00			`timer := time.NewTimer(flushInterval)`
			`defer func() { _ = timer.Stop() }()`

feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`buf := make([]byte, 0, maxBytes)`
refactor(write): reuse bytes readers 2018-10-25 23:51:24 +00:00			`r := bytes.NewReader(buf)`
refactor(write): reuse timers 2018-10-26 00:10:06 +00:00
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`var line []byte`
			`var more = true`
			`// if read closes the channel normally, exit the loop`
			`for more {`
			`select {`
			`case line, more = <-lines:`
			`if more {`
			`buf = append(buf, line...)`
			`}`
			`// write if we exceed the max lines OR read routine has finished`
			`if len(buf) >= maxBytes \|\| (!more && len(buf) > 0) {`
refactor(write): reuse bytes readers 2018-10-25 23:51:24 +00:00			`r.Reset(buf)`
refactor(write): reuse timers 2018-10-26 00:10:06 +00:00			`timer.Reset(flushInterval)`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`if err := b.Service.Write(ctx, org, bucket, r); err != nil {`
			`errC <- err`
			`return`
			`}`
			`buf = buf[:0]`
			`}`
refactor(write): reuse timers 2018-10-26 00:10:06 +00:00			`case <-timer.C:`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`if len(buf) > 0 {`
refactor(write): reuse bytes readers 2018-10-25 23:51:24 +00:00			`r.Reset(buf)`
refactor(write): reuse timers 2018-10-26 00:10:06 +00:00			`timer.Reset(flushInterval)`
feat(write): add line protocol batch writing 2018-10-24 20:51:28 +00:00			`if err := b.Service.Write(ctx, org, bucket, r); err != nil {`
			`errC <- err`
			`return`
			`}`
			`buf = buf[:0]`
			`}`
			`case <-ctx.Done():`
			`errC <- ctx.Err()`
			`return`
			`}`
			`}`

			`errC <- nil`
			`}`

			`// ScanLines is used in bufio.Scanner.Split to split lines of line protocol.`
			`func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {`
			`if atEOF && len(data) == 0 {`
			`return 0, nil, nil`
			`}`

			`if i := bytes.IndexByte(data, '\n'); i >= 0 {`
			`// We have a full newline-terminated line.`
			`return i + 1, data[0 : i+1], nil`

			`}`

			`// If we're at EOF, we have a final, non-terminated line. Return it.`
			`if atEOF {`
			`return len(data), data, nil`
			`}`

			`// Request more data.`
			`return 0, nil, nil`
			`}`