influxdb/query/functions/to_kafka.go

package functions

import (
	"bufio"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"sort"
	"sync"
	"time"

	"github.com/cespare/xxhash"
	protocol "github.com/influxdata/line-protocol"
	"github.com/influxdata/platform/query"
	"github.com/influxdata/platform/query/execute"
	"github.com/influxdata/platform/query/plan"
	"github.com/influxdata/platform/query/semantic"
	"github.com/pkg/errors"
	kafka "github.com/segmentio/kafka-go"
)

const (
	// ToKafkaKind is the Kind for the ToKafka Flux function.
	ToKafkaKind = "toKafka"
)
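
// An illustrative Flux invocation of toKafka (a sketch; the bucket, broker
// address, and topic below are placeholders, not values from this package):
//
//	from(bucket: "example")
//		|> range(start: -1h)
//		|> toKafka(brokers: ["localhost:9092"], topic: "example-topic")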

// ToKafkaOpSpec defines the arguments accepted by the toKafka function.
type ToKafkaOpSpec struct {
	Brokers      []string `json:"brokers"`
	Topic        string   `json:"topic"`
	Balancer     string   `json:"balancer"`
	Name         string   `json:"name"`
	NameColumn   string   `json:"nameColumn"` // either Name or NameColumn should be set; if neither is, the "_measurement" column is used.
	TimeColumn   string   `json:"timeColumn"`
	TagColumns   []string `json:"tagColumns"`
	ValueColumns []string `json:"valueColumns"`
	MsgBufSize   int      `json:"msgBufferSize"` // the maximum number of messages to buffer before sending to Kafka; the library we use defaults to 100
}

var ToKafkaSignature = query.DefaultFunctionSignature()

func init() {
	ToKafkaSignature.Params["brokers"] = semantic.NewArrayType(semantic.String)
	ToKafkaSignature.Params["topic"] = semantic.String
	ToKafkaSignature.Params["balancer"] = semantic.String
	ToKafkaSignature.Params["name"] = semantic.String
	ToKafkaSignature.Params["nameColumn"] = semantic.String
	ToKafkaSignature.Params["timeColumn"] = semantic.String
	ToKafkaSignature.Params["tagColumns"] = semantic.NewArrayType(semantic.String)
	ToKafkaSignature.Params["valueColumns"] = semantic.NewArrayType(semantic.String)

	query.RegisterFunctionWithSideEffect(ToKafkaKind, createToKafkaOpSpec, ToKafkaSignature)
	query.RegisterOpSpec(ToKafkaKind,
		func() query.OperationSpec { return &ToKafkaOpSpec{} })
	plan.RegisterProcedureSpec(ToKafkaKind, newToKafkaProcedure, ToKafkaKind)
	execute.RegisterTransformation(ToKafkaKind, createToKafkaTransformation)
}

// DefaultKafkaWriterFactory creates the kafka writer used by toKafka; it is a
// package-level variable so tests can inject their own KafkaWriter.
var DefaultKafkaWriterFactory = func(conf kafka.WriterConfig) KafkaWriter {
	return kafka.NewWriter(conf)
}

// KafkaWriter is an interface for what we need from DefaultKafkaWriterFactory.
type KafkaWriter interface {
	io.Closer
	WriteMessages(context.Context, ...kafka.Message) error
}
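
// For example, a test might inject a fake writer (an illustrative sketch; the
// fakeWriter type below is hypothetical, not part of this package):
//
//	type fakeWriter struct{ msgs []kafka.Message }
//
//	func (f *fakeWriter) WriteMessages(_ context.Context, ms ...kafka.Message) error {
//		f.msgs = append(f.msgs, ms...)
//		return nil
//	}
//
//	func (f *fakeWriter) Close() error { return nil }
//
//	DefaultKafkaWriterFactory = func(kafka.WriterConfig) KafkaWriter { return &fakeWriter{} }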

// ReadArgs loads a query.Arguments into ToKafkaOpSpec. It sets several default values.
// If timeColumn isn't set, it defaults to execute.DefaultTimeColLabel.
// If valueColumns isn't set, it defaults to []string{execute.DefaultValueColLabel}.
func (o *ToKafkaOpSpec) ReadArgs(args query.Arguments) error {
	var err error
	var ok bool

	brokers, err := args.GetRequiredArray("brokers", semantic.String)
	if err != nil {
		return err
	}
	l := brokers.Len()
	if l < 1 {
		return errors.New("at least one broker is required")
	}
	o.Brokers = make([]string, l)
	for i := 0; i < l; i++ {
		o.Brokers[i] = brokers.Get(i).Str()
	}

	o.Topic, err = args.GetRequiredString("topic")
	if err != nil {
		return err
	}
	if len(o.Topic) == 0 {
		return errors.New("invalid topic name")
	}

	o.Balancer, _, err = args.GetString("balancer")
	if err != nil {
		return err
	}

	o.Name, ok, err = args.GetString("name")
	if err != nil {
		return err
	}
	if !ok {
		o.NameColumn, ok, err = args.GetString("nameColumn")
		if err != nil {
			return err
		}
		if !ok {
			o.NameColumn = "_measurement"
		}
	}

	o.TimeColumn, ok, err = args.GetString("timeColumn")
	if err != nil {
		return err
	}
	if !ok {
		o.TimeColumn = execute.DefaultTimeColLabel
	}

	tagColumns, ok, err := args.GetArray("tagColumns", semantic.String)
	if err != nil {
		return err
	}
	o.TagColumns = o.TagColumns[:0]
	if ok {
		for i := 0; i < tagColumns.Len(); i++ {
			o.TagColumns = append(o.TagColumns, tagColumns.Get(i).Str())
		}
		sort.Strings(o.TagColumns)
	}

	valueColumns, ok, err := args.GetArray("valueColumns", semantic.String)
	if err != nil {
		return err
	}
	o.ValueColumns = o.ValueColumns[:0]
	if !ok || valueColumns.Len() == 0 {
		o.ValueColumns = append(o.ValueColumns, execute.DefaultValueColLabel)
	} else {
		for i := 0; i < valueColumns.Len(); i++ {
			o.ValueColumns = append(o.ValueColumns, valueColumns.Get(i).Str())
		}
		sort.Strings(o.ValueColumns)
	}

	msgBufSize, ok, err := args.GetInt("msgBufferSize")
	if err != nil {
		return err
	}
	o.MsgBufSize = int(msgBufSize)
	if !ok || o.MsgBufSize < 0 {
		o.MsgBufSize = 0 // zero lets the kafka library fall back to its own default
	}
	return nil
}
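
// With no column arguments, toKafka falls back to the conventional columns; an
// invocation such as the following (illustrative broker and topic) writes the
// "_time" column as the timestamp, "_value" as the single field, and
// "_measurement" as the measurement name:
//
//	toKafka(brokers: ["localhost:9092"], topic: "example-topic")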

func createToKafkaOpSpec(args query.Arguments, a *query.Administration) (query.OperationSpec, error) {
	if err := a.AddParentFromArgs(args); err != nil {
		return nil, err
	}
	s := new(ToKafkaOpSpec)
	if err := s.ReadArgs(args); err != nil {
		return nil, err
	}
	return s, nil
}

func (ToKafkaOpSpec) Kind() query.OperationKind {
	return ToKafkaKind
}

// ToKafkaProcedureSpec is the procedure spec for toKafka; it carries the op
// spec plus the resolved kafka balancer.
type ToKafkaProcedureSpec struct {
	Spec     *ToKafkaOpSpec
	balancer kafka.Balancer
}

func (o *ToKafkaProcedureSpec) Kind() plan.ProcedureKind {
	return ToKafkaKind
}

// Copy returns a deep copy of the procedure spec and resolves the balancer
// named in the op spec.
func (o *ToKafkaProcedureSpec) Copy() plan.ProcedureSpec {
	s := o.Spec
	res := &ToKafkaProcedureSpec{
		Spec: &ToKafkaOpSpec{
			Brokers:      append([]string(nil), s.Brokers...),
			Topic:        s.Topic,
			Balancer:     s.Balancer,
			Name:         s.Name,
			NameColumn:   s.NameColumn,
			TimeColumn:   s.TimeColumn,
			TagColumns:   append([]string(nil), s.TagColumns...),
			ValueColumns: append([]string(nil), s.ValueColumns...),
			MsgBufSize:   s.MsgBufSize,
		},
	}
	switch s.Balancer {
	case "hash", "": // hash is the default for compatibility with enterprise
		res.balancer = &kafka.Hash{}
	case "round-robin":
		res.balancer = &kafka.RoundRobin{}
	case "least-bytes":
		res.balancer = &kafka.LeastBytes{}
	}
	return res
}
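
// For instance, a query can opt into a different partitioning strategy; values
// other than "hash", "round-robin", and "least-bytes" leave the balancer unset
// so the kafka library picks its own default (broker and topic below are
// placeholders):
//
//	toKafka(brokers: ["localhost:9092"], topic: "example-topic", balancer: "round-robin")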

func newToKafkaProcedure(qs query.OperationSpec, a plan.Administration) (plan.ProcedureSpec, error) {
	spec, ok := qs.(*ToKafkaOpSpec)
	if !ok {
		return nil, fmt.Errorf("invalid spec type %T", qs)
	}
	return &ToKafkaProcedureSpec{Spec: spec}, nil
}

func createToKafkaTransformation(id execute.DatasetID, mode execute.AccumulationMode, spec plan.ProcedureSpec, a execute.Administration) (execute.Transformation, execute.Dataset, error) {
	s, ok := spec.(*ToKafkaProcedureSpec)
	if !ok {
		return nil, nil, fmt.Errorf("invalid spec type %T", spec)
	}
	cache := execute.NewTableBuilderCache(a.Allocator())
	d := execute.NewDataset(id, mode, cache)
	t := NewToKafkaTransformation(d, cache, s)
	return t, d, nil
}

// ToKafkaTransformation writes each table it processes to Kafka as line protocol.
type ToKafkaTransformation struct {
	d     execute.Dataset
	cache execute.TableBuilderCache
	spec  *ToKafkaProcedureSpec
}

func NewToKafkaTransformation(d execute.Dataset, cache execute.TableBuilderCache, spec *ToKafkaProcedureSpec) *ToKafkaTransformation {
	return &ToKafkaTransformation{
		d:     d,
		cache: cache,
		spec:  spec,
	}
}

func (t *ToKafkaTransformation) RetractTable(id execute.DatasetID, key query.GroupKey) error {
	return t.d.RetractTable(key)
}

// toKafkaMetric implements the line-protocol Metric interface so a row can be
// handed to protocol.Encoder.Encode.
type toKafkaMetric struct {
	tags   []*protocol.Tag
	fields []*protocol.Field
	name   string
	t      time.Time
}

func (m *toKafkaMetric) TagList() []*protocol.Tag {
	return m.tags
}

func (m *toKafkaMetric) FieldList() []*protocol.Field {
	return m.fields
}

func (m *toKafkaMetric) truncateTagsAndFields() {
	m.fields = m.fields[:0]
	m.tags = m.tags[:0]
}

func (m *toKafkaMetric) Name() string {
	return m.name
}

func (m *toKafkaMetric) Time() time.Time {
	return m.t
}

// Process encodes the rows of tbl as line protocol and writes them to Kafka,
// one message per row.
func (t *ToKafkaTransformation) Process(id execute.DatasetID, tbl query.Table) (err error) {
	w := DefaultKafkaWriterFactory(kafka.WriterConfig{
		Brokers:       t.spec.Spec.Brokers,
		Topic:         t.spec.Spec.Topic,
		Balancer:      t.spec.balancer,
		BatchSize:     t.spec.Spec.MsgBufSize,
		QueueCapacity: t.spec.Spec.MsgBufSize,
	})
	defer func() {
		err2 := w.Close()
		if err == nil && err2 != nil {
			// don't overwrite the current error, but allow Process to return
			// the error from the deferred Close()
			err = err2
		}
	}()
	pr, pw := io.Pipe() // TODO: replace the pipe with something faster
	// I'd like a linereader in line-protocol
	m := &toKafkaMetric{}
	e := protocol.NewEncoder(pw)
	e.FailOnFieldErr(true)
	e.SetFieldSortOrder(protocol.SortFields)
	cols := tbl.Cols()
	labels := make(map[string]idxType, len(cols))
	for i, col := range cols {
		labels[col.Label] = idxType{Idx: i, Type: col.Type}
	}

	// resolve the time column
	timeColLabel := t.spec.Spec.TimeColumn
	timeColIdx, ok := labels[timeColLabel]
	if !ok {
		return errors.New("could not get time column")
	}
	if timeColIdx.Type != query.TTime {
		return fmt.Errorf("column %s is not of type time, got %s", timeColLabel, timeColIdx.Type)
	}
	var measurementNameCol string
	if t.spec.Spec.Name == "" {
		measurementNameCol = t.spec.Spec.NameColumn
	}
	// check whether each column is a tag or a value and cache the result for the row loop
	colMetadatas := tbl.Cols()
	isTag := make([]bool, len(colMetadatas))
	isValue := make([]bool, len(colMetadatas))
	for i, col := range colMetadatas {
		vi := sort.SearchStrings(t.spec.Spec.ValueColumns, col.Label)
		isValue[i] = vi < len(t.spec.Spec.ValueColumns) && t.spec.Spec.ValueColumns[vi] == col.Label
		ti := sort.SearchStrings(t.spec.Spec.TagColumns, col.Label)
		isTag[i] = ti < len(t.spec.Spec.TagColumns) && t.spec.Spec.TagColumns[ti] == col.Label
	}
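
	// The binary searches above assume ReadArgs left TagColumns and ValueColumns
	// sorted; each lookup is a set-membership test of the form (illustrative):
	//
	//	idx := sort.SearchStrings(sortedCols, label)
	//	member := idx < len(sortedCols) && sortedCols[idx] == label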

	m.name = t.spec.Spec.Name
	wg := sync.WaitGroup{}
	wg.Add(1)
	go func() {
		err = tbl.Do(func(er query.ColReader) error {
			l := er.Len()
			for i := 0; i < l; i++ {
				m.truncateTagsAndFields()
				for j, col := range er.Cols() {
					switch {
					case col.Label == timeColLabel:
						m.t = er.Times(j)[i].Time()
					case measurementNameCol != "" && measurementNameCol == col.Label:
						if col.Type != query.TString {
							return errors.New("invalid type for measurement column")
						}
						m.name = er.Strings(j)[i]
					case isTag[j]:
						if col.Type != query.TString {
							return errors.New("invalid type for tag column")
						}
						m.tags = append(m.tags, &protocol.Tag{Key: col.Label, Value: er.Strings(j)[i]})
					case isValue[j]:
						switch col.Type {
						case query.TFloat:
							m.fields = append(m.fields, &protocol.Field{Key: col.Label, Value: er.Floats(j)[i]})
						case query.TInt:
							m.fields = append(m.fields, &protocol.Field{Key: col.Label, Value: er.Ints(j)[i]})
						case query.TUInt:
							m.fields = append(m.fields, &protocol.Field{Key: col.Label, Value: er.UInts(j)[i]})
						case query.TString:
							m.fields = append(m.fields, &protocol.Field{Key: col.Label, Value: er.Strings(j)[i]})
						case query.TTime:
							m.fields = append(m.fields, &protocol.Field{Key: col.Label, Value: er.Times(j)[i]})
						case query.TBool:
							m.fields = append(m.fields, &protocol.Field{Key: col.Label, Value: er.Bools(j)[i]})
						default:
							return fmt.Errorf("invalid type for column %s", col.Label)
						}
					}
				}
				if _, err := e.Encode(m); err != nil {
					return err
				}
			}
			return nil
		})
		pw.Close()
		wg.Done()
	}()
	// write the data to kafka
	{
		scan := bufio.NewScanner(pr)
		batchSize := t.spec.Spec.MsgBufSize
		if batchSize <= 0 {
			batchSize = 128 // fall back to a fixed buffer size when none was requested
		}
		msgBuf := make([]kafka.Message, batchSize)
		i := 0
		// TODO: make this a little more async
		for scan.Scan() {
			v := append([]byte(nil), scan.Bytes()...) // copy, because scan.Bytes()'s result can be overwritten by later calls to Scan()
			if cap(msgBuf[i].Key) != 8 {
				msgBuf[i].Key = make([]byte, 8)
			}
			// key each message with the hash of its value; the hash balancer
			// partitions by message key
			binary.LittleEndian.PutUint64(msgBuf[i].Key, xxhash.Sum64(v))
			msgBuf[i].Value = v
			i++
			if i == len(msgBuf) {
				if err = w.WriteMessages(context.Background(), msgBuf...); err != nil {
					return err
				}
				i = 0
			}
		}
		// send the remainder of the messages
		if i > 0 {
			err = w.WriteMessages(context.Background(), msgBuf[:i]...)
		}
	}
	wg.Wait()
	return err
}

func (t *ToKafkaTransformation) UpdateWatermark(id execute.DatasetID, pt execute.Time) error {
	return t.d.UpdateWatermark(pt)
}

func (t *ToKafkaTransformation) UpdateProcessingTime(id execute.DatasetID, pt execute.Time) error {
	return t.d.UpdateProcessingTime(pt)
}

func (t *ToKafkaTransformation) Finish(id execute.DatasetID, err error) {
	t.d.Finish(err)
}