influxdb/influxql/engine.go

package influxql

import (
	"bytes"
	"errors"
	"hash/fnv"
	"math"
	"sort"
	"time"
)

// DB represents an interface for creating transactions.
// Planner.Plan calls Begin once per planned statement and uses the returned
// Tx to create the statement's map-reduce jobs.
type DB interface {
	// Begin returns a new transaction, or an error if one cannot be created.
	Begin() (Tx, error)
}

const (
	// MaxGroupByPoints is the largest number of result points a group by
	// statement may select; MapReduceJob.Execute reports an error when the
	// computed point count exceeds it. Exceeding it most likely means a group
	// by interval was specified without time boundaries.
	MaxGroupByPoints = 100000

	// SelectColumnCountWithOneValue is the column count of a raw query that
	// selects only a single value: since time is always selected, it is 2.
	SelectColumnCountWithOneValue = 2
)

// Tx represents a transaction.
// The Tx must be opened before being used.
type Tx interface {
	// CreateMapReduceJobs creates the MapReduceJobs for the given select
	// statement. One MRJob will be created per unique tagset that matches the
	// query. tagKeys are the group by tag keys of the statement.
	CreateMapReduceJobs(stmt *SelectStatement, tagKeys []string) ([]*MapReduceJob, error)
}

// MapReduceJob runs a select statement against a single tagset. It owns the
// mappers that read the data and reduces their output into result rows.
// The unexported fields are filled in by Planner.Plan before the job runs.
type MapReduceJob struct {
	MeasurementName string
	TagSet          *TagSet
	Mappers         []Mapper         // the mappers to hit all shards for this MRJob
	TMin            int64            // minimum time specified in the query
	TMax            int64            // maximum time specified in the query
	key             []byte           // a key that identifies the MRJob so it can be sorted
	interval        int64            // the group by interval of the query
	stmt            *SelectStatement // the select statement this job was created for
	chunkSize       int              // the number of points to buffer in raw queries before returning a chunked response
}

// Open opens every mapper of the job in order. When a mapper fails to open,
// the whole job is closed and that mapper's error is returned.
func (m *MapReduceJob) Open() error {
	for i := 0; i < len(m.Mappers); i++ {
		err := m.Mappers[i].Open()
		if err == nil {
			continue
		}
		m.Close()
		return err
	}
	return nil
}

// Close closes every mapper of the job.
func (m *MapReduceJob) Close() {
	for i := range m.Mappers {
		m.Mappers[i].Close()
	}
}

// Key returns the bytes that identify the job for sorting: the measurement
// name followed by the tagset key. The value is built on first use and cached.
func (m *MapReduceJob) Key() []byte {
	if m.key != nil {
		return m.key
	}
	m.key = append([]byte(m.MeasurementName), m.TagSet.Key...)
	return m.key
}

// Execute runs the job and sends its output on out.
//
// Raw queries are delegated to processRawQuery. For aggregate queries the time
// range is divided into group by buckets (a single bucket when no start time
// or no group by interval was given), every aggregate call is reduced across
// all mappers for each bucket, and one Row holding all buckets is sent.
//
// OFFSET skips that many leading buckets and LIMIT caps how many buckets
// follow; a LIMIT of zero means no cap. Nothing is sent when the offset skips
// every bucket, or when filterEmptyResults is set and every aggregate value is
// nil. Failures are reported as a Row with Err set.
//
// Execute may modify m.interval (single bucket case) and m.TMin (when an
// offset is applied).
func (m *MapReduceJob) Execute(out chan *Row, filterEmptyResults bool) {
	// if it's a raw query we handle processing differently
	if m.stmt.IsRawQuery {
		m.processRawQuery(out, filterEmptyResults)
		return
	}

	// get the aggregates and the associated reduce functions
	aggregates := m.stmt.FunctionCalls()
	reduceFuncs := make([]ReduceFunc, len(aggregates))
	for i, c := range aggregates {
		reduceFunc, err := InitializeReduceFunc(c)
		if err != nil {
			out <- &Row{Err: err}
			return
		}
		reduceFuncs[i] = reduceFunc
	}

	// we'll have a fixed number of points with timestamps in buckets. Work out how many there are.
	var pointCountInResult int

	// if the user didn't specify a start time or a group by interval, we're returning a single point that describes the entire range
	if m.TMin == 0 || m.interval == 0 {
		// they want a single aggregate point for the entire time range
		m.interval = m.TMax - m.TMin
		pointCountInResult = 1
	} else {
		intervalTop := m.TMax/m.interval*m.interval + m.interval
		intervalBottom := m.TMin / m.interval * m.interval
		pointCountInResult = int((intervalTop - intervalBottom) / m.interval)
	}

	// For group by time queries, limit the number of data points returned by the limit and offset.
	// Raw query limits are handled elsewhere.
	if m.stmt.Offset > 0 {
		// an offset that skips every bucket leaves nothing to return
		if m.stmt.Offset >= pointCountInResult {
			return
		}
		pointCountInResult -= m.stmt.Offset
	}

	// take the lesser of the remaining number of group by buckets and the limit
	// passed in by the user. A limit of zero means "no limit", so it must not
	// shrink the result.
	if m.stmt.Limit > 0 && m.stmt.Limit < pointCountInResult {
		pointCountInResult = m.stmt.Limit
	}

	// If we are exceeding our MaxGroupByPoints and we aren't a raw query, error out
	if pointCountInResult > MaxGroupByPoints {
		out <- &Row{
			Err: errors.New("too many points in the group by interval. maybe you forgot to specify a where time clause?"),
		}
		return
	}

	// initialize the times of the aggregate points
	resultValues := make([][]interface{}, pointCountInResult)

	// ensure that the start time for the results is on the start of the window
	// NOTE(review): m.interval is zero here when TMax == TMin in the single
	// point case, which makes this division panic — confirm callers never
	// produce an empty time range.
	startTimeBucket := m.TMin / m.interval * m.interval

	for i := range resultValues {
		// bucket i of the result is bucket i+Offset of the full time range
		t := startTimeBucket + int64(i+m.stmt.Offset)*m.interval

		// If we start getting out of our max time range, then truncate values and return
		if t > m.TMax {
			resultValues = resultValues[:i]
			break
		}

		// we always include time so we need one more column than we have aggregates
		vals := make([]interface{}, 0, len(aggregates)+1)
		resultValues[i] = append(vals, time.Unix(0, t).UTC())
	}

	// This just makes sure that if they specify a start time less than what the start time would be with the offset,
	// we just reset the start time to the later time to avoid going over data that won't show up in the result.
	if m.stmt.Offset > 0 && len(resultValues) > 0 {
		m.TMin = resultValues[0][0].(time.Time).UnixNano()
	}

	// now loop through the aggregate functions and populate everything
	for i, c := range aggregates {
		if err := m.processAggregate(c, reduceFuncs[i], resultValues); err != nil {
			out <- &Row{
				Name: m.MeasurementName,
				Tags: m.TagSet.Tags,
				Err:  err,
			}

			return
		}
	}

	// filter out empty results
	if filterEmptyResults && m.resultsEmpty(resultValues) {
		return
	}

	// put together the row to return
	columnNames := make([]string, len(m.stmt.Fields)+1)
	columnNames[0] = "time"
	for i, f := range m.stmt.Fields {
		columnNames[i+1] = f.Name()
	}

	// processes the result values if there's any math in there
	resultValues = m.processResults(resultValues)

	// handle any fill options
	resultValues = m.processFill(resultValues)

	row := &Row{
		Name:    m.MeasurementName,
		Tags:    m.TagSet.Tags,
		Columns: columnNames,
		Values:  resultValues,
	}

	// and we out
	out <- row
}

// processRawQuery will handle running the mappers and then reducing their output
// for queries that pull back raw data values without computing any kind of aggregates.
//
// Each mapper is read in chunks of m.chunkSize points. On every pass, all
// points up to the smallest "last timestamp" across the buffered mapper
// outputs are merged, sorted by time, trimmed by the statement's offset and
// limit, and accumulated. A Row is sent on out whenever at least m.chunkSize
// points have accumulated, and once more at the end for the remainder. If
// nothing at all was accumulated at the end, an empty Row is sent unless
// filterEmptyResults is set. Errors from the mappers are sent as a Row with
// Err set and end processing.
func (m *MapReduceJob) processRawQuery(out chan *Row, filterEmptyResults bool) {
	// initialize the mappers
	for _, mm := range m.Mappers {
		if err := mm.Begin(nil, m.TMin); err != nil {
			out <- &Row{Err: err}
			return
		}
	}

	// the not yet emitted points of each mapper; nil means the mapper must be read again
	mapperOutputs := make([][]*rawQueryMapOutput, len(m.Mappers))
	// markers for which mappers have been completely emptied
	mapperComplete := make([]bool, len(m.Mappers))

	// for limit and offset we need to track how many values we've swallowed for the offset and how many we've already sent for the limit.
	// we track the number sent for the limit because they could be getting chunks. For instance if your limit is 10k, but chunk size is 1k
	valuesSent := 0
	valuesOffset := 0
	valuesToReturn := make([]*rawQueryMapOutput, 0)

	// loop until we've emptied out all the mappers and sent everything out
	for {
		// collect up to the limit for each mapper
		for j, mm := range m.Mappers {
			// only pull from mappers that potentially have more data and whose last output has been completely sent out.
			if mapperOutputs[j] != nil || mapperComplete[j] {
				continue
			}

			mm.SetLimit(m.chunkSize)
			res, err := mm.NextInterval(m.TMax)
			if err != nil {
				out <- &Row{Err: err}
				return
			}
			if res != nil {
				mapperOutputs[j] = res.([]*rawQueryMapOutput)
			} else { // if we got a nil from the mapper it means that we've emptied all data from it
				mapperComplete[j] = true
			}
		}

		// process the mapper outputs. we can send out everything up to the min of the last time in the mappers
		minTime := int64(math.MaxInt64)
		for _, o := range mapperOutputs {
			// some of the mappers could empty out before others so ignore them because they'll have nothing buffered
			if len(o) == 0 {
				continue
			}

			// find the min of the last point in each mapper
			if t := o[len(o)-1].timestamp; t < minTime {
				minTime = t
			}
		}

		// now empty out all the mapper outputs up to the min time
		var values []*rawQueryMapOutput
		for j, o := range mapperOutputs {
			// find the index of the point up to the min
			ind := len(o)
			for i, mo := range o {
				if mo.timestamp > minTime {
					ind = i
					break
				}
			}

			// add up to the index to the values
			values = append(values, o[:ind]...)

			if ind == len(o) {
				// we emptied out all the values, so set this output to nil so that the mapper will get run again on the next loop
				mapperOutputs[j] = nil
			} else {
				// keep only the points that have not been consumed yet; otherwise
				// the ones just taken would be emitted a second time on the next loop
				mapperOutputs[j] = o[ind:]
			}
		}

		// if we didn't pull out any values, we're done here
		if len(values) == 0 {
			break
		}

		// sort the values by time first so we can then handle offset and limit
		sort.Sort(rawOutputs(values))

		// get rid of any points that need to be offset
		if valuesOffset < m.stmt.Offset {
			offset := m.stmt.Offset - valuesOffset

			// if offset is bigger than the number of values we have, move to the next batch from the mappers
			if offset > len(values) {
				valuesOffset += len(values)
				continue
			}

			values = values[offset:]
			valuesOffset += offset
		}

		// ensure we don't send more than the limit
		if valuesSent < m.stmt.Limit {
			limit := m.stmt.Limit - valuesSent
			if len(values) > limit {
				values = values[:limit]
			}
			valuesSent += len(values)
		}
		valuesToReturn = append(valuesToReturn, values...)

		// hit the chunk size? Send out what has been accumulated, but keep
		// processing.
		if len(valuesToReturn) >= m.chunkSize {
			row := m.processRawResults(valuesToReturn)
			// perform post-processing, such as math.
			row.Values = m.processResults(row.Values)
			out <- row
			valuesToReturn = make([]*rawQueryMapOutput, 0)
		}

		// stop processing if we've hit the limit
		if m.stmt.Limit != 0 && valuesSent >= m.stmt.Limit {
			break
		}
	}

	if len(valuesToReturn) == 0 {
		if !filterEmptyResults {
			out <- m.processRawResults(nil)
		}
	} else {
		row := m.processRawResults(valuesToReturn)
		// perform post-processing, such as math.
		row.Values = m.processResults(row.Values)
		out <- row
	}
}

// processResults applies any math specified in the select statement to the
// passed in results. When no selected field is a binary or parenthesized
// expression the results are returned unchanged; otherwise a new set of rows
// is returned holding the time followed by one computed value per field.
func (m *MapReduceJob) processResults(results [][]interface{}) [][]interface{} {
	fields := m.stmt.Fields

	needsMath := false
	for _, field := range fields {
		switch field.Expr.(type) {
		case *BinaryExpr, *ParenExpr:
			needsMath = true
		}
	}
	if !needsMath {
		return results
	}

	// build one processor per field; next tracks which input column the
	// following field reference reads from (column 0 is the time)
	procs := make([]processor, len(fields))
	next := 1
	for i := range fields {
		procs[i], next = getProcessor(fields[i].Expr, next)
	}

	width := len(fields) + 1
	computed := make([][]interface{}, len(results))
	for r, row := range results {
		outRow := make([]interface{}, width)
		// put the time in
		outRow[0] = row[0]
		for c, proc := range procs {
			outRow[c+1] = proc(row)
		}
		computed[r] = outRow
	}

	return computed
}

// processFill applies the statement's fill option to the results and returns
// the rows to use. NullFill returns the results untouched. NoFill returns a
// new slice without any row that has a nil value. PreviousFill and NumberFill
// replace nil values in place, with the value from the previous row or with
// the statement's fill value respectively.
func (m *MapReduceJob) processFill(results [][]interface{}) [][]interface{} {
	// column 0 is always the time, so only the columns after it are inspected
	hasNil := func(row []interface{}) bool {
		for col := 1; col < len(row); col++ {
			if row[col] == nil {
				return true
			}
		}
		return false
	}

	switch m.stmt.Fill {
	case NullFill:
		// leave the nulls alone
		return results
	case NoFill:
		// a single nil is enough to purge the row, even with multiple aggregates
		kept := make([][]interface{}, 0, len(results))
		for _, row := range results {
			if hasNil(row) {
				continue
			}
			kept = append(kept, row)
		}
		return kept
	}

	// filling with either the previous value or a specific number
	for r, row := range results {
		for col := 1; col < len(row); col++ {
			if row[col] != nil {
				continue
			}
			switch m.stmt.Fill {
			case PreviousFill:
				// the first row has no previous row to copy from
				if r > 0 {
					row[col] = results[r-1][col]
				}
			case NumberFill:
				row[col] = m.stmt.FillValue
			}
		}
	}
	return results
}

// getProcessor builds the processor that evaluates expr against a result row.
// startIndex is the row column the next variable reference or call reads from.
// The second return value is the column index for whatever comes after expr:
// references and calls consume one column, literals consume none. It panics
// on an expression type it does not know.
func getProcessor(expr Expr, startIndex int) (processor, int) {
	switch e := expr.(type) {
	case *VarRef, *Call:
		// both simply echo the value already present in the row
		return newEchoProcessor(startIndex), startIndex + 1
	case *ParenExpr:
		return getProcessor(e.Expr, startIndex)
	case *BinaryExpr:
		return getBinaryProcessor(e, startIndex)
	case *NumberLiteral:
		return newLiteralProcessor(e.Val), startIndex
	case *StringLiteral:
		return newLiteralProcessor(e.Val), startIndex
	case *BooleanLiteral:
		return newLiteralProcessor(e.Val), startIndex
	case *TimeLiteral:
		return newLiteralProcessor(e.Val), startIndex
	case *DurationLiteral:
		return newLiteralProcessor(e.Val), startIndex
	}
	panic("unreachable")
}

// processor computes a single output value from one row of result values.
type processor func(values []interface{}) interface{}

// newEchoProcessor returns a processor that yields the row value at index.
func newEchoProcessor(index int) processor {
	var echo processor = func(row []interface{}) interface{} {
		return row[index]
	}
	return echo
}

// newLiteralProcessor returns a processor that ignores the row and always
// yields val.
func newLiteralProcessor(val interface{}) processor {
	literal := processor(func([]interface{}) interface{} {
		return val
	})
	return literal
}

// getBinaryProcessor builds the processor for a binary expression. The left
// operand is resolved first starting at startIndex, then the right operand
// from wherever the left one stopped; the index after the right operand is
// returned alongside the processor.
func getBinaryProcessor(expr *BinaryExpr, startIndex int) (processor, int) {
	left, afterLeft := getProcessor(expr.LHS, startIndex)
	right, afterRight := getProcessor(expr.RHS, afterLeft)
	return newBinaryExprEvaluator(expr.Op, left, right), afterRight
}

// newBinaryExprEvaluator returns a processor that applies op to the values
// produced by lhs and rhs for a row.
//
// The returned processor yields nil when either operand is not a float64,
// when op is DIV and the divisor is zero, or when op is not one of ADD, SUB,
// MUL or DIV. A zero right-hand operand is only rejected for division;
// adding, subtracting or multiplying by zero produces a regular result.
func newBinaryExprEvaluator(op Token, lhs, rhs processor) processor {
	var apply func(l, r float64) interface{}
	switch op {
	case ADD:
		apply = func(l, r float64) interface{} { return l + r }
	case SUB:
		apply = func(l, r float64) interface{} { return l - r }
	case MUL:
		apply = func(l, r float64) interface{} { return l * r }
	case DIV:
		apply = func(l, r float64) interface{} {
			// division by zero has no meaningful result, so report a null
			if r == 0 {
				return nil
			}
			return l / r
		}
	default:
		// we shouldn't get here, but give them back nils if it goes this way
		return func(values []interface{}) interface{} {
			return nil
		}
	}

	return func(values []interface{}) interface{} {
		// always evaluate both sides before inspecting their types
		l := lhs(values)
		r := rhs(values)

		lv, ok := l.(float64)
		if !ok {
			return nil
		}
		rv, ok := r.(float64)
		if !ok {
			return nil
		}
		return apply(lv, rv)
	}
}

// resultsEmpty reports whether every result row is empty or holds only nil
// values. The first value of each row is the time and is not considered.
func (m *MapReduceJob) resultsEmpty(resultValues [][]interface{}) bool {
	for _, row := range resultValues {
		// nothing beyond the time column to inspect
		if len(row) < 2 {
			continue
		}
		for _, v := range row[1:] {
			if v != nil {
				return false
			}
		}
	}
	return true
}

// processRawResults converts the reduced output of a raw query into a Row.
//
// The row's columns are the names in the select, with "time" always present
// and in the first position. When values is empty, a row with columns but no
// values is returned. When exactly one value besides time is selected, each
// output's values are stored as the single column value as-is (a nil value
// yields a nil column); otherwise the output's values must be a
// map[string]interface{} keyed by field name, from which the columns are
// looked up.
func (m *MapReduceJob) processRawResults(values []*rawQueryMapOutput) *Row {
	selectNames := m.stmt.NamesInSelect()

	// ensure that time is in the select names and in the first position
	hasTime := false
	for i, n := range selectNames {
		if n == "time" {
			if i != 0 {
				selectNames[0], selectNames[i] = selectNames[i], selectNames[0]
			}
			hasTime = true
		}
	}

	// time should always be in the list of names they get back
	if !hasTime {
		selectNames = append([]string{"time"}, selectNames...)
	}

	// if they've selected only a single value we have to handle things a little differently
	singleValue := len(selectNames) == SelectColumnCountWithOneValue

	row := &Row{
		Name:    m.MeasurementName,
		Tags:    m.TagSet.Tags,
		Columns: selectNames,
	}

	// return an empty row if there are no results
	if len(values) == 0 {
		return row
	}

	// the results will have all of the raw mapper results, convert into the row
	row.Values = make([][]interface{}, 0, len(values))
	for _, v := range values {
		vals := make([]interface{}, len(selectNames))

		// time is always the first value
		vals[0] = time.Unix(0, v.timestamp).UTC()

		if singleValue {
			// store the value directly; asserting it to interface{} would add
			// nothing except a panic when the value is nil
			vals[1] = v.values
		} else {
			fields := v.values.(map[string]interface{})

			// populate the other values
			for i := 1; i < len(selectNames); i++ {
				vals[i] = fields[selectNames[i]]
			}
		}

		row.Values = append(row.Values, vals)
	}

	return row
}

// processAggregate runs a single aggregate call across all mappers and
// appends the reduced value for each time bucket to the matching row of
// resultValues. It returns the first error reported by a mapper.
func (m *MapReduceJob) processAggregate(c *Call, reduceFunc ReduceFunc, resultValues [][]interface{}) error {
	// initialize the mappers
	for _, mapper := range m.Mappers {
		if err := mapper.Begin(c, m.TMin); err != nil {
			return err
		}
	}

	// the first interval in a query with a group by may be smaller than the others. This happens when they have a
	// where time > clause that is in the middle of the bucket that the group by time creates
	bucketStart := m.TMin / m.interval * m.interval
	firstInterval := (bucketStart + m.interval) - m.TMin

	// one slot per mapper, overwritten for every bucket
	outputs := make([]interface{}, len(m.Mappers))

	// populate the result values for each interval of time
	for i := range resultValues {
		span := m.interval
		if i == 0 {
			span = firstInterval
		}

		// collect the results from each mapper
		for j, mapper := range m.Mappers {
			res, err := mapper.NextInterval(span)
			if err != nil {
				return err
			}
			outputs[j] = res
		}
		resultValues[i] = append(resultValues[i], reduceFunc(outputs))
	}

	return nil
}

// MapReduceJobs is a list of jobs that can be sorted by job key.
type MapReduceJobs []*MapReduceJob

// Len returns the number of jobs.
func (a MapReduceJobs) Len() int {
	return len(a)
}

// Less reports whether the key of job i sorts before the key of job j.
func (a MapReduceJobs) Less(i, j int) bool {
	left, right := a[i].Key(), a[j].Key()
	return bytes.Compare(left, right) < 0
}

// Swap exchanges the jobs at positions i and j.
func (a MapReduceJobs) Swap(i, j int) {
	a[j], a[i] = a[i], a[j]
}

// Mapper will run through a map function. A single mapper will be created
// for each shard for each tagset that must be hit to satisfy a query.
// Mappers can either point to a local shard or could point to a remote server.
type Mapper interface {
	// Open will open the necessary resources to begin the map job. Could be connections to remote servers or
	// hitting the local bolt store
	Open() error

	// Close will close the mapper (either the bolt transaction or the request)
	Close()

	// Begin will set up the mapper to run the map function for a given aggregate call starting at the passed in time.
	// Raw queries pass a nil call.
	Begin(*Call, int64) error

	// NextInterval will get the time ordered next interval of the given interval size from the mapper. This is a
	// forward only operation from the start time passed into Begin. Will return nil when there is no more data to be read.
	// We pass the interval in here so that it can be varied over the period of the query. This is useful for queries that
	// must respect natural time boundaries like months or queries that span daylight savings time borders. Note that if
	// a limit is set on the mapper, the interval passed here should represent the MaxTime in a nano epoch.
	NextInterval(interval int64) (interface{}, error)

	// SetLimit will limit the number of data points yielded on the next interval. If a limit is set, the interval
	// passed into NextInterval will be used as the MaxTime to scan until.
	SetLimit(limit int)
}

// TagSet describes one unique set of tags matched by a query, along with the
// series and filters that belong to it. One MapReduceJob is created per tagset.
type TagSet struct {
	// Tags are the tag key/value pairs; they are reported as the Tags of the rows produced for this tagset.
	Tags map[string]string
	// Filters and SeriesIDs are parallel: AddFilter appends to both together.
	Filters   []Expr
	SeriesIDs []uint32
	// Key is appended to the measurement name to form the sortable key of the job.
	Key []byte
}

// AddFilter records a series id together with its filter expression. The two
// are appended to SeriesIDs and Filters so they end up at the same position.
func (t *TagSet) AddFilter(id uint32, filter Expr) {
	t.Filters = append(t.Filters, filter)
	t.SeriesIDs = append(t.SeriesIDs, id)
}

// Planner represents an object for creating execution plans.
type Planner struct {
	// DB is used to begin the transaction that creates the jobs of a plan.
	DB DB

	// Returns the current time. Defaults to time.Now().
	// Plan uses it to replace instances of "now()" in the statement condition.
	Now func() time.Time
}

// NewPlanner returns a new instance of Planner that plans against db and
// uses time.Now as its clock.
func NewPlanner(db DB) *Planner {
	p := &Planner{DB: db}
	p.Now = time.Now
	return p
}

// Plan creates an execution plan for the given SelectStatement and returns an Executor.
//
// Instances of "now()" in the statement's condition are replaced with the
// planner's current time (stmt.Condition is updated in place). One job is
// created per unique tagset, and SLIMIT/SOFFSET are then applied to that list
// of jobs: SOFFSET skips leading series and SLIMIT caps how many follow, with
// an SLIMIT of zero meaning no cap. The statement's SLimit is left untouched.
// chunkSize is handed to every job for chunking raw query responses.
//
// An error is returned when the transaction cannot be started, the group by
// dimensions cannot be normalized, or the jobs cannot be created.
func (p *Planner) Plan(stmt *SelectStatement, chunkSize int) (*Executor, error) {
	now := p.Now().UTC()

	// Replace instances of "now()" with the current time.
	stmt.Condition = Reduce(stmt.Condition, &nowValuer{Now: now})

	// Begin an unopened transaction.
	tx, err := p.DB.Begin()
	if err != nil {
		return nil, err
	}

	// Determine group by tag keys.
	interval, tags, err := stmt.Dimensions.Normalize()
	if err != nil {
		return nil, err
	}

	// TODO: handle queries that select from multiple measurements. This assumes that we're only selecting from a single one
	jobs, err := tx.CreateMapReduceJobs(stmt, tags)
	if err != nil {
		return nil, err
	}

	// LIMIT and OFFSET the unique series
	if stmt.SLimit > 0 || stmt.SOffset > 0 {
		if stmt.SOffset > len(jobs) {
			jobs = nil
		} else {
			// without a series limit everything after the offset is kept;
			// with one, never reach past the end of the jobs
			end := len(jobs)
			if stmt.SLimit > 0 && stmt.SOffset+stmt.SLimit < end {
				end = stmt.SOffset + stmt.SLimit
			}

			jobs = jobs[stmt.SOffset:end]
		}
	}

	intervalNanos := interval.Nanoseconds()
	for _, j := range jobs {
		j.interval = intervalNanos
		j.stmt = stmt
		j.chunkSize = chunkSize
	}

	return &Executor{tx: tx, stmt: stmt, jobs: jobs, interval: intervalNanos}, nil
}

// Executor executes the jobs planned for a select statement.
// It runs every job and streams the resulting rows on a single channel.
type Executor struct {
	tx       Tx               // transaction
	stmt     *SelectStatement // original statement
	jobs     []*MapReduceJob  // one job per unique tag set that will return in the query
	interval int64            // the group by interval of the query in nanoseconds
}

// Execute begins execution of the query and returns a channel to receive rows.
// Every job is opened first; if one fails to open, all jobs are closed and
// the error is returned. Otherwise the jobs run in a separate goroutine that
// closes the returned channel when it is done.
func (e *Executor) Execute() (<-chan *Row, error) {
	// Open all jobs before starting any work.
	for i := range e.jobs {
		if err := e.jobs[i].Open(); err != nil {
			e.close()
			return nil, err
		}
	}

	// Stream data on an unbuffered channel from a separate goroutine.
	out := make(chan *Row)
	go e.execute(out)

	return out, nil
}

// close closes every job of the executor.
func (e *Executor) close() {
	for i := range e.jobs {
		e.jobs[i].Close()
	}
}

// execute runs in a separate goroutine and streams the rows of every job on
// out. The jobs run one after another; afterwards out is closed and then the
// jobs are closed.
func (e *Executor) execute(out chan *Row) {
	// Ensure the MRJobs close after execution.
	defer e.close()

	// With more than one tag set, jobs are asked to drop their empty results.
	skipEmpty := len(e.jobs) > 1

	// Execute each MRJob serially.
	for i := 0; i < len(e.jobs); i++ {
		e.jobs[i].Execute(out, skipEmpty)
	}

	// Mark the end of the output channel.
	close(out)
}

// Row represents a single row returned from the execution of a statement.
// A row carries either result data (Columns and Values) or an error in Err.
type Row struct {
	Name    string            `json:"name,omitempty"`
	Tags    map[string]string `json:"tags,omitempty"`
	Columns []string          `json:"columns"`
	Values  [][]interface{}   `json:"values,omitempty"`
	Err     error             `json:"err,omitempty"`
}

// tagsHash returns a hash of tag key/value pairs.
// Keys are visited in sorted order so equal tag sets always hash equally.
func (r *Row) tagsHash() uint64 {
	h := fnv.New64a()
	for _, k := range r.tagsKeys() {
		h.Write([]byte(k))
		h.Write([]byte(r.Tags[k]))
	}
	return h.Sum64()
}

// tagsKeys returns a sorted list of tag keys, one entry per tag.
func (r *Row) tagsKeys() []string {
	// zero length with room for every key: appending to a slice that already
	// has len(r.Tags) elements would leave that many empty strings in front
	a := make([]string, 0, len(r.Tags))
	for k := range r.Tags {
		a = append(a, k)
	}
	sort.Strings(a)
	return a
}

// Rows represents a list of rows that can be sorted consistently by name/tag.
type Rows []*Row

// Len returns the number of rows.
func (p Rows) Len() int {
	return len(p)
}

// Less orders rows by name. Rows with the same name are ordered by the hash
// of their tag set: tags have no meaningful sort order, but the hash gives
// tests a predictable order every time.
func (p Rows) Less(i, j int) bool {
	a, b := p[i], p[j]
	if a.Name == b.Name {
		return a.tagsHash() < b.tagsHash()
	}
	return a.Name < b.Name
}

// Swap exchanges the rows at positions i and j.
func (p Rows) Swap(i, j int) {
	p[j], p[i] = p[i], p[j]
}