// influxdb/task/backend/scheduler/treescheduler.go

package scheduler

import (
"context"
"encoding/binary"
"errors"
"sync"
"time"

"github.com/benbjohnson/clock"
"github.com/cespare/xxhash"
"github.com/google/btree"
)
const (
// degreeBtreeScheduled is the btree degree for the btree internal to the tree scheduler.
// It is purely a performance tuning parameter, but it is required by github.com/google/btree.
degreeBtreeScheduled = 3 // TODO(docmerlin): find the best number for this; it's purely a perf optimization

// defaultMaxWorkers is the default maximum number of workers for a TreeScheduler.
defaultMaxWorkers = 128
)
// TreeScheduler is a Scheduler based on a btree.
// It calls Executor in order per ID. The scheduler provides the following guarantees:
//
// - After creation, the scheduler automatically calls ExecutorFunc when a task should run, as defined by its Schedulable.
//
// - The scheduler cannot get into a state where it blocks Release and Schedule indefinitely.
//
// - Schedule adds a Schedulable to the schedule, and Release removes a task from the schedule.
//
// - Calls to ExecutorFunc are serial in time on a per-task-ID basis, i.e. the run at 12:00 will go before the run at 12:01.
//
// Design:
//
// The core of the scheduler is a btree keyed by time and task ID, and a map keyed by task ID and containing the
// next run time (called the uniqueness index from now on).
// The map ensures task uniqueness in the tree, so we can replace or delete tasks in the tree.
//
// Scheduling in the tree consists of a main loop that feeds a fixed set of workers, each with its own communication channel.
// Distribution is handled by hashing the task ID (to ensure uniform distribution) and then distributing over those channels
// evenly based on the hashed ID. This ensures that all runs of the same task go to the same worker (see the sketch at the
// end of this comment).
//
// The workers call ExecutorFunc, handle any errors, and update the LastScheduled time internally and also via the Checkpointer.
//
// The main loop:
//
// The main loop waits on a time.Timer to grab the task with the minimum time. Once it successfully grabs a task ready
// to trigger, it walks the btree in ascending order from the minimum item, sending each ready task to its worker
// channel, until it reaches an item whose time has not yet arrived.
//
// Putting a task into the scheduler:
//
// Adding a task to the scheduler acquires a write lock, grabs the task from the uniqueness index, and replaces the item
// in the uniqueness index and btree. If the new task would trigger sooner than the current soonest-triggering task, the
// Timer is reset when the task is added. Finally, the write lock is released.
//
// Removing a task from the scheduler:
//
// Removing a task from the scheduler acquires a write lock, deletes the task from the uniqueness index and from the
// btree, then releases the lock. We do not have to readjust the time on delete, because, if the minimum task isn't
// ready yet, the main loop just resets the timer and keeps going.
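//
// As an illustrative sketch of worker selection only (taskID and numWorkers are hypothetical stand-ins for the real
// values used by the iterator below):
//
//	var buf [8]byte
//	binary.LittleEndian.PutUint64(buf[:], uint64(taskID))
//	worker := xxhash.Sum64(buf[:]) % uint64(numWorkers)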
type TreeScheduler struct {
mu sync.RWMutex
priorityQueue *btree.BTree
nextTime map[ID]int64 // uniqueness index: next run time per task ID, so we can delete or replace items in the btree
when time.Time
executor Executor
onErr ErrorFunc
time clock.Clock
timer *clock.Timer
done chan struct{}
workchans []chan Item
wg sync.WaitGroup
checkpointer SchedulableService
items *itemList
sm *SchedulerMetrics
}
// ErrorFunc is a function for error handling. It is a good way to inject logging into a TreeScheduler.
type ErrorFunc func(ctx context.Context, taskID ID, scheduledFor time.Time, err error)
type treeSchedulerOptFunc func(t *TreeScheduler) error
// WithOnErrorFn is an option that sets the error function that gets called when there is an error in a TreeScheduler.
// It's useful for injecting logging or special error handling.
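//
// For example, to inject logging with the standard library's log package (a minimal sketch):
//
//	WithOnErrorFn(func(ctx context.Context, taskID ID, scheduledFor time.Time, err error) {
//		log.Printf("task %v scheduled for %v failed: %v", taskID, scheduledFor, err)
//	})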
func WithOnErrorFn(fn ErrorFunc) treeSchedulerOptFunc {
return func(t *TreeScheduler) error {
t.onErr = fn
return nil
}
}
// WithMaxConcurrentWorkers is an option that sets the max number of concurrent workers that a TreeScheduler will use.
func WithMaxConcurrentWorkers(n int) treeSchedulerOptFunc {
return func(t *TreeScheduler) error {
t.workchans = make([]chan Item, n)
return nil
}
}
// WithTime is an option for NewScheduler that allows you to inject a clock.Clock from Ben Johnson's github.com/benbjohnson/clock library, for testing purposes.
func WithTime(t clock.Clock) treeSchedulerOptFunc {
return func(sch *TreeScheduler) error {
sch.time = t
return nil
}
}
// NewScheduler gives us a new TreeScheduler and SchedulerMetrics when given an Executor, a SchedulableService, and zero or more options.
// Schedulers should be initialized with this function.
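//
// A minimal usage sketch (executor and checkpointer are assumed to be existing implementations of the Executor and
// SchedulableService interfaces):
//
//	sch, metrics, err := NewScheduler(executor, checkpointer, WithMaxConcurrentWorkers(32))
//	if err != nil {
//		// handle the error
//	}
//	defer sch.Stop()
//	_ = metrics // typically registered with a metrics registry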
func NewScheduler(executor Executor, checkpointer SchedulableService, opts ...treeSchedulerOptFunc) (*TreeScheduler, *SchedulerMetrics, error) {
// check for a nil executor up front, so we don't start worker goroutines only to abandon them
if executor == nil {
return nil, nil, errors.New("executor must be a non-nil function")
}
s := &TreeScheduler{
executor: executor,
priorityQueue: btree.New(degreeBtreeScheduled),
nextTime: map[ID]int64{},
onErr: func(_ context.Context, _ ID, _ time.Time, _ error) {},
time: clock.New(),
done: make(chan struct{}, 1),
checkpointer: checkpointer,
items: &itemList{},
}
// apply options
for i := range opts {
if err := opts[i](s); err != nil {
return nil, nil, err
}
}
if s.workchans == nil {
s.workchans = make([]chan Item, defaultMaxWorkers)
}
s.wg.Add(len(s.workchans))
for i := 0; i < len(s.workchans); i++ {
s.workchans[i] = make(chan Item)
go s.work(context.Background(), s.workchans[i])
}
s.sm = NewSchedulerMetrics(s)
s.when = time.Time{}
s.timer = s.time.Timer(0)
// a stopped timer waits forever, which lets us wait for items to be added before triggering
s.timer.Stop()
s.wg.Add(1)
go func() {
defer s.wg.Done()
schedulerLoop:
for {
select {
case <-s.done:
s.mu.Lock()
s.timer.Stop()
// close workchans
for i := range s.workchans {
close(s.workchans[i])
}
s.mu.Unlock()
return
case <-s.timer.C:
for { // this for loop works around the way clock's mock behaves when you Reset with duration 0 from a different goroutine than the one calling clock.Set
s.mu.Lock()
min := s.priorityQueue.Min()
if min == nil { // the queue is empty, so wait for a new item to be scheduled
s.when = time.Time{}
s.mu.Unlock()
continue schedulerLoop
}
it := min.(Item)
if ts := s.time.Now().UTC(); it.When().After(ts) {
s.timer.Reset(it.When().Sub(ts)) // the minimum item isn't ready yet; sleep until its scheduled time
s.mu.Unlock()
continue schedulerLoop
}
s.process()
min = s.priorityQueue.Min()
if min == nil { // the queue is empty after processing, so wait for a new item to be scheduled
s.when = time.Time{}
s.mu.Unlock()
continue schedulerLoop
}
it = min.(Item)
s.when = it.When()
until := s.when.Sub(s.time.Now())
if until > 0 {
s.resetTimer(until) // we can Reset without a Stop because we know the timer has already fired here
s.mu.Unlock()
continue schedulerLoop
}
s.mu.Unlock()
}
}
}
}()
return s, s.sm, nil
}
// Stop stops the scheduler and waits for the main loop and all workers to finish.
func (s *TreeScheduler) Stop() {
s.mu.Lock()
close(s.done)
s.mu.Unlock()
s.wg.Wait()
}
// itemList is a list of items for deleting and inserting. We have to do them separately instead of just re-adding,
// because usually the item's key must be changed between the delete and the insert.
type itemList struct {
toInsert []Item
toDelete []Item
}
// process walks the btree, sending all ready items to their workers and rescheduling them for their next runs.
func (s *TreeScheduler) process() {
// Reset the lengths of the slices in preparation for the next iteration.
s.items.toDelete = s.items.toDelete[:0]
s.items.toInsert = s.items.toInsert[:0]
toReAdd := s.items
iter := s.iterator(s.time.Now())
s.priorityQueue.Ascend(iter)
for i := range toReAdd.toDelete {
delete(s.nextTime, toReAdd.toDelete[i].id)
s.priorityQueue.Delete(toReAdd.toDelete[i])
}
for i := range toReAdd.toInsert {
s.nextTime[toReAdd.toInsert[i].id] = toReAdd.toInsert[i].when
s.priorityQueue.ReplaceOrInsert(toReAdd.toInsert[i])
}
}
// resetTimer resets the timer to fire whenFromNow from now; callers must hold the write lock.
func (s *TreeScheduler) resetTimer(whenFromNow time.Duration) {
s.when = s.time.Now().Add(whenFromNow)
s.timer.Reset(whenFromNow)
}
// iterator returns a btree.ItemIterator that sends ready items to their workers, recording the deletes and inserts in s.items.
func (s *TreeScheduler) iterator(ts time.Time) btree.ItemIterator {
return func(i btree.Item) bool {
if i == nil {
return false
}
it := i.(Item) // we want it to panic if things other than Items are populating the scheduler, as it is something we can't recover from.
if time.Unix(it.next+it.Offset, 0).After(ts) {
return false
}
// distribute to the right worker.
{
buf := [8]byte{}
binary.LittleEndian.PutUint64(buf[:], uint64(it.id))
wc := xxhash.Sum64(buf[:]) % uint64(len(s.workchans)) // we just hash so that the number is uniformly distributed
select {
case s.workchans[wc] <- it:
s.items.toDelete = append(s.items.toDelete, it)
if err := it.updateNext(); err != nil {
// in this error case we can't schedule next, so we have to drop the task
s.onErr(context.Background(), it.id, it.Next(), &ErrUnrecoverable{err})
return true
}
s.items.toInsert = append(s.items.toInsert, it)
case <-s.done:
return false
default: // the worker is busy; leave this item in the tree to be retried, and keep walking
return true
}
}
return true
}
}
// When gives us the next time the scheduler will run a task.
func (s *TreeScheduler) When() time.Time {
s.mu.RLock()
w := s.when
s.mu.RUnlock()
return w
}
// release removes a task from the scheduler; callers must hold the write lock.
func (s *TreeScheduler) release(taskID ID) {
when, ok := s.nextTime[taskID]
if !ok {
return
}
// delete the old task run time
s.priorityQueue.Delete(Item{id: taskID, when: when})
delete(s.nextTime, taskID)
}
// Release releases a task.
// Release also cancels the running task.
// Task deletion would be faster if the tree supported deleting ranges.
func (s *TreeScheduler) Release(taskID ID) error {
s.sm.release(taskID)
s.mu.Lock()
s.release(taskID)
s.mu.Unlock()
return nil
}
// work pulls Items off the channel, executes them, and checkpoints the results via the SchedulableService.
func (s *TreeScheduler) work(ctx context.Context, ch chan Item) {
var it Item
defer func() {
s.wg.Done()
}()
for it = range ch {
t := time.Unix(it.next, 0)
err := func() (err error) {
defer func() {
if r := recover(); r != nil {
err = &ErrUnrecoverable{errors.New("executor panicked")}
}
}()
// report the difference between when the item was supposed to be scheduled and now
s.sm.reportScheduleDelay(time.Since(it.Next()))
preExec := time.Now()
// execute
err = s.executor.Execute(ctx, it.id, t, it.When())
// report how long execution took
s.sm.reportExecution(err, time.Since(preExec))
return err
}()
if err != nil {
s.onErr(ctx, it.id, it.Next(), err)
}
// TODO(docmerlin): we can increase performance by making the call to UpdateLastScheduled async
if err := s.checkpointer.UpdateLastScheduled(ctx, it.id, t); err != nil {
s.onErr(ctx, it.id, it.Next(), err)
}
}
}
// Schedule puts a Schedulable on the TreeScheduler.
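//
// A sketch of use (task is a hypothetical value satisfying the Schedulable interface):
//
//	if err := sch.Schedule(task); err != nil {
//		// the Schedulable's schedule could not produce a valid next run time
//	}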
func (s *TreeScheduler) Schedule(sch Schedulable) error {
s.sm.schedule(sch.ID())
it := Item{
cron: sch.Schedule(),
id: sch.ID(),
Offset: int64(sch.Offset().Seconds()),
}
nt, err := it.cron.Next(sch.LastScheduled())
if err != nil {
s.sm.scheduleFail(it.id)
s.onErr(context.Background(), it.id, time.Time{}, err)
return err
}
it.next = nt.UTC().Unix()
it.when = it.next + it.Offset
s.mu.Lock()
defer s.mu.Unlock()
nt = nt.Add(sch.Offset())
if s.when.IsZero() || s.when.After(nt) {
s.when = nt
s.timer.Stop()
until := s.when.Sub(s.time.Now())
if until <= 0 {
s.timer.Reset(0)
} else {
s.timer.Reset(until)
}
}
nextTime, ok := s.nextTime[it.id]
if ok {
// delete the old task run time
s.priorityQueue.Delete(Item{
when: nextTime,
id: it.id,
})
}
s.nextTime[it.id] = it.next + it.Offset
// insert the new task run time
s.priorityQueue.ReplaceOrInsert(it)
return nil
}
// Item is a task in the scheduler.
type Item struct {
when int64 // unix timestamp of the next run with the Offset applied; part of the btree sort key
id ID
cron Schedule
next int64 // unix timestamp of the next run without the Offset applied
Offset int64 // delay, in seconds, applied after the cron fires
}
// Next returns the next run time of the Item, without its Offset applied.
func (it Item) Next() time.Time {
return time.Unix(it.next, 0)
}
// When returns the time the Item is scheduled to trigger, i.e. Next plus the Offset.
func (it Item) When() time.Time {
return time.Unix(it.when, 0)
}
// Less tells us if one Item is less than another; Items are ordered by when, with id as a tiebreaker.
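//
// For example, two Items with equal when values are ordered by id:
//
//	Item{when: 100, id: 1}.Less(Item{when: 100, id: 2}) // true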
func (it Item) Less(bItem btree.Item) bool {
it2 := bItem.(Item)
return it.when < it2.when || ((it.when == it2.when) && it.id < it2.id)
}
// updateNext advances the Item to its next scheduled run, updating both next and when.
func (it *Item) updateNext() error {
newNext, err := it.cron.Next(time.Unix(it.next, 0))
if err != nil {
return err
}
it.next = newNext.UTC().Unix()
it.when = it.next + it.Offset
return nil
}