// influxdb/task/backend/scheduler/treescheduler.go

package scheduler

import (
"context"
"encoding/binary"
"errors"
"sync"
"time"

"github.com/benbjohnson/clock"
"github.com/cespare/xxhash"
"github.com/google/btree"
)
const (
// degreeBtreeScheduled is the btree degree for the btree internal to the tree scheduler.
// It is purely a performance tuning parameter, but it is required by github.com/google/btree.
degreeBtreeScheduled = 3 // TODO(docmerlin): find the best number for this; it's purely a perf optimization

// defaultMaxWorkers is the default maximum number of workers for a TreeScheduler.
defaultMaxWorkers = 128
)
// TreeScheduler is a Scheduler based on a btree.
// It calls Executor in order per ID. The scheduler provides the following guarantees:
//
// - After creation, the scheduler automatically calls ExecutorFunc when a task should run, as defined by its Schedulable.
//
// - The scheduler cannot get into a state where it blocks Release and Schedule indefinitely.
//
// - Schedule adds a Schedulable to the schedule, and Release removes a task from the schedule.
//
// - Calls to ExecutorFunc are serial in time on a per-task-ID basis, i.e. the run at 12:00 will go before the run at 12:01.
//
// Design:
//
// The core of the scheduler is a btree keyed by time and task ID, and a map keyed by task ID and containing the
// next run time (called the uniqueness index from now on).
// The map ensures task uniqueness in the tree, so we can replace or delete tasks in the tree.
//
// Scheduling in the tree consists of a main loop that feeds a fixed set of workers, each with its own communication channel.
// Distribution is handled by hashing the task ID (to ensure uniform distribution) and then distributing over those channels
// evenly based on the hashed ID. This ensures that all runs of the same task go to the same worker (see the sketch at the
// end of this comment).
//
// The workers call ExecutorFunc, handle any errors, and update the LastScheduled time internally and also via the Checkpointer.
//
// The main loop:
//
// The main loop waits on a time.Timer to grab the task with the minimum time. Once it successfully grabs a task ready
// to trigger, it walks the btree in ascending order from the minimum item, sending each ready task to its worker
// channel, until it reaches an item whose time has not yet arrived.
//
// Putting a task into the scheduler:
//
// Adding a task to the scheduler acquires a write lock, grabs the task from the uniqueness index, and replaces the item
// in the uniqueness index and btree. If the new task would trigger sooner than the current soonest-triggering task, the
// Timer is reset when the task is added. Finally, the write lock is released.
//
// Removing a task from the scheduler:
//
// Removing a task from the scheduler acquires a write lock, deletes the task from the uniqueness index and from the
// btree, then releases the lock. We do not have to readjust the time on delete, because, if the minimum task isn't
// ready yet, the main loop just resets the timer and keeps going.
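//
// As an illustrative sketch of worker selection only (taskID and numWorkers are hypothetical stand-ins for the real
// values used by the iterator below):
//
//	var buf [8]byte
//	binary.LittleEndian.PutUint64(buf[:], uint64(taskID))
//	worker := xxhash.Sum64(buf[:]) % uint64(numWorkers)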
type TreeScheduler struct {
mu sync.RWMutex
priorityQueue *btree.BTree
nextTime map[ID]int64 // uniqueness index: next run time per task ID, so we can delete or replace items in the btree
when time.Time
executor Executor
onErr ErrorFunc
time clock.Clock
timer *clock.Timer
done chan struct{}
workchans []chan Item
wg sync.WaitGroup
checkpointer SchedulableService
items *itemList
sm *SchedulerMetrics
}
// ErrorFunc is a function for error handling. It is a good way to inject logging into a TreeScheduler.
type ErrorFunc func(ctx context.Context, taskID ID, scheduledFor time.Time, err error)
type treeSchedulerOptFunc func(t *TreeScheduler) error
// WithOnErrorFn is an option that sets the error function that gets called when there is an error in a TreeScheduler.
// It's useful for injecting logging or special error handling.
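//
// For example, to inject logging with the standard library's log package (a minimal sketch):
//
//	WithOnErrorFn(func(ctx context.Context, taskID ID, scheduledFor time.Time, err error) {
//		log.Printf("task %v scheduled for %v failed: %v", taskID, scheduledFor, err)
//	})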
func WithOnErrorFn(fn ErrorFunc) treeSchedulerOptFunc {
return func(t *TreeScheduler) error {
t.onErr = fn
return nil
}
}
// WithMaxConcurrentWorkers is an option that sets the max number of concurrent workers that a TreeScheduler will use.
func WithMaxConcurrentWorkers(n int) treeSchedulerOptFunc {
return func(t *TreeScheduler) error {
t.workchans = make([]chan Item, n)
return nil
}
}
// WithTime is an option for NewScheduler that allows you to inject a clock.Clock from Ben Johnson's github.com/benbjohnson/clock library, for testing purposes.
func WithTime(t clock.Clock) treeSchedulerOptFunc {
return func(sch *TreeScheduler) error {
sch.time = t
return nil
}
}
// NewScheduler gives us a new TreeScheduler and SchedulerMetrics when given an Executor, a SchedulableService, and zero or more options.
// Schedulers should be initialized with this function.
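//
// A minimal usage sketch (executor and checkpointer are assumed to be existing implementations of the Executor and
// SchedulableService interfaces):
//
//	sch, metrics, err := NewScheduler(executor, checkpointer, WithMaxConcurrentWorkers(32))
//	if err != nil {
//		// handle the error
//	}
//	defer sch.Stop()
//	_ = metrics // typically registered with a metrics registry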
func NewScheduler(executor Executor, checkpointer SchedulableService, opts ...treeSchedulerOptFunc) (*TreeScheduler, *SchedulerMetrics, error) {
// check for a nil executor up front, so we don't start worker goroutines only to abandon them
if executor == nil {
return nil, nil, errors.New("executor must be a non-nil function")
}
s := &TreeScheduler{
executor: executor,
priorityQueue: btree.New(degreeBtreeScheduled),
nextTime: map[ID]int64{},
onErr: func(_ context.Context, _ ID, _ time.Time, _ error) {},
time: clock.New(),
done: make(chan struct{}, 1),
checkpointer: checkpointer,
items: &itemList{},
}
// apply options
for i := range opts {
if err := opts[i](s); err != nil {
return nil, nil, err
}
}
if s.workchans == nil {
s.workchans = make([]chan Item, defaultMaxWorkers)
}
s.wg.Add(len(s.workchans))
for i := 0; i < len(s.workchans); i++ {
s.workchans[i] = make(chan Item)
go s.work(context.Background(), s.workchans[i])
}
s.sm = NewSchedulerMetrics(s)
s.when = time.Time{}
s.timer = s.time.Timer(0)
// a stopped timer waits forever, which lets us wait for items to be added before triggering
s.timer.Stop()
s.wg.Add(1)
go func() {
defer s.wg.Done()
schedulerLoop:
for {
select {
case <-s.done:
s.mu.Lock()
s.timer.Stop()
// close workchans
for i := range s.workchans {
close(s.workchans[i])
}
s.mu.Unlock()
return
case <-s.timer.C:
for { // this for loop works around the way clock's mock behaves when you Reset with duration 0 from a different goroutine than the one calling clock.Set
s.mu.Lock()
min := s.priorityQueue.Min()
if min == nil { // the queue is empty, so wait for a new item to be scheduled
s.when = time.Time{}
s.mu.Unlock()
continue schedulerLoop
}
it := min.(Item)
if ts := s.time.Now().UTC(); it.When().After(ts) {
s.timer.Reset(it.When().Sub(ts)) // the minimum item isn't ready yet; sleep until its scheduled time
s.mu.Unlock()
continue schedulerLoop
}
s.process()
min = s.priorityQueue.Min()
if min == nil { // the queue is empty after processing, so wait for a new item to be scheduled
s.when = time.Time{}
s.mu.Unlock()
continue schedulerLoop
}
it = min.(Item)
s.when = it.When()
until := s.when.Sub(s.time.Now())
if until > 0 {
s.resetTimer(until) // we can Reset without a Stop because we know the timer has already fired here
s.mu.Unlock()
continue schedulerLoop
}
s.mu.Unlock()
}
}
}
}()
return s, s.sm, nil
}
// Stop stops the scheduler and waits for the main loop and all workers to finish.
func (s *TreeScheduler) Stop() {
s.mu.Lock()
close(s.done)
s.mu.Unlock()
s.wg.Wait()
}
// itemList is a list of items for deleting and inserting. We have to do them separately instead of just re-adding,
// because usually the item's key must be changed between the delete and the insert.
type itemList struct {
toInsert []Item
toDelete []Item
}
// process walks the btree, sending all ready items to their workers and rescheduling them for their next runs.
func (s *TreeScheduler) process() {
// Reset the lengths of the slices in preparation for the next iteration.
s.items.toDelete = s.items.toDelete[:0]
s.items.toInsert = s.items.toInsert[:0]
toReAdd := s.items
iter := s.iterator(s.time.Now())
s.priorityQueue.Ascend(iter)
for i := range toReAdd.toDelete {
delete(s.nextTime, toReAdd.toDelete[i].id)
s.priorityQueue.Delete(toReAdd.toDelete[i])
}
for i := range toReAdd.toInsert {
s.nextTime[toReAdd.toInsert[i].id] = toReAdd.toInsert[i].when
s.priorityQueue.ReplaceOrInsert(toReAdd.toInsert[i])
}
}
// resetTimer resets the timer to fire whenFromNow from now; callers must hold the write lock.
func (s *TreeScheduler) resetTimer(whenFromNow time.Duration) {
s.when = s.time.Now().Add(whenFromNow)
s.timer.Reset(whenFromNow)
}
// iterator returns a btree.ItemIterator that sends ready items to their workers, recording the deletes and inserts in s.items.
func (s *TreeScheduler) iterator(ts time.Time) btree.ItemIterator {
return func(i btree.Item) bool {
if i == nil {
return false
}
it := i.(Item) // we want it to panic if things other than Items are populating the scheduler, as it is something we can't recover from.
if time.Unix(it.next+it.Offset, 0).After(ts) {
return false
}
// distribute to the right worker.
{
buf := [8]byte{}
binary.LittleEndian.PutUint64(buf[:], uint64(it.id))
wc := xxhash.Sum64(buf[:]) % uint64(len(s.workchans)) // we just hash so that the number is uniformly distributed
select {
case s.workchans[wc] <- it:
s.items.toDelete = append(s.items.toDelete, it)
if err := it.updateNext(); err != nil {
// in this error case we can't schedule next, so we have to drop the task
s.onErr(context.Background(), it.id, it.Next(), &ErrUnrecoverable{err})
return true
}
s.items.toInsert = append(s.items.toInsert, it)
case <-s.done:
return false
default: // the worker is busy; leave this item in the tree to be retried, and keep walking
return true
}
}
return true
}
}
// When gives us the next time the scheduler will run a task.
func (s *TreeScheduler) When() time.Time {
s.mu.RLock()
w := s.when
s.mu.RUnlock()
return w
}
// release removes a task from the scheduler; callers must hold the write lock.
func (s *TreeScheduler) release(taskID ID) {
when, ok := s.nextTime[taskID]
if !ok {
return
}
// delete the old task run time
s.priorityQueue.Delete(Item{id: taskID, when: when})
delete(s.nextTime, taskID)
}
// Release releases a task.
// Release also cancels the running task.
// Task deletion would be faster if the tree supported deleting ranges.
func (s *TreeScheduler) Release(taskID ID) error {
s.sm.release(taskID)
s.mu.Lock()
s.release(taskID)
s.mu.Unlock()
return nil
}
// work pulls Items off the channel, executes them, and checkpoints the results via the SchedulableService.
func (s *TreeScheduler) work(ctx context.Context, ch chan Item) {
var it Item
defer func() {
s.wg.Done()
}()
for it = range ch {
t := time.Unix(it.next, 0)
err := func() (err error) {
defer func() {
if r := recover(); r != nil {
err = &ErrUnrecoverable{errors.New("executor panicked")}
}
}()
// report the difference between when the item was supposed to be scheduled and now
s.sm.reportScheduleDelay(time.Since(it.Next()))
preExec := time.Now()
// execute
err = s.executor.Execute(ctx, it.id, t, it.When())
// report how long execution took
s.sm.reportExecution(err, time.Since(preExec))
return err
}()
if err != nil {
s.onErr(ctx, it.id, it.Next(), err)
}
// TODO(docmerlin): we can increase performance by making the call to UpdateLastScheduled async
if err := s.checkpointer.UpdateLastScheduled(ctx, it.id, t); err != nil {
s.onErr(ctx, it.id, it.Next(), err)
}
}
}
// Schedule puts a Schedulable on the TreeScheduler.
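//
// A sketch of use (task is a hypothetical value satisfying the Schedulable interface):
//
//	if err := sch.Schedule(task); err != nil {
//		// the Schedulable's schedule could not produce a valid next run time
//	}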
func (s *TreeScheduler) Schedule(sch Schedulable) error {
s.sm.schedule(sch.ID())
it := Item{
cron: sch.Schedule(),
id: sch.ID(),
Offset: int64(sch.Offset().Seconds()),
}
nt, err := it.cron.Next(sch.LastScheduled())
if err != nil {
s.sm.scheduleFail(it.id)
s.onErr(context.Background(), it.id, time.Time{}, err)
return err
}
it.next = nt.UTC().Unix()
it.when = it.next + it.Offset
s.mu.Lock()
defer s.mu.Unlock()
nt = nt.Add(sch.Offset())
if s.when.IsZero() || s.when.After(nt) {
s.when = nt
s.timer.Stop()
until := s.when.Sub(s.time.Now())
if until <= 0 {
s.timer.Reset(0)
} else {
s.timer.Reset(until)
}
}
nextTime, ok := s.nextTime[it.id]
if ok {
// delete the old task run time
s.priorityQueue.Delete(Item{
when: nextTime,
id: it.id,
})
}
s.nextTime[it.id] = it.next + it.Offset
// insert the new task run time
s.priorityQueue.ReplaceOrInsert(it)
return nil
}
// Item is a task in the scheduler.
type Item struct {
when int64 // unix timestamp of the next run with the Offset applied; part of the btree sort key
id ID
cron Schedule
next int64 // unix timestamp of the next run without the Offset applied
Offset int64 // delay, in seconds, applied after the cron fires
}
// Next returns the next run time of the Item, without its Offset applied.
func (it Item) Next() time.Time {
return time.Unix(it.next, 0)
}
// When returns the time the Item is scheduled to trigger, i.e. Next plus the Offset.
func (it Item) When() time.Time {
return time.Unix(it.when, 0)
}
// Less tells us if one Item is less than another; Items are ordered by when, with id as a tiebreaker.
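//
// For example, two Items with equal when values are ordered by id:
//
//	Item{when: 100, id: 1}.Less(Item{when: 100, id: 2}) // true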
func (it Item) Less(bItem btree.Item) bool {
it2 := bItem.(Item)
return it.when < it2.when || ((it.when == it2.when) && it.id < it2.id)
}
// updateNext advances the Item to its next scheduled run, updating both next and when.
func (it *Item) updateNext() error {
newNext, err := it.cron.Next(time.Unix(it.next, 0))
if err != nil {
return err
}
it.next = newNext.UTC().Unix()
it.when = it.next + it.Offset
return nil
}