tsdb: conflict-based concurrency resolution

There are problematic races when deletes run concurrently with
writes to the same points. This change introduces guards and an
epoch-based system to coordinate these modifications.

A guard matches a point based on its time, its measurement name,
and conditions derived from an influxql expression. The intent is
to be as precise as possible without allowing any false negatives:
if a point would be deleted, the guard must match it. We are
allowed to match more points than necessary, at the cost of
slowing down writes.
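
As a rough sketch of that contract (illustrative only, written as if
inside package tsdb and using the newGuard, Matches, Wait, and Done
identifiers added in tsdb/guard.go below):

    // A delete of cpu data in [0, 1000] matching a condition installs a guard.
    cond, _ := influxql.ParseExpr("host = 'server1'")
    g := newGuard(0, 1000, []string{"cpu"}, cond)

    // A write checks its batch against the guard and blocks only on a conflict.
    batch, _ := models.ParsePointsString("cpu,host=server1 value=1 100")
    if g.Matches(batch) {
        g.Wait() // released once the delete calls g.Done()
    }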

The epoch-based system keeps track of outstanding writes and
deletes and their associated guards. When a delete operation
starts, it installs its guard and waits until all in-flight
writes are done; the guard then blocks any future write that
contains points which may conflict with the delete. This allows
writes to disjoint points to proceed uncontended, and the
implementation is optimized for the case where few deletes are
outstanding. For example, when there are no deletes, a write only
has to take a mutex, bump a counter, and compare a value against
zero. The epoch trackers are per shard, so different shards never
contend with one another.
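
A condensed sketch of the two paths (illustrative only): tracker stands
in for a shard's epochTracker, which the store changes at the end of
this diff access as s.epochs[shardID], and points, min, max, names, and
condition are placeholders for the operation's arguments.

    // Write path, mirroring WriteToShard:
    guards, gen := tracker.StartWrite()
    defer tracker.EndWrite(gen)
    for _, g := range guards {
        if g.Matches(points) {
            g.Wait() // block only on deletes whose guards could match these points
        }
    }
    // ... append the points to the shard ...

    // Delete path, mirroring DeleteSeries:
    waiter := tracker.WaitDelete(newGuard(min, max, names, condition))
    waiter.Wait()       // all writes that started before the delete have finished
    defer waiter.Done() // drops the guard, releasing any writes blocked on it
    // ... perform the delete ...
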
pull/10516/head
Jeff Wendling 2018-11-21 16:56:20 -07:00
parent 030adf4bd5
commit 4cad51a604
5 changed files with 888 additions and 0 deletions

tsdb/epoch_tracker.go (new file, +147)

@@ -0,0 +1,147 @@
package tsdb
import (
"sync"
)
// TODO(jeff): using a mutex is easiest, but there may be a way to do
// this with atomics only, and in a way such that writes are minimally
// blocked.
// epochTracker keeps track of epochs for write and delete operations
// allowing a delete to block until all previous writes have completed.
type epochTracker struct {
mu sync.Mutex
epoch uint64 // current epoch
largest uint64 // largest delete possible
writes int64 // pending writes
// pending deletes waiting on writes
deletes map[uint64]*epochDeleteState
}
// newEpochTracker constructs an epochTracker.
func newEpochTracker() *epochTracker {
return &epochTracker{
deletes: make(map[uint64]*epochDeleteState),
}
}
// epochDeleteState keeps track of the state for a pending delete.
type epochDeleteState struct {
cond *sync.Cond
guard *guard
pending int64
}
// done signals that an earlier write has finished.
func (e *epochDeleteState) done() {
e.cond.L.Lock()
e.pending--
if e.pending == 0 {
e.cond.Broadcast()
}
e.cond.L.Unlock()
}
// Wait blocks until all earlier writes have finished.
func (e *epochDeleteState) Wait() {
e.cond.L.Lock()
for e.pending > 0 {
e.cond.Wait()
}
e.cond.L.Unlock()
}
// next bumps the epoch and returns it.
func (e *epochTracker) next() uint64 {
e.epoch++
return e.epoch
}
// StartWrite should be called before a write is going to start, and after
// it has checked for guards.
func (e *epochTracker) StartWrite() ([]*guard, uint64) {
e.mu.Lock()
gen := e.next()
e.writes++
if len(e.deletes) == 0 {
e.mu.Unlock()
return nil, gen
}
guards := make([]*guard, 0, len(e.deletes))
for _, state := range e.deletes {
guards = append(guards, state.guard)
}
e.mu.Unlock()
return guards, gen
}
// EndWrite should be called when the write ends for any reason.
func (e *epochTracker) EndWrite(gen uint64) {
e.mu.Lock()
if gen <= e.largest {
// TODO(jeff): at the cost of making waitDelete more
// complicated, we can keep a sorted slice which would
// allow this to exit early rather than go over the
// whole map.
for dgen, state := range e.deletes {
if gen > dgen {
continue
}
state.done()
}
}
e.writes--
e.mu.Unlock()
}
// epochWaiter is a type that can be waited on for prior writes to finish.
type epochWaiter struct {
gen uint64
guard *guard
state *epochDeleteState
tracker *epochTracker
}
// Wait blocks until all writes prior to the creation of the waiter finish.
func (e epochWaiter) Wait() {
if e.state == nil || e.tracker == nil {
return
}
e.state.Wait()
}
// Done marks the delete as completed, removing its guard.
func (e epochWaiter) Done() {
e.tracker.mu.Lock()
delete(e.tracker.deletes, e.gen)
e.tracker.mu.Unlock()
e.guard.Done()
}
// WaitDelete should be called after any delete guards have been installed.
// The returned epochWaiter will not be affected by any future writes.
func (e *epochTracker) WaitDelete(guard *guard) epochWaiter {
e.mu.Lock()
state := &epochDeleteState{
pending: e.writes,
cond: sync.NewCond(new(sync.Mutex)),
guard: guard,
}
// record our pending delete
gen := e.next()
e.largest = gen
e.deletes[gen] = state
e.mu.Unlock()
return epochWaiter{
gen: gen,
guard: guard,
state: state,
tracker: e,
}
}

tsdb/epoch_tracker_test.go (new file, +141)

@@ -0,0 +1,141 @@
package tsdb
import (
"testing"
"time"
)
func TestEpochTracker(t *testing.T) {
t.Run("Delete waits", func(t *testing.T) {
tr := newEpochTracker()
// delete should proceed with no pending writes
waiter := tr.WaitDelete(newGuard(0, 0, nil, nil))
waiter.Wait()
waiter.Done()
for i := 0; i < 1000; i++ {
// start up some writes
_, w1 := tr.StartWrite()
_, w2 := tr.StartWrite()
_, w3 := tr.StartWrite()
// wait for a delete. this time based stuff isn't sufficient
// to check every problem, but it can catch some.
waiter := tr.WaitDelete(nil)
done := make(chan time.Time, 1)
go func() { waiter.Wait(); done <- time.Now() }()
// future writes should not block the waiter
_, w4 := tr.StartWrite()
// ending the writes allows the waiter to proceed
tr.EndWrite(w1)
tr.EndWrite(w2)
now := time.Now()
tr.EndWrite(w3)
if (<-done).Before(now) {
t.Fatal("Wait ended too soon")
}
tr.EndWrite(w4)
}
})
t.Run("Guards tracked", func(t *testing.T) {
checkGuards := func(got []*guard, exp ...*guard) {
t.Helper()
if len(exp) != len(got) {
t.Fatalf("invalid: %p != %p", exp, got)
}
next:
for _, g1 := range got {
for _, g2 := range exp {
if g1 == g2 {
continue next
}
}
t.Fatalf("invalid: %p != %p", exp, got)
}
}
tr := newEpochTracker()
g1, g2, g3 := newGuard(0, 0, nil, nil), newGuard(0, 0, nil, nil), newGuard(0, 0, nil, nil)
guards, _ := tr.StartWrite()
checkGuards(guards)
d1 := tr.WaitDelete(g1)
guards, _ = tr.StartWrite()
checkGuards(guards, g1)
d2 := tr.WaitDelete(g2)
guards, _ = tr.StartWrite()
checkGuards(guards, g1, g2)
d3 := tr.WaitDelete(g3)
guards, _ = tr.StartWrite()
checkGuards(guards, g1, g2, g3)
d2.Done()
guards, _ = tr.StartWrite()
checkGuards(guards, g1, g3)
d1.Done()
guards, _ = tr.StartWrite()
checkGuards(guards, g3)
d3.Done()
guards, _ = tr.StartWrite()
checkGuards(guards)
})
}
func BenchmarkEpochTracker(b *testing.B) {
b.Run("Writes with deletes", func(b *testing.B) {
b.Run("Serial", func(b *testing.B) {
run := func(b *testing.B, deletes int) {
tr := newEpochTracker()
tr.StartWrite()
for i := 0; i < deletes; i++ {
tr.WaitDelete(nil)
}
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, gen := tr.StartWrite()
tr.EndWrite(gen)
}
}
b.Run("0", func(b *testing.B) { run(b, 0) })
b.Run("1", func(b *testing.B) { run(b, 1) })
b.Run("10", func(b *testing.B) { run(b, 10) })
b.Run("100", func(b *testing.B) { run(b, 100) })
})
b.Run("Parallel", func(b *testing.B) {
run := func(b *testing.B, deletes int) {
tr := newEpochTracker()
tr.StartWrite()
for i := 0; i < deletes; i++ {
tr.WaitDelete(nil)
}
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, gen := tr.StartWrite()
tr.EndWrite(gen)
}
})
}
b.Run("0", func(b *testing.B) { run(b, 0) })
b.Run("1", func(b *testing.B) { run(b, 1) })
b.Run("10", func(b *testing.B) { run(b, 10) })
b.Run("100", func(b *testing.B) { run(b, 100) })
})
})
}

tsdb/guard.go (new file, +253)

@@ -0,0 +1,253 @@
package tsdb
import (
"bytes"
"sync"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxql"
)
// guard lets one match a set of points and block until they are done.
type guard struct {
cond *sync.Cond
done bool
min int64
max int64
names map[string]struct{}
expr *exprGuard
}
// newGuard constructs a guard that will match any points in the given min and max
// time range, with the given set of measurement names, or the given expression.
// The expression is optional.
func newGuard(min, max int64, names []string, expr influxql.Expr) *guard {
set := make(map[string]struct{}, len(names))
for _, name := range names {
set[name] = struct{}{}
}
return &guard{
cond: sync.NewCond(new(sync.Mutex)),
min: min,
max: max,
names: set,
expr: newExprGuard(expr),
}
}
// Matches returns true if any of the points match the guard.
func (g *guard) Matches(points []models.Point) bool {
if g == nil {
return true
}
for _, pt := range points {
if t := pt.Time().UnixNano(); t < g.min || t > g.max {
continue
}
if len(g.names) == 0 && g.expr.matches(pt) {
return true
} else if _, ok := g.names[string(pt.Name())]; ok && g.expr.matches(pt) {
return true
}
}
return false
}
// Wait blocks until the guard has been marked Done.
func (g *guard) Wait() {
g.cond.L.Lock()
for !g.done {
g.cond.Wait()
}
g.cond.L.Unlock()
}
// Done signals to anyone waiting on the guard that they can proceed.
func (g *guard) Done() {
g.cond.L.Lock()
g.done = true
g.cond.Broadcast()
g.cond.L.Unlock()
}
// exprGuard is a union of influxql.Expr based guards. a nil exprGuard matches
// everything, while the zero value matches nothing.
type exprGuard struct {
and *[2]*exprGuard
or *[2]*exprGuard
tagMatches *tagGuard
tagExists map[string]struct{}
}
type tagGuard struct {
meas bool
key []byte
op func([]byte) bool
}
// empty returns true if the exprGuard is empty, meaning that it matches no points.
func (e *exprGuard) empty() bool {
return e != nil && e.and == nil && e.or == nil && e.tagMatches == nil && e.tagExists == nil
}
// newExprGuard scrutinizes the expression and returns an efficient guard.
func newExprGuard(expr influxql.Expr) *exprGuard {
if expr == nil {
return nil
}
switch expr := expr.(type) {
case *influxql.ParenExpr:
return newExprGuard(expr.Expr)
case *influxql.BooleanLiteral:
if expr.Val {
return nil // matches everything
}
return new(exprGuard) // matches nothing
case *influxql.BinaryExpr:
switch expr.Op {
case influxql.AND:
lhs, rhs := newExprGuard(expr.LHS), newExprGuard(expr.RHS)
if lhs == nil { // reduce
return rhs
} else if rhs == nil { // reduce
return lhs
} else if lhs.empty() || rhs.empty() { // short circuit
return new(exprGuard)
} else {
return &exprGuard{and: &[2]*exprGuard{lhs, rhs}}
}
case influxql.OR:
lhs, rhs := newExprGuard(expr.LHS), newExprGuard(expr.RHS)
if lhs.empty() { // reduce
return rhs
} else if rhs.empty() { // reduce
return lhs
} else if lhs == nil || rhs == nil { // short circuit
return nil
} else {
return &exprGuard{or: &[2]*exprGuard{lhs, rhs}}
}
default:
return newBinaryExprGuard(expr)
}
default:
// if we couldn't analyze, match everything
return nil
}
}
// newBinaryExprGuard scrutinizes the binary expression and returns an efficient guard.
func newBinaryExprGuard(expr *influxql.BinaryExpr) *exprGuard {
// if it's a nested binary expression, always match.
if _, ok := expr.LHS.(*influxql.BinaryExpr); ok {
return nil
} else if _, ok := expr.RHS.(*influxql.BinaryExpr); ok {
return nil
}
// ensure one of the expressions is a VarRef, and make that the key.
key, ok := expr.LHS.(*influxql.VarRef)
value := expr.RHS
if !ok {
key, ok = expr.RHS.(*influxql.VarRef)
if !ok {
return nil
}
value = expr.LHS
}
// check the key for situations we know we can't filter.
if key.Val != "_name" && key.Type != influxql.Unknown && key.Type != influxql.Tag {
return nil
}
// scrutinize the value to return an efficient guard.
switch value := value.(type) {
case *influxql.StringLiteral:
val := []byte(value.Val)
g := &exprGuard{tagMatches: &tagGuard{
meas: key.Val == "_name",
key: []byte(key.Val),
}}
switch expr.Op {
case influxql.EQ:
g.tagMatches.op = func(x []byte) bool { return bytes.Equal(val, x) }
case influxql.NEQ:
g.tagMatches.op = func(x []byte) bool { return !bytes.Equal(val, x) }
default: // any other operator isn't valid. conservatively match everything.
return nil
}
return g
case *influxql.RegexLiteral:
// There's a tradeoff between being precise and being fast. For example, if the
// delete includes a very expensive regex, we don't want to run that against every
// incoming point. The decision here is to match any point that has a possibly
// expensive match if there is any overlap on the tags. In other words, expensive
// matches get transformed into trivially matching everything.
return &exprGuard{tagExists: map[string]struct{}{key.Val: {}}}
case *influxql.VarRef:
// We could do a better job here by encoding the two names and checking the points
// against them, but I'm not quite sure how to do that. Be conservative and match
// any points that contain either the key or value.
// since every point has a measurement, always match if either are on the measurement.
if key.Val == "_name" || value.Val == "_name" {
return nil
}
return &exprGuard{tagExists: map[string]struct{}{
key.Val: {},
value.Val: {},
}}
default: // any other value type matches everything
return nil
}
}
// matches checks if the exprGuard matches the point.
func (g *exprGuard) matches(pt models.Point) bool {
switch {
case g == nil:
return true
case g.and != nil:
return g.and[0].matches(pt) && g.and[1].matches(pt)
case g.or != nil:
return g.or[0].matches(pt) || g.or[1].matches(pt)
case g.tagMatches != nil:
if g.tagMatches.meas {
return g.tagMatches.op(pt.Name())
}
for _, tag := range pt.Tags() {
if bytes.Equal(tag.Key, g.tagMatches.key) && g.tagMatches.op(tag.Value) {
return true
}
}
return false
case g.tagExists != nil:
for _, tag := range pt.Tags() {
if _, ok := g.tagExists[string(tag.Key)]; ok {
return true
}
}
return false
default:
return false
}
}

tsdb/guard_test.go (new file, +314)

@@ -0,0 +1,314 @@
package tsdb
import (
"testing"
"time"
"github.com/davecgh/go-spew/spew"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxql"
)
func TestGuard(t *testing.T) {
tests := []struct {
min, max int64
names []string
expr string
point string
matches bool
}{
{ // in time matching
min: 0, max: 1000,
point: "cpu value=1 100",
matches: true,
},
{ // out of time range doesn't match
min: 0, max: 10,
names: []string{"cpu"},
point: "cpu value=1 100",
matches: false,
},
{ // measurement name matches
min: 0, max: 1000,
names: []string{"cpu"},
point: "cpu value=1 100",
matches: true,
},
{ // measurement doesn't match
min: 0, max: 1000,
names: []string{"mem"},
point: "cpu value=1 100",
matches: false,
},
{ // basic expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1'",
matches: true,
},
{ // basic expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host != 'server2'",
matches: true,
},
{ // basic expression mismatch
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server2'",
matches: false,
},
{ // basic expression mismatch
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host != 'server1'",
matches: false,
},
{ // parenthesis unwrap
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "(host = 'server1')",
matches: true,
},
{ // compound expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server2' or host = 'server1'",
matches: true,
},
{ // compound expression mismatch
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' and host = 'server2'",
matches: false,
},
{ // regex expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host =~ /server1/",
matches: true,
},
{ // regex expression mismatch
min: 0, max: 1000,
point: "cpu,foo=server1 value=1 100",
expr: "host =~ /server1/",
matches: false,
},
{ // regex over-approximation
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host =~ /server2/",
matches: true,
},
{ // regex over-approximation
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host !~ /server1/",
matches: true,
},
{ // key doesn't have to come first
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "'server1' = host",
matches: true,
},
{ // key doesn't have to come first
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "'server2' = host",
matches: false,
},
{ // conservative on no var refs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "1 = 2",
matches: true,
},
{ // expr matches measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "_name = 'cpu'",
matches: true,
},
{ // expr mismatches measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "_name = 'mem'",
matches: false,
},
{ // expr conservative on dual var ref
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = test",
matches: true,
},
{ // expr conservative on dual var ref mismatches
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "foo = bar",
matches: false,
},
{ // expr conservative on dual var ref involving measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "_name = host",
matches: true,
},
{ // expr conservative on dual var ref involving measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = _name",
matches: true,
},
{ // boolean literal matches
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "true",
matches: true,
},
{ // boolean literal mismatches
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "false",
matches: false,
},
{ // reduce and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "true and host = 'server1'",
matches: true,
},
{ // reduce and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' and true",
matches: true,
},
{ // reduce or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "false or host = 'server1'",
matches: true,
},
{ // reduce or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' or false",
matches: true,
},
{ // short circuit and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "false and host = 'server1'",
matches: false,
},
{ // short circuit and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' and false",
matches: false,
},
{ // short circuit or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "true or host = 'server2'",
matches: true,
},
{ // short circuit or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server2' or true",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "'wierd'",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "value::field = '1'",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host <= 'aaa'",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = ('server2')",
matches: true,
},
}
for i, test := range tests {
var expr influxql.Expr
if test.expr != "" {
var err error
expr, err = influxql.ParseExpr(test.expr)
if err != nil {
t.Fatal(err)
}
}
points, err := models.ParsePointsString(test.point)
if err != nil {
t.Fatal(err)
}
guard := newGuard(test.min, test.max, test.names, expr)
if guard.Matches(points) != test.matches {
t.Errorf("%d: expected matching %q with time:[%d, %d] measurements:%v expr:%q to be %t",
i, test.point, test.min, test.max, test.names, test.expr, test.matches)
cs := &spew.ConfigState{DisableMethods: true, SpewKeys: true, Indent: " "}
t.Errorf("%d: expr: %s", i, cs.Sdump(expr))
t.Errorf("%d: guard: %s", i, cs.Sdump(guard.expr))
}
}
}
func BenchmarkGuard(b *testing.B) {
tag := func(key, value string) models.Tag {
return models.Tag{Key: []byte(key), Value: []byte(value)}
}
run := func(b *testing.B, g *guard) {
run := func(b *testing.B, batch int) {
points := make([]models.Point, batch)
for i := range points {
points[i] = models.MustNewPoint("cpu", models.Tags{
tag("t0", "v0"), tag("t1", "v1"), tag("t2", "v2"),
tag("t3", "v3"), tag("t4", "v4"), tag("t5", "v5"),
tag("t6", "v6"), tag("t7", "v7"), tag("t8", "v8"),
}, models.Fields{"value": 100}, time.Unix(0, 50))
}
for i := 0; i < b.N; i++ {
if g.Matches(points) {
b.Fatal("matched")
}
}
}
b.Run("1", func(b *testing.B) { run(b, 1) })
b.Run("100", func(b *testing.B) { run(b, 100) })
b.Run("10000", func(b *testing.B) { run(b, 10000) })
}
b.Run("Time Filtered", func(b *testing.B) {
run(b, newGuard(0, 10, nil, nil))
})
b.Run("Measurement Filtered", func(b *testing.B) {
run(b, newGuard(0, 100, []string{"mem"}, nil))
})
b.Run("Tag Filtered", func(b *testing.B) {
expr, _ := influxql.ParseExpr("t4 = 'v5'")
run(b, newGuard(0, 100, []string{"cpu"}, expr))
})
}

tsdb/store.go (modified)

@@ -88,6 +88,10 @@ type Store struct {
// This prevents new shards from being created while old ones are being deleted.
pendingShardDeletes map[uint64]struct{}
// Epoch tracker helps serialize writes and deletes that may conflict. It
// is stored by shard.
epochs map[uint64]*epochTracker
EngineOptions EngineOptions
baseLogger *zap.Logger
@@ -108,6 +112,7 @@ func NewStore(path string) *Store {
sfiles: make(map[string]*SeriesFile),
indexes: make(map[string]interface{}),
pendingShardDeletes: make(map[uint64]struct{}),
epochs: make(map[uint64]*epochTracker),
EngineOptions: NewEngineOptions(),
Logger: logger,
baseLogger: logger,
@@ -412,6 +417,7 @@ func (s *Store) loadShards() error {
continue
}
s.shards[res.s.id] = res.s
s.epochs[res.s.id] = newEpochTracker()
if _, ok := s.databases[res.s.database]; !ok {
s.databases[res.s.database] = new(databaseState)
}
@@ -633,6 +639,7 @@ func (s *Store) CreateShard(database, retentionPolicy string, shardID uint64, en
}
s.shards[shardID] = shard
s.epochs[shardID] = newEpochTracker()
if _, ok := s.databases[database]; !ok {
s.databases[database] = new(databaseState)
}
@@ -690,6 +697,7 @@ func (s *Store) DeleteShard(shardID uint64) error {
return nil
}
delete(s.shards, shardID)
delete(s.epochs, shardID)
s.pendingShardDeletes[shardID] = struct{}{}
db := sh.Database()
@@ -828,6 +836,7 @@ func (s *Store) DeleteDatabase(name string) error {
for _, sh := range shards {
delete(s.shards, sh.id)
delete(s.epochs, sh.id)
}
// Remove database from store list of databases
@@ -911,6 +920,13 @@ func (s *Store) DeleteMeasurement(database, name string) error {
limit.Take()
defer limit.Release()
// install our guard and wait for any prior writes to finish. the
// guard ensures future writes that could conflict wait for us.
guard := newGuard(influxql.MinTime, influxql.MaxTime, []string{name}, nil)
waiter := s.epochs[sh.id].WaitDelete(guard)
waiter.Wait()
defer waiter.Done()
return sh.DeleteMeasurement([]byte(name))
})
}
@@ -1294,6 +1310,12 @@ func (s *Store) DeleteSeries(database string, sources []influxql.Source, conditi
limit.Take()
defer limit.Release()
// install our guard and wait for any prior writes to finish. the
// guard ensures future writes that could conflict wait for us.
waiter := s.epochs[sh.id].WaitDelete(newGuard(min, max, names, condition))
waiter.Wait()
defer waiter.Done()
index, err := sh.Index()
if err != nil {
return err
@@ -1347,6 +1369,17 @@ func (s *Store) WriteToShard(shardID uint64, points []models.Point) error {
}
s.mu.RUnlock()
// enter the epoch tracker
guards, gen := s.epochs[shardID].StartWrite()
defer s.epochs[shardID].EndWrite(gen)
// wait for any guards before writing the points.
for _, guard := range guards {
if guard.Matches(points) {
guard.Wait()
}
}
// Ensure snapshot compactions are enabled since the shard might have been cold
// and disabled by the monitor.
if sh.IsIdle() {