tsdb: conflict-based concurrency resolution

There are problematic races when deletes run concurrently with
writes to the same points. This change introduces guards and an
epoch-based system to coordinate these modifications.

A guard matches a point based on its time, its measurement name,
and conditions derived from an influxql expression. The intent is
to be as precise as possible without allowing any false negatives:
if a point would be deleted, the guard must match it. We are
allowed to match more points than necessary, at the cost of
slowing down writes.
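
As a rough sketch of that contract (illustrative only, written as if
inside package tsdb and using the newGuard, Matches, Wait, and Done
identifiers added in tsdb/guard.go below):

    // A delete of cpu data in [0, 1000] matching a condition installs a guard.
    cond, _ := influxql.ParseExpr("host = 'server1'")
    g := newGuard(0, 1000, []string{"cpu"}, cond)

    // A write checks its batch against the guard and blocks only on a conflict.
    batch, _ := models.ParsePointsString("cpu,host=server1 value=1 100")
    if g.Matches(batch) {
        g.Wait() // released once the delete calls g.Done()
    }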

The epoch-based system keeps track of outstanding writes and
deletes and their associated guards. When a delete operation
starts, it installs its guard and waits until all in-flight
writes are done; the guard then blocks any future write that
contains points which may conflict with the delete. This allows
writes to disjoint points to proceed uncontended, and the
implementation is optimized for the case where few deletes are
outstanding. For example, when there are no deletes, a write only
has to take a mutex, bump a counter, and compare a value against
zero. The epoch trackers are per shard, so different shards never
contend with one another.
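
A condensed sketch of the two paths (illustrative only): tracker stands
in for a shard's epochTracker, which the store changes at the end of
this diff access as s.epochs[shardID], and points, min, max, names, and
condition are placeholders for the operation's arguments.

    // Write path, mirroring WriteToShard:
    guards, gen := tracker.StartWrite()
    defer tracker.EndWrite(gen)
    for _, g := range guards {
        if g.Matches(points) {
            g.Wait() // block only on deletes whose guards could match these points
        }
    }
    // ... append the points to the shard ...

    // Delete path, mirroring DeleteSeries:
    waiter := tracker.WaitDelete(newGuard(min, max, names, condition))
    waiter.Wait()       // all writes that started before the delete have finished
    defer waiter.Done() // drops the guard, releasing any writes blocked on it
    // ... perform the delete ...
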
pull/10516/head
Jeff Wendling 2018-11-21 16:56:20 -07:00
parent 030adf4bd5
commit 4cad51a604
5 changed files with 888 additions and 0 deletions

tsdb/epoch_tracker.go (new file, +147)

@@ -0,0 +1,147 @@
package tsdb
import (
"sync"
)
// TODO(jeff): using a mutex is easiest, but there may be a way to do
// this with atomics only, and in a way such that writes are minimally
// blocked.
// epochTracker keeps track of epochs for write and delete operations
// allowing a delete to block until all previous writes have completed.
type epochTracker struct {
mu sync.Mutex
epoch uint64 // current epoch
largest uint64 // largest delete possible
writes int64 // pending writes
// pending deletes waiting on writes
deletes map[uint64]*epochDeleteState
}
// newEpochTracker constructs an epochTracker.
func newEpochTracker() *epochTracker {
return &epochTracker{
deletes: make(map[uint64]*epochDeleteState),
}
}
// epochDeleteState keeps track of the state for a pending delete.
type epochDeleteState struct {
cond *sync.Cond
guard *guard
pending int64
}
// done signals that an earlier write has finished.
func (e *epochDeleteState) done() {
e.cond.L.Lock()
e.pending--
if e.pending == 0 {
e.cond.Broadcast()
}
e.cond.L.Unlock()
}
// Wait blocks until all earlier writes have finished.
func (e *epochDeleteState) Wait() {
e.cond.L.Lock()
for e.pending > 0 {
e.cond.Wait()
}
e.cond.L.Unlock()
}
// next bumps the epoch and returns it.
func (e *epochTracker) next() uint64 {
e.epoch++
return e.epoch
}
// StartWrite should be called before a write is going to start, and after
// it has checked for guards.
func (e *epochTracker) StartWrite() ([]*guard, uint64) {
e.mu.Lock()
gen := e.next()
e.writes++
if len(e.deletes) == 0 {
e.mu.Unlock()
return nil, gen
}
guards := make([]*guard, 0, len(e.deletes))
for _, state := range e.deletes {
guards = append(guards, state.guard)
}
e.mu.Unlock()
return guards, gen
}
// EndWrite should be called when the write ends for any reason.
func (e *epochTracker) EndWrite(gen uint64) {
e.mu.Lock()
if gen <= e.largest {
// TODO(jeff): at the cost of making waitDelete more
// complicated, we can keep a sorted slice which would
// allow this to exit early rather than go over the
// whole map.
for dgen, state := range e.deletes {
if gen > dgen {
continue
}
state.done()
}
}
e.writes--
e.mu.Unlock()
}
// epochWaiter is a type that can be waited on for prior writes to finish.
type epochWaiter struct {
gen uint64
guard *guard
state *epochDeleteState
tracker *epochTracker
}
// Wait blocks until all writes prior to the creation of the waiter finish.
func (e epochWaiter) Wait() {
if e.state == nil || e.tracker == nil {
return
}
e.state.Wait()
}
// Done marks the delete as completed, removing its guard.
func (e epochWaiter) Done() {
e.tracker.mu.Lock()
delete(e.tracker.deletes, e.gen)
e.tracker.mu.Unlock()
e.guard.Done()
}
// WaitDelete should be called after any delete guards have been installed.
// The returned epochWaiter will not be affected by any future writes.
func (e *epochTracker) WaitDelete(guard *guard) epochWaiter {
e.mu.Lock()
state := &epochDeleteState{
pending: e.writes,
cond: sync.NewCond(new(sync.Mutex)),
guard: guard,
}
// record our pending delete
gen := e.next()
e.largest = gen
e.deletes[gen] = state
e.mu.Unlock()
return epochWaiter{
gen: gen,
guard: guard,
state: state,
tracker: e,
}
}

tsdb/epoch_tracker_test.go (new file, +141)

@@ -0,0 +1,141 @@
package tsdb
import (
"testing"
"time"
)
func TestEpochTracker(t *testing.T) {
t.Run("Delete waits", func(t *testing.T) {
tr := newEpochTracker()
// delete should proceed with no pending writes
waiter := tr.WaitDelete(newGuard(0, 0, nil, nil))
waiter.Wait()
waiter.Done()
for i := 0; i < 1000; i++ {
// start up some writes
_, w1 := tr.StartWrite()
_, w2 := tr.StartWrite()
_, w3 := tr.StartWrite()
// wait for a delete. this time based stuff isn't sufficient
// to check every problem, but it can catch some.
waiter := tr.WaitDelete(nil)
done := make(chan time.Time, 1)
go func() { waiter.Wait(); done <- time.Now() }()
// future writes should not block the waiter
_, w4 := tr.StartWrite()
// ending the writes allows the waiter to proceed
tr.EndWrite(w1)
tr.EndWrite(w2)
now := time.Now()
tr.EndWrite(w3)
if (<-done).Before(now) {
t.Fatal("Wait ended too soon")
}
tr.EndWrite(w4)
}
})
t.Run("Guards tracked", func(t *testing.T) {
checkGuards := func(got []*guard, exp ...*guard) {
t.Helper()
if len(exp) != len(got) {
t.Fatalf("invalid: %p != %p", exp, got)
}
next:
for _, g1 := range got {
for _, g2 := range exp {
if g1 == g2 {
continue next
}
}
t.Fatalf("invalid: %p != %p", exp, got)
}
}
tr := newEpochTracker()
g1, g2, g3 := newGuard(0, 0, nil, nil), newGuard(0, 0, nil, nil), newGuard(0, 0, nil, nil)
guards, _ := tr.StartWrite()
checkGuards(guards)
d1 := tr.WaitDelete(g1)
guards, _ = tr.StartWrite()
checkGuards(guards, g1)
d2 := tr.WaitDelete(g2)
guards, _ = tr.StartWrite()
checkGuards(guards, g1, g2)
d3 := tr.WaitDelete(g3)
guards, _ = tr.StartWrite()
checkGuards(guards, g1, g2, g3)
d2.Done()
guards, _ = tr.StartWrite()
checkGuards(guards, g1, g3)
d1.Done()
guards, _ = tr.StartWrite()
checkGuards(guards, g3)
d3.Done()
guards, _ = tr.StartWrite()
checkGuards(guards)
})
}
func BenchmarkEpochTracker(b *testing.B) {
b.Run("Writes with deletes", func(b *testing.B) {
b.Run("Serial", func(b *testing.B) {
run := func(b *testing.B, deletes int) {
tr := newEpochTracker()
tr.StartWrite()
for i := 0; i < deletes; i++ {
tr.WaitDelete(nil)
}
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, gen := tr.StartWrite()
tr.EndWrite(gen)
}
}
b.Run("0", func(b *testing.B) { run(b, 0) })
b.Run("1", func(b *testing.B) { run(b, 1) })
b.Run("10", func(b *testing.B) { run(b, 10) })
b.Run("100", func(b *testing.B) { run(b, 100) })
})
b.Run("Parallel", func(b *testing.B) {
run := func(b *testing.B, deletes int) {
tr := newEpochTracker()
tr.StartWrite()
for i := 0; i < deletes; i++ {
tr.WaitDelete(nil)
}
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
_, gen := tr.StartWrite()
tr.EndWrite(gen)
}
})
}
b.Run("0", func(b *testing.B) { run(b, 0) })
b.Run("1", func(b *testing.B) { run(b, 1) })
b.Run("10", func(b *testing.B) { run(b, 10) })
b.Run("100", func(b *testing.B) { run(b, 100) })
})
})
}

tsdb/guard.go (new file, +253)

@@ -0,0 +1,253 @@
package tsdb
import (
"bytes"
"sync"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxql"
)
// guard lets one match a set of points and block until they are done.
type guard struct {
cond *sync.Cond
done bool
min int64
max int64
names map[string]struct{}
expr *exprGuard
}
// newGuard constructs a guard that will match any points in the given min and max
// time range, with the given set of measurement names, or the given expression.
// The expression is optional.
func newGuard(min, max int64, names []string, expr influxql.Expr) *guard {
set := make(map[string]struct{}, len(names))
for _, name := range names {
set[name] = struct{}{}
}
return &guard{
cond: sync.NewCond(new(sync.Mutex)),
min: min,
max: max,
names: set,
expr: newExprGuard(expr),
}
}
// Matches returns true if any of the points match the guard.
func (g *guard) Matches(points []models.Point) bool {
if g == nil {
return true
}
for _, pt := range points {
if t := pt.Time().UnixNano(); t < g.min || t > g.max {
continue
}
if len(g.names) == 0 && g.expr.matches(pt) {
return true
} else if _, ok := g.names[string(pt.Name())]; ok && g.expr.matches(pt) {
return true
}
}
return false
}
// Wait blocks until the guard has been marked Done.
func (g *guard) Wait() {
g.cond.L.Lock()
for !g.done {
g.cond.Wait()
}
g.cond.L.Unlock()
}
// Done signals to anyone waiting on the guard that they can proceed.
func (g *guard) Done() {
g.cond.L.Lock()
g.done = true
g.cond.Broadcast()
g.cond.L.Unlock()
}
// exprGuard is a union of influxql.Expr based guards. a nil exprGuard matches
// everything, while the zero value matches nothing.
type exprGuard struct {
and *[2]*exprGuard
or *[2]*exprGuard
tagMatches *tagGuard
tagExists map[string]struct{}
}
type tagGuard struct {
meas bool
key []byte
op func([]byte) bool
}
// empty returns true if the exprGuard is empty, meaning that it matches no points.
func (e *exprGuard) empty() bool {
return e != nil && e.and == nil && e.or == nil && e.tagMatches == nil && e.tagExists == nil
}
// newExprGuard scrutinizes the expression and returns an efficient guard.
func newExprGuard(expr influxql.Expr) *exprGuard {
if expr == nil {
return nil
}
switch expr := expr.(type) {
case *influxql.ParenExpr:
return newExprGuard(expr.Expr)
case *influxql.BooleanLiteral:
if expr.Val {
return nil // matches everything
}
return new(exprGuard) // matches nothing
case *influxql.BinaryExpr:
switch expr.Op {
case influxql.AND:
lhs, rhs := newExprGuard(expr.LHS), newExprGuard(expr.RHS)
if lhs == nil { // reduce
return rhs
} else if rhs == nil { // reduce
return lhs
} else if lhs.empty() || rhs.empty() { // short circuit
return new(exprGuard)
} else {
return &exprGuard{and: &[2]*exprGuard{lhs, rhs}}
}
case influxql.OR:
lhs, rhs := newExprGuard(expr.LHS), newExprGuard(expr.RHS)
if lhs.empty() { // reduce
return rhs
} else if rhs.empty() { // reduce
return lhs
} else if lhs == nil || rhs == nil { // short circuit
return nil
} else {
return &exprGuard{or: &[2]*exprGuard{lhs, rhs}}
}
default:
return newBinaryExprGuard(expr)
}
default:
// if we couldn't analyze, match everything
return nil
}
}
// newBinaryExprGuard scrutinizes the binary expression and returns an efficient guard.
func newBinaryExprGuard(expr *influxql.BinaryExpr) *exprGuard {
// if it's a nested binary expression, always match.
if _, ok := expr.LHS.(*influxql.BinaryExpr); ok {
return nil
} else if _, ok := expr.RHS.(*influxql.BinaryExpr); ok {
return nil
}
// ensure one of the expressions is a VarRef, and make that the key.
key, ok := expr.LHS.(*influxql.VarRef)
value := expr.RHS
if !ok {
key, ok = expr.RHS.(*influxql.VarRef)
if !ok {
return nil
}
value = expr.LHS
}
// check the key for situations we know we can't filter.
if key.Val != "_name" && key.Type != influxql.Unknown && key.Type != influxql.Tag {
return nil
}
// scrutinize the value to return an efficient guard.
switch value := value.(type) {
case *influxql.StringLiteral:
val := []byte(value.Val)
g := &exprGuard{tagMatches: &tagGuard{
meas: key.Val == "_name",
key: []byte(key.Val),
}}
switch expr.Op {
case influxql.EQ:
g.tagMatches.op = func(x []byte) bool { return bytes.Equal(val, x) }
case influxql.NEQ:
g.tagMatches.op = func(x []byte) bool { return !bytes.Equal(val, x) }
default: // any other operator isn't valid. conservatively match everything.
return nil
}
return g
case *influxql.RegexLiteral:
// There's a tradeoff between being precise and being fast. For example, if the
// delete includes a very expensive regex, we don't want to run that against every
// incoming point. The decision here is to match any point that has a possibly
// expensive match if there is any overlap on the tags. In other words, expensive
// matches get transformed into trivially matching everything.
return &exprGuard{tagExists: map[string]struct{}{key.Val: {}}}
case *influxql.VarRef:
// We could do a better job here by encoding the two names and checking the points
// against them, but I'm not quite sure how to do that. Be conservative and match
// any points that contain either the key or value.
// since every point has a measurement, always match if either are on the measurement.
if key.Val == "_name" || value.Val == "_name" {
return nil
}
return &exprGuard{tagExists: map[string]struct{}{
key.Val: {},
value.Val: {},
}}
default: // any other value type matches everything
return nil
}
}
// matches checks if the exprGuard matches the point.
func (g *exprGuard) matches(pt models.Point) bool {
switch {
case g == nil:
return true
case g.and != nil:
return g.and[0].matches(pt) && g.and[1].matches(pt)
case g.or != nil:
return g.or[0].matches(pt) || g.or[1].matches(pt)
case g.tagMatches != nil:
if g.tagMatches.meas {
return g.tagMatches.op(pt.Name())
}
for _, tag := range pt.Tags() {
if bytes.Equal(tag.Key, g.tagMatches.key) && g.tagMatches.op(tag.Value) {
return true
}
}
return false
case g.tagExists != nil:
for _, tag := range pt.Tags() {
if _, ok := g.tagExists[string(tag.Key)]; ok {
return true
}
}
return false
default:
return false
}
}

tsdb/guard_test.go (new file, +314)

@@ -0,0 +1,314 @@
package tsdb
import (
"testing"
"time"
"github.com/davecgh/go-spew/spew"
"github.com/influxdata/influxdb/models"
"github.com/influxdata/influxql"
)
func TestGuard(t *testing.T) {
tests := []struct {
min, max int64
names []string
expr string
point string
matches bool
}{
{ // in time matching
min: 0, max: 1000,
point: "cpu value=1 100",
matches: true,
},
{ // out of time range doesn't match
min: 0, max: 10,
names: []string{"cpu"},
point: "cpu value=1 100",
matches: false,
},
{ // measurement name matches
min: 0, max: 1000,
names: []string{"cpu"},
point: "cpu value=1 100",
matches: true,
},
{ // measurement doesn't match
min: 0, max: 1000,
names: []string{"mem"},
point: "cpu value=1 100",
matches: false,
},
{ // basic expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1'",
matches: true,
},
{ // basic expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host != 'server2'",
matches: true,
},
{ // basic expression mismatch
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server2'",
matches: false,
},
{ // basic expression mismatch
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host != 'server1'",
matches: false,
},
{ // parenthesis unwrap
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "(host = 'server1')",
matches: true,
},
{ // compound expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server2' or host = 'server1'",
matches: true,
},
{ // compound expression mismatch
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' and host = 'server2'",
matches: false,
},
{ // regex expression matching
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host =~ /server1/",
matches: true,
},
{ // regex expression mismatch
min: 0, max: 1000,
point: "cpu,foo=server1 value=1 100",
expr: "host =~ /server1/",
matches: false,
},
{ // regex over-approximation
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host =~ /server2/",
matches: true,
},
{ // regex over-approximation
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host !~ /server1/",
matches: true,
},
{ // key doesn't have to come first
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "'server1' = host",
matches: true,
},
{ // key doesn't have to come first
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "'server2' = host",
matches: false,
},
{ // conservative on no var refs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "1 = 2",
matches: true,
},
{ // expr matches measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "_name = 'cpu'",
matches: true,
},
{ // expr mismatches measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "_name = 'mem'",
matches: false,
},
{ // expr conservative on dual var ref
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = test",
matches: true,
},
{ // expr conservative on dual var ref mismatches
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "foo = bar",
matches: false,
},
{ // expr conservative on dual var ref involving measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "_name = host",
matches: true,
},
{ // expr conservative on dual var ref involving measurement
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = _name",
matches: true,
},
{ // boolean literal matches
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "true",
matches: true,
},
{ // boolean literal mismatches
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "false",
matches: false,
},
{ // reduce and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "true and host = 'server1'",
matches: true,
},
{ // reduce and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' and true",
matches: true,
},
{ // reduce or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "false or host = 'server1'",
matches: true,
},
{ // reduce or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' or false",
matches: true,
},
{ // short circuit and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "false and host = 'server1'",
matches: false,
},
{ // short circuit and
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server1' and false",
matches: false,
},
{ // short circuit or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "true or host = 'server2'",
matches: true,
},
{ // short circuit or
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = 'server2' or true",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "'wierd'",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "value::field = '1'",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host <= 'aaa'",
matches: true,
},
{ // conservative match weird exprs
min: 0, max: 1000,
point: "cpu,host=server1 value=1 100",
expr: "host = ('server2')",
matches: true,
},
}
for i, test := range tests {
var expr influxql.Expr
if test.expr != "" {
var err error
expr, err = influxql.ParseExpr(test.expr)
if err != nil {
t.Fatal(err)
}
}
points, err := models.ParsePointsString(test.point)
if err != nil {
t.Fatal(err)
}
guard := newGuard(test.min, test.max, test.names, expr)
if guard.Matches(points) != test.matches {
t.Errorf("%d: expected matching %q with time:[%d, %d] measurements:%v expr:%q to be %t",
i, test.point, test.min, test.max, test.names, test.expr, test.matches)
cs := &spew.ConfigState{DisableMethods: true, SpewKeys: true, Indent: " "}
t.Errorf("%d: expr: %s", i, cs.Sdump(expr))
t.Errorf("%d: guard: %s", i, cs.Sdump(guard.expr))
}
}
}
func BenchmarkGuard(b *testing.B) {
tag := func(key, value string) models.Tag {
return models.Tag{Key: []byte(key), Value: []byte(value)}
}
run := func(b *testing.B, g *guard) {
run := func(b *testing.B, batch int) {
points := make([]models.Point, batch)
for i := range points {
points[i] = models.MustNewPoint("cpu", models.Tags{
tag("t0", "v0"), tag("t1", "v1"), tag("t2", "v2"),
tag("t3", "v3"), tag("t4", "v4"), tag("t5", "v5"),
tag("t6", "v6"), tag("t7", "v7"), tag("t8", "v8"),
}, models.Fields{"value": 100}, time.Unix(0, 50))
}
for i := 0; i < b.N; i++ {
if g.Matches(points) {
b.Fatal("matched")
}
}
}
b.Run("1", func(b *testing.B) { run(b, 1) })
b.Run("100", func(b *testing.B) { run(b, 100) })
b.Run("10000", func(b *testing.B) { run(b, 10000) })
}
b.Run("Time Filtered", func(b *testing.B) {
run(b, newGuard(0, 10, nil, nil))
})
b.Run("Measurement Filtered", func(b *testing.B) {
run(b, newGuard(0, 100, []string{"mem"}, nil))
})
b.Run("Tag Filtered", func(b *testing.B) {
expr, _ := influxql.ParseExpr("t4 = 'v5'")
run(b, newGuard(0, 100, []string{"cpu"}, expr))
})
}

tsdb/store.go (modified)

@@ -88,6 +88,10 @@ type Store struct {
// This prevents new shards from being created while old ones are being deleted.
pendingShardDeletes map[uint64]struct{}
// Epoch tracker helps serialize writes and deletes that may conflict. It
// is stored by shard.
epochs map[uint64]*epochTracker
EngineOptions EngineOptions
baseLogger *zap.Logger
@@ -108,6 +112,7 @@ func NewStore(path string) *Store {
sfiles: make(map[string]*SeriesFile),
indexes: make(map[string]interface{}),
pendingShardDeletes: make(map[uint64]struct{}),
epochs: make(map[uint64]*epochTracker),
EngineOptions: NewEngineOptions(),
Logger: logger,
baseLogger: logger,
@@ -412,6 +417,7 @@ func (s *Store) loadShards() error {
continue
}
s.shards[res.s.id] = res.s
s.epochs[res.s.id] = newEpochTracker()
if _, ok := s.databases[res.s.database]; !ok {
s.databases[res.s.database] = new(databaseState)
}
@@ -633,6 +639,7 @@ func (s *Store) CreateShard(database, retentionPolicy string, shardID uint64, en
}
s.shards[shardID] = shard
s.epochs[shardID] = newEpochTracker()
if _, ok := s.databases[database]; !ok {
s.databases[database] = new(databaseState)
}
@@ -690,6 +697,7 @@ func (s *Store) DeleteShard(shardID uint64) error {
return nil
}
delete(s.shards, shardID)
delete(s.epochs, shardID)
s.pendingShardDeletes[shardID] = struct{}{}
db := sh.Database()
@@ -828,6 +836,7 @@ func (s *Store) DeleteDatabase(name string) error {
for _, sh := range shards {
delete(s.shards, sh.id)
delete(s.epochs, sh.id)
}
// Remove database from store list of databases
@@ -911,6 +920,13 @@ func (s *Store) DeleteMeasurement(database, name string) error {
limit.Take()
defer limit.Release()
// install our guard and wait for any prior writes to finish. the
// guard ensures future writes that could conflict wait for us.
guard := newGuard(influxql.MinTime, influxql.MaxTime, []string{name}, nil)
waiter := s.epochs[sh.id].WaitDelete(guard)
waiter.Wait()
defer waiter.Done()
return sh.DeleteMeasurement([]byte(name))
})
}
@@ -1294,6 +1310,12 @@ func (s *Store) DeleteSeries(database string, sources []influxql.Source, conditi
limit.Take()
defer limit.Release()
// install our guard and wait for any prior writes to finish. the
// guard ensures future writes that could conflict wait for us.
waiter := s.epochs[sh.id].WaitDelete(newGuard(min, max, names, condition))
waiter.Wait()
defer waiter.Done()
index, err := sh.Index()
if err != nil {
return err
@@ -1347,6 +1369,17 @@ func (s *Store) WriteToShard(shardID uint64, points []models.Point) error {
}
s.mu.RUnlock()
// enter the epoch tracker
guards, gen := s.epochs[shardID].StartWrite()
defer s.epochs[shardID].EndWrite(gen)
// wait for any guards before writing the points.
for _, guard := range guards {
if guard.Matches(points) {
guard.Wait()
}
}
// Ensure snapshot compactions are enabled since the shard might have been cold
// and disabled by the monitor.
if sh.IsIdle() {