Merge pull request #8856 from influxdata/jw-cache

Snapshot compaction improvements
2017-09-22 10:45:54 -06:00 · 2017-09-22 10:45:54 -06:00 · 1e345aa7a1
parent ccb6f7451f 94aba64b88
commit 1e345aa7a1
19 changed files with 624 additions and 458 deletions
--- a/etc/config.sample.toml
+++ b/etc/config.sample.toml
@ -88,7 +88,8 @@
  # compact-full-write-cold-duration = "4h"

  # The maximum number of concurrent full and level compactions that can run at one time.  A
-  # value of 0 results in runtime.GOMAXPROCS(0) used at runtime.  This setting does not apply
+  # value of 0 results in 50% of runtime.GOMAXPROCS(0) used at runtime.  Any number greater
+  # than 0 limits compactions to that value.  This setting does not apply
  # to cache snapshotting.
  # max-concurrent-compactions = 0

@ -358,10 +359,10 @@
  # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max.
  # read-buffer = 0

-  # Multi-value plugins can be handled two ways.  
+  # Multi-value plugins can be handled two ways.
  # "split" will parse and store the multi-value plugin data into separate measurements
-  # "join" will parse and store the multi-value plugin as a single multi-value measurement.  
-  # "split" is the default behavior for backward compatability with previous versions of influxdb.  
+  # "join" will parse and store the multi-value plugin as a single multi-value measurement.
+  # "split" is the default behavior for backward compatability with previous versions of influxdb.
  # parse-multivalue-plugin = "split"
 ###
 ### [opentsdb]
--- a/pkg/limiter/fixed.go
+++ b/pkg/limiter/fixed.go
@ -10,10 +10,27 @@ func NewFixed(limit int) Fixed {
 	return make(Fixed, limit)
 }

+// Idle returns true if the limiter has all its capacity is available.
+func (t Fixed) Idle() bool {
+	return len(t) == cap(t)
+}
+
+// TryTake attempts to take a token and return true if successful, otherwise returns false.
+func (t Fixed) TryTake() bool {
+	select {
+	case t <- struct{}{}:
+		return true
+	default:
+		return false
+	}
+}
+
+// Take attempts to take a token and blocks until one is available.
 func (t Fixed) Take() {
 	t <- struct{}{}
 }

+// Release releases a token back to the limiter.
 func (t Fixed) Release() {
 	<-t
 }
--- a/tsdb/config.go
+++ b/tsdb/config.go
@ -49,7 +49,7 @@ const (
 	DefaultMaxValuesPerTag = 100000

 	// DefaultMaxConcurrentCompactions is the maximum number of concurrent full and level compactions
-	// that can run at one time.  A value of results in runtime.GOMAXPROCS(0) used at runtime.
+	// that can run at one time.  A value of 0 results in 50% of runtime.GOMAXPROCS(0) used at runtime.
 	DefaultMaxConcurrentCompactions = 0
 )

--- a/tsdb/engine.go
+++ b/tsdb/engine.go
@ -144,11 +144,13 @@ func NewEngine(id uint64, i Index, database, path string, walPath string, option

 // EngineOptions represents the options used to initialize the engine.
 type EngineOptions struct {
-	EngineVersion     string
-	IndexVersion      string
-	ShardID           uint64
-	InmemIndex        interface{} // shared in-memory index
-	CompactionLimiter limiter.Fixed
+	EngineVersion string
+	IndexVersion  string
+	ShardID       uint64
+	InmemIndex    interface{} // shared in-memory index
+
+	HiPriCompactionLimiter limiter.Fixed
+	LoPriCompactionLimiter limiter.Fixed

 	Config Config
 }
--- a/tsdb/engine/tsm1/cache.go
+++ b/tsdb/engine/tsm1/cache.go
@ -118,7 +118,7 @@ func (e *entry) deduplicate() {
 	e.mu.Lock()
 	defer e.mu.Unlock()

-	if len(e.values) == 0 {
+	if len(e.values) <= 1 {
 		return
 	}
 	e.values = e.values.Deduplicate()
@ -176,6 +176,8 @@ type storer interface {
 	apply(f func([]byte, *entry) error) error       // Apply f to all entries in the store in parallel.
 	applySerial(f func([]byte, *entry) error) error // Apply f to all entries in serial.
 	reset()                                         // Reset the store to an initial unused state.
+	split(n int) []storer                           // Split splits the store into n stores
+	count() int                                     // Count returns the number of keys in the store
 }

 // Cache maintains an in-memory store of Values for a set of keys.
@ -436,6 +438,15 @@ func (c *Cache) Deduplicate() {
 func (c *Cache) ClearSnapshot(success bool) {
 	c.init()

+	c.mu.RLock()
+	snapStore := c.snapshot.store
+	c.mu.RUnlock()
+
+	// reset the snapshot store outside of the write lock
+	if success {
+		snapStore.reset()
+	}
+
 	c.mu.Lock()
 	defer c.mu.Unlock()

@ -445,8 +456,7 @@ func (c *Cache) ClearSnapshot(success bool) {
 		c.snapshotAttempts = 0
 		c.updateMemSize(-int64(atomic.LoadUint64(&c.snapshotSize))) // decrement the number of bytes in cache

-		// Reset the snapshot's store, and reset the snapshot to a fresh Cache.
-		c.snapshot.store.reset()
+		// Reset the snapshot to a fresh Cache.
 		c.snapshot = &Cache{
 			store: c.snapshot.store,
 		}
@ -477,6 +487,13 @@ func (c *Cache) MaxSize() uint64 {
 	return c.maxSize
 }

+func (c *Cache) Count() int {
+	c.mu.RLock()
+	n := c.store.count()
+	c.mu.RUnlock()
+	return n
+}
+
 // Keys returns a sorted slice of all keys under management by the cache.
 func (c *Cache) Keys() [][]byte {
 	c.mu.RLock()
@ -485,6 +502,21 @@ func (c *Cache) Keys() [][]byte {
 	return store.keys(true)
 }

+func (c *Cache) Split(n int) []*Cache {
+	if n == 1 {
+		return []*Cache{c}
+	}
+
+	caches := make([]*Cache, n)
+	storers := c.store.split(n)
+	for i := 0; i < n; i++ {
+		caches[i] = &Cache{
+			store: storers[i],
+		}
+	}
+	return caches
+}
+
 // unsortedKeys returns a slice of all keys under management by the cache. The
 // keys are not sorted.
 func (c *Cache) unsortedKeys() [][]byte {
@ -765,3 +797,5 @@ func (e emptyStore) keys(sorted bool) [][]byte                      { return nil
 func (e emptyStore) apply(f func([]byte, *entry) error) error       { return nil }
 func (e emptyStore) applySerial(f func([]byte, *entry) error) error { return nil }
 func (e emptyStore) reset()                                         {}
+func (e emptyStore) split(n int) []storer                           { return nil }
+func (e emptyStore) count() int                                     { return 0 }
--- a/tsdb/engine/tsm1/cache_test.go
+++ b/tsdb/engine/tsm1/cache_test.go
@ -759,6 +759,45 @@ func TestCacheLoader_LoadDeleted(t *testing.T) {
 	}
 }

+func TestCache_Split(t *testing.T) {
+	v0 := NewValue(1, 1.0)
+	v1 := NewValue(2, 2.0)
+	v2 := NewValue(3, 3.0)
+	values := Values{v0, v1, v2}
+	valuesSize := uint64(v0.Size() + v1.Size() + v2.Size())
+
+	c := NewCache(0, "")
+
+	if err := c.Write([]byte("foo"), values); err != nil {
+		t.Fatalf("failed to write key foo to cache: %s", err.Error())
+	}
+	if err := c.Write([]byte("bar"), values); err != nil {
+		t.Fatalf("failed to write key foo to cache: %s", err.Error())
+	}
+
+	if err := c.Write([]byte("baz"), values); err != nil {
+		t.Fatalf("failed to write key foo to cache: %s", err.Error())
+	}
+
+	if n := c.Size(); n != 3*valuesSize+9 {
+		t.Fatalf("cache size incorrect after 3 writes, exp %d, got %d", 3*valuesSize*9, n)
+	}
+
+	splits := c.Split(3)
+	keys := make(map[string]int)
+	for _, s := range splits {
+		for _, k := range s.Keys() {
+			keys[string(k)] = s.Values(k).Size()
+		}
+	}
+
+	for _, key := range []string{"foo", "bar", "baz"} {
+		if _, ok := keys[key]; !ok {
+			t.Fatalf("missing key, exp %s, got %v", key, nil)
+		}
+	}
+}
+
 func mustTempDir() string {
 	dir, err := ioutil.TempDir("", "tsm1-test")
 	if err != nil {
@ -797,6 +836,8 @@ type TestStore struct {
 	applyf       func(f func([]byte, *entry) error) error
 	applySerialf func(f func([]byte, *entry) error) error
 	resetf       func()
+	splitf       func(n int) []storer
+	countf       func() int
 }

 func NewTestStore() *TestStore                                      { return &TestStore{} }
@ -808,6 +849,8 @@ func (s *TestStore) keys(sorted bool) [][]byte                      { return s.k
 func (s *TestStore) apply(f func([]byte, *entry) error) error       { return s.applyf(f) }
 func (s *TestStore) applySerial(f func([]byte, *entry) error) error { return s.applySerialf(f) }
 func (s *TestStore) reset()                                         { s.resetf() }
+func (s *TestStore) split(n int) []storer                           { return s.splitf(n) }
+func (s *TestStore) count() int                                     { return s.countf() }

 var fvSize = uint64(NewValue(1, float64(1)).Size())

--- a/tsdb/engine/tsm1/compact.gen.go
+++ b/tsdb/engine/tsm1/compact.gen.go
@ -6,10 +6,6 @@

 package tsm1

-import (
-	"runtime"
-)
-
 // merge combines the next set of blocks into merged blocks.
 func (k *tsmKeyIterator) mergeFloat() {
 	// No blocks left, or pending merged values, we're done
@ -92,10 +88,6 @@ func (k *tsmKeyIterator) combineFloat(dedup bool) blocks {
 				}

 				k.mergedFloatValues = k.mergedFloatValues.Merge(v)
-
-				// Allow other goroutines to run
-				runtime.Gosched()
-
 			}
 		}

@ -120,8 +112,6 @@ func (k *tsmKeyIterator) combineFloat(dedup bool) blocks {
 				break
 			}
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		if k.fast {
@ -134,8 +124,6 @@ func (k *tsmKeyIterator) combineFloat(dedup bool) blocks {

 				chunked = append(chunked, k.blocks[i])
 				i++
-				// Allow other goroutines to run
-				runtime.Gosched()
 			}
 		}

@ -170,8 +158,6 @@ func (k *tsmKeyIterator) combineFloat(dedup bool) blocks {

 			k.mergedFloatValues = k.mergedFloatValues.Merge(v)
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		k.blocks = k.blocks[i:]
@ -300,10 +286,6 @@ func (k *tsmKeyIterator) combineInteger(dedup bool) blocks {
 				}

 				k.mergedIntegerValues = k.mergedIntegerValues.Merge(v)
-
-				// Allow other goroutines to run
-				runtime.Gosched()
-
 			}
 		}

@ -328,8 +310,6 @@ func (k *tsmKeyIterator) combineInteger(dedup bool) blocks {
 				break
 			}
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		if k.fast {
@ -342,8 +322,6 @@ func (k *tsmKeyIterator) combineInteger(dedup bool) blocks {

 				chunked = append(chunked, k.blocks[i])
 				i++
-				// Allow other goroutines to run
-				runtime.Gosched()
 			}
 		}

@ -378,8 +356,6 @@ func (k *tsmKeyIterator) combineInteger(dedup bool) blocks {

 			k.mergedIntegerValues = k.mergedIntegerValues.Merge(v)
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		k.blocks = k.blocks[i:]
@ -508,10 +484,6 @@ func (k *tsmKeyIterator) combineUnsigned(dedup bool) blocks {
 				}

 				k.mergedUnsignedValues = k.mergedUnsignedValues.Merge(v)
-
-				// Allow other goroutines to run
-				runtime.Gosched()
-
 			}
 		}

@ -536,8 +508,6 @@ func (k *tsmKeyIterator) combineUnsigned(dedup bool) blocks {
 				break
 			}
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		if k.fast {
@ -550,8 +520,6 @@ func (k *tsmKeyIterator) combineUnsigned(dedup bool) blocks {

 				chunked = append(chunked, k.blocks[i])
 				i++
-				// Allow other goroutines to run
-				runtime.Gosched()
 			}
 		}

@ -586,8 +554,6 @@ func (k *tsmKeyIterator) combineUnsigned(dedup bool) blocks {

 			k.mergedUnsignedValues = k.mergedUnsignedValues.Merge(v)
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		k.blocks = k.blocks[i:]
@ -716,10 +682,6 @@ func (k *tsmKeyIterator) combineString(dedup bool) blocks {
 				}

 				k.mergedStringValues = k.mergedStringValues.Merge(v)
-
-				// Allow other goroutines to run
-				runtime.Gosched()
-
 			}
 		}

@ -744,8 +706,6 @@ func (k *tsmKeyIterator) combineString(dedup bool) blocks {
 				break
 			}
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		if k.fast {
@ -758,8 +718,6 @@ func (k *tsmKeyIterator) combineString(dedup bool) blocks {

 				chunked = append(chunked, k.blocks[i])
 				i++
-				// Allow other goroutines to run
-				runtime.Gosched()
 			}
 		}

@ -794,8 +752,6 @@ func (k *tsmKeyIterator) combineString(dedup bool) blocks {

 			k.mergedStringValues = k.mergedStringValues.Merge(v)
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		k.blocks = k.blocks[i:]
@ -924,10 +880,6 @@ func (k *tsmKeyIterator) combineBoolean(dedup bool) blocks {
 				}

 				k.mergedBooleanValues = k.mergedBooleanValues.Merge(v)
-
-				// Allow other goroutines to run
-				runtime.Gosched()
-
 			}
 		}

@ -952,8 +904,6 @@ func (k *tsmKeyIterator) combineBoolean(dedup bool) blocks {
 				break
 			}
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		if k.fast {
@ -966,8 +916,6 @@ func (k *tsmKeyIterator) combineBoolean(dedup bool) blocks {

 				chunked = append(chunked, k.blocks[i])
 				i++
-				// Allow other goroutines to run
-				runtime.Gosched()
 			}
 		}

@ -1002,8 +950,6 @@ func (k *tsmKeyIterator) combineBoolean(dedup bool) blocks {

 			k.mergedBooleanValues = k.mergedBooleanValues.Merge(v)
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		k.blocks = k.blocks[i:]
--- a/tsdb/engine/tsm1/compact.gen.go.tmpl
+++ b/tsdb/engine/tsm1/compact.gen.go.tmpl
@ -1,9 +1,5 @@
 package tsm1

-import (
-	"runtime"
-)
-
 {{range .}}

 // merge combines the next set of blocks into merged blocks.
@ -88,10 +84,6 @@ func (k *tsmKeyIterator) combine{{.Name}}(dedup bool) blocks {
 				}

 				k.merged{{.Name}}Values = k.merged{{.Name}}Values.Merge(v)
-
-				// Allow other goroutines to run
-				runtime.Gosched()
-
 			}
 		}

@ -116,8 +108,6 @@ func (k *tsmKeyIterator) combine{{.Name}}(dedup bool) blocks {
 				break
 			}
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		if k.fast {
@ -130,8 +120,6 @@ func (k *tsmKeyIterator) combine{{.Name}}(dedup bool) blocks {

 				chunked = append(chunked, k.blocks[i])
 				i++
-				// Allow other goroutines to run
-				runtime.Gosched()
 			}
 		}

@ -166,8 +154,6 @@ func (k *tsmKeyIterator) combine{{.Name}}(dedup bool) blocks {

 			k.merged{{.Name}}Values = k.merged{{.Name}}Values.Merge(v)
 			i++
-			// Allow other goroutines to run
-			runtime.Gosched()
 		}

 		k.blocks = k.blocks[i:]
--- a/tsdb/engine/tsm1/compact.go
+++ b/tsdb/engine/tsm1/compact.go
@ -227,9 +227,16 @@ func (c *DefaultPlanner) PlanLevel(level int) []CompactionGroup {
 		minGenerations = level + 1
 	}

+	// Each compaction group should run against 4 generations.  For level 1, since these
+	// can get created much more quickly, bump the grouping to 8 to keep file counts lower.
+	groupSize := 4
+	if level == 1 || level == 3 {
+		groupSize = 8
+	}
+
 	var cGroups []CompactionGroup
 	for _, group := range levelGroups {
-		for _, chunk := range group.chunk(4) {
+		for _, chunk := range group.chunk(groupSize) {
 			var cGroup CompactionGroup
 			var hasTombstones bool
 			for _, gen := range chunk {
@ -697,8 +704,43 @@ func (c *Compactor) WriteSnapshot(cache *Cache) ([]string, error) {
 		return nil, errSnapshotsDisabled
 	}

-	iter := NewCacheKeyIterator(cache, tsdb.DefaultMaxPointsPerBlock, intC)
-	files, err := c.writeNewFiles(c.FileStore.NextGeneration(), 0, iter)
+	concurrency := 1
+	card := cache.Count()
+	if card >= 1024*1024 {
+		concurrency = card / 1024 * 1024
+		if concurrency < 1 {
+			concurrency = 1
+		}
+		if concurrency > 4 {
+			concurrency = 4
+		}
+	}
+	splits := cache.Split(concurrency)
+
+	type res struct {
+		files []string
+		err   error
+	}
+
+	resC := make(chan res, concurrency)
+	for i := 0; i < concurrency; i++ {
+		go func(sp *Cache) {
+			iter := NewCacheKeyIterator(sp, tsdb.DefaultMaxPointsPerBlock, intC)
+			files, err := c.writeNewFiles(c.FileStore.NextGeneration(), 0, iter)
+			resC <- res{files: files, err: err}
+
+		}(splits[i])
+	}
+
+	var err error
+	files := make([]string, 0, concurrency)
+	for i := 0; i < concurrency; i++ {
+		result := <-resC
+		if result.err != nil {
+			err = result.err
+		}
+		files = append(files, result.files...)
+	}

 	// See if we were disabled while writing a snapshot
 	c.mu.RLock()
@ -1337,56 +1379,87 @@ func (c *cacheKeyIterator) encode() {
 	n := len(c.ready)

 	// Divide the keyset across each CPU
-	chunkSize := 128
+	chunkSize := 1
 	idx := uint64(0)
+
 	for i := 0; i < concurrency; i++ {
 		// Run one goroutine per CPU and encode a section of the key space concurrently
 		go func() {
+			tenc := getTimeEncoder(tsdb.DefaultMaxPointsPerBlock)
+			fenc := getFloatEncoder(tsdb.DefaultMaxPointsPerBlock)
+			benc := getBooleanEncoder(tsdb.DefaultMaxPointsPerBlock)
+			uenc := getUnsignedEncoder(tsdb.DefaultMaxPointsPerBlock)
+			senc := getStringEncoder(tsdb.DefaultMaxPointsPerBlock)
+			ienc := getIntegerEncoder(tsdb.DefaultMaxPointsPerBlock)
+
+			defer putTimeEncoder(tenc)
+			defer putFloatEncoder(fenc)
+			defer putBooleanEncoder(benc)
+			defer putUnsignedEncoder(uenc)
+			defer putStringEncoder(senc)
+			defer putIntegerEncoder(ienc)
+
 			for {
-				start := int(atomic.AddUint64(&idx, uint64(chunkSize))) - chunkSize
-				if start >= n {
+				i := int(atomic.AddUint64(&idx, uint64(chunkSize))) - chunkSize
+
+				if i >= n {
 					break
 				}
-				end := start + chunkSize
-				if end > n {
-					end = n
+
+				key := c.order[i]
+				values := c.cache.values(key)
+
+				for len(values) > 0 {
+
+					end := len(values)
+					if end > c.size {
+						end = c.size
+					}
+
+					minTime, maxTime := values[0].UnixNano(), values[end-1].UnixNano()
+					var b []byte
+					var err error
+					tenc.Reset()
+
+					maxTime = values[end-1].UnixNano()
+
+					switch values[0].(type) {
+					case FloatValue:
+						fenc.Reset()
+						b, err = encodeFloatBlockUsing(nil, values[:end], tenc, fenc)
+					case IntegerValue:
+						ienc.Reset()
+						b, err = encodeIntegerBlockUsing(nil, values[:end], tenc, ienc)
+					case UnsignedValue:
+						uenc.Reset()
+						b, err = encodeUnsignedBlockUsing(nil, values[:end], tenc, uenc)
+					case BooleanValue:
+						benc.Reset()
+						b, err = encodeBooleanBlockUsing(nil, values[:end], tenc, benc)
+					case StringValue:
+						senc.Reset()
+						b, err = encodeStringBlockUsing(nil, values[:end], tenc, senc)
+					default:
+						b, err = Values(values[:end]).Encode(nil)
+					}
+
+					values = values[end:]
+
+					c.blocks[i] = append(c.blocks[i], cacheBlock{
+						k:       key,
+						minTime: minTime,
+						maxTime: maxTime,
+						b:       b,
+						err:     err,
+					})
 				}
-				c.encodeRange(start, end)
+				// Notify this key is fully encoded
+				c.ready[i] <- struct{}{}
 			}
 		}()
 	}
 }

-func (c *cacheKeyIterator) encodeRange(start, stop int) {
-	for i := start; i < stop; i++ {
-		key := c.order[i]
-		values := c.cache.values(key)
-
-		for len(values) > 0 {
-			minTime, maxTime := values[0].UnixNano(), values[len(values)-1].UnixNano()
-			var b []byte
-			var err error
-			if len(values) > c.size {
-				maxTime = values[c.size-1].UnixNano()
-				b, err = Values(values[:c.size]).Encode(nil)
-				values = values[c.size:]
-			} else {
-				b, err = Values(values).Encode(nil)
-				values = values[:0]
-			}
-			c.blocks[i] = append(c.blocks[i], cacheBlock{
-				k:       key,
-				minTime: minTime,
-				maxTime: maxTime,
-				b:       b,
-				err:     err,
-			})
-		}
-		// Notify this key is fully encoded
-		c.ready[i] <- struct{}{}
-	}
-}
-
 func (c *cacheKeyIterator) Next() bool {
 	if c.i >= 0 && c.i < len(c.ready) && len(c.blocks[c.i]) > 0 {
 		c.blocks[c.i] = c.blocks[c.i][1:]
--- a/tsdb/engine/tsm1/compact_test.go
+++ b/tsdb/engine/tsm1/compact_test.go
@ -1689,6 +1689,22 @@ func TestDefaultPlanner_PlanLevel_Multiple(t *testing.T) {
 			Path: "06-01.tsm1",
 			Size: 1 * 1024 * 1024,
 		},
+		tsm1.FileStat{
+			Path: "07-01.tsm1",
+			Size: 1 * 1024 * 1024,
+		},
+		tsm1.FileStat{
+			Path: "08-01.tsm1",
+			Size: 1 * 1024 * 1024,
+		},
+		tsm1.FileStat{
+			Path: "09-01.tsm1",
+			Size: 1 * 1024 * 1024,
+		},
+		tsm1.FileStat{
+			Path: "10-01.tsm1",
+			Size: 1 * 1024 * 1024,
+		},
 	}

 	cp := tsm1.NewDefaultPlanner(
@ -1699,8 +1715,8 @@ func TestDefaultPlanner_PlanLevel_Multiple(t *testing.T) {
 		}, tsdb.DefaultCompactFullWriteColdDuration,
 	)

-	expFiles1 := []tsm1.FileStat{data[0], data[1], data[2], data[3]}
-	expFiles2 := []tsm1.FileStat{data[4], data[5]}
+	expFiles1 := []tsm1.FileStat{data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]}
+	expFiles2 := []tsm1.FileStat{data[8], data[9]}

 	tsm := cp.PlanLevel(1)
 	if exp, got := len(expFiles1), len(tsm[0]); got != exp {
--- a/tsdb/engine/tsm1/encoding.gen.go
+++ b/tsdb/engine/tsm1/encoding.gen.go
@ -56,7 +56,7 @@ func (a Values) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a Values) Deduplicate() Values {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

@ -268,7 +268,7 @@ func (a FloatValues) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a FloatValues) Deduplicate() FloatValues {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

@ -524,7 +524,7 @@ func (a IntegerValues) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a IntegerValues) Deduplicate() IntegerValues {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

@ -780,7 +780,7 @@ func (a UnsignedValues) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a UnsignedValues) Deduplicate() UnsignedValues {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

@ -1036,7 +1036,7 @@ func (a StringValues) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a StringValues) Deduplicate() StringValues {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

@ -1292,7 +1292,7 @@ func (a BooleanValues) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a BooleanValues) Deduplicate() BooleanValues {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

--- a/tsdb/engine/tsm1/encoding.gen.go.tmpl
+++ b/tsdb/engine/tsm1/encoding.gen.go.tmpl
@ -53,7 +53,7 @@ func (a {{.Name}}Values) assertOrdered() {
 // Deduplicate returns a new slice with any values that have the same timestamp removed.
 // The Value that appears last in the slice is the one that is kept.
 func (a {{.Name}}Values) Deduplicate() {{.Name}}Values {
-	if len(a) == 0 {
+	if len(a) <= 1 {
 		return a
 	}

--- a/tsdb/engine/tsm1/encoding.go
+++ b/tsdb/engine/tsm1/encoding.go
@ -359,32 +359,7 @@ func encodeFloatBlock(buf []byte, values []Value) ([]byte, error) {
 	// frame-or-reference and run length encoding.
 	tsenc := getTimeEncoder(len(values))

-	var b []byte
-	err := func() error {
-		for _, v := range values {
-			vv := v.(FloatValue)
-			tsenc.Write(vv.unixnano)
-			venc.Write(vv.value)
-		}
-		venc.Flush()
-
-		// Encoded timestamp values
-		tb, err := tsenc.Bytes()
-		if err != nil {
-			return err
-		}
-		// Encoded float values
-		vb, err := venc.Bytes()
-		if err != nil {
-			return err
-		}
-
-		// Prepend the first timestamp of the block in the first 8 bytes and the block
-		// in the next byte, followed by the block
-		b = packBlock(buf, BlockFloat64, tb, vb)
-
-		return nil
-	}()
+	b, err := encodeFloatBlockUsing(buf, values, tsenc, venc)

 	putTimeEncoder(tsenc)
 	putFloatEncoder(venc)
@ -392,6 +367,33 @@ func encodeFloatBlock(buf []byte, values []Value) ([]byte, error) {
 	return b, err
 }

+func encodeFloatBlockUsing(buf []byte, values []Value, tsenc TimeEncoder, venc *FloatEncoder) ([]byte, error) {
+	tsenc.Reset()
+	venc.Reset()
+
+	for _, v := range values {
+		vv := v.(FloatValue)
+		tsenc.Write(vv.unixnano)
+		venc.Write(vv.value)
+	}
+	venc.Flush()
+
+	// Encoded timestamp values
+	tb, err := tsenc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+	// Encoded float values
+	vb, err := venc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+
+	// Prepend the first timestamp of the block in the first 8 bytes and the block
+	// in the next byte, followed by the block
+	return packBlock(buf, BlockFloat64, tb, vb), nil
+}
+
 // DecodeFloatBlock decodes the float block from the byte slice
 // and appends the float values to a.
 func DecodeFloatBlock(block []byte, a *[]FloatValue) ([]FloatValue, error) {
@ -499,30 +501,7 @@ func encodeBooleanBlock(buf []byte, values []Value) ([]byte, error) {
 	// Encode timestamps using an adaptive encoder
 	tsenc := getTimeEncoder(len(values))

-	var b []byte
-	err := func() error {
-		for _, v := range values {
-			vv := v.(BooleanValue)
-			tsenc.Write(vv.unixnano)
-			venc.Write(vv.value)
-		}
-
-		// Encoded timestamp values
-		tb, err := tsenc.Bytes()
-		if err != nil {
-			return err
-		}
-		// Encoded float values
-		vb, err := venc.Bytes()
-		if err != nil {
-			return err
-		}
-
-		// Prepend the first timestamp of the block in the first 8 bytes and the block
-		// in the next byte, followed by the block
-		b = packBlock(buf, BlockBoolean, tb, vb)
-		return nil
-	}()
+	b, err := encodeBooleanBlockUsing(buf, values, tsenc, venc)

 	putTimeEncoder(tsenc)
 	putBooleanEncoder(venc)
@ -530,6 +509,32 @@ func encodeBooleanBlock(buf []byte, values []Value) ([]byte, error) {
 	return b, err
 }

+func encodeBooleanBlockUsing(buf []byte, values []Value, tenc TimeEncoder, venc BooleanEncoder) ([]byte, error) {
+	tenc.Reset()
+	venc.Reset()
+
+	for _, v := range values {
+		vv := v.(BooleanValue)
+		tenc.Write(vv.unixnano)
+		venc.Write(vv.value)
+	}
+
+	// Encoded timestamp values
+	tb, err := tenc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+	// Encoded float values
+	vb, err := venc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+
+	// Prepend the first timestamp of the block in the first 8 bytes and the block
+	// in the next byte, followed by the block
+	return packBlock(buf, BlockBoolean, tb, vb), nil
+}
+
 // DecodeBooleanBlock decodes the boolean block from the byte slice
 // and appends the boolean values to a.
 func DecodeBooleanBlock(block []byte, a *[]BooleanValue) ([]BooleanValue, error) {
@ -622,39 +627,42 @@ func (v IntegerValue) String() string {
 }

 func encodeIntegerBlock(buf []byte, values []Value) ([]byte, error) {
-	tsEnc := getTimeEncoder(len(values))
-	vEnc := getIntegerEncoder(len(values))
+	tenc := getTimeEncoder(len(values))
+	venc := getIntegerEncoder(len(values))

-	var b []byte
-	err := func() error {
-		for _, v := range values {
-			vv := v.(IntegerValue)
-			tsEnc.Write(vv.unixnano)
-			vEnc.Write(vv.value)
-		}
+	b, err := encodeIntegerBlockUsing(buf, values, tenc, venc)

-		// Encoded timestamp values
-		tb, err := tsEnc.Bytes()
-		if err != nil {
-			return err
-		}
-		// Encoded int64 values
-		vb, err := vEnc.Bytes()
-		if err != nil {
-			return err
-		}
-
-		// Prepend the first timestamp of the block in the first 8 bytes
-		b = packBlock(buf, BlockInteger, tb, vb)
-		return nil
-	}()
-
-	putTimeEncoder(tsEnc)
-	putIntegerEncoder(vEnc)
+	putTimeEncoder(tenc)
+	putIntegerEncoder(venc)

 	return b, err
 }

+func encodeIntegerBlockUsing(buf []byte, values []Value, tenc TimeEncoder, venc IntegerEncoder) ([]byte, error) {
+	tenc.Reset()
+	venc.Reset()
+
+	for _, v := range values {
+		vv := v.(IntegerValue)
+		tenc.Write(vv.unixnano)
+		venc.Write(vv.value)
+	}
+
+	// Encoded timestamp values
+	tb, err := tenc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+	// Encoded int64 values
+	vb, err := venc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+
+	// Prepend the first timestamp of the block in the first 8 bytes
+	return packBlock(buf, BlockInteger, tb, vb), nil
+}
+
 // DecodeIntegerBlock decodes the integer block from the byte slice
 // and appends the integer values to a.
 func DecodeIntegerBlock(block []byte, a *[]IntegerValue) ([]IntegerValue, error) {
@ -748,39 +756,42 @@ func (v UnsignedValue) String() string {
 }

 func encodeUnsignedBlock(buf []byte, values []Value) ([]byte, error) {
-	tsEnc := getTimeEncoder(len(values))
-	vEnc := getUnsignedEncoder(len(values))
+	tenc := getTimeEncoder(len(values))
+	venc := getUnsignedEncoder(len(values))

-	var b []byte
-	err := func() error {
-		for _, v := range values {
-			vv := v.(UnsignedValue)
-			tsEnc.Write(vv.unixnano)
-			vEnc.Write(int64(vv.value))
-		}
+	b, err := encodeUnsignedBlockUsing(buf, values, tenc, venc)

-		// Encoded timestamp values
-		tb, err := tsEnc.Bytes()
-		if err != nil {
-			return err
-		}
-		// Encoded int64 values
-		vb, err := vEnc.Bytes()
-		if err != nil {
-			return err
-		}
-
-		// Prepend the first timestamp of the block in the first 8 bytes
-		b = packBlock(buf, BlockUnsigned, tb, vb)
-		return nil
-	}()
-
-	putTimeEncoder(tsEnc)
-	putUnsignedEncoder(vEnc)
+	putTimeEncoder(tenc)
+	putUnsignedEncoder(venc)

 	return b, err
 }

+func encodeUnsignedBlockUsing(buf []byte, values []Value, tenc TimeEncoder, venc IntegerEncoder) ([]byte, error) {
+	tenc.Reset()
+	venc.Reset()
+
+	for _, v := range values {
+		vv := v.(UnsignedValue)
+		tenc.Write(vv.unixnano)
+		venc.Write(int64(vv.value))
+	}
+
+	// Encoded timestamp values
+	tb, err := tenc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+	// Encoded int64 values
+	vb, err := venc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+
+	// Prepend the first timestamp of the block in the first 8 bytes
+	return packBlock(buf, BlockUnsigned, tb, vb), nil
+}
+
 // DecodeUnsignedBlock decodes the unsigned integer block from the byte slice
 // and appends the unsigned integer values to a.
 func DecodeUnsignedBlock(block []byte, a *[]UnsignedValue) ([]UnsignedValue, error) {
@ -874,40 +885,42 @@ func (v StringValue) String() string {
 }

 func encodeStringBlock(buf []byte, values []Value) ([]byte, error) {
-	tsEnc := getTimeEncoder(len(values))
-	vEnc := getStringEncoder(len(values) * len(values[0].(StringValue).value))
+	tenc := getTimeEncoder(len(values))
+	venc := getStringEncoder(len(values) * len(values[0].(StringValue).value))

-	var b []byte
-	err := func() error {
-		for _, v := range values {
-			vv := v.(StringValue)
-			tsEnc.Write(vv.unixnano)
-			vEnc.Write(vv.value)
-		}
+	b, err := encodeStringBlockUsing(buf, values, tenc, venc)

-		// Encoded timestamp values
-		tb, err := tsEnc.Bytes()
-		if err != nil {
-			return err
-		}
-		// Encoded string values
-		vb, err := vEnc.Bytes()
-		if err != nil {
-			return err
-		}
-
-		// Prepend the first timestamp of the block in the first 8 bytes
-		b = packBlock(buf, BlockString, tb, vb)
-
-		return nil
-	}()
-
-	putTimeEncoder(tsEnc)
-	putStringEncoder(vEnc)
+	putTimeEncoder(tenc)
+	putStringEncoder(venc)

 	return b, err
 }

+func encodeStringBlockUsing(buf []byte, values []Value, tenc TimeEncoder, venc StringEncoder) ([]byte, error) {
+	tenc.Reset()
+	venc.Reset()
+
+	for _, v := range values {
+		vv := v.(StringValue)
+		tenc.Write(vv.unixnano)
+		venc.Write(vv.value)
+	}
+
+	// Encoded timestamp values
+	tb, err := tenc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+	// Encoded string values
+	vb, err := venc.Bytes()
+	if err != nil {
+		return nil, err
+	}
+
+	// Prepend the first timestamp of the block in the first 8 bytes
+	return packBlock(buf, BlockString, tb, vb), nil
+}
+
 // DecodeStringBlock decodes the string block from the byte slice
 // and appends the string values to a.
 func DecodeStringBlock(block []byte, a *[]StringValue) ([]StringValue, error) {
--- a/tsdb/engine/tsm1/engine.go
+++ b/tsdb/engine/tsm1/engine.go
@ -26,6 +26,7 @@ import (
 	"github.com/influxdata/influxdb/tsdb"
 	_ "github.com/influxdata/influxdb/tsdb/index"
 	"github.com/influxdata/influxdb/tsdb/index/inmem"
+	"github.com/influxdata/influxdb/tsdb/index/tsi1"
 	"github.com/uber-go/zap"
 )

@ -136,8 +137,10 @@ type Engine struct {

 	stats *EngineStatistics

-	// The limiter for concurrent compactions
-	compactionLimiter limiter.Fixed
+	// Limiters for concurrent compactions.  The low priority limiter is for level 3 and 4
+	// compactions.  The high priority is for level 1 and 2 compactions.
+	loPriCompactionLimiter limiter.Fixed
+	hiPriCompactionLimiter limiter.Fixed
 }

 // NewEngine returns a new instance of Engine.
@ -175,8 +178,9 @@ func NewEngine(id uint64, idx tsdb.Index, database, path string, walPath string,
 		CacheFlushMemorySizeThreshold: opt.Config.CacheSnapshotMemorySize,
 		CacheFlushWriteColdDuration:   time.Duration(opt.Config.CacheSnapshotWriteColdDuration),
 		enableCompactionsOnOpen:       true,
-		stats:             &EngineStatistics{},
-		compactionLimiter: opt.CompactionLimiter,
+		stats: &EngineStatistics{},
+		loPriCompactionLimiter: opt.LoPriCompactionLimiter,
+		hiPriCompactionLimiter: opt.HiPriCompactionLimiter,
 	}

 	// Attach fieldset to index.
@ -1289,6 +1293,10 @@ func (e *Engine) compactTSMFull(quit <-chan struct{}) {
 // onFileStoreReplace is callback handler invoked when the FileStore
 // has replaced one set of TSM files with a new set.
 func (e *Engine) onFileStoreReplace(newFiles []TSMFile) {
+	if e.index.Type() == tsi1.IndexName {
+		return
+	}
+
 	// Load any new series keys to the index
 	readers := make([]chan seriesKey, 0, len(newFiles))
 	for _, r := range newFiles {
@ -1341,46 +1349,33 @@ func (e *Engine) onFileStoreReplace(newFiles []TSMFile) {
 type compactionStrategy struct {
 	compactionGroups []CompactionGroup

-	// concurrency determines how many compactions groups will be started
-	// concurrently.  These groups may be limited by the global limiter if
-	// enabled.
-	concurrency int
 	fast        bool
 	description string
+	level       int

 	durationStat *int64
 	activeStat   *int64
 	successStat  *int64
 	errorStat    *int64

-	logger    zap.Logger
-	compactor *Compactor
-	fileStore *FileStore
-	limiter   limiter.Fixed
-	engine    *Engine
+	logger       zap.Logger
+	compactor    *Compactor
+	fileStore    *FileStore
+	loPriLimiter limiter.Fixed
+	hiPriLimiter limiter.Fixed
+
+	engine *Engine
 }

 // Apply concurrently compacts all the groups in a compaction strategy.
 func (s *compactionStrategy) Apply() {
 	start := time.Now()

-	// cap concurrent compaction groups to no more than 4 at a time.
-	concurrency := s.concurrency
-	if concurrency == 0 {
-		concurrency = 4
-	}
-
-	throttle := limiter.NewFixed(concurrency)
 	var wg sync.WaitGroup
 	for i := range s.compactionGroups {
 		wg.Add(1)
 		go func(groupNum int) {
 			defer wg.Done()
-
-			// limit concurrent compaction groups
-			throttle.Take()
-			defer throttle.Release()
-
 			s.compactGroup(groupNum)
 		}(i)
 	}
@ -1391,10 +1386,31 @@ func (s *compactionStrategy) Apply() {

 // compactGroup executes the compaction strategy against a single CompactionGroup.
 func (s *compactionStrategy) compactGroup(groupNum int) {
-	// Limit concurrent compactions if we have a limiter
-	if cap(s.limiter) > 0 {
-		s.limiter.Take()
-		defer s.limiter.Release()
+	// Level 1 and 2 are high priority and have a larger slice of the pool.  If all
+	// the high priority capacity is used up, they can steal from the low priority
+	// pool as well if there is capacity.  Otherwise, it wait on the high priority
+	// limiter until an running compaction completes.  Level 3 and 4 are low priority
+	// as they are generally larger compactions and more expensive to run.  They can
+	// steal a little from the high priority limiter if there is no high priority work.
+	switch s.level {
+	case 1, 2:
+		if s.hiPriLimiter.TryTake() {
+			defer s.hiPriLimiter.Release()
+		} else if s.loPriLimiter.TryTake() {
+			defer s.loPriLimiter.Release()
+		} else {
+			s.hiPriLimiter.Take()
+			defer s.hiPriLimiter.Release()
+		}
+	default:
+		if s.loPriLimiter.TryTake() {
+			defer s.loPriLimiter.Release()
+		} else if s.hiPriLimiter.Idle() && s.hiPriLimiter.TryTake() {
+			defer s.hiPriLimiter.Release()
+		} else {
+			s.loPriLimiter.Take()
+			defer s.loPriLimiter.Release()
+		}
 	}

 	group := s.compactionGroups[groupNum]
@ -1457,14 +1473,15 @@ func (e *Engine) levelCompactionStrategy(fast bool, level int) *compactionStrate
 	}

 	return &compactionStrategy{
-		concurrency:      4,
 		compactionGroups: compactionGroups,
 		logger:           e.logger,
 		fileStore:        e.FileStore,
 		compactor:        e.Compactor,
 		fast:             fast,
-		limiter:          e.compactionLimiter,
+		loPriLimiter:     e.loPriCompactionLimiter,
+		hiPriLimiter:     e.hiPriCompactionLimiter,
 		engine:           e,
+		level:            level,

 		description:  fmt.Sprintf("level %d", level),
 		activeStat:   &e.stats.TSMCompactionsActive[level-1],
@ -1490,14 +1507,15 @@ func (e *Engine) fullCompactionStrategy() *compactionStrategy {
 	}

 	s := &compactionStrategy{
-		concurrency:      1,
 		compactionGroups: compactionGroups,
 		logger:           e.logger,
 		fileStore:        e.FileStore,
 		compactor:        e.Compactor,
 		fast:             optimize,
-		limiter:          e.compactionLimiter,
+		loPriLimiter:     e.loPriCompactionLimiter,
+		hiPriLimiter:     e.hiPriCompactionLimiter,
 		engine:           e,
+		level:            4,
 	}

 	if optimize {
--- a/tsdb/engine/tsm1/reader_test.go
+++ b/tsdb/engine/tsm1/reader_test.go
@ -881,9 +881,10 @@ func TestIndirectIndex_Entries(t *testing.T) {
 	index := NewIndexWriter()
 	index.Add([]byte("cpu"), BlockFloat64, 0, 1, 10, 100)
 	index.Add([]byte("cpu"), BlockFloat64, 2, 3, 20, 200)
-	index.Add([]byte("mem"), BlockFloat64, 0, 1, 10, 100)
 	exp := index.Entries([]byte("cpu"))

+	index.Add([]byte("mem"), BlockFloat64, 0, 1, 10, 100)
+
 	b, err := index.MarshalBinary()
 	if err != nil {
 		t.Fatalf("unexpected error marshaling index: %v", err)
@ -981,27 +982,16 @@ func TestIndirectIndex_Type(t *testing.T) {
 	}
 }

-func TestIndirectIndex_Keys(t *testing.T) {
+func TestDirectIndex_KeyCount(t *testing.T) {
 	index := NewIndexWriter()
 	index.Add([]byte("cpu"), BlockFloat64, 0, 1, 10, 20)
 	index.Add([]byte("cpu"), BlockFloat64, 1, 2, 20, 30)
 	index.Add([]byte("mem"), BlockFloat64, 0, 1, 10, 20)

-	keys := index.Keys()
-
 	// 2 distinct keys
-	if got, exp := len(keys), 2; got != exp {
+	if got, exp := index.KeyCount(), 2; got != exp {
 		t.Fatalf("length mismatch: got %v, exp %v", got, exp)
 	}
-
-	// Keys should be sorted
-	if got, exp := string(keys[0]), "cpu"; got != exp {
-		t.Fatalf("key mismatch: got %v, exp %v", got, exp)
-	}
-
-	if got, exp := string(keys[1]), "mem"; got != exp {
-		t.Fatalf("key mismatch: got %v, exp %v", got, exp)
-	}
 }

 func TestBlockIterator_Single(t *testing.T) {
--- a/tsdb/engine/tsm1/ring.go
+++ b/tsdb/engine/tsm1/ring.go
@ -32,19 +32,15 @@ const partitions = 4096
 // key is hashed and the first 8 bits are used as an index to the ring.
 //
 type ring struct {
-	// The unique set of partitions in the ring.
-	// len(partitions) <= len(continuum)
-	partitions []*partition
-
-	// A mapping of partition to location on the ring continuum. This is used
-	// to lookup a partition.
-	continuum []*partition
-
 	// Number of keys within the ring. This is used to provide a hint for
 	// allocating the return values in keys(). It will not be perfectly accurate
 	// since it doesn't consider adding duplicate keys, or trying to remove non-
 	// existent keys.
 	keysHint int64
+
+	// The unique set of partitions in the ring.
+	// len(partitions) <= len(continuum)
+	partitions []*partition
 }

 // newring returns a new ring initialised with n partitions. n must always be a
@ -59,20 +55,16 @@ func newring(n int) (*ring, error) {
 	}

 	r := ring{
-		continuum: make([]*partition, partitions), // maximum number of partitions.
+		partitions: make([]*partition, n), // maximum number of partitions.
 	}

 	// The trick here is to map N partitions to all points on the continuum,
 	// such that the first eight bits of a given hash will map directly to one
 	// of the N partitions.
-	for i := 0; i < len(r.continuum); i++ {
-		if (i == 0 || i%(partitions/n) == 0) && len(r.partitions) < n {
-			r.partitions = append(r.partitions, &partition{
-				store:          make(map[string]*entry),
-				entrySizeHints: make(map[uint64]int),
-			})
+	for i := 0; i < len(r.partitions); i++ {
+		r.partitions[i] = &partition{
+			store: make(map[string]*entry),
 		}
-		r.continuum[i] = r.partitions[len(r.partitions)-1]
 	}
 	return &r, nil
 }
@ -92,7 +84,7 @@ func (r *ring) reset() {
 // getPartition retrieves the hash ring partition associated with the provided
 // key.
 func (r *ring) getPartition(key []byte) *partition {
-	return r.continuum[int(xxhash.Sum64(key)%partitions)]
+	return r.partitions[int(xxhash.Sum64(key)%partitions)]
 }

 // entry returns the entry for the given key.
@ -137,6 +129,14 @@ func (r *ring) keys(sorted bool) [][]byte {
 	return keys
 }

+func (r *ring) count() int {
+	var n int
+	for _, p := range r.partitions {
+		n += p.count()
+	}
+	return n
+}
+
 // apply applies the provided function to every entry in the ring under a read
 // lock using a separate goroutine for each partition. The provided function
 // will be called with each key and the corresponding entry. The first error
@ -188,6 +188,9 @@ func (r *ring) applySerial(f func([]byte, *entry) error) error {
 	for _, p := range r.partitions {
 		p.mu.RLock()
 		for k, e := range p.store {
+			if e.count() == 0 {
+				continue
+			}
 			if err := f([]byte(k), e); err != nil {
 				p.mu.RUnlock()
 				return err
@ -198,16 +201,25 @@ func (r *ring) applySerial(f func([]byte, *entry) error) error {
 	return nil
 }

+func (r *ring) split(n int) []storer {
+	var keys int
+	storers := make([]storer, n)
+	for i := 0; i < n; i++ {
+		storers[i], _ = newring(len(r.partitions))
+	}
+
+	for i, p := range r.partitions {
+		r := storers[i%n].(*ring)
+		r.partitions[i] = p
+		keys += len(p.store)
+	}
+	return storers
+}
+
 // partition provides safe access to a map of series keys to entries.
 type partition struct {
 	mu    sync.RWMutex
 	store map[string]*entry
-
-	// entrySizeHints stores hints for appropriate sizes to pre-allocate the
-	// []Values in an entry. entrySizeHints will only contain hints for entries
-	// that were present prior to the most recent snapshot, preventing unbounded
-	// growth over time.
-	entrySizeHints map[uint64]int
 }

 // entry returns the partition's entry for the provided key.
@ -240,8 +252,7 @@ func (p *partition) write(key []byte, values Values) (bool, error) {
 	}

 	// Create a new entry using a preallocated size if we have a hint available.
-	hint, _ := p.entrySizeHints[xxhash.Sum64(key)]
-	e, err := newEntryValues(values, hint)
+	e, err := newEntryValues(values, 32)
 	if err != nil {
 		return false, err
 	}
@ -269,7 +280,10 @@ func (p *partition) remove(key []byte) {
 func (p *partition) keys() [][]byte {
 	p.mu.RLock()
 	keys := make([][]byte, 0, len(p.store))
-	for k := range p.store {
+	for k, v := range p.store {
+		if v.count() == 0 {
+			continue
+		}
 		keys = append(keys, []byte(k))
 	}
 	p.mu.RUnlock()
@ -279,21 +293,25 @@ func (p *partition) keys() [][]byte {
 // reset resets the partition by reinitialising the store. reset returns hints
 // about sizes that the entries within the store could be reallocated with.
 func (p *partition) reset() {
+	p.mu.RLock()
+	sz := len(p.store)
+	p.mu.RUnlock()
+
+	newStore := make(map[string]*entry, sz)
 	p.mu.Lock()
-	defer p.mu.Unlock()
-
-	// Collect the allocated sizes of values for each entry in the store.
-	p.entrySizeHints = make(map[uint64]int)
-	for k, entry := range p.store {
-		// If the capacity is large then there are many values in the entry.
-		// Store a hint to pre-allocate the next time we see the same entry.
-		entry.mu.RLock()
-		if cap(entry.values) > 128 { // 4 x the default entry capacity size.
-			p.entrySizeHints[xxhash.Sum64String(k)] = cap(entry.values)
-		}
-		entry.mu.RUnlock()
-	}
-
-	// Reset the store.
-	p.store = make(map[string]*entry, len(p.store))
+	p.store = newStore
+	p.mu.Unlock()
+}
+
+func (p *partition) count() int {
+	var n int
+	p.mu.RLock()
+	for _, v := range p.store {
+		if v.count() > 0 {
+			n++
+		}
+	}
+	p.mu.RUnlock()
+	return n
+
 }
--- a/tsdb/engine/tsm1/ring_test.go
+++ b/tsdb/engine/tsm1/ring_test.go
@ -31,7 +31,7 @@ func TestRing_newRing(t *testing.T) {

 		// Check partitions distributed correctly
 		partitions := make([]*partition, 0)
-		for i, partition := range r.continuum {
+		for i, partition := range r.partitions {
 			if i == 0 || partition != partitions[len(partitions)-1] {
 				partitions = append(partitions, partition)
 			}
--- a/tsdb/engine/tsm1/writer.go
+++ b/tsdb/engine/tsm1/writer.go
@ -152,9 +152,6 @@ type IndexWriter interface {
 	// Entries returns all index entries for a key.
 	Entries(key []byte) []IndexEntry

-	// Keys returns the unique set of keys in the index.
-	Keys() [][]byte
-
 	// KeyCount returns the count of unique keys in the index.
 	KeyCount() int

@ -232,12 +229,13 @@ func (e *IndexEntry) String() string {

 // NewIndexWriter returns a new IndexWriter.
 func NewIndexWriter() IndexWriter {
-	return &directIndex{}
+	buf := bytes.NewBuffer(make([]byte, 0, 4096))
+	return &directIndex{buf: buf, w: bufio.NewWriter(buf)}
 }

 // NewIndexWriter returns a new IndexWriter.
 func NewDiskIndexWriter(f *os.File) IndexWriter {
-	return &directIndex{fd: f, w: bufio.NewWriter(f)}
+	return &directIndex{fd: f, w: bufio.NewWriterSize(f, 1024*1024)}
 }

 // indexBlock represent an index information for a series within a TSM file.
@ -249,43 +247,48 @@ type indexBlock struct {
 // directIndex is a simple in-memory index implementation for a TSM file.  The full index
 // must fit in memory.
 type directIndex struct {
-	size   uint32
-	blocks []indexBlock
-	fd     *os.File
-	w      *bufio.Writer
+	keyCount int
+	size     uint32
+	fd       *os.File
+	buf      *bytes.Buffer
+
+	w *bufio.Writer
+
+	key          []byte
+	indexEntries *indexEntries
 }

 func (d *directIndex) Add(key []byte, blockType byte, minTime, maxTime int64, offset int64, size uint32) {
 	// Is this the first block being added?
-	if len(d.blocks) == 0 {
+	if len(d.key) == 0 {
 		// size of the key stored in the index
 		d.size += uint32(2 + len(key))
 		// size of the count of entries stored in the index
 		d.size += indexCountSize

-		d.blocks = append(d.blocks, indexBlock{
-			key: key,
-			entries: &indexEntries{
-				Type: blockType,
-				entries: []IndexEntry{IndexEntry{
-					MinTime: minTime,
-					MaxTime: maxTime,
-					Offset:  offset,
-					Size:    size,
-				}}},
+		d.key = key
+		if d.indexEntries == nil {
+			d.indexEntries = &indexEntries{}
+		}
+		d.indexEntries.Type = blockType
+		d.indexEntries.entries = append(d.indexEntries.entries, IndexEntry{
+			MinTime: minTime,
+			MaxTime: maxTime,
+			Offset:  offset,
+			Size:    size,
 		})

 		// size of the encoded index entry
 		d.size += indexEntrySize
+		d.keyCount++
 		return
 	}

-	// Find the last block so we can see if were still adding to the same series key.
-	block := d.blocks[len(d.blocks)-1]
-	cmp := bytes.Compare(block.key, key)
+	// See if were still adding to the same series key.
+	cmp := bytes.Compare(d.key, key)
 	if cmp == 0 {
 		// The last block is still this key
-		block.entries.entries = append(block.entries.entries, IndexEntry{
+		d.indexEntries.entries = append(d.indexEntries.entries, IndexEntry{
 			MinTime: minTime,
 			MaxTime: maxTime,
 			Offset:  offset,
@ -296,9 +299,7 @@ func (d *directIndex) Add(key []byte, blockType byte, minTime, maxTime int64, of
 		d.size += indexEntrySize

 	} else if cmp < 0 {
-		if d.w != nil {
-			d.flush(d.w)
-		}
+		d.flush(d.w)
 		// We have a new key that is greater than the last one so we need to add
 		// a new index block section.

@ -307,38 +308,31 @@ func (d *directIndex) Add(key []byte, blockType byte, minTime, maxTime int64, of
 		// size of the count of entries stored in the index
 		d.size += indexCountSize

-		d.blocks = append(d.blocks, indexBlock{
-			key: key,
-			entries: &indexEntries{
-				Type: blockType,
-				entries: []IndexEntry{IndexEntry{
-					MinTime: minTime,
-					MaxTime: maxTime,
-					Offset:  offset,
-					Size:    size,
-				}}},
+		d.key = key
+		d.indexEntries.Type = blockType
+		d.indexEntries.entries = append(d.indexEntries.entries, IndexEntry{
+			MinTime: minTime,
+			MaxTime: maxTime,
+			Offset:  offset,
+			Size:    size,
 		})

 		// size of the encoded index entry
 		d.size += indexEntrySize
+		d.keyCount++
 	} else {
 		// Keys can't be added out of order.
-		panic(fmt.Sprintf("keys must be added in sorted order: %s < %s", string(key), string(d.blocks[len(d.blocks)-1].key)))
+		panic(fmt.Sprintf("keys must be added in sorted order: %s < %s", string(key), string(d.key)))
 	}
 }

 func (d *directIndex) entries(key []byte) []IndexEntry {
-	if len(d.blocks) == 0 {
+	if len(d.key) == 0 {
 		return nil
 	}

-	if bytes.Equal(d.blocks[len(d.blocks)-1].key, key) {
-		return d.blocks[len(d.blocks)-1].entries.entries
-	}
-
-	i := sort.Search(len(d.blocks), func(i int) bool { return bytes.Compare(d.blocks[i].key, key) >= 0 })
-	if i < len(d.blocks) && bytes.Equal(d.blocks[i].key, key) {
-		return d.blocks[i].entries.entries
+	if bytes.Equal(d.key, key) {
+		return d.indexEntries.entries
 	}

 	return nil
@ -358,23 +352,11 @@ func (d *directIndex) Entry(key []byte, t int64) *IndexEntry {
 	return nil
 }

-func (d *directIndex) Keys() [][]byte {
-	keys := make([][]byte, 0, len(d.blocks))
-	for _, v := range d.blocks {
-		keys = append(keys, v.key)
-	}
-	return keys
-}
-
 func (d *directIndex) KeyCount() int {
-	return len(d.blocks)
+	return d.keyCount
 }

 func (d *directIndex) WriteTo(w io.Writer) (int64, error) {
-	if d.w == nil {
-		return d.flush(w)
-	}
-
 	if _, err := d.flush(d.w); err != nil {
 		return 0, err
 	}
@ -383,6 +365,10 @@ func (d *directIndex) WriteTo(w io.Writer) (int64, error) {
 		return 0, err
 	}

+	if d.fd == nil {
+		return io.Copy(w, d.buf)
+	}
+
 	if _, err := d.fd.Seek(0, io.SeekStart); err != nil {
 		return 0, err
 	}
@ -398,50 +384,52 @@ func (d *directIndex) flush(w io.Writer) (int64, error) {
 		N   int64
 	)

+	if len(d.key) == 0 {
+		return 0, nil
+	}
 	// For each key, individual entries are sorted by time
-	for _, ie := range d.blocks {
-		key := ie.key
-		entries := ie.entries
-
-		if entries.Len() > maxIndexEntries {
-			return N, fmt.Errorf("key '%s' exceeds max index entries: %d > %d", key, entries.Len(), maxIndexEntries)
-		}
-
-		if !sort.IsSorted(entries) {
-			sort.Sort(entries)
-		}
-
-		binary.BigEndian.PutUint16(buf[0:2], uint16(len(key)))
-		buf[2] = entries.Type
-		binary.BigEndian.PutUint16(buf[3:5], uint16(entries.Len()))
-
-		// Append the key length and key
-		if n, err = w.Write(buf[0:2]); err != nil {
-			return int64(n) + N, fmt.Errorf("write: writer key length error: %v", err)
-		}
-		N += int64(n)
-
-		if n, err = w.Write(key); err != nil {
-			return int64(n) + N, fmt.Errorf("write: writer key error: %v", err)
-		}
-		N += int64(n)
-
-		// Append the block type and count
-		if n, err = w.Write(buf[2:5]); err != nil {
-			return int64(n) + N, fmt.Errorf("write: writer block type and count error: %v", err)
-		}
-		N += int64(n)
-
-		// Append each index entry for all blocks for this key
-		var n64 int64
-		if n64, err = entries.WriteTo(w); err != nil {
-			return n64 + N, fmt.Errorf("write: writer entries error: %v", err)
-		}
-		N += n64
+	key := d.key
+	entries := d.indexEntries

+	if entries.Len() > maxIndexEntries {
+		return N, fmt.Errorf("key '%s' exceeds max index entries: %d > %d", key, entries.Len(), maxIndexEntries)
 	}

-	d.blocks = d.blocks[:0]
+	if !sort.IsSorted(entries) {
+		sort.Sort(entries)
+	}
+
+	binary.BigEndian.PutUint16(buf[0:2], uint16(len(key)))
+	buf[2] = entries.Type
+	binary.BigEndian.PutUint16(buf[3:5], uint16(entries.Len()))
+
+	// Append the key length and key
+	if n, err = w.Write(buf[0:2]); err != nil {
+		return int64(n) + N, fmt.Errorf("write: writer key length error: %v", err)
+	}
+	N += int64(n)
+
+	if n, err = w.Write(key); err != nil {
+		return int64(n) + N, fmt.Errorf("write: writer key error: %v", err)
+	}
+	N += int64(n)
+
+	// Append the block type and count
+	if n, err = w.Write(buf[2:5]); err != nil {
+		return int64(n) + N, fmt.Errorf("write: writer block type and count error: %v", err)
+	}
+	N += int64(n)
+
+	// Append each index entry for all blocks for this key
+	var n64 int64
+	if n64, err = entries.WriteTo(w); err != nil {
+		return n64 + N, fmt.Errorf("write: writer entries error: %v", err)
+	}
+	N += n64
+
+	d.key = nil
+	d.indexEntries.Type = 0
+	d.indexEntries.entries = d.indexEntries.entries[:0]

 	return N, nil

@ -460,14 +448,15 @@ func (d *directIndex) Size() uint32 {
 }

 func (d *directIndex) Close() error {
-	if d.w == nil {
-		return nil
-	}
-
 	// Flush anything remaining in the index
 	if err := d.w.Flush(); err != nil {
 		return err
 	}
+
+	if d.fd == nil {
+		return nil
+	}
+
 	if err := d.fd.Close(); err != nil {
 		return nil
 	}
--- a/tsdb/store.go
+++ b/tsdb/store.go
@ -159,15 +159,35 @@ func (s *Store) loadShards() error {
 		err error
 	}

-	t := limiter.NewFixed(runtime.GOMAXPROCS(0))
-
 	// Setup a shared limiter for compactions
 	lim := s.EngineOptions.Config.MaxConcurrentCompactions
 	if lim == 0 {
+		lim = runtime.GOMAXPROCS(0) / 2 // Default to 50% of cores for compactions
+		if lim < 1 {
+			lim = 1
+		}
+	}
+
+	// Don't allow more compactions to run than cores.
+	if lim > runtime.GOMAXPROCS(0) {
 		lim = runtime.GOMAXPROCS(0)
 	}
-	s.EngineOptions.CompactionLimiter = limiter.NewFixed(lim)

+	// If only one compacttion can run at time, use the same limiter for high and low
+	// priority work.
+	if lim == 1 {
+		s.EngineOptions.HiPriCompactionLimiter = limiter.NewFixed(1)
+		s.EngineOptions.LoPriCompactionLimiter = s.EngineOptions.HiPriCompactionLimiter
+	} else {
+		// Split the available high and low priority limiters between the available cores.
+		// The high priority work can steal from low priority at times so it can use the
+		// full limit if there is pending work.  The low priority is capped at half the
+		// limit.
+		s.EngineOptions.HiPriCompactionLimiter = limiter.NewFixed(lim/2 + lim%2)
+		s.EngineOptions.LoPriCompactionLimiter = limiter.NewFixed(lim / 2)
+	}
+
+	t := limiter.NewFixed(runtime.GOMAXPROCS(0))
 	resC := make(chan *res)
 	var n int