Fix race on store close

There was a very small window where it was possible to deadlock during
the close of the Store. When closing, the Store waited on its Waitgroup
under a `Lock`. Naturally, all other goroutines must have been in a
position to call `Done` on the `Waitgroup` before the `Wait` call in
`Close` would return.

For the goroutine running the `monitorShards` method it was possible
that it would be unable to do this. Specifically, if the `monitorShards`
goroutine was jumping into the `t.C` case as the `Close()` goroutine was
acquiring the `Lock`, then the `monitorShards` goroutine would be unable
to acquire the `RLock`. Since it would also be unable to progress around
its loop to jump into the `s.closing` case, it would be unable to call
`Done` on the `WaitGroup` and we would have a deadlock.

This was identified during an AppVeyor CI run, though I was unable to
reproduce this locally.
pull/9068/head
Edd Robinson 2017-11-07 12:48:43 +00:00
parent e69217440b
commit e762da9aca
1 changed file with 11 additions and 4 deletions

View File

@ -132,6 +132,11 @@ func (s *Store) Open() error {
s.mu.Lock()
defer s.mu.Unlock()
if s.opened {
// Already open
return nil
}
s.closing = make(chan struct{})
s.shards = map[uint64]*Shard{}
@ -291,12 +296,13 @@ func (s *Store) loadShards() error {
// shards through the Store will result in ErrStoreClosed being returned.
func (s *Store) Close() error {
s.mu.Lock()
defer s.mu.Unlock()
if s.opened {
close(s.closing)
}
s.mu.Unlock()
s.wg.Wait()
// No other goroutines accessing the store, so no need for a Lock.
// Close all the shards in parallel.
if err := s.walkShards(s.shardsSlice(), func(sh *Shard) error {
@ -305,9 +311,10 @@ func (s *Store) Close() error {
return err
}
s.opened = false
s.mu.Lock()
s.shards = nil
s.opened = false // Store may now be opened again.
s.mu.Unlock()
return nil
}