From bbd9d97d736acb4a80e479a73d80bb7e3c41527d Mon Sep 17 00:00:00 2001 From: Jason Wilder Date: Fri, 6 Jan 2017 16:55:12 -0700 Subject: [PATCH 1/3] Re-enabled TestServer_BackupAndRestore It was failing intermittently, but seems to fail consistently one re-enabled. A slice pointer was incremented too early causing a panic. Fixes #6590 --- cmd/influxd/restore/restore.go | 2 +- cmd/influxd/run/backup_restore_test.go | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cmd/influxd/restore/restore.go b/cmd/influxd/restore/restore.go index 87f1f44079..253cb81e0a 100644 --- a/cmd/influxd/restore/restore.go +++ b/cmd/influxd/restore/restore.go @@ -159,8 +159,8 @@ func (cmd *Command) unpackMeta() error { i += int(length) // Size of the node.json bytes - i += 8 length = int(binary.BigEndian.Uint64(b[i : i+8])) + i += 8 nodeBytes := b[i : i+length] // Unpack into metadata. diff --git a/cmd/influxd/run/backup_restore_test.go b/cmd/influxd/run/backup_restore_test.go index c6b419d81b..ee9fda7656 100644 --- a/cmd/influxd/run/backup_restore_test.go +++ b/cmd/influxd/run/backup_restore_test.go @@ -13,7 +13,6 @@ import ( ) func TestServer_BackupAndRestore(t *testing.T) { - t.Skip("currently fails intermittently. See issue https://github.com/influxdata/influxdb/issues/6590") config := NewConfig() config.Data.Engine = "tsm1" config.Data.Dir, _ = ioutil.TempDir("", "data_backup") @@ -25,7 +24,7 @@ func TestServer_BackupAndRestore(t *testing.T) { db := "mydb" rp := "forever" - expected := `{"results":[{"series":[{"name":"myseries","columns":["time","host","value"],"values":[["1970-01-01T00:00:00.001Z","A",23]]}]}]}` + expected := `{"results":[{"statement_id":0,"series":[{"name":"myseries","columns":["time","host","value"],"values":[["1970-01-01T00:00:00.001Z","A",23]]}]}]}` // set the cache snapshot size low so that a single point will cause TSM file creation config.Data.CacheSnapshotMemorySize = 1 From 194c5adfaf1a665c2ed6b72efd5b64dec91d61b1 Mon Sep 17 00:00:00 2001 From: Jason Wilder Date: Sat, 7 Jan 2017 12:39:45 -0700 Subject: [PATCH 2/3] Fix race on t.refs Read at 0x00c42018f620 by goroutine 58: github.com/influxdata/influxdb/tsdb/engine/tsm1.(*TSMReader).Close() /root/go/src/github.com/influxdata/influxdb/tsdb/engine/tsm1/reader.go:330 +0x94 github.com/influxdata/influxdb/tsdb/engine/tsm1.(*FileStore).Close() /root/go/src/github.com/influxdata/influxdb/tsdb/engine/tsm1/file_store.go:464 +0x123 Previous write at 0x00c42018f620 by goroutine 63: sync/atomic.AddInt64() /usr/local/go/src/runtime/race_amd64.s:276 +0xb github.com/influxdata/influxdb/tsdb/engine/tsm1.(*TSMReader).Unref() /root/go/src/github.com/influxdata/influxdb/tsdb/engine/tsm1/reader.go:352 +0x43 github.com/influxdata/influxdb/tsdb/engine/tsm1.(*KeyCursor).Close() --- tsdb/engine/tsm1/reader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsdb/engine/tsm1/reader.go b/tsdb/engine/tsm1/reader.go index f8e5485894..fca6bb3ca5 100644 --- a/tsdb/engine/tsm1/reader.go +++ b/tsdb/engine/tsm1/reader.go @@ -327,7 +327,7 @@ func (t *TSMReader) Close() error { t.mu.Lock() defer t.mu.Unlock() - if t.refs > 0 { + if t.InUse() { return ErrFileInUse } From eb4d311c0a7d142641e535f0956d39bb8fc50cac Mon Sep 17 00:00:00 2001 From: Jason Wilder Date: Sat, 7 Jan 2017 14:36:25 -0700 Subject: [PATCH 3/3] Add retry/backup when backing up a shard fails The backup command can fail if a snapshot is running which silently closes the connection. This causes the backup shard command to continue on as if nothing failed. --- cmd/influxd/backup/backup.go | 40 ++++++++++++++++++++++-------------- tsdb/engine/tsm1/engine.go | 6 +++--- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/cmd/influxd/backup/backup.go b/cmd/influxd/backup/backup.go index 5ba5069190..448d0ac0c3 100644 --- a/cmd/influxd/backup/backup.go +++ b/cmd/influxd/backup/backup.go @@ -301,24 +301,34 @@ func (cmd *Command) download(req *snapshotter.Request, path string) error { } defer f.Close() - // Connect to snapshotter service. - conn, err := tcp.Dial("tcp", cmd.host, snapshotter.MuxHeader) - if err != nil { - return err - } - defer conn.Close() + for i := 0; i < 10; i++ { + if err = func() error { + // Connect to snapshotter service. + conn, err := tcp.Dial("tcp", cmd.host, snapshotter.MuxHeader) + if err != nil { + return err + } + defer conn.Close() - // Write the request - if err := json.NewEncoder(conn).Encode(req); err != nil { - return fmt.Errorf("encode snapshot request: %s", err) + // Write the request + if err := json.NewEncoder(conn).Encode(req); err != nil { + return fmt.Errorf("encode snapshot request: %s", err) + } + + // Read snapshot from the connection + if n, err := io.Copy(f, conn); err != nil || n == 0 { + return fmt.Errorf("copy backup to file: err=%v, n=%d", err, n) + } + return nil + }(); err == nil { + break + } else if err != nil { + cmd.Logger.Printf("Download shard %v failed %s. Retrying (%d)...\n", req.ShardID, err, i) + time.Sleep(time.Second) + } } - // Read snapshot from the connection - if _, err := io.Copy(f, conn); err != nil { - return fmt.Errorf("copy backup to file: %s", err) - } - - return nil + return err } // requestInfo will request the database or retention policy information from the host diff --git a/tsdb/engine/tsm1/engine.go b/tsdb/engine/tsm1/engine.go index 37ba8ee2a3..4d09922b05 100644 --- a/tsdb/engine/tsm1/engine.go +++ b/tsdb/engine/tsm1/engine.go @@ -489,6 +489,9 @@ func (e *Engine) Backup(w io.Writer, basePath string, since time.Time) error { return err } + tw := tar.NewWriter(w) + defer tw.Close() + // Remove the temporary snapshot dir defer os.RemoveAll(path) @@ -515,9 +518,6 @@ func (e *Engine) Backup(w io.Writer, basePath string, since time.Time) error { return nil } - tw := tar.NewWriter(w) - defer tw.Close() - for _, f := range files { if err := e.writeFileToBackup(f, basePath, filepath.Join(path, f.Name()), tw); err != nil { return err