diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff7f1f31f3..b716c44827 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - [#4198](https://github.com/influxdb/influxdb/pull/4198): Add basic cluster-service stats
 - [#4262](https://github.com/influxdb/influxdb/pull/4262): Allow configuration of UDP retention policy
 - [#4265](https://github.com/influxdb/influxdb/pull/4265): Add statistics for Hinted-Handoff
+- [#4284](https://github.com/influxdb/influxdb/pull/4284): Add exponential backoff for hinted-handoff failures
 
 ### Bugfixes
 - [#4166](https://github.com/influxdb/influxdb/pull/4166): Fix parser error on invalid SHOW
diff --git a/services/hh/config.go b/services/hh/config.go
index dfdca1a37e..b5ffe715fe 100644
--- a/services/hh/config.go
+++ b/services/hh/config.go
@@ -19,26 +19,36 @@ const (
 	// value of 0 disables the rate limit.
 	DefaultRetryRateLimit = 0
 
-	// DefaultRetryInterval is the default amout of time the system waits before
-	// attempting to flush hinted handoff queues.
+	// DefaultRetryInterval is the default amount of time the system waits before
+	// attempting to flush hinted handoff queues. With each failure of a hinted
+	// handoff write, this retry interval increases exponentially until it reaches
+	// the configured maximum retry interval.
 	DefaultRetryInterval = time.Second
+
+	// DefaultRetryMaxInterval is the maximum the hinted handoff retry interval
+	// will ever be.
+	DefaultRetryMaxInterval = time.Minute
 )
 
+// Config represents the configuration of the hinted handoff service.
 type Config struct {
-	Enabled        bool          `toml:"enabled"`
-	Dir            string        `toml:"dir"`
-	MaxSize        int64         `toml:"max-size"`
-	MaxAge         toml.Duration `toml:"max-age"`
-	RetryRateLimit int64         `toml:"retry-rate-limit"`
-	RetryInterval  toml.Duration `toml:"retry-interval"`
+	Enabled          bool          `toml:"enabled"`
+	Dir              string        `toml:"dir"`
+	MaxSize          int64         `toml:"max-size"`
+	MaxAge           toml.Duration `toml:"max-age"`
+	RetryRateLimit   int64         `toml:"retry-rate-limit"`
+	RetryInterval    toml.Duration `toml:"retry-interval"`
+	RetryMaxInterval toml.Duration `toml:"retry-max-interval"`
 }
 
+// NewConfig returns a Config populated with the package defaults.
 func NewConfig() Config {
 	return Config{
-		Enabled:        true,
-		MaxSize:        DefaultMaxSize,
-		MaxAge:         toml.Duration(DefaultMaxAge),
-		RetryRateLimit: DefaultRetryRateLimit,
-		RetryInterval:  toml.Duration(DefaultRetryInterval),
+		Enabled:          true,
+		MaxSize:          DefaultMaxSize,
+		MaxAge:           toml.Duration(DefaultMaxAge),
+		RetryRateLimit:   DefaultRetryRateLimit,
+		RetryInterval:    toml.Duration(DefaultRetryInterval),
+		RetryMaxInterval: toml.Duration(DefaultRetryMaxInterval),
 	}
 }
diff --git a/services/hh/config_test.go b/services/hh/config_test.go
index 2d2f4f217e..9963a31af3 100644
--- a/services/hh/config_test.go
+++ b/services/hh/config_test.go
@@ -14,6 +14,7 @@ func TestConfigParse(t *testing.T) {
 	if _, err := toml.Decode(`
 enabled = false
 retry-interval = "10m"
+retry-max-interval = "100m"
 max-size=2048
 max-age="20m"
 retry-rate-limit=1000
@@ -30,6 +31,10 @@ retry-rate-limit=1000
 		t.Fatalf("unexpected retry interval: got %v, exp %v", c.RetryInterval, exp)
 	}
 
+	if exp := 100 * time.Minute; c.RetryMaxInterval.String() != exp.String() {
+		t.Fatalf("unexpected retry max interval: got %v, exp %v", c.RetryMaxInterval, exp)
+	}
+
 	if exp := 20 * time.Minute; c.MaxAge.String() != exp.String() {
 		t.Fatalf("unexpected max age: got %v, exp %v", c.MaxAge, exp)
 	}
diff --git a/services/hh/service.go b/services/hh/service.go
index cf9424a123..88860350eb 100644
--- a/services/hh/service.go
+++ b/services/hh/service.go
@@ -119,17 +119,33 @@ func (s *Service) WriteShard(shardID, ownerID uint64, points []models.Point) err
 func (s *Service) retryWrites() {
 	defer s.wg.Done()
-	ticker := time.NewTicker(time.Duration(s.cfg.RetryInterval))
-	defer ticker.Stop()
+	// Resolve both intervals once. The base interval is capped at the maximum
+	// so that neither the initial wait nor a post-success reset can exceed it.
+	maxInterval := time.Duration(s.cfg.RetryMaxInterval)
+	baseInterval := time.Duration(s.cfg.RetryInterval)
+	if baseInterval > maxInterval {
+		baseInterval = maxInterval
+	}
+	currInterval := baseInterval
+
 	for {
 		select {
 		case <-s.closing:
 			return
-		case <-ticker.C:
+		case <-time.After(currInterval):
 			s.statMap.Add(processReq, 1)
 			if err := s.HintedHandoff.Process(); err != nil && err != io.EOF {
 				s.statMap.Add(processReqFail, 1)
 				s.Logger.Printf("retried write failed: %v", err)
+
+				// Failure: double the wait, never going past the maximum.
+				currInterval *= 2
+				if currInterval > maxInterval {
+					currInterval = maxInterval
+				}
+			} else {
+				// Success! Return to the configured interval (already capped at the maximum).
+				currInterval = baseInterval
 			}
 		}
 	}
 }