From 4eba2c1725b428173c5b6a4f1bbc73531785c58a Mon Sep 17 00:00:00 2001 From: Philip O'Toole Date: Wed, 30 Sep 2015 21:10:03 -0700 Subject: [PATCH 1/4] Add config support for max HH retry interval --- services/hh/config.go | 34 +++++++++++++++++++++------------- services/hh/config_test.go | 5 +++++ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/services/hh/config.go b/services/hh/config.go index dfdca1a37e..b5ffe715fe 100644 --- a/services/hh/config.go +++ b/services/hh/config.go @@ -19,26 +19,34 @@ const ( // value of 0 disables the rate limit. DefaultRetryRateLimit = 0 - // DefaultRetryInterval is the default amout of time the system waits before - // attempting to flush hinted handoff queues. + // DefaultRetryInterval is the default amount of time the system waits before + // attempting to flush hinted handoff queues. With each failure of a hinted + // handoff write, this retry interval increases exponentially until it reaches + // the maximum DefaultRetryInterval = time.Second + + // DefaultRetryMaxInterval is the maximum the hinted handoff retry interval + // will ever be. 
+ DefaultRetryMaxInterval = time.Minute ) type Config struct { - Enabled bool `toml:"enabled"` - Dir string `toml:"dir"` - MaxSize int64 `toml:"max-size"` - MaxAge toml.Duration `toml:"max-age"` - RetryRateLimit int64 `toml:"retry-rate-limit"` - RetryInterval toml.Duration `toml:"retry-interval"` + Enabled bool `toml:"enabled"` + Dir string `toml:"dir"` + MaxSize int64 `toml:"max-size"` + MaxAge toml.Duration `toml:"max-age"` + RetryRateLimit int64 `toml:"retry-rate-limit"` + RetryInterval toml.Duration `toml:"retry-interval"` + RetryMaxInterval toml.Duration `toml:"retry-max-interval"` } func NewConfig() Config { return Config{ - Enabled: true, - MaxSize: DefaultMaxSize, - MaxAge: toml.Duration(DefaultMaxAge), - RetryRateLimit: DefaultRetryRateLimit, - RetryInterval: toml.Duration(DefaultRetryInterval), + Enabled: true, + MaxSize: DefaultMaxSize, + MaxAge: toml.Duration(DefaultMaxAge), + RetryRateLimit: DefaultRetryRateLimit, + RetryInterval: toml.Duration(DefaultRetryInterval), + RetryMaxInterval: toml.Duration(DefaultRetryMaxInterval), } } diff --git a/services/hh/config_test.go b/services/hh/config_test.go index 2d2f4f217e..9963a31af3 100644 --- a/services/hh/config_test.go +++ b/services/hh/config_test.go @@ -14,6 +14,7 @@ func TestConfigParse(t *testing.T) { if _, err := toml.Decode(` enabled = false retry-interval = "10m" +retry-max-interval = "100m" max-size=2048 max-age="20m" retry-rate-limit=1000 @@ -30,6 +31,10 @@ retry-rate-limit=1000 t.Fatalf("unexpected retry interval: got %v, exp %v", c.RetryInterval, exp) } + if exp := 100 * time.Minute; c.RetryMaxInterval.String() != exp.String() { + t.Fatalf("unexpected retry max interval: got %v, exp %v", c.RetryMaxInterval, exp) + } + if exp := 20 * time.Minute; c.MaxAge.String() != exp.String() { t.Fatalf("unexpected max age: got %v, exp %v", c.MaxAge, exp) } From 878f776403cb576e689365bc88ce8f95efe8780e Mon Sep 17 00:00:00 2001 From: Philip O'Toole Date: Wed, 30 Sep 2015 21:27:13 -0700 Subject: [PATCH 2/4] 
Exponential backoff if any hinted-handoff fails --- services/hh/service.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/services/hh/service.go b/services/hh/service.go index cf9424a123..1dcf339cd5 100644 --- a/services/hh/service.go +++ b/services/hh/service.go @@ -119,17 +119,26 @@ func (s *Service) WriteShard(shardID, ownerID uint64, points []models.Point) err func (s *Service) retryWrites() { defer s.wg.Done() - ticker := time.NewTicker(time.Duration(s.cfg.RetryInterval)) - defer ticker.Stop() + currInterval := time.Duration(s.cfg.RetryInterval) + for { + select { case <-s.closing: return - case <-ticker.C: + case <-time.After(currInterval): s.statMap.Add(processReq, 1) if err := s.HintedHandoff.Process(); err != nil && err != io.EOF { s.statMap.Add(processReqFail, 1) s.Logger.Printf("retried write failed: %v", err) + + currInterval = currInterval * 2 + if currInterval > time.Duration(s.cfg.RetryMaxInterval) { + currInterval = time.Duration(s.cfg.RetryMaxInterval) + } + } else { + // Success! Return to configured interval. 
+ currInterval = time.Duration(s.cfg.RetryInterval) } } } From c7599e04090ddfdb6cec559dac1f04a89e0d065c Mon Sep 17 00:00:00 2001 From: Philip O'Toole Date: Thu, 1 Oct 2015 11:56:20 -0700 Subject: [PATCH 3/4] Update CHANGELOG [ci skip] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff7f1f31f3..b716c44827 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - [#4198](https://github.com/influxdb/influxdb/pull/4198): Add basic cluster-service stats - [#4262](https://github.com/influxdb/influxdb/pull/4262): Allow configuration of UDP retention policy - [#4265](https://github.com/influxdb/influxdb/pull/4265): Add statistics for Hinted-Handoff +- [#4284](https://github.com/influxdb/influxdb/pull/4284): Add exponential backoff for hinted-handoff failures ### Bugfixes - [#4166](https://github.com/influxdb/influxdb/pull/4166): Fix parser error on invalid SHOW From 8a1e5a9e5339f358f4f07bcf325cdc1770f78384 Mon Sep 17 00:00:00 2001 From: Philip O'Toole Date: Thu, 1 Oct 2015 12:04:06 -0700 Subject: [PATCH 4/4] Clamp initial value of HH retry interval The configured retry interval could exceed the configured maximum retry interval due to misconfiguration, so start from the maximum interval in that case. --- services/hh/service.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/hh/service.go b/services/hh/service.go index 1dcf339cd5..88860350eb 100644 --- a/services/hh/service.go +++ b/services/hh/service.go @@ -120,6 +120,9 @@ func (s *Service) WriteShard(shardID, ownerID uint64, points []models.Point) err func (s *Service) retryWrites() { defer s.wg.Done() currInterval := time.Duration(s.cfg.RetryInterval) + if currInterval > time.Duration(s.cfg.RetryMaxInterval) { + currInterval = time.Duration(s.cfg.RetryMaxInterval) + } for {