Merge pull request #4284 from influxdb/hh_backoff

Exponential hinted-handoff interval on fail
pull/4293/head
Philip O'Toole 2015-10-01 12:13:56 -07:00
commit 0a63bb1883
4 changed files with 42 additions and 16 deletions

View File

@ -11,6 +11,7 @@
- [#4198](https://github.com/influxdb/influxdb/pull/4198): Add basic cluster-service stats
- [#4262](https://github.com/influxdb/influxdb/pull/4262): Allow configuration of UDP retention policy
- [#4265](https://github.com/influxdb/influxdb/pull/4265): Add statistics for Hinted-Handoff
- [#4284](https://github.com/influxdb/influxdb/pull/4284): Add exponential backoff for hinted-handoff failures
### Bugfixes
- [#4166](https://github.com/influxdb/influxdb/pull/4166): Fix parser error on invalid SHOW

View File

@ -19,26 +19,34 @@ const (
// value of 0 disables the rate limit.
DefaultRetryRateLimit = 0
// DefaultRetryInterval is the default amount of time the system waits before
// attempting to flush hinted handoff queues. With each failure of a hinted
// handoff write, this retry interval increases exponentially until it reaches
// the configured maximum (retry-max-interval).
DefaultRetryInterval = time.Second
// DefaultRetryMaxInterval is the maximum the hinted handoff retry interval
// will ever be.
DefaultRetryMaxInterval = time.Minute
)
// Config holds the hinted-handoff settings decoded from the TOML
// configuration. Each field is declared exactly once; the retry fields
// together describe the backoff applied between attempts to flush the
// hinted-handoff queues.
type Config struct {
	// Enabled is read from the "enabled" key; NewConfig defaults it to true.
	Enabled bool `toml:"enabled"`

	// Dir is read from the "dir" key; NewConfig leaves it empty.
	// NOTE(review): presumably the on-disk location of the queues — confirm
	// against the service that consumes this config.
	Dir string `toml:"dir"`

	// MaxSize is read from "max-size" and defaults to DefaultMaxSize.
	MaxSize int64 `toml:"max-size"`

	// MaxAge is read from "max-age" and defaults to DefaultMaxAge.
	MaxAge toml.Duration `toml:"max-age"`

	// RetryRateLimit is read from "retry-rate-limit". The default,
	// DefaultRetryRateLimit, is 0, which disables the rate limit.
	RetryRateLimit int64 `toml:"retry-rate-limit"`

	// RetryInterval is the starting amount of time to wait before attempting
	// to flush the hinted-handoff queues. It grows with each failed write
	// until it reaches RetryMaxInterval.
	RetryInterval toml.Duration `toml:"retry-interval"`

	// RetryMaxInterval is the upper bound the retry interval is clamped to.
	RetryMaxInterval toml.Duration `toml:"retry-max-interval"`
}
// NewConfig returns a Config populated with the package defaults: hinted
// handoff enabled, and the size, age, rate-limit, retry-interval and
// retry-max-interval settings taken from the corresponding Default*
// constants. Dir is left at its zero value. Each key appears once in the
// literal; repeating a field name in a struct literal does not compile.
func NewConfig() Config {
	return Config{
		Enabled:          true,
		MaxSize:          DefaultMaxSize,
		MaxAge:           toml.Duration(DefaultMaxAge),
		RetryRateLimit:   DefaultRetryRateLimit,
		RetryInterval:    toml.Duration(DefaultRetryInterval),
		RetryMaxInterval: toml.Duration(DefaultRetryMaxInterval),
	}
}

View File

@ -14,6 +14,7 @@ func TestConfigParse(t *testing.T) {
if _, err := toml.Decode(`
enabled = false
retry-interval = "10m"
retry-max-interval = "100m"
max-size=2048
max-age="20m"
retry-rate-limit=1000
@ -30,6 +31,10 @@ retry-rate-limit=1000
t.Fatalf("unexpected retry interval: got %v, exp %v", c.RetryInterval, exp)
}
if exp := 100 * time.Minute; c.RetryMaxInterval.String() != exp.String() {
t.Fatalf("unexpected retry max interval: got %v, exp %v", c.RetryMaxInterval, exp)
}
if exp := 20 * time.Minute; c.MaxAge.String() != exp.String() {
t.Fatalf("unexpected max age: got %v, exp %v", c.MaxAge, exp)
}

View File

@ -119,17 +119,29 @@ func (s *Service) WriteShard(shardID, ownerID uint64, points []models.Point) err
func (s *Service) retryWrites() {
defer s.wg.Done()
ticker := time.NewTicker(time.Duration(s.cfg.RetryInterval))
defer ticker.Stop()
currInterval := time.Duration(s.cfg.RetryInterval)
if currInterval > time.Duration(s.cfg.RetryMaxInterval) {
currInterval = time.Duration(s.cfg.RetryMaxInterval)
}
for {
select {
case <-s.closing:
return
case <-ticker.C:
case <-time.After(currInterval):
s.statMap.Add(processReq, 1)
if err := s.HintedHandoff.Process(); err != nil && err != io.EOF {
s.statMap.Add(processReqFail, 1)
s.Logger.Printf("retried write failed: %v", err)
currInterval = currInterval * 2
if currInterval > time.Duration(s.cfg.RetryMaxInterval) {
currInterval = time.Duration(s.cfg.RetryMaxInterval)
}
} else {
// Success! Return to configured interval.
currInterval = time.Duration(s.cfg.RetryInterval)
}
}
}