Use GroupChecker for flowgraph node input check (#14916)

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
pull/14952/head
congqixia 2022-01-06 19:27:25 +08:00 committed by GitHub
parent 2fdd74affa
commit da4182a90f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 172 additions and 9 deletions

View File

@ -17,7 +17,6 @@
package flowgraph
import (
"context"
"fmt"
"sync"
"time"
@ -75,15 +74,14 @@ func (nodeCtx *nodeCtx) Start(wg *sync.WaitGroup) {
// 2. invoke node.Operate
// 3. deliver the Operate result to downstream nodes
func (nodeCtx *nodeCtx) work() {
// TODO: necessary to check every node?
name := fmt.Sprintf("nodeCtxTtChecker-%s", nodeCtx.node.Name())
warn := fmt.Sprintf("node %s haven't received input for %f minutes",
nodeCtx.node.Name(), nodeCtxTtInterval.Minutes())
var checker *timerecord.LongTermChecker
var checker *timerecord.GroupChecker
if enableTtChecker {
checker = timerecord.NewLongTermChecker(context.Background(), name, nodeCtxTtInterval, warn)
checker.Start()
defer checker.Stop()
checker = timerecord.GetGroupChecker("fgNode", nodeCtxTtInterval, func(list []string) {
log.Warn("some node(s) haven't received input", zap.Strings("list", list), zap.Duration("duration ", nodeCtxTtInterval))
})
checker.Check(name)
defer checker.Remove(name)
}
for {
@ -102,7 +100,7 @@ func (nodeCtx *nodeCtx) work() {
res = n.Operate(inputs)
if enableTtChecker {
checker.Check()
checker.Check(name)
}
downstreamLength := len(nodeCtx.downstreamInputChanIdx)

View File

@ -0,0 +1,113 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package timerecord
import (
"sync"
"time"
)
// groups is the package-level registry mapping a group name (string) to its
// *GroupChecker. GetGroupChecker stores entries in it and GroupChecker.Stop
// deletes the entry of the stopped checker.
var groups sync.Map
// GroupChecker watches the members of one named group and reports the ones
// that have been silent: on every check period, each member whose latest
// report is older than that period is collected and handed to the callback.
type GroupChecker struct {
	// Identity and configuration, set when the checker is constructed.
	groupName string              // key of this checker in the groups registry
	d         time.Duration       // check period, also used as the silence threshold
	fn        func(list []string) // invoked with the names of the silent members

	// State owned by the worker goroutine.
	t       *time.Ticker  // internal ticker driving the periodic check
	ch      chan struct{} // closing signal for the worker goroutine
	lastest sync.Map      // member name => latest report time

	// Lifecycle guards.
	initOnce sync.Once // makes init run at most once
	stopOnce sync.Once // makes Stop run at most once
}
// init creates the closing-signal channel and launches the worker goroutine.
// It is guarded by initOnce, so repeated calls start the worker only once.
func (gc *GroupChecker) init() {
	bootstrap := func() {
		gc.ch = make(chan struct{})
		go gc.work()
	}
	gc.initOnce.Do(bootstrap)
}
// work is the worker loop of the checker.
//
// On every tick of the internal ticker (period gc.d) it walks all registered
// members and collects those whose latest report is older than gc.d. When at
// least one such member exists and a callback is configured, the callback is
// invoked with the collected names. The loop returns once the closing signal
// channel gc.ch is closed.
func (gc *GroupChecker) work() {
	// NOTE: time.NewTicker panics when gc.d is not positive.
	gc.t = time.NewTicker(gc.d)
	// Release the ticker when the loop exits so it does not outlive the worker.
	defer gc.t.Stop()

	for {
		select {
		case <-gc.t.C:
		case <-gc.ch:
			return
		}

		// Built fresh for each round; stays nil when nobody is silent.
		var list []string
		gc.lastest.Range(func(k, v interface{}) bool {
			name := k.(string)
			ts := v.(time.Time)
			if time.Since(ts) > gc.d {
				list = append(list, name)
			}
			return true // keep iterating over all members
		})
		if len(list) > 0 && gc.fn != nil {
			gc.fn(list)
		}
	}
}
// Check records the current time as the latest report time of the member
// called name, adding the member to the watch list if it is not there yet.
func (gc *GroupChecker) Check(name string) {
	now := time.Now()
	gc.lastest.Store(name, now)
}
// Remove deletes name from the watch list, so the member is no longer
// considered by the periodic check.
func (gc *GroupChecker) Remove(name string) {
gc.lastest.Delete(name)
}
// Stop shuts the GroupChecker down: it closes the closing-signal channel,
// which makes the worker loop return, and deletes the checker's group name
// from the groups registry. It is guarded by stopOnce, so only the first call
// has any effect.
func (gc *GroupChecker) Stop() {
	shutdown := func() {
		close(gc.ch)
		groups.Delete(gc.groupName)
	}
	gc.stopOnce.Do(shutdown)
}
// GetGroupChecker returns the GroupChecker registered under groupName.
//
// If no checker is registered under that name yet, a new one is created with
// the provided duration and callback, registered, and started. Otherwise the
// existing checker is returned and duration and fn are ignored.
func GetGroupChecker(groupName string, duration time.Duration, fn func([]string)) *GroupChecker {
	candidate := &GroupChecker{
		groupName: groupName,
		d:         duration,
		fn:        fn,
	}
	// LoadOrStore keeps whichever checker was registered first; the candidate
	// is simply dropped when another checker already owns the name.
	actual, _ := groups.LoadOrStore(groupName, candidate)
	gc := actual.(*GroupChecker)
	// Always run init on the instance that is handed out. initOnce makes this
	// a no-op for an already started checker, and it blocks until a concurrent
	// first initialisation has finished, so no caller can receive a checker
	// whose closing channel is still nil (Stop on such a checker would panic).
	gc.init()
	return gc
}

View File

@ -0,0 +1,52 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package timerecord
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// TestGroupChecker verifies that GetGroupChecker hands out one shared checker
// per group name (parameters of later calls are ignored), that silent members
// are reported through the callback, that Remove takes a member off the watch
// list, and that calling Stop more than once does not panic.
func TestGroupChecker(t *testing.T) {
	groupName := `test_group`
	signal := make(chan []string, 1)
	gc1 := GetGroupChecker(groupName, 10*time.Millisecond, func(list []string) {
		// Non-blocking send: the worker goroutine must never get stuck on a
		// full channel, otherwise it could not observe the stop signal.
		select {
		case signal <- list:
		default:
		}
	})
	gc1.Check("1")

	gc2 := GetGroupChecker(groupName, time.Second, func(list []string) {
		// This would run on the worker goroutine, where FailNow is not
		// allowed; Error is safe to call from any goroutine.
		t.Error("callback of an ignored registration must not be invoked")
	})
	gc2.Check("2")

	// Both handles refer to the checker created by the first call.
	assert.True(t, gc1 == gc2)
	assert.Equal(t, 10*time.Millisecond, gc2.d)

	// sameMembers compares two name lists ignoring order, because the
	// iteration order of sync.Map.Range is not defined.
	sameMembers := func(want, got []string) bool {
		if len(want) != len(got) {
			return false
		}
		remaining := make(map[string]int, len(want))
		for _, name := range want {
			remaining[name]++
		}
		for _, name := range got {
			if remaining[name] == 0 {
				return false
			}
			remaining[name]--
		}
		return true
	}

	// waitFor consumes reports until one carries exactly the wanted members.
	// Earlier reports may be partial (a member not yet past the threshold) or
	// stale (produced before a Remove), so they are skipped.
	waitFor := func(want []string) bool {
		timeout := time.After(time.Second)
		for {
			select {
			case list := <-signal:
				if sameMembers(want, list) {
					return true
				}
			case <-timeout:
				return false
			}
		}
	}

	assert.True(t, waitFor([]string{"1", "2"}), "both silent members should be reported")

	gc2.Remove("2")
	assert.True(t, waitFor([]string{"1"}), "only the remaining member should be reported")

	assert.NotPanics(t, func() {
		gc1.Stop()
		gc2.Stop()
	})
}