Add log for flowgraph (#14441)

Signed-off-by: dragondriver <jiquan.long@zilliz.com>
pull/14238/head
Jiquan Long 2021-12-29 14:55:21 +08:00 committed by GitHub
parent 69087ff8dd
commit 00b3fcb98b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 78 additions and 52 deletions

View File

@ -26,6 +26,8 @@ import (
"syscall" "syscall"
"time" "time"
"github.com/milvus-io/milvus/internal/util/timerecord"
datanodeclient "github.com/milvus-io/milvus/internal/distributed/datanode/client" datanodeclient "github.com/milvus-io/milvus/internal/distributed/datanode/client"
rootcoordclient "github.com/milvus-io/milvus/internal/distributed/rootcoord/client" rootcoordclient "github.com/milvus-io/milvus/internal/distributed/rootcoord/client"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd" etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
@ -454,9 +456,9 @@ func (s *Server) startDataNodeTtLoop(ctx context.Context) {
ttMsgStream.Start() ttMsgStream.Start()
go func() { go func() {
var checker *LongTermChecker var checker *timerecord.LongTermChecker
if enableTtChecker { if enableTtChecker {
checker = NewLongTermChecker(ctx, ttCheckerName, ttMaxInterval, ttCheckerWarnMsg) checker = timerecord.NewLongTermChecker(ctx, ttCheckerName, ttMaxInterval, ttCheckerWarnMsg)
checker.Start() checker.Start()
defer checker.Stop() defer checker.Stop()
} }

View File

@ -19,10 +19,8 @@ package datacoord
import ( import (
"context" "context"
"errors" "errors"
"fmt"
"time" "time"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/commonpb" "github.com/milvus-io/milvus/internal/proto/commonpb"
"github.com/milvus-io/milvus/internal/util/tsoutil" "github.com/milvus-io/milvus/internal/util/tsoutil"
) )
@ -68,54 +66,6 @@ func FailResponse(status *commonpb.Status, reason string) {
status.Reason = reason status.Reason = reason
} }
// LongTermChecker checks we receive at least one msg in d duration. If not, checker
// will print a warn message.
type LongTermChecker struct {
d time.Duration
t *time.Ticker
ch chan struct{}
warn string
name string
}
// NewLongTermChecker creates a long term checker specified name, checking interval and warning string to print
func NewLongTermChecker(ctx context.Context, name string, d time.Duration, warn string) *LongTermChecker {
c := &LongTermChecker{
name: name,
d: d,
warn: warn,
ch: make(chan struct{}),
}
return c
}
// Start starts the check process
func (c *LongTermChecker) Start() {
c.t = time.NewTicker(c.d)
go func() {
for {
select {
case <-c.ch:
log.Warn(fmt.Sprintf("long term checker [%s] shutdown", c.name))
return
case <-c.t.C:
log.Warn(c.warn)
}
}
}()
}
// Check resets the time ticker
func (c *LongTermChecker) Check() {
c.t.Reset(c.d)
}
// Stop stops the checker
func (c *LongTermChecker) Stop() {
c.t.Stop()
close(c.ch)
}
func getTimetravelReverseTime(ctx context.Context, allocator allocator) (*timetravel, error) { func getTimetravelReverseTime(ctx context.Context, allocator allocator) (*timetravel, error) {
ts, err := allocator.allocTimestamp(ctx) ts, err := allocator.allocTimestamp(ctx)
if err != nil { if err != nil {

View File

@ -12,14 +12,23 @@
package flowgraph package flowgraph
import ( import (
"context"
"fmt" "fmt"
"sync" "sync"
"time" "time"
"github.com/milvus-io/milvus/internal/util/timerecord"
"github.com/milvus-io/milvus/internal/log" "github.com/milvus-io/milvus/internal/log"
"go.uber.org/zap" "go.uber.org/zap"
) )
const (
// TODO: better to be configured
nodeCtxTtInterval = 2 * time.Minute
enableTtChecker = true
)
// Node is the interface defines the behavior of flowgraph // Node is the interface defines the behavior of flowgraph
type Node interface { type Node interface {
Name() string Name() string
@ -61,6 +70,17 @@ func (nodeCtx *nodeCtx) Start(wg *sync.WaitGroup) {
// 2. invoke node.Operate // 2. invoke node.Operate
// 3. deliver the Operate result to downstream nodes // 3. deliver the Operate result to downstream nodes
func (nodeCtx *nodeCtx) work() { func (nodeCtx *nodeCtx) work() {
// TODO: necessary to check every node?
name := fmt.Sprintf("nodeCtxTtChecker-%s", nodeCtx.node.Name())
warn := fmt.Sprintf("node %s haven't received input for %f minutes",
nodeCtx.node.Name(), nodeCtxTtInterval.Minutes())
var checker *timerecord.LongTermChecker
if enableTtChecker {
checker = timerecord.NewLongTermChecker(context.Background(), name, nodeCtxTtInterval, warn)
checker.Start()
defer checker.Stop()
}
for { for {
select { select {
case <-nodeCtx.closeCh: case <-nodeCtx.closeCh:
@ -76,6 +96,10 @@ func (nodeCtx *nodeCtx) work() {
n := nodeCtx.node n := nodeCtx.node
res = n.Operate(inputs) res = n.Operate(inputs)
if enableTtChecker {
checker.Check()
}
downstreamLength := len(nodeCtx.downstreamInputChanIdx) downstreamLength := len(nodeCtx.downstreamInputChanIdx)
if len(nodeCtx.downstream) < downstreamLength { if len(nodeCtx.downstream) < downstreamLength {
log.Warn("", zap.Any("nodeCtx.downstream length", len(nodeCtx.downstream))) log.Warn("", zap.Any("nodeCtx.downstream length", len(nodeCtx.downstream)))

View File

@ -12,6 +12,8 @@
package timerecord package timerecord
import ( import (
"context"
"fmt"
"strconv" "strconv"
"time" "time"
@ -75,3 +77,51 @@ func (tr *TimeRecorder) printTimeRecord(msg string, span time.Duration) {
str += "ms)" str += "ms)"
log.Debug(str) log.Debug(str)
} }
// LongTermChecker checks we receive at least one msg in d duration. If not, checker
// will print a warn message.
type LongTermChecker struct {
d time.Duration
t *time.Ticker
ch chan struct{}
warn string
name string
}
// NewLongTermChecker creates a long term checker specified name, checking interval and warning string to print
func NewLongTermChecker(ctx context.Context, name string, d time.Duration, warn string) *LongTermChecker {
c := &LongTermChecker{
name: name,
d: d,
warn: warn,
ch: make(chan struct{}),
}
return c
}
// Start starts the check process
func (c *LongTermChecker) Start() {
c.t = time.NewTicker(c.d)
go func() {
for {
select {
case <-c.ch:
log.Warn(fmt.Sprintf("long term checker [%s] shutdown", c.name))
return
case <-c.t.C:
log.Warn(c.warn)
}
}
}()
}
// Check resets the time ticker
func (c *LongTermChecker) Check() {
c.t.Reset(c.d)
}
// Stop stops the checker
func (c *LongTermChecker) Stop() {
c.t.Stop()
close(c.ch)
}