mirror of https://github.com/milvus-io/milvus.git
parent
69087ff8dd
commit
00b3fcb98b
|
@ -26,6 +26,8 @@ import (
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/milvus-io/milvus/internal/util/timerecord"
|
||||||
|
|
||||||
datanodeclient "github.com/milvus-io/milvus/internal/distributed/datanode/client"
|
datanodeclient "github.com/milvus-io/milvus/internal/distributed/datanode/client"
|
||||||
rootcoordclient "github.com/milvus-io/milvus/internal/distributed/rootcoord/client"
|
rootcoordclient "github.com/milvus-io/milvus/internal/distributed/rootcoord/client"
|
||||||
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
|
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
|
||||||
|
@ -454,9 +456,9 @@ func (s *Server) startDataNodeTtLoop(ctx context.Context) {
|
||||||
ttMsgStream.Start()
|
ttMsgStream.Start()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
var checker *LongTermChecker
|
var checker *timerecord.LongTermChecker
|
||||||
if enableTtChecker {
|
if enableTtChecker {
|
||||||
checker = NewLongTermChecker(ctx, ttCheckerName, ttMaxInterval, ttCheckerWarnMsg)
|
checker = timerecord.NewLongTermChecker(ctx, ttCheckerName, ttMaxInterval, ttCheckerWarnMsg)
|
||||||
checker.Start()
|
checker.Start()
|
||||||
defer checker.Stop()
|
defer checker.Stop()
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,10 +19,8 @@ package datacoord
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/milvus-io/milvus/internal/log"
|
|
||||||
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
||||||
"github.com/milvus-io/milvus/internal/util/tsoutil"
|
"github.com/milvus-io/milvus/internal/util/tsoutil"
|
||||||
)
|
)
|
||||||
|
@ -68,54 +66,6 @@ func FailResponse(status *commonpb.Status, reason string) {
|
||||||
status.Reason = reason
|
status.Reason = reason
|
||||||
}
|
}
|
||||||
|
|
||||||
// LongTermChecker checks we receive at least one msg in d duration. If not, checker
|
|
||||||
// will print a warn message.
|
|
||||||
type LongTermChecker struct {
|
|
||||||
d time.Duration
|
|
||||||
t *time.Ticker
|
|
||||||
ch chan struct{}
|
|
||||||
warn string
|
|
||||||
name string
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewLongTermChecker creates a long term checker specified name, checking interval and warning string to print
|
|
||||||
func NewLongTermChecker(ctx context.Context, name string, d time.Duration, warn string) *LongTermChecker {
|
|
||||||
c := &LongTermChecker{
|
|
||||||
name: name,
|
|
||||||
d: d,
|
|
||||||
warn: warn,
|
|
||||||
ch: make(chan struct{}),
|
|
||||||
}
|
|
||||||
return c
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start starts the check process
|
|
||||||
func (c *LongTermChecker) Start() {
|
|
||||||
c.t = time.NewTicker(c.d)
|
|
||||||
go func() {
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-c.ch:
|
|
||||||
log.Warn(fmt.Sprintf("long term checker [%s] shutdown", c.name))
|
|
||||||
return
|
|
||||||
case <-c.t.C:
|
|
||||||
log.Warn(c.warn)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check resets the time ticker
|
|
||||||
func (c *LongTermChecker) Check() {
|
|
||||||
c.t.Reset(c.d)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop stops the checker
|
|
||||||
func (c *LongTermChecker) Stop() {
|
|
||||||
c.t.Stop()
|
|
||||||
close(c.ch)
|
|
||||||
}
|
|
||||||
|
|
||||||
func getTimetravelReverseTime(ctx context.Context, allocator allocator) (*timetravel, error) {
|
func getTimetravelReverseTime(ctx context.Context, allocator allocator) (*timetravel, error) {
|
||||||
ts, err := allocator.allocTimestamp(ctx)
|
ts, err := allocator.allocTimestamp(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -12,14 +12,23 @@
|
||||||
package flowgraph
|
package flowgraph
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/milvus-io/milvus/internal/util/timerecord"
|
||||||
|
|
||||||
"github.com/milvus-io/milvus/internal/log"
|
"github.com/milvus-io/milvus/internal/log"
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// TODO: better to be configured
|
||||||
|
nodeCtxTtInterval = 2 * time.Minute
|
||||||
|
enableTtChecker = true
|
||||||
|
)
|
||||||
|
|
||||||
// Node is the interface defines the behavior of flowgraph
|
// Node is the interface defines the behavior of flowgraph
|
||||||
type Node interface {
|
type Node interface {
|
||||||
Name() string
|
Name() string
|
||||||
|
@ -61,6 +70,17 @@ func (nodeCtx *nodeCtx) Start(wg *sync.WaitGroup) {
|
||||||
// 2. invoke node.Operate
|
// 2. invoke node.Operate
|
||||||
// 3. deliver the Operate result to downstream nodes
|
// 3. deliver the Operate result to downstream nodes
|
||||||
func (nodeCtx *nodeCtx) work() {
|
func (nodeCtx *nodeCtx) work() {
|
||||||
|
// TODO: necessary to check every node?
|
||||||
|
name := fmt.Sprintf("nodeCtxTtChecker-%s", nodeCtx.node.Name())
|
||||||
|
warn := fmt.Sprintf("node %s haven't received input for %f minutes",
|
||||||
|
nodeCtx.node.Name(), nodeCtxTtInterval.Minutes())
|
||||||
|
var checker *timerecord.LongTermChecker
|
||||||
|
if enableTtChecker {
|
||||||
|
checker = timerecord.NewLongTermChecker(context.Background(), name, nodeCtxTtInterval, warn)
|
||||||
|
checker.Start()
|
||||||
|
defer checker.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-nodeCtx.closeCh:
|
case <-nodeCtx.closeCh:
|
||||||
|
@ -76,6 +96,10 @@ func (nodeCtx *nodeCtx) work() {
|
||||||
n := nodeCtx.node
|
n := nodeCtx.node
|
||||||
res = n.Operate(inputs)
|
res = n.Operate(inputs)
|
||||||
|
|
||||||
|
if enableTtChecker {
|
||||||
|
checker.Check()
|
||||||
|
}
|
||||||
|
|
||||||
downstreamLength := len(nodeCtx.downstreamInputChanIdx)
|
downstreamLength := len(nodeCtx.downstreamInputChanIdx)
|
||||||
if len(nodeCtx.downstream) < downstreamLength {
|
if len(nodeCtx.downstream) < downstreamLength {
|
||||||
log.Warn("", zap.Any("nodeCtx.downstream length", len(nodeCtx.downstream)))
|
log.Warn("", zap.Any("nodeCtx.downstream length", len(nodeCtx.downstream)))
|
||||||
|
|
|
@ -12,6 +12,8 @@
|
||||||
package timerecord
|
package timerecord
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -75,3 +77,51 @@ func (tr *TimeRecorder) printTimeRecord(msg string, span time.Duration) {
|
||||||
str += "ms)"
|
str += "ms)"
|
||||||
log.Debug(str)
|
log.Debug(str)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LongTermChecker checks we receive at least one msg in d duration. If not, checker
|
||||||
|
// will print a warn message.
|
||||||
|
type LongTermChecker struct {
|
||||||
|
d time.Duration
|
||||||
|
t *time.Ticker
|
||||||
|
ch chan struct{}
|
||||||
|
warn string
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewLongTermChecker creates a long term checker specified name, checking interval and warning string to print
|
||||||
|
func NewLongTermChecker(ctx context.Context, name string, d time.Duration, warn string) *LongTermChecker {
|
||||||
|
c := &LongTermChecker{
|
||||||
|
name: name,
|
||||||
|
d: d,
|
||||||
|
warn: warn,
|
||||||
|
ch: make(chan struct{}),
|
||||||
|
}
|
||||||
|
return c
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start starts the check process
|
||||||
|
func (c *LongTermChecker) Start() {
|
||||||
|
c.t = time.NewTicker(c.d)
|
||||||
|
go func() {
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-c.ch:
|
||||||
|
log.Warn(fmt.Sprintf("long term checker [%s] shutdown", c.name))
|
||||||
|
return
|
||||||
|
case <-c.t.C:
|
||||||
|
log.Warn(c.warn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check resets the time ticker
|
||||||
|
func (c *LongTermChecker) Check() {
|
||||||
|
c.t.Reset(c.d)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop stops the checker
|
||||||
|
func (c *LongTermChecker) Stop() {
|
||||||
|
c.t.Stop()
|
||||||
|
close(c.ch)
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue