enhance: add graceful stop timeout to avoid node stop hang under extreme cases (#30317)

1. add a graceful stop timeout for coordinators, set to 5s
2. change the stop order of datacoord's internal components
3. change the querynode graceful stop timeout to 900s; this can
potentially be reduced to 600s once graceful stop works smoothly
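
In short, every component wrapper now bounds its inner Stop() with the configured timeout and force-exits the process once the timeout elapses. A condensed sketch of the helper added in cmd/components/util.go (shown in full further down):

// exitWhenStopTimeout runs stop() under a deadline; if stop() does not
// return in time, the process is terminated instead of hanging.
func exitWhenStopTimeout(stop func() error, timeout time.Duration) error {
	err := stopWithTimeout(stop, timeout) // stop() runs in a goroutine bounded by a context
	if errors.Is(err, errStopTimeout) {
		os.Exit(1) // graceful stop hung; force the node down
	}
	return err
}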

issue: #30310
also see pr: #30306

---------

Signed-off-by: chyezh <chyezh@outlook.com>
chyezh 2024-02-29 17:01:50 +08:00 committed by GitHub
parent 816ed671aa
commit 0c7474d7e8
25 changed files with 267 additions and 132 deletions


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
grpcdatacoordclient "github.com/milvus-io/milvus/internal/distributed/datacoord"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -57,10 +59,8 @@ func (s *DataCoord) Run() error {
// Stop terminates service
func (s *DataCoord) Stop() error {
if err := s.svr.Stop(); err != nil {
return err
}
return nil
timeout := paramtable.Get().DataCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(s.svr.Stop, timeout)
}
// GetComponentStates returns DataCoord's states


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
grpcdatanode "github.com/milvus-io/milvus/internal/distributed/datanode"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -60,10 +62,8 @@ func (d *DataNode) Run() error {
// Stop terminates service
func (d *DataNode) Stop() error {
if err := d.svr.Stop(); err != nil {
return err
}
return nil
timeout := paramtable.Get().DataNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(d.svr.Stop, timeout)
}
// GetComponentStates returns DataNode's states


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
grpcindexnode "github.com/milvus-io/milvus/internal/distributed/indexnode"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -58,10 +60,8 @@ func (n *IndexNode) Run() error {
// Stop terminates service
func (n *IndexNode) Stop() error {
if err := n.svr.Stop(); err != nil {
return err
}
return nil
timeout := paramtable.Get().IndexNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(n.svr.Stop, timeout)
}
// GetComponentStates returns IndexNode's states


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
grpcproxy "github.com/milvus-io/milvus/internal/distributed/proxy"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -59,10 +61,8 @@ func (n *Proxy) Run() error {
// Stop terminates service
func (n *Proxy) Stop() error {
if err := n.svr.Stop(); err != nil {
return err
}
return nil
timeout := paramtable.Get().ProxyCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(n.svr.Stop, timeout)
}
// GetComponentStates returns Proxy's states


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
grpcquerycoord "github.com/milvus-io/milvus/internal/distributed/querycoord"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -60,10 +62,8 @@ func (qs *QueryCoord) Run() error {
// Stop terminates service
func (qs *QueryCoord) Stop() error {
if err := qs.svr.Stop(); err != nil {
return err
}
return nil
timeout := paramtable.Get().QueryCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(qs.svr.Stop, timeout)
}
// GetComponentStates returns QueryCoord's states


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
grpcquerynode "github.com/milvus-io/milvus/internal/distributed/querynode"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -60,10 +62,8 @@ func (q *QueryNode) Run() error {
// Stop terminates service
func (q *QueryNode) Stop() error {
if err := q.svr.Stop(); err != nil {
return err
}
return nil
timeout := paramtable.Get().QueryNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(q.svr.Stop, timeout)
}
// GetComponentStates returns QueryNode's states


@ -18,6 +18,7 @@ package components
import (
"context"
"time"
"go.uber.org/zap"
@ -26,6 +27,7 @@ import (
rc "github.com/milvus-io/milvus/internal/distributed/rootcoord"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -59,10 +61,8 @@ func (rc *RootCoord) Run() error {
// Stop terminates service
func (rc *RootCoord) Stop() error {
if rc.svr != nil {
return rc.svr.Stop()
}
return nil
timeout := paramtable.Get().RootCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
return exitWhenStopTimeout(rc.svr.Stop, timeout)
}
// GetComponentStates returns RootCoord's states

cmd/components/util.go (new file)

@ -0,0 +1,38 @@
package components
import (
"context"
"os"
"time"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus/pkg/util/conc"
)
var errStopTimeout = errors.New("stop timeout")
// exitWhenStopTimeout stops a component with a timeout and exits the process if the timeout is exceeded.
func exitWhenStopTimeout(stop func() error, timeout time.Duration) error {
err := stopWithTimeout(stop, timeout)
if errors.Is(err, errStopTimeout) {
os.Exit(1)
}
return err
}
// stopWithTimeout stops a component with timeout.
func stopWithTimeout(stop func() error, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
future := conc.Go(func() (struct{}, error) {
return struct{}{}, stop()
})
select {
case <-future.Inner():
return errors.Wrap(future.Err(), "failed to stop component")
case <-ctx.Done():
return errStopTimeout
}
}
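
Note the design choice here: a stop() that hangs cannot be cancelled cooperatively, so the goroutine spawned by conc.Go is deliberately leaked on timeout and the caller exits the whole process instead. For readers unfamiliar with the conc helper, a stdlib-only sketch of the same idiom (illustrative, not part of this PR):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errStopTimeout = errors.New("stop timeout")

// stopWithTimeout runs stop() in a goroutine and returns either its
// result or errStopTimeout, whichever comes first.
func stopWithTimeout(stop func() error, timeout time.Duration) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	done := make(chan error, 1) // buffered so the goroutine can always finish
	go func() { done <- stop() }()
	select {
	case err := <-done:
		return err
	case <-ctx.Done():
		return errStopTimeout // the stop goroutine is deliberately leaked here
	}
}

func main() {
	err := stopWithTimeout(func() error {
		time.Sleep(2 * time.Second) // simulate a shutdown that hangs
		return nil
	}, 100*time.Millisecond)
	fmt.Println(err) // prints "stop timeout"
}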


@ -0,0 +1,38 @@
package components
import (
"testing"
"time"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/assert"
)
func TestExitWithTimeout(t *testing.T) {
// only the normal path can be tested; the timeout path calls os.Exit(1).
targetErr := errors.New("stop error")
err := exitWhenStopTimeout(func() error {
time.Sleep(1 * time.Second)
return targetErr
}, 5*time.Second)
assert.ErrorIs(t, err, targetErr)
}
func TestStopWithTimeout(t *testing.T) {
ch := make(chan struct{})
stop := func() error {
<-ch
return nil
}
err := stopWithTimeout(stop, 1*time.Second)
assert.ErrorIs(t, err, errStopTimeout)
targetErr := errors.New("stop error")
stop = func() error {
return targetErr
}
err = stopWithTimeout(stop, 1*time.Second)
assert.ErrorIs(t, err, targetErr)
}
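
As the comment in TestExitWithTimeout notes, the timeout branch cannot be asserted in-process because it calls os.Exit(1). If that path ever needed coverage, the common re-exec pattern would do it; a hedged sketch (test name and env var are illustrative, and the os and os/exec imports are assumed in addition to those above):

func TestExitWhenStopTimeout_ForcesExit(t *testing.T) {
	if os.Getenv("RUN_EXIT_PATH") == "1" {
		// child process: this call never returns, exitWhenStopTimeout calls os.Exit(1)
		exitWhenStopTimeout(func() error { select {} }, 10*time.Millisecond)
		return
	}
	cmd := exec.Command(os.Args[0], "-test.run=TestExitWhenStopTimeout_ForcesExit")
	cmd.Env = append(os.Environ(), "RUN_EXIT_PATH=1")
	err := cmd.Run()
	var exitErr *exec.ExitError
	if errors.As(err, &exitErr) && !exitErr.Success() {
		return // process exited non-zero, as expected
	}
	t.Fatalf("expected a non-zero exit, got %v", err)
}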


@ -271,16 +271,7 @@ func (s *Server) Register() error {
s.session.LivenessCheck(s.serverLoopCtx, func() {
logutil.Logger(s.ctx).Error("disconnected from etcd and exited", zap.Int64("serverID", s.session.GetServerID()))
if err := s.Stop(); err != nil {
logutil.Logger(s.ctx).Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.DataCoordRole).Dec()
// manually send signal to starter goroutine
if s.session.IsTriggerKill() {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
os.Exit(1)
})
return nil
}
@ -1102,16 +1093,24 @@ func (s *Server) Stop() error {
if !s.stateCode.CompareAndSwap(commonpb.StateCode_Healthy, commonpb.StateCode_Abnormal) {
return nil
}
logutil.Logger(s.ctx).Info("server shutdown")
s.cluster.Close()
logutil.Logger(s.ctx).Info("datacoord server shutdown")
s.garbageCollector.close()
s.stopServerLoop()
logutil.Logger(s.ctx).Info("datacoord garbage collector stopped")
if Params.DataCoordCfg.EnableCompaction.GetAsBool() {
s.stopCompactionTrigger()
s.stopCompactionHandler()
}
logutil.Logger(s.ctx).Info("datacoord compaction stopped")
s.indexBuilder.Stop()
logutil.Logger(s.ctx).Info("datacoord index builder stopped")
s.cluster.Close()
logutil.Logger(s.ctx).Info("datacoord cluster stopped")
s.stopServerLoop()
logutil.Logger(s.ctx).Info("datacoord serverloop stopped")
if s.session != nil {
s.session.Stop()
@ -1120,6 +1119,7 @@ func (s *Server) Stop() error {
if s.icSession != nil {
s.icSession.Stop()
}
logutil.Logger(s.ctx).Warn("datacoord stop successful")
return nil
}


@ -26,12 +26,11 @@ import (
"math/rand"
"os"
"sync"
"syscall"
"sync/atomic"
"time"
"github.com/cockroachdb/errors"
clientv3 "go.etcd.io/etcd/client/v3"
"go.uber.org/atomic"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
@ -199,16 +198,7 @@ func (node *DataNode) Register() error {
// Start liveness check
node.session.LivenessCheck(node.ctx, func() {
log.Error("Data Node disconnected from etcd, process will exit", zap.Int64("Server Id", node.GetSession().ServerID))
if err := node.Stop(); err != nil {
log.Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(node.GetNodeID()), typeutil.DataNodeRole).Dec()
// manually send signal to starter goroutine
if node.session.TriggerKill {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
os.Exit(1)
})
return nil


@ -221,10 +221,14 @@ func (s *Server) start() error {
// Stop stops the DataCoord server gracefully.
// Need to call the GracefulStop interface of grpc server and call the stop method of the inner DataCoord object.
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().DataCoordGrpcServerCfg
log.Debug("Datacoord stop", zap.String("Address", Params.GetAddress()))
var err error
logger := log.With(zap.String("address", Params.GetAddress()))
logger.Info("Datacoord stopping")
defer func() {
logger.Info("Datacoord stopped", zap.Error(err))
}()
s.cancel()
if s.etcdCli != nil {


@ -199,9 +199,14 @@ func (s *Server) Run() error {
}
// Stop stops Datanode's grpc service.
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().DataNodeGrpcServerCfg
log.Debug("Datanode stop", zap.String("Address", Params.GetAddress()))
logger := log.With(zap.String("address", Params.GetAddress()))
logger.Info("Datanode stopping")
defer func() {
logger.Info("Datanode stopped", zap.Error(err))
}()
s.cancel()
if s.etcdCli != nil {
defer s.etcdCli.Close()
@ -210,7 +215,7 @@ func (s *Server) Stop() error {
utils.GracefulStopGRPCServer(s.grpcServer)
}
err := s.datanode.Stop()
err = s.datanode.Stop()
if err != nil {
return err
}


@ -211,9 +211,14 @@ func (s *Server) start() error {
}
// Stop stops IndexNode's grpc service.
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().IndexNodeGrpcServerCfg
log.Debug("IndexNode stop", zap.String("Address", Params.GetAddress()))
logger := log.With(zap.String("address", Params.GetAddress()))
logger.Info("IndexNode stopping")
defer func() {
logger.Info("IndexNode stopped", zap.Error(err))
}()
if s.indexnode != nil {
s.indexnode.Stop()
}


@ -695,9 +695,13 @@ func (s *Server) start() error {
}
// Stop stop the Proxy Server
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().ProxyGrpcServerCfg
log.Debug("Proxy stop", zap.String("internal address", Params.GetInternalAddress()), zap.String("external address", Params.GetInternalAddress()))
logger := log.With(zap.String("internal address", Params.GetInternalAddress()), zap.String("external address", Params.GetInternalAddress()))
logger.Info("Proxy stopping")
defer func() {
logger.Info("Proxy stopped", zap.Error(err))
}()
if s.etcdCli != nil {
defer s.etcdCli.Close()
@ -741,7 +745,7 @@ func (s *Server) Stop() error {
s.wg.Wait()
err := s.proxy.Stop()
err = s.proxy.Stop()
if err != nil {
return err
}


@ -272,9 +272,14 @@ func (s *Server) start() error {
}
// Stop stops QueryCoord's grpc service.
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().QueryCoordGrpcServerCfg
log.Debug("QueryCoord stop", zap.String("Address", Params.GetAddress()))
logger := log.With(zap.String("address", Params.GetAddress()))
logger.Info("QueryCoord stopping")
defer func() {
logger.Info("QueryCoord stopped", zap.Error(err))
}()
if s.etcdCli != nil {
defer s.etcdCli.Close()
}
@ -282,9 +287,7 @@ func (s *Server) Stop() error {
if s.grpcServer != nil {
utils.GracefulStopGRPCServer(s.grpcServer)
}
err := s.queryCoord.Stop()
return err
return s.queryCoord.Stop()
}
// SetRootCoord sets root coordinator's client


@ -237,10 +237,15 @@ func (s *Server) Run() error {
}
// Stop stops QueryNode's grpc service.
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().QueryNodeGrpcServerCfg
log.Debug("QueryNode stop", zap.String("Address", Params.GetAddress()))
err := s.querynode.Stop()
logger := log.With(zap.String("address", Params.GetAddress()))
logger.Info("QueryNode stopping")
defer func() {
logger.Info("QueryNode stopped", zap.Error(err))
}()
err = s.querynode.Stop()
if err != nil {
return err
}


@ -315,9 +315,14 @@ func (s *Server) start() error {
return nil
}
func (s *Server) Stop() error {
func (s *Server) Stop() (err error) {
Params := &paramtable.Get().RootCoordGrpcServerCfg
log.Debug("Rootcoord stop", zap.String("Address", Params.GetAddress()))
logger := log.With(zap.String("address", Params.GetAddress()))
logger.Info("Rootcoord stopping")
defer func() {
logger.Info("Rootcoord stopped", zap.Error(err))
}()
if s.etcdCli != nil {
defer s.etcdCli.Close()
}


@ -35,7 +35,6 @@ import (
"path"
"path/filepath"
"sync"
"syscall"
"time"
"unsafe"
@ -139,16 +138,7 @@ func (i *IndexNode) Register() error {
// start liveness check
i.session.LivenessCheck(i.loopCtx, func() {
log.Error("Index Node disconnected from etcd, process will exit", zap.Int64("Server Id", i.session.ServerID))
if err := i.Stop(); err != nil {
log.Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.IndexNodeRole).Dec()
// manually send signal to starter goroutine
if i.session.TriggerKill {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
os.Exit(1)
})
return nil
}


@ -23,7 +23,6 @@ import (
"os"
"strconv"
"sync"
"syscall"
"time"
"github.com/cockroachdb/errors"
@ -169,15 +168,7 @@ func (node *Proxy) Register() error {
log.Info("Proxy Register Finished")
node.session.LivenessCheck(node.ctx, func() {
log.Error("Proxy disconnected from etcd, process will exit", zap.Int64("Server Id", node.session.ServerID))
if err := node.Stop(); err != nil {
log.Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.ProxyRole).Dec()
if node.session.TriggerKill {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
os.Exit(1)
})
// TODO Reset the logger
// Params.initLogCfg()


@ -150,16 +150,7 @@ func (s *Server) Register() error {
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.QueryCoordRole).Inc()
s.session.LivenessCheck(s.ctx, func() {
log.Error("QueryCoord disconnected from etcd, process will exit", zap.Int64("serverID", s.session.GetServerID()))
if err := s.Stop(); err != nil {
log.Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.QueryCoordRole).Dec()
// manually send signal to starter goroutine
if s.session.IsTriggerKill() {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
os.Exit(1)
})
return nil
}


@ -37,7 +37,6 @@ import (
"runtime/debug"
"strings"
"sync"
"syscall"
"time"
"unsafe"
@ -168,17 +167,8 @@ func (node *QueryNode) Register() error {
// start liveness check
metrics.NumNodes.WithLabelValues(fmt.Sprint(node.GetNodeID()), typeutil.QueryNodeRole).Inc()
node.session.LivenessCheck(node.ctx, func() {
log.Error("Query Node disconnected from etcd, process will exit", zap.Int64("Server Id", node.GetNodeID()))
if err := node.Stop(); err != nil {
log.Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(node.GetNodeID()), typeutil.QueryNodeRole).Dec()
// manually send signal to starter goroutine
if node.session.TriggerKill {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
log.Error("Query Node disconnected from etcd, process will exit", zap.Int64("Server Id", paramtable.GetNodeID()))
os.Exit(1)
})
return nil
}
@ -418,6 +408,8 @@ func (node *QueryNode) Stop() error {
log.Warn("session fail to go stopping state", zap.Error(err))
} else {
metrics.StoppingBalanceNodeNum.WithLabelValues().Set(1)
// TODO: Redundant timeout control; the graceful stop timeout is controlled from outside by the `component` wrapper.
// Integration tests still rely on it; remove it in the future.
timeoutCh := time.After(paramtable.Get().QueryNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second))
outer:
@ -438,7 +430,7 @@ func (node *QueryNode) Stop() error {
select {
case <-timeoutCh:
log.Warn("migrate data timed out", zap.Int64("ServerID", node.GetNodeID()),
log.Warn("migrate data timed out", zap.Int64("ServerID", paramtable.GetNodeID()),
zap.Int64s("sealedSegments", lo.Map(sealedSegments, func(s segments.Segment, i int) int64 {
return s.ID()
})),
@ -448,10 +440,18 @@ func (node *QueryNode) Stop() error {
zap.Int("channelNum", channelNum),
)
break outer
case <-time.After(time.Second):
metrics.StoppingBalanceSegmentNum.WithLabelValues(fmt.Sprint(node.GetNodeID())).Set(float64(len(sealedSegments)))
metrics.StoppingBalanceChannelNum.WithLabelValues(fmt.Sprint(node.GetNodeID())).Set(float64(channelNum))
log.Info("migrate data...", zap.Int64("ServerID", paramtable.GetNodeID()),
zap.Int64s("sealedSegments", lo.Map(sealedSegments, func(s segments.Segment, i int) int64 {
return s.ID()
})),
zap.Int64s("growingSegments", lo.Map(growingSegments, func(t segments.Segment, i int) int64 {
return t.ID()
})),
zap.Int("channelNum", channelNum),
)
}
}


@ -22,7 +22,6 @@ import (
"math/rand"
"os"
"sync"
"syscall"
"time"
"github.com/cockroachdb/errors"
@ -281,16 +280,7 @@ func (c *Core) Register() error {
log.Info("RootCoord Register Finished")
c.session.LivenessCheck(c.ctx, func() {
log.Error("Root Coord disconnected from etcd, process will exit", zap.Int64("Server Id", c.session.ServerID))
if err := c.Stop(); err != nil {
log.Fatal("failed to stop server", zap.Error(err))
}
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.RootCoordRole).Dec()
// manually send signal to starter goroutine
if c.session.TriggerKill {
if p, err := os.FindProcess(os.Getpid()); err == nil {
p.Signal(syscall.SIGINT)
}
}
os.Exit(1)
})
return nil


@ -32,7 +32,9 @@ const (
// DefaultIndexSliceSize defines the default slice size of index file when serializing.
DefaultIndexSliceSize = 16
DefaultGracefulTime = 5000 // ms
DefaultGracefulStopTimeout = 1800 // s
DefaultGracefulStopTimeout = 1800 // s, for node
DefaultProxyGracefulStopTimeout = 30 // s, for proxy
DefaultCoordGracefulStopTimeout = 5 // s, for coord
DefaultHighPriorityThreadCoreCoefficient = 10
DefaultMiddlePriorityThreadCoreCoefficient = 5
DefaultLowPriorityThreadCoreCoefficient = 1
@ -894,6 +896,7 @@ type rootCoordConfig struct {
EnableActiveStandby ParamItem `refreshable:"false"`
MaxDatabaseNum ParamItem `refreshable:"false"`
MaxGeneralCapacity ParamItem `refreshable:"true"`
GracefulStopTimeout ParamItem `refreshable:"true"`
}
func (p *rootCoordConfig) init(base *BaseTable) {
@ -988,6 +991,15 @@ func (p *rootCoordConfig) init(base *BaseTable) {
},
}
p.MaxGeneralCapacity.Init(base.mgr)
p.GracefulStopTimeout = ParamItem{
Key: "rootCoord.gracefulStopTimeout",
Version: "2.3.7",
DefaultValue: strconv.Itoa(DefaultCoordGracefulStopTimeout),
Doc: "seconds. force stop node without graceful stop",
Export: true,
}
p.GracefulStopTimeout.Init(base.mgr)
}
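
Each of the new keys can also be overridden at runtime through the paramtable; an illustrative override using the same Save/GetAsDuration calls the tests below rely on (the value is an example, and this assumes paramtable.Get() exposes the Save helper those tests use):

params := paramtable.Get()
params.Save("rootCoord.gracefulStopTimeout", "10") // seconds
timeout := params.RootCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
// timeout is now 10*time.Second instead of the 5s default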
// /////////////////////////////////////////////////////////////////////////////
@ -1035,6 +1047,8 @@ type proxyConfig struct {
PartitionNameRegexp ParamItem `refreshable:"true"`
AccessLog AccessLogConfig
GracefulStopTimeout ParamItem `refreshable:"true"`
}
func (p *proxyConfig) init(base *BaseTable) {
@ -1341,6 +1355,15 @@ please adjust in embedded Milvus: false`,
Doc: "switch for whether proxy shall use partition name as regexp when searching",
}
p.PartitionNameRegexp.Init(base.mgr)
p.GracefulStopTimeout = ParamItem{
Key: "proxy.gracefulStopTimeout",
Version: "2.3.7",
DefaultValue: strconv.Itoa(DefaultProxyGracefulStopTimeout),
Doc: "seconds. force stop node without graceful stop",
Export: true,
}
p.GracefulStopTimeout.Init(base.mgr)
}
// /////////////////////////////////////////////////////////////////////////////
@ -1411,6 +1434,7 @@ type queryCoordConfig struct {
ObserverTaskParallel ParamItem `refreshable:"false"`
CheckAutoBalanceConfigInterval ParamItem `refreshable:"false"`
CheckNodeSessionInterval ParamItem `refreshable:"false"`
GracefulStopTimeout ParamItem `refreshable:"true"`
}
func (p *queryCoordConfig) init(base *BaseTable) {
@ -1869,6 +1893,15 @@ func (p *queryCoordConfig) init(base *BaseTable) {
Export: true,
}
p.HeartBeatWarningLag.Init(base.mgr)
p.GracefulStopTimeout = ParamItem{
Key: "queryCoord.gracefulStopTimeout",
Version: "2.3.7",
DefaultValue: strconv.Itoa(DefaultCoordGracefulStopTimeout),
Doc: "seconds. force stop node without graceful stop",
Export: true,
}
p.GracefulStopTimeout.Init(base.mgr)
}
// /////////////////////////////////////////////////////////////////////////////
@ -2449,6 +2482,8 @@ type dataCoordConfig struct {
// auto balance channel on datanode
AutoBalance ParamItem `refreshable:"true"`
CheckAutoBalanceConfigInterval ParamItem `refreshable:"false"`
GracefulStopTimeout ParamItem `refreshable:"true"`
}
func (p *dataCoordConfig) init(base *BaseTable) {
@ -2903,6 +2938,15 @@ During compaction, the size of segment # of rows is able to exceed segment max #
Export: true,
}
p.AutoUpgradeSegmentIndex.Init(base.mgr)
p.GracefulStopTimeout = ParamItem{
Key: "dataCoord.gracefulStopTimeout",
Version: "2.3.7",
DefaultValue: strconv.Itoa(DefaultCoordGracefulStopTimeout),
Doc: "seconds. force stop node without graceful stop",
Export: true,
}
p.GracefulStopTimeout.Init(base.mgr)
}
// /////////////////////////////////////////////////////////////////////////////
@ -2962,6 +3006,8 @@ type dataNodeConfig struct {
// Compaction
L0BatchMemoryRatio ParamItem `refreshable:"true"`
GracefulStopTimeout ParamItem `refreshable:"true"`
}
func (p *dataNodeConfig) init(base *BaseTable) {
@ -3231,6 +3277,15 @@ func (p *dataNodeConfig) init(base *BaseTable) {
Export: true,
}
p.L0BatchMemoryRatio.Init(base.mgr)
p.GracefulStopTimeout = ParamItem{
Key: "datanode.gracefulStopTimeout",
Version: "2.3.7",
DefaultValue: strconv.Itoa(DefaultGracefulStopTimeout),
Doc: "seconds. force stop node without graceful stop",
Export: true,
}
p.GracefulStopTimeout.Init(base.mgr)
}
// /////////////////////////////////////////////////////////////////////////////
@ -3242,7 +3297,7 @@ type indexNodeConfig struct {
DiskCapacityLimit ParamItem `refreshable:"true"`
MaxDiskUsagePercentage ParamItem `refreshable:"true"`
GracefulStopTimeout ParamItem `refreshable:"false"`
GracefulStopTimeout ParamItem `refreshable:"true"`
}
func (p *indexNodeConfig) init(base *BaseTable) {
@ -3297,6 +3352,7 @@ func (p *indexNodeConfig) init(base *BaseTable) {
Key: "indexNode.gracefulStopTimeout",
Version: "2.2.1",
FallbackKeys: []string{"common.gracefulStopTimeout"},
Doc: "seconds. force stop node without graceful stop",
Export: true,
}
p.GracefulStopTimeout.Init(base.mgr)


@ -122,6 +122,9 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, Params.EnableActiveStandby.GetAsBool(), false)
t.Logf("rootCoord EnableActiveStandby = %t", Params.EnableActiveStandby.GetAsBool())
params.Save("rootCoord.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
SetCreateTime(time.Now())
SetUpdateTime(time.Now())
})
@ -166,6 +169,9 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, Params.CostMetricsExpireTime.GetAsInt(), 1000)
assert.Equal(t, Params.RetryTimesOnReplica.GetAsInt(), 2)
assert.EqualValues(t, Params.HealthCheckTimeout.GetAsInt64(), 3000)
params.Save("proxy.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
})
// t.Run("test proxyConfig panic", func(t *testing.T) {
@ -284,6 +290,9 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, true, Params.AutoBalance.GetAsBool())
assert.Equal(t, true, Params.AutoBalanceChannel.GetAsBool())
assert.Equal(t, 10, Params.CheckAutoBalanceConfigInterval.GetAsInt())
params.Save("queryCoord.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
})
t.Run("test queryNodeConfig", func(t *testing.T) {
@ -349,6 +358,9 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, int64(100), gracefulStopTimeout.GetAsInt64())
assert.Equal(t, false, Params.EnableWorkerSQCostMetrics.GetAsBool())
params.Save("querynode.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
})
t.Run("test dataCoordConfig", func(t *testing.T) {
@ -361,6 +373,9 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, true, Params.AutoBalance.GetAsBool())
assert.Equal(t, 10, Params.CheckAutoBalanceConfigInterval.GetAsInt())
assert.Equal(t, false, Params.AutoUpgradeSegmentIndex.GetAsBool())
params.Save("datacoord.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
})
t.Run("test dataNodeConfig", func(t *testing.T) {
@ -411,12 +426,17 @@ func TestComponentParam(t *testing.T) {
maxConcurrentImportTaskNum := Params.MaxConcurrentImportTaskNum.GetAsInt()
t.Logf("maxConcurrentImportTaskNum: %d", maxConcurrentImportTaskNum)
assert.Equal(t, 16, maxConcurrentImportTaskNum)
params.Save("datanode.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
})
t.Run("test indexNodeConfig", func(t *testing.T) {
Params := &params.IndexNodeCfg
params.Save(Params.GracefulStopTimeout.Key, "50")
assert.Equal(t, Params.GracefulStopTimeout.GetAsInt64(), int64(50))
params.Save("indexnode.gracefulStopTimeout", "100")
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
})
t.Run("channel config priority", func(t *testing.T) {