mirror of https://github.com/milvus-io/milvus.git
enhance: add graceful stop timeout to avoid node stop hang under extreme cases (#30317)
1. Set the coordinator graceful stop timeout to 5s. 2. Change the order in which datacoord components are stopped. 3. Change the querynode graceful stop timeout to 900s; this should potentially be lowered to 600s once graceful stop runs smoothly. issue: #30310 also see pr: #30306 --------- Signed-off-by: chyezh <chyezh@outlook.com> pull/30847/head
parent
816ed671aa
commit
0c7474d7e8
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
grpcdatacoordclient "github.com/milvus-io/milvus/internal/distributed/datacoord"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -57,10 +59,8 @@ func (s *DataCoord) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (s *DataCoord) Stop() error {
|
||||
if err := s.svr.Stop(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().DataCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(s.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns DataCoord's states
|
||||
|
|
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
grpcdatanode "github.com/milvus-io/milvus/internal/distributed/datanode"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -60,10 +62,8 @@ func (d *DataNode) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (d *DataNode) Stop() error {
|
||||
if err := d.svr.Stop(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().DataNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(d.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns DataNode's states
|
||||
|
|
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
grpcindexnode "github.com/milvus-io/milvus/internal/distributed/indexnode"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -58,10 +60,8 @@ func (n *IndexNode) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (n *IndexNode) Stop() error {
|
||||
if err := n.svr.Stop(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().IndexNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(n.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns IndexNode's states
|
||||
|
|
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
grpcproxy "github.com/milvus-io/milvus/internal/distributed/proxy"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -59,10 +61,8 @@ func (n *Proxy) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (n *Proxy) Stop() error {
|
||||
if err := n.svr.Stop(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().ProxyCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(n.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns Proxy's states
|
||||
|
|
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
grpcquerycoord "github.com/milvus-io/milvus/internal/distributed/querycoord"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -60,10 +62,8 @@ func (qs *QueryCoord) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (qs *QueryCoord) Stop() error {
|
||||
if err := qs.svr.Stop(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().QueryCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(qs.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns QueryCoord's states
|
||||
|
|
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
grpcquerynode "github.com/milvus-io/milvus/internal/distributed/querynode"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -60,10 +62,8 @@ func (q *QueryNode) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (q *QueryNode) Stop() error {
|
||||
if err := q.svr.Stop(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().QueryNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(q.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns QueryNode's states
|
||||
|
|
|
@ -18,6 +18,7 @@ package components
|
|||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
|
@ -26,6 +27,7 @@ import (
|
|||
rc "github.com/milvus-io/milvus/internal/distributed/rootcoord"
|
||||
"github.com/milvus-io/milvus/internal/util/dependency"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -59,10 +61,8 @@ func (rc *RootCoord) Run() error {
|
|||
|
||||
// Stop terminates service
|
||||
func (rc *RootCoord) Stop() error {
|
||||
if rc.svr != nil {
|
||||
return rc.svr.Stop()
|
||||
}
|
||||
return nil
|
||||
timeout := paramtable.Get().RootCoordCfg.GracefulStopTimeout.GetAsDuration(time.Second)
|
||||
return exitWhenStopTimeout(rc.svr.Stop, timeout)
|
||||
}
|
||||
|
||||
// GetComponentStates returns RootCoord's states
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package components
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
|
||||
"github.com/milvus-io/milvus/pkg/util/conc"
|
||||
)
|
||||
|
||||
var errStopTimeout = errors.New("stop timeout")
|
||||
|
||||
// exitWhenStopTimeout stops a component with timeout and exit progress when timeout.
|
||||
func exitWhenStopTimeout(stop func() error, timeout time.Duration) error {
|
||||
err := stopWithTimeout(stop, timeout)
|
||||
if errors.Is(err, errStopTimeout) {
|
||||
os.Exit(1)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// stopWithTimeout stops a component with timeout.
|
||||
func stopWithTimeout(stop func() error, timeout time.Duration) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), timeout)
|
||||
defer cancel()
|
||||
|
||||
future := conc.Go(func() (struct{}, error) {
|
||||
return struct{}{}, stop()
|
||||
})
|
||||
select {
|
||||
case <-future.Inner():
|
||||
return errors.Wrap(future.Err(), "failed to stop component")
|
||||
case <-ctx.Done():
|
||||
return errStopTimeout
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package components
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestExitWithTimeout(t *testing.T) {
|
||||
// only normal path can be tested.
|
||||
targetErr := errors.New("stop error")
|
||||
err := exitWhenStopTimeout(func() error {
|
||||
time.Sleep(1 * time.Second)
|
||||
return targetErr
|
||||
}, 5*time.Second)
|
||||
assert.ErrorIs(t, err, targetErr)
|
||||
}
|
||||
|
||||
func TestStopWithTimeout(t *testing.T) {
|
||||
ch := make(chan struct{})
|
||||
stop := func() error {
|
||||
<-ch
|
||||
return nil
|
||||
}
|
||||
|
||||
err := stopWithTimeout(stop, 1*time.Second)
|
||||
assert.ErrorIs(t, err, errStopTimeout)
|
||||
|
||||
targetErr := errors.New("stop error")
|
||||
stop = func() error {
|
||||
return targetErr
|
||||
}
|
||||
|
||||
err = stopWithTimeout(stop, 1*time.Second)
|
||||
assert.ErrorIs(t, err, targetErr)
|
||||
}
|
|
@ -271,16 +271,7 @@ func (s *Server) Register() error {
|
|||
|
||||
s.session.LivenessCheck(s.serverLoopCtx, func() {
|
||||
logutil.Logger(s.ctx).Error("disconnected from etcd and exited", zap.Int64("serverID", s.session.GetServerID()))
|
||||
if err := s.Stop(); err != nil {
|
||||
logutil.Logger(s.ctx).Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.DataCoordRole).Dec()
|
||||
// manually send signal to starter goroutine
|
||||
if s.session.IsTriggerKill() {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
os.Exit(1)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
@ -1102,16 +1093,24 @@ func (s *Server) Stop() error {
|
|||
if !s.stateCode.CompareAndSwap(commonpb.StateCode_Healthy, commonpb.StateCode_Abnormal) {
|
||||
return nil
|
||||
}
|
||||
logutil.Logger(s.ctx).Info("server shutdown")
|
||||
s.cluster.Close()
|
||||
logutil.Logger(s.ctx).Info("datacoord server shutdown")
|
||||
s.garbageCollector.close()
|
||||
s.stopServerLoop()
|
||||
logutil.Logger(s.ctx).Info("datacoord garbage collector stopped")
|
||||
|
||||
if Params.DataCoordCfg.EnableCompaction.GetAsBool() {
|
||||
s.stopCompactionTrigger()
|
||||
s.stopCompactionHandler()
|
||||
}
|
||||
logutil.Logger(s.ctx).Info("datacoord compaction stopped")
|
||||
|
||||
s.indexBuilder.Stop()
|
||||
logutil.Logger(s.ctx).Info("datacoord index builder stopped")
|
||||
|
||||
s.cluster.Close()
|
||||
logutil.Logger(s.ctx).Info("datacoord cluster stopped")
|
||||
|
||||
s.stopServerLoop()
|
||||
logutil.Logger(s.ctx).Info("datacoord serverloop stopped")
|
||||
|
||||
if s.session != nil {
|
||||
s.session.Stop()
|
||||
|
@ -1120,6 +1119,7 @@ func (s *Server) Stop() error {
|
|||
if s.icSession != nil {
|
||||
s.icSession.Stop()
|
||||
}
|
||||
logutil.Logger(s.ctx).Warn("datacoord stop successful")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -26,12 +26,11 @@ import (
|
|||
"math/rand"
|
||||
"os"
|
||||
"sync"
|
||||
"syscall"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
clientv3 "go.etcd.io/etcd/client/v3"
|
||||
"go.uber.org/atomic"
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
||||
|
@ -199,16 +198,7 @@ func (node *DataNode) Register() error {
|
|||
// Start liveness check
|
||||
node.session.LivenessCheck(node.ctx, func() {
|
||||
log.Error("Data Node disconnected from etcd, process will exit", zap.Int64("Server Id", node.GetSession().ServerID))
|
||||
if err := node.Stop(); err != nil {
|
||||
log.Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(node.GetNodeID()), typeutil.DataNodeRole).Dec()
|
||||
// manually send signal to starter goroutine
|
||||
if node.session.TriggerKill {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
os.Exit(1)
|
||||
})
|
||||
|
||||
return nil
|
||||
|
|
|
@ -221,10 +221,14 @@ func (s *Server) start() error {
|
|||
|
||||
// Stop stops the DataCoord server gracefully.
|
||||
// Need to call the GracefulStop interface of grpc server and call the stop method of the inner DataCoord object.
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().DataCoordGrpcServerCfg
|
||||
log.Debug("Datacoord stop", zap.String("Address", Params.GetAddress()))
|
||||
var err error
|
||||
logger := log.With(zap.String("address", Params.GetAddress()))
|
||||
logger.Info("Datacoord stopping")
|
||||
defer func() {
|
||||
logger.Info("Datacoord stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
s.cancel()
|
||||
|
||||
if s.etcdCli != nil {
|
||||
|
|
|
@ -199,9 +199,14 @@ func (s *Server) Run() error {
|
|||
}
|
||||
|
||||
// Stop stops Datanode's grpc service.
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().DataNodeGrpcServerCfg
|
||||
log.Debug("Datanode stop", zap.String("Address", Params.GetAddress()))
|
||||
logger := log.With(zap.String("address", Params.GetAddress()))
|
||||
logger.Info("Datanode stopping")
|
||||
defer func() {
|
||||
logger.Info("Datanode stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
s.cancel()
|
||||
if s.etcdCli != nil {
|
||||
defer s.etcdCli.Close()
|
||||
|
@ -210,7 +215,7 @@ func (s *Server) Stop() error {
|
|||
utils.GracefulStopGRPCServer(s.grpcServer)
|
||||
}
|
||||
|
||||
err := s.datanode.Stop()
|
||||
err = s.datanode.Stop()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -211,9 +211,14 @@ func (s *Server) start() error {
|
|||
}
|
||||
|
||||
// Stop stops IndexNode's grpc service.
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().IndexNodeGrpcServerCfg
|
||||
log.Debug("IndexNode stop", zap.String("Address", Params.GetAddress()))
|
||||
logger := log.With(zap.String("address", Params.GetAddress()))
|
||||
logger.Info("IndexNode stopping")
|
||||
defer func() {
|
||||
logger.Info("IndexNode stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
if s.indexnode != nil {
|
||||
s.indexnode.Stop()
|
||||
}
|
||||
|
|
|
@ -695,9 +695,13 @@ func (s *Server) start() error {
|
|||
}
|
||||
|
||||
// Stop stop the Proxy Server
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().ProxyGrpcServerCfg
|
||||
log.Debug("Proxy stop", zap.String("internal address", Params.GetInternalAddress()), zap.String("external address", Params.GetInternalAddress()))
|
||||
logger := log.With(zap.String("internal address", Params.GetInternalAddress()), zap.String("external address", Params.GetInternalAddress()))
|
||||
logger.Info("Proxy stopping")
|
||||
defer func() {
|
||||
logger.Info("Proxy stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
if s.etcdCli != nil {
|
||||
defer s.etcdCli.Close()
|
||||
|
@ -741,7 +745,7 @@ func (s *Server) Stop() error {
|
|||
|
||||
s.wg.Wait()
|
||||
|
||||
err := s.proxy.Stop()
|
||||
err = s.proxy.Stop()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -272,9 +272,14 @@ func (s *Server) start() error {
|
|||
}
|
||||
|
||||
// Stop stops QueryCoord's grpc service.
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().QueryCoordGrpcServerCfg
|
||||
log.Debug("QueryCoord stop", zap.String("Address", Params.GetAddress()))
|
||||
logger := log.With(zap.String("address", Params.GetAddress()))
|
||||
logger.Info("QueryCoord stopping")
|
||||
defer func() {
|
||||
logger.Info("QueryCoord stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
if s.etcdCli != nil {
|
||||
defer s.etcdCli.Close()
|
||||
}
|
||||
|
@ -282,9 +287,7 @@ func (s *Server) Stop() error {
|
|||
if s.grpcServer != nil {
|
||||
utils.GracefulStopGRPCServer(s.grpcServer)
|
||||
}
|
||||
err := s.queryCoord.Stop()
|
||||
|
||||
return err
|
||||
return s.queryCoord.Stop()
|
||||
}
|
||||
|
||||
// SetRootCoord sets root coordinator's client
|
||||
|
|
|
@ -237,10 +237,15 @@ func (s *Server) Run() error {
|
|||
}
|
||||
|
||||
// Stop stops QueryNode's grpc service.
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().QueryNodeGrpcServerCfg
|
||||
log.Debug("QueryNode stop", zap.String("Address", Params.GetAddress()))
|
||||
err := s.querynode.Stop()
|
||||
logger := log.With(zap.String("address", Params.GetAddress()))
|
||||
logger.Info("QueryNode stopping")
|
||||
defer func() {
|
||||
logger.Info("QueryNode stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
err = s.querynode.Stop()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -315,9 +315,14 @@ func (s *Server) start() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (s *Server) Stop() error {
|
||||
func (s *Server) Stop() (err error) {
|
||||
Params := ¶mtable.Get().RootCoordGrpcServerCfg
|
||||
log.Debug("Rootcoord stop", zap.String("Address", Params.GetAddress()))
|
||||
logger := log.With(zap.String("address", Params.GetAddress()))
|
||||
logger.Info("Rootcoord stopping")
|
||||
defer func() {
|
||||
logger.Info("Rootcoord stopped", zap.Error(err))
|
||||
}()
|
||||
|
||||
if s.etcdCli != nil {
|
||||
defer s.etcdCli.Close()
|
||||
}
|
||||
|
|
|
@ -35,7 +35,6 @@ import (
|
|||
"path"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
|
@ -139,16 +138,7 @@ func (i *IndexNode) Register() error {
|
|||
// start liveness check
|
||||
i.session.LivenessCheck(i.loopCtx, func() {
|
||||
log.Error("Index Node disconnected from etcd, process will exit", zap.Int64("Server Id", i.session.ServerID))
|
||||
if err := i.Stop(); err != nil {
|
||||
log.Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.IndexNodeRole).Dec()
|
||||
// manually send signal to starter goroutine
|
||||
if i.session.TriggerKill {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
os.Exit(1)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -23,7 +23,6 @@ import (
|
|||
"os"
|
||||
"strconv"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
|
@ -169,15 +168,7 @@ func (node *Proxy) Register() error {
|
|||
log.Info("Proxy Register Finished")
|
||||
node.session.LivenessCheck(node.ctx, func() {
|
||||
log.Error("Proxy disconnected from etcd, process will exit", zap.Int64("Server Id", node.session.ServerID))
|
||||
if err := node.Stop(); err != nil {
|
||||
log.Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.ProxyRole).Dec()
|
||||
if node.session.TriggerKill {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
os.Exit(1)
|
||||
})
|
||||
// TODO Reset the logger
|
||||
// Params.initLogCfg()
|
||||
|
|
|
@ -150,16 +150,7 @@ func (s *Server) Register() error {
|
|||
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.QueryCoordRole).Inc()
|
||||
s.session.LivenessCheck(s.ctx, func() {
|
||||
log.Error("QueryCoord disconnected from etcd, process will exit", zap.Int64("serverID", s.session.GetServerID()))
|
||||
if err := s.Stop(); err != nil {
|
||||
log.Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.QueryCoordRole).Dec()
|
||||
// manually send signal to starter goroutine
|
||||
if s.session.IsTriggerKill() {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
os.Exit(1)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -37,7 +37,6 @@ import (
|
|||
"runtime/debug"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
|
@ -168,17 +167,8 @@ func (node *QueryNode) Register() error {
|
|||
// start liveness check
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(node.GetNodeID()), typeutil.QueryNodeRole).Inc()
|
||||
node.session.LivenessCheck(node.ctx, func() {
|
||||
log.Error("Query Node disconnected from etcd, process will exit", zap.Int64("Server Id", node.GetNodeID()))
|
||||
if err := node.Stop(); err != nil {
|
||||
log.Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(node.GetNodeID()), typeutil.QueryNodeRole).Dec()
|
||||
// manually send signal to starter goroutine
|
||||
if node.session.TriggerKill {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
log.Error("Query Node disconnected from etcd, process will exit", zap.Int64("Server Id", paramtable.GetNodeID()))
|
||||
os.Exit(1)
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
@ -418,6 +408,8 @@ func (node *QueryNode) Stop() error {
|
|||
log.Warn("session fail to go stopping state", zap.Error(err))
|
||||
} else {
|
||||
metrics.StoppingBalanceNodeNum.WithLabelValues().Set(1)
|
||||
// TODO: Redundant timeout control, graceful stop timeout is controlled by outside by `component`.
|
||||
// Integration test is still using it, Remove it in future.
|
||||
timeoutCh := time.After(paramtable.Get().QueryNodeCfg.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
|
||||
outer:
|
||||
|
@ -438,7 +430,7 @@ func (node *QueryNode) Stop() error {
|
|||
|
||||
select {
|
||||
case <-timeoutCh:
|
||||
log.Warn("migrate data timed out", zap.Int64("ServerID", node.GetNodeID()),
|
||||
log.Warn("migrate data timed out", zap.Int64("ServerID", paramtable.GetNodeID()),
|
||||
zap.Int64s("sealedSegments", lo.Map(sealedSegments, func(s segments.Segment, i int) int64 {
|
||||
return s.ID()
|
||||
})),
|
||||
|
@ -448,10 +440,18 @@ func (node *QueryNode) Stop() error {
|
|||
zap.Int("channelNum", channelNum),
|
||||
)
|
||||
break outer
|
||||
|
||||
case <-time.After(time.Second):
|
||||
metrics.StoppingBalanceSegmentNum.WithLabelValues(fmt.Sprint(node.GetNodeID())).Set(float64(len(sealedSegments)))
|
||||
metrics.StoppingBalanceChannelNum.WithLabelValues(fmt.Sprint(node.GetNodeID())).Set(float64(channelNum))
|
||||
log.Info("migrate data...", zap.Int64("ServerID", paramtable.GetNodeID()),
|
||||
zap.Int64s("sealedSegments", lo.Map(sealedSegments, func(s segments.Segment, i int) int64 {
|
||||
return s.ID()
|
||||
})),
|
||||
zap.Int64s("growingSegments", lo.Map(growingSegments, func(t segments.Segment, i int) int64 {
|
||||
return t.ID()
|
||||
})),
|
||||
zap.Int("channelNum", channelNum),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,7 +22,6 @@ import (
|
|||
"math/rand"
|
||||
"os"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
|
@ -281,16 +280,7 @@ func (c *Core) Register() error {
|
|||
log.Info("RootCoord Register Finished")
|
||||
c.session.LivenessCheck(c.ctx, func() {
|
||||
log.Error("Root Coord disconnected from etcd, process will exit", zap.Int64("Server Id", c.session.ServerID))
|
||||
if err := c.Stop(); err != nil {
|
||||
log.Fatal("failed to stop server", zap.Error(err))
|
||||
}
|
||||
metrics.NumNodes.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), typeutil.RootCoordRole).Dec()
|
||||
// manually send signal to starter goroutine
|
||||
if c.session.TriggerKill {
|
||||
if p, err := os.FindProcess(os.Getpid()); err == nil {
|
||||
p.Signal(syscall.SIGINT)
|
||||
}
|
||||
}
|
||||
os.Exit(1)
|
||||
})
|
||||
|
||||
return nil
|
||||
|
|
|
@ -32,7 +32,9 @@ const (
|
|||
// DefaultIndexSliceSize defines the default slice size of index file when serializing.
|
||||
DefaultIndexSliceSize = 16
|
||||
DefaultGracefulTime = 5000 // ms
|
||||
DefaultGracefulStopTimeout = 1800 // s
|
||||
DefaultGracefulStopTimeout = 1800 // s, for node
|
||||
DefaultProxyGracefulStopTimeout = 30 // s,for proxy
|
||||
DefaultCoordGracefulStopTimeout = 5 // s,for coord
|
||||
DefaultHighPriorityThreadCoreCoefficient = 10
|
||||
DefaultMiddlePriorityThreadCoreCoefficient = 5
|
||||
DefaultLowPriorityThreadCoreCoefficient = 1
|
||||
|
@ -894,6 +896,7 @@ type rootCoordConfig struct {
|
|||
EnableActiveStandby ParamItem `refreshable:"false"`
|
||||
MaxDatabaseNum ParamItem `refreshable:"false"`
|
||||
MaxGeneralCapacity ParamItem `refreshable:"true"`
|
||||
GracefulStopTimeout ParamItem `refreshable:"true"`
|
||||
}
|
||||
|
||||
func (p *rootCoordConfig) init(base *BaseTable) {
|
||||
|
@ -988,6 +991,15 @@ func (p *rootCoordConfig) init(base *BaseTable) {
|
|||
},
|
||||
}
|
||||
p.MaxGeneralCapacity.Init(base.mgr)
|
||||
|
||||
p.GracefulStopTimeout = ParamItem{
|
||||
Key: "rootCoord.gracefulStopTimeout",
|
||||
Version: "2.3.7",
|
||||
DefaultValue: strconv.Itoa(DefaultCoordGracefulStopTimeout),
|
||||
Doc: "seconds. force stop node without graceful stop",
|
||||
Export: true,
|
||||
}
|
||||
p.GracefulStopTimeout.Init(base.mgr)
|
||||
}
|
||||
|
||||
// /////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -1035,6 +1047,8 @@ type proxyConfig struct {
|
|||
PartitionNameRegexp ParamItem `refreshable:"true"`
|
||||
|
||||
AccessLog AccessLogConfig
|
||||
|
||||
GracefulStopTimeout ParamItem `refreshable:"true"`
|
||||
}
|
||||
|
||||
func (p *proxyConfig) init(base *BaseTable) {
|
||||
|
@ -1341,6 +1355,15 @@ please adjust in embedded Milvus: false`,
|
|||
Doc: "switch for whether proxy shall use partition name as regexp when searching",
|
||||
}
|
||||
p.PartitionNameRegexp.Init(base.mgr)
|
||||
|
||||
p.GracefulStopTimeout = ParamItem{
|
||||
Key: "proxy.gracefulStopTimeout",
|
||||
Version: "2.3.7",
|
||||
DefaultValue: strconv.Itoa(DefaultProxyGracefulStopTimeout),
|
||||
Doc: "seconds. force stop node without graceful stop",
|
||||
Export: true,
|
||||
}
|
||||
p.GracefulStopTimeout.Init(base.mgr)
|
||||
}
|
||||
|
||||
// /////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -1411,6 +1434,7 @@ type queryCoordConfig struct {
|
|||
ObserverTaskParallel ParamItem `refreshable:"false"`
|
||||
CheckAutoBalanceConfigInterval ParamItem `refreshable:"false"`
|
||||
CheckNodeSessionInterval ParamItem `refreshable:"false"`
|
||||
GracefulStopTimeout ParamItem `refreshable:"true"`
|
||||
}
|
||||
|
||||
func (p *queryCoordConfig) init(base *BaseTable) {
|
||||
|
@ -1869,6 +1893,15 @@ func (p *queryCoordConfig) init(base *BaseTable) {
|
|||
Export: true,
|
||||
}
|
||||
p.HeartBeatWarningLag.Init(base.mgr)
|
||||
|
||||
p.GracefulStopTimeout = ParamItem{
|
||||
Key: "queryCoord.gracefulStopTimeout",
|
||||
Version: "2.3.7",
|
||||
DefaultValue: strconv.Itoa(DefaultCoordGracefulStopTimeout),
|
||||
Doc: "seconds. force stop node without graceful stop",
|
||||
Export: true,
|
||||
}
|
||||
p.GracefulStopTimeout.Init(base.mgr)
|
||||
}
|
||||
|
||||
// /////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -2449,6 +2482,8 @@ type dataCoordConfig struct {
|
|||
// auto balance channel on datanode
|
||||
AutoBalance ParamItem `refreshable:"true"`
|
||||
CheckAutoBalanceConfigInterval ParamItem `refreshable:"false"`
|
||||
|
||||
GracefulStopTimeout ParamItem `refreshable:"true"`
|
||||
}
|
||||
|
||||
func (p *dataCoordConfig) init(base *BaseTable) {
|
||||
|
@ -2903,6 +2938,15 @@ During compaction, the size of segment # of rows is able to exceed segment max #
|
|||
Export: true,
|
||||
}
|
||||
p.AutoUpgradeSegmentIndex.Init(base.mgr)
|
||||
|
||||
p.GracefulStopTimeout = ParamItem{
|
||||
Key: "dataCoord.gracefulStopTimeout",
|
||||
Version: "2.3.7",
|
||||
DefaultValue: strconv.Itoa(DefaultCoordGracefulStopTimeout),
|
||||
Doc: "seconds. force stop node without graceful stop",
|
||||
Export: true,
|
||||
}
|
||||
p.GracefulStopTimeout.Init(base.mgr)
|
||||
}
|
||||
|
||||
// /////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -2962,6 +3006,8 @@ type dataNodeConfig struct {
|
|||
|
||||
// Compaction
|
||||
L0BatchMemoryRatio ParamItem `refreshable:"true"`
|
||||
|
||||
GracefulStopTimeout ParamItem `refreshable:"true"`
|
||||
}
|
||||
|
||||
func (p *dataNodeConfig) init(base *BaseTable) {
|
||||
|
@ -3231,6 +3277,15 @@ func (p *dataNodeConfig) init(base *BaseTable) {
|
|||
Export: true,
|
||||
}
|
||||
p.L0BatchMemoryRatio.Init(base.mgr)
|
||||
|
||||
p.GracefulStopTimeout = ParamItem{
|
||||
Key: "datanode.gracefulStopTimeout",
|
||||
Version: "2.3.7",
|
||||
DefaultValue: strconv.Itoa(DefaultGracefulStopTimeout),
|
||||
Doc: "seconds. force stop node without graceful stop",
|
||||
Export: true,
|
||||
}
|
||||
p.GracefulStopTimeout.Init(base.mgr)
|
||||
}
|
||||
|
||||
// /////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -3242,7 +3297,7 @@ type indexNodeConfig struct {
|
|||
DiskCapacityLimit ParamItem `refreshable:"true"`
|
||||
MaxDiskUsagePercentage ParamItem `refreshable:"true"`
|
||||
|
||||
GracefulStopTimeout ParamItem `refreshable:"false"`
|
||||
GracefulStopTimeout ParamItem `refreshable:"true"`
|
||||
}
|
||||
|
||||
func (p *indexNodeConfig) init(base *BaseTable) {
|
||||
|
@ -3297,6 +3352,7 @@ func (p *indexNodeConfig) init(base *BaseTable) {
|
|||
Key: "indexNode.gracefulStopTimeout",
|
||||
Version: "2.2.1",
|
||||
FallbackKeys: []string{"common.gracefulStopTimeout"},
|
||||
Doc: "seconds. force stop node without graceful stop",
|
||||
Export: true,
|
||||
}
|
||||
p.GracefulStopTimeout.Init(base.mgr)
|
||||
|
|
|
@ -122,6 +122,9 @@ func TestComponentParam(t *testing.T) {
|
|||
assert.Equal(t, Params.EnableActiveStandby.GetAsBool(), false)
|
||||
t.Logf("rootCoord EnableActiveStandby = %t", Params.EnableActiveStandby.GetAsBool())
|
||||
|
||||
params.Save("rootCoord.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
|
||||
SetCreateTime(time.Now())
|
||||
SetUpdateTime(time.Now())
|
||||
})
|
||||
|
@ -166,6 +169,9 @@ func TestComponentParam(t *testing.T) {
|
|||
assert.Equal(t, Params.CostMetricsExpireTime.GetAsInt(), 1000)
|
||||
assert.Equal(t, Params.RetryTimesOnReplica.GetAsInt(), 2)
|
||||
assert.EqualValues(t, Params.HealthCheckTimeout.GetAsInt64(), 3000)
|
||||
|
||||
params.Save("proxy.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
})
|
||||
|
||||
// t.Run("test proxyConfig panic", func(t *testing.T) {
|
||||
|
@ -284,6 +290,9 @@ func TestComponentParam(t *testing.T) {
|
|||
assert.Equal(t, true, Params.AutoBalance.GetAsBool())
|
||||
assert.Equal(t, true, Params.AutoBalanceChannel.GetAsBool())
|
||||
assert.Equal(t, 10, Params.CheckAutoBalanceConfigInterval.GetAsInt())
|
||||
|
||||
params.Save("queryCoord.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
})
|
||||
|
||||
t.Run("test queryNodeConfig", func(t *testing.T) {
|
||||
|
@ -349,6 +358,9 @@ func TestComponentParam(t *testing.T) {
|
|||
assert.Equal(t, int64(100), gracefulStopTimeout.GetAsInt64())
|
||||
|
||||
assert.Equal(t, false, Params.EnableWorkerSQCostMetrics.GetAsBool())
|
||||
|
||||
params.Save("querynode.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
})
|
||||
|
||||
t.Run("test dataCoordConfig", func(t *testing.T) {
|
||||
|
@ -361,6 +373,9 @@ func TestComponentParam(t *testing.T) {
|
|||
assert.Equal(t, true, Params.AutoBalance.GetAsBool())
|
||||
assert.Equal(t, 10, Params.CheckAutoBalanceConfigInterval.GetAsInt())
|
||||
assert.Equal(t, false, Params.AutoUpgradeSegmentIndex.GetAsBool())
|
||||
|
||||
params.Save("datacoord.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
})
|
||||
|
||||
t.Run("test dataNodeConfig", func(t *testing.T) {
|
||||
|
@ -411,12 +426,17 @@ func TestComponentParam(t *testing.T) {
|
|||
maxConcurrentImportTaskNum := Params.MaxConcurrentImportTaskNum.GetAsInt()
|
||||
t.Logf("maxConcurrentImportTaskNum: %d", maxConcurrentImportTaskNum)
|
||||
assert.Equal(t, 16, maxConcurrentImportTaskNum)
|
||||
params.Save("datanode.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
})
|
||||
|
||||
t.Run("test indexNodeConfig", func(t *testing.T) {
|
||||
Params := ¶ms.IndexNodeCfg
|
||||
params.Save(Params.GracefulStopTimeout.Key, "50")
|
||||
assert.Equal(t, Params.GracefulStopTimeout.GetAsInt64(), int64(50))
|
||||
|
||||
params.Save("indexnode.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
})
|
||||
|
||||
t.Run("channel config priority", func(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue