mirror of https://github.com/milvus-io/milvus.git

enhance: improve check health (#33800)

issue: #34264
Signed-off-by: jaime <yun.zhang@zilliz.com>
branch: pull/34279/head
parent: e5d691d854
commit: 0426390f06
@@ -1704,6 +1704,17 @@ func (m *meta) DropChannelCheckpoint(vChannel string) error {
 	return nil
 }
 
+func (m *meta) GetChannelCheckpoints() map[string]*msgpb.MsgPosition {
+	m.channelCPs.RLock()
+	defer m.channelCPs.RUnlock()
+
+	checkpoints := make(map[string]*msgpb.MsgPosition, len(m.channelCPs.checkpoints))
+	for ch, cp := range m.channelCPs.checkpoints {
+		checkpoints[ch] = proto.Clone(cp).(*msgpb.MsgPosition)
+	}
+	return checkpoints
+}
+
 func (m *meta) GcConfirm(ctx context.Context, collectionID, partitionID UniqueID) bool {
 	return m.catalog.GcConfirm(ctx, collectionID, partitionID)
 }
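Note on the accessor above: it snapshots the checkpoint map under a read lock and deep-copies each position with proto.Clone, so the health check added later in this commit can inspect checkpoints without racing concurrent updates. A minimal standalone sketch of that copy-under-RLock pattern, using only the standard library (the types here are illustrative stand-ins, not the Milvus ones):

package main

import (
	"fmt"
	"sync"
)

// position is a simplified stand-in for msgpb.MsgPosition.
type position struct {
	Timestamp uint64
	MsgID     []byte
}

type channelCPs struct {
	sync.RWMutex
	checkpoints map[string]*position
}

// snapshot returns copies of the checkpoints so callers cannot mutate shared state.
func (c *channelCPs) snapshot() map[string]*position {
	c.RLock()
	defer c.RUnlock()

	out := make(map[string]*position, len(c.checkpoints))
	for ch, cp := range c.checkpoints {
		cpCopy := *cp // proto.Clone plays this role for real protobuf messages
		out[ch] = &cpCopy
	}
	return out
}

func main() {
	cps := &channelCPs{checkpoints: map[string]*position{"ch1": {Timestamp: 42}}}
	snap := cps.snapshot()
	snap["ch1"].Timestamp = 0 // does not affect cps.checkpoints
	fmt.Println(cps.checkpoints["ch1"].Timestamp)
}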
@@ -3134,6 +3134,51 @@ func closeTestServer(t *testing.T, svr *Server) {
 }
 
 func Test_CheckHealth(t *testing.T) {
+	getSessionManager := func(isHealthy bool) *SessionManagerImpl {
+		var client *mockDataNodeClient
+		if isHealthy {
+			client = &mockDataNodeClient{
+				id:    1,
+				state: commonpb.StateCode_Healthy,
+			}
+		} else {
+			client = &mockDataNodeClient{
+				id:    1,
+				state: commonpb.StateCode_Abnormal,
+			}
+		}
+
+		sm := NewSessionManagerImpl()
+		sm.sessions = struct {
+			lock.RWMutex
+			data map[int64]*Session
+		}{data: map[int64]*Session{1: {
+			client: client,
+			clientCreator: func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
+				return client, nil
+			},
+		}}}
+		return sm
+	}
+
+	getChannelManager := func(t *testing.T, findWatcherOk bool) ChannelManager {
+		channelManager := NewMockChannelManager(t)
+		if findWatcherOk {
+			channelManager.EXPECT().FindWatcher(mock.Anything).Return(0, nil)
+		} else {
+			channelManager.EXPECT().FindWatcher(mock.Anything).Return(0, errors.New("error"))
+		}
+		return channelManager
+	}
+
+	collections := map[UniqueID]*collectionInfo{
+		1: {
+			ID:            1,
+			VChannelNames: []string{"ch1", "ch2"},
+		},
+		2: nil,
+	}
+
 	t.Run("not healthy", func(t *testing.T) {
 		ctx := context.Background()
 		s := &Server{session: &sessionutil.Session{SessionRaw: sessionutil.SessionRaw{ServerID: 1}}}
@@ -3144,56 +3189,76 @@ func Test_CheckHealth(t *testing.T) {
 		assert.NotEmpty(t, resp.Reasons)
 	})
 
-	t.Run("data node health check is ok", func(t *testing.T) {
-		svr := &Server{session: &sessionutil.Session{SessionRaw: sessionutil.SessionRaw{ServerID: 1}}}
-		svr.stateCode.Store(commonpb.StateCode_Healthy)
-		healthClient := &mockDataNodeClient{
-			id:    1,
-			state: commonpb.StateCode_Healthy,
-		}
-		sm := NewSessionManagerImpl()
-		sm.sessions = struct {
-			lock.RWMutex
-			data map[int64]*Session
-		}{data: map[int64]*Session{1: {
-			client: healthClient,
-			clientCreator: func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
-				return healthClient, nil
-			},
-		}}}
-
-		svr.sessionManager = sm
-		ctx := context.Background()
-		resp, err := svr.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
-		assert.NoError(t, err)
-		assert.Equal(t, true, resp.IsHealthy)
-		assert.Empty(t, resp.Reasons)
-	})
-
 	t.Run("data node health check is fail", func(t *testing.T) {
 		svr := &Server{session: &sessionutil.Session{SessionRaw: sessionutil.SessionRaw{ServerID: 1}}}
 		svr.stateCode.Store(commonpb.StateCode_Healthy)
-		unhealthClient := &mockDataNodeClient{
-			id:    1,
-			state: commonpb.StateCode_Abnormal,
-		}
-		sm := NewSessionManagerImpl()
-		sm.sessions = struct {
-			lock.RWMutex
-			data map[int64]*Session
-		}{data: map[int64]*Session{1: {
-			client: unhealthClient,
-			clientCreator: func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
-				return unhealthClient, nil
-			},
-		}}}
-		svr.sessionManager = sm
+		svr.sessionManager = getSessionManager(false)
 		ctx := context.Background()
 		resp, err := svr.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
 		assert.NoError(t, err)
 		assert.Equal(t, false, resp.IsHealthy)
 		assert.NotEmpty(t, resp.Reasons)
 	})
 
+	t.Run("check channel watched fail", func(t *testing.T) {
+		svr := &Server{session: &sessionutil.Session{SessionRaw: sessionutil.SessionRaw{ServerID: 1}}}
+		svr.stateCode.Store(commonpb.StateCode_Healthy)
+		svr.sessionManager = getSessionManager(true)
+		svr.channelManager = getChannelManager(t, false)
+		svr.meta = &meta{collections: collections}
+		ctx := context.Background()
+		resp, err := svr.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+		assert.NoError(t, err)
+		assert.Equal(t, false, resp.IsHealthy)
+		assert.NotEmpty(t, resp.Reasons)
+	})
+
+	t.Run("check checkpoint fail", func(t *testing.T) {
+		svr := &Server{session: &sessionutil.Session{SessionRaw: sessionutil.SessionRaw{ServerID: 1}}}
+		svr.stateCode.Store(commonpb.StateCode_Healthy)
+		svr.sessionManager = getSessionManager(true)
+		svr.channelManager = getChannelManager(t, true)
+		svr.meta = &meta{
+			collections: collections,
+			channelCPs: &channelCPs{
+				checkpoints: map[string]*msgpb.MsgPosition{
+					"ch1": {
+						Timestamp: tsoutil.ComposeTSByTime(time.Now().Add(-1000*time.Hour), 0),
+						MsgID:     []byte{1, 2, 3, 4},
+					},
+				},
+			},
+		}
+
+		ctx := context.Background()
+		resp, err := svr.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+		assert.NoError(t, err)
+		assert.Equal(t, false, resp.IsHealthy)
+		assert.NotEmpty(t, resp.Reasons)
+	})
+
+	t.Run("ok", func(t *testing.T) {
+		svr := &Server{session: &sessionutil.Session{SessionRaw: sessionutil.SessionRaw{ServerID: 1}}}
+		svr.stateCode.Store(commonpb.StateCode_Healthy)
+		svr.sessionManager = getSessionManager(true)
+		svr.channelManager = getChannelManager(t, true)
+		svr.meta = &meta{
+			collections: collections,
+			channelCPs: &channelCPs{
+				checkpoints: map[string]*msgpb.MsgPosition{
+					"ch1": {
+						Timestamp: tsoutil.ComposeTSByTime(time.Now(), 0),
+						MsgID:     []byte{1, 2, 3, 4},
+					},
+				},
+			},
+		}
+		ctx := context.Background()
+		resp, err := svr.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+		assert.NoError(t, err)
+		assert.Equal(t, true, resp.IsHealthy)
+		assert.Empty(t, resp.Reasons)
+	})
 }
 
 func Test_newChunkManagerFactory(t *testing.T) {
@@ -35,6 +35,7 @@ import (
 	"github.com/milvus-io/milvus/internal/proto/datapb"
 	"github.com/milvus-io/milvus/internal/proto/internalpb"
 	"github.com/milvus-io/milvus/internal/storage"
+	"github.com/milvus-io/milvus/internal/util/componentutil"
 	"github.com/milvus-io/milvus/internal/util/importutilv2"
 	"github.com/milvus-io/milvus/internal/util/segmentutil"
 	"github.com/milvus-io/milvus/pkg/common"
@@ -1541,10 +1542,18 @@ func (s *Server) CheckHealth(ctx context.Context, req *milvuspb.CheckHealthReque
 
 	err := s.sessionManager.CheckHealth(ctx)
 	if err != nil {
-		return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: false, Reasons: []string{err.Error()}}, nil
+		return componentutil.CheckHealthRespWithErr(err), nil
 	}
 
-	return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: true, Reasons: []string{}}, nil
+	if err = CheckAllChannelsWatched(s.meta, s.channelManager); err != nil {
+		return componentutil.CheckHealthRespWithErr(err), nil
+	}
+
+	if err = CheckCheckPointsHealth(s.meta); err != nil {
+		return componentutil.CheckHealthRespWithErr(err), nil
+	}
+
+	return componentutil.CheckHealthRespWithErr(nil), nil
 }
 
 func (s *Server) GcConfirm(ctx context.Context, request *datapb.GcConfirmRequest) (*datapb.GcConfirmResponse, error) {
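Taken together, the DataCoord CheckHealth path now layers three gates — node session health, channel-watch status, and channel-checkpoint lag — and returns the first failure as the response reason. A minimal standalone sketch of that short-circuit pattern; the function names and the simplified response type below are illustrative, not the Milvus API:

package main

import (
	"errors"
	"fmt"
)

// checkHealthResp is a simplified stand-in for milvuspb.CheckHealthResponse.
type checkHealthResp struct {
	IsHealthy bool
	Reasons   []string
}

// respWithErr mirrors the idea of componentutil.CheckHealthRespWithErr:
// nil error means healthy, a non-nil error becomes the single reason.
func respWithErr(err error) checkHealthResp {
	if err != nil {
		return checkHealthResp{IsHealthy: false, Reasons: []string{err.Error()}}
	}
	return checkHealthResp{IsHealthy: true, Reasons: []string{}}
}

// checkHealth runs the gates in order and stops at the first failure.
func checkHealth(gates ...func() error) checkHealthResp {
	for _, gate := range gates {
		if err := gate(); err != nil {
			return respWithErr(err)
		}
	}
	return respWithErr(nil)
}

func main() {
	resp := checkHealth(
		func() error { return nil },                            // sessions healthy
		func() error { return errors.New("ch1 not watched") },  // channel watch fails
		func() error { return nil },                            // checkpoint lag gate (not reached)
	)
	fmt.Printf("%+v\n", resp)
}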
@@ -18,6 +18,7 @@ package datacoord
 
 import (
 	"context"
+	"fmt"
 	"strconv"
 	"strings"
 	"time"
@@ -33,6 +34,8 @@ import (
 	"github.com/milvus-io/milvus/pkg/metrics"
 	"github.com/milvus-io/milvus/pkg/util/indexparamcheck"
 	"github.com/milvus-io/milvus/pkg/util/merr"
+	"github.com/milvus-io/milvus/pkg/util/paramtable"
+	"github.com/milvus-io/milvus/pkg/util/tsoutil"
 	"github.com/milvus-io/milvus/pkg/util/typeutil"
 )
 
@@ -273,3 +276,35 @@ func getBinLogIDs(segment *SegmentInfo, fieldID int64) []int64 {
 	}
 	return binlogIDs
 }
+
+func CheckCheckPointsHealth(meta *meta) error {
+	for channel, cp := range meta.GetChannelCheckpoints() {
+		ts, _ := tsoutil.ParseTS(cp.Timestamp)
+		lag := time.Since(ts)
+		if lag > paramtable.Get().DataCoordCfg.ChannelCheckpointMaxLag.GetAsDuration(time.Second) {
+			return merr.WrapErrChannelCPExceededMaxLag(channel, fmt.Sprintf("checkpoint lag: %f(min)", lag.Minutes()))
+		}
+	}
+	return nil
+}
+
+func CheckAllChannelsWatched(meta *meta, channelManager ChannelManager) error {
+	collIDs := meta.ListCollections()
+	for _, collID := range collIDs {
+		collInfo := meta.GetCollection(collID)
+		if collInfo == nil {
+			log.Warn("collection info is nil, skip it", zap.Int64("collectionID", collID))
+			continue
+		}
+
+		for _, channelName := range collInfo.VChannelNames {
+			_, err := channelManager.FindWatcher(channelName)
+			if err != nil {
+				log.Warn("find watcher for channel failed", zap.Int64("collectionID", collID),
+					zap.String("channelName", channelName), zap.Error(err))
+				return err
+			}
+		}
+	}
+	return nil
+}
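The checkpoint-lag gate above turns a channel checkpoint's hybrid timestamp back into wall-clock time and compares the age against the configured maximum. A rough standalone sketch of that computation, assuming the Milvus TSO layout of physical Unix milliseconds in the upper bits and an 18-bit logical counter in the lower bits (that bit width is an assumption here, mirrored from tsoutil):

package main

import (
	"fmt"
	"time"
)

const logicalBits = 18 // assumed logical-counter width, as in tsoutil

func composeTS(t time.Time, logical int64) uint64 {
	return uint64(t.UnixMilli()<<logicalBits | logical)
}

func parseTS(ts uint64) time.Time {
	return time.UnixMilli(int64(ts >> logicalBits))
}

func main() {
	maxLag := 10 * time.Minute // stands in for DataCoordCfg.ChannelCheckpointMaxLag

	cp := composeTS(time.Now().Add(-1000*time.Hour), 0) // a very stale checkpoint, as in the new test
	lag := time.Since(parseTS(cp))
	fmt.Printf("checkpoint lag %.1f min, exceeds max lag: %v\n", lag.Minutes(), lag > maxLag)
}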
@@ -33,6 +33,7 @@ import (
 	"github.com/milvus-io/milvus/internal/querycoordv2/job"
 	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
 	"github.com/milvus-io/milvus/internal/querycoordv2/utils"
+	"github.com/milvus-io/milvus/internal/util/componentutil"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/metrics"
 	"github.com/milvus-io/milvus/pkg/util/merr"
@@ -913,10 +914,14 @@ func (s *Server) CheckHealth(ctx context.Context, req *milvuspb.CheckHealthReque
 
 	errReasons, err := s.checkNodeHealth(ctx)
 	if err != nil || len(errReasons) != 0 {
-		return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: false, Reasons: errReasons}, nil
+		return componentutil.CheckHealthRespWithErrMsg(errReasons...), nil
 	}
 
-	return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: true, Reasons: errReasons}, nil
+	if err := utils.CheckCollectionsQueryable(s.meta, s.targetMgr, s.dist, s.nodeMgr); err != nil {
+		return componentutil.CheckHealthRespWithErr(err), nil
+	}
+
+	return componentutil.CheckHealthRespWithErr(nil), nil
 }
 
 func (s *Server) checkNodeHealth(ctx context.Context) ([]string, error) {
@@ -82,12 +82,12 @@ func CheckLeaderAvailable(nodeMgr *session.NodeManager, leader *meta.LeaderView,
 	return nil
 }
 
-func GetShardLeaders(m *meta.Meta, targetMgr *meta.TargetManager, dist *meta.DistributionManager, nodeMgr *session.NodeManager, collectionID int64) ([]*querypb.ShardLeadersList, error) {
+func checkLoadStatus(m *meta.Meta, collectionID int64) error {
 	percentage := m.CollectionManager.CalculateLoadPercentage(collectionID)
 	if percentage < 0 {
 		err := merr.WrapErrCollectionNotLoaded(collectionID)
 		log.Warn("failed to GetShardLeaders", zap.Error(err))
-		return nil, err
+		return err
 	}
 	collection := m.CollectionManager.GetCollection(collectionID)
 	if collection != nil && collection.GetStatus() == querypb.LoadStatus_Loaded {
@@ -99,17 +99,14 @@ func GetShardLeaders(m *meta.Meta, targetMgr *meta.TargetManager, dist *meta.Dis
 		err := merr.WrapErrCollectionNotFullyLoaded(collectionID)
 		msg := fmt.Sprintf("collection %v is not fully loaded", collectionID)
 		log.Warn(msg)
-		return nil, err
+		return err
 	}
+	return nil
+}
 
-	channels := targetMgr.GetDmChannelsByCollection(collectionID, meta.CurrentTarget)
-	if len(channels) == 0 {
-		msg := "loaded collection do not found any channel in target, may be in recovery"
-		err := merr.WrapErrCollectionOnRecovering(collectionID, msg)
-		log.Warn("failed to get channels", zap.Error(err))
-		return nil, err
-	}
-
+func GetShardLeadersWithChannels(m *meta.Meta, targetMgr *meta.TargetManager, dist *meta.DistributionManager,
+	nodeMgr *session.NodeManager, collectionID int64, channels map[string]*meta.DmChannel,
+) ([]*querypb.ShardLeadersList, error) {
 	ret := make([]*querypb.ShardLeadersList, 0)
 	currentTargets := targetMgr.GetSealedSegmentsByCollection(collectionID, meta.CurrentTarget)
 	for _, channel := range channels {
@@ -166,6 +163,49 @@ func GetShardLeaders(m *meta.Meta, targetMgr *meta.TargetManager, dist *meta.Dis
 	return ret, nil
 }
 
+func GetShardLeaders(m *meta.Meta, targetMgr *meta.TargetManager, dist *meta.DistributionManager, nodeMgr *session.NodeManager, collectionID int64) ([]*querypb.ShardLeadersList, error) {
+	if err := checkLoadStatus(m, collectionID); err != nil {
+		return nil, err
+	}
+
+	channels := targetMgr.GetDmChannelsByCollection(collectionID, meta.CurrentTarget)
+	if len(channels) == 0 {
+		msg := "loaded collection do not found any channel in target, may be in recovery"
+		err := merr.WrapErrCollectionOnRecovering(collectionID, msg)
+		log.Warn("failed to get channels", zap.Error(err))
+		return nil, err
+	}
+	return GetShardLeadersWithChannels(m, targetMgr, dist, nodeMgr, collectionID, channels)
+}
+
+// CheckCollectionsQueryable check all channels are watched and all segments are loaded for this collection
+func CheckCollectionsQueryable(m *meta.Meta, targetMgr *meta.TargetManager, dist *meta.DistributionManager, nodeMgr *session.NodeManager) error {
+	for _, coll := range m.GetAllCollections() {
+		collectionID := coll.GetCollectionID()
+		if err := checkLoadStatus(m, collectionID); err != nil {
+			return err
+		}
+
+		channels := targetMgr.GetDmChannelsByCollection(collectionID, meta.CurrentTarget)
+		if len(channels) == 0 {
+			msg := "loaded collection do not found any channel in target, may be in recovery"
+			err := merr.WrapErrCollectionOnRecovering(collectionID, msg)
+			log.Warn("failed to get channels", zap.Error(err))
+			return err
+		}
+
+		shardList, err := GetShardLeadersWithChannels(m, targetMgr, dist, nodeMgr, collectionID, channels)
+		if err != nil {
+			return err
+		}
+
+		if len(channels) != len(shardList) {
+			return merr.WrapErrCollectionNotFullyLoaded(collectionID, "still have unwatched channels or loaded segments")
+		}
+	}
+	return nil
+}
+
 func filterDupLeaders(replicaManager *meta.ReplicaManager, leaders map[int64]*meta.LeaderView) map[int64]*meta.LeaderView {
 	type leaderID struct {
 		ReplicaID int64
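The refactor above is the core of the QueryCoord side of the health check: the old GetShardLeaders body is split into checkLoadStatus plus GetShardLeadersWithChannels so that CheckCollectionsQueryable can run the same load/watch/leader checks per collection without building a client-facing response. A rough standalone sketch of the final "every channel must yield a shard-leader entry" comparison, with simplified types that are not the Milvus structs:

package main

import (
	"errors"
	"fmt"
)

// shardLeaders is a simplified stand-in for querypb.ShardLeadersList.
type shardLeaders struct {
	Channel string
	Nodes   []int64
}

// collectionQueryable mirrors the closing check in CheckCollectionsQueryable:
// a collection only counts as queryable if every DM channel in the current
// target produced a shard-leader list.
func collectionQueryable(channels []string, leaders []shardLeaders) error {
	if len(channels) != len(leaders) {
		return errors.New("still have unwatched channels or loaded segments")
	}
	return nil
}

func main() {
	channels := []string{"ch1", "ch2"}
	leaders := []shardLeaders{{Channel: "ch1", Nodes: []int64{1}}} // ch2 has no serviceable leader yet
	fmt.Println(collectionQueryable(channels, leaders))
}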
@@ -677,18 +677,6 @@ func withDataCoord(dc types.DataCoordClient) Opt {
 	}
 }
 
-func withUnhealthyDataCoord() Opt {
-	dc := newMockDataCoord()
-	err := errors.New("mock error")
-	dc.GetComponentStatesFunc = func(ctx context.Context) (*milvuspb.ComponentStates, error) {
-		return &milvuspb.ComponentStates{
-			State:  &milvuspb.ComponentInfo{StateCode: commonpb.StateCode_Abnormal},
-			Status: merr.Status(err),
-		}, retry.Unrecoverable(errors.New("error mock GetComponentStates"))
-	}
-	return withDataCoord(dc)
-}
-
 func withInvalidDataCoord() Opt {
 	dc := newMockDataCoord()
 	dc.GetComponentStatesFunc = func(ctx context.Context) (*milvuspb.ComponentStates, error) {
@@ -43,7 +43,6 @@ import (
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/metrics"
 	"github.com/milvus-io/milvus/pkg/util/commonpbutil"
-	"github.com/milvus-io/milvus/pkg/util/merr"
 	"github.com/milvus-io/milvus/pkg/util/metricsinfo"
 	"github.com/milvus-io/milvus/pkg/util/ratelimitutil"
 	"github.com/milvus-io/milvus/pkg/util/tsoutil"
@@ -357,19 +356,10 @@ func (q *QuotaCenter) collectMetrics() error {
 	defer cancel()
 
 	group := &errgroup.Group{}
-	req, err := metricsinfo.ConstructRequestByMetricType(metricsinfo.SystemInfoMetrics)
-	if err != nil {
-		return err
-	}
 
 	// get Query cluster metrics
 	group.Go(func() error {
-		rsp, err := q.queryCoord.GetMetrics(ctx, req)
-		if err = merr.CheckRPCCall(rsp, err); err != nil {
-			return err
-		}
-		queryCoordTopology := &metricsinfo.QueryCoordTopology{}
-		err = metricsinfo.UnmarshalTopology(rsp.GetResponse(), queryCoordTopology)
+		queryCoordTopology, err := getQueryCoordMetrics(ctx, q.queryCoord)
 		if err != nil {
 			return err
 		}
@@ -414,12 +404,7 @@ func (q *QuotaCenter) collectMetrics() error {
 	})
 	// get Data cluster metrics
 	group.Go(func() error {
-		rsp, err := q.dataCoord.GetMetrics(ctx, req)
-		if err = merr.CheckRPCCall(rsp, err); err != nil {
-			return err
-		}
-		dataCoordTopology := &metricsinfo.DataCoordTopology{}
-		err = metricsinfo.UnmarshalTopology(rsp.GetResponse(), dataCoordTopology)
+		dataCoordTopology, err := getDataCoordMetrics(ctx, q.dataCoord)
 		if err != nil {
 			return err
 		}
@@ -505,17 +490,11 @@ func (q *QuotaCenter) collectMetrics() error {
 	})
 	// get Proxies metrics
 	group.Go(func() error {
-		// TODO: get more proxy metrics info
-		rsps, err := q.proxies.GetProxyMetrics(ctx)
-		if err != nil {
-			return err
-		}
-		for _, rsp := range rsps {
-			proxyMetric := &metricsinfo.ProxyInfos{}
-			err = metricsinfo.UnmarshalComponentInfos(rsp.GetResponse(), proxyMetric)
+		ret, err := getProxyMetrics(ctx, q.proxies)
 		if err != nil {
 			return err
 		}
+		for _, proxyMetric := range ret {
 			if proxyMetric.QuotaMetrics != nil {
 				q.proxyMetrics[proxyMetric.ID] = proxyMetric.QuotaMetrics
 			}
@@ -532,7 +511,8 @@ func (q *QuotaCenter) collectMetrics() error {
 		}
 		return nil
 	})
-	err = group.Wait()
+
+	err := group.Wait()
 	if err != nil {
 		return err
 	}
@@ -2773,9 +2773,8 @@ func (c *Core) CheckHealth(ctx context.Context, in *milvuspb.CheckHealthRequest)
 		}, nil
 	}
 
-	mu := &sync.Mutex{}
 	group, ctx := errgroup.WithContext(ctx)
-	errReasons := make([]string, 0, c.proxyClientManager.GetProxyCount())
+	errs := typeutil.NewConcurrentSet[error]()
 
 	proxyClients := c.proxyClientManager.GetProxyClients()
 	proxyClients.Range(func(key int64, value types.ProxyClient) bool {
@@ -2784,28 +2783,41 @@ func (c *Core) CheckHealth(ctx context.Context, in *milvuspb.CheckHealthRequest)
 		group.Go(func() error {
 			sta, err := proxyClient.GetComponentStates(ctx, &milvuspb.GetComponentStatesRequest{})
 			if err != nil {
+				errs.Insert(err)
 				return err
 			}
 
 			err = merr.AnalyzeState("Proxy", nodeID, sta)
 			if err != nil {
-				mu.Lock()
-				defer mu.Unlock()
-				errReasons = append(errReasons, err.Error())
+				errs.Insert(err)
 			}
-			return nil
+
+			return err
 		})
 		return true
 	})
 
+	maxDelay := Params.QuotaConfig.MaxTimeTickDelay.GetAsDuration(time.Second)
+	if maxDelay > 0 {
+		group.Go(func() error {
+			err := CheckTimeTickLagExceeded(ctx, c.queryCoord, c.dataCoord, maxDelay)
+			if err != nil {
+				errs.Insert(err)
+			}
+			return err
+		})
+	}
+
 	err := group.Wait()
-	if err != nil || len(errReasons) != 0 {
+	if err != nil {
 		return &milvuspb.CheckHealthResponse{
 			Status:    merr.Success(),
 			IsHealthy: false,
-			Reasons:   errReasons,
+			Reasons: lo.Map(errs.Collect(), func(e error, i int) string {
+				return err.Error()
+			}),
 		}, nil
 	}
 
-	return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: true, Reasons: errReasons}, nil
+	return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: true, Reasons: []string{}}, nil
 }
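RootCoord's CheckHealth above now fans out the proxy probes and the optional time-tick-lag probe through one errgroup while collecting every failure reason into a concurrent set. A minimal standalone sketch of that fan-out/collect pattern; the errSet type below is an illustrative stand-in for typeutil.NewConcurrentSet, not the Milvus implementation:

package main

import (
	"errors"
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

// errSet collects errors from concurrent health probes.
type errSet struct {
	mu   sync.Mutex
	errs []error
}

func (s *errSet) insert(err error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.errs = append(s.errs, err)
}

func main() {
	probes := []func() error{
		func() error { return nil },                            // proxy 1 healthy
		func() error { return errors.New("proxy 2 abnormal") }, // proxy 2 unhealthy
		func() error { return errors.New("tt lag exceeded") },  // optional lag probe
	}

	errs := &errSet{}
	group := &errgroup.Group{}
	for _, probe := range probes {
		probe := probe
		group.Go(func() error {
			err := probe()
			if err != nil {
				errs.insert(err) // keep the reason for the response
			}
			return err // also fail the group, as in the diff
		})
	}

	_ = group.Wait() // group.Wait reports one error; errs keeps them all
	for _, err := range errs.errs {
		fmt.Println("reason:", err)
	}
}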
@@ -32,6 +32,7 @@ import (
 	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
 	"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
 	"github.com/milvus-io/milvus/internal/metastore/model"
+	"github.com/milvus-io/milvus/internal/mocks"
 	"github.com/milvus-io/milvus/internal/proto/etcdpb"
 	"github.com/milvus-io/milvus/internal/proto/internalpb"
 	"github.com/milvus-io/milvus/internal/proto/proxypb"
@@ -47,6 +48,7 @@ import (
 	"github.com/milvus-io/milvus/pkg/util/metricsinfo"
 	"github.com/milvus-io/milvus/pkg/util/paramtable"
 	"github.com/milvus-io/milvus/pkg/util/tikv"
+	"github.com/milvus-io/milvus/pkg/util/tsoutil"
 	"github.com/milvus-io/milvus/pkg/util/typeutil"
 )
 
@@ -1450,6 +1452,65 @@ func TestRootCoord_AlterCollection(t *testing.T) {
 }
 
 func TestRootCoord_CheckHealth(t *testing.T) {
+	getQueryCoordMetricsFunc := func(tt typeutil.Timestamp) (*milvuspb.GetMetricsResponse, error) {
+		clusterTopology := metricsinfo.QueryClusterTopology{
+			ConnectedNodes: []metricsinfo.QueryNodeInfos{
+				{
+					QuotaMetrics: &metricsinfo.QueryNodeQuotaMetrics{
+						Fgm: metricsinfo.FlowGraphMetric{
+							MinFlowGraphChannel: "ch1",
+							MinFlowGraphTt:      tt,
+							NumFlowGraph:        1,
+						},
+					},
+				},
+			},
+		}
+
+		resp, _ := metricsinfo.MarshalTopology(metricsinfo.QueryCoordTopology{Cluster: clusterTopology})
+		return &milvuspb.GetMetricsResponse{
+			Status:        merr.Success(),
+			Response:      resp,
+			ComponentName: metricsinfo.ConstructComponentName(typeutil.QueryCoordRole, 0),
+		}, nil
+	}
+
+	getDataCoordMetricsFunc := func(tt typeutil.Timestamp) (*milvuspb.GetMetricsResponse, error) {
+		clusterTopology := metricsinfo.DataClusterTopology{
+			ConnectedDataNodes: []metricsinfo.DataNodeInfos{
+				{
+					QuotaMetrics: &metricsinfo.DataNodeQuotaMetrics{
+						Fgm: metricsinfo.FlowGraphMetric{
+							MinFlowGraphChannel: "ch1",
+							MinFlowGraphTt:      tt,
+							NumFlowGraph:        1,
+						},
+					},
+				},
+			},
+		}
+
+		resp, _ := metricsinfo.MarshalTopology(metricsinfo.DataCoordTopology{Cluster: clusterTopology})
+		return &milvuspb.GetMetricsResponse{
+			Status:        merr.Success(),
+			Response:      resp,
+			ComponentName: metricsinfo.ConstructComponentName(typeutil.DataCoordRole, 0),
+		}, nil
+	}
+
+	querynodeTT := tsoutil.ComposeTSByTime(time.Now().Add(-1*time.Minute), 0)
+	datanodeTT := tsoutil.ComposeTSByTime(time.Now().Add(-2*time.Minute), 0)
+
+	dcClient := mocks.NewMockDataCoordClient(t)
+	dcClient.EXPECT().GetMetrics(mock.Anything, mock.Anything).Return(getDataCoordMetricsFunc(datanodeTT))
+	qcClient := mocks.NewMockQueryCoordClient(t)
+	qcClient.EXPECT().GetMetrics(mock.Anything, mock.Anything).Return(getQueryCoordMetricsFunc(querynodeTT))
+
+	errDataCoordClient := mocks.NewMockDataCoordClient(t)
+	errDataCoordClient.EXPECT().GetMetrics(mock.Anything, mock.Anything).Return(nil, errors.New("error"))
+	errQueryCoordClient := mocks.NewMockQueryCoordClient(t)
+	errQueryCoordClient.EXPECT().GetMetrics(mock.Anything, mock.Anything).Return(nil, errors.New("error"))
+
 	t.Run("not healthy", func(t *testing.T) {
 		ctx := context.Background()
 		c := newTestCore(withAbnormalCode())
@@ -1459,10 +1520,12 @@ func TestRootCoord_CheckHealth(t *testing.T) {
 		assert.NotEmpty(t, resp.Reasons)
 	})
 
-	t.Run("proxy health check is ok", func(t *testing.T) {
-		c := newTestCore(withHealthyCode(),
-			withValidProxyManager())
+	t.Run("ok with disabled tt lag configuration", func(t *testing.T) {
+		v := Params.QuotaConfig.MaxTimeTickDelay.GetValue()
+		Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, "-1")
+		defer Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, v)
+
+		c := newTestCore(withHealthyCode(), withValidProxyManager())
 		ctx := context.Background()
 		resp, err := c.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
 		assert.NoError(t, err)
@@ -1470,9 +1533,12 @@ func TestRootCoord_CheckHealth(t *testing.T) {
 		assert.Empty(t, resp.Reasons)
 	})
 
-	t.Run("proxy health check is fail", func(t *testing.T) {
-		c := newTestCore(withHealthyCode(),
-			withInvalidProxyManager())
+	t.Run("proxy health check fail with invalid proxy", func(t *testing.T) {
+		v := Params.QuotaConfig.MaxTimeTickDelay.GetValue()
+		Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, "6000")
+		defer Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, v)
+
+		c := newTestCore(withHealthyCode(), withInvalidProxyManager(), withDataCoord(dcClient), withQueryCoord(qcClient))
+
 		ctx := context.Background()
 		resp, err := c.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
@@ -1480,6 +1546,62 @@ func TestRootCoord_CheckHealth(t *testing.T) {
 		assert.Equal(t, false, resp.IsHealthy)
 		assert.NotEmpty(t, resp.Reasons)
 	})
+
+	t.Run("proxy health check fail with get metrics error", func(t *testing.T) {
+		v := Params.QuotaConfig.MaxTimeTickDelay.GetValue()
+		Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, "6000")
+		defer Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, v)
+
+		{
+			c := newTestCore(withHealthyCode(),
+				withValidProxyManager(), withDataCoord(dcClient), withQueryCoord(errQueryCoordClient))
+
+			ctx := context.Background()
+			resp, err := c.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+			assert.NoError(t, err)
+			assert.Equal(t, false, resp.IsHealthy)
+			assert.NotEmpty(t, resp.Reasons)
+		}
+
+		{
+			c := newTestCore(withHealthyCode(),
+				withValidProxyManager(), withDataCoord(errDataCoordClient), withQueryCoord(qcClient))
+
+			ctx := context.Background()
+			resp, err := c.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+			assert.NoError(t, err)
+			assert.Equal(t, false, resp.IsHealthy)
+			assert.NotEmpty(t, resp.Reasons)
+		}
+	})
+
+	t.Run("ok with tt lag exceeded", func(t *testing.T) {
+		v := Params.QuotaConfig.MaxTimeTickDelay.GetValue()
+		Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, "90")
+		defer Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, v)
+
+		c := newTestCore(withHealthyCode(),
+			withValidProxyManager(), withDataCoord(dcClient), withQueryCoord(qcClient))
+		ctx := context.Background()
+		resp, err := c.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+		assert.NoError(t, err)
+		assert.Equal(t, false, resp.IsHealthy)
+		assert.NotEmpty(t, resp.Reasons)
+	})
+
+	t.Run("ok with tt lag checking", func(t *testing.T) {
+		v := Params.QuotaConfig.MaxTimeTickDelay.GetValue()
+		Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, "600")
+		defer Params.Save(Params.QuotaConfig.MaxTimeTickDelay.Key, v)
+
+		c := newTestCore(withHealthyCode(),
+			withValidProxyManager(), withDataCoord(dcClient), withQueryCoord(qcClient))
+		ctx := context.Background()
+		resp, err := c.CheckHealth(ctx, &milvuspb.CheckHealthRequest{})
+		assert.NoError(t, err)
+		assert.Equal(t, true, resp.IsHealthy)
+		assert.Empty(t, resp.Reasons)
+	})
 }
 
 func TestRootCoord_DescribeDatabase(t *testing.T) {
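The new sub-tests above all follow the same save/override/restore dance around Params.QuotaConfig.MaxTimeTickDelay so that one case's quota setting cannot leak into another. A rough standalone sketch of that pattern, with a plain map and an illustrative key standing in for the Milvus param table:

package main

import "fmt"

// params is a stand-in for the Milvus paramtable; the key is illustrative.
var params = map[string]string{"quota.maxTimeTickDelay": "300"}

func save(key, value string) { params[key] = value }

func runWithOverride(key, value string, test func()) {
	old := params[key]
	save(key, value)     // override for this test case
	defer save(key, old) // restore, mirroring `defer Params.Save(key, v)` in the diff
	test()
}

func main() {
	key := "quota.maxTimeTickDelay"
	runWithOverride(key, "-1", func() { fmt.Println("tt-lag check disabled:", params[key]) })
	runWithOverride(key, "90", func() { fmt.Println("tight threshold:", params[key]) })
	fmt.Println("restored:", params[key])
}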
@@ -17,16 +17,24 @@
 package rootcoord
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"strconv"
+	"time"
 
 	"go.uber.org/zap"
+	"golang.org/x/sync/errgroup"
 
 	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
+	"github.com/milvus-io/milvus/internal/types"
+	"github.com/milvus-io/milvus/internal/util/proxyutil"
 	"github.com/milvus-io/milvus/pkg/common"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/mq/msgstream"
+	"github.com/milvus-io/milvus/pkg/util/merr"
+	"github.com/milvus-io/milvus/pkg/util/metricsinfo"
+	"github.com/milvus-io/milvus/pkg/util/tsoutil"
 	"github.com/milvus-io/milvus/pkg/util/typeutil"
 )
 
@@ -204,3 +212,136 @@ func getRateLimitConfig(properties map[string]string, configKey string, configVa
 
 	return configValue
 }
+
+func getQueryCoordMetrics(ctx context.Context, queryCoord types.QueryCoordClient) (*metricsinfo.QueryCoordTopology, error) {
+	req, err := metricsinfo.ConstructRequestByMetricType(metricsinfo.SystemInfoMetrics)
+	if err != nil {
+		return nil, err
+	}
+
+	rsp, err := queryCoord.GetMetrics(ctx, req)
+	if err = merr.CheckRPCCall(rsp, err); err != nil {
+		return nil, err
+	}
+	queryCoordTopology := &metricsinfo.QueryCoordTopology{}
+	if err := metricsinfo.UnmarshalTopology(rsp.GetResponse(), queryCoordTopology); err != nil {
+		return nil, err
+	}
+
+	return queryCoordTopology, nil
+}
+
+func getDataCoordMetrics(ctx context.Context, dataCoord types.DataCoordClient) (*metricsinfo.DataCoordTopology, error) {
+	req, err := metricsinfo.ConstructRequestByMetricType(metricsinfo.SystemInfoMetrics)
+	if err != nil {
+		return nil, err
+	}
+
+	rsp, err := dataCoord.GetMetrics(ctx, req)
+	if err = merr.CheckRPCCall(rsp, err); err != nil {
+		return nil, err
+	}
+	dataCoordTopology := &metricsinfo.DataCoordTopology{}
+	if err = metricsinfo.UnmarshalTopology(rsp.GetResponse(), dataCoordTopology); err != nil {
+		return nil, err
+	}
+
+	return dataCoordTopology, nil
+}
+
+func getProxyMetrics(ctx context.Context, proxies proxyutil.ProxyClientManagerInterface) ([]*metricsinfo.ProxyInfos, error) {
+	resp, err := proxies.GetProxyMetrics(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	ret := make([]*metricsinfo.ProxyInfos, 0, len(resp))
+	for _, rsp := range resp {
+		proxyMetric := &metricsinfo.ProxyInfos{}
+		err = metricsinfo.UnmarshalComponentInfos(rsp.GetResponse(), proxyMetric)
+		if err != nil {
+			return nil, err
+		}
+		ret = append(ret, proxyMetric)
+	}
+
+	return ret, nil
+}
+
+func CheckTimeTickLagExceeded(ctx context.Context, queryCoord types.QueryCoordClient, dataCoord types.DataCoordClient, maxDelay time.Duration) error {
+	ctx, cancel := context.WithTimeout(ctx, GetMetricsTimeout)
+	defer cancel()
+
+	now := time.Now()
+	group := &errgroup.Group{}
+	queryNodeTTDelay := typeutil.NewConcurrentMap[string, time.Duration]()
+	dataNodeTTDelay := typeutil.NewConcurrentMap[string, time.Duration]()
+
+	group.Go(func() error {
+		queryCoordTopology, err := getQueryCoordMetrics(ctx, queryCoord)
+		if err != nil {
+			return err
+		}
+
+		for _, queryNodeMetric := range queryCoordTopology.Cluster.ConnectedNodes {
+			qm := queryNodeMetric.QuotaMetrics
+			if qm != nil {
+				if qm.Fgm.NumFlowGraph > 0 && qm.Fgm.MinFlowGraphChannel != "" {
+					minTt, _ := tsoutil.ParseTS(qm.Fgm.MinFlowGraphTt)
+					delay := now.Sub(minTt)
+
+					if delay.Milliseconds() >= maxDelay.Milliseconds() {
+						queryNodeTTDelay.Insert(qm.Fgm.MinFlowGraphChannel, delay)
+					}
+				}
+			}
+		}
+		return nil
+	})
+
+	// get Data cluster metrics
+	group.Go(func() error {
+		dataCoordTopology, err := getDataCoordMetrics(ctx, dataCoord)
+		if err != nil {
+			return err
+		}
+
+		for _, dataNodeMetric := range dataCoordTopology.Cluster.ConnectedDataNodes {
+			dm := dataNodeMetric.QuotaMetrics
+			if dm.Fgm.NumFlowGraph > 0 && dm.Fgm.MinFlowGraphChannel != "" {
+				minTt, _ := tsoutil.ParseTS(dm.Fgm.MinFlowGraphTt)
+				delay := now.Sub(minTt)
+
+				if delay.Milliseconds() >= maxDelay.Milliseconds() {
+					dataNodeTTDelay.Insert(dm.Fgm.MinFlowGraphChannel, delay)
+				}
+			}
+		}
+		return nil
+	})
+
+	err := group.Wait()
+	if err != nil {
+		return err
+	}
+
+	var maxLagChannel string
+	var maxLag time.Duration
+	findMaxLagChannel := func(params ...*typeutil.ConcurrentMap[string, time.Duration]) {
+		for _, param := range params {
+			param.Range(func(k string, v time.Duration) bool {
+				if v > maxLag {
+					maxLag = v
+					maxLagChannel = k
+				}
+				return true
+			})
+		}
+	}
+	findMaxLagChannel(queryNodeTTDelay, dataNodeTTDelay)
+
+	if maxLag > 0 && len(maxLagChannel) != 0 {
+		return fmt.Errorf("max timetick lag execced threhold, max timetick lag:%s on channel:%s", maxLag, maxLagChannel)
+	}
+	return nil
+}
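CheckTimeTickLagExceeded above gathers per-flow-graph delays from the query and data clusters in parallel and then reports only the single worst channel once it crosses the configured threshold. A compact standalone sketch of that final "keep the worst offender" scan (types simplified to plain slices instead of concurrent maps):

package main

import (
	"fmt"
	"time"
)

// channelLag pairs a flow-graph channel with its observed time-tick delay.
type channelLag struct {
	channel string
	lag     time.Duration
}

// worstLag mirrors the findMaxLagChannel closure in the diff: scan the delays
// reported by query nodes and data nodes and keep the single largest one.
func worstLag(groups ...[]channelLag) (string, time.Duration) {
	var maxChannel string
	var maxLag time.Duration
	for _, group := range groups {
		for _, cl := range group {
			if cl.lag > maxLag {
				maxLag = cl.lag
				maxChannel = cl.channel
			}
		}
	}
	return maxChannel, maxLag
}

func main() {
	queryNodeLags := []channelLag{{"ch1", 40 * time.Second}}
	dataNodeLags := []channelLag{{"ch2", 95 * time.Second}}
	maxDelay := 90 * time.Second // stands in for QuotaConfig.MaxTimeTickDelay

	ch, lag := worstLag(queryNodeLags, dataNodeLags)
	if lag >= maxDelay && ch != "" {
		fmt.Printf("max timetick lag exceeded threshold: %s on channel %s\n", lag, ch)
	}
}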
@@ -84,3 +84,17 @@ func WaitForComponentHealthy[T interface {
 }](ctx context.Context, client T, serviceName string, attempts uint, sleep time.Duration) error {
 	return WaitForComponentStates(ctx, client, serviceName, []commonpb.StateCode{commonpb.StateCode_Healthy}, attempts, sleep)
 }
+
+func CheckHealthRespWithErr(err error) *milvuspb.CheckHealthResponse {
+	if err != nil {
+		return CheckHealthRespWithErrMsg(err.Error())
+	}
+	return CheckHealthRespWithErrMsg()
+}
+
+func CheckHealthRespWithErrMsg(errMsg ...string) *milvuspb.CheckHealthResponse {
+	if len(errMsg) != 0 {
+		return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: false, Reasons: errMsg}
+	}
+	return &milvuspb.CheckHealthResponse{Status: merr.Success(), IsHealthy: true, Reasons: []string{}}
+}
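These two helpers become the single place where coordinators shape a CheckHealth reply: nil or no messages means healthy with an empty Reasons slice, anything else means unhealthy with the messages echoed back. A usage sketch; note that it would only compile inside the Milvus module itself, since componentutil lives under internal/:

package main

import (
	"errors"
	"fmt"

	"github.com/milvus-io/milvus/internal/util/componentutil"
)

func main() {
	healthy := componentutil.CheckHealthRespWithErr(nil)
	fmt.Println(healthy.IsHealthy, healthy.Reasons) // true []

	unhealthy := componentutil.CheckHealthRespWithErr(errors.New("channel ch1 not watched"))
	fmt.Println(unhealthy.IsHealthy, unhealthy.Reasons) // false [channel ch1 not watched]

	multi := componentutil.CheckHealthRespWithErrMsg("data node 1 down", "data node 2 down")
	fmt.Println(multi.IsHealthy, multi.Reasons) // false [data node 1 down data node 2 down]
}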
@@ -21,7 +21,7 @@ import (
 	"github.com/tecbot/gorocksdb"
 	"go.uber.org/zap"
 
-	"github.com/milvus-io/milvus/pkg/kv/rocksdb"
+	rocksdbkv "github.com/milvus-io/milvus/pkg/kv/rocksdb"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/util/paramtable"
 	"github.com/milvus-io/milvus/pkg/util/typeutil"
@@ -81,6 +81,7 @@
 	ErrChannelLack          = newMilvusError("channel lacks", 501, false)
 	ErrChannelReduplicate   = newMilvusError("channel reduplicates", 502, false)
 	ErrChannelNotAvailable  = newMilvusError("channel not available", 503, false)
+	ErrChannelCPExceededMaxLag = newMilvusError("channel checkpoint exceed max lag", 504, false)
 
 	// Segment related
 	ErrSegmentNotFound = newMilvusError("segment not found", 600, false)
@@ -639,36 +639,33 @@ func WrapErrReplicaNotAvailable(id int64, msg ...string) error {
 }
 
 // Channel related
-func WrapErrChannelNotFound(name string, msg ...string) error {
-	err := wrapFields(ErrChannelNotFound, value("channel", name))
+func warpChannelErr(mErr milvusError, name string, msg ...string) error {
+	err := wrapFields(mErr, value("channel", name))
 	if len(msg) > 0 {
 		err = errors.Wrap(err, strings.Join(msg, "->"))
 	}
 	return err
 }
 
+func WrapErrChannelNotFound(name string, msg ...string) error {
+	return warpChannelErr(ErrChannelNotFound, name, msg...)
+}
+
+func WrapErrChannelCPExceededMaxLag(name string, msg ...string) error {
+	return warpChannelErr(ErrChannelCPExceededMaxLag, name, msg...)
+}
+
 func WrapErrChannelLack(name string, msg ...string) error {
-	err := wrapFields(ErrChannelLack, value("channel", name))
-	if len(msg) > 0 {
-		err = errors.Wrap(err, strings.Join(msg, "->"))
-	}
-	return err
+	return warpChannelErr(ErrChannelLack, name, msg...)
 }
 
 func WrapErrChannelReduplicate(name string, msg ...string) error {
-	err := wrapFields(ErrChannelReduplicate, value("channel", name))
-	if len(msg) > 0 {
-		err = errors.Wrap(err, strings.Join(msg, "->"))
-	}
-	return err
+	return warpChannelErr(ErrChannelReduplicate, name, msg...)
 }
 
 func WrapErrChannelNotAvailable(name string, msg ...string) error {
-	err := wrapFields(ErrChannelNotAvailable, value("channel", name))
-	if len(msg) > 0 {
-		err = errors.Wrap(err, strings.Join(msg, "->"))
-	}
-	return err
+	return warpChannelErr(ErrChannelNotAvailable, name, msg...)
 }
 
 // Segment related
@@ -1584,15 +1584,10 @@ specific conditions, such as memory of nodes to water marker), ` + "true" + ` me
 		Version:      "2.2.0",
 		DefaultValue: defaultMaxTtDelay,
 		Formatter: func(v string) string {
-			if !p.TtProtectionEnabled.GetAsBool() {
-				return fmt.Sprintf("%d", math.MaxInt64)
+			if getAsFloat(v) < 0 {
+				return "0"
 			}
-			delay := getAsFloat(v)
-			// (0, 65536)
-			if delay <= 0 || delay >= 65536 {
-				return defaultMaxTtDelay
-			}
-			return fmt.Sprintf("%f", delay)
+			return v
 		},
 		Doc: `maxTimeTickDelay indicates the backpressure for DML Operations.
 	DML rates would be reduced according to the ratio of time tick delay to maxTimeTickDelay,
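The formatter change above is a small but meaningful semantic shift: previously a disabled TT-protection flag forced MaxTimeTickDelay to math.MaxInt64 and other values were clamped into (0, 65536), whereas now a negative setting is normalized to "0" (which the RootCoord check treats as "skip the lag probe", via its maxDelay > 0 guard) and every other value passes through untouched — hence the test default moving from math.MaxInt64 to 300. A small sketch mirroring the new formatter behavior (strconv stands in for the internal getAsFloat helper):

package main

import (
	"fmt"
	"strconv"
)

// formatMaxTTDelay mirrors the new Formatter in the diff: negative values are
// normalized to "0" (interpreted upstream as "lag check disabled"), everything
// else is passed through unchanged.
func formatMaxTTDelay(v string) string {
	if f, err := strconv.ParseFloat(v, 64); err == nil && f < 0 {
		return "0"
	}
	return v
}

func main() {
	fmt.Println(formatMaxTTDelay("-1"))  // "0"   -> time-tick lag check disabled
	fmt.Println(formatMaxTTDelay("300")) // "300" -> the new default asserted in the test
}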
@@ -190,7 +190,7 @@ func TestQuotaParam(t *testing.T) {
 	t.Run("test limit writing", func(t *testing.T) {
 		assert.False(t, qc.ForceDenyWriting.GetAsBool())
 		assert.Equal(t, false, qc.TtProtectionEnabled.GetAsBool())
-		assert.Equal(t, math.MaxInt64, qc.MaxTimeTickDelay.GetAsInt())
+		assert.Equal(t, 300, qc.MaxTimeTickDelay.GetAsInt())
 		assert.Equal(t, defaultLowWaterLevel, qc.DataNodeMemoryLowWaterLevel.GetAsFloat())
 		assert.Equal(t, defaultHighWaterLevel, qc.DataNodeMemoryHighWaterLevel.GetAsFloat())
 		assert.Equal(t, defaultLowWaterLevel, qc.QueryNodeMemoryLowWaterLevel.GetAsFloat())