mirror of https://github.com/milvus-io/milvus.git
Panic when recover querynode failed (#15161)
Signed-off-by: xige-16 <xi.ge@zilliz.com>pull/15164/head
parent
0c98f21d4d
commit
4e956ee10c
|
@ -342,29 +342,12 @@ func (lct *loadCollectionTask) execute(ctx context.Context) error {
|
||||||
defer lct.reduceRetryCount()
|
defer lct.reduceRetryCount()
|
||||||
collectionID := lct.CollectionID
|
collectionID := lct.CollectionID
|
||||||
|
|
||||||
showPartitionRequest := &milvuspb.ShowPartitionsRequest{
|
toLoadPartitionIDs, err := showPartitions(ctx, collectionID, lct.rootCoord)
|
||||||
Base: &commonpb.MsgBase{
|
|
||||||
MsgType: commonpb.MsgType_ShowPartitions,
|
|
||||||
},
|
|
||||||
CollectionID: collectionID,
|
|
||||||
}
|
|
||||||
ctx2, cancel2 := context.WithTimeout(ctx, timeoutForRPC)
|
|
||||||
defer cancel2()
|
|
||||||
showPartitionResponse, err := lct.rootCoord.ShowPartitions(ctx2, showPartitionRequest)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadCollectionTask: showPartition failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
|
log.Error("loadCollectionTask: showPartition failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
|
||||||
lct.setResultInfo(err)
|
lct.setResultInfo(err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if showPartitionResponse.Status.ErrorCode != commonpb.ErrorCode_Success {
|
|
||||||
err = errors.New(showPartitionResponse.Status.Reason)
|
|
||||||
log.Error("loadCollectionTask: showPartition failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
|
|
||||||
lct.setResultInfo(err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
toLoadPartitionIDs := showPartitionResponse.PartitionIDs
|
|
||||||
log.Debug("loadCollectionTask: get collection's all partitionIDs", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", toLoadPartitionIDs), zap.Int64("msgID", lct.Base.MsgID))
|
log.Debug("loadCollectionTask: get collection's all partitionIDs", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", toLoadPartitionIDs), zap.Int64("msgID", lct.Base.MsgID))
|
||||||
|
|
||||||
loadSegmentReqs := make([]*querypb.LoadSegmentsRequest, 0)
|
loadSegmentReqs := make([]*querypb.LoadSegmentsRequest, 0)
|
||||||
|
@ -1690,20 +1673,11 @@ func (lbt *loadBalanceTask) execute(ctx context.Context) error {
|
||||||
|
|
||||||
var toRecoverPartitionIDs []UniqueID
|
var toRecoverPartitionIDs []UniqueID
|
||||||
if collectionInfo.LoadType == querypb.LoadType_loadCollection {
|
if collectionInfo.LoadType == querypb.LoadType_loadCollection {
|
||||||
showPartitionRequest := &milvuspb.ShowPartitionsRequest{
|
toRecoverPartitionIDs, err = showPartitions(ctx, collectionID, lbt.rootCoord)
|
||||||
Base: &commonpb.MsgBase{
|
|
||||||
MsgType: commonpb.MsgType_ShowPartitions,
|
|
||||||
},
|
|
||||||
CollectionID: collectionID,
|
|
||||||
}
|
|
||||||
showPartitionResponse, err := lbt.rootCoord.ShowPartitions(ctx, showPartitionRequest)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
lbt.setResultInfo(err)
|
log.Error("loadBalanceTask: show collection's partitionIDs failed", zap.Int64("collectionID", collectionID), zap.Error(err))
|
||||||
return err
|
panic(err)
|
||||||
}
|
}
|
||||||
//TODO:: queryNode receive dm message according partitionID cache
|
|
||||||
//TODO:: queryNode add partitionID to cache if receive create partition message from dmChannel
|
|
||||||
toRecoverPartitionIDs = showPartitionResponse.PartitionIDs
|
|
||||||
} else {
|
} else {
|
||||||
toRecoverPartitionIDs = collectionInfo.PartitionIDs
|
toRecoverPartitionIDs = collectionInfo.PartitionIDs
|
||||||
}
|
}
|
||||||
|
@ -1713,8 +1687,7 @@ func (lbt *loadBalanceTask) execute(ctx context.Context) error {
|
||||||
vChannelInfos, binlogs, err := getRecoveryInfo(lbt.ctx, lbt.dataCoord, collectionID, partitionID)
|
vChannelInfos, binlogs, err := getRecoveryInfo(lbt.ctx, lbt.dataCoord, collectionID, partitionID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: getRecoveryInfo failed", zap.Int64("collectionID", collectionID), zap.Int64("partitionID", partitionID), zap.Error(err))
|
log.Error("loadBalanceTask: getRecoveryInfo failed", zap.Int64("collectionID", collectionID), zap.Int64("partitionID", partitionID), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
panic(err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, segmentBingLog := range binlogs {
|
for _, segmentBingLog := range binlogs {
|
||||||
|
@ -1755,8 +1728,7 @@ func (lbt *loadBalanceTask) execute(ctx context.Context) error {
|
||||||
deltaChannel, err := generateWatchDeltaChannelInfo(info)
|
deltaChannel, err := generateWatchDeltaChannelInfo(info)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: generateWatchDeltaChannelInfo failed", zap.Int64("collectionID", collectionID), zap.String("channelName", info.ChannelName), zap.Error(err))
|
log.Error("loadBalanceTask: generateWatchDeltaChannelInfo failed", zap.Int64("collectionID", collectionID), zap.String("channelName", info.ChannelName), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
panic(err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
deltaChannelInfos = append(deltaChannelInfos, deltaChannel)
|
deltaChannelInfos = append(deltaChannelInfos, deltaChannel)
|
||||||
dmChannelInfos = append(dmChannelInfos, info)
|
dmChannelInfos = append(dmChannelInfos, info)
|
||||||
|
@ -1768,8 +1740,7 @@ func (lbt *loadBalanceTask) execute(ctx context.Context) error {
|
||||||
err = lbt.meta.setDeltaChannel(collectionID, mergedDeltaChannel)
|
err = lbt.meta.setDeltaChannel(collectionID, mergedDeltaChannel)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: set delta channel info meta failed", zap.Int64("collectionID", collectionID), zap.Error(err))
|
log.Error("loadBalanceTask: set delta channel info meta failed", zap.Int64("collectionID", collectionID), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
panic(err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mergedDmChannel := mergeDmChannelInfo(dmChannelInfos)
|
mergedDmChannel := mergeDmChannelInfo(dmChannelInfos)
|
||||||
|
@ -1794,9 +1765,8 @@ func (lbt *loadBalanceTask) execute(ctx context.Context) error {
|
||||||
}
|
}
|
||||||
internalTasks, err := assignInternalTask(ctx, lbt, lbt.meta, lbt.cluster, loadSegmentReqs, watchDmChannelReqs, true, lbt.SourceNodeIDs, lbt.DstNodeIDs)
|
internalTasks, err := assignInternalTask(ctx, lbt, lbt.meta, lbt.cluster, loadSegmentReqs, watchDmChannelReqs, true, lbt.SourceNodeIDs, lbt.DstNodeIDs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("loadBalanceTask: assign child task failed", zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs))
|
log.Error("loadBalanceTask: assign child task failed", zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs))
|
||||||
lbt.setResultInfo(err)
|
panic(err)
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
for _, internalTask := range internalTasks {
|
for _, internalTask := range internalTasks {
|
||||||
lbt.addChildTask(internalTask)
|
lbt.addChildTask(internalTask)
|
||||||
|
@ -2125,6 +2095,28 @@ func getRecoveryInfo(ctx context.Context, dataCoord types.DataCoord, collectionI
|
||||||
return recoveryInfo.Channels, recoveryInfo.Binlogs, nil
|
return recoveryInfo.Channels, recoveryInfo.Binlogs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func showPartitions(ctx context.Context, collectionID UniqueID, rootCoord types.RootCoord) ([]UniqueID, error) {
|
||||||
|
ctx2, cancel2 := context.WithTimeout(ctx, timeoutForRPC)
|
||||||
|
defer cancel2()
|
||||||
|
showPartitionRequest := &milvuspb.ShowPartitionsRequest{
|
||||||
|
Base: &commonpb.MsgBase{
|
||||||
|
MsgType: commonpb.MsgType_ShowPartitions,
|
||||||
|
},
|
||||||
|
CollectionID: collectionID,
|
||||||
|
}
|
||||||
|
showPartitionResponse, err := rootCoord.ShowPartitions(ctx2, showPartitionRequest)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if showPartitionResponse.Status.ErrorCode != commonpb.ErrorCode_Success {
|
||||||
|
err = errors.New(showPartitionResponse.Status.Reason)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return showPartitionResponse.PartitionIDs, nil
|
||||||
|
}
|
||||||
|
|
||||||
func mergeDmChannelInfo(infos []*datapb.VchannelInfo) map[string]*datapb.VchannelInfo {
|
func mergeDmChannelInfo(infos []*datapb.VchannelInfo) map[string]*datapb.VchannelInfo {
|
||||||
minPositions := make(map[string]*datapb.VchannelInfo)
|
minPositions := make(map[string]*datapb.VchannelInfo)
|
||||||
for _, info := range infos {
|
for _, info := range infos {
|
||||||
|
|
|
@ -1299,3 +1299,15 @@ func TestUpdateTaskProcessWhenWatchDmChannel(t *testing.T) {
|
||||||
err = removeAllSession()
|
err = removeAllSession()
|
||||||
assert.Nil(t, err)
|
assert.Nil(t, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestShowPartitions(t *testing.T) {
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
rootCoord := newRootCoordMock()
|
||||||
|
rootCoord.createCollection(defaultCollectionID)
|
||||||
|
rootCoord.createPartition(defaultCollectionID, defaultPartitionID)
|
||||||
|
|
||||||
|
partitionIDs, err := showPartitions(ctx, defaultCollectionID, rootCoord)
|
||||||
|
assert.Nil(t, err)
|
||||||
|
assert.Equal(t, 2, len(partitionIDs))
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue