Fix recovery of multiple collections after a query node goes down (#13952)

Signed-off-by: xige-16 <xi.ge@zilliz.com>
pull/13980/head
xige-16 2021-12-22 16:29:06 +08:00 committed by GitHub
parent 526715aee3
commit d39a4a3f2d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 50 additions and 20 deletions

View File

@ -1735,29 +1735,23 @@ func (lbt *loadBalanceTask) execute(ctx context.Context) error {
}
mergedDmChannel := mergeDmChannelInfo(dmChannelInfos)
for channelName := range dmChannel2WatchInfo {
vChannelInfo, ok := mergedDmChannel[channelName]
if !ok {
err = fmt.Errorf("loadBalanceTask: can't get recovery info from data coord, channel name = %s", channelName)
log.Error(err.Error())
lbt.setResultInfo(err)
return err
}
for channelName, vChannelInfo := range mergedDmChannel {
if _, ok := dmChannel2WatchInfo[channelName]; ok {
msgBase := proto.Clone(lbt.Base).(*commonpb.MsgBase)
msgBase.MsgType = commonpb.MsgType_WatchDmChannels
watchRequest := &querypb.WatchDmChannelsRequest{
Base: msgBase,
CollectionID: collectionID,
Infos: []*datapb.VchannelInfo{vChannelInfo},
Schema: schema,
}
msgBase := proto.Clone(lbt.Base).(*commonpb.MsgBase)
msgBase.MsgType = commonpb.MsgType_WatchDmChannels
watchRequest := &querypb.WatchDmChannelsRequest{
Base: msgBase,
CollectionID: collectionID,
Infos: []*datapb.VchannelInfo{vChannelInfo},
Schema: schema,
}
if collectionInfo.LoadType == querypb.LoadType_LoadPartition {
watchRequest.PartitionIDs = toRecoverPartitionIDs
}
if collectionInfo.LoadType == querypb.LoadType_LoadPartition {
watchRequest.PartitionIDs = toRecoverPartitionIDs
watchDmChannelReqs = append(watchDmChannelReqs, watchRequest)
}
watchDmChannelReqs = append(watchDmChannelReqs, watchRequest)
}
}
internalTasks, err := assignInternalTask(ctx, lbt, lbt.meta, lbt.cluster, loadSegmentReqs, watchDmChannelReqs, true, lbt.SourceNodeIDs, lbt.DstNodeIDs)

View File

@ -1036,6 +1036,42 @@ func TestLoadBalanceIndexedSegmentsAfterNodeDown(t *testing.T) {
assert.Nil(t, err)
}
// TestLoadBalancePartitionAfterNodeDown loads a partition while a single query
// node is online, brings up a second query node, removes the first node's
// session, and then waits until the coordinator's meta no longer lists any
// segment for the first node.
func TestLoadBalancePartitionAfterNodeDown(t *testing.T) {
	refreshParams()
	ctx := context.Background()

	coord, err := startQueryCoord(ctx)
	assert.Nil(t, err)

	// First query node: the only one online when the partition is loaded.
	downNode, err := startQueryNodeServer(ctx)
	assert.Nil(t, err)
	waitQueryNodeOnline(coord.cluster, downNode.queryNodeID)

	loadTask := genLoadPartitionTask(ctx, coord)
	assert.Nil(t, coord.scheduler.Enqueue(loadTask))
	waitTaskFinalState(loadTask, taskExpired)

	// Second query node: started only after the load task has been waited on.
	survivor, err := startQueryNodeServer(ctx)
	assert.Nil(t, err)
	waitQueryNodeOnline(coord.cluster, survivor.queryNodeID)

	// Replace the coordinator's index coord client with a mock that has
	// returnIndexFile enabled.
	idxMock := newIndexCoordMock()
	idxMock.returnIndexFile = true
	coord.indexCoordClient = idxMock

	// NOTE(review): presumably this simulates the first node going down so that
	// a load-balance task is triggered — confirm against removeNodeSession.
	removeNodeSession(downNode.queryNodeID)

	// Spin until meta reports no segments left on the first node.
	// NOTE(review): this poll has neither a sleep nor a timeout, so it blocks
	// for as long as the condition stays true.
	for len(coord.meta.getSegmentInfosByNode(downNode.queryNodeID)) != 0 {
	}

	survivor.stop()
	coord.Stop()
	assert.Nil(t, removeAllSession())
}
func TestMergeWatchDeltaChannelInfo(t *testing.T) {
infos := []*datapb.VchannelInfo{
{