Remove offline nodes from replica info after LoadBalanceTask done (#16653)

Signed-off-by: yah01 <yang.cen@zilliz.com>
pull/16675/head
yah01 2022-04-26 18:23:46 +08:00 committed by GitHub
parent a3f7bb5f2d
commit 60f7fef3b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 3 deletions

View File

@ -123,7 +123,12 @@ func shuffleSegmentsToQueryNodeV2(ctx context.Context, reqs []*querypb.LoadSegme
if err != nil {
return err
}
onlineNodeIDs = replica.GetNodeIds()
replicaNodes := replica.GetNodeIds()
for _, nodeID := range replicaNodes {
if ok, err := cluster.isOnline(nodeID); err == nil && ok {
onlineNodeIDs = append(onlineNodeIDs, nodeID)
}
}
}
if len(onlineNodeIDs) == 0 && !wait {
err := errors.New("no online queryNode to allocate")

View File

@ -2206,13 +2206,58 @@ func (lbt *loadBalanceTask) postExecute(context.Context) error {
// then the queryCoord will panic, and the nodeInfo should not be removed immediately
// after queryCoord recovery, the balanceTask will redo
if lbt.triggerCondition == querypb.TriggerCondition_NodeDown && lbt.getResultInfo().ErrorCode == commonpb.ErrorCode_Success {
offlineNodes := make(map[UniqueID]struct{}, len(lbt.SourceNodeIDs))
for _, nodeID := range lbt.SourceNodeIDs {
offlineNodes[nodeID] = struct{}{}
}
replicas := make(map[UniqueID]*milvuspb.ReplicaInfo)
for _, id := range lbt.SourceNodeIDs {
err := lbt.cluster.removeNodeInfo(id)
if err != nil {
// TODO: clear node info after removeNodeInfo failed
log.Error("loadBalanceTask: occur error when removing node info from cluster", zap.Int64("nodeID", id))
log.Warn("loadBalanceTask: occur error when removing node info from cluster",
zap.Int64("nodeID", id),
zap.Error(err))
continue
}
replica, err := lbt.getReplica(id, lbt.CollectionID)
if err != nil {
log.Warn("failed to get replica for removing offline querynode from it",
zap.Int64("querynodeID", id),
zap.Int64("collectionID", lbt.CollectionID),
zap.Error(err))
continue
}
replicas[replica.ReplicaID] = replica
}
log.Debug("removing offline nodes from replicas...",
zap.Int("len(replicas)", len(replicas)))
wg := sync.WaitGroup{}
for _, replica := range replicas {
wg.Add(1)
go func(replica *milvuspb.ReplicaInfo) {
defer wg.Done()
onlineNodes := make([]UniqueID, 0, len(replica.NodeIds))
for _, nodeID := range replica.NodeIds {
if _, ok := offlineNodes[nodeID]; !ok {
onlineNodes = append(onlineNodes, nodeID)
}
}
replica.NodeIds = onlineNodes
err := lbt.meta.setReplicaInfo(replica)
if err != nil {
log.Warn("failed to remove offline nodes from replica info",
zap.Int64("replicaID", replica.ReplicaID),
zap.Error(err))
}
}(replica)
}
wg.Wait()
}
log.Info("loadBalanceTask postExecute done",

View File

@ -925,7 +925,7 @@ func TestLoadBalanceSegmentsTask(t *testing.T) {
})
t.Run("Test LoadBalanceByNode", func(t *testing.T) {
baseTask := newBaseTask(ctx, querypb.TriggerCondition_LoadBalance)
baseTask := newBaseTask(ctx, querypb.TriggerCondition_NodeDown)
loadBalanceTask := &loadBalanceTask{
baseTask: baseTask,
LoadBalanceRequest: &querypb.LoadBalanceRequest{
@ -934,6 +934,7 @@ func TestLoadBalanceSegmentsTask(t *testing.T) {
},
SourceNodeIDs: []int64{node1.queryNodeID},
CollectionID: defaultCollectionID,
BalanceReason: querypb.TriggerCondition_NodeDown,
},
broker: queryCoord.broker,
cluster: queryCoord.cluster,
@ -942,6 +943,7 @@ func TestLoadBalanceSegmentsTask(t *testing.T) {
err = queryCoord.scheduler.Enqueue(loadBalanceTask)
assert.Nil(t, err)
waitTaskFinalState(loadBalanceTask, taskExpired)
assert.Equal(t, commonpb.ErrorCode_Success, loadBalanceTask.result.ErrorCode)
})
t.Run("Test LoadBalanceWithEmptySourceNode", func(t *testing.T) {