mirror of https://github.com/milvus-io/milvus.git
Clean segments as releasing collection (#17932)
Signed-off-by: yah01 <yang.cen@zilliz.com>pull/18021/head
parent
f33b090819
commit
27eca2881a
|
@ -182,6 +182,11 @@ func (m *MetaReplica) reloadFromKV() error {
|
||||||
if err := m.segmentsInfo.loadSegments(); err != nil {
|
if err := m.segmentsInfo.loadSegments(); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
for id, segment := range m.segmentsInfo.segmentIDMap {
|
||||||
|
if _, ok := m.collectionInfos[segment.CollectionID]; !ok {
|
||||||
|
delete(m.segmentsInfo.segmentIDMap, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
deltaChannelKeys, deltaChannelValues, err := m.getKvClient().LoadWithPrefix(deltaChannelMetaPrefix)
|
deltaChannelKeys, deltaChannelValues, err := m.getKvClient().LoadWithPrefix(deltaChannelMetaPrefix)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -524,6 +529,14 @@ func (m *MetaReplica) releaseCollection(collectionID UniqueID) error {
|
||||||
|
|
||||||
m.replicas.Remove(collection.ReplicaIds...)
|
m.replicas.Remove(collection.ReplicaIds...)
|
||||||
|
|
||||||
|
m.segmentsInfo.mu.Lock()
|
||||||
|
for id, segment := range m.segmentsInfo.segmentIDMap {
|
||||||
|
if segment.CollectionID == collectionID {
|
||||||
|
delete(m.segmentsInfo.segmentIDMap, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.segmentsInfo.mu.Unlock()
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1182,6 +1195,9 @@ func removeCollectionMeta(collectionID UniqueID, replicas []UniqueID, kv kv.Meta
|
||||||
prefixes = append(prefixes, replicaPrefix)
|
prefixes = append(prefixes, replicaPrefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prefixes = append(prefixes,
|
||||||
|
fmt.Sprintf("%s/%d", util.SegmentMetaPrefix, collectionID))
|
||||||
|
|
||||||
return kv.MultiRemoveWithPrefix(prefixes)
|
return kv.MultiRemoveWithPrefix(prefixes)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -319,7 +319,8 @@ func TestReloadMetaFromKV(t *testing.T) {
|
||||||
kvs[collectionKey] = string(collectionBlobs)
|
kvs[collectionKey] = string(collectionBlobs)
|
||||||
|
|
||||||
segmentInfo := &querypb.SegmentInfo{
|
segmentInfo := &querypb.SegmentInfo{
|
||||||
SegmentID: defaultSegmentID,
|
SegmentID: defaultSegmentID,
|
||||||
|
CollectionID: defaultCollectionID,
|
||||||
}
|
}
|
||||||
segmentBlobs, err := proto.Marshal(segmentInfo)
|
segmentBlobs, err := proto.Marshal(segmentInfo)
|
||||||
assert.Nil(t, err)
|
assert.Nil(t, err)
|
||||||
|
|
|
@ -1913,7 +1913,14 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
recoveredCollectionIDs.Insert(watchInfo.CollectionID)
|
recoveredCollectionIDs.Insert(watchInfo.CollectionID)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log.Debug("loadBalanceTask: ready to process segments and channels on offline node",
|
||||||
|
zap.Int64("taskID", lbt.getTaskID()),
|
||||||
|
zap.Int("segmentNum", len(segments)),
|
||||||
|
zap.Int("dmChannelNum", len(dmChannels)))
|
||||||
|
|
||||||
if len(segments) == 0 && len(dmChannels) == 0 {
|
if len(segments) == 0 && len(dmChannels) == 0 {
|
||||||
|
log.Debug("loadBalanceTask: no segment/channel on this offline node, skip it",
|
||||||
|
zap.Int64("taskID", lbt.getTaskID()))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1922,7 +1929,7 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
watchDmChannelReqs := make([]*querypb.WatchDmChannelsRequest, 0)
|
watchDmChannelReqs := make([]*querypb.WatchDmChannelsRequest, 0)
|
||||||
collectionInfo, err := lbt.meta.getCollectionInfoByID(collectionID)
|
collectionInfo, err := lbt.meta.getCollectionInfoByID(collectionID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: get collectionInfo from meta failed", zap.Int64("collectionID", collectionID), zap.Error(err))
|
log.Error("loadBalanceTask: get collectionInfo from meta failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
lbt.setResultInfo(err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -1934,25 +1941,25 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
if collectionInfo.LoadType == querypb.LoadType_LoadCollection {
|
if collectionInfo.LoadType == querypb.LoadType_LoadCollection {
|
||||||
toRecoverPartitionIDs, err = lbt.broker.showPartitionIDs(ctx, collectionID)
|
toRecoverPartitionIDs, err = lbt.broker.showPartitionIDs(ctx, collectionID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: show collection's partitionIDs failed", zap.Int64("collectionID", collectionID), zap.Error(err))
|
log.Error("loadBalanceTask: show collection's partitionIDs failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
lbt.setResultInfo(err)
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
toRecoverPartitionIDs = collectionInfo.PartitionIDs
|
toRecoverPartitionIDs = collectionInfo.PartitionIDs
|
||||||
}
|
}
|
||||||
log.Info("loadBalanceTask: get collection's all partitionIDs", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", toRecoverPartitionIDs))
|
log.Info("loadBalanceTask: get collection's all partitionIDs", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", toRecoverPartitionIDs))
|
||||||
replica, err := lbt.getReplica(nodeID, collectionID)
|
replica, err := lbt.getReplica(nodeID, collectionID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// getReplica maybe failed, it will cause the balanceTask execute infinitely
|
// getReplica maybe failed, it will cause the balanceTask execute infinitely
|
||||||
log.Warn("loadBalanceTask: get replica failed", zap.Int64("collectionID", collectionID), zap.Int64("nodeId", nodeID))
|
log.Warn("loadBalanceTask: get replica failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.Int64("nodeId", nodeID))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, partitionID := range toRecoverPartitionIDs {
|
for _, partitionID := range toRecoverPartitionIDs {
|
||||||
vChannelInfos, binlogs, err := lbt.broker.getRecoveryInfo(lbt.ctx, collectionID, partitionID)
|
vChannelInfos, binlogs, err := lbt.broker.getRecoveryInfo(lbt.ctx, collectionID, partitionID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: getRecoveryInfo failed", zap.Int64("collectionID", collectionID), zap.Int64("partitionID", partitionID), zap.Error(err))
|
log.Error("loadBalanceTask: getRecoveryInfo failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.Int64("partitionID", partitionID), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
lbt.setResultInfo(err)
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
@ -1986,7 +1993,7 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
for _, info := range vChannelInfos {
|
for _, info := range vChannelInfos {
|
||||||
deltaChannel, err := generateWatchDeltaChannelInfo(info)
|
deltaChannel, err := generateWatchDeltaChannelInfo(info)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: generateWatchDeltaChannelInfo failed", zap.Int64("collectionID", collectionID), zap.String("channelName", info.ChannelName), zap.Error(err))
|
log.Error("loadBalanceTask: generateWatchDeltaChannelInfo failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.String("channelName", info.ChannelName), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
lbt.setResultInfo(err)
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
@ -1999,7 +2006,7 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
// If meta is not updated here, deltaChannel meta will not be available when loadSegment reschedule
|
// If meta is not updated here, deltaChannel meta will not be available when loadSegment reschedule
|
||||||
err = lbt.meta.setDeltaChannel(collectionID, mergedDeltaChannel)
|
err = lbt.meta.setDeltaChannel(collectionID, mergedDeltaChannel)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: set delta channel info meta failed", zap.Int64("collectionID", collectionID), zap.Error(err))
|
log.Error("loadBalanceTask: set delta channel info meta failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("collectionID", collectionID), zap.Error(err))
|
||||||
lbt.setResultInfo(err)
|
lbt.setResultInfo(err)
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
@ -2038,7 +2045,7 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
|
|
||||||
tasks, err := assignInternalTask(ctx, lbt, lbt.meta, lbt.cluster, loadSegmentReqs, watchDmChannelReqs, false, lbt.SourceNodeIDs, lbt.DstNodeIDs, replica.GetReplicaID(), lbt.broker)
|
tasks, err := assignInternalTask(ctx, lbt, lbt.meta, lbt.cluster, loadSegmentReqs, watchDmChannelReqs, false, lbt.SourceNodeIDs, lbt.DstNodeIDs, replica.GetReplicaID(), lbt.broker)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("loadBalanceTask: assign child task failed", zap.Int64("sourceNodeID", nodeID))
|
log.Error("loadBalanceTask: assign child task failed", zap.Int64("taskID", lbt.getTaskID()), zap.Int64("sourceNodeID", nodeID))
|
||||||
lbt.setResultInfo(err)
|
lbt.setResultInfo(err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -2047,9 +2054,9 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
|
||||||
}
|
}
|
||||||
for _, internalTask := range internalTasks {
|
for _, internalTask := range internalTasks {
|
||||||
lbt.addChildTask(internalTask)
|
lbt.addChildTask(internalTask)
|
||||||
log.Info("loadBalanceTask: add a childTask", zap.String("task type", internalTask.msgType().String()), zap.Any("task", internalTask))
|
log.Info("loadBalanceTask: add a childTask", zap.Int64("taskID", lbt.getTaskID()), zap.String("task type", internalTask.msgType().String()), zap.Any("task", internalTask))
|
||||||
}
|
}
|
||||||
log.Info("loadBalanceTask: assign child task done", zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs))
|
log.Info("loadBalanceTask: assign child task done", zap.Int64("taskID", lbt.getTaskID()), zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs))
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -2250,7 +2257,8 @@ func (lbt *loadBalanceTask) postExecute(context.Context) error {
|
||||||
zap.Int32("trigger type", int32(lbt.triggerCondition)),
|
zap.Int32("trigger type", int32(lbt.triggerCondition)),
|
||||||
zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs),
|
zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs),
|
||||||
zap.Any("balanceReason", lbt.BalanceReason),
|
zap.Any("balanceReason", lbt.BalanceReason),
|
||||||
zap.Int64("taskID", lbt.getTaskID()))
|
zap.Int64("taskID", lbt.getTaskID()),
|
||||||
|
zap.Int("childTaskNum", len(lbt.childTasks)))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2285,16 +2293,20 @@ func (lbt *loadBalanceTask) globalPostExecute(ctx context.Context) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("removing offline nodes from replicas and segments...",
|
log.Debug("removing offline nodes from replicas and segments...",
|
||||||
zap.Int("replicaNum", len(replicas)),
|
|
||||||
zap.Int("segmentNum", len(segments)),
|
|
||||||
zap.Int64("triggerTaskID", lbt.getTaskID()),
|
zap.Int64("triggerTaskID", lbt.getTaskID()),
|
||||||
)
|
zap.Int("replicaNum", len(replicas)),
|
||||||
|
zap.Int("segmentNum", len(segments)))
|
||||||
|
|
||||||
wg := errgroup.Group{}
|
wg := errgroup.Group{}
|
||||||
// Remove offline nodes from replica
|
// Remove offline nodes from replica
|
||||||
for replicaID := range replicas {
|
for replicaID := range replicas {
|
||||||
replicaID := replicaID
|
replicaID := replicaID
|
||||||
wg.Go(func() error {
|
wg.Go(func() error {
|
||||||
|
log.Debug("remove offline nodes from replica",
|
||||||
|
zap.Int64("taskID", lbt.taskID),
|
||||||
|
zap.Int64("replicaID", replicaID),
|
||||||
|
zap.Int64s("offlineNodes", lbt.SourceNodeIDs))
|
||||||
|
|
||||||
return lbt.meta.applyReplicaBalancePlan(
|
return lbt.meta.applyReplicaBalancePlan(
|
||||||
NewRemoveBalancePlan(replicaID, lbt.SourceNodeIDs...))
|
NewRemoveBalancePlan(replicaID, lbt.SourceNodeIDs...))
|
||||||
})
|
})
|
||||||
|
|
|
@ -136,6 +136,9 @@ func syncReplicaSegments(ctx context.Context, meta Meta, cluster Cluster, replic
|
||||||
|
|
||||||
for _, shard := range replica.ShardReplicas {
|
for _, shard := range replica.ShardReplicas {
|
||||||
if len(shards) > 0 && !isInShards(shard.DmChannelName, shards) {
|
if len(shards) > 0 && !isInShards(shard.DmChannelName, shards) {
|
||||||
|
log.Debug("skip this shard",
|
||||||
|
zap.Int64("replicaID", replicaID),
|
||||||
|
zap.String("shard", shard.DmChannelName))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -155,6 +158,12 @@ func syncReplicaSegments(ctx context.Context, meta Meta, cluster Cluster, replic
|
||||||
|
|
||||||
for i, j := 0, 0; i < len(segments); i = j {
|
for i, j := 0, 0; i < len(segments); i = j {
|
||||||
node := getNodeInReplica(replica, segments[i].NodeIds)
|
node := getNodeInReplica(replica, segments[i].NodeIds)
|
||||||
|
if node < 0 {
|
||||||
|
log.Warn("syncReplicaSegments: no segment node in replica",
|
||||||
|
zap.Int64("replicaID", replicaID),
|
||||||
|
zap.Any("segment", segments[i]))
|
||||||
|
}
|
||||||
|
|
||||||
partition := segments[i].PartitionID
|
partition := segments[i].PartitionID
|
||||||
|
|
||||||
j++
|
j++
|
||||||
|
@ -207,7 +216,7 @@ func getNodeInReplica(replica *milvuspb.ReplicaInfo, nodes []UniqueID) UniqueID
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
func removeFromSlice(origin []UniqueID, del ...UniqueID) []UniqueID {
|
func removeFromSlice(origin []UniqueID, del ...UniqueID) []UniqueID {
|
||||||
|
|
Loading…
Reference in New Issue