// Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package observers import ( "context" "sync" "time" "go.uber.org/zap" "github.com/milvus-io/milvus/internal/coordinator/snmanager" "github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/params" "github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/util/streamingutil" "github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/util/syncutil" "github.com/milvus-io/milvus/pkg/v2/util/typeutil" ) // check replica, find read only nodes and remove it from replica if all segment/channel has been moved type ReplicaObserver struct { cancel context.CancelFunc wg sync.WaitGroup meta *meta.Meta distMgr *meta.DistributionManager startOnce sync.Once stopOnce sync.Once } func NewReplicaObserver(meta *meta.Meta, distMgr *meta.DistributionManager) *ReplicaObserver { return &ReplicaObserver{ meta: meta, distMgr: distMgr, } } func (ob *ReplicaObserver) Start() { ob.startOnce.Do(func() { ctx, cancel := context.WithCancel(context.Background()) ob.cancel = cancel ob.wg.Add(1) go ob.schedule(ctx) if streamingutil.IsStreamingServiceEnabled() { ob.wg.Add(1) go ob.scheduleStreamingQN(ctx) } }) } func (ob *ReplicaObserver) Stop() { ob.stopOnce.Do(func() { if ob.cancel != nil { ob.cancel() } ob.wg.Wait() }) } func (ob *ReplicaObserver) schedule(ctx context.Context) { defer ob.wg.Done() log.Info("Start check replica loop") listener := ob.meta.ResourceManager.ListenNodeChanged(ctx) for { ob.waitNodeChangedOrTimeout(ctx, listener) // stop if the context is canceled. if ctx.Err() != nil { log.Info("Stop check replica observer") return } // do check once. ob.checkNodesInReplica() } } // scheduleStreamingQN is used to check streaming query node in replica func (ob *ReplicaObserver) scheduleStreamingQN(ctx context.Context) { defer ob.wg.Done() log.Info("Start streaming query node check replica loop") listener := snmanager.StaticStreamingNodeManager.ListenNodeChanged() for { ob.waitNodeChangedOrTimeout(ctx, listener) if ctx.Err() != nil { log.Info("Stop streaming query node check replica observer") return } ids := snmanager.StaticStreamingNodeManager.GetStreamingQueryNodeIDs() ob.checkStreamingQueryNodesInReplica(ids) } } func (ob *ReplicaObserver) waitNodeChangedOrTimeout(ctx context.Context, listener *syncutil.VersionedListener) { ctxWithTimeout, cancel := context.WithTimeout(ctx, params.Params.QueryCoordCfg.CheckNodeInReplicaInterval.GetAsDuration(time.Second)) defer cancel() listener.Wait(ctxWithTimeout) } func (ob *ReplicaObserver) checkStreamingQueryNodesInReplica(sqNodeIDs typeutil.UniqueSet) { ctx := context.Background() log := log.Ctx(ctx).WithRateGroup("qcv2.replicaObserver", 1, 60) collections := ob.meta.GetAll(context.Background()) for _, collectionID := range collections { ob.meta.RecoverSQNodesInCollection(context.Background(), collectionID, sqNodeIDs) } for _, collectionID := range collections { replicas := ob.meta.ReplicaManager.GetByCollection(ctx, collectionID) for _, replica := range replicas { roSQNodes := replica.GetROSQNodes() rwSQNodes := replica.GetRWSQNodes() if len(roSQNodes) == 0 { continue } removeNodes := make([]int64, 0, len(roSQNodes)) for _, node := range roSQNodes { channels := ob.distMgr.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(node)) segments := ob.distMgr.SegmentDistManager.GetByFilter(meta.WithCollectionID(collectionID), meta.WithNodeID(node)) if len(channels) == 0 && len(segments) == 0 { removeNodes = append(removeNodes, node) } } if len(removeNodes) == 0 { continue } logger := log.With( zap.Int64("collectionID", replica.GetCollectionID()), zap.Int64("replicaID", replica.GetID()), zap.Int64s("removedNodes", removeNodes), zap.Int64s("roNodes", roSQNodes), zap.Int64s("rwNodes", rwSQNodes), ) if err := ob.meta.ReplicaManager.RemoveSQNode(ctx, replica.GetID(), removeNodes...); err != nil { logger.Warn("fail to remove streaming query node from replica", zap.Error(err)) continue } logger.Info("all segment/channel has been removed from ro streaming query node, remove it from replica") } } } func (ob *ReplicaObserver) checkNodesInReplica() { ctx := context.Background() log := log.Ctx(ctx).WithRateGroup("qcv2.replicaObserver", 1, 60) collections := ob.meta.GetAll(ctx) for _, collectionID := range collections { utils.RecoverReplicaOfCollection(ctx, ob.meta, collectionID) } // check all ro nodes, remove it from replica if all segment/channel has been moved for _, collectionID := range collections { replicas := ob.meta.ReplicaManager.GetByCollection(ctx, collectionID) for _, replica := range replicas { roNodes := replica.GetRONodes() rwNodes := replica.GetRWNodes() if len(roNodes) == 0 { continue } log.RatedInfo(10, "found ro nodes in replica", zap.Int64("collectionID", replica.GetCollectionID()), zap.Int64("replicaID", replica.GetID()), zap.Int64s("RONodes", roNodes), ) removeNodes := make([]int64, 0, len(roNodes)) for _, node := range roNodes { channels := ob.distMgr.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(node)) segments := ob.distMgr.SegmentDistManager.GetByFilter(meta.WithCollectionID(collectionID), meta.WithNodeID(node)) if len(channels) == 0 && len(segments) == 0 { removeNodes = append(removeNodes, node) } } if len(removeNodes) == 0 { continue } logger := log.With( zap.Int64("collectionID", replica.GetCollectionID()), zap.Int64("replicaID", replica.GetID()), zap.Int64s("removedNodes", removeNodes), zap.Int64s("roNodes", roNodes), zap.Int64s("rwNodes", rwNodes), ) if err := ob.meta.ReplicaManager.RemoveNode(ctx, replica.GetID(), removeNodes...); err != nil { logger.Warn("fail to remove node from replica", zap.Error(err)) continue } logger.Info("all segment/channel has been removed from ro node, remove it from replica") } } }