Fix: the current target may be updated to an unavailable target when a node goes down (#21743)

Signed-off-by: yah01 <yang.cen@zilliz.com>
yah01 2023-01-16 18:47:42 +08:00 committed by GitHub
parent 7e2121e608
commit 3a5f38b1df
7 changed files with 170 additions and 44 deletions
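
In short, the node-down path no longer writes to the TargetManager directly; it asks the TargetObserver to refresh the next target and gets back a channel that is closed only after the refreshed target has been promoted to the current target. A condensed sketch of that handshake, assembled from the hunks below (locking and some error handling are trimmed here; the full code is in the diff):

    // Request sent to the observer loop.
    type targetUpdateRequest struct {
        CollectionID  int64
        Notifier      chan error    // result of pulling the next target
        ReadyNotifier chan struct{} // closed once the current target is updated
    }

    // Caller side: trigger an update and receive a readiness channel.
    func (ob *TargetObserver) UpdateNextTarget(collectionID int64) (chan struct{}, error) {
        notifier := make(chan error)
        readyCh := make(chan struct{})
        defer close(notifier)
        ob.updateChan <- targetUpdateRequest{CollectionID: collectionID, Notifier: notifier, ReadyNotifier: readyCh}
        return readyCh, <-notifier
    }

    // Observer side: once the next target is promoted, close all pending notifiers.
    func (ob *TargetObserver) updateCurrentTarget(collectionID int64) {
        ob.targetMgr.UpdateCollectionCurrentTarget(collectionID)
        for _, notifier := range ob.readyNotifiers[collectionID] {
            close(notifier)
        }
        ob.readyNotifiers[collectionID] = ob.readyNotifiers[collectionID][:0]
    }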


@@ -28,6 +28,7 @@ import (
     "github.com/milvus-io/milvus/internal/metrics"
     "github.com/milvus-io/milvus/internal/proto/querypb"
     "github.com/milvus-io/milvus/internal/querycoordv2/meta"
+    "github.com/milvus-io/milvus/internal/querycoordv2/observers"
     "github.com/milvus-io/milvus/internal/querycoordv2/session"
     "github.com/milvus-io/milvus/internal/querycoordv2/utils"
     "github.com/milvus-io/milvus/internal/util/typeutil"
@@ -250,10 +251,11 @@ func (job *LoadCollectionJob) PostExecute() {
 type ReleaseCollectionJob struct {
     *BaseJob
     req            *querypb.ReleaseCollectionRequest
     dist           *meta.DistributionManager
     meta           *meta.Meta
     targetMgr      *meta.TargetManager
+    targetObserver *observers.TargetObserver
 }

 func NewReleaseCollectionJob(ctx context.Context,
@@ -261,13 +263,15 @@ func NewReleaseCollectionJob(ctx context.Context,
     dist *meta.DistributionManager,
     meta *meta.Meta,
     targetMgr *meta.TargetManager,
+    targetObserver *observers.TargetObserver,
 ) *ReleaseCollectionJob {
     return &ReleaseCollectionJob{
         BaseJob:        NewBaseJob(ctx, req.Base.GetMsgID(), req.GetCollectionID()),
         req:            req,
         dist:           dist,
         meta:           meta,
         targetMgr:      targetMgr,
+        targetObserver: targetObserver,
     }
 }
@@ -295,6 +299,7 @@ func (job *ReleaseCollectionJob) Execute() error {
     }
     job.targetMgr.RemoveCollection(req.GetCollectionID())
+    job.targetObserver.ReleaseCollection(req.GetCollectionID())
     waitCollectionReleased(job.dist, req.GetCollectionID())
     metrics.QueryCoordNumCollections.WithLabelValues().Dec()
     return nil
@@ -455,10 +460,11 @@ func (job *LoadPartitionJob) PostExecute() {
 type ReleasePartitionJob struct {
     *BaseJob
     req            *querypb.ReleasePartitionsRequest
     dist           *meta.DistributionManager
     meta           *meta.Meta
     targetMgr      *meta.TargetManager
+    targetObserver *observers.TargetObserver
 }

 func NewReleasePartitionJob(ctx context.Context,
@@ -466,13 +472,15 @@ func NewReleasePartitionJob(ctx context.Context,
     dist *meta.DistributionManager,
     meta *meta.Meta,
     targetMgr *meta.TargetManager,
+    targetObserver *observers.TargetObserver,
 ) *ReleasePartitionJob {
     return &ReleasePartitionJob{
         BaseJob:        NewBaseJob(ctx, req.Base.GetMsgID(), req.GetCollectionID()),
         req:            req,
         dist:           dist,
         meta:           meta,
         targetMgr:      targetMgr,
+        targetObserver: targetObserver,
     }
 }
@@ -520,6 +528,7 @@ func (job *ReleasePartitionJob) Execute() error {
             log.Warn("failed to remove replicas", zap.Error(err))
         }
         job.targetMgr.RemoveCollection(req.GetCollectionID())
+        job.targetObserver.ReleaseCollection(req.GetCollectionID())
         waitCollectionReleased(job.dist, req.GetCollectionID())
     } else {
         err := job.meta.CollectionManager.RemovePartition(toRelease...)


@@ -29,6 +29,7 @@ import (
     "github.com/milvus-io/milvus/internal/proto/datapb"
     "github.com/milvus-io/milvus/internal/proto/querypb"
     "github.com/milvus-io/milvus/internal/querycoordv2/meta"
+    "github.com/milvus-io/milvus/internal/querycoordv2/observers"
     . "github.com/milvus-io/milvus/internal/querycoordv2/params"
     "github.com/milvus-io/milvus/internal/querycoordv2/session"
     "github.com/milvus-io/milvus/internal/util/etcd"
@@ -50,13 +51,14 @@ type JobSuite struct {
     loadTypes map[int64]querypb.LoadType

     // Dependencies
     kv             kv.MetaKv
     store          meta.Store
     dist           *meta.DistributionManager
     meta           *meta.Meta
     targetMgr      *meta.TargetManager
+    targetObserver *observers.TargetObserver
     broker         *meta.MockBroker
     nodeMgr        *session.NodeManager

     // Test objects
     scheduler *Scheduler
@@ -131,6 +133,11 @@ func (suite *JobSuite) SetupTest() {
     suite.dist = meta.NewDistributionManager()
     suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store)
     suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta)
+    suite.targetObserver = observers.NewTargetObserver(suite.meta,
+        suite.targetMgr,
+        suite.dist,
+        suite.broker,
+    )
     suite.nodeMgr = session.NewNodeManager()
     suite.nodeMgr.Add(&session.NodeInfo{})
     suite.scheduler = NewScheduler()
@@ -583,6 +590,7 @@ func (suite *JobSuite) TestReleaseCollection() {
         suite.dist,
         suite.meta,
         suite.targetMgr,
+        suite.targetObserver,
     )
     suite.scheduler.Add(job)
     err := job.Wait()
@@ -601,6 +609,7 @@ func (suite *JobSuite) TestReleaseCollection() {
         suite.dist,
         suite.meta,
         suite.targetMgr,
+        suite.targetObserver,
     )
     suite.scheduler.Add(job)
     err := job.Wait()
@@ -626,6 +635,7 @@ func (suite *JobSuite) TestReleasePartition() {
         suite.dist,
         suite.meta,
         suite.targetMgr,
+        suite.targetObserver,
     )
     suite.scheduler.Add(job)
     err := job.Wait()
@@ -650,6 +660,7 @@ func (suite *JobSuite) TestReleasePartition() {
         suite.dist,
         suite.meta,
         suite.targetMgr,
+        suite.targetObserver,
     )
     suite.scheduler.Add(job)
     err := job.Wait()
@@ -676,6 +687,7 @@ func (suite *JobSuite) TestReleasePartition() {
         suite.dist,
         suite.meta,
         suite.targetMgr,
+        suite.targetObserver,
     )
     suite.scheduler.Add(job)
     err := job.Wait()
@@ -843,6 +855,7 @@ func (suite *JobSuite) releaseAll() {
         suite.dist,
         suite.meta,
         suite.targetMgr,
+        suite.targetObserver,
     )
     suite.scheduler.Add(job)
     err := job.Wait()


@@ -30,6 +30,12 @@ import (
     "github.com/milvus-io/milvus/internal/util/typeutil"
 )

+type targetUpdateRequest struct {
+    CollectionID  int64
+    Notifier      chan error
+    ReadyNotifier chan struct{}
+}
+
 type TargetObserver struct {
     c  chan struct{}
     wg sync.WaitGroup
@@ -39,7 +45,11 @@ type TargetObserver struct {
     broker               meta.Broker
     nextTargetLastUpdate map[int64]time.Time
+    updateChan           chan targetUpdateRequest
+    mut                  sync.Mutex                // Guard readyNotifiers
+    readyNotifiers       map[int64][]chan struct{} // CollectionID -> Notifiers
     stopOnce             sync.Once
 }

 func NewTargetObserver(meta *meta.Meta, targetMgr *meta.TargetManager, distMgr *meta.DistributionManager, broker meta.Broker) *TargetObserver {
@@ -50,6 +60,8 @@ func NewTargetObserver(meta *meta.Meta, targetMgr *meta.TargetManager, distMgr *
         distMgr:              distMgr,
         broker:               broker,
         nextTargetLastUpdate: make(map[int64]time.Time),
+        updateChan:           make(chan targetUpdateRequest),
+        readyNotifiers:       make(map[int64][]chan struct{}),
     }
 }
@@ -80,11 +92,49 @@ func (ob *TargetObserver) schedule(ctx context.Context) {
             return
         case <-ticker.C:
+            ob.clean()
             ob.tryUpdateTarget()
+        case request := <-ob.updateChan:
+            err := ob.updateNextTarget(request.CollectionID)
+            if err != nil {
+                close(request.ReadyNotifier)
+            } else {
+                ob.mut.Lock()
+                ob.readyNotifiers[request.CollectionID] = append(ob.readyNotifiers[request.CollectionID], request.ReadyNotifier)
+                ob.mut.Unlock()
+            }
+            request.Notifier <- err
         }
     }
 }

+// UpdateNextTarget updates the next target,
+// returns a channel which will be closed when the next target is ready,
+// or returns error if failed to pull target
+func (ob *TargetObserver) UpdateNextTarget(collectionID int64) (chan struct{}, error) {
+    notifier := make(chan error)
+    readyCh := make(chan struct{})
+    defer close(notifier)
+
+    ob.updateChan <- targetUpdateRequest{
+        CollectionID:  collectionID,
+        Notifier:      notifier,
+        ReadyNotifier: readyCh,
+    }
+    return readyCh, <-notifier
+}
+
+func (ob *TargetObserver) ReleaseCollection(collectionID int64) {
+    ob.mut.Lock()
+    defer ob.mut.Unlock()
+    for _, notifier := range ob.readyNotifiers[collectionID] {
+        close(notifier)
+    }
+    delete(ob.readyNotifiers, collectionID)
+}
+
 func (ob *TargetObserver) tryUpdateTarget() {
     collections := ob.meta.GetAll()
     for _, collectionID := range collections {
@@ -94,7 +144,7 @@ func (ob *TargetObserver) tryUpdateTarget() {
         if ob.shouldUpdateNextTarget(collectionID) {
             // update next target in collection level
-            ob.UpdateNextTarget(collectionID)
+            ob.updateNextTarget(collectionID)
         }
     }
@@ -107,6 +157,21 @@ func (ob *TargetObserver) tryUpdateTarget() {
     }
 }

+func (ob *TargetObserver) clean() {
+    collections := typeutil.NewSet(ob.meta.GetAll()...)
+
+    ob.mut.Lock()
+    defer ob.mut.Unlock()
+    for collectionID, notifiers := range ob.readyNotifiers {
+        if !collections.Contain(collectionID) {
+            for i := range notifiers {
+                close(notifiers[i])
+            }
+            delete(ob.readyNotifiers, collectionID)
+        }
+    }
+}
+
 func (ob *TargetObserver) shouldUpdateNextTarget(collectionID int64) bool {
     return !ob.targetMgr.IsNextTargetExist(collectionID) || ob.isNextTargetExpired(collectionID)
 }
@@ -115,7 +180,7 @@ func (ob *TargetObserver) isNextTargetExpired(collectionID int64) bool {
     return time.Since(ob.nextTargetLastUpdate[collectionID]) > params.Params.QueryCoordCfg.NextTargetSurviveTime
 }

-func (ob *TargetObserver) UpdateNextTarget(collectionID int64) {
+func (ob *TargetObserver) updateNextTarget(collectionID int64) error {
     log := log.With(zap.Int64("collectionID", collectionID))

     log.Warn("observer trigger update next target")
@@ -123,9 +188,10 @@ func (ob *TargetObserver) UpdateNextTarget(collectionID int64) {
     if err != nil {
         log.Error("failed to update next target for collection",
             zap.Error(err))
-        return
+        return err
     }
     ob.updateNextTargetTimestamp(collectionID)
+    return nil
 }

 func (ob *TargetObserver) updateNextTargetTimestamp(collectionID int64) {
@@ -175,4 +241,15 @@ func (ob *TargetObserver) updateCurrentTarget(collectionID int64) {
     log.Warn("observer trigger update current target",
         zap.Int64("collectionID", collectionID))
     ob.targetMgr.UpdateCollectionCurrentTarget(collectionID)
+
+    ob.mut.Lock()
+    defer ob.mut.Unlock()
+    notifiers := ob.readyNotifiers[collectionID]
+    for _, notifier := range notifiers {
+        close(notifier)
+    }
+    // Reuse the capacity of notifiers slice
+    if notifiers != nil {
+        ob.readyNotifiers[collectionID] = notifiers[:0]
+    }
 }

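For context, here is how a caller might consume the ready channel returned by UpdateNextTarget above. The helper is purely illustrative (its name, the timeout, and the assumed "fmt"/"time" imports are not part of this patch); note that handleNodeDown in this commit simply discards the channel and only checks the error:

    // Hypothetical helper, not part of this patch: trigger a next-target refresh and
    // wait (with a timeout) until the observer promotes it to the current target.
    func waitNextTargetReady(ob *observers.TargetObserver, collectionID int64, timeout time.Duration) error {
        ready, err := ob.UpdateNextTarget(collectionID)
        if err != nil {
            return err // pulling the next target from the broker failed
        }
        select {
        case <-ready:
            // Closed by updateCurrentTarget once the target is promoted, or by
            // ReleaseCollection/clean if the collection is released in the meantime.
            return nil
        case <-time.After(timeout):
            return fmt.Errorf("collection %d: target not ready within %s", collectionID, timeout)
        }
    }
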

@@ -153,15 +153,24 @@ func (suite *TargetObserverSuite) TestTriggerUpdateTarget() {
         SegmentID:     13,
         InsertChannel: "channel-1",
     })
-    suite.broker.EXPECT().GetRecoveryInfo(mock.Anything, mock.Anything, mock.Anything).Return(suite.nextTargetChannels, suite.nextTargetSegments, nil)
-    suite.broker.EXPECT().GetPartitions(mock.Anything, mock.Anything).Return([]int64{suite.partitionID}, nil)
     suite.targetMgr.UpdateCollectionCurrentTarget(suite.collectionID)

     // Pull next again
+    suite.broker.EXPECT().
+        GetRecoveryInfo(mock.Anything, mock.Anything, mock.Anything).
+        Return(suite.nextTargetChannels, suite.nextTargetSegments, nil)
+    suite.broker.EXPECT().
+        GetPartitions(mock.Anything, mock.Anything).
+        Return([]int64{suite.partitionID}, nil)
     suite.Eventually(func() bool {
         return len(suite.targetMgr.GetHistoricalSegmentsByCollection(suite.collectionID, meta.NextTarget)) == 3 &&
             len(suite.targetMgr.GetDmChannelsByCollection(suite.collectionID, meta.NextTarget)) == 2
     }, 7*time.Second, 1*time.Second)
+    suite.broker.AssertExpectations(suite.T())
+
+    // Manually update next target
+    ready, err := suite.observer.UpdateNextTarget(suite.collectionID)
+    suite.NoError(err)

     suite.distMgr.LeaderViewManager.Update(2,
         &meta.LeaderView{
@@ -185,7 +194,14 @@ func (suite *TargetObserverSuite) TestTriggerUpdateTarget() {
     // Able to update current if it's not empty
     suite.Eventually(func() bool {
-        return len(suite.targetMgr.GetHistoricalSegmentsByCollection(suite.collectionID, meta.CurrentTarget)) == 3 &&
+        isReady := false
+        select {
+        case <-ready:
+            isReady = true
+        default:
+        }
+        return isReady &&
+            len(suite.targetMgr.GetHistoricalSegmentsByCollection(suite.collectionID, meta.CurrentTarget)) == 3 &&
             len(suite.targetMgr.GetDmChannelsByCollection(suite.collectionID, meta.CurrentTarget)) == 2
     }, 7*time.Second, 1*time.Second)
 }


@@ -636,7 +636,7 @@ func (s *Server) handleNodeDown(node int64) {
     // are missed, it will recover for a while.
     channels := s.dist.ChannelDistManager.GetByNode(node)
     for _, channel := range channels {
-        err := s.targetMgr.UpdateCollectionNextTarget(channel.GetCollectionID())
+        _, err := s.targetObserver.UpdateNextTarget(channel.GetCollectionID())
        if err != nil {
            msg := "failed to update next targets for collection"
            log.Error(msg,


@@ -258,6 +258,7 @@ func (s *Server) ReleaseCollection(ctx context.Context, req *querypb.ReleaseColl
         s.dist,
         s.meta,
         s.targetMgr,
+        s.targetObserver,
     )
     s.jobScheduler.Add(releaseJob)
     err := releaseJob.Wait()
@@ -345,6 +346,7 @@ func (s *Server) ReleasePartitions(ctx context.Context, req *querypb.ReleasePart
         s.dist,
         s.meta,
         s.targetMgr,
+        s.targetObserver,
     )
     s.jobScheduler.Add(releaseJob)
     err := releaseJob.Wait()


@@ -33,6 +33,7 @@ import (
     "github.com/milvus-io/milvus/internal/querycoordv2/balance"
     "github.com/milvus-io/milvus/internal/querycoordv2/job"
     "github.com/milvus-io/milvus/internal/querycoordv2/meta"
+    "github.com/milvus-io/milvus/internal/querycoordv2/observers"
     "github.com/milvus-io/milvus/internal/querycoordv2/params"
     "github.com/milvus-io/milvus/internal/querycoordv2/session"
     "github.com/milvus-io/milvus/internal/querycoordv2/task"
@@ -59,17 +60,18 @@ type ServiceSuite struct {
     nodes []int64

     // Dependencies
     kv             kv.MetaKv
     store          meta.Store
     dist           *meta.DistributionManager
     meta           *meta.Meta
     targetMgr      *meta.TargetManager
     broker         *meta.MockBroker
+    targetObserver *observers.TargetObserver
     cluster        *session.MockCluster
     nodeMgr        *session.NodeManager
     jobScheduler   *job.Scheduler
     taskScheduler  *task.MockScheduler
     balancer       balance.Balance

     // Test object
     server *Server
@@ -127,6 +129,12 @@ func (suite *ServiceSuite) SetupTest() {
     suite.meta = meta.NewMeta(params.RandomIncrementIDAllocator(), suite.store)
     suite.broker = meta.NewMockBroker(suite.T())
     suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta)
+    suite.targetObserver = observers.NewTargetObserver(
+        suite.meta,
+        suite.targetMgr,
+        suite.dist,
+        suite.broker,
+    )
     suite.nodeMgr = session.NewNodeManager()
     for _, node := range suite.nodes {
         suite.nodeMgr.Add(session.NewNodeInfo(node, "localhost"))
@@ -153,6 +161,7 @@ func (suite *ServiceSuite) SetupTest() {
         meta:           suite.meta,
         targetMgr:      suite.targetMgr,
         broker:         suite.broker,
+        targetObserver: suite.targetObserver,
         nodeMgr:        suite.nodeMgr,
         cluster:        suite.cluster,
         jobScheduler:   suite.jobScheduler,