Clear stale replicas (#20456)

Signed-off-by: yah01 <yang.cen@zilliz.com>

Signed-off-by: yah01 <yang.cen@zilliz.com>
pull/20488/head
yah01 2022-11-10 17:01:03 +08:00 committed by GitHub
parent 174310a14e
commit c71c6378ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 45 additions and 11 deletions

View File

@ -180,8 +180,14 @@ func (job *LoadCollectionJob) Execute() error {
zap.Int64("collectionID", req.GetCollectionID()), zap.Int64("collectionID", req.GetCollectionID()),
) )
// Clear stale replicas
err := job.meta.ReplicaManager.RemoveCollection(req.GetCollectionID())
if err != nil {
log.Warn("failed to clear stale replicas", zap.Error(err))
return err
}
// Create replicas // Create replicas
// TODO(yah01): store replicas and collection atomically
replicas, err := utils.SpawnReplicas(job.meta.ReplicaManager, replicas, err := utils.SpawnReplicas(job.meta.ReplicaManager,
job.nodeMgr, job.nodeMgr,
req.GetCollectionID(), req.GetCollectionID(),
@ -381,8 +387,14 @@ func (job *LoadPartitionJob) Execute() error {
zap.Int64s("partitionIDs", req.GetPartitionIDs()), zap.Int64s("partitionIDs", req.GetPartitionIDs()),
) )
// Clear stale replicas
err := job.meta.ReplicaManager.RemoveCollection(req.GetCollectionID())
if err != nil {
log.Warn("failed to clear stale replicas", zap.Error(err))
return err
}
// Create replicas // Create replicas
// TODO(yah01): store replicas and collection atomically
replicas, err := utils.SpawnReplicas(job.meta.ReplicaManager, replicas, err := utils.SpawnReplicas(job.meta.ReplicaManager,
job.nodeMgr, job.nodeMgr,
req.GetCollectionID(), req.GetCollectionID(),

View File

@ -21,8 +21,11 @@ import (
"sync" "sync"
"github.com/golang/protobuf/proto" "github.com/golang/protobuf/proto"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/util/typeutil"
. "github.com/milvus-io/milvus/internal/util/typeutil" . "github.com/milvus-io/milvus/internal/util/typeutil"
"go.uber.org/zap"
) )
type Replica struct { type Replica struct {
@ -64,15 +67,34 @@ func NewReplicaManager(idAllocator func() (int64, error), store Store) *ReplicaM
} }
// Recover recovers the replicas for given collections from meta store // Recover recovers the replicas for given collections from meta store
func (m *ReplicaManager) Recover() error { func (m *ReplicaManager) Recover(collections []int64) error {
replicas, err := m.store.GetReplicas() replicas, err := m.store.GetReplicas()
if err != nil { if err != nil {
return fmt.Errorf("failed to recover replicas, err=%w", err) return fmt.Errorf("failed to recover replicas, err=%w", err)
} }
collectionSet := typeutil.NewUniqueSet(collections...)
for _, replica := range replicas { for _, replica := range replicas {
m.replicas[replica.GetID()] = &Replica{ if collectionSet.Contain(replica.GetCollectionID()) {
Replica: replica, m.replicas[replica.GetID()] = &Replica{
Nodes: NewUniqueSet(replica.GetNodes()...), Replica: replica,
Nodes: NewUniqueSet(replica.GetNodes()...),
}
log.Info("recover replica",
zap.Int64("collectionID", replica.GetCollectionID()),
zap.Int64("replicaID", replica.GetID()),
zap.Int64s("nodes", replica.GetNodes()),
)
} else {
err := m.store.ReleaseReplica(replica.GetCollectionID(), replica.GetID())
if err != nil {
return err
}
log.Info("clear stale replica",
zap.Int64("collectionID", replica.GetCollectionID()),
zap.Int64("replicaID", replica.GetID()),
zap.Int64s("nodes", replica.GetNodes()),
)
} }
} }
return nil return nil

View File

@ -111,7 +111,7 @@ func (suite *ReplicaManagerSuite) TestRecover() {
// Clear data in memory, and then recover from meta store // Clear data in memory, and then recover from meta store
suite.clearMemory() suite.clearMemory()
mgr.Recover() mgr.Recover(suite.collections)
suite.TestGet() suite.TestGet()
// Test recover from 2.1 meta store // Test recover from 2.1 meta store
@ -125,7 +125,7 @@ func (suite *ReplicaManagerSuite) TestRecover() {
suite.kv.Save(ReplicaMetaPrefixV1+"/2100", string(value)) suite.kv.Save(ReplicaMetaPrefixV1+"/2100", string(value))
suite.clearMemory() suite.clearMemory()
mgr.Recover() mgr.Recover(append(suite.collections, 1000))
replica := mgr.Get(2100) replica := mgr.Get(2100)
suite.NotNil(replica) suite.NotNil(replica)
suite.EqualValues(1000, replica.CollectionID) suite.EqualValues(1000, replica.CollectionID)
@ -148,7 +148,7 @@ func (suite *ReplicaManagerSuite) TestRemove() {
} }
// Check whether the replicas are also removed from meta store // Check whether the replicas are also removed from meta store
mgr.Recover() mgr.Recover(suite.collections)
for _, collection := range suite.collections { for _, collection := range suite.collections {
replicas := mgr.GetByCollection(collection) replicas := mgr.GetByCollection(collection)
suite.Empty(replicas) suite.Empty(replicas)
@ -179,7 +179,7 @@ func (suite *ReplicaManagerSuite) TestNodeManipulate() {
// Check these modifications are applied to meta store // Check these modifications are applied to meta store
suite.clearMemory() suite.clearMemory()
mgr.Recover() mgr.Recover(suite.collections)
for _, collection := range suite.collections { for _, collection := range suite.collections {
replica := mgr.GetByCollectionAndNode(collection, firstNode) replica := mgr.GetByCollectionAndNode(collection, firstNode)
suite.Nil(replica) suite.Nil(replica)

View File

@ -256,7 +256,7 @@ func (s *Server) initMeta() error {
} }
metrics.QueryCoordNumCollections.WithLabelValues().Set(float64(len(s.meta.GetAll()))) metrics.QueryCoordNumCollections.WithLabelValues().Set(float64(len(s.meta.GetAll())))
err = s.meta.ReplicaManager.Recover() err = s.meta.ReplicaManager.Recover(s.meta.CollectionManager.GetAll())
if err != nil { if err != nil {
log.Error("failed to recover replicas") log.Error("failed to recover replicas")
return err return err