milvus/internal/querycoordv2/meta/replica_manager_test.go

620 lines
18 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package meta
import (
"context"
"testing"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"google.golang.org/protobuf/proto"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus/internal/json"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/metastore"
"github.com/milvus-io/milvus/internal/metastore/kv/querycoord"
"github.com/milvus-io/milvus/internal/metastore/mocks"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/pkg/v2/kv"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/etcd"
"github.com/milvus-io/milvus/pkg/v2/util/metricsinfo"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
type collectionLoadConfig struct {
spawnConfig map[string]int
}
func (c *collectionLoadConfig) getTotalSpawn() int {
totalSpawn := 0
for _, spawnNum := range c.spawnConfig {
totalSpawn += spawnNum
}
return totalSpawn
}
// Old replica manager test suite.
type ReplicaManagerSuite struct {
suite.Suite
rgs map[string]typeutil.UniqueSet
collections map[int64]collectionLoadConfig
idAllocator func() (int64, error)
kv kv.MetaKv
catalog metastore.QueryCoordCatalog
mgr *ReplicaManager
ctx context.Context
}
func (suite *ReplicaManagerSuite) SetupSuite() {
paramtable.Init()
suite.rgs = map[string]typeutil.UniqueSet{
"RG1": typeutil.NewUniqueSet(1),
"RG2": typeutil.NewUniqueSet(2, 3),
"RG3": typeutil.NewUniqueSet(4, 5, 6),
}
suite.collections = map[int64]collectionLoadConfig{
100: {
spawnConfig: map[string]int{"RG1": 1},
},
101: {
spawnConfig: map[string]int{"RG2": 2},
},
102: {
spawnConfig: map[string]int{"RG3": 2},
},
103: {
spawnConfig: map[string]int{"RG1": 1, "RG2": 1, "RG3": 1},
},
}
suite.ctx = context.Background()
}
func (suite *ReplicaManagerSuite) SetupTest() {
var err error
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.catalog = querycoord.NewCatalog(suite.kv)
suite.idAllocator = RandomIncrementIDAllocator()
suite.mgr = NewReplicaManager(suite.idAllocator, suite.catalog)
suite.spawnAll()
}
func (suite *ReplicaManagerSuite) TearDownTest() {
suite.kv.Close()
}
func (suite *ReplicaManagerSuite) TestSpawn() {
mgr := suite.mgr
ctx := suite.ctx
mgr.idAllocator = ErrorIDAllocator()
_, err := mgr.Spawn(ctx, 1, map[string]int{DefaultResourceGroupName: 1}, nil)
suite.Error(err)
replicas := mgr.GetByCollection(ctx, 1)
suite.Len(replicas, 0)
mgr.idAllocator = suite.idAllocator
replicas, err = mgr.Spawn(ctx, 1, map[string]int{DefaultResourceGroupName: 1}, []string{"channel1", "channel2"})
suite.NoError(err)
for _, replica := range replicas {
suite.Len(replica.replicaPB.GetChannelNodeInfos(), 0)
}
paramtable.Get().Save(paramtable.Get().QueryCoordCfg.Balancer.Key, ChannelLevelScoreBalancerName)
defer paramtable.Get().Reset(paramtable.Get().QueryCoordCfg.Balancer.Key)
replicas, err = mgr.Spawn(ctx, 2, map[string]int{DefaultResourceGroupName: 1}, []string{"channel1", "channel2"})
suite.NoError(err)
for _, replica := range replicas {
suite.Len(replica.replicaPB.GetChannelNodeInfos(), 2)
}
}
func (suite *ReplicaManagerSuite) TestGet() {
mgr := suite.mgr
ctx := suite.ctx
for collectionID, collectionCfg := range suite.collections {
replicas := mgr.GetByCollection(ctx, collectionID)
replicaNodes := make(map[int64][]int64)
nodes := make([]int64, 0)
for _, replica := range replicas {
suite.Equal(collectionID, replica.GetCollectionID())
suite.Equal(replica, mgr.Get(ctx, replica.GetID()))
suite.Equal(len(replica.replicaPB.GetNodes()), replica.RWNodesCount())
suite.ElementsMatch(replica.replicaPB.GetNodes(), replica.GetNodes())
replicaNodes[replica.GetID()] = replica.GetNodes()
nodes = append(nodes, replica.GetNodes()...)
}
expectedNodes := make([]int64, 0)
for rg := range collectionCfg.spawnConfig {
expectedNodes = append(expectedNodes, suite.rgs[rg].Collect()...)
}
suite.ElementsMatch(nodes, expectedNodes)
for replicaID, nodes := range replicaNodes {
for _, node := range nodes {
replica := mgr.GetByCollectionAndNode(ctx, collectionID, node)
suite.Equal(replicaID, replica.GetID())
}
}
}
}
func (suite *ReplicaManagerSuite) TestGetByNode() {
mgr := suite.mgr
ctx := suite.ctx
randomNodeID := int64(11111)
testReplica1 := newReplica(&querypb.Replica{
CollectionID: 3002,
ID: 10086,
Nodes: []int64{randomNodeID},
ResourceGroup: DefaultResourceGroupName,
})
testReplica2 := newReplica(&querypb.Replica{
CollectionID: 3002,
ID: 10087,
Nodes: []int64{randomNodeID},
ResourceGroup: DefaultResourceGroupName,
})
mgr.Put(ctx, testReplica1, testReplica2)
replicas := mgr.GetByNode(ctx, randomNodeID)
suite.Len(replicas, 2)
}
func (suite *ReplicaManagerSuite) TestRecover() {
mgr := suite.mgr
ctx := suite.ctx
// Clear data in memory, and then recover from meta store
suite.clearMemory()
mgr.Recover(ctx, lo.Keys(suite.collections))
suite.TestGet()
// Test recover from 2.1 meta store
replicaInfo := milvuspb.ReplicaInfo{
ReplicaID: 2100,
CollectionID: 1000,
NodeIds: []int64{1, 2, 3},
}
value, err := proto.Marshal(&replicaInfo)
suite.NoError(err)
suite.kv.Save(ctx, querycoord.ReplicaMetaPrefixV1+"/2100", string(value))
suite.clearMemory()
mgr.Recover(ctx, append(lo.Keys(suite.collections), 1000))
replica := mgr.Get(ctx, 2100)
suite.NotNil(replica)
suite.EqualValues(1000, replica.GetCollectionID())
suite.ElementsMatch([]int64{1, 2, 3}, replica.GetNodes())
suite.Len(replica.GetNodes(), len(replica.GetNodes()))
for _, node := range replica.GetNodes() {
suite.True(replica.Contains(node))
}
}
func (suite *ReplicaManagerSuite) TestRemove() {
mgr := suite.mgr
ctx := suite.ctx
for collection := range suite.collections {
err := mgr.RemoveCollection(ctx, collection)
suite.NoError(err)
replicas := mgr.GetByCollection(ctx, collection)
suite.Empty(replicas)
}
// Check whether the replicas are also removed from meta store
mgr.Recover(ctx, lo.Keys(suite.collections))
for collection := range suite.collections {
replicas := mgr.GetByCollection(ctx, collection)
suite.Empty(replicas)
}
}
func (suite *ReplicaManagerSuite) TestNodeManipulate() {
mgr := suite.mgr
ctx := suite.ctx
// add node into rg.
rgs := map[string]typeutil.UniqueSet{
"RG1": typeutil.NewUniqueSet(1, 7),
"RG2": typeutil.NewUniqueSet(2, 3, 8),
"RG3": typeutil.NewUniqueSet(4, 5, 6, 9),
}
// Add node into rg.
for collectionID, cfg := range suite.collections {
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg := range cfg.spawnConfig {
rgsOfCollection[rg] = rgs[rg]
}
mgr.RecoverNodesInCollection(ctx, collectionID, rgsOfCollection)
for rg := range cfg.spawnConfig {
for _, node := range rgs[rg].Collect() {
replica := mgr.GetByCollectionAndNode(ctx, collectionID, node)
suite.Contains(replica.GetNodes(), node)
}
}
}
// Check these modifications are applied to meta store
suite.clearMemory()
mgr.Recover(ctx, lo.Keys(suite.collections))
for collectionID, cfg := range suite.collections {
for rg := range cfg.spawnConfig {
for _, node := range rgs[rg].Collect() {
replica := mgr.GetByCollectionAndNode(ctx, collectionID, node)
suite.Contains(replica.GetNodes(), node)
}
}
}
}
func (suite *ReplicaManagerSuite) spawnAll() {
mgr := suite.mgr
ctx := suite.ctx
for id, cfg := range suite.collections {
replicas, err := mgr.Spawn(ctx, id, cfg.spawnConfig, nil)
suite.NoError(err)
totalSpawn := 0
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg, spawnNum := range cfg.spawnConfig {
totalSpawn += spawnNum
rgsOfCollection[rg] = suite.rgs[rg]
}
mgr.RecoverNodesInCollection(ctx, id, rgsOfCollection)
suite.Len(replicas, totalSpawn)
}
}
func (suite *ReplicaManagerSuite) TestResourceGroup() {
mgr := NewReplicaManager(suite.idAllocator, suite.catalog)
ctx := suite.ctx
replicas1, err := mgr.Spawn(ctx, int64(1000), map[string]int{DefaultResourceGroupName: 1}, nil)
suite.NoError(err)
suite.NotNil(replicas1)
suite.Len(replicas1, 1)
replica2, err := mgr.Spawn(ctx, int64(2000), map[string]int{DefaultResourceGroupName: 1}, nil)
suite.NoError(err)
suite.NotNil(replica2)
suite.Len(replica2, 1)
replicas := mgr.GetByResourceGroup(ctx, DefaultResourceGroupName)
suite.Len(replicas, 2)
rgNames := mgr.GetResourceGroupByCollection(ctx, int64(1000))
suite.Len(rgNames, 1)
suite.True(rgNames.Contain(DefaultResourceGroupName))
}
func (suite *ReplicaManagerSuite) clearMemory() {
suite.mgr.replicas = make(map[int64]*Replica)
}
type ReplicaManagerV2Suite struct {
suite.Suite
rgs map[string]typeutil.UniqueSet
sqNodes typeutil.UniqueSet
outboundSQNodes []int64
collections map[int64]collectionLoadConfig
kv kv.MetaKv
catalog metastore.QueryCoordCatalog
mgr *ReplicaManager
ctx context.Context
}
func (suite *ReplicaManagerV2Suite) SetupSuite() {
paramtable.Init()
suite.rgs = map[string]typeutil.UniqueSet{
"RG1": typeutil.NewUniqueSet(1),
"RG2": typeutil.NewUniqueSet(2, 3),
"RG3": typeutil.NewUniqueSet(4, 5, 6),
"RG4": typeutil.NewUniqueSet(7, 8, 9, 10),
"RG5": typeutil.NewUniqueSet(11, 12, 13, 14, 15),
}
suite.sqNodes = typeutil.NewUniqueSet(16, 17, 18, 19, 20)
suite.outboundSQNodes = []int64{}
suite.collections = map[int64]collectionLoadConfig{
1000: {
spawnConfig: map[string]int{"RG1": 1},
},
1001: {
spawnConfig: map[string]int{"RG2": 2},
},
1002: {
spawnConfig: map[string]int{"RG3": 2},
},
1003: {
spawnConfig: map[string]int{"RG1": 1, "RG2": 1, "RG3": 1},
},
1004: {
spawnConfig: map[string]int{"RG4": 2, "RG5": 3},
},
1005: {
spawnConfig: map[string]int{"RG4": 3, "RG5": 2},
},
}
var err error
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.catalog = querycoord.NewCatalog(suite.kv)
idAllocator := RandomIncrementIDAllocator()
suite.mgr = NewReplicaManager(idAllocator, suite.catalog)
suite.ctx = context.Background()
}
func (suite *ReplicaManagerV2Suite) TearDownSuite() {
suite.kv.Close()
}
func (suite *ReplicaManagerV2Suite) TestSpawn() {
mgr := suite.mgr
ctx := suite.ctx
for id, cfg := range suite.collections {
replicas, err := mgr.Spawn(ctx, id, cfg.spawnConfig, nil)
suite.NoError(err)
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg := range cfg.spawnConfig {
rgsOfCollection[rg] = suite.rgs[rg]
}
mgr.RecoverNodesInCollection(ctx, id, rgsOfCollection)
mgr.RecoverSQNodesInCollection(ctx, id, suite.sqNodes)
for rg := range cfg.spawnConfig {
for _, node := range suite.rgs[rg].Collect() {
replica := mgr.GetByCollectionAndNode(ctx, id, node)
suite.Contains(replica.GetNodes(), node)
}
}
suite.Len(replicas, cfg.getTotalSpawn())
replicas = mgr.GetByCollection(ctx, id)
suite.Len(replicas, cfg.getTotalSpawn())
}
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) testIfBalanced() {
ctx := suite.ctx
// If balanced
for id := range suite.collections {
replicas := suite.mgr.GetByCollection(ctx, id)
rgToReplica := make(map[string][]*Replica, 0)
for _, r := range replicas {
rgToReplica[r.GetResourceGroup()] = append(rgToReplica[r.GetResourceGroup()], r)
}
maximumSQNodes := -1
minimumSQNodes := -1
sqNodes := make([]int64, 0)
for _, replicas := range rgToReplica {
maximumNodes := -1
minimumNodes := -1
nodes := make([]int64, 0)
for _, r := range replicas {
availableNodes := suite.rgs[r.GetResourceGroup()]
if maximumNodes == -1 || r.RWNodesCount() > maximumNodes {
maximumNodes = r.RWNodesCount()
}
if minimumNodes == -1 || r.RWNodesCount() < minimumNodes {
minimumNodes = r.RWNodesCount()
}
if maximumSQNodes == -1 || r.RWSQNodesCount() > maximumSQNodes {
maximumSQNodes = r.RWSQNodesCount()
}
if minimumSQNodes == -1 || r.RWSQNodesCount() < minimumSQNodes {
minimumSQNodes = r.RWSQNodesCount()
}
nodes = append(nodes, r.GetRWNodes()...)
nodes = append(nodes, r.GetRONodes()...)
sqNodes = append(sqNodes, r.GetRWSQNodes()...)
r.RangeOverRONodes(func(node int64) bool {
if availableNodes.Contain(node) {
nodes = append(nodes, node)
}
return true
})
}
suite.ElementsMatch(nodes, suite.rgs[replicas[0].GetResourceGroup()].Collect())
suite.True(maximumNodes-minimumNodes <= 1)
}
availableSQNodes := suite.sqNodes.Clone()
availableSQNodes.Remove(suite.outboundSQNodes...)
suite.ElementsMatch(availableSQNodes.Collect(), sqNodes)
suite.True(maximumSQNodes-minimumSQNodes <= 1)
}
}
func (suite *ReplicaManagerV2Suite) TestTransferReplica() {
ctx := suite.ctx
// param error
err := suite.mgr.TransferReplica(ctx, 10086, "RG4", "RG5", 1)
suite.Error(err)
err = suite.mgr.TransferReplica(ctx, 1005, "RG4", "RG5", 0)
suite.Error(err)
err = suite.mgr.TransferReplica(ctx, 1005, "RG4", "RG4", 1)
suite.Error(err)
err = suite.mgr.TransferReplica(ctx, 1005, "RG4", "RG5", 1)
suite.NoError(err)
suite.recoverReplica(2, true)
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) TestTransferReplicaAndAddNode() {
ctx := suite.ctx
suite.mgr.TransferReplica(ctx, 1005, "RG4", "RG5", 1)
suite.recoverReplica(1, false)
suite.rgs["RG5"].Insert(16, 17, 18)
suite.sqNodes.Insert(20, 21, 22)
suite.recoverReplica(2, true)
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) TestTransferNode() {
suite.rgs["RG4"].Remove(7)
suite.rgs["RG5"].Insert(7)
suite.outboundSQNodes = []int64{16, 17, 18}
suite.recoverReplica(2, true)
suite.testIfBalanced()
}
func (suite *ReplicaManagerV2Suite) recoverReplica(k int, clearOutbound bool) {
ctx := suite.ctx
// need at least two times to recover the replicas.
// transfer node between replicas need set to outbound and then set to incoming.
for i := 0; i < k; i++ {
// do a recover
for id, cfg := range suite.collections {
rgsOfCollection := make(map[string]typeutil.UniqueSet)
for rg := range cfg.spawnConfig {
rgsOfCollection[rg] = suite.rgs[rg]
}
sqNodes := suite.sqNodes.Clone()
sqNodes.Remove(suite.outboundSQNodes...)
suite.mgr.RecoverNodesInCollection(ctx, id, rgsOfCollection)
suite.mgr.RecoverSQNodesInCollection(ctx, id, sqNodes)
}
// clear all outbound nodes
if clearOutbound {
for id := range suite.collections {
replicas := suite.mgr.GetByCollection(ctx, id)
for _, r := range replicas {
outboundNodes := r.GetRONodes()
suite.mgr.RemoveNode(ctx, r.GetID(), outboundNodes...)
suite.mgr.RemoveSQNode(ctx, r.GetID(), r.GetROSQNodes()...)
}
}
}
}
}
func TestReplicaManager(t *testing.T) {
suite.Run(t, new(ReplicaManagerSuite))
suite.Run(t, new(ReplicaManagerV2Suite))
}
func TestGetReplicasJSON(t *testing.T) {
catalog := mocks.NewQueryCoordCatalog(t)
catalog.EXPECT().SaveReplica(mock.Anything, mock.Anything).Return(nil)
idAllocator := RandomIncrementIDAllocator()
replicaManager := NewReplicaManager(idAllocator, catalog)
ctx := context.Background()
// Add some replicas to the ReplicaManager
replica1 := newReplica(&querypb.Replica{
ID: 1,
CollectionID: 100,
ResourceGroup: "rg1",
Nodes: []int64{1, 2, 3},
})
replica2 := newReplica(&querypb.Replica{
ID: 2,
CollectionID: 200,
ResourceGroup: "rg2",
Nodes: []int64{4, 5, 6},
})
err := replicaManager.put(ctx, replica1)
assert.NoError(t, err)
err = replicaManager.put(ctx, replica2)
assert.NoError(t, err)
meta := &Meta{
CollectionManager: NewCollectionManager(catalog),
}
err = meta.PutCollectionWithoutSave(ctx, &Collection{
CollectionLoadInfo: &querypb.CollectionLoadInfo{
CollectionID: 100,
DbID: int64(1),
},
})
assert.NoError(t, err)
err = meta.PutCollectionWithoutSave(ctx, &Collection{
CollectionLoadInfo: &querypb.CollectionLoadInfo{
CollectionID: 200,
},
})
assert.NoError(t, err)
jsonOutput := replicaManager.GetReplicasJSON(ctx, meta)
var replicas []*metricsinfo.Replica
err = json.Unmarshal([]byte(jsonOutput), &replicas)
assert.NoError(t, err)
assert.Len(t, replicas, 2)
checkResult := func(replica *metricsinfo.Replica) {
if replica.ID == 1 {
assert.Equal(t, int64(100), replica.CollectionID)
assert.Equal(t, "rg1", replica.ResourceGroup)
assert.ElementsMatch(t, []int64{1, 2, 3}, replica.RWNodes)
assert.Equal(t, int64(1), replica.DatabaseID)
} else if replica.ID == 2 {
assert.Equal(t, int64(200), replica.CollectionID)
assert.Equal(t, "rg2", replica.ResourceGroup)
assert.ElementsMatch(t, []int64{4, 5, 6}, replica.RWNodes)
assert.Equal(t, int64(0), replica.DatabaseID)
} else {
assert.Failf(t, "unexpected replica id", "unexpected replica id %d", replica.ID)
}
}
for _, replica := range replicas {
checkResult(replica)
}
}