milvus/internal/querycoordv2/observers/resource_observer_test.go

214 lines
7.3 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package observers
import (
"testing"
"time"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus/internal/kv"
etcdKV "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/metastore/mocks"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/pkg/util/etcd"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
// ResourceObserverSuite is the testify suite that exercises ResourceObserver
// against a Meta backed by a mocked QueryCoord catalog.
type ResourceObserverSuite struct {
	suite.Suite

	// kv is the etcd-backed MetaKv opened in SetupTest.
	kv kv.MetaKv

	// Dependencies wired together in SetupTest.
	store    *mocks.QueryCoordCatalog // mocked catalog handed to meta.NewMeta
	meta     *meta.Meta               // meta under observation
	observer *ResourceObserver        // observer under test, started in SetupTest
	nodeMgr  *session.NodeManager     // node registry shared with meta

	// NOTE(review): collectionID and partitionID are not referenced by any
	// test visible in this file — confirm whether they are still needed.
	collectionID int64
	partitionID  int64
}
// SetupSuite initializes the global param table and overrides
// QueryCoordCfg.CheckResourceGroupInterval with "3" for the whole suite
// (presumably seconds — confirm against the param definition).
func (suite *ResourceObserverSuite) SetupSuite() {
	paramtable.Init()

	intervalKey := Params.QueryCoordCfg.CheckResourceGroupInterval.Key
	paramtable.Get().Save(intervalKey, "3")
}
// SetupTest builds a fresh fixture before every test:
//   - an etcd client and the MetaKv on top of it,
//   - a Meta backed by a mocked catalog and a new NodeManager,
//   - a started ResourceObserver watching that Meta,
//   - nodes 0..9 registered and assigned to the default resource group.
func (suite *ResourceObserverSuite) SetupTest() {
	etcdCfg := GenerateEtcdConfig()
	client, err := etcd.GetEtcdClient(
		etcdCfg.UseEmbedEtcd.GetAsBool(),
		etcdCfg.EtcdUseSSL.GetAsBool(),
		etcdCfg.Endpoints.GetAsStrings(),
		etcdCfg.EtcdTLSCert.GetValue(),
		etcdCfg.EtcdTLSKey.GetValue(),
		etcdCfg.EtcdTLSCACert.GetValue(),
		etcdCfg.EtcdTLSMinVersion.GetValue(),
	)
	suite.Require().NoError(err)
	suite.kv = etcdKV.NewEtcdKV(client, etcdCfg.MetaRootPath.GetValue())

	// Meta and its collaborators.
	suite.store = mocks.NewQueryCoordCatalog(suite.T())
	allocator := RandomIncrementIDAllocator()
	suite.nodeMgr = session.NewNodeManager()
	suite.meta = meta.NewMeta(allocator, suite.store, suite.nodeMgr)

	// Observer under test.
	suite.observer = NewResourceObserver(suite.meta)
	suite.observer.Start()

	// Every resource-group save succeeds unless a test overrides it.
	suite.store.EXPECT().SaveResourceGroup(mock.Anything).Return(nil)

	// Seed the default resource group with ten nodes.
	for nodeID := int64(0); nodeID < 10; nodeID++ {
		suite.nodeMgr.Add(session.NewNodeInfo(nodeID, "localhost"))
		suite.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, nodeID)
	}
}
// TestCheckNodesInReplica puts two replicas into resource group "rg", takes
// nodes 100 and 101 down, and then expects the group to go from lacking two
// nodes (replica 2 empty) to lacking none with replica 2 holding two nodes.
// The recovery is presumably driven by the observer started in SetupTest.
func (suite *ResourceObserverSuite) TestCheckNodesInReplica() {
	const (
		rgName  = "rg"
		waitFor = 5 * time.Second
		tick    = 1 * time.Second
	)

	suite.store.EXPECT().SaveCollection(mock.Anything).Return(nil)
	suite.store.EXPECT().SaveReplica(mock.Anything).Return(nil)
	suite.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 2))

	// Replica 1 owns nodes 100 and 101.
	populated := meta.NewReplica(
		&querypb.Replica{
			ID:            1,
			CollectionID:  1,
			Nodes:         []int64{100, 101},
			ResourceGroup: rgName,
		},
		typeutil.NewUniqueSet(100, 101),
	)
	suite.meta.ReplicaManager.Put(populated)

	// Replica 2 starts with no nodes, as if all of its nodes had gone down.
	drained := meta.NewReplica(
		&querypb.Replica{
			ID:            2,
			CollectionID:  1,
			Nodes:         []int64{},
			ResourceGroup: rgName,
		},
		typeutil.NewUniqueSet(),
	)
	suite.meta.ReplicaManager.Put(drained)

	// Register four nodes, assign them all to the group, then drop two.
	suite.meta.ResourceManager.AddResourceGroup(rgName)
	rgNodes := []int64{100, 101, 102, 103}
	for _, nodeID := range rgNodes {
		suite.nodeMgr.Add(session.NewNodeInfo(nodeID, "localhost"))
	}
	for _, nodeID := range rgNodes {
		suite.meta.ResourceManager.AssignNode(rgName, nodeID)
	}
	for _, nodeID := range rgNodes[:2] {
		suite.meta.ResourceManager.HandleNodeDown(nodeID)
	}

	// Before recovery: the group lacks two nodes and replica 2 is empty.
	suite.Eventually(func() bool {
		lack := suite.meta.ResourceManager.CheckLackOfNode(rgName)
		replicaNodes := suite.meta.ReplicaManager.Get(2).GetNodes()
		return len(replicaNodes) == 0 && lack == 2
	}, waitFor, tick)

	// After recovery: the group is full again and replica 2 holds two nodes.
	suite.Eventually(func() bool {
		lack := suite.meta.ResourceManager.CheckLackOfNode(rgName)
		replicaNodes := suite.meta.ReplicaManager.Get(2).GetNodes()
		return len(replicaNodes) == 2 && lack == 0
	}, waitFor, tick)
}
// TestRecoverResourceGroupFailed assigns nodes 100..199 to resource group
// "rg", marks each one down right away, and expects the group to end up
// lacking 90 nodes.
func (suite *ResourceObserverSuite) TestRecoverResourceGroupFailed() {
	const (
		rgName  = "rg"
		waitFor = 5 * time.Second
		tick    = 1 * time.Second
	)

	suite.meta.ResourceManager.AddResourceGroup(rgName)
	for nodeID := int64(100); nodeID < 200; nodeID++ {
		suite.nodeMgr.Add(session.NewNodeInfo(nodeID, "localhost"))
		suite.meta.ResourceManager.AssignNode(rgName, nodeID)
		suite.meta.ResourceManager.HandleNodeDown(nodeID)
	}

	suite.Eventually(func() bool {
		return suite.meta.ResourceManager.CheckLackOfNode(rgName) == 90
	}, waitFor, tick)
}
// TestRecoverReplicaFailed mirrors the node-down scenario of the replica
// recovery test, but lets only the first two SaveReplica calls succeed; later
// calls return "store error". It expects the resource group to stop lacking
// nodes while replica 2 stays empty.
func (suite *ResourceObserverSuite) TestRecoverReplicaFailed() {
	const (
		rgName  = "rg"
		waitFor = 5 * time.Second
		tick    = 1 * time.Second
	)

	suite.store.EXPECT().SaveCollection(mock.Anything).Return(nil)
	// Exactly two successful saves: one per replica put below.
	suite.store.EXPECT().SaveReplica(mock.Anything).Return(nil).Times(2)
	suite.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 2))

	// Replica 1 owns nodes 100 and 101.
	populated := meta.NewReplica(
		&querypb.Replica{
			ID:            1,
			CollectionID:  1,
			Nodes:         []int64{100, 101},
			ResourceGroup: rgName,
		},
		typeutil.NewUniqueSet(100, 101),
	)
	suite.meta.ReplicaManager.Put(populated)

	// Replica 2 starts with no nodes, as if all of its nodes had gone down.
	drained := meta.NewReplica(
		&querypb.Replica{
			ID:            2,
			CollectionID:  1,
			Nodes:         []int64{},
			ResourceGroup: rgName,
		},
		typeutil.NewUniqueSet(),
	)
	suite.meta.ReplicaManager.Put(drained)

	// Any further replica save fails.
	suite.store.EXPECT().SaveReplica(mock.Anything).Return(errors.New("store error"))

	// Register four nodes, assign them all to the group, then drop two.
	suite.meta.ResourceManager.AddResourceGroup(rgName)
	rgNodes := []int64{100, 101, 102, 103}
	for _, nodeID := range rgNodes {
		suite.nodeMgr.Add(session.NewNodeInfo(nodeID, "localhost"))
	}
	for _, nodeID := range rgNodes {
		suite.meta.ResourceManager.AssignNode(rgName, nodeID)
	}
	for _, nodeID := range rgNodes[:2] {
		suite.meta.ResourceManager.HandleNodeDown(nodeID)
	}

	// Before recovery: the group lacks two nodes and replica 2 is empty.
	suite.Eventually(func() bool {
		lack := suite.meta.ResourceManager.CheckLackOfNode(rgName)
		replicaNodes := suite.meta.ReplicaManager.Get(2).GetNodes()
		return len(replicaNodes) == 0 && lack == 2
	}, waitFor, tick)

	// After recovery: the group is full again, but replica 2 is still empty.
	suite.Eventually(func() bool {
		lack := suite.meta.ResourceManager.CheckLackOfNode(rgName)
		replicaNodes := suite.meta.ReplicaManager.Get(2).GetNodes()
		return len(replicaNodes) == 0 && lack == 0
	}, waitFor, tick)
}
// TearDownTest releases the per-test resources created by SetupTest.
//
// SetupTest opens a new etcd-backed kv and starts a new observer for every
// test. Cleaning up only once per suite left every earlier kv open and every
// earlier observer running against a mock catalog bound to a test that had
// already finished. Each handle is set to nil after release so a later call
// (including the one from TearDownSuite) is a no-op, and so teardown stays
// safe when SetupTest aborted before assigning them.
func (suite *ResourceObserverSuite) TearDownTest() {
	// Stop the observer first so it no longer touches meta or the mock.
	if suite.observer != nil {
		suite.observer.Stop()
		suite.observer = nil
	}
	if suite.kv != nil {
		suite.kv.Close()
		suite.kv = nil
	}
}

// TearDownSuite is a final safety net: it releases anything a test left
// behind. After a normal run TearDownTest has already cleaned up, so this
// does nothing.
func (suite *ResourceObserverSuite) TearDownSuite() {
	suite.TearDownTest()
}
func TestResourceObserver(t *testing.T) {
suite.Run(t, new(ResourceObserverSuite))
}