mirror of https://github.com/milvus-io/milvus.git
enable config different interval for different checker (#25514)
Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/25747/head
parent
9a4761dcc7
commit
6534396b3d
|
@ -27,24 +27,31 @@ import (
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
||||||
"github.com/milvus-io/milvus/pkg/log"
|
"github.com/milvus-io/milvus/pkg/log"
|
||||||
|
"go.uber.org/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
// checkRoundTaskNumLimit is a package-level numeric limit set to 256.
// NOTE(review): presumably caps the number of tasks produced per check
// round; its use is not visible in this excerpt, so confirm at the use site.
var checkRoundTaskNumLimit = 256
|
||||||
|
|
||||||
|
// Names under which the individual checkers are registered. They are used as
// map keys for both the checker instances and their manual-trigger channels.
var (
	// Segment_Checker is the key of the segment checker.
	Segment_Checker = "segment_checker"
	// Channel_Checker is the key of the channel checker.
	Channel_Checker = "channel_checker"
	// Balance_Checker is the key of the balance checker.
	Balance_Checker = "balance_checker"
)
|
||||||
|
|
||||||
type CheckerController struct {
|
type CheckerController struct {
|
||||||
stopCh chan struct{}
|
stopCh chan struct{}
|
||||||
checkCh chan struct{}
|
manualCheckChs map[string]chan struct{}
|
||||||
meta *meta.Meta
|
meta *meta.Meta
|
||||||
dist *meta.DistributionManager
|
dist *meta.DistributionManager
|
||||||
targetMgr *meta.TargetManager
|
targetMgr *meta.TargetManager
|
||||||
broker *meta.CoordinatorBroker
|
broker *meta.CoordinatorBroker
|
||||||
nodeMgr *session.NodeManager
|
nodeMgr *session.NodeManager
|
||||||
balancer balance.Balance
|
balancer balance.Balance
|
||||||
|
|
||||||
scheduler task.Scheduler
|
scheduler task.Scheduler
|
||||||
checkers []Checker
|
checkers map[string]Checker
|
||||||
|
|
||||||
stopOnce sync.Once
|
stopOnce sync.Once
|
||||||
}
|
}
|
||||||
|
@ -59,50 +66,80 @@ func NewCheckerController(
|
||||||
|
|
||||||
// CheckerController runs checkers with the order,
|
// CheckerController runs checkers with the order,
|
||||||
// the former checker has higher priority
|
// the former checker has higher priority
|
||||||
checkers := []Checker{
|
checkers := map[string]Checker{
|
||||||
NewChannelChecker(meta, dist, targetMgr, balancer),
|
Channel_Checker: NewChannelChecker(meta, dist, targetMgr, balancer),
|
||||||
NewSegmentChecker(meta, dist, targetMgr, balancer, nodeMgr),
|
Segment_Checker: NewSegmentChecker(meta, dist, targetMgr, balancer, nodeMgr),
|
||||||
NewBalanceChecker(meta, balancer, nodeMgr, scheduler),
|
Balance_Checker: NewBalanceChecker(meta, balancer, nodeMgr, scheduler),
|
||||||
}
|
}
|
||||||
for i, checker := range checkers {
|
|
||||||
checker.SetID(int64(i + 1))
|
id := 0
|
||||||
|
for _, checker := range checkers {
|
||||||
|
checker.SetID(int64(id + 1))
|
||||||
|
}
|
||||||
|
|
||||||
|
manualCheckChs := map[string]chan struct{}{
|
||||||
|
Channel_Checker: make(chan struct{}, 1),
|
||||||
|
Segment_Checker: make(chan struct{}, 1),
|
||||||
|
Balance_Checker: make(chan struct{}, 1),
|
||||||
}
|
}
|
||||||
|
|
||||||
return &CheckerController{
|
return &CheckerController{
|
||||||
stopCh: make(chan struct{}),
|
stopCh: make(chan struct{}),
|
||||||
checkCh: make(chan struct{}, 1),
|
manualCheckChs: manualCheckChs,
|
||||||
meta: meta,
|
meta: meta,
|
||||||
dist: dist,
|
dist: dist,
|
||||||
targetMgr: targetMgr,
|
targetMgr: targetMgr,
|
||||||
scheduler: scheduler,
|
scheduler: scheduler,
|
||||||
checkers: checkers,
|
checkers: checkers,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (controller *CheckerController) Start(ctx context.Context) {
|
func (controller *CheckerController) Start(ctx context.Context) {
|
||||||
go func() {
|
for checkerType := range controller.checkers {
|
||||||
ticker := time.NewTicker(Params.QueryCoordCfg.CheckInterval.GetAsDuration(time.Millisecond))
|
go controller.StartChecker(ctx, checkerType)
|
||||||
defer ticker.Stop()
|
}
|
||||||
for {
|
}
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
log.Info("CheckerController stopped due to context canceled")
|
|
||||||
return
|
|
||||||
|
|
||||||
case <-controller.stopCh:
|
func getCheckerInterval(checkerType string) time.Duration {
|
||||||
log.Info("CheckerController stopped")
|
switch checkerType {
|
||||||
return
|
case Segment_Checker:
|
||||||
|
return Params.QueryCoordCfg.SegmentCheckInterval.GetAsDuration(time.Millisecond)
|
||||||
|
case Channel_Checker:
|
||||||
|
return Params.QueryCoordCfg.ChannelCheckInterval.GetAsDuration(time.Millisecond)
|
||||||
|
case Balance_Checker:
|
||||||
|
return Params.QueryCoordCfg.BalanceCheckInterval.GetAsDuration(time.Millisecond)
|
||||||
|
default:
|
||||||
|
return Params.QueryCoordCfg.CheckInterval.GetAsDuration(time.Millisecond)
|
||||||
|
}
|
||||||
|
|
||||||
case <-ticker.C:
|
}
|
||||||
controller.check(ctx)
|
|
||||||
|
|
||||||
case <-controller.checkCh:
|
func (controller *CheckerController) StartChecker(ctx context.Context, checkerType string) {
|
||||||
ticker.Stop()
|
interval := getCheckerInterval(checkerType)
|
||||||
controller.check(ctx)
|
ticker := time.NewTicker(interval)
|
||||||
ticker.Reset(Params.QueryCoordCfg.CheckInterval.GetAsDuration(time.Millisecond))
|
defer ticker.Stop()
|
||||||
}
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
log.Info("Checker stopped due to context canceled",
|
||||||
|
zap.String("type", checkerType))
|
||||||
|
return
|
||||||
|
|
||||||
|
case <-controller.stopCh:
|
||||||
|
log.Info("Checker stopped",
|
||||||
|
zap.String("type", checkerType))
|
||||||
|
return
|
||||||
|
|
||||||
|
case <-ticker.C:
|
||||||
|
controller.check(ctx, checkerType)
|
||||||
|
|
||||||
|
case <-controller.manualCheckChs[checkerType]:
|
||||||
|
ticker.Stop()
|
||||||
|
controller.check(ctx, checkerType)
|
||||||
|
ticker.Reset(Params.QueryCoordCfg.CheckInterval.GetAsDuration(time.Millisecond))
|
||||||
}
|
}
|
||||||
}()
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (controller *CheckerController) Stop() {
|
func (controller *CheckerController) Stop() {
|
||||||
|
@ -112,18 +149,18 @@ func (controller *CheckerController) Stop() {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (controller *CheckerController) Check() {
|
func (controller *CheckerController) Check() {
|
||||||
select {
|
for _, checkCh := range controller.manualCheckChs {
|
||||||
case controller.checkCh <- struct{}{}:
|
select {
|
||||||
default:
|
case checkCh <- struct{}{}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// check is the real implementation of Check
|
// check is the real implementation of Check
|
||||||
func (controller *CheckerController) check(ctx context.Context) {
|
func (controller *CheckerController) check(ctx context.Context, checkerType string) {
|
||||||
tasks := make([]task.Task, 0)
|
checker := controller.checkers[checkerType]
|
||||||
for _, checker := range controller.checkers {
|
tasks := checker.Check(ctx)
|
||||||
tasks = append(tasks, checker.Check(ctx)...)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, task := range tasks {
|
for _, task := range tasks {
|
||||||
err := controller.scheduler.Add(task)
|
err := controller.scheduler.Add(task)
|
||||||
|
|
|
@ -0,0 +1,132 @@
|
||||||
|
// Licensed to the LF AI & Data foundation under one
|
||||||
|
// or more contributor license agreements. See the NOTICE file
|
||||||
|
// distributed with this work for additional information
|
||||||
|
// regarding copyright ownership. The ASF licenses this file
|
||||||
|
// to you under the Apache License, Version 2.0 (the
|
||||||
|
// "License"); you may not use this file except in compliance
|
||||||
|
// with the License. You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package checkers
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/milvus-io/milvus/internal/kv"
|
||||||
|
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
|
||||||
|
"github.com/milvus-io/milvus/internal/proto/datapb"
|
||||||
|
"github.com/milvus-io/milvus/internal/querycoordv2/balance"
|
||||||
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
||||||
|
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
|
||||||
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
||||||
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
||||||
|
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
|
||||||
|
"github.com/milvus-io/milvus/pkg/util/etcd"
|
||||||
|
"github.com/stretchr/testify/mock"
|
||||||
|
"github.com/stretchr/testify/suite"
|
||||||
|
"go.uber.org/atomic"
|
||||||
|
)
|
||||||
|
|
||||||
|
type CheckerControllerSuite struct {
|
||||||
|
suite.Suite
|
||||||
|
kv kv.MetaKv
|
||||||
|
meta *meta.Meta
|
||||||
|
broker *meta.MockBroker
|
||||||
|
nodeMgr *session.NodeManager
|
||||||
|
dist *meta.DistributionManager
|
||||||
|
targetManager *meta.TargetManager
|
||||||
|
scheduler *task.MockScheduler
|
||||||
|
balancer *balance.MockBalancer
|
||||||
|
|
||||||
|
controller *CheckerController
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suite *CheckerControllerSuite) SetupSuite() {
|
||||||
|
Params.Init()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suite *CheckerControllerSuite) SetupTest() {
|
||||||
|
var err error
|
||||||
|
config := GenerateEtcdConfig()
|
||||||
|
cli, err := etcd.GetEtcdClient(
|
||||||
|
config.UseEmbedEtcd.GetAsBool(),
|
||||||
|
config.EtcdUseSSL.GetAsBool(),
|
||||||
|
config.Endpoints.GetAsStrings(),
|
||||||
|
config.EtcdTLSCert.GetValue(),
|
||||||
|
config.EtcdTLSKey.GetValue(),
|
||||||
|
config.EtcdTLSCACert.GetValue(),
|
||||||
|
config.EtcdTLSMinVersion.GetValue())
|
||||||
|
suite.Require().NoError(err)
|
||||||
|
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
|
||||||
|
|
||||||
|
// meta
|
||||||
|
store := meta.NewMetaStore(suite.kv)
|
||||||
|
idAllocator := RandomIncrementIDAllocator()
|
||||||
|
suite.nodeMgr = session.NewNodeManager()
|
||||||
|
suite.meta = meta.NewMeta(idAllocator, store, suite.nodeMgr)
|
||||||
|
suite.dist = meta.NewDistributionManager()
|
||||||
|
suite.broker = meta.NewMockBroker(suite.T())
|
||||||
|
suite.targetManager = meta.NewTargetManager(suite.broker, suite.meta)
|
||||||
|
|
||||||
|
suite.balancer = balance.NewMockBalancer(suite.T())
|
||||||
|
suite.scheduler = task.NewMockScheduler(suite.T())
|
||||||
|
suite.controller = NewCheckerController(suite.meta, suite.dist, suite.targetManager, suite.balancer, suite.nodeMgr, suite.scheduler)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suite *CheckerControllerSuite) TestBasic() {
|
||||||
|
|
||||||
|
// set meta
|
||||||
|
suite.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1))
|
||||||
|
suite.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, []int64{1, 2}))
|
||||||
|
suite.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
|
||||||
|
suite.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
|
||||||
|
suite.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 1)
|
||||||
|
suite.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 2)
|
||||||
|
|
||||||
|
// set target
|
||||||
|
segments := []*datapb.SegmentInfo{
|
||||||
|
{
|
||||||
|
ID: 1,
|
||||||
|
PartitionID: 1,
|
||||||
|
InsertChannel: "test-insert-channel",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
suite.broker.EXPECT().GetRecoveryInfoV2(mock.Anything, int64(1)).Return(
|
||||||
|
nil, segments, nil)
|
||||||
|
suite.targetManager.UpdateCollectionNextTargetWithPartitions(int64(1), int64(1))
|
||||||
|
|
||||||
|
// set dist
|
||||||
|
suite.dist.ChannelDistManager.Update(2, utils.CreateTestChannel(1, 2, 1, "test-insert-channel"))
|
||||||
|
suite.dist.LeaderViewManager.Update(2, utils.CreateTestLeaderView(2, 1, "test-insert-channel", map[int64]int64{}, map[int64]*meta.Segment{}))
|
||||||
|
|
||||||
|
counter := atomic.NewInt64(0)
|
||||||
|
suite.scheduler.EXPECT().Add(mock.Anything).Run(func(task task.Task) {
|
||||||
|
counter.Inc()
|
||||||
|
}).Return(nil)
|
||||||
|
suite.scheduler.EXPECT().GetSegmentTaskNum().Return(0).Maybe()
|
||||||
|
suite.scheduler.EXPECT().GetChannelTaskNum().Return(0).Maybe()
|
||||||
|
|
||||||
|
suite.balancer.EXPECT().AssignSegment(mock.Anything, mock.Anything, mock.Anything).Return(nil)
|
||||||
|
suite.balancer.EXPECT().AssignChannel(mock.Anything, mock.Anything).Return(nil)
|
||||||
|
ctx := context.Background()
|
||||||
|
suite.controller.Start(ctx)
|
||||||
|
defer suite.controller.Stop()
|
||||||
|
|
||||||
|
suite.Eventually(func() bool {
|
||||||
|
suite.controller.Check()
|
||||||
|
return counter.Load() > 0
|
||||||
|
}, 5*time.Second, 1*time.Second)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckControllerSuite(t *testing.T) {
|
||||||
|
suite.Run(t, new(CheckerControllerSuite))
|
||||||
|
}
|
|
@ -1152,16 +1152,23 @@ type queryCoordConfig struct {
|
||||||
OverloadedMemoryThresholdPercentage ParamItem `refreshable:"true"`
|
OverloadedMemoryThresholdPercentage ParamItem `refreshable:"true"`
|
||||||
BalanceIntervalSeconds ParamItem `refreshable:"true"`
|
BalanceIntervalSeconds ParamItem `refreshable:"true"`
|
||||||
MemoryUsageMaxDifferencePercentage ParamItem `refreshable:"true"`
|
MemoryUsageMaxDifferencePercentage ParamItem `refreshable:"true"`
|
||||||
CheckInterval ParamItem `refreshable:"true"`
|
|
||||||
ChannelTaskTimeout ParamItem `refreshable:"true"`
|
SegmentCheckInterval ParamItem `refreshable:"true"`
|
||||||
SegmentTaskTimeout ParamItem `refreshable:"true"`
|
ChannelCheckInterval ParamItem `refreshable:"true"`
|
||||||
DistPullInterval ParamItem `refreshable:"false"`
|
BalanceCheckInterval ParamItem `refreshable:"true"`
|
||||||
HeartbeatAvailableInterval ParamItem `refreshable:"true"`
|
ChannelTaskTimeout ParamItem `refreshable:"true"`
|
||||||
LoadTimeoutSeconds ParamItem `refreshable:"true"`
|
SegmentTaskTimeout ParamItem `refreshable:"true"`
|
||||||
|
DistPullInterval ParamItem `refreshable:"false"`
|
||||||
|
HeartbeatAvailableInterval ParamItem `refreshable:"true"`
|
||||||
|
LoadTimeoutSeconds ParamItem `refreshable:"true"`
|
||||||
|
|
||||||
// Deprecated: Since 2.2.2, QueryCoord do not use HandOff logic anymore
|
// Deprecated: Since 2.2.2, QueryCoord do not use HandOff logic anymore
|
||||||
CheckHandoffInterval ParamItem `refreshable:"true"`
|
CheckHandoffInterval ParamItem `refreshable:"true"`
|
||||||
EnableActiveStandby ParamItem `refreshable:"false"`
|
EnableActiveStandby ParamItem `refreshable:"false"`
|
||||||
|
|
||||||
|
// Deprecated: Since 2.3.0, use SegmentCheckInterval, ChannelCheckInterval and BalanceCheckInterval instead.
|
||||||
|
CheckInterval ParamItem `refreshable:"true"`
|
||||||
|
|
||||||
NextTargetSurviveTime ParamItem `refreshable:"true"`
|
NextTargetSurviveTime ParamItem `refreshable:"true"`
|
||||||
UpdateNextTargetInterval ParamItem `refreshable:"false"`
|
UpdateNextTargetInterval ParamItem `refreshable:"false"`
|
||||||
CheckNodeInReplicaInterval ParamItem `refreshable:"false"`
|
CheckNodeInReplicaInterval ParamItem `refreshable:"false"`
|
||||||
|
@ -1300,6 +1307,33 @@ func (p *queryCoordConfig) init(base *BaseTable) {
|
||||||
}
|
}
|
||||||
p.CheckInterval.Init(base.mgr)
|
p.CheckInterval.Init(base.mgr)
|
||||||
|
|
||||||
|
p.SegmentCheckInterval = ParamItem{
|
||||||
|
Key: "queryCoord.checkSegmentInterval",
|
||||||
|
Version: "2.3.0",
|
||||||
|
DefaultValue: "1000",
|
||||||
|
PanicIfEmpty: true,
|
||||||
|
Export: true,
|
||||||
|
}
|
||||||
|
p.SegmentCheckInterval.Init(base.mgr)
|
||||||
|
|
||||||
|
p.ChannelCheckInterval = ParamItem{
|
||||||
|
Key: "queryCoord.checkChannelInterval",
|
||||||
|
Version: "2.3.0",
|
||||||
|
DefaultValue: "1000",
|
||||||
|
PanicIfEmpty: true,
|
||||||
|
Export: true,
|
||||||
|
}
|
||||||
|
p.ChannelCheckInterval.Init(base.mgr)
|
||||||
|
|
||||||
|
// BalanceCheckInterval is the period of the balance checker, in milliseconds.
// It must use its own key: sharing "queryCoord.checkChannelInterval" with
// ChannelCheckInterval would make the two settings impossible to configure
// independently.
p.BalanceCheckInterval = ParamItem{
	Key:          "queryCoord.checkBalanceInterval",
	Version:      "2.3.0",
	DefaultValue: "10000",
	PanicIfEmpty: true,
	Export:       true,
}
p.BalanceCheckInterval.Init(base.mgr)
|
||||||
|
|
||||||
p.ChannelTaskTimeout = ParamItem{
|
p.ChannelTaskTimeout = ParamItem{
|
||||||
Key: "queryCoord.channelTaskTimeout",
|
Key: "queryCoord.channelTaskTimeout",
|
||||||
Version: "2.0.0",
|
Version: "2.0.0",
|
||||||
|
|
|
@ -292,6 +292,10 @@ func TestComponentParam(t *testing.T) {
|
||||||
assert.Equal(t, 1.3, Params.ReverseUnbalanceTolerationFactor.GetAsFloat())
|
assert.Equal(t, 1.3, Params.ReverseUnbalanceTolerationFactor.GetAsFloat())
|
||||||
params.Save("queryCoord.reverseUnBalanceTolerationFactor", "1.5")
|
params.Save("queryCoord.reverseUnBalanceTolerationFactor", "1.5")
|
||||||
assert.Equal(t, 1.5, Params.ReverseUnbalanceTolerationFactor.GetAsFloat())
|
assert.Equal(t, 1.5, Params.ReverseUnbalanceTolerationFactor.GetAsFloat())
|
||||||
|
|
||||||
|
assert.Equal(t, 1000, Params.SegmentCheckInterval.GetAsInt())
|
||||||
|
assert.Equal(t, 1000, Params.ChannelCheckInterval.GetAsInt())
|
||||||
|
assert.Equal(t, 10000, Params.BalanceCheckInterval.GetAsInt())
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("test queryNodeConfig", func(t *testing.T) {
|
t.Run("test queryNodeConfig", func(t *testing.T) {
|
||||||
|
|
Loading…
Reference in New Issue