enhance: Limit collection's normal balance speed (#34810) (#34987)

issue: #34798
pr: #34810

after we remove the task priority on query coord, to avoid load/release
segment blocked by too much balance task, we limit the balance task size
in each round. at same time, we reduce the balance interval to trigger
balance more frequently.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
pull/35018/head
wei liu 2024-07-26 10:13:46 +08:00 committed by GitHub
parent 99da94ee32
commit b3bc7f3985
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 42 additions and 1 deletions

View File

@ -26,6 +26,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/pkg/util/paramtable"
)
type SegmentAssignPlan struct {
@ -82,6 +83,8 @@ func (b *RoundRobinBalancer) AssignSegment(collectionID int64, segments []*meta.
delta1, delta2 := b.scheduler.GetSegmentTaskDelta(id1, -1), b.scheduler.GetSegmentTaskDelta(id2, -1)
return cnt1+delta1 < cnt2+delta2
})
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
ret := make([]SegmentAssignPlan, 0, len(segments))
for i, s := range segments {
plan := SegmentAssignPlan{
@ -90,6 +93,9 @@ func (b *RoundRobinBalancer) AssignSegment(collectionID int64, segments []*meta.
To: nodesInfo[i%len(nodesInfo)].ID(),
}
ret = append(ret, plan)
if len(ret) > balanceBatchSize {
break
}
}
return ret
}

View File

@ -27,6 +27,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/util/paramtable"
)
type BalanceTestSuite struct {
@ -35,6 +36,10 @@ type BalanceTestSuite struct {
roundRobinBalancer *RoundRobinBalancer
}
func (suite *BalanceTestSuite) SetupSuite() {
paramtable.Init()
}
func (suite *BalanceTestSuite) SetupTest() {
nodeManager := session.NewNodeManager()
suite.mockScheduler = task.NewMockScheduler(suite.T())

View File

@ -64,6 +64,7 @@ func (b *RowCountBasedBalancer) AssignSegment(collectionID int64, segments []*me
return segments[i].GetNumOfRows() > segments[j].GetNumOfRows()
})
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
plans := make([]SegmentAssignPlan, 0, len(segments))
for _, s := range segments {
// pick the node with the least row count and allocate to it.
@ -74,6 +75,9 @@ func (b *RowCountBasedBalancer) AssignSegment(collectionID int64, segments []*me
Segment: s,
}
plans = append(plans, plan)
if len(plans) > balanceBatchSize {
break
}
// change node's priority and push back
p := ni.getPriority()
ni.setPriority(p + int(s.GetNumOfRows()))

View File

@ -82,6 +82,7 @@ func (b *ScoreBasedBalancer) AssignSegment(collectionID int64, segments []*meta.
return segments[i].GetNumOfRows() > segments[j].GetNumOfRows()
})
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
plans := make([]SegmentAssignPlan, 0, len(segments))
for _, s := range segments {
func(s *meta.Segment) {
@ -112,6 +113,10 @@ func (b *ScoreBasedBalancer) AssignSegment(collectionID int64, segments []*meta.
}
targetNode.setPriority(targetNode.getPriority() + priorityChange)
}(s)
if len(plans) > balanceBatchSize {
break
}
}
return plans
}
@ -295,6 +300,8 @@ func (b *ScoreBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNodes [
return nil
}
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
// find the segment from the node which has more score than the average
segmentsToMove := make([]*meta.Segment, 0)
average := totalScore / len(onlineNodes)
@ -309,6 +316,9 @@ func (b *ScoreBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNodes [
})
for _, s := range segments {
segmentsToMove = append(segmentsToMove, s)
if len(segmentsToMove) >= balanceBatchSize {
break
}
leftScore -= b.calculateSegmentScore(s)
if leftScore <= average {
break

View File

@ -171,6 +171,11 @@ func (b *BalanceChecker) Check(ctx context.Context) []task.Task {
replicasToBalance := b.replicasToBalance()
segmentPlans, channelPlans := b.balanceReplicas(replicasToBalance)
// iterate all collection to find a collection to balance
for len(segmentPlans) == 0 && len(channelPlans) == 0 && b.normalBalanceCollectionsCurrentRound.Len() > 0 {
replicasToBalance := b.replicasToBalance()
segmentPlans, channelPlans = b.balanceReplicas(replicasToBalance)
}
tasks := balance.CreateSegmentTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond), segmentPlans)
task.SetPriority(task.TaskPriorityLow, tasks...)

View File

@ -1638,6 +1638,7 @@ type queryCoordConfig struct {
CollectionObserverInterval ParamItem `refreshable:"false"`
CheckExecutedFlagInterval ParamItem `refreshable:"false"`
UpdateCollectionLoadStatusInterval ParamItem `refreshable:"false"`
CollectionBalanceSegmentBatchSize ParamItem `refreshable true`
}
func (p *queryCoordConfig) init(base *BaseTable) {
@ -1890,7 +1891,7 @@ func (p *queryCoordConfig) init(base *BaseTable) {
p.BalanceCheckInterval = ParamItem{
Key: "queryCoord.checkBalanceInterval",
Version: "2.3.0",
DefaultValue: "10000",
DefaultValue: "3000",
PanicIfEmpty: true,
Export: true,
}
@ -2162,6 +2163,15 @@ func (p *queryCoordConfig) init(base *BaseTable) {
Export: false,
}
p.CheckExecutedFlagInterval.Init(base.mgr)
p.CollectionBalanceSegmentBatchSize = ParamItem{
Key: "queryCoord.collectionBalanceSegmentBatchSize",
Version: "2.4.7",
DefaultValue: "5",
Doc: "the max balance task number for collection at each round",
Export: false,
}
p.CollectionBalanceSegmentBatchSize.Init(base.mgr)
}
// /////////////////////////////////////////////////////////////////////////////

View File

@ -339,6 +339,7 @@ func TestComponentParam(t *testing.T) {
params.Reset("queryCoord.checkExecutedFlagInterval")
assert.Equal(t, 0.3, Params.DelegatorMemoryOverloadFactor.GetAsFloat())
assert.Equal(t, 5, Params.CollectionBalanceSegmentBatchSize.GetAsInt())
})
t.Run("test queryNodeConfig", func(t *testing.T) {