mirror of https://github.com/milvus-io/milvus.git
issue: #34798 pr: #34810 after we remove the task priority on query coord, to avoid load/release segment tasks being blocked by too many balance tasks, we limit the balance task size in each round. At the same time, we reduce the balance interval to trigger balance more frequently. Signed-off-by: Wei Liu <wei.liu@zilliz.com>pull/35018/head
parent
99da94ee32
commit
b3bc7f3985
|
@ -26,6 +26,7 @@ import (
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
||||||
|
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SegmentAssignPlan struct {
|
type SegmentAssignPlan struct {
|
||||||
|
@ -82,6 +83,8 @@ func (b *RoundRobinBalancer) AssignSegment(collectionID int64, segments []*meta.
|
||||||
delta1, delta2 := b.scheduler.GetSegmentTaskDelta(id1, -1), b.scheduler.GetSegmentTaskDelta(id2, -1)
|
delta1, delta2 := b.scheduler.GetSegmentTaskDelta(id1, -1), b.scheduler.GetSegmentTaskDelta(id2, -1)
|
||||||
return cnt1+delta1 < cnt2+delta2
|
return cnt1+delta1 < cnt2+delta2
|
||||||
})
|
})
|
||||||
|
|
||||||
|
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
|
||||||
ret := make([]SegmentAssignPlan, 0, len(segments))
|
ret := make([]SegmentAssignPlan, 0, len(segments))
|
||||||
for i, s := range segments {
|
for i, s := range segments {
|
||||||
plan := SegmentAssignPlan{
|
plan := SegmentAssignPlan{
|
||||||
|
@ -90,6 +93,9 @@ func (b *RoundRobinBalancer) AssignSegment(collectionID int64, segments []*meta.
|
||||||
To: nodesInfo[i%len(nodesInfo)].ID(),
|
To: nodesInfo[i%len(nodesInfo)].ID(),
|
||||||
}
|
}
|
||||||
ret = append(ret, plan)
|
ret = append(ret, plan)
|
||||||
|
if len(ret) > balanceBatchSize {
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return ret
|
return ret
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,7 @@ import (
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
||||||
"github.com/milvus-io/milvus/pkg/common"
|
"github.com/milvus-io/milvus/pkg/common"
|
||||||
|
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||||
)
|
)
|
||||||
|
|
||||||
type BalanceTestSuite struct {
|
type BalanceTestSuite struct {
|
||||||
|
@ -35,6 +36,10 @@ type BalanceTestSuite struct {
|
||||||
roundRobinBalancer *RoundRobinBalancer
|
roundRobinBalancer *RoundRobinBalancer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (suite *BalanceTestSuite) SetupSuite() {
|
||||||
|
paramtable.Init()
|
||||||
|
}
|
||||||
|
|
||||||
func (suite *BalanceTestSuite) SetupTest() {
|
func (suite *BalanceTestSuite) SetupTest() {
|
||||||
nodeManager := session.NewNodeManager()
|
nodeManager := session.NewNodeManager()
|
||||||
suite.mockScheduler = task.NewMockScheduler(suite.T())
|
suite.mockScheduler = task.NewMockScheduler(suite.T())
|
||||||
|
|
|
@ -64,6 +64,7 @@ func (b *RowCountBasedBalancer) AssignSegment(collectionID int64, segments []*me
|
||||||
return segments[i].GetNumOfRows() > segments[j].GetNumOfRows()
|
return segments[i].GetNumOfRows() > segments[j].GetNumOfRows()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
|
||||||
plans := make([]SegmentAssignPlan, 0, len(segments))
|
plans := make([]SegmentAssignPlan, 0, len(segments))
|
||||||
for _, s := range segments {
|
for _, s := range segments {
|
||||||
// pick the node with the least row count and allocate to it.
|
// pick the node with the least row count and allocate to it.
|
||||||
|
@ -74,6 +75,9 @@ func (b *RowCountBasedBalancer) AssignSegment(collectionID int64, segments []*me
|
||||||
Segment: s,
|
Segment: s,
|
||||||
}
|
}
|
||||||
plans = append(plans, plan)
|
plans = append(plans, plan)
|
||||||
|
if len(plans) > balanceBatchSize {
|
||||||
|
break
|
||||||
|
}
|
||||||
// change node's priority and push back
|
// change node's priority and push back
|
||||||
p := ni.getPriority()
|
p := ni.getPriority()
|
||||||
ni.setPriority(p + int(s.GetNumOfRows()))
|
ni.setPriority(p + int(s.GetNumOfRows()))
|
||||||
|
|
|
@ -82,6 +82,7 @@ func (b *ScoreBasedBalancer) AssignSegment(collectionID int64, segments []*meta.
|
||||||
return segments[i].GetNumOfRows() > segments[j].GetNumOfRows()
|
return segments[i].GetNumOfRows() > segments[j].GetNumOfRows()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
|
||||||
plans := make([]SegmentAssignPlan, 0, len(segments))
|
plans := make([]SegmentAssignPlan, 0, len(segments))
|
||||||
for _, s := range segments {
|
for _, s := range segments {
|
||||||
func(s *meta.Segment) {
|
func(s *meta.Segment) {
|
||||||
|
@ -112,6 +113,10 @@ func (b *ScoreBasedBalancer) AssignSegment(collectionID int64, segments []*meta.
|
||||||
}
|
}
|
||||||
targetNode.setPriority(targetNode.getPriority() + priorityChange)
|
targetNode.setPriority(targetNode.getPriority() + priorityChange)
|
||||||
}(s)
|
}(s)
|
||||||
|
|
||||||
|
if len(plans) > balanceBatchSize {
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return plans
|
return plans
|
||||||
}
|
}
|
||||||
|
@ -295,6 +300,8 @@ func (b *ScoreBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNodes [
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
balanceBatchSize := paramtable.Get().QueryCoordCfg.CollectionBalanceSegmentBatchSize.GetAsInt()
|
||||||
|
|
||||||
// find the segment from the node which has more score than the average
|
// find the segment from the node which has more score than the average
|
||||||
segmentsToMove := make([]*meta.Segment, 0)
|
segmentsToMove := make([]*meta.Segment, 0)
|
||||||
average := totalScore / len(onlineNodes)
|
average := totalScore / len(onlineNodes)
|
||||||
|
@ -309,6 +316,9 @@ func (b *ScoreBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNodes [
|
||||||
})
|
})
|
||||||
for _, s := range segments {
|
for _, s := range segments {
|
||||||
segmentsToMove = append(segmentsToMove, s)
|
segmentsToMove = append(segmentsToMove, s)
|
||||||
|
if len(segmentsToMove) >= balanceBatchSize {
|
||||||
|
break
|
||||||
|
}
|
||||||
leftScore -= b.calculateSegmentScore(s)
|
leftScore -= b.calculateSegmentScore(s)
|
||||||
if leftScore <= average {
|
if leftScore <= average {
|
||||||
break
|
break
|
||||||
|
|
|
@ -171,6 +171,11 @@ func (b *BalanceChecker) Check(ctx context.Context) []task.Task {
|
||||||
|
|
||||||
replicasToBalance := b.replicasToBalance()
|
replicasToBalance := b.replicasToBalance()
|
||||||
segmentPlans, channelPlans := b.balanceReplicas(replicasToBalance)
|
segmentPlans, channelPlans := b.balanceReplicas(replicasToBalance)
|
||||||
|
// iterate all collection to find a collection to balance
|
||||||
|
for len(segmentPlans) == 0 && len(channelPlans) == 0 && b.normalBalanceCollectionsCurrentRound.Len() > 0 {
|
||||||
|
replicasToBalance := b.replicasToBalance()
|
||||||
|
segmentPlans, channelPlans = b.balanceReplicas(replicasToBalance)
|
||||||
|
}
|
||||||
|
|
||||||
tasks := balance.CreateSegmentTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond), segmentPlans)
|
tasks := balance.CreateSegmentTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond), segmentPlans)
|
||||||
task.SetPriority(task.TaskPriorityLow, tasks...)
|
task.SetPriority(task.TaskPriorityLow, tasks...)
|
||||||
|
|
|
@ -1638,6 +1638,7 @@ type queryCoordConfig struct {
|
||||||
CollectionObserverInterval ParamItem `refreshable:"false"`
|
CollectionObserverInterval ParamItem `refreshable:"false"`
|
||||||
CheckExecutedFlagInterval ParamItem `refreshable:"false"`
|
CheckExecutedFlagInterval ParamItem `refreshable:"false"`
|
||||||
UpdateCollectionLoadStatusInterval ParamItem `refreshable:"false"`
|
UpdateCollectionLoadStatusInterval ParamItem `refreshable:"false"`
|
||||||
|
CollectionBalanceSegmentBatchSize ParamItem `refreshable:"true"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *queryCoordConfig) init(base *BaseTable) {
|
func (p *queryCoordConfig) init(base *BaseTable) {
|
||||||
|
@ -1890,7 +1891,7 @@ func (p *queryCoordConfig) init(base *BaseTable) {
|
||||||
p.BalanceCheckInterval = ParamItem{
|
p.BalanceCheckInterval = ParamItem{
|
||||||
Key: "queryCoord.checkBalanceInterval",
|
Key: "queryCoord.checkBalanceInterval",
|
||||||
Version: "2.3.0",
|
Version: "2.3.0",
|
||||||
DefaultValue: "10000",
|
DefaultValue: "3000",
|
||||||
PanicIfEmpty: true,
|
PanicIfEmpty: true,
|
||||||
Export: true,
|
Export: true,
|
||||||
}
|
}
|
||||||
|
@ -2162,6 +2163,15 @@ func (p *queryCoordConfig) init(base *BaseTable) {
|
||||||
Export: false,
|
Export: false,
|
||||||
}
|
}
|
||||||
p.CheckExecutedFlagInterval.Init(base.mgr)
|
p.CheckExecutedFlagInterval.Init(base.mgr)
|
||||||
|
|
||||||
|
p.CollectionBalanceSegmentBatchSize = ParamItem{
|
||||||
|
Key: "queryCoord.collectionBalanceSegmentBatchSize",
|
||||||
|
Version: "2.4.7",
|
||||||
|
DefaultValue: "5",
|
||||||
|
Doc: "the max balance task number for collection at each round",
|
||||||
|
Export: false,
|
||||||
|
}
|
||||||
|
p.CollectionBalanceSegmentBatchSize.Init(base.mgr)
|
||||||
}
|
}
|
||||||
|
|
||||||
// /////////////////////////////////////////////////////////////////////////////
|
// /////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
|
@ -339,6 +339,7 @@ func TestComponentParam(t *testing.T) {
|
||||||
params.Reset("queryCoord.checkExecutedFlagInterval")
|
params.Reset("queryCoord.checkExecutedFlagInterval")
|
||||||
|
|
||||||
assert.Equal(t, 0.3, Params.DelegatorMemoryOverloadFactor.GetAsFloat())
|
assert.Equal(t, 0.3, Params.DelegatorMemoryOverloadFactor.GetAsFloat())
|
||||||
|
assert.Equal(t, 5, Params.CollectionBalanceSegmentBatchSize.GetAsInt())
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("test queryNodeConfig", func(t *testing.T) {
|
t.Run("test queryNodeConfig", func(t *testing.T) {
|
||||||
|
|
Loading…
Reference in New Issue