milvus/internal/querycoordv2/checkers/balance_checker.go

312 lines
12 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package checkers
import (
"context"
"sort"
"strings"
"time"
"github.com/samber/lo"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/querycoordv2/balance"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// BalanceChecker checks the cluster distribution and generates balance tasks.
//
// It drives two kinds of balance: "stopping balance" for replicas that still
// report RO nodes, and "normal" (auto) balance for loaded collections. For each
// kind it remembers which collections were already picked in the current round
// so that successive calls walk through all collections before starting over.
type BalanceChecker struct {
// Embedded activation state.
// NOTE(review): presumably the source of the IsActive() method used to gate normal balance - confirm.
*checkerActivation
// Collection and replica metadata (GetAll, GetCollection, ReplicaManager).
meta *meta.Meta
// Stored by the constructor; not referenced by the methods visible in this file section.
nodeManager *session.NodeManager
// Receives every balance task generated by Check.
scheduler task.Scheduler
// Answers target existence/readiness queries and supplies collection row counts for sorting.
targetMgr meta.TargetManagerInterface
// Returns the balancer whose BalanceReplica produces the segment/channel plans.
getBalancerFunc GetBalancerFunc
// Collections already picked for normal balance in the current round; cleared when the round ends.
normalBalanceCollectionsCurrentRound typeutil.UniqueSet
// Collections already picked for stopping balance in the current round; cleared when the round ends.
stoppingBalanceCollectionsCurrentRound typeutil.UniqueSet
// Time of the last normal (auto) balance attempt; Check compares it against
// the configured auto balance interval before triggering the next one.
autoBalanceTs time.Time
}
// NewBalanceChecker creates a BalanceChecker wired to the given metadata,
// target manager, node manager, scheduler and balancer provider.
//
// Both per-round collection sets start empty and autoBalanceTs is left at its
// zero value.
func NewBalanceChecker(meta *meta.Meta,
	targetMgr meta.TargetManagerInterface,
	nodeMgr *session.NodeManager,
	scheduler task.Scheduler,
	getBalancerFunc GetBalancerFunc,
) *BalanceChecker {
	checker := new(BalanceChecker)

	checker.checkerActivation = newCheckerActivation()

	// Collaborators handed in by the caller.
	checker.meta = meta
	checker.targetMgr = targetMgr
	checker.nodeManager = nodeMgr
	checker.scheduler = scheduler
	checker.getBalancerFunc = getBalancerFunc

	// Fresh bookkeeping for the first balance round of each kind.
	checker.normalBalanceCollectionsCurrentRound = typeutil.NewUniqueSet()
	checker.stoppingBalanceCollectionsCurrentRound = typeutil.NewUniqueSet()

	return checker
}
// ID returns utils.BalanceChecker, the checker type that identifies this checker.
func (b *BalanceChecker) ID() utils.CheckerType {
return utils.BalanceChecker
}
// Description returns a fixed, human-readable one-line summary of this checker.
func (b *BalanceChecker) Description() string {
return "BalanceChecker checks the cluster distribution and generates balance tasks"
}
// readyToCheck reports whether the collection can be considered for balance:
// its metadata must exist and it must have either a next target or a current
// target (queried for all partitions).
func (b *BalanceChecker) readyToCheck(ctx context.Context, collectionID int64) bool {
	collection := b.meta.GetCollection(ctx, collectionID)

	// The target lookup is performed even when the collection metadata is
	// missing; the current target is only consulted if no next target exists.
	hasTarget := b.targetMgr.IsNextTargetExist(ctx, collectionID)
	if !hasTarget {
		hasTarget = b.targetMgr.IsCurrentTargetExist(ctx, collectionID, common.AllPartitionsID)
	}

	return collection != nil && hasTarget
}
// getReplicaForStoppingBalance picks the next collection that needs stopping
// balance and returns the IDs of its replicas that report at least one RO node.
//
// Collections are visited in the configured sort order. A collection is skipped
// when it is not ready to check or was already picked in the current round.
// The picked collection is recorded in stoppingBalanceCollectionsCurrentRound.
//
// It returns nil when stopping balance is disabled or no collection qualifies;
// in that case the current round is finished and the per-round set is cleared.
func (b *BalanceChecker) getReplicaForStoppingBalance(ctx context.Context) []int64 {
	// Stays true unless a collection is picked below.
	roundFinished := true
	defer func() {
		if roundFinished {
			b.stoppingBalanceCollectionsCurrentRound.Clear()
			log.RatedDebug(10, "BalanceChecker has triggered stopping balance for all "+
				"collections in one round, clear collectionIDs for this round")
		}
	}()

	// Order the candidates using the configured sort order.
	candidates := b.sortCollections(ctx, b.meta.GetAll(ctx))

	if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
		return nil
	}

	for _, collectionID := range candidates {
		// Skip collections whose meta/target isn't ready, and those already
		// handled in this round.
		if !b.readyToCheck(ctx, collectionID) || b.stoppingBalanceCollectionsCurrentRound.Contain(collectionID) {
			continue
		}

		var roReplicaIDs []int64
		for _, r := range b.meta.ReplicaManager.GetByCollection(ctx, collectionID) {
			if r.RONodesCount() > 0 {
				roReplicaIDs = append(roReplicaIDs, r.GetID())
			}
		}
		if len(roReplicaIDs) == 0 {
			continue
		}

		roundFinished = false
		b.stoppingBalanceCollectionsCurrentRound.Insert(collectionID)
		return roReplicaIDs
	}

	// No collection needs stopping balance: the round is over.
	return nil
}
// getReplicaForNormalBalance picks the next loaded collection that has not yet
// been balanced in the current round and returns the IDs of all its replicas.
//
// It returns nil, and finishes the current round, when auto balance is disabled,
// the checker is not active, or any loaded collection's current target is not
// ready. Otherwise it returns a non-nil (possibly empty) slice; if every loaded
// collection was already picked, the round is finished and the per-round set is
// cleared.
func (b *BalanceChecker) getReplicaForNormalBalance(ctx context.Context) []int64 {
	// Stays true unless a collection is picked below.
	roundFinished := true
	defer func() {
		if roundFinished {
			b.normalBalanceCollectionsCurrentRound.Clear()
			log.RatedDebug(10, "BalanceChecker has triggered normal balance for all "+
				"collections in one round, clear collectionIDs for this round")
		}
	}()

	// Normal balance requires both the auto balance switch and an active checker.
	if !(Params.QueryCoordCfg.AutoBalance.GetAsBool() && b.IsActive()) {
		return nil
	}

	// Only fully loaded collections take part; collections still loading are skipped.
	isLoaded := func(collectionID int64, _ int) bool {
		c := b.meta.GetCollection(ctx, collectionID)
		return c != nil && c.GetStatus() == querypb.LoadStatus_Loaded
	}
	loaded := lo.Filter(b.meta.GetAll(ctx), isLoaded)

	// If any loaded collection has an unready current target, skip balancing
	// altogether to avoid acting on inconsistent information.
	// todo: should also check distribution and leader view in the future
	targetNotReady := func(collectionID int64, _ int) bool {
		return !b.targetMgr.IsCurrentTargetReady(ctx, collectionID)
	}
	if pending := lo.Filter(loaded, targetNotReady); len(pending) > 0 {
		log.RatedInfo(10, "skip normal balance, cause collection not ready for balance", zap.Int64s("collectionIDs", pending))
		return nil
	}

	// Order the candidates using the configured sort order.
	loaded = b.sortCollections(ctx, loaded)

	// Pick exactly one collection per call.
	replicaIDs := []int64{}
	for _, collectionID := range loaded {
		if b.normalBalanceCollectionsCurrentRound.Contain(collectionID) {
			log.RatedDebug(10, "BalanceChecker is balancing this collection, skip balancing in this round",
				zap.Int64("collectionID", collectionID))
			continue
		}

		roundFinished = false
		b.normalBalanceCollectionsCurrentRound.Insert(collectionID)
		for _, r := range b.meta.ReplicaManager.GetByCollection(ctx, collectionID) {
			replicaIDs = append(replicaIDs, r.GetID())
		}
		return replicaIDs
	}
	return replicaIDs
}
// balanceReplicas asks the current balancer for a balance plan for each of the
// given replicas and returns the concatenated segment and channel plans.
//
// Replica IDs that the replica manager cannot resolve are skipped. Both
// returned slices are always non-nil.
func (b *BalanceChecker) balanceReplicas(ctx context.Context, replicaIDs []int64) ([]balance.SegmentAssignPlan, []balance.ChannelAssignPlan) {
	segmentPlans, channelPlans := make([]balance.SegmentAssignPlan, 0), make([]balance.ChannelAssignPlan, 0)
	for _, rid := range replicaIDs {
		replica := b.meta.ReplicaManager.Get(ctx, rid)
		if replica == nil {
			continue
		}

		sPlans, cPlans := b.getBalancerFunc().BalanceReplica(ctx, replica)
		// Decide on this replica's own plans, not the accumulated totals:
		// otherwise, once any replica has produced a plan, every following
		// replica would be reported even though it has nothing to report.
		if len(sPlans) == 0 && len(cPlans) == 0 {
			continue
		}

		segmentPlans = append(segmentPlans, sPlans...)
		channelPlans = append(channelPlans, cPlans...)
		balance.PrintNewBalancePlans(replica.GetCollectionID(), replica.GetID(), sPlans, cPlans)
	}
	return segmentPlans, channelPlans
}
// Check generates balance tasks and hands them to the scheduler.
//
// Notice: the balance checker may generate tasks for multiple collections in
// one call, so the generated tasks are submitted to the scheduler directly and
// the returned slice is always nil.
//
// Stopping balance takes precedence. Normal (auto) balance is only attempted
// when no replica needs stopping balance and the configured auto balance
// interval has elapsed since the last attempt.
func (b *BalanceChecker) Check(ctx context.Context) []task.Task {
	segmentBatchSize := paramtable.Get().QueryCoordCfg.BalanceSegmentBatchSize.GetAsInt()
	channelBatchSize := paramtable.Get().QueryCoordCfg.BalanceChannelBatchSize.GetAsInt()
	balanceOnMultipleCollections := paramtable.Get().QueryCoordCfg.EnableBalanceOnMultipleCollections.GetAsBool()

	pendingSegmentTasks := make([]task.Task, 0)
	pendingChannelTasks := make([]task.Task, 0)

	// collect turns the balance plans of the given replicas into tasks and
	// appends them to the pending lists.
	collect := func(replicaIDs []int64) {
		segmentPlans, channelPlans := b.balanceReplicas(ctx, replicaIDs)

		created := balance.CreateSegmentTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond), segmentPlans)
		task.SetPriority(task.TaskPriorityLow, created...)
		task.SetReason("segment unbalanced", created...)
		pendingSegmentTasks = append(pendingSegmentTasks, created...)

		created = balance.CreateChannelTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.ChannelTaskTimeout.GetAsDuration(time.Millisecond), channelPlans)
		task.SetReason("channel unbalanced", created...)
		pendingChannelTasks = append(pendingChannelTasks, created...)
	}

	// keepCollecting walks further collections until a batch limit is reached,
	// the current round is over (the per-round set is empty), or - when balance
	// on multiple collections is disabled - at least one task already exists.
	keepCollecting := func(nextReplicas func(context.Context) []int64, pickedInRound func() int) {
		for {
			if len(pendingSegmentTasks) >= segmentBatchSize ||
				len(pendingChannelTasks) >= channelBatchSize ||
				pickedInRound() <= 0 {
				return
			}
			hasTasks := len(pendingSegmentTasks) > 0 || len(pendingChannelTasks) > 0
			if hasTasks && !balanceOnMultipleCollections {
				return
			}
			collect(nextReplicas(ctx))
		}
	}

	if stoppingReplicas := b.getReplicaForStoppingBalance(ctx); len(stoppingReplicas) > 0 {
		// Stopping balance first.
		collect(stoppingReplicas)
		keepCollecting(b.getReplicaForStoppingBalance, func() int {
			return b.stoppingBalanceCollectionsCurrentRound.Len()
		})
	} else if time.Since(b.autoBalanceTs) > paramtable.Get().QueryCoordCfg.AutoBalanceInterval.GetAsDuration(time.Millisecond) {
		// Then auto balance, rate-limited by the configured interval.
		b.autoBalanceTs = time.Now()
		collect(b.getReplicaForNormalBalance(ctx))
		keepCollecting(b.getReplicaForNormalBalance, func() int {
			return b.normalBalanceCollectionsCurrentRound.Len()
		})
	}

	// Segment tasks are submitted before channel tasks.
	for _, t := range pendingSegmentTasks {
		b.scheduler.Add(t)
	}
	for _, t := range pendingChannelTasks {
		b.scheduler.Add(t)
	}
	return nil
}
// sortCollections sorts the given collection IDs in place according to the
// configured balance trigger order (matched case-insensitively) and returns
// the same slice.
//
//   - "byrowcount" (also used when the setting is empty): larger row count
//     first, ties broken by ascending collection ID.
//   - "bycollectionid": ascending collection ID.
//
// Any other value logs a warning and falls back to the row-count order.
func (b *BalanceChecker) sortCollections(ctx context.Context, collections []int64) []int64 {
	order := strings.ToLower(Params.QueryCoordCfg.BalanceTriggerOrder.GetValue())
	if len(order) == 0 {
		order = "byrowcount" // Default to ByRowCount
	}

	// Row counts are looked up once per collection, before sorting starts.
	rowCounts := make(map[int64]int64, len(collections))
	for idx := range collections {
		id := collections[idx]
		rowCounts[id] = b.targetMgr.GetCollectionRowCount(ctx, id, meta.CurrentTargetFirst)
	}

	byCollectionID := func(i, j int) bool {
		return collections[i] < collections[j]
	}
	byRowCount := func(i, j int) bool {
		left, right := rowCounts[collections[i]], rowCounts[collections[j]]
		if left != right {
			return left > right
		}
		return collections[i] < collections[j]
	}

	// Row-count order is the default; the switch only overrides it or warns.
	less := byRowCount
	switch order {
	case "byrowcount":
		// keep the default
	case "bycollectionid":
		less = byCollectionID
	default:
		log.Warn("Invalid balance sort order configuration, using default ByRowCount", zap.String("sortOrder", order))
	}

	sort.Slice(collections, less)
	return collections
}