mirror of https://github.com/milvus-io/milvus.git
312 lines
12 KiB
Go
312 lines
12 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package checkers
|
|
|
|
import (
|
|
"context"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/samber/lo"
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/balance"
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
|
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
|
|
"github.com/milvus-io/milvus/pkg/v2/common"
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
// BalanceChecker checks the cluster distribution and generates balance tasks.
|
|
type BalanceChecker struct {
|
|
*checkerActivation
|
|
meta *meta.Meta
|
|
nodeManager *session.NodeManager
|
|
scheduler task.Scheduler
|
|
targetMgr meta.TargetManagerInterface
|
|
getBalancerFunc GetBalancerFunc
|
|
|
|
normalBalanceCollectionsCurrentRound typeutil.UniqueSet
|
|
stoppingBalanceCollectionsCurrentRound typeutil.UniqueSet
|
|
|
|
// record auto balance ts
|
|
autoBalanceTs time.Time
|
|
}
|
|
|
|
func NewBalanceChecker(meta *meta.Meta,
|
|
targetMgr meta.TargetManagerInterface,
|
|
nodeMgr *session.NodeManager,
|
|
scheduler task.Scheduler,
|
|
getBalancerFunc GetBalancerFunc,
|
|
) *BalanceChecker {
|
|
return &BalanceChecker{
|
|
checkerActivation: newCheckerActivation(),
|
|
meta: meta,
|
|
targetMgr: targetMgr,
|
|
nodeManager: nodeMgr,
|
|
normalBalanceCollectionsCurrentRound: typeutil.NewUniqueSet(),
|
|
stoppingBalanceCollectionsCurrentRound: typeutil.NewUniqueSet(),
|
|
scheduler: scheduler,
|
|
getBalancerFunc: getBalancerFunc,
|
|
}
|
|
}
|
|
|
|
func (b *BalanceChecker) ID() utils.CheckerType {
|
|
return utils.BalanceChecker
|
|
}
|
|
|
|
func (b *BalanceChecker) Description() string {
|
|
return "BalanceChecker checks the cluster distribution and generates balance tasks"
|
|
}
|
|
|
|
func (b *BalanceChecker) readyToCheck(ctx context.Context, collectionID int64) bool {
|
|
metaExist := (b.meta.GetCollection(ctx, collectionID) != nil)
|
|
targetExist := b.targetMgr.IsNextTargetExist(ctx, collectionID) || b.targetMgr.IsCurrentTargetExist(ctx, collectionID, common.AllPartitionsID)
|
|
|
|
return metaExist && targetExist
|
|
}
|
|
|
|
func (b *BalanceChecker) getReplicaForStoppingBalance(ctx context.Context) []int64 {
|
|
hasUnbalancedCollection := false
|
|
defer func() {
|
|
if !hasUnbalancedCollection {
|
|
b.stoppingBalanceCollectionsCurrentRound.Clear()
|
|
log.RatedDebug(10, "BalanceChecker has triggered stopping balance for all "+
|
|
"collections in one round, clear collectionIDs for this round")
|
|
}
|
|
}()
|
|
|
|
ids := b.meta.GetAll(ctx)
|
|
// Sort collections using the configured sort order
|
|
ids = b.sortCollections(ctx, ids)
|
|
|
|
if paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
|
for _, cid := range ids {
|
|
// if target and meta isn't ready, skip balance this collection
|
|
if !b.readyToCheck(ctx, cid) {
|
|
continue
|
|
}
|
|
if b.stoppingBalanceCollectionsCurrentRound.Contain(cid) {
|
|
continue
|
|
}
|
|
|
|
replicas := b.meta.ReplicaManager.GetByCollection(ctx, cid)
|
|
stoppingReplicas := make([]int64, 0)
|
|
for _, replica := range replicas {
|
|
if replica.RONodesCount() > 0 {
|
|
stoppingReplicas = append(stoppingReplicas, replica.GetID())
|
|
}
|
|
}
|
|
if len(stoppingReplicas) > 0 {
|
|
hasUnbalancedCollection = true
|
|
b.stoppingBalanceCollectionsCurrentRound.Insert(cid)
|
|
return stoppingReplicas
|
|
}
|
|
}
|
|
}
|
|
|
|
// finish current round for stopping balance if no unbalanced collection
|
|
hasUnbalancedCollection = false
|
|
return nil
|
|
}
|
|
|
|
func (b *BalanceChecker) getReplicaForNormalBalance(ctx context.Context) []int64 {
|
|
hasUnbalancedCollection := false
|
|
defer func() {
|
|
if !hasUnbalancedCollection {
|
|
b.normalBalanceCollectionsCurrentRound.Clear()
|
|
log.RatedDebug(10, "BalanceChecker has triggered normal balance for all "+
|
|
"collections in one round, clear collectionIDs for this round")
|
|
}
|
|
}()
|
|
|
|
// 1. no stopping balance and auto balance is disabled, return empty collections for balance
|
|
// 2. when balancer isn't active, skip auto balance
|
|
if !Params.QueryCoordCfg.AutoBalance.GetAsBool() || !b.IsActive() {
|
|
// finish current round for normal balance if normal balance isn't triggered
|
|
hasUnbalancedCollection = false
|
|
return nil
|
|
}
|
|
|
|
ids := b.meta.GetAll(ctx)
|
|
// all replicas belonging to loading collection will be skipped
|
|
loadedCollections := lo.Filter(ids, func(cid int64, _ int) bool {
|
|
collection := b.meta.GetCollection(ctx, cid)
|
|
return collection != nil && collection.GetStatus() == querypb.LoadStatus_Loaded
|
|
})
|
|
|
|
// Before performing balancing, check the CurrentTarget/LeaderView/Distribution for all collections.
|
|
// If any collection has unready info, skip the balance operation to avoid inconsistencies.
|
|
notReadyCollections := lo.Filter(loadedCollections, func(cid int64, _ int) bool {
|
|
// todo: should also check distribution and leader view in the future
|
|
return !b.targetMgr.IsCurrentTargetReady(ctx, cid)
|
|
})
|
|
if len(notReadyCollections) > 0 {
|
|
// finish current round for normal balance if any collection isn't ready
|
|
hasUnbalancedCollection = false
|
|
log.RatedInfo(10, "skip normal balance, cause collection not ready for balance", zap.Int64s("collectionIDs", notReadyCollections))
|
|
return nil
|
|
}
|
|
|
|
// Sort collections using the configured sort order
|
|
loadedCollections = b.sortCollections(ctx, loadedCollections)
|
|
|
|
// iterator one normal collection in one round
|
|
normalReplicasToBalance := make([]int64, 0)
|
|
for _, cid := range loadedCollections {
|
|
if b.normalBalanceCollectionsCurrentRound.Contain(cid) {
|
|
log.RatedDebug(10, "BalanceChecker is balancing this collection, skip balancing in this round",
|
|
zap.Int64("collectionID", cid))
|
|
continue
|
|
}
|
|
hasUnbalancedCollection = true
|
|
b.normalBalanceCollectionsCurrentRound.Insert(cid)
|
|
for _, replica := range b.meta.ReplicaManager.GetByCollection(ctx, cid) {
|
|
normalReplicasToBalance = append(normalReplicasToBalance, replica.GetID())
|
|
}
|
|
break
|
|
}
|
|
return normalReplicasToBalance
|
|
}
|
|
|
|
func (b *BalanceChecker) balanceReplicas(ctx context.Context, replicaIDs []int64) ([]balance.SegmentAssignPlan, []balance.ChannelAssignPlan) {
|
|
segmentPlans, channelPlans := make([]balance.SegmentAssignPlan, 0), make([]balance.ChannelAssignPlan, 0)
|
|
for _, rid := range replicaIDs {
|
|
replica := b.meta.ReplicaManager.Get(ctx, rid)
|
|
if replica == nil {
|
|
continue
|
|
}
|
|
sPlans, cPlans := b.getBalancerFunc().BalanceReplica(ctx, replica)
|
|
segmentPlans = append(segmentPlans, sPlans...)
|
|
channelPlans = append(channelPlans, cPlans...)
|
|
if len(segmentPlans) != 0 || len(channelPlans) != 0 {
|
|
balance.PrintNewBalancePlans(replica.GetCollectionID(), replica.GetID(), sPlans, cPlans)
|
|
}
|
|
}
|
|
return segmentPlans, channelPlans
|
|
}
|
|
|
|
// Notice: balance checker will generate tasks for multiple collections in one round,
|
|
// so generated tasks will be submitted to scheduler directly, and return nil
|
|
func (b *BalanceChecker) Check(ctx context.Context) []task.Task {
|
|
segmentBatchSize := paramtable.Get().QueryCoordCfg.BalanceSegmentBatchSize.GetAsInt()
|
|
channelBatchSize := paramtable.Get().QueryCoordCfg.BalanceChannelBatchSize.GetAsInt()
|
|
balanceOnMultipleCollections := paramtable.Get().QueryCoordCfg.EnableBalanceOnMultipleCollections.GetAsBool()
|
|
|
|
segmentTasks := make([]task.Task, 0)
|
|
channelTasks := make([]task.Task, 0)
|
|
|
|
generateBalanceTaskForReplicas := func(replicas []int64) {
|
|
segmentPlans, channelPlans := b.balanceReplicas(ctx, replicas)
|
|
tasks := balance.CreateSegmentTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond), segmentPlans)
|
|
task.SetPriority(task.TaskPriorityLow, tasks...)
|
|
task.SetReason("segment unbalanced", tasks...)
|
|
segmentTasks = append(segmentTasks, tasks...)
|
|
|
|
tasks = balance.CreateChannelTasksFromPlans(ctx, b.ID(), Params.QueryCoordCfg.ChannelTaskTimeout.GetAsDuration(time.Millisecond), channelPlans)
|
|
task.SetReason("channel unbalanced", tasks...)
|
|
channelTasks = append(channelTasks, tasks...)
|
|
}
|
|
|
|
stoppingReplicas := b.getReplicaForStoppingBalance(ctx)
|
|
if len(stoppingReplicas) > 0 {
|
|
// check for stopping balance first
|
|
generateBalanceTaskForReplicas(stoppingReplicas)
|
|
// iterate all collection to find a collection to balance
|
|
for len(segmentTasks) < segmentBatchSize && len(channelTasks) < channelBatchSize && b.stoppingBalanceCollectionsCurrentRound.Len() > 0 {
|
|
if !balanceOnMultipleCollections && (len(segmentTasks) > 0 || len(channelTasks) > 0) {
|
|
// if balance on multiple collections is disabled, and there are already some tasks, break
|
|
break
|
|
}
|
|
replicasToBalance := b.getReplicaForStoppingBalance(ctx)
|
|
generateBalanceTaskForReplicas(replicasToBalance)
|
|
}
|
|
} else {
|
|
// then check for auto balance
|
|
if time.Since(b.autoBalanceTs) > paramtable.Get().QueryCoordCfg.AutoBalanceInterval.GetAsDuration(time.Millisecond) {
|
|
b.autoBalanceTs = time.Now()
|
|
replicasToBalance := b.getReplicaForNormalBalance(ctx)
|
|
generateBalanceTaskForReplicas(replicasToBalance)
|
|
// iterate all collection to find a collection to balance
|
|
for len(segmentTasks) < segmentBatchSize && len(channelTasks) < channelBatchSize && b.normalBalanceCollectionsCurrentRound.Len() > 0 {
|
|
if !balanceOnMultipleCollections && (len(segmentTasks) > 0 || len(channelTasks) > 0) {
|
|
// if balance on multiple collections is disabled, and there are already some tasks, break
|
|
break
|
|
}
|
|
replicasToBalance := b.getReplicaForNormalBalance(ctx)
|
|
generateBalanceTaskForReplicas(replicasToBalance)
|
|
}
|
|
}
|
|
}
|
|
|
|
for _, task := range segmentTasks {
|
|
b.scheduler.Add(task)
|
|
}
|
|
|
|
for _, task := range channelTasks {
|
|
b.scheduler.Add(task)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (b *BalanceChecker) sortCollections(ctx context.Context, collections []int64) []int64 {
|
|
sortOrder := strings.ToLower(Params.QueryCoordCfg.BalanceTriggerOrder.GetValue())
|
|
if sortOrder == "" {
|
|
sortOrder = "byrowcount" // Default to ByRowCount
|
|
}
|
|
|
|
collectionRowCountMap := make(map[int64]int64)
|
|
for _, cid := range collections {
|
|
collectionRowCountMap[cid] = b.targetMgr.GetCollectionRowCount(ctx, cid, meta.CurrentTargetFirst)
|
|
}
|
|
|
|
// Define sorting functions
|
|
sortByRowCount := func(i, j int) bool {
|
|
rowCount1 := collectionRowCountMap[collections[i]]
|
|
rowCount2 := collectionRowCountMap[collections[j]]
|
|
return rowCount1 > rowCount2 || (rowCount1 == rowCount2 && collections[i] < collections[j])
|
|
}
|
|
|
|
sortByCollectionID := func(i, j int) bool {
|
|
return collections[i] < collections[j]
|
|
}
|
|
|
|
// Select the appropriate sorting function
|
|
var sortFunc func(i, j int) bool
|
|
switch sortOrder {
|
|
case "byrowcount":
|
|
sortFunc = sortByRowCount
|
|
case "bycollectionid":
|
|
sortFunc = sortByCollectionID
|
|
default:
|
|
log.Warn("Invalid balance sort order configuration, using default ByRowCount", zap.String("sortOrder", sortOrder))
|
|
sortFunc = sortByRowCount
|
|
}
|
|
|
|
// Sort the collections
|
|
sort.Slice(collections, sortFunc)
|
|
return collections
|
|
}
|