// milvus/internal/datacoord/segment_allocation_policy.go
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package datacoord
import (
"context"
"fmt"
"math/rand"
"sort"
"time"
"github.com/cockroachdb/errors"
"github.com/samber/lo"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/tsoutil"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
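// calUpperLimitPolicy calculates the upper limit of rows a segment may hold for a given collection schema.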
type calUpperLimitPolicy func(schema *schemapb.CollectionSchema) (int, error)
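// calBySchemaPolicy derives the row limit from the configured segment max size and the estimated size per record.
// For example, with SegmentMaxSize = 1024 (MB) and an estimated 1024 bytes per record,
// the limit is 1024*1024*1024 / 1024 = 1,048,576 rows.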
func calBySchemaPolicy(schema *schemapb.CollectionSchema) (int, error) {
if schema == nil {
return -1, errors.New("nil schema")
}
sizePerRecord, err := typeutil.EstimateSizePerRecord(schema)
if err != nil {
return -1, err
}
// check zero value to avoid dividing by zero
if sizePerRecord == 0 {
return -1, errors.New("zero size record schema found")
}
threshold := Params.DataCoordCfg.SegmentMaxSize.GetAsFloat() * 1024 * 1024
return int(threshold / float64(sizePerRecord)), nil
}
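// calBySchemaPolicyWithDiskIndex is the same as calBySchemaPolicy, but uses the larger DiskSegmentMaxSize
// threshold intended for collections that use a disk index.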
func calBySchemaPolicyWithDiskIndex(schema *schemapb.CollectionSchema) (int, error) {
if schema == nil {
return -1, errors.New("nil schema")
}
sizePerRecord, err := typeutil.EstimateSizePerRecord(schema)
if err != nil {
return -1, err
}
// check zero value to avoid dividing by zero
if sizePerRecord == 0 {
return -1, errors.New("zero size record schema found")
}
threshold := Params.DataCoordCfg.DiskSegmentMaxSize.GetAsFloat() * 1024 * 1024
return int(threshold / float64(sizePerRecord)), nil
}
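// calBySegmentSizePolicy derives the row limit from an explicitly provided segment size (in bytes)
// instead of the globally configured maximum.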
func calBySegmentSizePolicy(schema *schemapb.CollectionSchema, segmentSize int64) (int, error) {
if schema == nil {
return -1, errors.New("nil schema")
}
sizePerRecord, err := typeutil.EstimateSizePerRecord(schema)
if err != nil {
return -1, err
}
// check zero value to avoid dividing by zero
if sizePerRecord == 0 {
return -1, errors.New("zero size record schema found")
}
return int(segmentSize) / sizePerRecord, nil
}
// AllocatePolicy is a helper function type used to allocate segment space
type AllocatePolicy func(segments []*SegmentInfo, count int64,
maxCountPerL1Segment int64, level datapb.SegmentLevel) ([]*Allocation, []*Allocation)
// AllocatePolicyL1 is the allocation policy for L1 segments.
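// For example (illustrative): with maxCountPerL1Segment = 1000, a request for 2500 rows produces two
// new-segment allocations of 1000 rows each, and the remaining 500 rows are placed on an existing
// segment with enough free capacity, or on a third new segment otherwise:
//
//	newAllocs, existedAllocs := AllocatePolicyL1(segments, 2500, 1000, datapb.SegmentLevel_L1)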
func AllocatePolicyL1(segments []*SegmentInfo, count int64,
maxCountPerL1Segment int64, level datapb.SegmentLevel,
) ([]*Allocation, []*Allocation) {
newSegmentAllocations := make([]*Allocation, 0)
existedSegmentAllocations := make([]*Allocation, 0)
// create new segment if count >= max num
for count >= maxCountPerL1Segment {
allocation := getAllocation(maxCountPerL1Segment)
newSegmentAllocations = append(newSegmentAllocations, allocation)
count -= maxCountPerL1Segment
}
// allocate space for remaining count
if count == 0 {
return newSegmentAllocations, existedSegmentAllocations
}
for _, segment := range segments {
var allocSize int64
for _, allocation := range segment.allocations {
allocSize += allocation.NumOfRows
}
// When inserts are too fast, hardTimeTick may lag and a segment may not be sealed in time.
// To avoid allocating into a segment that should already be sealed, the sealProportion factor is applied here.
// Skipping segments where `free < 0`, i.e. those already past the seal-proportion threshold,
// prevents them from growing further and remaining unsealable indefinitely.
maxRowsWithSealProportion := int64(float64(segment.GetMaxRowNum()) * paramtable.Get().DataCoordCfg.SegmentSealProportion.GetAsFloat())
free := maxRowsWithSealProportion - segment.GetNumOfRows() - allocSize
if free < 0 {
continue
}
free = segment.GetMaxRowNum() - segment.GetNumOfRows() - allocSize
if free < count {
continue
}
allocation := getAllocation(count)
allocation.SegmentID = segment.GetID()
existedSegmentAllocations = append(existedSegmentAllocations, allocation)
return newSegmentAllocations, existedSegmentAllocations
}
// allocate new segment for remaining count
allocation := getAllocation(count)
newSegmentAllocations = append(newSegmentAllocations, allocation)
return newSegmentAllocations, existedSegmentAllocations
}
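// SegmentSealPolicy decides whether a single segment should be sealed at the given timestamp,
// returning the decision together with a human-readable reason.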
type SegmentSealPolicy interface {
ShouldSeal(segment *SegmentInfo, ts Timestamp) (bool, string)
}
// segmentSealPolicyFunc is a seal policy that applies to a single segment
type segmentSealPolicyFunc func(segment *SegmentInfo, ts Timestamp) (bool, string)
func (f segmentSealPolicyFunc) ShouldSeal(segment *SegmentInfo, ts Timestamp) (bool, string) {
return f(segment, ts)
}
// sealL1SegmentByCapacity returns a segmentSealPolicyFunc that seals a segment once its row count
// reaches sizeFactor of its max row number, with a random jitter applied.
// TODO: change numOfRows to size
func sealL1SegmentByCapacity(sizeFactor float64) segmentSealPolicyFunc {
return func(segment *SegmentInfo, ts Timestamp) (bool, string) {
jitter := paramtable.Get().DataCoordCfg.SegmentSealProportionJitter.GetAsFloat()
ratio := (1 - jitter*rand.Float64())
return float64(segment.GetNumOfRows()) >= sizeFactor*float64(segment.GetMaxRowNum())*ratio,
fmt.Sprintf("Row count capacity full, current rows: %d, max row: %d, seal factor: %f, jitter ratio: %f", segment.GetNumOfRows(), segment.GetMaxRowNum(), sizeFactor, ratio)
}
}
// sealL1SegmentByLifetime returns a segmentSealPolicyFunc that seals a segment once the time elapsed
// since its start position exceeds the configured max segment lifetime.
func sealL1SegmentByLifetime() segmentSealPolicyFunc {
return func(segment *SegmentInfo, ts Timestamp) (bool, string) {
if segment.GetStartPosition() == nil {
return false, ""
}
lifetime := Params.DataCoordCfg.SegmentMaxLifetime.GetAsDuration(time.Second)
pts, _ := tsoutil.ParseTS(ts)
epts, _ := tsoutil.ParseTS(segment.GetStartPosition().GetTimestamp())
// epts, _ := tsoutil.ParseTS(segment.GetLastExpireTime())
d := pts.Sub(epts)
return d >= lifetime,
fmt.Sprintf("Segment Lifetime expired, segment last expire: %v, now:%v, max lifetime %v",
pts, epts, lifetime)
}
}
// sealL1SegmentByBinlogFileNumber seals an L1 segment if its binlog file number exceeds the configured maximum
func sealL1SegmentByBinlogFileNumber(maxBinlogFileNumber int) segmentSealPolicyFunc {
return func(segment *SegmentInfo, ts Timestamp) (bool, string) {
logFileCounter := 0
for _, fieldBinlog := range segment.GetBinlogs() {
// Only count the binlog files of the first field, which equals the binlog file count of the primary key field;
// this removes the multiplier introduced by the number of fields.
logFileCounter += len(fieldBinlog.GetBinlogs())
break
}
return logFileCounter >= maxBinlogFileNumber,
fmt.Sprintf("Segment binlog number too large, binlog number: %d, max binlog number: %d", logFileCounter, maxBinlogFileNumber)
}
}
// sealL1SegmentByIdleTime seals a segment that was written to at a high frequency before but has since gone idle.
// It serves this case:
// Users insert entities into a segment continuously for a period of time, but never flush/(seal) it, and the
// segment never reaches the seal proportion. In this situation, Milvus would otherwise wait for the segment to
// expire, and during that period search latency may be a little high. We can assume that no more entities will be
// inserted into this segment, so sealL1SegmentByIdleTime seals it to trigger handoff to the query cluster.
// Q: Why don't we decrease the expiry time directly?
// A: We don't want to affect segments that are still accepting frequent small batches of entities.
// TODO: replace rowNum with segment size
func sealL1SegmentByIdleTime(idleTimeTolerance time.Duration, minSizeToSealIdleSegment float64, maxSizeOfSegment float64) segmentSealPolicyFunc {
return func(segment *SegmentInfo, ts Timestamp) (bool, string) {
limit := (minSizeToSealIdleSegment / maxSizeOfSegment) * float64(segment.GetMaxRowNum())
return time.Since(segment.lastWrittenTime) > idleTimeTolerance &&
float64(segment.GetNumOfRows()) > limit,
fmt.Sprintf("segment idle, segment row number :%d, last written time: %v, max idle duration: %v", segment.GetNumOfRows(), segment.lastWrittenTime, idleTimeTolerance)
}
}
// channelSealPolicy seal policy applies to channel
type channelSealPolicy func(string, []*SegmentInfo, Timestamp) ([]*SegmentInfo, string)
// getChannelOpenSegCapacityPolicy returns a channelSealPolicy that limits the number of open segments in a channel
func getChannelOpenSegCapacityPolicy(limit int) channelSealPolicy {
return func(channel string, segs []*SegmentInfo, ts Timestamp) ([]*SegmentInfo, string) {
if len(segs) <= limit {
return []*SegmentInfo{}, ""
}
sortSegmentsByLastExpires(segs)
offLen := len(segs) - limit
if offLen > len(segs) {
offLen = len(segs)
}
return segs[0:offLen], fmt.Sprintf("seal by channel segment capacity, len(segs)=%d, limit=%d", len(segs), limit)
}
}
// sealByTotalGrowingSegmentsSize seals the largest growing segment
// if the total size of growing segments exceeds the threshold.
func sealByTotalGrowingSegmentsSize() channelSealPolicy {
return func(channel string, segments []*SegmentInfo, ts Timestamp) ([]*SegmentInfo, string) {
growingSegments := lo.Filter(segments, func(segment *SegmentInfo, _ int) bool {
return segment != nil && segment.GetState() == commonpb.SegmentState_Growing
})
var totalSize int64
sizeMap := lo.SliceToMap(growingSegments, func(segment *SegmentInfo) (int64, int64) {
size := segment.getSegmentSize()
totalSize += size
return segment.GetID(), size
})
threshold := paramtable.Get().DataCoordCfg.GrowingSegmentsMemSizeInMB.GetAsInt64() * 1024 * 1024
if totalSize >= threshold {
target := lo.MaxBy(growingSegments, func(s1, s2 *SegmentInfo) bool {
return sizeMap[s1.GetID()] > sizeMap[s2.GetID()]
})
return []*SegmentInfo{target}, fmt.Sprintf("seal by total growing segments size, "+
"totalSize=%d, threshold=%d", totalSize, threshold)
}
return nil, ""
}
}
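// sealByBlockingL0 seals growing segments whose start positions overlap enough L0 deltalog data
// (by size or entry number) to block L0 compaction, based on the BlockingL0SizeInMB and
// BlockingL0EntryNum limits.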
func sealByBlockingL0(meta *meta) channelSealPolicy {
return func(channel string, segments []*SegmentInfo, _ Timestamp) ([]*SegmentInfo, string) {
if len(segments) == 0 {
return nil, ""
}
sizeLimit := paramtable.Get().DataCoordCfg.BlockingL0SizeInMB.GetAsInt64() * 1024 * 1024 // MB to bytes
entryNumLimit := paramtable.Get().DataCoordCfg.BlockingL0EntryNum.GetAsInt64()
if sizeLimit < 0 && entryNumLimit < 0 {
// both policies are disabled, just return
return nil, ""
}
isLimitMet := func(blockingSize, blockingEntryNum int64) bool {
return (sizeLimit < 0 || blockingSize < sizeLimit) &&
(entryNumLimit < 0 || blockingEntryNum < entryNumLimit)
}
l0segments := meta.SelectSegments(context.TODO(), WithChannel(channel), SegmentFilterFunc(func(segment *SegmentInfo) bool {
return segment.GetLevel() == datapb.SegmentLevel_L0
}))
// sort growing segments by start position
sortSegmentsByStartPosition(segments)
// example:
// G1 [0-----------------------------
// G2 [7-------------------
// G3 [4-------------------------
// G4 [10--------
// L0a [0-----5]
// L0b [6-------9]
// L0c [10------20]
// say L0a&L0b make total size/num exceed limit,
// we shall seal G1,G2,G3 since they have overlap ts range blocking l0 compaction
// calculate size & num
id2Size := lo.SliceToMap(l0segments, func(l0Segment *SegmentInfo) (int64, int64) {
return l0Segment.GetID(), int64(GetBinlogSizeAsBytes(l0Segment.GetDeltalogs()))
})
id2EntryNum := lo.SliceToMap(l0segments, func(l0Segment *SegmentInfo) (int64, int64) {
return l0Segment.GetID(), int64(GetBinlogEntriesNum(l0Segment.GetDeltalogs()))
})
// util func to calculate blocking statistics
blockingStats := func(l0Segments []*SegmentInfo, minStartTs uint64) (blockingSize int64, blockingEntryNum int64) {
for _, l0Segment := range l0Segments {
if l0Segment.GetDmlPosition().GetTimestamp() >= minStartTs {
blockingSize += id2Size[l0Segment.GetID()]
blockingEntryNum += id2EntryNum[l0Segment.GetID()]
}
}
return blockingSize, blockingEntryNum
}
candidates := segments
var result []*SegmentInfo
for len(candidates) > 0 {
// skip segments with nil start position
if candidates[0].GetStartPosition() == nil {
candidates = candidates[1:]
continue
}
// candidates[0] has the minimal start position, since growing segments are sorted by start position
blockingSize, blockingEntryNum := blockingStats(l0segments, candidates[0].GetStartPosition().GetTimestamp())
// if remaining blocking size and num are both less than configured limit, skip sealing segments
if isLimitMet(blockingSize, blockingEntryNum) {
break
}
result = append(result, candidates[0])
candidates = candidates[1:]
}
return result, "seal segments due to blocking l0 size/num"
}
}
// sortSegmentsByLastExpires sorts segments by lastExpireTime in ascending order
func sortSegmentsByLastExpires(segs []*SegmentInfo) {
sort.Slice(segs, func(i, j int) bool {
return segs[i].LastExpireTime < segs[j].LastExpireTime
})
}
// sortSegmentsByStartPosition sorts segments by start position timestamp in ascending order
func sortSegmentsByStartPosition(segs []*SegmentInfo) {
sort.Slice(segs, func(i, j int) bool {
return segs[i].GetStartPosition().GetTimestamp() < segs[j].GetStartPosition().GetTimestamp()
})
}
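// flushPolicy decides whether a segment should be flushed at the given timestamp.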
type flushPolicy func(segment *SegmentInfo, t Timestamp) bool
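// flushPolicyL1 flushes a sealed, non-L0, non-importing segment that contains data once its last expire time
// has passed and the configured flush interval since the last flush has elapsed.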
func flushPolicyL1(segment *SegmentInfo, t Timestamp) bool {
return segment.GetState() == commonpb.SegmentState_Sealed &&
segment.Level != datapb.SegmentLevel_L0 &&
time.Since(segment.lastFlushTime) >= paramtable.Get().DataCoordCfg.SegmentFlushInterval.GetAsDuration(time.Second) &&
segment.GetLastExpireTime() <= t &&
segment.GetNumOfRows() != 0 &&
// The importing segment is decoupled from the flush process;
// this check avoids notifying the datanode to flush an
// importing segment which may not exist.
!segment.GetIsImporting()
}