2021-10-25 11:48:23 +00:00
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
2021-06-08 11:25:37 +00:00
// with the License. You may obtain a copy of the License at
//
2021-10-25 11:48:23 +00:00
// http://www.apache.org/licenses/LICENSE-2.0
2021-06-08 11:25:37 +00:00
//
2021-10-25 11:48:23 +00:00
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2021-06-08 11:25:37 +00:00
2021-06-22 02:42:07 +00:00
package datacoord
2021-05-21 10:30:41 +00:00
import (
2024-07-11 03:40:51 +00:00
"fmt"
2024-07-15 06:47:39 +00:00
"math/rand"
2021-06-08 11:25:37 +00:00
"sort"
2021-08-19 06:08:10 +00:00
"time"
2021-06-08 11:25:37 +00:00
2023-02-26 03:31:49 +00:00
"github.com/cockroachdb/errors"
2024-07-17 13:45:41 +00:00
"github.com/samber/lo"
2023-02-26 03:31:49 +00:00
2023-06-08 17:28:37 +00:00
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
2023-11-06 07:26:16 +00:00
"github.com/milvus-io/milvus/internal/proto/datapb"
2024-07-12 10:59:35 +00:00
"github.com/milvus-io/milvus/pkg/util/paramtable"
2023-09-21 01:45:27 +00:00
"github.com/milvus-io/milvus/pkg/util/tsoutil"
2023-04-06 11:14:32 +00:00
"github.com/milvus-io/milvus/pkg/util/typeutil"
2021-05-21 10:30:41 +00:00
)
2021-07-12 09:24:25 +00:00
// calUpperLimitPolicy is the signature of a function that derives an integer
// upper limit from a collection schema, or reports an error when it cannot.
// calBySchemaPolicy and calBySchemaPolicyWithDiskIndex in this file match this
// signature and return the number of rows that fit into one segment.
type calUpperLimitPolicy func ( schema * schemapb . CollectionSchema ) ( int , error )
2021-05-21 10:30:41 +00:00
2021-07-12 09:24:25 +00:00
func calBySchemaPolicy ( schema * schemapb . CollectionSchema ) ( int , error ) {
2021-08-20 09:50:12 +00:00
if schema == nil {
return - 1 , errors . New ( "nil schema" )
}
2021-05-21 10:30:41 +00:00
sizePerRecord , err := typeutil . EstimateSizePerRecord ( schema )
if err != nil {
return - 1 , err
}
2021-08-20 09:50:12 +00:00
// check zero value, preventing panicking
if sizePerRecord == 0 {
return - 1 , errors . New ( "zero size record schema found" )
}
2022-12-07 10:01:19 +00:00
threshold := Params . DataCoordCfg . SegmentMaxSize . GetAsFloat ( ) * 1024 * 1024
2021-05-21 10:30:41 +00:00
return int ( threshold / float64 ( sizePerRecord ) ) , nil
}
2022-09-25 12:12:52 +00:00
func calBySchemaPolicyWithDiskIndex ( schema * schemapb . CollectionSchema ) ( int , error ) {
if schema == nil {
return - 1 , errors . New ( "nil schema" )
}
sizePerRecord , err := typeutil . EstimateSizePerRecord ( schema )
if err != nil {
return - 1 , err
}
// check zero value, preventing panicking
if sizePerRecord == 0 {
return - 1 , errors . New ( "zero size record schema found" )
}
2022-12-07 10:01:19 +00:00
threshold := Params . DataCoordCfg . DiskSegmentMaxSize . GetAsFloat ( ) * 1024 * 1024
2022-09-25 12:12:52 +00:00
return int ( threshold / float64 ( sizePerRecord ) ) , nil
}
2024-08-01 14:04:14 +00:00
// calBySegmentSizePolicy returns how many records of the given schema fit into
// segmentSize, using integer division of segmentSize by the estimated size of
// one record.
//
// It returns -1 together with an error when schema is nil, when the record
// size cannot be estimated, or when the estimated record size is zero.
func calBySegmentSizePolicy(schema *schemapb.CollectionSchema, segmentSize int64) (int, error) {
	if schema == nil {
		return -1, errors.New("nil schema")
	}

	perRecord, err := typeutil.EstimateSizePerRecord(schema)
	if err != nil {
		return -1, err
	}

	// An integer division by zero would panic, so bail out first.
	if perRecord == 0 {
		return -1, errors.New("zero size record schema found")
	}

	capacity := int(segmentSize)
	return capacity / perRecord, nil
}
2021-09-29 12:52:09 +00:00
// AllocatePolicy helper function definition to allocate Segment space
2021-07-23 13:58:33 +00:00
type AllocatePolicy func ( segments [ ] * SegmentInfo , count int64 ,
2023-11-06 07:26:16 +00:00
maxCountPerL1Segment int64 , level datapb . SegmentLevel ) ( [ ] * Allocation , [ ] * Allocation )
2021-05-21 10:30:41 +00:00
2023-11-06 07:26:16 +00:00
// alloca policy for L1 segment
func AllocatePolicyL1 ( segments [ ] * SegmentInfo , count int64 ,
maxCountPerL1Segment int64 , level datapb . SegmentLevel ,
2023-09-21 01:45:27 +00:00
) ( [ ] * Allocation , [ ] * Allocation ) {
2021-07-23 13:58:33 +00:00
newSegmentAllocations := make ( [ ] * Allocation , 0 )
existedSegmentAllocations := make ( [ ] * Allocation , 0 )
// create new segment if count >= max num
2023-11-06 07:26:16 +00:00
for count >= maxCountPerL1Segment {
allocation := getAllocation ( maxCountPerL1Segment )
2021-07-23 13:58:33 +00:00
newSegmentAllocations = append ( newSegmentAllocations , allocation )
2023-11-06 07:26:16 +00:00
count -= maxCountPerL1Segment
2021-07-23 13:58:33 +00:00
}
// allocate space for remaining count
if count == 0 {
return newSegmentAllocations , existedSegmentAllocations
}
for _ , segment := range segments {
var allocSize int64
for _ , allocation := range segment . allocations {
allocSize += allocation . NumOfRows
}
free := segment . GetMaxRowNum ( ) - segment . GetNumOfRows ( ) - allocSize
if free < count {
continue
}
2021-09-07 05:59:58 +00:00
allocation := getAllocation ( count )
allocation . SegmentID = segment . GetID ( )
2021-07-23 13:58:33 +00:00
existedSegmentAllocations = append ( existedSegmentAllocations , allocation )
return newSegmentAllocations , existedSegmentAllocations
}
// allocate new segment for remaining count
2021-09-07 05:59:58 +00:00
allocation := getAllocation ( count )
2021-07-23 13:58:33 +00:00
newSegmentAllocations = append ( newSegmentAllocations , allocation )
return newSegmentAllocations , existedSegmentAllocations
2021-05-21 10:30:41 +00:00
}
2024-07-11 03:40:51 +00:00
// SegmentSealPolicy decides whether a single segment should be sealed.
type SegmentSealPolicy interface {
// ShouldSeal reports whether segment should be sealed at timestamp ts.
// The returned string is a human-readable description of the policy's
// condition; the implementations in this file return it regardless of
// the boolean result.
ShouldSeal ( segment * SegmentInfo , ts Timestamp ) ( bool , string )
}
2021-06-08 11:25:37 +00:00
// segmentSealPolicyFunc is a seal policy, expressed as a plain function, that applies to a single segment
2024-07-11 03:40:51 +00:00
type segmentSealPolicyFunc func ( segment * SegmentInfo , ts Timestamp ) ( bool , string )
// ShouldSeal calls f itself with the same arguments, which lets a plain
// function value be used wherever a SegmentSealPolicy is expected.
func ( f segmentSealPolicyFunc ) ShouldSeal ( segment * SegmentInfo , ts Timestamp ) ( bool , string ) {
return f ( segment , ts )
}
2021-06-08 11:25:37 +00:00
2023-11-06 07:26:16 +00:00
// sealL1SegmentByCapacity get segmentSealPolicy with segment size factor policy
2024-07-11 03:40:51 +00:00
func sealL1SegmentByCapacity ( sizeFactor float64 ) segmentSealPolicyFunc {
return func ( segment * SegmentInfo , ts Timestamp ) ( bool , string ) {
2024-07-15 06:47:39 +00:00
jitter := paramtable . Get ( ) . DataCoordCfg . SegmentSealProportionJitter . GetAsFloat ( )
ratio := ( 1 - jitter * rand . Float64 ( ) )
return float64 ( segment . currRows ) >= sizeFactor * float64 ( segment . GetMaxRowNum ( ) ) * ratio ,
fmt . Sprintf ( "Row count capacity full, current rows: %d, max row: %d, seal factor: %f, jitter ratio: %f" , segment . currRows , segment . GetMaxRowNum ( ) , sizeFactor , ratio )
2021-06-08 11:25:37 +00:00
}
}
2023-11-06 07:26:16 +00:00
// sealL1SegmentByLifetimePolicy get segmentSealPolicy with lifetime limit compares ts - segment.lastExpireTime
2024-07-11 03:40:51 +00:00
func sealL1SegmentByLifetime ( lifetime time . Duration ) segmentSealPolicyFunc {
return func ( segment * SegmentInfo , ts Timestamp ) ( bool , string ) {
2021-08-20 07:42:12 +00:00
pts , _ := tsoutil . ParseTS ( ts )
epts , _ := tsoutil . ParseTS ( segment . GetLastExpireTime ( ) )
d := pts . Sub ( epts )
2024-07-11 03:40:51 +00:00
return d >= lifetime ,
fmt . Sprintf ( "Segment Lifetime expired, segment last expire: %v, now:%v, max lifetime %v" ,
pts , epts , lifetime )
2021-06-08 11:25:37 +00:00
}
}
2023-11-06 07:26:16 +00:00
// sealL1SegmentByBinlogFileNumber seal L1 segment if binlog file number of segment exceed configured max number
2024-07-11 03:40:51 +00:00
func sealL1SegmentByBinlogFileNumber ( maxBinlogFileNumber int ) segmentSealPolicyFunc {
return func ( segment * SegmentInfo , ts Timestamp ) ( bool , string ) {
2022-12-20 06:09:25 +00:00
logFileCounter := 0
2023-02-03 09:07:52 +00:00
for _ , fieldBinlog := range segment . GetStatslogs ( ) {
2022-12-20 06:09:25 +00:00
logFileCounter += len ( fieldBinlog . GetBinlogs ( ) )
}
2024-07-11 03:40:51 +00:00
return logFileCounter >= maxBinlogFileNumber ,
fmt . Sprintf ( "Segment binlog number too large, binlog number: %d, max binlog number: %d" , logFileCounter , maxBinlogFileNumber )
2022-12-20 06:09:25 +00:00
}
}
2022-09-20 12:54:50 +00:00
// sealLongTimeIdlePolicy seal segment if the segment has been written with a high frequency before.
// serve for this case:
// If users insert entities into segment continuously within a certain period of time, but they forgot to flush/(seal)
// it and the size of segment didn't reach the seal proportion. Under this situation, Milvus will wait these segments to
// be expired and during this period search latency may be a little high. We can assume that entities won't be inserted
// into this segment anymore, so sealLongTimeIdlePolicy will seal these segments to trigger handoff of query cluster.
// Q: Why we don't decrease the expiry time directly?
// A: We don't want to influence segments which are accepting `frequent small` batch entities.
2024-07-11 03:40:51 +00:00
func sealL1SegmentByIdleTime ( idleTimeTolerance time . Duration , minSizeToSealIdleSegment float64 , maxSizeOfSegment float64 ) segmentSealPolicyFunc {
return func ( segment * SegmentInfo , ts Timestamp ) ( bool , string ) {
2022-09-20 12:54:50 +00:00
limit := ( minSizeToSealIdleSegment / maxSizeOfSegment ) * float64 ( segment . GetMaxRowNum ( ) )
return time . Since ( segment . lastWrittenTime ) > idleTimeTolerance &&
2024-07-11 03:40:51 +00:00
float64 ( segment . currRows ) > limit ,
fmt . Sprintf ( "segment idle, segment row number :%d, last written time: %v, max idle duration: %v" , segment . currRows , segment . lastWrittenTime , idleTimeTolerance )
2022-09-20 12:54:50 +00:00
}
}
2021-08-20 07:42:12 +00:00
// channelSealPolicy seal policy applies to channel
2024-07-17 13:45:41 +00:00
// A channelSealPolicy is called with a channel name, the segments to consider
// and a timestamp. It returns the segments that should be sealed together
// with a human-readable reason; the policies in this file return an empty or
// nil slice and an empty reason when nothing needs sealing.
type channelSealPolicy func ( string , [ ] * SegmentInfo , Timestamp ) ( [ ] * SegmentInfo , string )
2021-08-20 07:42:12 +00:00
2023-06-14 08:20:38 +00:00
// getChannelOpenSegCapacityPolicy get channelSealPolicy with channel segment capacity policy
2021-06-08 11:25:37 +00:00
func getChannelOpenSegCapacityPolicy ( limit int ) channelSealPolicy {
2024-07-17 13:45:41 +00:00
return func ( channel string , segs [ ] * SegmentInfo , ts Timestamp ) ( [ ] * SegmentInfo , string ) {
2021-06-08 11:25:37 +00:00
if len ( segs ) <= limit {
2024-07-17 13:45:41 +00:00
return [ ] * SegmentInfo { } , ""
2021-06-08 11:25:37 +00:00
}
2021-06-24 06:20:10 +00:00
sortSegmentsByLastExpires ( segs )
2021-06-08 11:25:37 +00:00
offLen := len ( segs ) - limit
2021-09-07 05:59:58 +00:00
if offLen > len ( segs ) {
offLen = len ( segs )
}
2024-07-17 13:45:41 +00:00
return segs [ 0 : offLen ] , fmt . Sprintf ( "seal by channel segment capacity, len(segs)=%d, limit=%d" , len ( segs ) , limit )
}
}
// sealByTotalGrowingSegmentsSize seals the largest growing segment
// if the total size of growing segments exceeds the threshold.
func sealByTotalGrowingSegmentsSize ( ) channelSealPolicy {
return func ( channel string , segments [ ] * SegmentInfo , ts Timestamp ) ( [ ] * SegmentInfo , string ) {
growingSegments := lo . Filter ( segments , func ( segment * SegmentInfo , _ int ) bool {
return segment != nil && segment . GetState ( ) == commonpb . SegmentState_Growing
} )
var totalSize int64
sizeMap := lo . SliceToMap ( growingSegments , func ( segment * SegmentInfo ) ( int64 , int64 ) {
size := segment . getSegmentSize ( )
totalSize += size
return segment . GetID ( ) , size
} )
2024-07-19 12:27:41 +00:00
threshold := paramtable . Get ( ) . DataCoordCfg . GrowingSegmentsMemSizeInMB . GetAsInt64 ( ) * 1024 * 1024
2024-07-17 13:45:41 +00:00
if totalSize >= threshold {
target := lo . MaxBy ( growingSegments , func ( s1 , s2 * SegmentInfo ) bool {
return sizeMap [ s1 . GetID ( ) ] > sizeMap [ s2 . GetID ( ) ]
} )
return [ ] * SegmentInfo { target } , fmt . Sprintf ( "seal by total growing segments size, " +
"totalSize=%d, threshold=%d" , totalSize , threshold )
}
return nil , ""
2021-06-08 11:25:37 +00:00
}
}
2023-06-14 08:20:38 +00:00
// sortSegmentsByLastExpires sort segmentStatus with lastExpireTime ascending order
2021-07-12 09:24:25 +00:00
func sortSegmentsByLastExpires ( segs [ ] * SegmentInfo ) {
2021-06-08 11:25:37 +00:00
sort . Slice ( segs , func ( i , j int ) bool {
2021-06-24 06:20:10 +00:00
return segs [ i ] . LastExpireTime < segs [ j ] . LastExpireTime
2021-06-08 11:25:37 +00:00
} )
}
2021-07-12 09:24:25 +00:00
// flushPolicy is the signature of a function that reports, for a segment and
// a timestamp t, a boolean decision; flushPolicyL1 in this file matches this
// signature and returns true when the segment qualifies for flushing.
type flushPolicy func ( segment * SegmentInfo , t Timestamp ) bool
2021-05-21 10:30:41 +00:00
2023-11-06 07:26:16 +00:00
func flushPolicyL1 ( segment * SegmentInfo , t Timestamp ) bool {
2024-02-03 05:01:12 +00:00
return segment . GetState ( ) == commonpb . SegmentState_Sealed &&
segment . Level != datapb . SegmentLevel_L0 &&
2024-07-12 10:59:35 +00:00
time . Since ( segment . lastFlushTime ) >= paramtable . Get ( ) . DataCoordCfg . SegmentFlushInterval . GetAsDuration ( time . Second ) &&
2024-02-03 05:01:12 +00:00
segment . GetLastExpireTime ( ) <= t &&
segment . currRows != 0 &&
// Decoupling the importing segment from the flush process,
// This check avoids notifying the datanode to flush the
// importing segment which may not exist.
! segment . GetIsImporting ( )
2021-05-21 10:30:41 +00:00
}