2021-12-29 03:38:57 +00:00
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2021-11-05 14:25:00 +00:00
package datacoord
import (
"context"
2022-09-25 12:12:52 +00:00
"fmt"
2022-06-15 15:14:10 +00:00
"sort"
2021-11-05 14:25:00 +00:00
"sync"
"time"
2023-01-04 11:37:36 +00:00
"github.com/samber/lo"
"go.uber.org/zap"
2023-06-08 17:28:37 +00:00
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
2023-11-23 09:30:25 +00:00
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
2021-11-05 14:25:00 +00:00
"github.com/milvus-io/milvus/internal/proto/datapb"
2023-04-06 11:14:32 +00:00
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/indexparamcheck"
"github.com/milvus-io/milvus/pkg/util/logutil"
2023-09-21 01:45:27 +00:00
"github.com/milvus-io/milvus/pkg/util/tsoutil"
2021-11-05 14:25:00 +00:00
)
2022-06-15 15:14:10 +00:00
type compactTime struct {
2022-10-10 12:31:22 +00:00
expireTime Timestamp
collectionTTL time . Duration
2021-11-05 14:25:00 +00:00
}
type trigger interface {
start ( )
stop ( )
2021-12-16 01:57:25 +00:00
// triggerCompaction triggers a compaction if any compaction condition satisfy.
2022-10-10 12:31:22 +00:00
triggerCompaction ( ) error
2021-12-16 01:57:25 +00:00
// triggerSingleCompaction triggers a compaction bundled with collection-partition-channel-segment
2022-10-10 12:31:22 +00:00
triggerSingleCompaction ( collectionID , partitionID , segmentID int64 , channel string ) error
2021-11-05 14:25:00 +00:00
// forceTriggerCompaction force to start a compaction
2022-10-10 12:31:22 +00:00
forceTriggerCompaction ( collectionID int64 ) ( UniqueID , error )
2021-11-05 14:25:00 +00:00
}
type compactionSignal struct {
id UniqueID
isForce bool
isGlobal bool
collectionID UniqueID
partitionID UniqueID
channel string
2023-11-23 09:30:25 +00:00
segmentID UniqueID
pos * msgpb . MsgPosition
2021-11-05 14:25:00 +00:00
}
var _ trigger = ( * compactionTrigger ) ( nil )
type compactionTrigger struct {
2023-01-04 11:37:36 +00:00
handler Handler
meta * meta
allocator allocator
signals chan * compactionSignal
compactionHandler compactionPlanContext
globalTrigger * time . Ticker
forceMu sync . Mutex
quit chan struct { }
wg sync . WaitGroup
2023-11-21 01:26:22 +00:00
indexEngineVersionManager IndexEngineVersionManager
2022-12-13 07:39:21 +00:00
estimateNonDiskSegmentPolicy calUpperLimitPolicy
estimateDiskSegmentPolicy calUpperLimitPolicy
// A sloopy hack, so we can test with different segment row count without worrying that
// they are re-calculated in every compaction.
testingOnly bool
2021-11-05 14:25:00 +00:00
}
2022-09-16 03:32:48 +00:00
func newCompactionTrigger (
meta * meta ,
compactionHandler compactionPlanContext ,
allocator allocator ,
2022-10-11 13:39:24 +00:00
handler Handler ,
2023-11-21 01:26:22 +00:00
indexVersionManager IndexEngineVersionManager ,
2022-09-16 03:32:48 +00:00
) * compactionTrigger {
2021-11-05 14:25:00 +00:00
return & compactionTrigger {
2023-11-21 01:26:22 +00:00
meta : meta ,
allocator : allocator ,
signals : make ( chan * compactionSignal , 100 ) ,
compactionHandler : compactionHandler ,
indexEngineVersionManager : indexVersionManager ,
2022-12-13 07:39:21 +00:00
estimateDiskSegmentPolicy : calBySchemaPolicyWithDiskIndex ,
estimateNonDiskSegmentPolicy : calBySchemaPolicy ,
handler : handler ,
2021-11-05 14:25:00 +00:00
}
}
func ( t * compactionTrigger ) start ( ) {
t . quit = make ( chan struct { } )
2022-12-07 10:01:19 +00:00
t . globalTrigger = time . NewTicker ( Params . DataCoordCfg . GlobalCompactionInterval . GetAsDuration ( time . Second ) )
2021-11-05 14:25:00 +00:00
t . wg . Add ( 2 )
go func ( ) {
defer logutil . LogPanic ( )
defer t . wg . Done ( )
for {
select {
case <- t . quit :
2022-03-02 07:35:55 +00:00
log . Info ( "compaction trigger quit" )
2021-11-05 14:25:00 +00:00
return
case signal := <- t . signals :
switch {
case signal . isGlobal :
2023-11-30 09:06:31 +00:00
// ManualCompaction also use use handleGlobalSignal
// so throw err here
err := t . handleGlobalSignal ( signal )
if err != nil {
log . Warn ( "unable to handleGlobalSignal" , zap . Error ( err ) )
}
2021-11-05 14:25:00 +00:00
default :
2023-11-30 09:06:31 +00:00
// no need to handle err in handleSignal
2021-11-05 14:25:00 +00:00
t . handleSignal ( signal )
2022-06-15 15:14:10 +00:00
// shouldn't reset, otherwise a frequent flushed collection will affect other collections
// t.globalTrigger.Reset(Params.DataCoordCfg.GlobalCompactionInterval)
2021-11-05 14:25:00 +00:00
}
}
}
} ( )
go t . startGlobalCompactionLoop ( )
}
func ( t * compactionTrigger ) startGlobalCompactionLoop ( ) {
defer logutil . LogPanic ( )
defer t . wg . Done ( )
2021-12-16 01:59:12 +00:00
// If AutoCompaction disabled, global loop will not start
2022-12-07 10:01:19 +00:00
if ! Params . DataCoordCfg . EnableAutoCompaction . GetAsBool ( ) {
2021-12-08 11:47:05 +00:00
return
}
2021-11-05 14:25:00 +00:00
for {
select {
case <- t . quit :
t . globalTrigger . Stop ( )
log . Info ( "global compaction loop exit" )
return
case <- t . globalTrigger . C :
2022-10-10 12:31:22 +00:00
err := t . triggerCompaction ( )
2022-01-07 09:41:40 +00:00
if err != nil {
log . Warn ( "unable to triggerCompaction" , zap . Error ( err ) )
}
2021-11-05 14:25:00 +00:00
}
}
}
// stop closes the quit channel, which tells both goroutines launched by start
// to exit, and then blocks until they have finished.
func (t *compactionTrigger) stop() {
	close(t.quit)
	t.wg.Wait()
}
2022-10-10 12:31:22 +00:00
// allocTs requests a timestamp from the allocator, waiting at most five
// seconds. On failure it returns 0 together with the allocator's error.
func (t *compactionTrigger) allocTs() (Timestamp, error) {
	const allocTimeout = 5 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), allocTimeout)
	defer cancel()

	allocated, err := t.allocator.allocTimestamp(ctx)
	if err != nil {
		return 0, err
	}
	return allocated, nil
}
2023-05-10 09:45:20 +00:00
func ( t * compactionTrigger ) getCollection ( collectionID UniqueID ) ( * collectionInfo , error ) {
2022-10-11 13:39:24 +00:00
ctx , cancel := context . WithTimeout ( context . Background ( ) , time . Second )
defer cancel ( )
coll , err := t . handler . GetCollection ( ctx , collectionID )
if err != nil {
return nil , fmt . Errorf ( "collection ID %d not found, err: %w" , collectionID , err )
2022-10-10 12:31:22 +00:00
}
2023-05-10 09:45:20 +00:00
return coll , nil
}
// isCollectionAutoCompactionEnabled reports the auto-compaction setting found
// in the collection's properties. If the property cannot be interpreted, the
// problem is logged and the collection is treated as disabled.
func (t *compactionTrigger) isCollectionAutoCompactionEnabled(coll *collectionInfo) bool {
	enabled, err := getCollectionAutoCompactionEnabled(coll.Properties)
	if err == nil {
		return enabled
	}
	log.Warn("collection properties auto compaction not valid, returning false", zap.Error(err))
	return false
}
2022-10-10 12:31:22 +00:00
2023-05-10 09:45:20 +00:00
func ( t * compactionTrigger ) getCompactTime ( ts Timestamp , coll * collectionInfo ) ( * compactTime , error ) {
2022-10-10 12:31:22 +00:00
collectionTTL , err := getCollectionTTL ( coll . Properties )
if err != nil {
return nil , err
}
pts , _ := tsoutil . ParseTS ( ts )
if collectionTTL > 0 {
ttexpired := pts . Add ( - collectionTTL )
ttexpiredLogic := tsoutil . ComposeTS ( ttexpired . UnixNano ( ) / int64 ( time . Millisecond ) , 0 )
2023-09-04 09:41:48 +00:00
return & compactTime { ttexpiredLogic , collectionTTL } , nil
2022-10-10 12:31:22 +00:00
}
// no expiration time
2023-09-04 09:41:48 +00:00
return & compactTime { 0 , 0 } , nil
2022-10-10 12:31:22 +00:00
}
2021-11-05 14:25:00 +00:00
// triggerCompaction trigger a compaction if any compaction condition satisfy.
2022-10-10 12:31:22 +00:00
func ( t * compactionTrigger ) triggerCompaction ( ) error {
2021-11-05 14:25:00 +00:00
id , err := t . allocSignalID ( )
if err != nil {
return err
}
signal := & compactionSignal {
2022-10-10 12:31:22 +00:00
id : id ,
isForce : false ,
isGlobal : true ,
2021-11-05 14:25:00 +00:00
}
t . signals <- signal
return nil
}
2023-03-03 06:13:49 +00:00
// triggerSingleCompaction triger a compaction bundled with collection-partition-channel-segment
2022-10-10 12:31:22 +00:00
func ( t * compactionTrigger ) triggerSingleCompaction ( collectionID , partitionID , segmentID int64 , channel string ) error {
2023-03-03 06:13:49 +00:00
// If AutoCompaction disabled, flush request will not trigger compaction
2022-12-07 10:01:19 +00:00
if ! Params . DataCoordCfg . EnableAutoCompaction . GetAsBool ( ) {
2021-12-08 11:47:05 +00:00
return nil
}
2021-11-05 14:25:00 +00:00
id , err := t . allocSignalID ( )
if err != nil {
return err
}
signal := & compactionSignal {
id : id ,
isForce : false ,
isGlobal : false ,
collectionID : collectionID ,
partitionID : partitionID ,
segmentID : segmentID ,
channel : channel ,
}
t . signals <- signal
return nil
}
// forceTriggerCompaction force to start a compaction
2022-02-18 06:51:49 +00:00
// invoked by user `ManualCompaction` operation
2022-10-10 12:31:22 +00:00
func ( t * compactionTrigger ) forceTriggerCompaction ( collectionID int64 ) ( UniqueID , error ) {
2021-11-05 14:25:00 +00:00
id , err := t . allocSignalID ( )
if err != nil {
return - 1 , err
}
signal := & compactionSignal {
id : id ,
isForce : true ,
2022-02-18 06:51:49 +00:00
isGlobal : true ,
2021-11-05 14:25:00 +00:00
collectionID : collectionID ,
}
2023-11-30 09:06:31 +00:00
err = t . handleGlobalSignal ( signal )
if err != nil {
log . Warn ( "unable to handleGlobalSignal" , zap . Error ( err ) )
return - 1 , err
}
2021-11-05 14:25:00 +00:00
return id , nil
}
// allocSignalID obtains a fresh ID for a compaction signal from the allocator,
// waiting at most five seconds.
func (t *compactionTrigger) allocSignalID() (UniqueID, error) {
	const allocTimeout = 5 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), allocTimeout)
	defer cancel()

	signalID, err := t.allocator.allocID(ctx)
	return signalID, err
}
2022-12-13 07:39:21 +00:00
func ( t * compactionTrigger ) reCalcSegmentMaxNumOfRows ( collectionID UniqueID , isDisk bool ) ( int , error ) {
2022-10-11 13:39:24 +00:00
ctx , cancel := context . WithTimeout ( context . Background ( ) , time . Second )
defer cancel ( )
collMeta , err := t . handler . GetCollection ( ctx , collectionID )
if err != nil {
2022-09-25 12:12:52 +00:00
return - 1 , fmt . Errorf ( "failed to get collection %d" , collectionID )
}
2022-12-13 07:39:21 +00:00
if isDisk {
return t . estimateDiskSegmentPolicy ( collMeta . Schema )
}
return t . estimateNonDiskSegmentPolicy ( collMeta . Schema )
2022-09-25 12:12:52 +00:00
}
2022-12-13 07:39:21 +00:00
// TODO: Update segment info should be written back to Etcd.
2022-12-12 02:33:26 +00:00
func ( t * compactionTrigger ) updateSegmentMaxSize ( segments [ ] * SegmentInfo ) ( bool , error ) {
2022-09-25 12:12:52 +00:00
if len ( segments ) == 0 {
2022-12-12 02:33:26 +00:00
return false , nil
2022-09-25 12:12:52 +00:00
}
collectionID := segments [ 0 ] . GetCollectionID ( )
2023-01-04 11:37:36 +00:00
indexInfos := t . meta . GetIndexesForCollection ( segments [ 0 ] . GetCollectionID ( ) , "" )
2022-09-25 12:12:52 +00:00
2022-12-13 07:39:21 +00:00
isDiskANN := false
2023-01-04 11:37:36 +00:00
for _ , indexInfo := range indexInfos {
indexType := getIndexType ( indexInfo . IndexParams )
if indexType == indexparamcheck . IndexDISKANN {
// If index type is DiskANN, recalc segment max size here.
isDiskANN = true
newMaxRows , err := t . reCalcSegmentMaxNumOfRows ( collectionID , true )
if err != nil {
return false , err
}
if len ( segments ) > 0 && int64 ( newMaxRows ) != segments [ 0 ] . GetMaxRowNum ( ) {
log . Info ( "segment max rows recalculated for DiskANN collection" ,
zap . Int64 ( "old max rows" , segments [ 0 ] . GetMaxRowNum ( ) ) ,
zap . Int64 ( "new max rows" , int64 ( newMaxRows ) ) )
for _ , segment := range segments {
segment . MaxRowNum = int64 ( newMaxRows )
2022-09-25 12:12:52 +00:00
}
}
}
}
2022-12-13 07:39:21 +00:00
// If index type is not DiskANN, recalc segment max size using default policy.
if ! isDiskANN && ! t . testingOnly {
newMaxRows , err := t . reCalcSegmentMaxNumOfRows ( collectionID , false )
if err != nil {
return isDiskANN , err
}
if len ( segments ) > 0 && int64 ( newMaxRows ) != segments [ 0 ] . GetMaxRowNum ( ) {
log . Info ( "segment max rows recalculated for non-DiskANN collection" ,
zap . Int64 ( "old max rows" , segments [ 0 ] . GetMaxRowNum ( ) ) ,
zap . Int64 ( "new max rows" , int64 ( newMaxRows ) ) )
for _ , segment := range segments {
segment . MaxRowNum = int64 ( newMaxRows )
}
}
}
return isDiskANN , nil
2022-09-25 12:12:52 +00:00
}
2023-11-30 09:06:31 +00:00
func ( t * compactionTrigger ) handleGlobalSignal ( signal * compactionSignal ) error {
2021-11-05 14:25:00 +00:00
t . forceMu . Lock ( )
defer t . forceMu . Unlock ( )
2023-11-30 09:06:31 +00:00
log := log . With ( zap . Int64 ( "compactionID" , signal . id ) ,
zap . Int64 ( "signal.collectionID" , signal . collectionID ) ,
zap . Int64 ( "signal.partitionID" , signal . partitionID ) ,
zap . Int64 ( "signal.segmentID" , signal . segmentID ) )
2022-02-18 06:51:49 +00:00
m := t . meta . GetSegmentsChanPart ( func ( segment * SegmentInfo ) bool {
return ( signal . collectionID == 0 || segment . CollectionID == signal . collectionID ) &&
isSegmentHealthy ( segment ) &&
isFlush ( segment ) &&
2022-09-26 10:06:54 +00:00
! segment . isCompacting && // not compacting now
2023-12-05 10:44:37 +00:00
! segment . GetIsImporting ( ) && // not importing now
segment . GetLevel ( ) != datapb . SegmentLevel_L0 // ignore level zero segments
2022-02-18 06:51:49 +00:00
} ) // m is list of chanPartSegments, which is channel-partition organized segments
2022-09-22 07:48:50 +00:00
2022-10-10 12:31:22 +00:00
if len ( m ) == 0 {
2023-11-30 09:06:31 +00:00
log . Info ( "the length of SegmentsChanPart is 0, skip to handle compaction" )
return nil
2022-10-10 12:31:22 +00:00
}
ts , err := t . allocTs ( )
if err != nil {
2023-11-30 09:06:31 +00:00
log . Warn ( "allocate ts failed, skip to handle compaction" )
return err
2022-10-10 12:31:22 +00:00
}
2022-02-18 06:51:49 +00:00
for _ , group := range m {
2023-11-30 09:06:31 +00:00
log := log . With ( zap . Int64 ( "collectionID" , group . collectionID ) ,
zap . Int64 ( "partitionID" , group . partitionID ) ,
zap . String ( "channel" , group . channelName ) )
2022-02-18 06:51:49 +00:00
if ! signal . isForce && t . compactionHandler . isFull ( ) {
2023-11-30 09:06:31 +00:00
log . Warn ( "compaction plan skipped due to handler full" )
2022-02-18 06:51:49 +00:00
break
}
2023-07-23 13:31:00 +00:00
if Params . DataCoordCfg . IndexBasedCompaction . GetAsBool ( ) {
group . segments = FilterInIndexedSegments ( t . handler , t . meta , group . segments ... )
}
2022-09-25 12:12:52 +00:00
2022-12-12 02:33:26 +00:00
isDiskIndex , err := t . updateSegmentMaxSize ( group . segments )
2022-09-25 12:12:52 +00:00
if err != nil {
2022-12-13 07:39:21 +00:00
log . Warn ( "failed to update segment max size" , zap . Error ( err ) )
2022-09-25 12:12:52 +00:00
continue
}
2023-05-10 09:45:20 +00:00
coll , err := t . getCollection ( group . collectionID )
if err != nil {
2023-11-30 09:06:31 +00:00
log . Warn ( "get collection info failed, skip handling compaction" , zap . Error ( err ) )
return err
2023-05-10 09:45:20 +00:00
}
if ! signal . isForce && ! t . isCollectionAutoCompactionEnabled ( coll ) {
log . RatedInfo ( 20 , "collection auto compaction disabled" ,
zap . Int64 ( "collectionID" , group . collectionID ) ,
)
2023-11-30 09:06:31 +00:00
return nil
2023-05-10 09:45:20 +00:00
}
ct , err := t . getCompactTime ( ts , coll )
2022-10-10 12:31:22 +00:00
if err != nil {
log . Warn ( "get compact time failed, skip to handle compaction" ,
zap . Int64 ( "collectionID" , group . collectionID ) ,
zap . Int64 ( "partitionID" , group . partitionID ) ,
zap . String ( "channel" , group . channelName ) )
2023-11-30 09:06:31 +00:00
return err
2022-10-10 12:31:22 +00:00
}
2022-12-12 02:33:26 +00:00
plans := t . generatePlans ( group . segments , signal . isForce , isDiskIndex , ct )
2022-02-18 06:51:49 +00:00
for _ , plan := range plans {
2022-11-15 03:13:07 +00:00
segIDs := fetchSegIDs ( plan . GetSegmentBinlogs ( ) )
2022-02-18 06:51:49 +00:00
if ! signal . isForce && t . compactionHandler . isFull ( ) {
2022-11-15 03:13:07 +00:00
log . Warn ( "compaction plan skipped due to handler full" ,
2023-05-12 01:53:21 +00:00
zap . Int64 ( "collectionID" , signal . collectionID ) ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "segmentIDs" , segIDs ) )
2022-02-18 06:51:49 +00:00
break
}
start := time . Now ( )
2023-11-23 09:30:25 +00:00
if err := fillOriginPlan ( t . allocator , plan ) ; err != nil {
2022-11-15 03:13:07 +00:00
log . Warn ( "failed to fill plan" ,
2023-05-12 01:53:21 +00:00
zap . Int64 ( "collectionID" , signal . collectionID ) ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "segmentIDs" , segIDs ) ,
2022-11-15 03:13:07 +00:00
zap . Error ( err ) )
2022-02-18 06:51:49 +00:00
continue
}
2022-06-22 11:00:14 +00:00
err := t . compactionHandler . execCompactionPlan ( signal , plan )
if err != nil {
2022-11-15 03:13:07 +00:00
log . Warn ( "failed to execute compaction plan" ,
2023-05-12 01:53:21 +00:00
zap . Int64 ( "collectionID" , signal . collectionID ) ,
2022-11-15 03:13:07 +00:00
zap . Int64 ( "planID" , plan . PlanID ) ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "segmentIDs" , segIDs ) ,
2022-11-15 03:13:07 +00:00
zap . Error ( err ) )
2022-06-22 11:00:14 +00:00
continue
}
2021-11-05 14:25:00 +00:00
2022-11-15 03:13:07 +00:00
segIDMap := make ( map [ int64 ] [ ] * datapb . FieldBinlog , len ( plan . SegmentBinlogs ) )
2022-07-28 06:52:31 +00:00
for _ , seg := range plan . SegmentBinlogs {
2022-11-15 03:13:07 +00:00
segIDMap [ seg . SegmentID ] = seg . Deltalogs
2022-07-28 06:52:31 +00:00
}
2022-11-15 03:13:07 +00:00
log . Info ( "time cost of generating global compaction" ,
zap . Any ( "segID2DeltaLogs" , segIDMap ) ,
zap . Int64 ( "planID" , plan . PlanID ) ,
2023-07-14 07:56:31 +00:00
zap . Int64 ( "time cost" , time . Since ( start ) . Milliseconds ( ) ) ,
2022-11-15 03:13:07 +00:00
zap . Int64 ( "collectionID" , signal . collectionID ) ,
zap . String ( "channel" , group . channelName ) ,
zap . Int64 ( "partitionID" , group . partitionID ) ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "segmentIDs" , segIDs ) )
2022-02-18 06:51:49 +00:00
}
2021-12-29 02:06:47 +00:00
}
2023-11-30 09:06:31 +00:00
return nil
2021-11-05 14:25:00 +00:00
}
2022-02-18 06:51:49 +00:00
// handleSignal processes segment flush caused partition-chan level compaction signal
2021-11-05 14:25:00 +00:00
func ( t * compactionTrigger ) handleSignal ( signal * compactionSignal ) {
t . forceMu . Lock ( )
defer t . forceMu . Unlock ( )
// 1. check whether segment's binlogs should be compacted or not
if t . compactionHandler . isFull ( ) {
2023-11-30 09:06:31 +00:00
log . Warn ( "compaction plan skipped due to handler full" )
2021-11-05 14:25:00 +00:00
return
}
2023-03-03 06:13:49 +00:00
segment := t . meta . GetHealthySegment ( signal . segmentID )
2021-12-22 13:23:10 +00:00
if segment == nil {
log . Warn ( "segment in compaction signal not found in meta" , zap . Int64 ( "segmentID" , signal . segmentID ) )
return
}
2021-11-05 14:25:00 +00:00
channel := segment . GetInsertChannel ( )
partitionID := segment . GetPartitionID ( )
2023-05-10 09:45:20 +00:00
collectionID := segment . GetCollectionID ( )
2021-11-05 14:25:00 +00:00
segments := t . getCandidateSegments ( channel , partitionID )
2022-09-25 12:12:52 +00:00
2022-10-10 12:31:22 +00:00
if len ( segments ) == 0 {
2023-11-30 09:06:31 +00:00
log . Info ( "the length of segments is 0, skip to handle compaction" )
2022-10-10 12:31:22 +00:00
return
}
2022-12-12 02:33:26 +00:00
isDiskIndex , err := t . updateSegmentMaxSize ( segments )
2022-09-25 12:12:52 +00:00
if err != nil {
log . Warn ( "failed to update segment max size" , zap . Error ( err ) )
2023-01-04 08:37:35 +00:00
return
2022-09-25 12:12:52 +00:00
}
2022-10-10 12:31:22 +00:00
ts , err := t . allocTs ( )
if err != nil {
log . Warn ( "allocate ts failed, skip to handle compaction" , zap . Int64 ( "collectionID" , signal . collectionID ) ,
zap . Int64 ( "partitionID" , signal . partitionID ) , zap . Int64 ( "segmentID" , signal . segmentID ) )
return
}
2023-05-10 09:45:20 +00:00
coll , err := t . getCollection ( collectionID )
if err != nil {
log . Warn ( "get collection info failed, skip handling compaction" ,
zap . Int64 ( "collectionID" , collectionID ) ,
zap . Int64 ( "partitionID" , partitionID ) ,
zap . String ( "channel" , channel ) ,
zap . Error ( err ) ,
)
return
}
if ! signal . isForce && ! t . isCollectionAutoCompactionEnabled ( coll ) {
log . RatedInfo ( 20 , "collection auto compaction disabled" ,
zap . Int64 ( "collectionID" , collectionID ) ,
)
return
}
ct , err := t . getCompactTime ( ts , coll )
2022-10-10 12:31:22 +00:00
if err != nil {
log . Warn ( "get compact time failed, skip to handle compaction" , zap . Int64 ( "collectionID" , segment . GetCollectionID ( ) ) ,
zap . Int64 ( "partitionID" , partitionID ) , zap . String ( "channel" , channel ) )
return
}
2022-12-12 02:33:26 +00:00
plans := t . generatePlans ( segments , signal . isForce , isDiskIndex , ct )
2022-02-18 06:51:49 +00:00
for _ , plan := range plans {
if t . compactionHandler . isFull ( ) {
log . Warn ( "compaction plan skipped due to handler full" , zap . Int64 ( "collection" , signal . collectionID ) , zap . Int64 ( "planID" , plan . PlanID ) )
break
}
start := time . Now ( )
2023-11-23 09:30:25 +00:00
if err := fillOriginPlan ( t . allocator , plan ) ; err != nil {
2022-02-18 06:51:49 +00:00
log . Warn ( "failed to fill plan" , zap . Error ( err ) )
continue
}
2022-11-15 03:13:07 +00:00
if err := t . compactionHandler . execCompactionPlan ( signal , plan ) ; err != nil {
log . Warn ( "failed to execute compaction plan" ,
zap . Int64 ( "collection" , signal . collectionID ) ,
zap . Int64 ( "planID" , plan . PlanID ) ,
zap . Int64s ( "segment IDs" , fetchSegIDs ( plan . GetSegmentBinlogs ( ) ) ) ,
zap . Error ( err ) )
continue
}
log . Info ( "time cost of generating compaction" ,
zap . Int64 ( "plan ID" , plan . PlanID ) ,
zap . Any ( "time cost" , time . Since ( start ) . Milliseconds ( ) ) ,
2023-07-14 07:56:31 +00:00
zap . Int64 ( "collectionID" , signal . collectionID ) ,
2022-11-15 03:13:07 +00:00
zap . String ( "channel" , channel ) ,
2023-07-14 07:56:31 +00:00
zap . Int64 ( "partitionID" , partitionID ) ,
2022-11-15 03:13:07 +00:00
zap . Int64s ( "segment IDs" , fetchSegIDs ( plan . GetSegmentBinlogs ( ) ) ) )
2021-11-05 14:25:00 +00:00
}
}
2022-12-12 02:33:26 +00:00
func ( t * compactionTrigger ) generatePlans ( segments [ ] * SegmentInfo , force bool , isDiskIndex bool , compactTime * compactTime ) [ ] * datapb . CompactionPlan {
2022-02-18 06:51:49 +00:00
// find segments need internal compaction
2022-06-15 15:14:10 +00:00
// TODO add low priority candidates, for example if the segment is smaller than full 0.9 * max segment size but larger than small segment boundary, we only execute compaction when there are no compaction running actively
var prioritizedCandidates [ ] * SegmentInfo
var smallCandidates [ ] * SegmentInfo
2022-12-09 08:03:20 +00:00
var nonPlannedSegments [ ] * SegmentInfo
2022-06-15 15:14:10 +00:00
// TODO, currently we lack of the measurement of data distribution, there should be another compaction help on redistributing segment based on scalar/vector field distribution
2022-02-18 06:51:49 +00:00
for _ , segment := range segments {
segment := segment . ShadowClone ( )
2022-06-15 15:14:10 +00:00
// TODO should we trigger compaction periodically even if the segment has no obvious reason to be compacted?
2022-12-12 02:33:26 +00:00
if force || t . ShouldDoSingleCompaction ( segment , isDiskIndex , compactTime ) {
2022-06-15 15:14:10 +00:00
prioritizedCandidates = append ( prioritizedCandidates , segment )
} else if t . isSmallSegment ( segment ) {
smallCandidates = append ( smallCandidates , segment )
2022-12-09 08:03:20 +00:00
} else {
nonPlannedSegments = append ( nonPlannedSegments , segment )
2022-02-18 06:51:49 +00:00
}
}
2023-09-04 09:41:48 +00:00
2022-02-18 06:51:49 +00:00
var plans [ ] * datapb . CompactionPlan
2022-06-15 15:14:10 +00:00
// sort segment from large to small
sort . Slice ( prioritizedCandidates , func ( i , j int ) bool {
2022-06-22 11:00:14 +00:00
if prioritizedCandidates [ i ] . GetNumOfRows ( ) != prioritizedCandidates [ j ] . GetNumOfRows ( ) {
return prioritizedCandidates [ i ] . GetNumOfRows ( ) > prioritizedCandidates [ j ] . GetNumOfRows ( )
2022-06-15 15:14:10 +00:00
}
return prioritizedCandidates [ i ] . GetID ( ) < prioritizedCandidates [ j ] . GetID ( )
} )
2022-02-18 06:51:49 +00:00
2022-06-15 15:14:10 +00:00
sort . Slice ( smallCandidates , func ( i , j int ) bool {
2022-06-22 11:00:14 +00:00
if smallCandidates [ i ] . GetNumOfRows ( ) != smallCandidates [ j ] . GetNumOfRows ( ) {
return smallCandidates [ i ] . GetNumOfRows ( ) > smallCandidates [ j ] . GetNumOfRows ( )
2022-06-15 15:14:10 +00:00
}
return smallCandidates [ i ] . GetID ( ) < smallCandidates [ j ] . GetID ( )
} )
2022-12-09 08:03:20 +00:00
// Sort non-planned from small to large.
sort . Slice ( nonPlannedSegments , func ( i , j int ) bool {
if nonPlannedSegments [ i ] . GetNumOfRows ( ) != nonPlannedSegments [ j ] . GetNumOfRows ( ) {
return nonPlannedSegments [ i ] . GetNumOfRows ( ) < nonPlannedSegments [ j ] . GetNumOfRows ( )
}
return nonPlannedSegments [ i ] . GetID ( ) > nonPlannedSegments [ j ] . GetID ( )
} )
getSegmentIDs := func ( segment * SegmentInfo , _ int ) int64 {
return segment . GetID ( )
}
2022-06-15 15:14:10 +00:00
// greedy pick from large segment to small, the goal is to fill each segment to reach 512M
// we must ensure all prioritized candidates is in a plan
2023-09-21 01:45:27 +00:00
// TODO the compaction selection policy should consider if compaction workload is high
2022-06-15 15:14:10 +00:00
for len ( prioritizedCandidates ) > 0 {
2022-02-18 06:51:49 +00:00
var bucket [ ] * SegmentInfo
2022-06-15 15:14:10 +00:00
// pop out the first element
segment := prioritizedCandidates [ 0 ]
2022-02-18 06:51:49 +00:00
bucket = append ( bucket , segment )
2022-06-15 15:14:10 +00:00
prioritizedCandidates = prioritizedCandidates [ 1 : ]
// only do single file compaction if segment is already large enough
2022-06-22 11:00:14 +00:00
if segment . GetNumOfRows ( ) < segment . GetMaxRowNum ( ) {
2022-06-15 15:14:10 +00:00
var result [ ] * SegmentInfo
2022-06-22 11:00:14 +00:00
free := segment . GetMaxRowNum ( ) - segment . GetNumOfRows ( )
2022-12-07 10:01:19 +00:00
maxNum := Params . DataCoordCfg . MaxSegmentToMerge . GetAsInt ( ) - 1
2022-06-15 15:14:10 +00:00
prioritizedCandidates , result , free = greedySelect ( prioritizedCandidates , free , maxNum )
bucket = append ( bucket , result ... )
maxNum -= len ( result )
if maxNum > 0 {
smallCandidates , result , _ = greedySelect ( smallCandidates , free , maxNum )
bucket = append ( bucket , result ... )
}
}
// since this is priority compaction, we will execute even if there is only segment
2022-06-22 11:00:14 +00:00
plan := segmentsToPlan ( bucket , compactTime )
var size int64
var row int64
for _ , s := range bucket {
size += s . getSegmentSize ( )
row += s . GetNumOfRows ( )
}
log . Info ( "generate a plan for priority candidates" , zap . Any ( "plan" , plan ) ,
zap . Int64 ( "target segment row" , row ) , zap . Int64 ( "target segment size" , size ) )
plans = append ( plans , plan )
2022-02-18 06:51:49 +00:00
}
2022-12-09 08:03:20 +00:00
getSegIDsFromPlan := func ( plan * datapb . CompactionPlan ) [ ] int64 {
var segmentIDs [ ] int64
for _ , binLog := range plan . GetSegmentBinlogs ( ) {
segmentIDs = append ( segmentIDs , binLog . GetSegmentID ( ) )
}
return segmentIDs
}
var remainingSmallSegs [ ] * SegmentInfo
2022-06-15 15:14:10 +00:00
// check if there are small candidates left can be merged into large segments
for len ( smallCandidates ) > 0 {
var bucket [ ] * SegmentInfo
// pop out the first element
segment := smallCandidates [ 0 ]
bucket = append ( bucket , segment )
smallCandidates = smallCandidates [ 1 : ]
var result [ ] * SegmentInfo
2022-06-22 11:00:14 +00:00
free := segment . GetMaxRowNum ( ) - segment . GetNumOfRows ( )
2022-06-15 15:14:10 +00:00
// for small segment merge, we pick one largest segment and merge as much as small segment together with it
// Why reverse? try to merge as many segments as expected.
// for instance, if a 255M and 255M is the largest small candidates, they will never be merged because of the MinSegmentToMerge limit.
2022-12-07 10:01:19 +00:00
smallCandidates , result , _ = reverseGreedySelect ( smallCandidates , free , Params . DataCoordCfg . MaxSegmentToMerge . GetAsInt ( ) - 1 )
2022-06-15 15:14:10 +00:00
bucket = append ( bucket , result ... )
2022-02-18 06:51:49 +00:00
2022-06-22 11:00:14 +00:00
var size int64
var targetRow int64
for _ , s := range bucket {
size += s . getSegmentSize ( )
targetRow += s . GetNumOfRows ( )
}
// only merge if candidate number is large than MinSegmentToMerge or if target row is large enough
2022-12-09 08:03:20 +00:00
if len ( bucket ) >= Params . DataCoordCfg . MinSegmentToMerge . GetAsInt ( ) ||
2023-07-26 06:49:01 +00:00
len ( bucket ) > 1 && t . isCompactableSegment ( targetRow , segment ) {
2022-06-22 11:00:14 +00:00
plan := segmentsToPlan ( bucket , compactTime )
2022-12-09 08:03:20 +00:00
log . Info ( "generate a plan for small candidates" ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "plan segmentIDs" , lo . Map ( bucket , getSegmentIDs ) ) ,
2022-12-09 08:03:20 +00:00
zap . Int64 ( "target segment row" , targetRow ) ,
zap . Int64 ( "target segment size" , size ) )
2022-06-22 11:00:14 +00:00
plans = append ( plans , plan )
2022-12-09 08:03:20 +00:00
} else {
remainingSmallSegs = append ( remainingSmallSegs , bucket ... )
}
}
// Try adding remaining segments to existing plans.
for i := len ( remainingSmallSegs ) - 1 ; i >= 0 ; i -- {
s := remainingSmallSegs [ i ]
if ! isExpandableSmallSegment ( s ) {
continue
}
// Try squeeze this segment into existing plans. This could cause segment size to exceed maxSize.
for _ , plan := range plans {
if plan . TotalRows + s . GetNumOfRows ( ) <= int64 ( Params . DataCoordCfg . SegmentExpansionRate . GetAsFloat ( ) * float64 ( s . GetMaxRowNum ( ) ) ) {
segmentBinLogs := & datapb . CompactionSegmentBinlogs {
SegmentID : s . GetID ( ) ,
FieldBinlogs : s . GetBinlogs ( ) ,
Field2StatslogPaths : s . GetStatslogs ( ) ,
Deltalogs : s . GetDeltalogs ( ) ,
}
plan . TotalRows += s . GetNumOfRows ( )
plan . SegmentBinlogs = append ( plan . SegmentBinlogs , segmentBinLogs )
log . Info ( "small segment appended on existing plan" ,
2023-07-14 07:56:31 +00:00
zap . Int64 ( "segmentID" , s . GetID ( ) ) ,
2022-12-09 08:03:20 +00:00
zap . Int64 ( "target rows" , plan . GetTotalRows ( ) ) ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "plan segmentID" , getSegIDsFromPlan ( plan ) ) ,
2022-12-09 08:03:20 +00:00
)
remainingSmallSegs = append ( remainingSmallSegs [ : i ] , remainingSmallSegs [ i + 1 : ] ... )
break
}
}
}
// If there are still remaining small segments, try adding them to non-planned segments.
for _ , npSeg := range nonPlannedSegments {
bucket := [ ] * SegmentInfo { npSeg }
targetRow := npSeg . GetNumOfRows ( )
for i := len ( remainingSmallSegs ) - 1 ; i >= 0 ; i -- {
// Note: could also simply use MaxRowNum as limit.
if targetRow + remainingSmallSegs [ i ] . GetNumOfRows ( ) <=
int64 ( Params . DataCoordCfg . SegmentExpansionRate . GetAsFloat ( ) * float64 ( npSeg . GetMaxRowNum ( ) ) ) {
bucket = append ( bucket , remainingSmallSegs [ i ] )
targetRow += remainingSmallSegs [ i ] . GetNumOfRows ( )
remainingSmallSegs = append ( remainingSmallSegs [ : i ] , remainingSmallSegs [ i + 1 : ] ... )
}
}
if len ( bucket ) > 1 {
plan := segmentsToPlan ( bucket , compactTime )
plans = append ( plans , plan )
log . Info ( "generate a plan for to squeeze small candidates into non-planned segment" ,
2023-07-14 07:56:31 +00:00
zap . Int64s ( "plan segmentIDs" , lo . Map ( bucket , getSegmentIDs ) ) ,
2022-12-09 08:03:20 +00:00
zap . Int64 ( "target segment row" , targetRow ) ,
)
2022-06-15 15:14:10 +00:00
}
2022-02-18 06:51:49 +00:00
}
return plans
}
2022-06-15 15:14:10 +00:00
func segmentsToPlan ( segments [ ] * SegmentInfo , compactTime * compactTime ) * datapb . CompactionPlan {
2022-02-18 06:51:49 +00:00
plan := & datapb . CompactionPlan {
2022-10-10 12:31:22 +00:00
Type : datapb . CompactionType_MixCompaction ,
Channel : segments [ 0 ] . GetInsertChannel ( ) ,
CollectionTtl : compactTime . collectionTTL . Nanoseconds ( ) ,
2022-02-18 06:51:49 +00:00
}
for _ , s := range segments {
segmentBinlogs := & datapb . CompactionSegmentBinlogs {
SegmentID : s . GetID ( ) ,
FieldBinlogs : s . GetBinlogs ( ) ,
Field2StatslogPaths : s . GetStatslogs ( ) ,
Deltalogs : s . GetDeltalogs ( ) ,
}
2022-12-09 08:03:20 +00:00
plan . TotalRows += s . GetNumOfRows ( )
2022-02-18 06:51:49 +00:00
plan . SegmentBinlogs = append ( plan . SegmentBinlogs , segmentBinlogs )
}
return plan
}
2022-06-15 15:14:10 +00:00
func greedySelect ( candidates [ ] * SegmentInfo , free int64 , maxSegment int ) ( [ ] * SegmentInfo , [ ] * SegmentInfo , int64 ) {
var result [ ] * SegmentInfo
for i := 0 ; i < len ( candidates ) ; {
candidate := candidates [ i ]
2022-06-22 11:00:14 +00:00
if len ( result ) < maxSegment && candidate . GetNumOfRows ( ) < free {
2022-06-15 15:14:10 +00:00
result = append ( result , candidate )
2022-06-22 11:00:14 +00:00
free -= candidate . GetNumOfRows ( )
2022-06-15 15:14:10 +00:00
candidates = append ( candidates [ : i ] , candidates [ i + 1 : ] ... )
} else {
i ++
}
}
return candidates , result , free
}
func reverseGreedySelect ( candidates [ ] * SegmentInfo , free int64 , maxSegment int ) ( [ ] * SegmentInfo , [ ] * SegmentInfo , int64 ) {
2022-02-18 06:51:49 +00:00
var result [ ] * SegmentInfo
2022-06-15 15:14:10 +00:00
for i := len ( candidates ) - 1 ; i >= 0 ; i -- {
candidate := candidates [ i ]
2022-06-22 11:00:14 +00:00
if ( len ( result ) < maxSegment ) && ( candidate . GetNumOfRows ( ) < free ) {
2022-06-15 15:14:10 +00:00
result = append ( result , candidate )
2022-06-22 11:00:14 +00:00
free -= candidate . GetNumOfRows ( )
2022-06-15 15:14:10 +00:00
candidates = append ( candidates [ : i ] , candidates [ i + 1 : ] ... )
}
2022-02-18 06:51:49 +00:00
}
2022-06-15 15:14:10 +00:00
return candidates , result , free
2021-11-05 14:25:00 +00:00
}
func ( t * compactionTrigger ) getCandidateSegments ( channel string , partitionID UniqueID ) [ ] * SegmentInfo {
segments := t . meta . GetSegmentsByChannel ( channel )
2023-07-23 13:31:00 +00:00
if Params . DataCoordCfg . IndexBasedCompaction . GetAsBool ( ) {
segments = FilterInIndexedSegments ( t . handler , t . meta , segments ... )
}
2022-02-18 06:51:49 +00:00
var res [ ] * SegmentInfo
2021-11-05 14:25:00 +00:00
for _ , s := range segments {
2022-09-22 10:34:50 +00:00
if ! isSegmentHealthy ( s ) ||
! isFlush ( s ) ||
s . GetInsertChannel ( ) != channel ||
s . GetPartitionID ( ) != partitionID ||
2022-09-26 10:06:54 +00:00
s . isCompacting ||
2023-12-05 10:44:37 +00:00
s . GetIsImporting ( ) ||
s . GetLevel ( ) == datapb . SegmentLevel_L0 {
2021-11-05 14:25:00 +00:00
continue
}
res = append ( res , s )
}
2022-09-16 03:32:48 +00:00
2021-11-05 14:25:00 +00:00
return res
}
2022-02-18 06:51:49 +00:00
func ( t * compactionTrigger ) isSmallSegment ( segment * SegmentInfo ) bool {
2022-12-07 10:01:19 +00:00
return segment . GetNumOfRows ( ) < int64 ( float64 ( segment . GetMaxRowNum ( ) ) * Params . DataCoordCfg . SegmentSmallProportion . GetAsFloat ( ) )
2021-11-05 14:25:00 +00:00
}
2023-07-26 06:49:01 +00:00
// isCompactableSegment reports whether targetRow exceeds the compactable
// proportion of the segment's maximum row number. The compactable proportion
// is never allowed to drop below the small proportion, which avoids invalid
// single segment compaction.
func (t *compactionTrigger) isCompactableSegment(targetRow int64, segment *SegmentInfo) bool {
	lowerBound := Params.DataCoordCfg.SegmentSmallProportion.GetAsFloat()
	proportion := Params.DataCoordCfg.SegmentCompactableProportion.GetAsFloat()
	if proportion < lowerBound {
		proportion = lowerBound
	}
	minRows := int64(float64(segment.GetMaxRowNum()) * proportion)
	return targetRow > minRows
}
2022-12-09 08:03:20 +00:00
// isExpandableSmallSegment reports whether the segment's row count is below
// (expansion rate - 1) times its maximum row number.
func isExpandableSmallSegment(segment *SegmentInfo) bool {
	rows := segment.GetNumOfRows()
	headroom := int64(float64(segment.GetMaxRowNum()) * (Params.DataCoordCfg.SegmentExpansionRate.GetAsFloat() - 1))
	return rows < headroom
}
2022-10-31 11:13:34 +00:00
// isStaleSegment reports whether at least segmentTimedFlushDuration minutes
// have passed since the segment's last flush time.
func (t *compactionTrigger) isStaleSegment(segment *SegmentInfo) bool {
	sinceFlush := time.Since(segment.lastFlushTime)
	return sinceFlush.Minutes() >= segmentTimedFlushDuration
}
2022-12-12 02:33:26 +00:00
func ( t * compactionTrigger ) ShouldDoSingleCompaction ( segment * SegmentInfo , isDiskIndex bool , compactTime * compactTime ) bool {
// no longer restricted binlog numbers because this is now related to field numbers
2023-11-23 09:30:25 +00:00
binlogCount := GetBinlogCount ( segment . GetBinlogs ( ) )
2022-06-15 15:14:10 +00:00
2022-12-12 02:33:26 +00:00
// count all the statlog file count, only for flush generated segments
if len ( segment . CompactionFrom ) == 0 {
2023-11-23 09:30:25 +00:00
statsLogCount := GetBinlogCount ( segment . GetStatslogs ( ) )
2022-12-12 02:33:26 +00:00
var maxSize int
if isDiskIndex {
maxSize = int ( Params . DataCoordCfg . DiskSegmentMaxSize . GetAsInt64 ( ) * 1024 * 1024 / Params . DataNodeCfg . BinLogMaxSize . GetAsInt64 ( ) )
} else {
maxSize = int ( Params . DataCoordCfg . SegmentMaxSize . GetAsInt64 ( ) * 1024 * 1024 / Params . DataNodeCfg . BinLogMaxSize . GetAsInt64 ( ) )
}
// if stats log is more than expected, trigger compaction to reduce stats log size.
// TODO maybe we want to compact to single statslog to reduce watch dml channel cost
// TODO avoid rebuild index twice.
2023-11-23 09:30:25 +00:00
if statsLogCount > maxSize * 2.0 {
log . Info ( "stats number is too much, trigger compaction" , zap . Int64 ( "segmentID" , segment . ID ) , zap . Int ( "Bin logs" , binlogCount ) , zap . Int ( "Stat logs" , statsLogCount ) )
2022-12-12 02:33:26 +00:00
return true
}
2022-06-15 15:14:10 +00:00
}
2023-11-23 09:30:25 +00:00
deltaLogCount := GetBinlogCount ( segment . GetDeltalogs ( ) )
if deltaLogCount > Params . DataCoordCfg . SingleCompactionDeltalogMaxNum . GetAsInt ( ) {
log . Info ( "total delta number is too much, trigger compaction" , zap . Int64 ( "segmentID" , segment . ID ) , zap . Int ( "Bin logs" , binlogCount ) , zap . Int ( "Delta logs" , deltaLogCount ) )
2022-06-15 15:14:10 +00:00
return true
}
// if expire time is enabled, put segment into compaction candidate
totalExpiredSize := int64 ( 0 )
2022-06-22 11:00:14 +00:00
totalExpiredRows := 0
2022-06-15 15:14:10 +00:00
for _ , binlogs := range segment . GetBinlogs ( ) {
for _ , l := range binlogs . GetBinlogs ( ) {
// TODO, we should probably estimate expired log entries by total rows in binlog and the ralationship of timeTo, timeFrom and expire time
if l . TimestampTo < compactTime . expireTime {
2023-08-08 13:17:21 +00:00
log . RatedDebug ( 10 , "mark binlog as expired" ,
zap . Int64 ( "segmentID" , segment . ID ) ,
zap . Int64 ( "binlogID" , l . GetLogID ( ) ) ,
zap . Uint64 ( "binlogTimestampTo" , l . TimestampTo ) ,
zap . Uint64 ( "compactExpireTime" , compactTime . expireTime ) )
2022-06-22 11:00:14 +00:00
totalExpiredRows += int ( l . GetEntriesNum ( ) )
2022-06-15 15:14:10 +00:00
totalExpiredSize += l . GetLogSize ( )
}
}
}
2023-08-08 13:17:21 +00:00
if float64 ( totalExpiredRows ) / float64 ( segment . GetNumOfRows ( ) ) >= Params . DataCoordCfg . SingleCompactionRatioThreshold . GetAsFloat ( ) ||
totalExpiredSize > Params . DataCoordCfg . SingleCompactionExpiredLogMaxSize . GetAsInt64 ( ) {
2023-07-14 07:56:31 +00:00
log . Info ( "total expired entities is too much, trigger compaction" , zap . Int64 ( "segmentID" , segment . ID ) ,
2023-08-08 13:17:21 +00:00
zap . Int ( "expiredRows" , totalExpiredRows ) , zap . Int64 ( "expiredLogSize" , totalExpiredSize ) ,
zap . Bool ( "createdByCompaction" , segment . CreatedByCompaction ) , zap . Int64s ( "compactionFrom" , segment . CompactionFrom ) )
2022-06-15 15:14:10 +00:00
return true
}
2021-11-05 14:25:00 +00:00
totalDeletedRows := 0
totalDeleteLogSize := int64 ( 0 )
2022-06-15 15:14:10 +00:00
for _ , deltaLogs := range segment . GetDeltalogs ( ) {
for _ , l := range deltaLogs . GetBinlogs ( ) {
2023-09-04 09:41:48 +00:00
totalDeletedRows += int ( l . GetEntriesNum ( ) )
totalDeleteLogSize += l . GetLogSize ( )
2021-11-05 14:25:00 +00:00
}
}
// currently delta log size and delete ratio policy is applied
2022-12-07 10:01:19 +00:00
if float64 ( totalDeletedRows ) / float64 ( segment . GetNumOfRows ( ) ) >= Params . DataCoordCfg . SingleCompactionRatioThreshold . GetAsFloat ( ) || totalDeleteLogSize > Params . DataCoordCfg . SingleCompactionDeltaLogMaxSize . GetAsInt64 ( ) {
2023-09-04 09:41:48 +00:00
log . Info ( "total delete entities is too much, trigger compaction" ,
zap . Int64 ( "segmentID" , segment . ID ) ,
zap . Int64 ( "numRows" , segment . GetNumOfRows ( ) ) ,
zap . Int ( "deleted rows" , totalDeletedRows ) ,
zap . Int64 ( "delete log size" , totalDeleteLogSize ) )
2022-06-15 15:14:10 +00:00
return true
2022-02-18 06:51:49 +00:00
}
2023-11-21 01:26:22 +00:00
// index version of segment lower than current version and IndexFileKeys should have value, trigger compaction
for _ , index := range segment . segmentIndexes {
if index . CurrentIndexVersion < t . indexEngineVersionManager . GetCurrentIndexEngineVersion ( ) &&
len ( index . IndexFileKeys ) > 0 {
log . Info ( "index version is too old, trigger compaction" ,
zap . Int64 ( "segmentID" , segment . ID ) ,
zap . Int64 ( "indexID" , index . IndexID ) ,
zap . Strings ( "indexFileKeys" , index . IndexFileKeys ) ,
zap . Int32 ( "currentIndexVersion" , index . CurrentIndexVersion ) ,
zap . Int32 ( "currentEngineVersion" , t . indexEngineVersionManager . GetCurrentIndexEngineVersion ( ) ) )
return true
}
}
2022-02-18 06:51:49 +00:00
return false
}
2021-12-14 05:55:07 +00:00
// isFlush reports whether the segment is in the Flushed or Flushing state.
func isFlush(segment *SegmentInfo) bool {
	switch segment.GetState() {
	case commonpb.SegmentState_Flushed, commonpb.SegmentState_Flushing:
		return true
	default:
		return false
	}
}
2022-11-15 03:13:07 +00:00
// fetchSegIDs returns the segment ID of every entry in segBinLogs, in the same
// order. It returns nil when segBinLogs is empty. The result is allocated once
// at its final size because the number of IDs is known up front.
func fetchSegIDs(segBinLogs []*datapb.CompactionSegmentBinlogs) []int64 {
	if len(segBinLogs) == 0 {
		// Keep the nil result callers got before pre-sizing was introduced.
		return nil
	}
	segIDs := make([]int64, 0, len(segBinLogs))
	for _, segBinLog := range segBinLogs {
		segIDs = append(segIDs, segBinLog.GetSegmentID())
	}
	return segIDs
}