mirror of https://github.com/milvus-io/milvus.git

enhance: Change the fixed value to a ratio for clustering segment size (#35076)

issue: #34495

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>

parent a642a26ed4
commit 196a7986b3
@@ -540,8 +540,8 @@ dataCoord:
       minInterval: 3600 # The minimum interval between clustering compaction executions of one collection, to avoid redundant compaction
       maxInterval: 259200 # If a collection haven't been clustering compacted for longer than maxInterval, force compact
       newDataSizeThreshold: 512m # If new data size is large than newDataSizeThreshold, execute clustering compaction
-      preferSegmentSize: 512m
-      maxSegmentSize: 1024m
+      preferSegmentSizeRatio: 0.8
+      maxSegmentSizeRatio: 1
       maxTrainSizeRatio: 0.8 # max data size ratio in Kmeans train, if larger than it, will down sampling to meet this limit
       maxCentroidsNum: 10240 # maximum centroids number in Kmeans train
       minCentroidsNum: 16 # minimum centroids number in Kmeans train
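For orientation, an illustrative reading of the new defaults (not part of the diff): the two settings are now ratios applied to the collection's effective maximum segment size rather than absolute byte values. With dataCoord.segment.maxSize at its usual default of 1024 MB, maxSegmentSizeRatio: 1 caps clustered segments at about 1024 MB and preferSegmentSizeRatio: 0.8 targets roughly 819 MB; for collections whose vector fields are all DiskANN-indexed, the same ratios are applied to dataCoord.segment.diskSegmentMaxSize instead (see the getExpectedSegmentSize hunk further down).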
@@ -197,16 +197,20 @@ func (policy *clusteringCompactionPolicy) collectionIsClusteringCompacting(colle
     return false, 0
 }

-func calculateClusteringCompactionConfig(view CompactionView) (segmentIDs []int64, totalRows, maxSegmentRows, preferSegmentRows int64) {
+func calculateClusteringCompactionConfig(coll *collectionInfo, view CompactionView, expectedSegmentSize int64) (segmentIDs []int64, totalRows, maxSegmentRows, preferSegmentRows int64, err error) {
     for _, s := range view.GetSegmentsView() {
         totalRows += s.NumOfRows
         segmentIDs = append(segmentIDs, s.ID)
     }
-    clusteringMaxSegmentSize := paramtable.Get().DataCoordCfg.ClusteringCompactionMaxSegmentSize.GetAsSize()
-    clusteringPreferSegmentSize := paramtable.Get().DataCoordCfg.ClusteringCompactionPreferSegmentSize.GetAsSize()
-    segmentMaxSize := paramtable.Get().DataCoordCfg.SegmentMaxSize.GetAsInt64() * 1024 * 1024
-    maxSegmentRows = view.GetSegmentsView()[0].MaxRowNum * clusteringMaxSegmentSize / segmentMaxSize
-    preferSegmentRows = view.GetSegmentsView()[0].MaxRowNum * clusteringPreferSegmentSize / segmentMaxSize
+    clusteringMaxSegmentSizeRatio := paramtable.Get().DataCoordCfg.ClusteringCompactionMaxSegmentSizeRatio.GetAsFloat()
+    clusteringPreferSegmentSizeRatio := paramtable.Get().DataCoordCfg.ClusteringCompactionPreferSegmentSizeRatio.GetAsFloat()
+    maxRows, err := calBySegmentSizePolicy(coll.Schema, expectedSegmentSize)
+    if err != nil {
+        return nil, 0, 0, 0, err
+    }
+
+    maxSegmentRows = int64(float64(maxRows) * clusteringMaxSegmentSizeRatio)
+    preferSegmentRows = int64(float64(maxRows) * clusteringPreferSegmentSizeRatio)
     return
 }

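A worked example, purely illustrative (the schema, sizes, and ratios below are assumptions, not values taken from this PR): for an Int64 primary key plus a 128-dim FloatVector, EstimateSizePerRecord gives roughly 520 bytes per row, so the new derivation produces row budgets directly from the schema and the expected segment size instead of scaling the first segment's MaxRowNum as the old code did.

    // Standalone sketch of the arithmetic only; it does not call into the datacoord package.
    package main

    import "fmt"

    func main() {
        sizePerRecord := int64(8 + 128*4)                // Int64 PK + 128-dim FloatVector ≈ 520 bytes/row
        expectedSegmentSize := int64(1024) * 1024 * 1024 // assumed result of getExpectedSegmentSize for in-memory indexes
        maxRows := expectedSegmentSize / sizePerRecord   // what calBySegmentSizePolicy computes: ≈ 2064888 rows
        fmt.Println(int64(float64(maxRows) * 1.0))       // maxSegmentRows with maxSegmentSizeRatio = 1.0
        fmt.Println(int64(float64(maxRows) * 0.8))       // preferSegmentRows with preferSegmentSizeRatio = 0.8 ≈ 1651910
    }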
@@ -182,3 +182,7 @@ func (s *ClusteringCompactionPolicySuite) TestCollectionIsClusteringCompacting()
         }
     })
 }
+
+func (s *ClusteringCompactionPolicySuite) TestGetExpectedSegmentSize() {
+
+}
@@ -24,10 +24,14 @@ import (
     "github.com/samber/lo"
     "go.uber.org/zap"

+    "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
+    "github.com/milvus-io/milvus/internal/metastore/model"
     "github.com/milvus-io/milvus/internal/proto/datapb"
     "github.com/milvus-io/milvus/pkg/log"
+    "github.com/milvus-io/milvus/pkg/util/indexparamcheck"
     "github.com/milvus-io/milvus/pkg/util/lock"
     "github.com/milvus-io/milvus/pkg/util/logutil"
+    "github.com/milvus-io/milvus/pkg/util/typeutil"
 )

 type CompactionTriggerType int8
@@ -296,19 +300,29 @@ func (m *CompactionTriggerManager) SubmitL0ViewToScheduler(ctx context.Context,
 func (m *CompactionTriggerManager) SubmitClusteringViewToScheduler(ctx context.Context, view CompactionView) {
     taskID, _, err := m.allocator.allocN(2)
     if err != nil {
-        log.Warn("Failed to submit compaction view to scheduler because allocate id fail", zap.String("view", view.String()))
+        log.Warn("Failed to submit compaction view to scheduler because allocate id fail", zap.String("view", view.String()),
+            zap.Error(err))
         return
     }
     collection, err := m.handler.GetCollection(ctx, view.GetGroupLabel().CollectionID)
     if err != nil {
-        log.Warn("Failed to submit compaction view to scheduler because get collection fail", zap.String("view", view.String()))
+        log.Warn("Failed to submit compaction view to scheduler because get collection fail", zap.String("view", view.String()),
+            zap.Error(err))
         return
     }
-    _, totalRows, maxSegmentRows, preferSegmentRows := calculateClusteringCompactionConfig(view)
+
+    expectedSegmentSize := m.getExpectedSegmentSize(collection)
+
+    _, totalRows, maxSegmentRows, preferSegmentRows, err := calculateClusteringCompactionConfig(collection, view, expectedSegmentSize)
+    if err != nil {
+        log.Warn("Failed to calculate cluster compaction config fail", zap.String("view", view.String()), zap.Error(err))
+        return
+    }
+
     resultSegmentNum := totalRows / preferSegmentRows * 2
     start, end, err := m.allocator.allocN(resultSegmentNum)
     if err != nil {
-        log.Warn("pre-allocate result segments failed", zap.String("view", view.String()))
+        log.Warn("pre-allocate result segments failed", zap.String("view", view.String()), zap.Error(err))
         return
     }
     task := &datapb.CompactionTask{
@@ -397,6 +411,29 @@ func (m *CompactionTriggerManager) SubmitSingleViewToScheduler(ctx context.Conte
     )
 }

+func (m *CompactionTriggerManager) getExpectedSegmentSize(collection *collectionInfo) int64 {
+    indexInfos := m.meta.indexMeta.GetIndexesForCollection(collection.ID, "")
+
+    vectorFields := typeutil.GetVectorFieldSchemas(collection.Schema)
+    fieldIndexTypes := lo.SliceToMap(indexInfos, func(t *model.Index) (int64, indexparamcheck.IndexType) {
+        return t.FieldID, GetIndexType(t.IndexParams)
+    })
+    vectorFieldsWithDiskIndex := lo.Filter(vectorFields, func(field *schemapb.FieldSchema, _ int) bool {
+        if indexType, ok := fieldIndexTypes[field.FieldID]; ok {
+            return indexparamcheck.IsDiskIndex(indexType)
+        }
+        return false
+    })
+
+    allDiskIndex := len(vectorFields) == len(vectorFieldsWithDiskIndex)
+    if allDiskIndex {
+        // Only if all vector fields index type are DiskANN, recalc segment max size here.
+        return Params.DataCoordCfg.DiskSegmentMaxSize.GetAsInt64() * 1024 * 1024
+    }
+    // If some vector fields index type are not DiskANN, recalc segment max size using default policy.
+    return Params.DataCoordCfg.SegmentMaxSize.GetAsInt64() * 1024 * 1024
+}
+
 // chanPartSegments is an internal result struct, which is aggregates of SegmentInfos with same collectionID, partitionID and channelName
 type chanPartSegments struct {
     collectionID UniqueID
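A reading of the helper above (not an authoritative statement of intent): a vector field that has no index at all is absent from fieldIndexTypes and is therefore counted as non-disk, so the default dataCoord.segment.maxSize applies; the "some vector has no index" case in the new TestGetExpectedSegmentSize below exercises exactly that path.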
@@ -2,13 +2,22 @@ package datacoord

 import (
     "context"
+    "strconv"
     "testing"

+    "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
+
+    "github.com/milvus-io/milvus/pkg/util/paramtable"
+
+    "github.com/milvus-io/milvus/internal/metastore/model"
+    "github.com/milvus-io/milvus/pkg/common"
+
     "github.com/samber/lo"
     "github.com/stretchr/testify/mock"
     "github.com/stretchr/testify/suite"
     "go.uber.org/zap"

+    "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
     "github.com/milvus-io/milvus/internal/proto/datapb"
     "github.com/milvus-io/milvus/pkg/log"
 )
@@ -140,3 +149,168 @@ func (s *CompactionTriggerManagerSuite) TestNotifyByViewChange() {
     s.mockAlloc.EXPECT().allocID(mock.Anything).Return(19530, nil).Maybe()
     s.triggerManager.notify(context.Background(), TriggerTypeLevelZeroViewChange, levelZeroView)
 }
+
+func (s *CompactionTriggerManagerSuite) TestGetExpectedSegmentSize() {
+    var (
+        collectionID = int64(1000)
+        fieldID      = int64(2000)
+        indexID      = int64(3000)
+    )
+    paramtable.Get().Save(paramtable.Get().DataCoordCfg.SegmentMaxSize.Key, strconv.Itoa(100))
+    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.SegmentMaxSize.Key)
+
+    paramtable.Get().Save(paramtable.Get().DataCoordCfg.DiskSegmentMaxSize.Key, strconv.Itoa(200))
+    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.DiskSegmentMaxSize.Key)
+
+    s.triggerManager.meta = &meta{
+        indexMeta: &indexMeta{
+            indexes: map[UniqueID]map[UniqueID]*model.Index{
+                collectionID: {
+                    indexID + 1: &model.Index{
+                        CollectionID: collectionID,
+                        FieldID:      fieldID + 1,
+                        IndexID:      indexID + 1,
+                        IndexName:    "",
+                        IsDeleted:    false,
+                        CreateTime:   0,
+                        TypeParams:   nil,
+                        IndexParams: []*commonpb.KeyValuePair{
+                            {Key: common.IndexTypeKey, Value: "DISKANN"},
+                        },
+                        IsAutoIndex:     false,
+                        UserIndexParams: nil,
+                    },
+                    indexID + 2: &model.Index{
+                        CollectionID: collectionID,
+                        FieldID:      fieldID + 2,
+                        IndexID:      indexID + 2,
+                        IndexName:    "",
+                        IsDeleted:    false,
+                        CreateTime:   0,
+                        TypeParams:   nil,
+                        IndexParams: []*commonpb.KeyValuePair{
+                            {Key: common.IndexTypeKey, Value: "DISKANN"},
+                        },
+                        IsAutoIndex:     false,
+                        UserIndexParams: nil,
+                    },
+                },
+            },
+        },
+    }
+
+    s.Run("all DISKANN", func() {
+        collection := &collectionInfo{
+            ID: collectionID,
+            Schema: &schemapb.CollectionSchema{
+                Name:        "coll1",
+                Description: "",
+                Fields: []*schemapb.FieldSchema{
+                    {FieldID: fieldID, Name: "field0", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
+                    {FieldID: fieldID + 1, Name: "field1", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+                    {FieldID: fieldID + 2, Name: "field2", DataType: schemapb.DataType_Float16Vector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+                },
+                EnableDynamicField: false,
+                Properties:         nil,
+            },
+        }
+
+        s.Equal(int64(200*1024*1024), s.triggerManager.getExpectedSegmentSize(collection))
+    })
+
+    s.Run("HNSW & DISKANN", func() {
+        s.triggerManager.meta = &meta{
+            indexMeta: &indexMeta{
+                indexes: map[UniqueID]map[UniqueID]*model.Index{
+                    collectionID: {
+                        indexID + 1: &model.Index{
+                            CollectionID: collectionID,
+                            FieldID:      fieldID + 1,
+                            IndexID:      indexID + 1,
+                            IndexName:    "",
+                            IsDeleted:    false,
+                            CreateTime:   0,
+                            TypeParams:   nil,
+                            IndexParams: []*commonpb.KeyValuePair{
+                                {Key: common.IndexTypeKey, Value: "HNSW"},
+                            },
+                            IsAutoIndex:     false,
+                            UserIndexParams: nil,
+                        },
+                        indexID + 2: &model.Index{
+                            CollectionID: collectionID,
+                            FieldID:      fieldID + 2,
+                            IndexID:      indexID + 2,
+                            IndexName:    "",
+                            IsDeleted:    false,
+                            CreateTime:   0,
+                            TypeParams:   nil,
+                            IndexParams: []*commonpb.KeyValuePair{
+                                {Key: common.IndexTypeKey, Value: "DISKANN"},
+                            },
+                            IsAutoIndex:     false,
+                            UserIndexParams: nil,
+                        },
+                    },
+                },
+            },
+        }
+        collection := &collectionInfo{
+            ID: collectionID,
+            Schema: &schemapb.CollectionSchema{
+                Name:        "coll1",
+                Description: "",
+                Fields: []*schemapb.FieldSchema{
+                    {FieldID: fieldID, Name: "field0", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
+                    {FieldID: fieldID + 1, Name: "field1", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+                    {FieldID: fieldID + 2, Name: "field2", DataType: schemapb.DataType_Float16Vector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+                },
+                EnableDynamicField: false,
+                Properties:         nil,
+            },
+        }
+
+        s.Equal(int64(100*1024*1024), s.triggerManager.getExpectedSegmentSize(collection))
+    })
+
+    s.Run("some vector has no index", func() {
+        s.triggerManager.meta = &meta{
+            indexMeta: &indexMeta{
+                indexes: map[UniqueID]map[UniqueID]*model.Index{
+                    collectionID: {
+                        indexID + 1: &model.Index{
+                            CollectionID: collectionID,
+                            FieldID:      fieldID + 1,
+                            IndexID:      indexID + 1,
+                            IndexName:    "",
+                            IsDeleted:    false,
+                            CreateTime:   0,
+                            TypeParams:   nil,
+                            IndexParams: []*commonpb.KeyValuePair{
+                                {Key: common.IndexTypeKey, Value: "HNSW"},
+                            },
+                            IsAutoIndex:     false,
+                            UserIndexParams: nil,
+                        },
+                    },
+                },
+            },
+        }
+        collection := &collectionInfo{
+            ID: collectionID,
+            Schema: &schemapb.CollectionSchema{
+                Name:        "coll1",
+                Description: "",
+                Fields: []*schemapb.FieldSchema{
+                    {FieldID: fieldID, Name: "field0", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
+                    {FieldID: fieldID + 1, Name: "field1", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+                    {FieldID: fieldID + 2, Name: "field2", DataType: schemapb.DataType_Float16Vector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+                },
+                EnableDynamicField: false,
+                Properties:         nil,
+            },
+        }
+
+        s.Equal(int64(100*1024*1024), s.triggerManager.getExpectedSegmentSize(collection))
+    })
+}
@@ -67,6 +67,21 @@ func calBySchemaPolicyWithDiskIndex(schema *schemapb.CollectionSchema) (int, err
     return int(threshold / float64(sizePerRecord)), nil
 }

+func calBySegmentSizePolicy(schema *schemapb.CollectionSchema, segmentSize int64) (int, error) {
+    if schema == nil {
+        return -1, errors.New("nil schema")
+    }
+    sizePerRecord, err := typeutil.EstimateSizePerRecord(schema)
+    if err != nil {
+        return -1, err
+    }
+    // check zero value, preventing panicking
+    if sizePerRecord == 0 {
+        return -1, errors.New("zero size record schema found")
+    }
+    return int(segmentSize) / sizePerRecord, nil
+}
+
 // AllocatePolicy helper function definition to allocate Segment space
 type AllocatePolicy func(segments []*SegmentInfo, count int64,
     maxCountPerL1Segment int64, level datapb.SegmentLevel) ([]*Allocation, []*Allocation)
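As a quick cross-check of the formula (a self-contained sketch; the 8-byte Int64 and 32-byte 8-dim FloatVector sizes mirror the "normal case" in the new test below):

    package main

    import "fmt"

    func main() {
        sizePerRecord := 8 + 4*8          // Int64 primary key + 8-dim FloatVector = 40 bytes per row
        fmt.Println(1200 / sizePerRecord) // a 1200-byte budget holds 30 rows, as the test asserts
    }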
@@ -140,6 +140,58 @@ func TestGetChannelOpenSegCapacityPolicy(t *testing.T) {
     }
 }

+func TestCalBySegmentSizePolicy(t *testing.T) {
+    t.Run("nil schema", func(t *testing.T) {
+        rows, err := calBySegmentSizePolicy(nil, 1024)
+
+        assert.Error(t, err)
+        assert.Equal(t, -1, rows)
+    })
+
+    t.Run("get dim failed", func(t *testing.T) {
+        schema := &schemapb.CollectionSchema{
+            Name:        "coll1",
+            Description: "",
+            Fields: []*schemapb.FieldSchema{
+                {FieldID: fieldID, Name: "field0", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
+                {FieldID: fieldID + 1, Name: "field1", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "fake"}}},
+            },
+            EnableDynamicField: false,
+            Properties:         nil,
+        }
+
+        rows, err := calBySegmentSizePolicy(schema, 1024)
+        assert.Error(t, err)
+        assert.Equal(t, -1, rows)
+    })
+
+    t.Run("sizePerRecord is zero", func(t *testing.T) {
+        schema := &schemapb.CollectionSchema{Fields: nil}
+        rows, err := calBySegmentSizePolicy(schema, 1024)
+
+        assert.Error(t, err)
+        assert.Equal(t, -1, rows)
+    })
+
+    t.Run("normal case", func(t *testing.T) {
+        schema := &schemapb.CollectionSchema{
+            Name:        "coll1",
+            Description: "",
+            Fields: []*schemapb.FieldSchema{
+                {FieldID: fieldID, Name: "field0", DataType: schemapb.DataType_Int64, IsPrimaryKey: true},
+                {FieldID: fieldID + 1, Name: "field1", DataType: schemapb.DataType_FloatVector, TypeParams: []*commonpb.KeyValuePair{{Key: "dim", Value: "8"}}},
+            },
+            EnableDynamicField: false,
+            Properties:         nil,
+        }
+
+        rows, err := calBySegmentSizePolicy(schema, 1200)
+        assert.NoError(t, err)
+        // 1200/(4*8+8)
+        assert.Equal(t, 30, rows)
+    })
+}
+
 func TestSortSegmentsByLastExpires(t *testing.T) {
     segs := make([]*SegmentInfo, 0, 10)
     for i := 0; i < 10; i++ {
@@ -184,7 +184,7 @@ func (at *analyzeTask) PreCheck(ctx context.Context, dependency *taskScheduler)
     at.req.Dim = int64(dim)

     totalSegmentsRawDataSize := float64(totalSegmentsRows) * float64(dim) * typeutil.VectorTypeSize(t.FieldType) // Byte
-    numClusters := int64(math.Ceil(totalSegmentsRawDataSize / float64(Params.DataCoordCfg.ClusteringCompactionPreferSegmentSize.GetAsSize())))
+    numClusters := int64(math.Ceil(totalSegmentsRawDataSize / (Params.DataCoordCfg.SegmentMaxSize.GetAsFloat() * 1024 * 1024 * Params.DataCoordCfg.ClusteringCompactionMaxSegmentSizeRatio.GetAsFloat())))
     if numClusters < Params.DataCoordCfg.ClusteringCompactionMinCentroidsNum.GetAsInt64() {
         log.Ctx(ctx).Info("data size is too small, skip analyze task", zap.Float64("raw data size", totalSegmentsRawDataSize), zap.Int64("num clusters", numClusters), zap.Int64("minimum num clusters required", Params.DataCoordCfg.ClusteringCompactionMinCentroidsNum.GetAsInt64()))
         at.SetState(indexpb.JobState_JobStateFinished, "")
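For scale, an illustrative calculation (assumed data size, default config values): with 50 GiB of raw vector data, segment.maxSize at 1024 MB, and maxSegmentSizeRatio at 1.0, numClusters = ceil(50 GiB / 1 GiB) = 50, comfortably inside the default [16, 10240] centroid bounds; a dataset of 15 GiB or less would fall under ClusteringCompactionMinCentroidsNum and the analyze task would be skipped, as before. The practical effect of this hunk is that the centroid count now follows the same ratio-scaled segment size as the compaction itself, rather than the removed preferSegmentSize byte value.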
@@ -3002,21 +3002,21 @@ type dataCoordConfig struct {
     SyncSegmentsInterval ParamItem `refreshable:"false"`

     // Clustering Compaction
     ClusteringCompactionEnable                 ParamItem `refreshable:"true"`
     ClusteringCompactionAutoEnable             ParamItem `refreshable:"true"`
     ClusteringCompactionTriggerInterval        ParamItem `refreshable:"true"`
     ClusteringCompactionMinInterval            ParamItem `refreshable:"true"`
     ClusteringCompactionMaxInterval            ParamItem `refreshable:"true"`
     ClusteringCompactionNewDataSizeThreshold   ParamItem `refreshable:"true"`
-    ClusteringCompactionPreferSegmentSize      ParamItem `refreshable:"true"`
-    ClusteringCompactionMaxSegmentSize         ParamItem `refreshable:"true"`
+    ClusteringCompactionPreferSegmentSizeRatio ParamItem `refreshable:"true"`
+    ClusteringCompactionMaxSegmentSizeRatio    ParamItem `refreshable:"true"`
     ClusteringCompactionMaxTrainSizeRatio      ParamItem `refreshable:"true"`
     ClusteringCompactionTimeoutInSeconds       ParamItem `refreshable:"true"`
     ClusteringCompactionMaxCentroidsNum        ParamItem `refreshable:"true"`
     ClusteringCompactionMinCentroidsNum        ParamItem `refreshable:"true"`
     ClusteringCompactionMinClusterSizeRatio    ParamItem `refreshable:"true"`
     ClusteringCompactionMaxClusterSizeRatio    ParamItem `refreshable:"true"`
     ClusteringCompactionMaxClusterSize         ParamItem `refreshable:"true"`

     // LevelZero Segment
     EnableLevelZeroSegment ParamItem `refreshable:"false"`
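One observation from the struct tags above (not a behavior change introduced by this PR): both renamed items keep refreshable:"true", so the new ratios should remain adjustable through dynamic configuration at runtime, just as the old byte-valued settings were.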
@@ -3449,7 +3449,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionEnable = ParamItem{
         Key:          "dataCoord.compaction.clustering.enable",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "false",
         Doc:          "Enable clustering compaction",
         Export:       true,
@@ -3458,7 +3458,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionAutoEnable = ParamItem{
         Key:          "dataCoord.compaction.clustering.autoEnable",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "false",
         Doc:          "Enable auto clustering compaction",
         Export:       true,
@@ -3467,7 +3467,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionTriggerInterval = ParamItem{
         Key:          "dataCoord.compaction.clustering.triggerInterval",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "600",
         Doc:          "clustering compaction trigger interval in seconds",
         Export:       true,
@@ -3476,7 +3476,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMinInterval = ParamItem{
         Key:          "dataCoord.compaction.clustering.minInterval",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         Doc:          "The minimum interval between clustering compaction executions of one collection, to avoid redundant compaction",
         DefaultValue: "3600",
         Export:       true,
@@ -3485,7 +3485,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMaxInterval = ParamItem{
         Key:          "dataCoord.compaction.clustering.maxInterval",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         Doc:          "If a collection haven't been clustering compacted for longer than maxInterval, force compact",
         DefaultValue: "86400",
         Export:       true,
@@ -3494,7 +3494,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionNewDataSizeThreshold = ParamItem{
         Key:          "dataCoord.compaction.clustering.newDataSizeThreshold",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         Doc:          "If new data size is large than newDataSizeThreshold, execute clustering compaction",
         DefaultValue: "512m",
         Export:       true,
@@ -3503,32 +3503,32 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionTimeoutInSeconds = ParamItem{
         Key:          "dataCoord.compaction.clustering.timeout",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "3600",
     }
     p.ClusteringCompactionTimeoutInSeconds.Init(base.mgr)

-    p.ClusteringCompactionPreferSegmentSize = ParamItem{
-        Key:          "dataCoord.compaction.clustering.preferSegmentSize",
-        Version:      "2.4.6",
-        DefaultValue: "512m",
+    p.ClusteringCompactionPreferSegmentSizeRatio = ParamItem{
+        Key:          "dataCoord.compaction.clustering.preferSegmentSizeRatio",
+        Version:      "2.4.7",
+        DefaultValue: "0.8",
         PanicIfEmpty: false,
         Export:       true,
     }
-    p.ClusteringCompactionPreferSegmentSize.Init(base.mgr)
+    p.ClusteringCompactionPreferSegmentSizeRatio.Init(base.mgr)

-    p.ClusteringCompactionMaxSegmentSize = ParamItem{
-        Key:          "dataCoord.compaction.clustering.maxSegmentSize",
-        Version:      "2.4.6",
-        DefaultValue: "1024m",
+    p.ClusteringCompactionMaxSegmentSizeRatio = ParamItem{
+        Key:          "dataCoord.compaction.clustering.maxSegmentSizeRatio",
+        Version:      "2.4.7",
+        DefaultValue: "1.0",
         PanicIfEmpty: false,
         Export:       true,
     }
-    p.ClusteringCompactionMaxSegmentSize.Init(base.mgr)
+    p.ClusteringCompactionMaxSegmentSizeRatio.Init(base.mgr)

     p.ClusteringCompactionMaxTrainSizeRatio = ParamItem{
         Key:          "dataCoord.compaction.clustering.maxTrainSizeRatio",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "0.8",
         Doc:          "max data size ratio in Kmeans train, if larger than it, will down sampling to meet this limit",
         Export:       true,
@@ -3537,7 +3537,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMaxCentroidsNum = ParamItem{
         Key:          "dataCoord.compaction.clustering.maxCentroidsNum",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "10240",
         Doc:          "maximum centroids number in Kmeans train",
         Export:       true,
@@ -3546,7 +3546,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMinCentroidsNum = ParamItem{
         Key:          "dataCoord.compaction.clustering.minCentroidsNum",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "16",
         Doc:          "minimum centroids number in Kmeans train",
         Export:       true,
@@ -3555,7 +3555,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMinClusterSizeRatio = ParamItem{
         Key:          "dataCoord.compaction.clustering.minClusterSizeRatio",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "0.01",
         Doc:          "minimum cluster size / avg size in Kmeans train",
         Export:       true,
@@ -3564,7 +3564,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMaxClusterSizeRatio = ParamItem{
         Key:          "dataCoord.compaction.clustering.maxClusterSizeRatio",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "10",
         Doc:          "maximum cluster size / avg size in Kmeans train",
         Export:       true,
@@ -3573,7 +3573,7 @@ During compaction, the size of segment # of rows is able to exceed segment max #

     p.ClusteringCompactionMaxClusterSize = ParamItem{
         Key:          "dataCoord.compaction.clustering.maxClusterSize",
-        Version:      "2.4.6",
+        Version:      "2.4.7",
         DefaultValue: "5g",
         Doc:          "maximum cluster size in Kmeans train",
         Export:       true,
@@ -483,10 +483,10 @@ func TestComponentParam(t *testing.T) {
         assert.Equal(t, int64(10*1024*1024), Params.ClusteringCompactionNewDataSizeThreshold.GetAsSize())
         params.Save("dataCoord.compaction.clustering.newDataSizeThreshold", "10g")
         assert.Equal(t, int64(10*1024*1024*1024), Params.ClusteringCompactionNewDataSizeThreshold.GetAsSize())
-        params.Save("dataCoord.compaction.clustering.maxSegmentSize", "100m")
-        assert.Equal(t, int64(100*1024*1024), Params.ClusteringCompactionMaxSegmentSize.GetAsSize())
-        params.Save("dataCoord.compaction.clustering.preferSegmentSize", "10m")
-        assert.Equal(t, int64(10*1024*1024), Params.ClusteringCompactionPreferSegmentSize.GetAsSize())
+        params.Save("dataCoord.compaction.clustering.maxSegmentSizeRatio", "1.2")
+        assert.Equal(t, 1.2, Params.ClusteringCompactionMaxSegmentSizeRatio.GetAsFloat())
+        params.Save("dataCoord.compaction.clustering.preferSegmentSizeRatio", "0.5")
+        assert.Equal(t, 0.5, Params.ClusteringCompactionPreferSegmentSizeRatio.GetAsFloat())
         params.Save("dataCoord.slot.clusteringCompactionUsage", "10")
         assert.Equal(t, 10, Params.ClusteringCompactionSlotUsage.GetAsInt())
         params.Save("dataCoord.slot.mixCompactionUsage", "5")
@@ -71,11 +71,13 @@ func (s *ClusteringCompactionSuite) TestClusteringCompaction() {
     paramtable.Get().Save(paramtable.Get().DataCoordCfg.EnableAutoCompaction.Key, "false")
     defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.EnableAutoCompaction.Key)

-    paramtable.Get().Save(paramtable.Get().DataCoordCfg.ClusteringCompactionMaxSegmentSize.Key, "1m")
-    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.ClusteringCompactionMaxSegmentSize.Key)
+    paramtable.Get().Save(paramtable.Get().DataCoordCfg.SegmentMaxSize.Key, "1")
+    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.SegmentMaxSize.Key)
+    paramtable.Get().Save(paramtable.Get().DataCoordCfg.ClusteringCompactionMaxSegmentSizeRatio.Key, "1.0")
+    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.ClusteringCompactionMaxSegmentSizeRatio.Key)

-    paramtable.Get().Save(paramtable.Get().DataCoordCfg.ClusteringCompactionPreferSegmentSize.Key, "1m")
-    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.ClusteringCompactionPreferSegmentSize.Key)
+    paramtable.Get().Save(paramtable.Get().DataCoordCfg.ClusteringCompactionPreferSegmentSizeRatio.Key, "1.0")
+    defer paramtable.Get().Reset(paramtable.Get().DataCoordCfg.ClusteringCompactionPreferSegmentSizeRatio.Key)

     schema := ConstructScalarClusteringSchema(collectionName, dim, true)
     marshaledSchema, err := proto.Marshal(schema)
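A note on the integration test change above (an inference from the chosen values, not stated in the PR): saving segment.maxSize as "1" (interpreted as 1 MB) together with maxSegmentSizeRatio "1.0" and preferSegmentSizeRatio "1.0" reproduces the old 1m absolute caps through the new ratio path, so the test should keep producing the same small clustered segments.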