// Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package segments /* #cgo pkg-config: milvus_core #include "futures/future_c.h" #include "segcore/collection_c.h" #include "segcore/plan_c.h" #include "segcore/reduce_c.h" */ import "C" import ( "context" "fmt" "strings" "sync" "time" "unsafe" "github.com/cockroachdb/errors" "go.opentelemetry.io/otel" "go.uber.org/atomic" "go.uber.org/zap" "google.golang.org/protobuf/proto" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/msgpb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/querycoordv2/params" "github.com/milvus-io/milvus/internal/querynodev2/pkoracle" "github.com/milvus-io/milvus/internal/querynodev2/segments/state" "github.com/milvus-io/milvus/internal/storage" "github.com/milvus-io/milvus/internal/util/indexparamcheck" "github.com/milvus-io/milvus/internal/util/segcore" "github.com/milvus-io/milvus/internal/util/vecindexmgr" "github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/metrics" "github.com/milvus-io/milvus/pkg/v2/proto/cgopb" "github.com/milvus-io/milvus/pkg/v2/proto/datapb" "github.com/milvus-io/milvus/pkg/v2/proto/indexcgopb" "github.com/milvus-io/milvus/pkg/v2/proto/querypb" "github.com/milvus-io/milvus/pkg/v2/proto/segcorepb" "github.com/milvus-io/milvus/pkg/v2/util/funcutil" "github.com/milvus-io/milvus/pkg/v2/util/indexparams" "github.com/milvus-io/milvus/pkg/v2/util/merr" "github.com/milvus-io/milvus/pkg/v2/util/metautil" "github.com/milvus-io/milvus/pkg/v2/util/metric" "github.com/milvus-io/milvus/pkg/v2/util/paramtable" "github.com/milvus-io/milvus/pkg/v2/util/timerecord" "github.com/milvus-io/milvus/pkg/v2/util/typeutil" ) type SegmentType = commonpb.SegmentState const ( SegmentTypeGrowing = commonpb.SegmentState_Growing SegmentTypeSealed = commonpb.SegmentState_Sealed ) var ErrSegmentUnhealthy = errors.New("segment unhealthy") // IndexedFieldInfo contains binlog info of vector field type IndexedFieldInfo struct { FieldBinlog *datapb.FieldBinlog IndexInfo *querypb.FieldIndexInfo IsLoaded bool } type baseSegment struct { collection *Collection version *atomic.Int64 segmentType SegmentType bloomFilterSet *pkoracle.BloomFilterSet loadInfo *atomic.Pointer[querypb.SegmentLoadInfo] isLazyLoad bool skipGrowingBF bool // Skip generating or maintaining BF for growing segments; deletion checks will be handled in segcore. 
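	// channel is the shard this segment belongs to, parsed from the segment load info's insert channel.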
channel metautil.Channel bm25Stats map[int64]*storage.BM25Stats resourceUsageCache *atomic.Pointer[ResourceUsage] needUpdatedVersion *atomic.Int64 // only for lazy load mode update index } func newBaseSegment(collection *Collection, segmentType SegmentType, version int64, loadInfo *querypb.SegmentLoadInfo) (baseSegment, error) { channel, err := metautil.ParseChannel(loadInfo.GetInsertChannel(), channelMapper) if err != nil { return baseSegment{}, err } bs := baseSegment{ collection: collection, loadInfo: atomic.NewPointer[querypb.SegmentLoadInfo](loadInfo), version: atomic.NewInt64(version), segmentType: segmentType, bloomFilterSet: pkoracle.NewBloomFilterSet(loadInfo.GetSegmentID(), loadInfo.GetPartitionID(), segmentType), bm25Stats: make(map[int64]*storage.BM25Stats), channel: channel, isLazyLoad: isLazyLoad(collection, segmentType), skipGrowingBF: segmentType == SegmentTypeGrowing && paramtable.Get().QueryNodeCfg.SkipGrowingSegmentBF.GetAsBool(), resourceUsageCache: atomic.NewPointer[ResourceUsage](nil), needUpdatedVersion: atomic.NewInt64(0), } return bs, nil } // isLazyLoad checks if the segment is lazy load func isLazyLoad(collection *Collection, segmentType SegmentType) bool { return segmentType == SegmentTypeSealed && // only sealed segment enable lazy load (common.IsCollectionLazyLoadEnabled(collection.Schema().Properties...) || // collection level lazy load (!common.HasLazyload(collection.Schema().Properties) && params.Params.QueryNodeCfg.LazyLoadEnabled.GetAsBool())) // global level lazy load } // ID returns the identity number. func (s *baseSegment) ID() int64 { return s.loadInfo.Load().GetSegmentID() } func (s *baseSegment) Collection() int64 { return s.loadInfo.Load().GetCollectionID() } func (s *baseSegment) GetCollection() *Collection { return s.collection } func (s *baseSegment) Partition() int64 { return s.loadInfo.Load().GetPartitionID() } func (s *baseSegment) DatabaseName() string { return s.collection.GetDBName() } func (s *baseSegment) ResourceGroup() string { return s.collection.GetResourceGroup() } func (s *baseSegment) Shard() metautil.Channel { return s.channel } func (s *baseSegment) Type() SegmentType { return s.segmentType } func (s *baseSegment) Level() datapb.SegmentLevel { return s.loadInfo.Load().GetLevel() } func (s *baseSegment) IsSorted() bool { return s.loadInfo.Load().GetIsSorted() } func (s *baseSegment) StartPosition() *msgpb.MsgPosition { return s.loadInfo.Load().GetStartPosition() } func (s *baseSegment) Version() int64 { return s.version.Load() } func (s *baseSegment) CASVersion(old, newVersion int64) bool { return s.version.CompareAndSwap(old, newVersion) } func (s *baseSegment) LoadInfo() *querypb.SegmentLoadInfo { return s.loadInfo.Load() } func (s *baseSegment) UpdateBloomFilter(pks []storage.PrimaryKey) { if s.skipGrowingBF { return } s.bloomFilterSet.UpdateBloomFilter(pks) } func (s *baseSegment) UpdateBM25Stats(stats map[int64]*storage.BM25Stats) { for fieldID, new := range stats { if current, ok := s.bm25Stats[fieldID]; ok { current.Merge(new) } else { s.bm25Stats[fieldID] = new } } } func (s *baseSegment) GetBM25Stats() map[int64]*storage.BM25Stats { return s.bm25Stats } // MayPkExist returns true if the given PK exists in the PK range and being positive through the bloom filter, // false otherwise, // may returns true even the PK doesn't exist actually func (s *baseSegment) MayPkExist(pk *storage.LocationsCache) bool { if s.skipGrowingBF { return true } return s.bloomFilterSet.MayPkExist(pk) } func (s *baseSegment) BatchPkExist(lc 
*storage.BatchLocationsCache) []bool { if s.skipGrowingBF { allPositive := make([]bool, lc.Size()) for i := 0; i < lc.Size(); i++ { allPositive[i] = true } return allPositive } return s.bloomFilterSet.BatchPkExist(lc) } // ResourceUsageEstimate returns the estimated resource usage of the segment. func (s *baseSegment) ResourceUsageEstimate() ResourceUsage { if s.segmentType == SegmentTypeGrowing { // Growing segment cannot do resource usage estimate. return ResourceUsage{} } cache := s.resourceUsageCache.Load() if cache != nil { return *cache } usage, err := getResourceUsageEstimateOfSegment(s.collection.Schema(), s.LoadInfo(), resourceEstimateFactor{ memoryUsageFactor: 1.0, memoryIndexUsageFactor: 1.0, enableTempSegmentIndex: false, deltaDataExpansionFactor: paramtable.Get().QueryNodeCfg.DeltaDataExpansionRate.GetAsFloat(), }) if err != nil { // Should never failure, if failed, segment should never be loaded. log.Warn("unreachable: failed to get resource usage estimate of segment", zap.Error(err), zap.Int64("collectionID", s.Collection()), zap.Int64("segmentID", s.ID())) return ResourceUsage{} } s.resourceUsageCache.Store(usage) return *usage } func (s *baseSegment) IsLazyLoad() bool { return s.isLazyLoad } func (s *baseSegment) NeedUpdatedVersion() int64 { return s.needUpdatedVersion.Load() } func (s *baseSegment) SetLoadInfo(loadInfo *querypb.SegmentLoadInfo) { s.loadInfo.Store(loadInfo) } func (s *baseSegment) SetNeedUpdatedVersion(version int64) { s.needUpdatedVersion.Store(version) } type FieldInfo struct { *datapb.FieldBinlog RowCount int64 } var _ Segment = (*LocalSegment)(nil) // Segment is a wrapper of the underlying C-structure segment. type LocalSegment struct { baseSegment ptrLock *state.LoadStateLock ptr C.CSegmentInterface // TODO: Remove in future, after move load index into segcore package. 
// always keep same with csegment.RawPtr(), for eaiser to access, csegment segcore.CSegment // cached results, to avoid too many CGO calls memSize *atomic.Int64 rowNum *atomic.Int64 insertCount *atomic.Int64 lastDeltaTimestamp *atomic.Uint64 fields *typeutil.ConcurrentMap[int64, *FieldInfo] fieldIndexes *typeutil.ConcurrentMap[int64, *IndexedFieldInfo] // indexID -> IndexedFieldInfo warmupDispatcher *AsyncWarmupDispatcher } func NewSegment(ctx context.Context, collection *Collection, segmentType SegmentType, version int64, loadInfo *querypb.SegmentLoadInfo, warmupDispatcher *AsyncWarmupDispatcher, ) (Segment, error) { log := log.Ctx(ctx) /* CStatus NewSegment(CCollection collection, uint64_t segment_id, SegmentType seg_type, CSegmentInterface* newSegment); */ if loadInfo.GetLevel() == datapb.SegmentLevel_L0 { return NewL0Segment(collection, segmentType, version, loadInfo) } base, err := newBaseSegment(collection, segmentType, version, loadInfo) if err != nil { return nil, err } var locker *state.LoadStateLock switch segmentType { case SegmentTypeSealed: locker = state.NewLoadStateLock(state.LoadStateOnlyMeta) case SegmentTypeGrowing: locker = state.NewLoadStateLock(state.LoadStateDataLoaded) default: return nil, fmt.Errorf("illegal segment type %d when create segment %d", segmentType, loadInfo.GetSegmentID()) } logger := log.With( zap.Int64("collectionID", loadInfo.GetCollectionID()), zap.Int64("partitionID", loadInfo.GetPartitionID()), zap.Int64("segmentID", loadInfo.GetSegmentID()), zap.String("segmentType", segmentType.String()), zap.String("level", loadInfo.GetLevel().String()), ) var csegment segcore.CSegment if _, err := GetDynamicPool().Submit(func() (any, error) { var err error csegment, err = segcore.CreateCSegment(&segcore.CreateCSegmentRequest{ Collection: collection.ccollection, SegmentID: loadInfo.GetSegmentID(), SegmentType: segmentType, IsSorted: loadInfo.GetIsSorted(), EnableChunked: paramtable.Get().QueryNodeCfg.MultipleChunkedEnable.GetAsBool(), }) return nil, err }).Await(); err != nil { logger.Warn("create segment failed", zap.Error(err)) return nil, err } log.Info("create segment done") segment := &LocalSegment{ baseSegment: base, ptrLock: locker, ptr: C.CSegmentInterface(csegment.RawPointer()), csegment: csegment, lastDeltaTimestamp: atomic.NewUint64(0), fields: typeutil.NewConcurrentMap[int64, *FieldInfo](), fieldIndexes: typeutil.NewConcurrentMap[int64, *IndexedFieldInfo](), memSize: atomic.NewInt64(-1), rowNum: atomic.NewInt64(-1), insertCount: atomic.NewInt64(0), warmupDispatcher: warmupDispatcher, } if err := segment.initializeSegment(); err != nil { csegment.Release() return nil, err } return segment, nil } func (s *LocalSegment) initializeSegment() error { loadInfo := s.loadInfo.Load() indexedFieldInfos, fieldBinlogs := separateIndexAndBinlog(loadInfo) schemaHelper, _ := typeutil.CreateSchemaHelper(s.collection.Schema()) for _, info := range indexedFieldInfos { fieldID := info.IndexInfo.FieldID field, err := schemaHelper.GetFieldFromID(fieldID) if err != nil { return err } indexInfo := info.IndexInfo s.fieldIndexes.Insert(indexInfo.GetIndexID(), &IndexedFieldInfo{ FieldBinlog: &datapb.FieldBinlog{ FieldID: indexInfo.GetFieldID(), }, IndexInfo: indexInfo, IsLoaded: false, }) if !typeutil.IsVectorType(field.GetDataType()) && !s.HasRawData(fieldID) { s.fields.Insert(fieldID, &FieldInfo{ FieldBinlog: info.FieldBinlog, RowCount: loadInfo.GetNumOfRows(), }) } } for _, binlogs := range fieldBinlogs { s.fields.Insert(binlogs.FieldID, &FieldInfo{ FieldBinlog: binlogs, 
			RowCount:    loadInfo.GetNumOfRows(),
		})
	}

	// Update the insert count when initializing the segment and update the metrics.
	s.insertCount.Store(loadInfo.GetNumOfRows())
	return nil
}

// PinIfNotReleased acquires the `ptrLock` and returns nil only if the segment has not been released.
// Provide ONLY the read lock operations,
// don't make `ptrLock` public to avoid abuse of the mutex.
func (s *LocalSegment) PinIfNotReleased() error {
	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
	}
	return nil
}

func (s *LocalSegment) Unpin() {
	s.ptrLock.Unpin()
}

func (s *LocalSegment) InsertCount() int64 {
	return s.insertCount.Load()
}

func (s *LocalSegment) RowNum() int64 {
	// if the segment is not loaded, return 0 (it may not be loaded yet, or has been released by lru)
	if !s.ptrLock.PinIf(state.IsDataLoaded) {
		return 0
	}
	defer s.ptrLock.Unpin()

	rowNum := s.rowNum.Load()
	if rowNum < 0 {
		GetDynamicPool().Submit(func() (any, error) {
			rowNum = s.csegment.RowNum()
			s.rowNum.Store(rowNum)
			return nil, nil
		}).Await()
	}
	return rowNum
}

func (s *LocalSegment) MemSize() int64 {
	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return 0
	}
	defer s.ptrLock.Unpin()

	memSize := s.memSize.Load()
	if memSize < 0 {
		GetDynamicPool().Submit(func() (any, error) {
			memSize = s.csegment.MemSize()
			s.memSize.Store(memSize)
			return nil, nil
		}).Await()
	}
	return memSize
}

func (s *LocalSegment) LastDeltaTimestamp() uint64 {
	return s.lastDeltaTimestamp.Load()
}

func (s *LocalSegment) GetIndexByID(indexID int64) *IndexedFieldInfo {
	info, _ := s.fieldIndexes.Get(indexID)
	return info
}

func (s *LocalSegment) GetIndex(fieldID int64) []*IndexedFieldInfo {
	var info []*IndexedFieldInfo
	s.fieldIndexes.Range(func(key int64, value *IndexedFieldInfo) bool {
		if value.IndexInfo.FieldID == fieldID {
			info = append(info, value)
		}
		return true
	})
	return info
}

func (s *LocalSegment) ExistIndex(fieldID int64) bool {
	contain := false
	s.fieldIndexes.Range(func(key int64, value *IndexedFieldInfo) bool {
		if value.IndexInfo.FieldID == fieldID {
			contain = true
		}
		return !contain
	})
	return contain
}

func (s *LocalSegment) HasRawData(fieldID int64) bool {
	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return false
	}
	defer s.ptrLock.Unpin()
	return s.csegment.HasRawData(fieldID)
}

func (s *LocalSegment) Indexes() []*IndexedFieldInfo {
	var result []*IndexedFieldInfo
	s.fieldIndexes.Range(func(key int64, value *IndexedFieldInfo) bool {
		result = append(result, value)
		return true
	})
	return result
}

func (s *LocalSegment) ResetIndexesLazyLoad(lazyState bool) {
	for _, indexInfo := range s.Indexes() {
		indexInfo.IsLoaded = lazyState
	}
}

func (s *LocalSegment) Search(ctx context.Context, searchReq *segcore.SearchRequest) (*segcore.SearchResult, error) {
	log := log.Ctx(ctx).WithLazy(
		zap.Int64("collectionID", s.Collection()),
		zap.Int64("segmentID", s.ID()),
		zap.String("segmentType", s.segmentType.String()),
	)
	if !s.ptrLock.PinIf(state.IsNotReleased) {
		// TODO: check if the segment is readable but not released. too much related logic needs to be refactored.
return nil, merr.WrapErrSegmentNotLoaded(s.ID(), "segment released") } defer s.ptrLock.Unpin() hasIndex := s.ExistIndex(searchReq.SearchFieldID()) log = log.With(zap.Bool("withIndex", hasIndex)) log.Debug("search segment...") tr := timerecord.NewTimeRecorder("cgoSearch") result, err := s.csegment.Search(ctx, searchReq) if err != nil { log.Warn("Search failed") return nil, err } metrics.QueryNodeSQSegmentLatencyInCore.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), metrics.SearchLabel).Observe(float64(tr.ElapseSpan().Milliseconds())) log.Debug("search segment done") return result, nil } func (s *LocalSegment) retrieve(ctx context.Context, plan *segcore.RetrievePlan, log *zap.Logger) (*segcore.RetrieveResult, error) { if !s.ptrLock.PinIf(state.IsNotReleased) { // TODO: check if the segment is readable but not released. too many related logic need to be refactor. return nil, merr.WrapErrSegmentNotLoaded(s.ID(), "segment released") } defer s.ptrLock.Unpin() log.Debug("begin to retrieve") tr := timerecord.NewTimeRecorder("cgoRetrieve") result, err := s.csegment.Retrieve(ctx, plan) if err != nil { log.Warn("Retrieve failed") return nil, err } metrics.QueryNodeSQSegmentLatencyInCore.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), metrics.QueryLabel).Observe(float64(tr.ElapseSpan().Milliseconds())) return result, nil } func (s *LocalSegment) Retrieve(ctx context.Context, plan *segcore.RetrievePlan) (*segcorepb.RetrieveResults, error) { log := log.Ctx(ctx).WithLazy( zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID()), zap.Int64("msgID", plan.MsgID()), zap.String("segmentType", s.segmentType.String()), ) result, err := s.retrieve(ctx, plan, log) if err != nil { return nil, err } defer result.Release() _, span := otel.Tracer(typeutil.QueryNodeRole).Start(ctx, "partial-segcore-results-deserialization") defer span.End() retrieveResult, err := result.GetResult() if err != nil { log.Warn("unmarshal retrieve result failed", zap.Error(err)) return nil, err } log.Debug("retrieve segment done", zap.Int("resultNum", len(retrieveResult.Offset))) return retrieveResult, nil } func (s *LocalSegment) retrieveByOffsets(ctx context.Context, plan *segcore.RetrievePlanWithOffsets, log *zap.Logger) (*segcore.RetrieveResult, error) { if !s.ptrLock.PinIf(state.IsNotReleased) { // TODO: check if the segment is readable but not released. too many related logic need to be refactor. 
return nil, merr.WrapErrSegmentNotLoaded(s.ID(), "segment released") } defer s.ptrLock.Unpin() log.Debug("begin to retrieve by offsets") tr := timerecord.NewTimeRecorder("cgoRetrieveByOffsets") result, err := s.csegment.RetrieveByOffsets(ctx, plan) if err != nil { log.Warn("RetrieveByOffsets failed") return nil, err } metrics.QueryNodeSQSegmentLatencyInCore.WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), metrics.QueryLabel).Observe(float64(tr.ElapseSpan().Milliseconds())) return result, nil } func (s *LocalSegment) RetrieveByOffsets(ctx context.Context, plan *segcore.RetrievePlanWithOffsets) (*segcorepb.RetrieveResults, error) { log := log.Ctx(ctx).WithLazy(zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID()), zap.Int64("msgID", plan.MsgID()), zap.String("segmentType", s.segmentType.String()), zap.Int("resultNum", len(plan.Offsets)), ) result, err := s.retrieveByOffsets(ctx, plan, log) if err != nil { return nil, err } defer result.Release() _, span := otel.Tracer(typeutil.QueryNodeRole).Start(ctx, "reduced-segcore-results-deserialization") defer span.End() retrieveResult, err := result.GetResult() if err != nil { log.Warn("unmarshal retrieve by offsets result failed", zap.Error(err)) return nil, err } log.Debug("retrieve by segment offsets done", zap.Int("resultNum", len(retrieveResult.Offset))) return retrieveResult, nil } func (s *LocalSegment) GetFieldDataPath(index *IndexedFieldInfo, offset int64) (dataPath string, offsetInBinlog int64) { offsetInBinlog = offset for _, binlog := range index.FieldBinlog.Binlogs { if offsetInBinlog < binlog.EntriesNum { dataPath = binlog.GetLogPath() break } else { offsetInBinlog -= binlog.EntriesNum } } return dataPath, offsetInBinlog } func (s *LocalSegment) Insert(ctx context.Context, rowIDs []int64, timestamps []typeutil.Timestamp, record *segcorepb.InsertRecord) error { if s.Type() != SegmentTypeGrowing { return fmt.Errorf("unexpected segmentType when segmentInsert, segmentType = %s", s.segmentType.String()) } if !s.ptrLock.PinIf(state.IsNotReleased) { return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released") } defer s.ptrLock.Unpin() var result *segcore.InsertResult var err error GetDynamicPool().Submit(func() (any, error) { start := time.Now() defer func() { metrics.QueryNodeCGOCallLatency.WithLabelValues( fmt.Sprint(paramtable.GetNodeID()), "Insert", "Sync", ).Observe(float64(time.Since(start).Milliseconds())) }() result, err = s.csegment.Insert(ctx, &segcore.InsertRequest{ RowIDs: rowIDs, Timestamps: timestamps, Record: record, }) return nil, nil }).Await() if err != nil { return err } s.insertCount.Add(result.InsertedRows) s.rowNum.Store(-1) s.memSize.Store(-1) return nil } func (s *LocalSegment) Delete(ctx context.Context, primaryKeys storage.PrimaryKeys, timestamps []typeutil.Timestamp) error { /* CStatus Delete(CSegmentInterface c_segment, long int reserved_offset, long size, const long* primary_keys, const unsigned long* timestamps); */ if primaryKeys.Len() == 0 { return nil } if !s.ptrLock.PinIf(state.IsNotReleased) { return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released") } defer s.ptrLock.Unpin() var err error GetDynamicPool().Submit(func() (any, error) { start := time.Now() defer func() { metrics.QueryNodeCGOCallLatency.WithLabelValues( fmt.Sprint(paramtable.GetNodeID()), "Delete", "Sync", ).Observe(float64(time.Since(start).Milliseconds())) }() _, err = s.csegment.Delete(ctx, &segcore.DeleteRequest{ PrimaryKeys: primaryKeys, Timestamps: timestamps, }) 
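			// NOTE: err is captured from the enclosing scope; the closure always reports success to the
			// pool, and the captured err is examined only after Await() returns.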
		return nil, nil
	}).Await()
	if err != nil {
		return err
	}

	s.rowNum.Store(-1)
	s.lastDeltaTimestamp.Store(timestamps[len(timestamps)-1])
	return nil
}

// -------------------------------------------------------------------------------------- interfaces for sealed segment
func (s *LocalSegment) LoadMultiFieldData(ctx context.Context) error {
	loadInfo := s.loadInfo.Load()
	rowCount := loadInfo.GetNumOfRows()
	fields := loadInfo.GetBinlogPaths()

	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
	}
	defer s.ptrLock.Unpin()

	log := log.Ctx(ctx).With(
		zap.Int64("collectionID", s.Collection()),
		zap.Int64("partitionID", s.Partition()),
		zap.Int64("segmentID", s.ID()),
	)

	req := &segcore.LoadFieldDataRequest{
		MMapDir:  paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue(),
		RowCount: rowCount,
	}
	for _, field := range fields {
		req.Fields = append(req.Fields, segcore.LoadFieldDataInfo{
			Field: field,
		})
	}

	var err error
	GetLoadPool().Submit(func() (any, error) {
		start := time.Now()
		defer func() {
			metrics.QueryNodeCGOCallLatency.WithLabelValues(
				fmt.Sprint(paramtable.GetNodeID()),
				"LoadFieldData",
				"Sync",
			).Observe(float64(time.Since(start).Milliseconds()))
		}()
		_, err = s.csegment.LoadFieldData(ctx, req)
		return nil, nil
	}).Await()
	if err != nil {
		log.Warn("LoadMultiFieldData failed", zap.Error(err))
		return err
	}

	log.Info("load multi field done", zap.Int64("row count", rowCount), zap.Int64("segmentID", s.ID()))
	return nil
}

func (s *LocalSegment) LoadFieldData(ctx context.Context, fieldID int64, rowCount int64, field *datapb.FieldBinlog) error {
	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
	}
	defer s.ptrLock.Unpin()

	ctx, sp := otel.Tracer(typeutil.QueryNodeRole).Start(ctx, fmt.Sprintf("LoadFieldData-%d-%d", s.ID(), fieldID))
	defer sp.End()

	log := log.Ctx(ctx).With(
		zap.Int64("collectionID", s.Collection()),
		zap.Int64("partitionID", s.Partition()),
		zap.Int64("segmentID", s.ID()),
		zap.Int64("fieldID", fieldID),
		zap.Int64("rowCount", rowCount),
	)
	log.Info("start loading field data for field")

	// TODO retrieve_enable should be considered
	collection := s.collection
	fieldSchema, err := getFieldSchema(collection.Schema(), fieldID)
	if err != nil {
		return err
	}
	mmapEnabled := isDataMmapEnable(fieldSchema)

	req := &segcore.LoadFieldDataRequest{
		MMapDir: paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue(),
		Fields: []segcore.LoadFieldDataInfo{{
			Field:      field,
			EnableMMap: mmapEnabled,
		}},
		RowCount: rowCount,
	}

	GetLoadPool().Submit(func() (any, error) {
		start := time.Now()
		defer func() {
			metrics.QueryNodeCGOCallLatency.WithLabelValues(
				fmt.Sprint(paramtable.GetNodeID()),
				"LoadFieldData",
				"Sync",
			).Observe(float64(time.Since(start).Milliseconds()))
		}()
		_, err = s.csegment.LoadFieldData(ctx, req)
		log.Info("submitted loadFieldData task to load pool")
		return nil, nil
	}).Await()
	if err != nil {
		log.Warn("LoadFieldData failed", zap.Error(err))
		return err
	}

	log.Info("load field done")
	return nil
}

func (s *LocalSegment) AddFieldDataInfo(ctx context.Context, rowCount int64, fields []*datapb.FieldBinlog) error {
	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
	}
	defer s.ptrLock.Unpin()

	log := log.Ctx(ctx).WithLazy(
		zap.Int64("collectionID", s.Collection()),
		zap.Int64("partitionID", s.Partition()),
		zap.Int64("segmentID", s.ID()),
		zap.Int64("row count", rowCount),
	)

	req := &segcore.AddFieldDataInfoRequest{
		Fields: make([]segcore.LoadFieldDataInfo, 0, len(fields)),
RowCount: rowCount, } for _, field := range fields { req.Fields = append(req.Fields, segcore.LoadFieldDataInfo{ Field: field, }) } var err error GetLoadPool().Submit(func() (any, error) { _, err = s.csegment.AddFieldDataInfo(ctx, req) return nil, nil }).Await() if err != nil { log.Warn("AddFieldDataInfo failed", zap.Error(err)) return err } log.Info("add field data info done") return nil } func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error { pks, tss := deltaData.DeletePks(), deltaData.DeleteTimestamps() rowNum := deltaData.DeleteRowCount() if !s.ptrLock.PinIf(state.IsNotReleased) { return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released") } defer s.ptrLock.Unpin() log := log.Ctx(ctx).With( zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID()), ) ids, err := storage.ParsePrimaryKeysBatch2IDs(pks) if err != nil { return err } idsBlob, err := proto.Marshal(ids) if err != nil { return err } loadInfo := C.CLoadDeletedRecordInfo{ timestamps: unsafe.Pointer(&tss[0]), primary_keys: (*C.uint8_t)(unsafe.Pointer(&idsBlob[0])), primary_keys_size: C.uint64_t(len(idsBlob)), row_count: C.int64_t(rowNum), } /* CStatus LoadDeletedRecord(CSegmentInterface c_segment, CLoadDeletedRecordInfo deleted_record_info) */ var status C.CStatus GetDynamicPool().Submit(func() (any, error) { start := time.Now() defer func() { metrics.QueryNodeCGOCallLatency.WithLabelValues( fmt.Sprint(paramtable.GetNodeID()), "LoadDeletedRecord", "Sync", ).Observe(float64(time.Since(start).Milliseconds())) }() status = C.LoadDeletedRecord(s.ptr, loadInfo) return nil, nil }).Await() if err := HandleCStatus(ctx, &status, "LoadDeletedRecord failed", zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID())); err != nil { return err } s.rowNum.Store(-1) s.lastDeltaTimestamp.Store(tss[len(tss)-1]) log.Info("load deleted record done", zap.Int64("rowNum", rowNum), zap.String("segmentType", s.Type().String())) return nil } func GetCLoadInfoWithFunc(ctx context.Context, fieldSchema *schemapb.FieldSchema, s *querypb.SegmentLoadInfo, indexInfo *querypb.FieldIndexInfo, f func(c *LoadIndexInfo) error, ) error { // 1. 
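	// Create a segcore LoadIndexInfo holder for this load; it is released by the deferred
	// deleteLoadIndexInfo call once f has run.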
loadIndexInfo, err := newLoadIndexInfo(ctx) if err != nil { return err } defer deleteLoadIndexInfo(loadIndexInfo) indexParams := funcutil.KeyValuePair2Map(indexInfo.IndexParams) // as Knowhere reports error if encounter an unknown param, we need to delete it delete(indexParams, common.MmapEnabledKey) // some build params also exist in indexParams, which are useless during loading process if vecindexmgr.GetVecIndexMgrInstance().IsDiskANN(indexParams["index_type"]) { if err := indexparams.SetDiskIndexLoadParams(paramtable.Get(), indexParams, indexInfo.GetNumRows()); err != nil { return err } } // set whether enable offset cache for bitmap index if indexParams["index_type"] == indexparamcheck.IndexBitmap { indexparams.SetBitmapIndexLoadParams(paramtable.Get(), indexParams) } if err := indexparams.AppendPrepareLoadParams(paramtable.Get(), indexParams); err != nil { return err } enableMmap := isIndexMmapEnable(fieldSchema, indexInfo) indexInfoProto := &cgopb.LoadIndexInfo{ CollectionID: s.GetCollectionID(), PartitionID: s.GetPartitionID(), SegmentID: s.GetSegmentID(), Field: fieldSchema, EnableMmap: enableMmap, MmapDirPath: paramtable.Get().QueryNodeCfg.MmapDirPath.GetValue(), IndexID: indexInfo.GetIndexID(), IndexBuildID: indexInfo.GetBuildID(), IndexVersion: indexInfo.GetIndexVersion(), IndexParams: indexParams, IndexFiles: indexInfo.GetIndexFilePaths(), IndexEngineVersion: indexInfo.GetCurrentIndexVersion(), IndexStoreVersion: indexInfo.GetIndexStoreVersion(), IndexFileSize: indexInfo.GetIndexSize(), } // 2. if err := loadIndexInfo.appendLoadIndexInfo(ctx, indexInfoProto); err != nil { log.Warn("fail to append load index info", zap.Error(err)) return err } return f(loadIndexInfo) } func (s *LocalSegment) LoadIndex(ctx context.Context, indexInfo *querypb.FieldIndexInfo, fieldType schemapb.DataType) error { log := log.Ctx(ctx).With( zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID()), zap.Int64("fieldID", indexInfo.GetFieldID()), zap.Int64("indexID", indexInfo.GetIndexID()), ) old := s.GetIndexByID(indexInfo.GetIndexID()) // the index loaded if old != nil && old.IsLoaded { log.Warn("index already loaded") return nil } ctx, sp := otel.Tracer(typeutil.QueryNodeRole).Start(ctx, fmt.Sprintf("LoadIndex-%d-%d", s.ID(), indexInfo.GetFieldID())) defer sp.End() tr := timerecord.NewTimeRecorder("loadIndex") schemaHelper, err := typeutil.CreateSchemaHelper(s.GetCollection().Schema()) if err != nil { return err } fieldSchema, err := schemaHelper.GetFieldFromID(indexInfo.GetFieldID()) if err != nil { return err } // // if segment is pk sorted, user created indexes bring no performance gain but extra memory usage if s.IsSorted() && fieldSchema.GetIsPrimaryKey() { log.Info("skip loading index for pk field in sorted segment") // set field index, preventing repeated loading index task s.fieldIndexes.Insert(indexInfo.GetFieldID(), &IndexedFieldInfo{ FieldBinlog: &datapb.FieldBinlog{ FieldID: indexInfo.GetFieldID(), }, IndexInfo: indexInfo, IsLoaded: true, }) return nil } return s.innerLoadIndex(ctx, fieldSchema, indexInfo, tr, fieldType) } func (s *LocalSegment) innerLoadIndex(ctx context.Context, fieldSchema *schemapb.FieldSchema, indexInfo *querypb.FieldIndexInfo, tr *timerecord.TimeRecorder, fieldType schemapb.DataType, ) error { err := GetCLoadInfoWithFunc(ctx, fieldSchema, s.LoadInfo(), indexInfo, func(loadIndexInfo *LoadIndexInfo) error { newLoadIndexInfoSpan := tr.RecordSpan() if err := loadIndexInfo.loadIndex(ctx); err != nil { if 
loadIndexInfo.cleanLocalData(ctx) != nil {
				log.Warn("failed to clean cached data on disk after append index failed",
					zap.Int64("buildID", indexInfo.BuildID),
					zap.Int64("index version", indexInfo.IndexVersion))
			}
			return err
		}
		if s.Type() != SegmentTypeSealed {
			errMsg := fmt.Sprintln("updateSegmentIndex failed, illegal segment type ", s.segmentType, "segmentID = ", s.ID())
			return errors.New(errMsg)
		}
		appendLoadIndexInfoSpan := tr.RecordSpan()

		// 3.
		err := s.UpdateIndexInfo(ctx, indexInfo, loadIndexInfo)
		if err != nil {
			return err
		}
		updateIndexInfoSpan := tr.RecordSpan()

		// Skip warming up the chunk cache when:
		//  - the field is scalar data
		//  - the index already holds the raw data
		//  - the vector field is a BM25 function output
		if !typeutil.IsVectorType(fieldType) || s.HasRawData(indexInfo.GetFieldID()) {
			return nil
		}
		metricType, err := funcutil.GetAttrByKeyFromRepeatedKV(common.MetricTypeKey, indexInfo.IndexParams)
		if err != nil {
			return fmt.Errorf("metric type does not exist in index params")
		}
		if metricType == metric.BM25 {
			return nil
		}

		// 4.
		mmapChunkCache := paramtable.Get().QueryNodeCfg.MmapChunkCache.GetAsBool()
		s.WarmupChunkCache(ctx, indexInfo.GetFieldID(), mmapChunkCache)
		warmupChunkCacheSpan := tr.RecordSpan()
		log.Info("Finish loading index",
			zap.Duration("newLoadIndexInfoSpan", newLoadIndexInfoSpan),
			zap.Duration("appendLoadIndexInfoSpan", appendLoadIndexInfoSpan),
			zap.Duration("updateIndexInfoSpan", updateIndexInfoSpan),
			zap.Duration("warmupChunkCacheSpan", warmupChunkCacheSpan),
		)
		return nil
	})
	if err != nil {
		log.Warn("load index failed", zap.Error(err))
	}
	return err
}

func (s *LocalSegment) LoadTextIndex(ctx context.Context, textLogs *datapb.TextIndexStats, schemaHelper *typeutil.SchemaHelper) error {
	log.Ctx(ctx).Info("load text index", zap.Int64("field id", textLogs.GetFieldID()), zap.Any("text logs", textLogs))

	f, err := schemaHelper.GetFieldFromID(textLogs.GetFieldID())
	if err != nil {
		return err
	}

	cgoProto := &indexcgopb.LoadTextIndexInfo{
		FieldID:      textLogs.GetFieldID(),
		Version:      textLogs.GetVersion(),
		BuildID:      textLogs.GetBuildID(),
		Files:        textLogs.GetFiles(),
		Schema:       f,
		CollectionID: s.Collection(),
		PartitionID:  s.Partition(),
	}

	marshaled, err := proto.Marshal(cgoProto)
	if err != nil {
		return err
	}

	var status C.CStatus
	_, _ = GetLoadPool().Submit(func() (any, error) {
		status = C.LoadTextIndex(s.ptr, (*C.uint8_t)(unsafe.Pointer(&marshaled[0])), (C.uint64_t)(len(marshaled)))
		return nil, nil
	}).Await()

	return HandleCStatus(ctx, &status, "LoadTextIndex failed")
}

func (s *LocalSegment) UpdateIndexInfo(ctx context.Context, indexInfo *querypb.FieldIndexInfo, info *LoadIndexInfo) error {
	log := log.Ctx(ctx).With(
		zap.Int64("collectionID", s.Collection()),
		zap.Int64("partitionID", s.Partition()),
		zap.Int64("segmentID", s.ID()),
		zap.Int64("fieldID", indexInfo.FieldID),
	)

	if !s.ptrLock.PinIf(state.IsNotReleased) {
		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
	}
	defer s.ptrLock.Unpin()

	var status C.CStatus
	GetDynamicPool().Submit(func() (any, error) {
		status = C.UpdateSealedSegmentIndex(s.ptr, info.cLoadIndexInfo)
		return nil, nil
	}).Await()

	if err := HandleCStatus(ctx, &status, "UpdateSealedSegmentIndex failed",
		zap.Int64("collectionID", s.Collection()),
		zap.Int64("partitionID", s.Partition()),
		zap.Int64("segmentID", s.ID()),
		zap.Int64("fieldID", indexInfo.FieldID)); err != nil {
		return err
	}

	s.fieldIndexes.Insert(indexInfo.GetIndexID(), &IndexedFieldInfo{
		FieldBinlog: &datapb.FieldBinlog{
			FieldID: indexInfo.GetFieldID(),
		},
		IndexInfo: indexInfo,
		IsLoaded:  true,
	})
	log.Info("updateSegmentIndex done")
	return nil
}
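
// WarmupChunkCache warms up the chunk cache for the given field. The behavior follows the
// queryNode chunk-cache warming-up config: "sync" performs the warmup inline on the warmup
// pool, "async" queues a task on the warmup dispatcher that waits until the segment data is
// loaded, and any other value skips warming up entirely.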
func (s *LocalSegment) WarmupChunkCache(ctx context.Context, fieldID int64, mmapEnabled bool) { log := log.Ctx(ctx).With( zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID()), zap.Int64("fieldID", fieldID), zap.Bool("mmapEnabled", mmapEnabled), ) if !s.ptrLock.PinIf(state.IsNotReleased) { return } defer s.ptrLock.Unpin() var status C.CStatus warmingUp := strings.ToLower(paramtable.Get().QueryNodeCfg.ChunkCacheWarmingUp.GetValue()) switch warmingUp { case "sync": GetWarmupPool().Submit(func() (any, error) { cFieldID := C.int64_t(fieldID) cMmapEnabled := C.bool(mmapEnabled) status = C.WarmupChunkCache(s.ptr, cFieldID, cMmapEnabled) if err := HandleCStatus(ctx, &status, "warming up chunk cache failed"); err != nil { log.Warn("warming up chunk cache synchronously failed", zap.Error(err)) return nil, err } log.Info("warming up chunk cache synchronously done") return nil, nil }).Await() case "async": task := func() (any, error) { // failed to wait for state update, return directly if !s.ptrLock.BlockUntilDataLoadedOrReleased() { return nil, nil } if s.PinIfNotReleased() != nil { return nil, nil } defer s.Unpin() cFieldID := C.int64_t(fieldID) cMmapEnabled := C.bool(mmapEnabled) status = C.WarmupChunkCache(s.ptr, cFieldID, cMmapEnabled) if err := HandleCStatus(ctx, &status, ""); err != nil { log.Warn("warming up chunk cache asynchronously failed", zap.Error(err)) return nil, err } log.Info("warming up chunk cache asynchronously done") return nil, nil } s.warmupDispatcher.AddTask(task) default: // no warming up } } func (s *LocalSegment) UpdateFieldRawDataSize(ctx context.Context, numRows int64, fieldBinlog *datapb.FieldBinlog) error { var status C.CStatus fieldID := fieldBinlog.FieldID fieldDataSize := int64(0) for _, binlog := range fieldBinlog.GetBinlogs() { fieldDataSize += binlog.GetMemorySize() } GetDynamicPool().Submit(func() (any, error) { status = C.UpdateFieldRawDataSize(s.ptr, C.int64_t(fieldID), C.int64_t(numRows), C.int64_t(fieldDataSize)) return nil, nil }).Await() if err := HandleCStatus(ctx, &status, "updateFieldRawDataSize failed"); err != nil { return err } log.Ctx(ctx).Info("updateFieldRawDataSize done", zap.Int64("segmentID", s.ID())) return nil } func (s *LocalSegment) CreateTextIndex(ctx context.Context, fieldID int64) error { var status C.CStatus log.Ctx(ctx).Info("create text index for segment", zap.Int64("segmentID", s.ID()), zap.Int64("fieldID", fieldID)) GetLoadPool().Submit(func() (any, error) { status = C.CreateTextIndex(s.ptr, C.int64_t(fieldID)) return nil, nil }).Await() if err := HandleCStatus(ctx, &status, "CreateTextIndex failed"); err != nil { return err } log.Ctx(ctx).Info("create text index for segment done", zap.Int64("segmentID", s.ID()), zap.Int64("fieldID", fieldID)) return nil } type ReleaseScope int const ( ReleaseScopeAll ReleaseScope = iota ReleaseScopeData ) type releaseOptions struct { Scope ReleaseScope } func newReleaseOptions() *releaseOptions { return &releaseOptions{ Scope: ReleaseScopeAll, } } type releaseOption func(*releaseOptions) func WithReleaseScope(scope ReleaseScope) releaseOption { return func(options *releaseOptions) { options.Scope = scope } } func (s *LocalSegment) Release(ctx context.Context, opts ...releaseOption) { options := newReleaseOptions() for _, opt := range opts { opt(options) } stateLockGuard := s.startRelease(options.Scope) if stateLockGuard == nil { // release is already done. 
return } // release will never fail defer stateLockGuard.Done(nil) log := log.Ctx(ctx).With(zap.Int64("collectionID", s.Collection()), zap.Int64("partitionID", s.Partition()), zap.Int64("segmentID", s.ID()), zap.String("segmentType", s.segmentType.String()), zap.Int64("insertCount", s.InsertCount()), ) // wait all read ops finished ptr := s.ptr if options.Scope == ReleaseScopeData { s.ReleaseSegmentData() log.Info("release segment data done and the field indexes info has been set lazy load=true") return } GetDynamicPool().Submit(func() (any, error) { C.DeleteSegment(ptr) return nil, nil }).Await() log.Info("delete segment from memory") } // ReleaseSegmentData releases the segment data. func (s *LocalSegment) ReleaseSegmentData() { GetDynamicPool().Submit(func() (any, error) { C.ClearSegmentData(s.ptr) return nil, nil }).Await() for _, indexInfo := range s.Indexes() { indexInfo.IsLoaded = false } } // StartLoadData starts the loading process of the segment. func (s *LocalSegment) StartLoadData() (state.LoadStateLockGuard, error) { return s.ptrLock.StartLoadData() } // startRelease starts the releasing process of the segment. func (s *LocalSegment) startRelease(scope ReleaseScope) state.LoadStateLockGuard { switch scope { case ReleaseScopeData: return s.ptrLock.StartReleaseData() case ReleaseScopeAll: return s.ptrLock.StartReleaseAll() default: panic(fmt.Sprintf("unexpected release scope %d", scope)) } } func (s *LocalSegment) RemoveFieldFile(fieldId int64) { GetDynamicPool().Submit(func() (any, error) { C.RemoveFieldFile(s.ptr, C.int64_t(fieldId)) return nil, nil }).Await() } func (s *LocalSegment) RemoveUnusedFieldFiles() error { schema := s.collection.Schema() indexInfos, _ := separateIndexAndBinlog(s.LoadInfo()) for _, indexInfo := range indexInfos { need, err := s.indexNeedLoadRawData(schema, indexInfo) if err != nil { return err } if !need { s.RemoveFieldFile(indexInfo.IndexInfo.FieldID) } } return nil } func (s *LocalSegment) indexNeedLoadRawData(schema *schemapb.CollectionSchema, indexInfo *IndexedFieldInfo) (bool, error) { schemaHelper, err := typeutil.CreateSchemaHelper(schema) if err != nil { return false, err } fieldSchema, err := schemaHelper.GetFieldFromID(indexInfo.IndexInfo.FieldID) if err != nil { return false, err } return !typeutil.IsVectorType(fieldSchema.DataType) && s.HasRawData(indexInfo.IndexInfo.FieldID), nil } type ( WarmupTask = func() (any, error) AsyncWarmupDispatcher struct { mu sync.RWMutex tasks []WarmupTask notify chan struct{} } ) func NewWarmupDispatcher() *AsyncWarmupDispatcher { return &AsyncWarmupDispatcher{ notify: make(chan struct{}, 1), } } func (d *AsyncWarmupDispatcher) AddTask(task func() (any, error)) { d.mu.Lock() d.tasks = append(d.tasks, task) d.mu.Unlock() select { case d.notify <- struct{}{}: default: } } func (d *AsyncWarmupDispatcher) Run(ctx context.Context) { for { select { case <-ctx.Done(): return case <-d.notify: d.mu.RLock() tasks := make([]WarmupTask, len(d.tasks)) copy(tasks, d.tasks) d.mu.RUnlock() for _, task := range tasks { select { case <-ctx.Done(): return default: GetWarmupPool().Submit(task) } } d.mu.Lock() d.tasks = d.tasks[len(tasks):] d.mu.Unlock() } } }
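
// Usage sketch for AsyncWarmupDispatcher (illustrative only; the dispatcher is normally wired
// up by the query node on startup, and the task body below is hypothetical):
//
//	dispatcher := NewWarmupDispatcher()
//	go dispatcher.Run(ctx)
//	dispatcher.AddTask(func() (any, error) {
//	    // e.g. warm up a segment's chunk cache
//	    return nil, nil
//	})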