mirror of https://github.com/milvus-io/milvus.git

Change bloom filter use pk (#10193)

Signed-off-by: godchen <qingxiang.chen@zilliz.com>
Branch: pull/10216/head
parent: a58b0161f0
commit: 8daeb0d519
@@ -550,6 +550,10 @@ func (ibNode *insertBufferNode) bufferInsertMsg(msg *msgstream.InsertMsg, endPos
            fieldData.NumRows = append(fieldData.NumRows, int64(len(msg.RowData)))
        }
        if field.IsPrimaryKey {
            // update segment pk filter
            ibNode.replica.updateSegmentPKRange(currentSegID, fieldData.Data)
        }

    case schemapb.DataType_Float:
        if _, ok := idata.Data[field.FieldID]; !ok {

@@ -598,8 +602,6 @@ func (ibNode *insertBufferNode) bufferInsertMsg(msg *msgstream.InsertMsg, endPos
    // store current endPositions as Segment->EndPostion
    ibNode.replica.updateSegmentEndPosition(currentSegID, endPos)

    // update segment pk filter
    ibNode.replica.updateSegmentPKRange(currentSegID, msg.GetRowIDs())
    return nil
}
@@ -263,12 +263,13 @@ func (mf *MetaFactory) CollectionMetaFactory(collectionID UniqueID, collectionNa
            IndexParams: []*commonpb.KeyValuePair{},
        },
        {
            FieldID: 106,
            Name: "int64_field",
            Description: "field 106",
            DataType: schemapb.DataType_Int64,
            TypeParams: []*commonpb.KeyValuePair{},
            IndexParams: []*commonpb.KeyValuePair{},
            FieldID: 106,
            Name: "int64_field",
            Description: "field 106",
            DataType: schemapb.DataType_Int64,
            TypeParams: []*commonpb.KeyValuePair{},
            IndexParams: []*commonpb.KeyValuePair{},
            IsPrimaryKey: true,
        },
        {
            FieldID: 107,
@@ -28,7 +28,6 @@ import (
    "go.uber.org/zap"

    "github.com/bits-and-blooms/bloom/v3"
    "github.com/milvus-io/milvus/internal/common"
    "github.com/milvus-io/milvus/internal/kv"
    miniokv "github.com/milvus-io/milvus/internal/kv/minio"
    "github.com/milvus-io/milvus/internal/log"
@@ -103,16 +102,16 @@ type SegmentReplica struct {
    minIOKV kv.BaseKV
}

func (s *Segment) updatePKRange(rowIDs []int64) {
func (s *Segment) updatePKRange(pks []int64) {
    buf := make([]byte, 8)
    for _, rowID := range rowIDs {
        binary.BigEndian.PutUint64(buf, uint64(rowID))
    for _, pk := range pks {
        binary.BigEndian.PutUint64(buf, uint64(pk))
        s.pkFilter.Add(buf)
        if rowID > s.maxPK {
            s.maxPK = rowID
        if pk > s.maxPK {
            s.maxPK = pk
        }
        if rowID < s.minPK {
            s.minPK = rowID
        if pk < s.minPK {
            s.minPK = pk
        }
    }
}
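For context, the hunk above is the core of the DataNode-side change: each segment now keeps a bloom filter plus a min/max range over its primary keys instead of over row IDs. Below is a minimal, self-contained sketch of that bookkeeping, assuming the bits-and-blooms/bloom/v3 package used in the diff; the segment struct and the sizing constants are simplified stand-ins, not the actual Milvus types.

package main

import (
    "encoding/binary"
    "fmt"
    "math"

    "github.com/bits-and-blooms/bloom/v3"
)

// segment is a simplified stand-in for the segment type in the diff:
// it tracks a bloom filter and a min/max range over inserted primary keys.
type segment struct {
    pkFilter *bloom.BloomFilter
    minPK    int64 // math.MaxInt64 means "no value yet"
    maxPK    int64 // math.MinInt64 means "no value yet"
}

func newSegment() *segment {
    return &segment{
        // 100000 / 0.005 are illustrative sizing values, not the ones Milvus uses.
        pkFilter: bloom.NewWithEstimates(100000, 0.005),
        minPK:    math.MaxInt64,
        maxPK:    math.MinInt64,
    }
}

// updatePKRange mirrors Segment.updatePKRange in the hunk above: every primary key
// is encoded big-endian and added to the filter, and the min/max range is widened.
func (s *segment) updatePKRange(pks []int64) {
    buf := make([]byte, 8)
    for _, pk := range pks {
        binary.BigEndian.PutUint64(buf, uint64(pk))
        s.pkFilter.Add(buf)
        if pk > s.maxPK {
            s.maxPK = pk
        }
        if pk < s.minPK {
            s.minPK = pk
        }
    }
}

func main() {
    s := newSegment()
    s.updatePKRange([]int64{3, 7, 42})

    buf := make([]byte, 8)
    binary.BigEndian.PutUint64(buf, uint64(7))
    fmt.Println(s.pkFilter.Test(buf), s.minPK, s.maxPK) // true 3 42
}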
@@ -288,9 +287,6 @@ func (replica *SegmentReplica) filterSegments(channelName string, partitionID Un
// addNormalSegment adds a *NotNew* and *NotFlushed* segment. Before add, please make sure there's no
// such segment by `hasSegment`
func (replica *SegmentReplica) addNormalSegment(segID, collID, partitionID UniqueID, channelName string, numOfRows int64, cp *segmentCheckPoint) error {
    replica.segMu.Lock()
    defer replica.segMu.Unlock()

    if collID != replica.collectionID {
        log.Warn("Mismatch collection",
            zap.Int64("input ID", collID),
@@ -319,47 +315,24 @@ func (replica *SegmentReplica) addNormalSegment(segID, collID, partitionID Uniqu
        minPK: math.MaxInt64, // use max value, represents no value
        maxPK: math.MinInt64, // use min value represents no value
    }

    p := path.Join(Params.StatsBinlogRootPath, JoinIDPath(collID, partitionID, segID, common.RowIDField))
    keys, values, err := replica.minIOKV.LoadWithPrefix(p)
    err := replica.initPKBloomFilter(seg)
    if err != nil {
        return err
    }
    blobs := make([]*Blob, 0)
    for i := 0; i < len(keys); i++ {
        blobs = append(blobs, &Blob{Key: keys[i], Value: []byte(values[i])})
    }

    stats, err := storage.DeserializeStats(blobs)
    if err != nil {
        return err
    }
    for _, stat := range stats {
        err = seg.pkFilter.Merge(stat.BF)
        if err != nil {
            return err
        }
        if seg.minPK > stat.Min {
            seg.minPK = stat.Min
        }

        if seg.maxPK < stat.Max {
            seg.maxPK = stat.Max
        }
    }

    seg.isNew.Store(false)
    seg.isFlushed.Store(false)

    replica.segMu.Lock()
    replica.normalSegments[segID] = seg
    replica.segMu.Unlock()

    return nil
}

// addFlushedSegment adds a *Flushed* segment. Before add, please make sure there's no
// such segment by `hasSegment`
func (replica *SegmentReplica) addFlushedSegment(segID, collID, partitionID UniqueID, channelName string, numOfRows int64) error {
    replica.segMu.Lock()
    defer replica.segMu.Unlock()

    if collID != replica.collectionID {
        log.Warn("Mismatch collection",
@@ -388,7 +361,36 @@ func (replica *SegmentReplica) addFlushedSegment(segID, collID, partitionID Uniq
        maxPK: math.MinInt64, // use min value represents no value
    }

    p := path.Join(Params.StatsBinlogRootPath, JoinIDPath(collID, partitionID, segID, common.RowIDField))
    err := replica.initPKBloomFilter(seg)
    if err != nil {
        return err
    }

    seg.isNew.Store(false)
    seg.isFlushed.Store(true)

    replica.segMu.Lock()
    replica.flushedSegments[segID] = seg
    replica.segMu.Unlock()

    return nil
}

func (replica *SegmentReplica) initPKBloomFilter(s *Segment) error {
    schema, err := replica.getCollectionSchema(s.collectionID, 0)
    if err != nil {
        return err
    }

    pkField := int64(-1)
    for _, field := range schema.Fields {
        if field.IsPrimaryKey {
            pkField = field.FieldID
            break
        }
    }

    p := path.Join(Params.StatsBinlogRootPath, JoinIDPath(s.collectionID, s.partitionID, s.segmentID, pkField))
    keys, values, err := replica.minIOKV.LoadWithPrefix(p)
    if err != nil {
        return err
@@ -403,23 +405,18 @@ func (replica *SegmentReplica) addFlushedSegment(segID, collID, partitionID Uniq
        return err
    }
    for _, stat := range stats {
        err = seg.pkFilter.Merge(stat.BF)
        err = s.pkFilter.Merge(stat.BF)
        if err != nil {
            return err
        }
        if seg.minPK > stat.Min {
            seg.minPK = stat.Min
        if s.minPK > stat.Min {
            s.minPK = stat.Min
        }

        if seg.maxPK < stat.Max {
            seg.maxPK = stat.Max
        if s.maxPK < stat.Max {
            s.maxPK = stat.Max
        }
    }

    seg.isNew.Store(false)
    seg.isFlushed.Store(true)

    replica.flushedSegments[segID] = seg
    return nil
}
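The hunk above (together with initPKBloomFilter) rebuilds a segment's PK filter from the stats binlogs persisted for the primary key field: each stats entry carries its own bloom filter plus a min/max, and they are folded into the in-memory segment. A minimal sketch of that folding step follows; the stats struct is a simplified stand-in for the deserialized entries in the diff (stat.BF, stat.Min, stat.Max), not the real storage types.

package main

import (
    "fmt"
    "math"

    "github.com/bits-and-blooms/bloom/v3"
)

// pkStats is a simplified stand-in for one deserialized stats-binlog entry:
// a bloom filter over the primary keys it covers, plus their min and max.
type pkStats struct {
    Min, Max int64
    BF       *bloom.BloomFilter
}

// mergeStats folds a list of persisted stats entries into a single segment-level
// filter and PK range, the same way the hunk above folds stat.BF / stat.Min / stat.Max.
func mergeStats(stats []pkStats) (*bloom.BloomFilter, int64, int64, error) {
    // Illustrative sizing values only; all merged filters must share the same parameters.
    segFilter := bloom.NewWithEstimates(100000, 0.005)
    minPK, maxPK := int64(math.MaxInt64), int64(math.MinInt64)

    for _, stat := range stats {
        if err := segFilter.Merge(stat.BF); err != nil {
            return nil, 0, 0, err
        }
        if stat.Min < minPK {
            minPK = stat.Min
        }
        if stat.Max > maxPK {
            maxPK = stat.Max
        }
    }
    return segFilter, minPK, maxPK, nil
}

func main() {
    a := pkStats{Min: 1, Max: 10, BF: bloom.NewWithEstimates(100000, 0.005)}
    b := pkStats{Min: 5, Max: 42, BF: bloom.NewWithEstimates(100000, 0.005)}
    _, minPK, maxPK, err := mergeStats([]pkStats{a, b})
    fmt.Println(minPK, maxPK, err) // 1 42 <nil>
}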
@@ -481,19 +478,19 @@ func (replica *SegmentReplica) updateSegmentEndPosition(segID UniqueID, endPos *
    log.Warn("No match segment", zap.Int64("ID", segID))
}

func (replica *SegmentReplica) updateSegmentPKRange(segID UniqueID, rowIDs []int64) {
func (replica *SegmentReplica) updateSegmentPKRange(segID UniqueID, pks []int64) {
    replica.segMu.Lock()
    defer replica.segMu.Unlock()

    seg, ok := replica.newSegments[segID]
    if ok {
        seg.updatePKRange(rowIDs)
        seg.updatePKRange(pks)
        return
    }

    seg, ok = replica.normalSegments[segID]
    if ok {
        seg.updatePKRange(rowIDs)
        seg.updatePKRange(pks)
        return
    }

@@ -575,9 +572,6 @@ func (replica *SegmentReplica) getCollectionID() UniqueID {
// getCollectionSchema gets collection schema from rootcoord for a certain timestamp.
// If you want the latest collection schema, ts should be 0.
func (replica *SegmentReplica) getCollectionSchema(collID UniqueID, ts Timestamp) (*schemapb.CollectionSchema, error) {
    replica.segMu.Lock()
    defer replica.segMu.Unlock()

    if !replica.validCollection(collID) {
        log.Warn("Mismatch collection for the replica",
            zap.Int64("Want", replica.collectionID),
@@ -561,99 +561,100 @@ func TestSegmentReplica_InterfaceMethod(te *testing.T) {
    assert.NotNil(to, err)
    })

    te.Run("Test inner function segment", func(t *testing.T) {
    collID := UniqueID(1)
    replica, err := newReplica(context.Background(), rc, collID)
    assert.Nil(t, err)
    replica.minIOKV = &mockMinioKV{}
    assert.False(t, replica.hasSegment(0, true))
    assert.False(t, replica.hasSegment(0, false))
}
func TestInnerFunctionSegment(t *testing.T) {
    rc := &RootCoordFactory{}
    collID := UniqueID(1)
    replica, err := newReplica(context.Background(), rc, collID)
    assert.Nil(t, err)
    replica.minIOKV = &mockMinioKV{}
    assert.False(t, replica.hasSegment(0, true))
    assert.False(t, replica.hasSegment(0, false))

    startPos := &internalpb.MsgPosition{ChannelName: "insert-01", Timestamp: Timestamp(100)}
    endPos := &internalpb.MsgPosition{ChannelName: "insert-01", Timestamp: Timestamp(200)}
    err = replica.addNewSegment(0, 1, 2, "insert-01", startPos, endPos)
    assert.NoError(t, err)
    assert.True(t, replica.hasSegment(0, true))
    assert.Equal(t, 1, len(replica.newSegments))
    startPos := &internalpb.MsgPosition{ChannelName: "insert-01", Timestamp: Timestamp(100)}
    endPos := &internalpb.MsgPosition{ChannelName: "insert-01", Timestamp: Timestamp(200)}
    err = replica.addNewSegment(0, 1, 2, "insert-01", startPos, endPos)
    assert.NoError(t, err)
    assert.True(t, replica.hasSegment(0, true))
    assert.Equal(t, 1, len(replica.newSegments))

    seg, ok := replica.newSegments[UniqueID(0)]
    assert.True(t, ok)
    require.NotNil(t, seg)
    assert.Equal(t, UniqueID(0), seg.segmentID)
    assert.Equal(t, UniqueID(1), seg.collectionID)
    assert.Equal(t, UniqueID(2), seg.partitionID)
    assert.Equal(t, "insert-01", seg.channelName)
    assert.Equal(t, Timestamp(100), seg.startPos.Timestamp)
    assert.Equal(t, Timestamp(200), seg.endPos.Timestamp)
    assert.Equal(t, startPos.ChannelName, seg.checkPoint.pos.ChannelName)
    assert.Equal(t, startPos.Timestamp, seg.checkPoint.pos.Timestamp)
    assert.Equal(t, int64(0), seg.numRows)
    assert.True(t, seg.isNew.Load().(bool))
    assert.False(t, seg.isFlushed.Load().(bool))
    seg, ok := replica.newSegments[UniqueID(0)]
    assert.True(t, ok)
    require.NotNil(t, seg)
    assert.Equal(t, UniqueID(0), seg.segmentID)
    assert.Equal(t, UniqueID(1), seg.collectionID)
    assert.Equal(t, UniqueID(2), seg.partitionID)
    assert.Equal(t, "insert-01", seg.channelName)
    assert.Equal(t, Timestamp(100), seg.startPos.Timestamp)
    assert.Equal(t, Timestamp(200), seg.endPos.Timestamp)
    assert.Equal(t, startPos.ChannelName, seg.checkPoint.pos.ChannelName)
    assert.Equal(t, startPos.Timestamp, seg.checkPoint.pos.Timestamp)
    assert.Equal(t, int64(0), seg.numRows)
    assert.True(t, seg.isNew.Load().(bool))
    assert.False(t, seg.isFlushed.Load().(bool))

    replica.updateStatistics(0, 10)
    assert.Equal(t, int64(10), seg.numRows)
    replica.updateStatistics(0, 10)
    assert.Equal(t, int64(10), seg.numRows)

    cpPos := &internalpb.MsgPosition{ChannelName: "insert-01", Timestamp: Timestamp(10)}
    cp := &segmentCheckPoint{int64(10), *cpPos}
    err = replica.addNormalSegment(1, 1, 2, "insert-01", int64(10), cp)
    assert.NoError(t, err)
    assert.True(t, replica.hasSegment(1, true))
    assert.Equal(t, 1, len(replica.normalSegments))
    seg, ok = replica.normalSegments[UniqueID(1)]
    assert.True(t, ok)
    require.NotNil(t, seg)
    assert.Equal(t, UniqueID(1), seg.segmentID)
    assert.Equal(t, UniqueID(1), seg.collectionID)
    assert.Equal(t, UniqueID(2), seg.partitionID)
    assert.Equal(t, "insert-01", seg.channelName)
    assert.Equal(t, cpPos.ChannelName, seg.checkPoint.pos.ChannelName)
    assert.Equal(t, cpPos.Timestamp, seg.checkPoint.pos.Timestamp)
    assert.Equal(t, int64(10), seg.numRows)
    assert.False(t, seg.isNew.Load().(bool))
    assert.False(t, seg.isFlushed.Load().(bool))
    cpPos := &internalpb.MsgPosition{ChannelName: "insert-01", Timestamp: Timestamp(10)}
    cp := &segmentCheckPoint{int64(10), *cpPos}
    err = replica.addNormalSegment(1, 1, 2, "insert-01", int64(10), cp)
    assert.NoError(t, err)
    assert.True(t, replica.hasSegment(1, true))
    assert.Equal(t, 1, len(replica.normalSegments))
    seg, ok = replica.normalSegments[UniqueID(1)]
    assert.True(t, ok)
    require.NotNil(t, seg)
    assert.Equal(t, UniqueID(1), seg.segmentID)
    assert.Equal(t, UniqueID(1), seg.collectionID)
    assert.Equal(t, UniqueID(2), seg.partitionID)
    assert.Equal(t, "insert-01", seg.channelName)
    assert.Equal(t, cpPos.ChannelName, seg.checkPoint.pos.ChannelName)
    assert.Equal(t, cpPos.Timestamp, seg.checkPoint.pos.Timestamp)
    assert.Equal(t, int64(10), seg.numRows)
    assert.False(t, seg.isNew.Load().(bool))
    assert.False(t, seg.isFlushed.Load().(bool))

    err = replica.addNormalSegment(1, 100000, 2, "invalid", int64(0), &segmentCheckPoint{})
    assert.Error(t, err)
    err = replica.addNormalSegment(1, 100000, 2, "invalid", int64(0), &segmentCheckPoint{})
    assert.Error(t, err)

    replica.updateStatistics(1, 10)
    assert.Equal(t, int64(20), seg.numRows)
    replica.updateStatistics(1, 10)
    assert.Equal(t, int64(20), seg.numRows)

    segPos := replica.listNewSegmentsStartPositions()
    assert.Equal(t, 1, len(segPos))
    assert.Equal(t, UniqueID(0), segPos[0].SegmentID)
    assert.Equal(t, "insert-01", segPos[0].StartPosition.ChannelName)
    assert.Equal(t, Timestamp(100), segPos[0].StartPosition.Timestamp)
    segPos := replica.listNewSegmentsStartPositions()
    assert.Equal(t, 1, len(segPos))
    assert.Equal(t, UniqueID(0), segPos[0].SegmentID)
    assert.Equal(t, "insert-01", segPos[0].StartPosition.ChannelName)
    assert.Equal(t, Timestamp(100), segPos[0].StartPosition.Timestamp)

    assert.Equal(t, 0, len(replica.newSegments))
    assert.Equal(t, 2, len(replica.normalSegments))
    assert.Equal(t, 0, len(replica.newSegments))
    assert.Equal(t, 2, len(replica.normalSegments))

    cps := replica.listSegmentsCheckPoints()
    assert.Equal(t, 2, len(cps))
    assert.Equal(t, startPos.Timestamp, cps[UniqueID(0)].pos.Timestamp)
    assert.Equal(t, int64(0), cps[UniqueID(0)].numRows)
    assert.Equal(t, cp.pos.Timestamp, cps[UniqueID(1)].pos.Timestamp)
    assert.Equal(t, int64(10), cps[UniqueID(1)].numRows)
    cps := replica.listSegmentsCheckPoints()
    assert.Equal(t, 2, len(cps))
    assert.Equal(t, startPos.Timestamp, cps[UniqueID(0)].pos.Timestamp)
    assert.Equal(t, int64(0), cps[UniqueID(0)].numRows)
    assert.Equal(t, cp.pos.Timestamp, cps[UniqueID(1)].pos.Timestamp)
    assert.Equal(t, int64(10), cps[UniqueID(1)].numRows)

    updates, err := replica.getSegmentStatisticsUpdates(0)
    assert.NoError(t, err)
    assert.Equal(t, int64(10), updates.NumRows)
    updates, err := replica.getSegmentStatisticsUpdates(0)
    assert.NoError(t, err)
    assert.Equal(t, int64(10), updates.NumRows)

    updates, err = replica.getSegmentStatisticsUpdates(1)
    assert.NoError(t, err)
    assert.Equal(t, int64(20), updates.NumRows)
    updates, err = replica.getSegmentStatisticsUpdates(1)
    assert.NoError(t, err)
    assert.Equal(t, int64(20), updates.NumRows)

    replica.updateSegmentCheckPoint(0)
    assert.Equal(t, int64(10), replica.normalSegments[UniqueID(0)].checkPoint.numRows)
    replica.updateSegmentCheckPoint(1)
    assert.Equal(t, int64(20), replica.normalSegments[UniqueID(1)].checkPoint.numRows)
    replica.updateSegmentCheckPoint(0)
    assert.Equal(t, int64(10), replica.normalSegments[UniqueID(0)].checkPoint.numRows)
    replica.updateSegmentCheckPoint(1)
    assert.Equal(t, int64(20), replica.normalSegments[UniqueID(1)].checkPoint.numRows)

    err = replica.addFlushedSegment(1, 1, 2, "insert-01", int64(0))
    assert.Nil(t, err)
    err = replica.addFlushedSegment(1, 1, 2, "insert-01", int64(0))
    assert.Nil(t, err)

    totalSegments := replica.filterSegments("insert-01", 0)
    assert.Equal(t, len(totalSegments), 3)
    })
    totalSegments := replica.filterSegments("insert-01", 0)
    assert.Equal(t, len(totalSegments), 3)
}

func TestSegmentReplica_UpdatePKRange(t *testing.T) {
@@ -105,7 +105,8 @@ func (kv *MinIOKV) LoadWithPrefix(key string) ([]string, []string, error) {
    }
    objectsValues, err := kv.MultiLoad(objectsKeys)
    if err != nil {
        log.Debug("MinIO", zap.String("cannot load value with prefix:%s", key))
        log.Error(fmt.Sprintf("MinIO load with prefix error. path = %s", key), zap.Error(err))
        return nil, nil, err
    }

    return objectsKeys, objectsValues, nil
@@ -12,15 +12,20 @@
package querynode

import (
    "bytes"
    "encoding/binary"
    "fmt"
    "io"
    "strconv"
    "sync"

    "github.com/opentracing/opentracing-go"
    "go.uber.org/zap"

    "github.com/milvus-io/milvus/internal/log"
    "github.com/milvus-io/milvus/internal/msgstream"
    "github.com/milvus-io/milvus/internal/proto/commonpb"
    "github.com/milvus-io/milvus/internal/proto/schemapb"
    "github.com/milvus-io/milvus/internal/util/flowgraph"
    "github.com/milvus-io/milvus/internal/util/trace"
)

@@ -35,6 +40,7 @@ type insertData struct {
    insertTimestamps map[UniqueID][]Timestamp
    insertRecords map[UniqueID][]*commonpb.Blob
    insertOffset map[UniqueID]int64
    insertPKs map[UniqueID][]int64
}

type deleteData struct {

@@ -66,6 +72,7 @@ func (iNode *insertNode) Operate(in []flowgraph.Msg) []flowgraph.Msg {
    insertTimestamps: make(map[UniqueID][]Timestamp),
    insertRecords: make(map[UniqueID][]*commonpb.Blob),
    insertOffset: make(map[UniqueID]int64),
    insertPKs: make(map[UniqueID][]int64),
    }

    if iMsg == nil {

@@ -102,6 +109,7 @@ func (iNode *insertNode) Operate(in []flowgraph.Msg) []flowgraph.Msg {
    iData.insertIDs[task.SegmentID] = append(iData.insertIDs[task.SegmentID], task.RowIDs...)
    iData.insertTimestamps[task.SegmentID] = append(iData.insertTimestamps[task.SegmentID], task.Timestamps...)
    iData.insertRecords[task.SegmentID] = append(iData.insertRecords[task.SegmentID], task.RowData...)
    iData.insertPKs[task.SegmentID] = iNode.getPrimaryKeys(task)
    }

    // 2. do preInsert

@@ -119,6 +127,7 @@ func (iNode *insertNode) Operate(in []flowgraph.Msg) []flowgraph.Msg {
    }
    iData.insertOffset[segmentID] = offset
    log.Debug("insertNode operator", zap.Int("insert size", numOfRecords), zap.Int64("insert offset", offset), zap.Int64("segment id", segmentID))
    targetSegment.updateBloomFilter(iData.insertPKs[segmentID])
    }
}
@@ -291,6 +300,82 @@ func (iNode *insertNode) delete(deleteData *deleteData, segmentID UniqueID, wg *
    log.Debug("Do delete done", zap.Int("len", len(deleteData.deleteIDs[segmentID])), zap.Int64("segmentID", segmentID))
}

func (iNode *insertNode) getPrimaryKeys(msg *msgstream.InsertMsg) []int64 {
    if len(msg.RowIDs) != len(msg.Timestamps) || len(msg.RowIDs) != len(msg.RowData) {
        log.Warn("misaligned messages detected")
        return nil
    }
    collectionID := msg.GetCollectionID()

    collection, err := iNode.replica.getCollectionByID(collectionID)
    if err != nil {
        log.Warn("collection cannot be found")
        return nil
    }

    offset := 0
    for _, field := range collection.schema.Fields {
        if field.IsPrimaryKey {
            break
        }
        switch field.DataType {
        case schemapb.DataType_Bool:
            offset++
        case schemapb.DataType_Int8:
            offset++
        case schemapb.DataType_Int16:
            offset += 2
        case schemapb.DataType_Int32:
            offset += 4
        case schemapb.DataType_Int64:
            offset += 8
        case schemapb.DataType_Float:
            offset += 4
        case schemapb.DataType_Double:
            offset += 8
        case schemapb.DataType_FloatVector:
            for _, t := range field.TypeParams {
                if t.Key == "dim" {
                    dim, err := strconv.Atoi(t.Value)
                    if err != nil {
                        log.Error("strconv wrong on get dim", zap.Error(err))
                        break
                    }
                    offset += dim * 4
                    break
                }
            }
        case schemapb.DataType_BinaryVector:
            var dim int
            for _, t := range field.TypeParams {
                if t.Key == "dim" {
                    dim, err = strconv.Atoi(t.Value)
                    if err != nil {
                        log.Error("strconv wrong on get dim", zap.Error(err))
                        return nil
                    }
                    offset += dim / 8
                    break
                }
            }
        }
    }

    blobReaders := make([]io.Reader, len(msg.RowData))
    for i, blob := range msg.RowData {
        blobReaders[i] = bytes.NewReader(blob.GetValue()[offset : offset+8])
    }
    pks := make([]int64, len(blobReaders))

    for i, reader := range blobReaders {
        err = binary.Read(reader, binary.LittleEndian, &pks[i])
        if err != nil {
            log.Warn("binary read blob value failed", zap.Error(err))
        }
    }

    return pks
}
func newInsertNode(replica ReplicaInterface) *insertNode {
    maxQueueLength := Params.FlowGraphMaxQueueLength
    maxParallelism := Params.FlowGraphMaxParallelism
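getPrimaryKeys above walks the schema to compute the byte offset of the primary key inside each row-format blob, then reads eight little-endian bytes at that offset. Below is a minimal standalone sketch of that decode step, assuming the same row layout (fixed-width scalar fields laid out in schema order, PK stored as a little-endian int64); the field-width table and the sample row are illustrative, not the actual Milvus message types.

package main

import (
    "bytes"
    "encoding/binary"
    "fmt"
)

// fieldWidth is a simplified schema entry: how many bytes a field occupies
// in a row-format blob, and whether it is the primary key.
type fieldWidth struct {
    width int
    isPK  bool
}

// pkOffset accumulates field widths until the primary key field is reached,
// mirroring the offset computation in getPrimaryKeys above.
func pkOffset(fields []fieldWidth) int {
    offset := 0
    for _, f := range fields {
        if f.isPK {
            break
        }
        offset += f.width
    }
    return offset
}

// extractPK reads the int64 primary key at the computed offset of one row blob.
func extractPK(row []byte, offset int) (int64, error) {
    var pk int64
    r := bytes.NewReader(row[offset : offset+8])
    err := binary.Read(r, binary.LittleEndian, &pk)
    return pk, err
}

func main() {
    // A made-up row: an int32 field (4 bytes) followed by the int64 primary key.
    fields := []fieldWidth{{width: 4}, {width: 8, isPK: true}}

    row := make([]byte, 12)
    binary.LittleEndian.PutUint32(row[0:4], 7)      // int32 field
    binary.LittleEndian.PutUint64(row[4:12], 12345) // primary key

    pk, err := extractPK(row, pkOffset(fields))
    fmt.Println(pk, err) // 12345 <nil>
}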
@@ -245,6 +245,14 @@ func TestFlowGraphInsertNode_operate(t *testing.T) {
        }
        msg := []flowgraph.Msg{&iMsg}
        insertNode.Operate(msg)
        s, err := replica.getSegmentByID(defaultSegmentID)
        assert.Nil(t, err)
        buf := make([]byte, 8)
        for i := 0; i < defaultMsgLength; i++ {
            binary.BigEndian.PutUint64(buf, uint64(i))
            assert.True(t, s.pkFilter.Test(buf))
        }

    })

    t.Run("test invalid partitionID", func(t *testing.T) {
@@ -49,6 +49,7 @@ const (

    defaultVecFieldName = "vec"
    defaultConstFieldName = "const"
    defaultPKFieldName = "pk"
    defaultTopK = int64(10)
    defaultRoundDecimal = int64(6)
    defaultDim = 128

@@ -108,6 +109,11 @@ var simpleConstField = constFieldParam{
    dataType: schemapb.DataType_Int32,
}

var simplePKField = constFieldParam{
    id: 102,
    dataType: schemapb.DataType_Int64,
}

var uidField = constFieldParam{
    id: rowIDFieldID,
    dataType: schemapb.DataType_Int64,

@@ -128,6 +134,16 @@ func genConstantField(param constFieldParam) *schemapb.FieldSchema {
    return field
}

func genPKField(param constFieldParam) *schemapb.FieldSchema {
    field := &schemapb.FieldSchema{
        FieldID: param.id,
        Name: defaultPKFieldName,
        IsPrimaryKey: true,
        DataType: param.dataType,
    }
    return field
}

func genFloatVectorField(param vecFieldParam) *schemapb.FieldSchema {
    fieldVec := &schemapb.FieldSchema{
        FieldID: param.id,

@@ -284,13 +300,15 @@ func generateIndex(segmentID UniqueID) ([]string, error) {
func genSimpleSegCoreSchema() *schemapb.CollectionSchema {
    fieldVec := genFloatVectorField(simpleVecField)
    fieldInt := genConstantField(simpleConstField)
    fieldPK := genPKField(simplePKField)

    schema := schemapb.CollectionSchema{ // schema for segCore
        Name: defaultCollectionName,
        AutoID: true,
        AutoID: false,
        Fields: []*schemapb.FieldSchema{
            fieldVec,
            fieldInt,
            fieldPK,
        },
    }
    return &schema

@@ -580,6 +598,10 @@ func genCommonBlob(msgLength int, schema *schemapb.CollectionSchema) ([]*commonp
            bs := make([]byte, 4)
            binary.LittleEndian.PutUint32(bs, uint32(i))
            rawData = append(rawData, bs...)
        case schemapb.DataType_Int64:
            bs := make([]byte, 8)
            binary.LittleEndian.PutUint32(bs, uint32(i))
            rawData = append(rawData, bs...)
        case schemapb.DataType_FloatVector:
            dim := simpleVecField.dim // if no dim specified, use simpleVecField's dim
            for _, p := range f.TypeParams {
@@ -541,6 +541,14 @@ func (s *Segment) checkIndexReady(fieldID int64) bool {
    return s.indexInfos[fieldID].getReadyLoad()
}

func (s *Segment) updateBloomFilter(pks []int64) {
    buf := make([]byte, 8)
    for _, pk := range pks {
        binary.BigEndian.PutUint64(buf, uint64(pk))
        s.pkFilter.Add(buf)
    }
}

//-------------------------------------------------------------------------------------- interfaces for growing segment
func (s *Segment) segmentPreInsert(numOfRecords int) (int64, error) {
    /*

@@ -601,12 +609,6 @@ func (s *Segment) segmentInsert(offset int64, entityIDs *[]UniqueID, timestamps
        return errors.New("null seg core pointer")
    }

    for _, id := range *entityIDs {
        b := make([]byte, 8)
        binary.BigEndian.PutUint64(b, uint64(id))
        s.pkFilter.Add(b)
    }

    // Blobs to one big blob
    var numOfRow = len(*entityIDs)
    var sizeofPerRow = len((*records)[0].Value)
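The QueryNode hunks above only show the write side: updateBloomFilter adds each big-endian-encoded primary key to the segment's filter, while the old row-ID insertion inside segmentInsert is dropped. For illustration only, the sketch below shows how a per-segment filter built this way is typically consulted to skip segments that cannot contain a given primary key; this lookup helper is not part of the diff and its names are hypothetical.

package main

import (
    "encoding/binary"
    "fmt"
    "math"

    "github.com/bits-and-blooms/bloom/v3"
)

// segmentFilter bundles a PK bloom filter with the PK range it covers.
// This is a hypothetical helper for illustration, not a Milvus type.
type segmentFilter struct {
    pkFilter *bloom.BloomFilter
    minPK    int64
    maxPK    int64
}

func (f *segmentFilter) add(pk int64) {
    buf := make([]byte, 8)
    binary.BigEndian.PutUint64(buf, uint64(pk)) // same encoding as updateBloomFilter above
    f.pkFilter.Add(buf)
    if pk < f.minPK {
        f.minPK = pk
    }
    if pk > f.maxPK {
        f.maxPK = pk
    }
}

// mayContain is a cheap pre-check: a false result means the segment definitely
// does not hold the key; a true result may still be a bloom-filter false positive.
func (f *segmentFilter) mayContain(pk int64) bool {
    if pk < f.minPK || pk > f.maxPK {
        return false
    }
    buf := make([]byte, 8)
    binary.BigEndian.PutUint64(buf, uint64(pk))
    return f.pkFilter.Test(buf)
}

func main() {
    f := &segmentFilter{
        // Illustrative sizing values only.
        pkFilter: bloom.NewWithEstimates(100000, 0.005),
        minPK:    math.MaxInt64,
        maxPK:    math.MinInt64,
    }
    f.add(1)
    f.add(500)
    fmt.Println(f.mayContain(500), f.mayContain(999999)) // true false
}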
@@ -322,8 +322,20 @@ func (loader *segmentLoader) loadSegmentFieldsData(segment *Segment, fieldBinlog
}
func (loader *segmentLoader) loadSegmentBloomFilter(segment *Segment) error {
    // Todo: get path from etcd
    p := path.Join("files/stats_log", JoinIDPath(segment.collectionID, segment.partitionID, segment.segmentID, common.RowIDField))
    keys, values, err := loader.minioKV.LoadWithPrefix(p)
    collection, err := loader.historicalReplica.getCollectionByID(segment.collectionID)
    if err != nil {
        return err
    }
    pkField := int64(-1)
    for _, field := range collection.schema.Fields {
        if field.IsPrimaryKey {
            pkField = field.FieldID
            break
        }
    }

    p := path.Join("files/stats_log", JoinIDPath(segment.collectionID, segment.partitionID, segment.segmentID, pkField))
    keys, values, err := loader.minioKV.LoadWithPrefix(p + "/")
    if err != nil {
        return err
    }
@@ -306,7 +306,7 @@ func (insertCodec *InsertCodec) Serialize(partitionID UniqueID, segmentID Unique
    statsWriter := &StatsWriter{}
    switch field.DataType {
    case schemapb.DataType_Int64:
        err = statsWriter.StatsInt64(field.FieldID, singleData.(*Int64FieldData).Data)
        err = statsWriter.StatsInt64(field.FieldID, field.IsPrimaryKey, singleData.(*Int64FieldData).Data)
    }
    if err != nil {
        return nil, nil, err
@@ -16,7 +16,6 @@ import (
    "encoding/json"

    "github.com/bits-and-blooms/bloom/v3"
    "github.com/milvus-io/milvus/internal/common"
)

const (

@@ -43,7 +42,7 @@ func (sw *StatsWriter) GetBuffer() []byte {
    return sw.buffer
}

func (sw *StatsWriter) StatsInt64(fieldID int64, msgs []int64) error {
func (sw *StatsWriter) StatsInt64(fieldID int64, isPrimaryKey bool, msgs []int64) error {
    if len(msgs) < 1 {
        // return error: msgs must has one element at least
        return nil

@@ -53,9 +52,9 @@ func (sw *StatsWriter) StatsInt64(fieldID int64, msgs []int64) error {
        FieldID: fieldID,
        Max: msgs[len(msgs)-1],
        Min: msgs[0],
        BF: bloom.NewWithEstimates(bloomFilterSize, maxBloomFalsePositive),
    }
    if fieldID == common.RowIDField {
    if isPrimaryKey {
        stats.BF = bloom.NewWithEstimates(bloomFilterSize, maxBloomFalsePositive)
        b := make([]byte, 8)
        for _, msg := range msgs {
            binary.LittleEndian.PutUint64(b, uint64(msg))
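The stats.go hunks change StatsWriter.StatsInt64 to take an explicit isPrimaryKey flag, so the bloom filter is attached to the stats of whichever field is the primary key rather than being hard-wired to common.RowIDField. A rough sketch of that idea follows; the stats struct, sizing constants, and JSON layout are simplified stand-ins for the real StatsWriter types, kept only to show the flag-controlled filter construction.

package main

import (
    "encoding/binary"
    "encoding/json"
    "fmt"

    "github.com/bits-and-blooms/bloom/v3"
)

// int64Stats is a simplified stand-in for the per-field stats written at flush time:
// min/max always, and a bloom filter only when the field is the primary key.
type int64Stats struct {
    FieldID int64              `json:"fieldID"`
    Min     int64              `json:"min"`
    Max     int64              `json:"max"`
    BF      *bloom.BloomFilter `json:"bf,omitempty"`
}

// statsInt64 mirrors the reshaped StatsInt64 signature: the caller now passes
// isPrimaryKey instead of the writer checking for common.RowIDField.
func statsInt64(fieldID int64, isPrimaryKey bool, msgs []int64) ([]byte, error) {
    if len(msgs) < 1 {
        return nil, nil
    }
    stats := int64Stats{
        FieldID: fieldID,
        Min:     msgs[0],
        Max:     msgs[len(msgs)-1], // msgs are taken as sorted, as in the hunk above
    }
    if isPrimaryKey {
        stats.BF = bloom.NewWithEstimates(100000, 0.005) // illustrative sizing only
        b := make([]byte, 8)
        for _, msg := range msgs {
            binary.LittleEndian.PutUint64(b, uint64(msg))
            stats.BF.Add(b)
        }
    }
    return json.Marshal(stats)
}

func main() {
    buf, err := statsInt64(106, true, []int64{1, 2, 3, 4, 5})
    fmt.Println(len(buf) > 0, err) // true <nil>
}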
@@ -23,7 +23,7 @@ import (
func TestStatsWriter_StatsInt64(t *testing.T) {
    data := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9}
    sw := &StatsWriter{}
    err := sw.StatsInt64(common.RowIDField, data)
    err := sw.StatsInt64(common.RowIDField, true, data)
    assert.NoError(t, err)
    b := sw.GetBuffer()

@@ -40,6 +40,6 @@ func TestStatsWriter_StatsInt64(t *testing.T) {
    }

    msgs := []int64{}
    err = sw.StatsInt64(rootcoord.RowIDField, msgs)
    err = sw.StatsInt64(rootcoord.RowIDField, true, msgs)
    assert.Nil(t, err)
}