mirror of https://github.com/milvus-io/milvus.git
feat: integrate new deltalog format (#35522)
See #34123 --------- Signed-off-by: Ted Xu <ted.xu@zilliz.com>pull/35580/head
parent
bc2bbdc82f
commit
41646c8439
|
@ -642,6 +642,9 @@ dataNode:
|
|||
clusteringCompaction:
|
||||
memoryBufferRatio: 0.1 # The ratio of memory buffer of clustering compaction. Data larger than threshold will be flushed to storage.
|
||||
workPoolSize: 8 # worker pool size for one clustering compaction job.
|
||||
bloomFilterApplyParallelFactor: 4 # parallel factor when to apply pk to bloom filter, default to 4*CPU_CORE_NUM
|
||||
storage:
|
||||
deltalog: json # deltalog format, options: [json, parquet]
|
||||
ip: # TCP/IP address of dataNode. If not specified, use the first unicastable address
|
||||
port: 21124 # TCP port of dataNode
|
||||
grpc:
|
||||
|
|
|
@ -18,6 +18,7 @@ package compaction
|
|||
|
||||
import (
|
||||
"context"
|
||||
sio "io"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
|
@ -25,7 +26,6 @@ import (
|
|||
"go.uber.org/zap"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/allocator"
|
||||
iter "github.com/milvus-io/milvus/internal/datanode/iterators"
|
||||
"github.com/milvus-io/milvus/internal/flushcommon/io"
|
||||
"github.com/milvus-io/milvus/internal/metastore/kv/binlog"
|
||||
"github.com/milvus-io/milvus/internal/proto/datapb"
|
||||
|
@ -55,12 +55,12 @@ func mergeDeltalogs(ctx context.Context, io io.BinlogIO, dpaths map[typeutil.Uni
|
|||
return pk2ts, nil
|
||||
}
|
||||
|
||||
allIters := make([]*iter.DeltalogIterator, 0)
|
||||
blobs := make([]*storage.Blob, 0)
|
||||
for segID, paths := range dpaths {
|
||||
if len(paths) == 0 {
|
||||
continue
|
||||
}
|
||||
blobs, err := io.Download(ctx, paths)
|
||||
binaries, err := io.Download(ctx, paths)
|
||||
if err != nil {
|
||||
log.Warn("compact wrong, fail to download deltalogs",
|
||||
zap.Int64("segment", segID),
|
||||
|
@ -69,19 +69,34 @@ func mergeDeltalogs(ctx context.Context, io io.BinlogIO, dpaths map[typeutil.Uni
|
|||
return nil, err
|
||||
}
|
||||
|
||||
allIters = append(allIters, iter.NewDeltalogIterator(blobs, nil))
|
||||
}
|
||||
|
||||
for _, deltaIter := range allIters {
|
||||
for deltaIter.HasNext() {
|
||||
labeled, _ := deltaIter.Next()
|
||||
ts := labeled.GetTimestamp()
|
||||
if lastTs, ok := pk2ts[labeled.GetPk().GetValue()]; ok && lastTs > ts {
|
||||
ts = lastTs
|
||||
}
|
||||
pk2ts[labeled.GetPk().GetValue()] = ts
|
||||
for i := range binaries {
|
||||
blobs = append(blobs, &storage.Blob{Value: binaries[i]})
|
||||
}
|
||||
}
|
||||
reader, err := storage.CreateDeltalogReader(blobs)
|
||||
if err != nil {
|
||||
log.Error("malformed delta file", zap.Error(err))
|
||||
return nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
for {
|
||||
err := reader.Next()
|
||||
if err != nil {
|
||||
if err == sio.EOF {
|
||||
break
|
||||
}
|
||||
log.Error("compact wrong, fail to read deltalogs", zap.Error(err))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dl := reader.Value()
|
||||
// If pk already exists in pk2ts, record the later one.
|
||||
if ts, ok := pk2ts[dl.Pk.GetValue()]; ok && ts > dl.Ts {
|
||||
continue
|
||||
}
|
||||
pk2ts[dl.Pk.GetValue()] = dl.Ts
|
||||
}
|
||||
|
||||
log.Info("compact mergeDeltalogs end",
|
||||
zap.Int("deleted pk counts", len(pk2ts)))
|
||||
|
|
|
@ -19,6 +19,7 @@ package compaction
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
sio "io"
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
|
@ -391,10 +392,28 @@ func (t *LevelZeroCompactionTask) loadDelta(ctx context.Context, deltaLogs []str
|
|||
for _, blob := range blobBytes {
|
||||
blobs = append(blobs, &storage.Blob{Value: blob})
|
||||
}
|
||||
_, _, dData, err := storage.NewDeleteCodec().Deserialize(blobs)
|
||||
|
||||
reader, err := storage.CreateDeltalogReader(blobs)
|
||||
if err != nil {
|
||||
log.Error("malformed delta file", zap.Error(err))
|
||||
return nil, err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
dData := &storage.DeleteData{}
|
||||
for {
|
||||
err := reader.Next()
|
||||
if err != nil {
|
||||
if err == sio.EOF {
|
||||
break
|
||||
}
|
||||
log.Error("compact wrong, fail to read deltalogs", zap.Error(err))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dl := reader.Value()
|
||||
dData.Append(dl.Pk, dl.Ts)
|
||||
}
|
||||
|
||||
return dData, nil
|
||||
}
|
||||
|
|
|
@ -191,7 +191,7 @@ func (s *L0ImportSuite) TestL0Import() {
|
|||
|
||||
deltaLog := actual.GetBinlogs()[0]
|
||||
s.Equal(int64(s.delCnt), deltaLog.GetEntriesNum())
|
||||
s.Equal(s.deleteData.Size(), deltaLog.GetMemorySize())
|
||||
// s.Equal(s.deleteData.Size(), deltaLog.GetMemorySize())
|
||||
}
|
||||
|
||||
func TestL0Import(t *testing.T) {
|
||||
|
|
|
@ -42,8 +42,7 @@ type storageV1Serializer struct {
|
|||
schema *schemapb.CollectionSchema
|
||||
pkField *schemapb.FieldSchema
|
||||
|
||||
inCodec *storage.InsertCodec
|
||||
delCodec *storage.DeleteCodec
|
||||
inCodec *storage.InsertCodec
|
||||
|
||||
allocator allocator.Interface
|
||||
metacache metacache.MetaCache
|
||||
|
@ -68,7 +67,6 @@ func NewStorageSerializer(allocator allocator.Interface, metacache metacache.Met
|
|||
pkField: pkField,
|
||||
|
||||
inCodec: inCodec,
|
||||
delCodec: storage.NewDeleteCodec(),
|
||||
allocator: allocator,
|
||||
metacache: metacache,
|
||||
metaWriter: metaWriter,
|
||||
|
@ -226,5 +224,26 @@ func (s *storageV1Serializer) serializeMergedPkStats(pack *SyncPack) (*storage.B
|
|||
}
|
||||
|
||||
func (s *storageV1Serializer) serializeDeltalog(pack *SyncPack) (*storage.Blob, error) {
|
||||
return s.delCodec.Serialize(pack.collectionID, pack.partitionID, pack.segmentID, pack.deltaData)
|
||||
if len(pack.deltaData.Pks) == 0 {
|
||||
return &storage.Blob{}, nil
|
||||
}
|
||||
|
||||
writer, finalizer, err := storage.CreateDeltalogWriter(pack.collectionID, pack.partitionID, pack.segmentID, pack.deltaData.Pks[0].Type(), 1024)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(pack.deltaData.Pks) != len(pack.deltaData.Tss) {
|
||||
return nil, fmt.Errorf("pk and ts should have same length in delta log, but get %d and %d", len(pack.deltaData.Pks), len(pack.deltaData.Tss))
|
||||
}
|
||||
|
||||
for i := 0; i < len(pack.deltaData.Pks); i++ {
|
||||
deleteLog := storage.NewDeleteLog(pack.deltaData.Pks[i], pack.deltaData.Tss[i])
|
||||
err = writer.Write(deleteLog)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
writer.Close()
|
||||
return finalizer()
|
||||
}
|
||||
|
|
|
@ -135,15 +135,6 @@ func (s *StorageV1SerializerSuite) getDeleteBuffer() *storage.DeleteData {
|
|||
return buf
|
||||
}
|
||||
|
||||
func (s *StorageV1SerializerSuite) getDeleteBufferZeroTs() *storage.DeleteData {
|
||||
buf := &storage.DeleteData{}
|
||||
for i := 0; i < 10; i++ {
|
||||
pk := storage.NewInt64PrimaryKey(int64(i + 1))
|
||||
buf.Append(pk, 0)
|
||||
}
|
||||
return buf
|
||||
}
|
||||
|
||||
func (s *StorageV1SerializerSuite) getBasicPack() *SyncPack {
|
||||
pack := &SyncPack{}
|
||||
|
||||
|
@ -284,15 +275,6 @@ func (s *StorageV1SerializerSuite) TestSerializeDelete() {
|
|||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
s.Run("serialize_failed", func() {
|
||||
pack := s.getBasicPack()
|
||||
pack.WithDeleteData(s.getDeleteBufferZeroTs())
|
||||
pack.WithTimeRange(50, 100)
|
||||
|
||||
_, err := s.serializer.EncodeBuffer(ctx, pack)
|
||||
s.Error(err)
|
||||
})
|
||||
|
||||
s.Run("serialize_normal", func() {
|
||||
pack := s.getBasicPack()
|
||||
pack.WithDeleteData(s.getDeleteBuffer())
|
||||
|
|
|
@ -26,6 +26,7 @@ import "C"
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"path"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
|
@ -963,7 +964,6 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
|
|||
)
|
||||
log.Info("loading delta...")
|
||||
|
||||
dCodec := storage.DeleteCodec{}
|
||||
var blobs []*storage.Blob
|
||||
var futures []*conc.Future[any]
|
||||
for _, deltaLog := range deltaLogs {
|
||||
|
@ -999,10 +999,24 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
|
|||
log.Info("there are no delta logs saved with segment, skip loading delete record")
|
||||
return nil
|
||||
}
|
||||
_, _, deltaData, err := dCodec.Deserialize(blobs)
|
||||
|
||||
deltaData := &storage.DeleteData{}
|
||||
reader, err := storage.CreateDeltalogReader(blobs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
for {
|
||||
err := reader.Next()
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
return err
|
||||
}
|
||||
dl := reader.Value()
|
||||
deltaData.Append(dl.Pk, dl.Ts)
|
||||
}
|
||||
|
||||
err = segment.LoadDeltaData(ctx, deltaData)
|
||||
if err != nil {
|
||||
|
|
|
@ -33,6 +33,7 @@ import (
|
|||
"github.com/milvus-io/milvus/pkg/common"
|
||||
"github.com/milvus-io/milvus/pkg/util/merr"
|
||||
"github.com/milvus-io/milvus/pkg/util/metautil"
|
||||
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
|
@ -231,7 +232,7 @@ func NewBinlogDeserializeReader(blobs []*Blob, PKfieldID UniqueID) (*Deserialize
|
|||
}), nil
|
||||
}
|
||||
|
||||
func NewDeltalogOneFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
func newDeltalogOneFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
reader, err := newCompositeBinlogRecordReader(blobs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -488,7 +489,7 @@ func (dsw *DeltalogStreamWriter) writeDeltalogHeaders(w io.Writer) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func NewDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID) *DeltalogStreamWriter {
|
||||
func newDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID) *DeltalogStreamWriter {
|
||||
return &DeltalogStreamWriter{
|
||||
collectionID: collectionID,
|
||||
partitionID: partitionID,
|
||||
|
@ -501,8 +502,7 @@ func NewDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID) *Del
|
|||
}
|
||||
}
|
||||
|
||||
func NewDeltalogSerializeWriter(partitionID, segmentID UniqueID, eventWriter *DeltalogStreamWriter, batchSize int,
|
||||
) (*SerializeWriter[*DeleteLog], error) {
|
||||
func newDeltalogSerializeWriter(eventWriter *DeltalogStreamWriter, batchSize int) (*SerializeWriter[*DeleteLog], error) {
|
||||
rws := make(map[FieldID]RecordWriter, 1)
|
||||
rw, err := eventWriter.GetRecordWriter()
|
||||
if err != nil {
|
||||
|
@ -521,6 +521,7 @@ func NewDeltalogSerializeWriter(partitionID, segmentID UniqueID, eventWriter *De
|
|||
}
|
||||
|
||||
builder.AppendValueFromString(string(strVal))
|
||||
eventWriter.memorySize += len(strVal)
|
||||
memorySize += uint64(len(strVal))
|
||||
}
|
||||
arr := []arrow.Array{builder.NewArray()}
|
||||
|
@ -638,12 +639,12 @@ func newSimpleArrowRecordReader(blobs []*Blob) (*simpleArrowRecordReader, error)
|
|||
}, nil
|
||||
}
|
||||
|
||||
func NewMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID, schema []*schemapb.FieldSchema) *MultiFieldDeltalogStreamWriter {
|
||||
func newMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID, pkType schemapb.DataType) *MultiFieldDeltalogStreamWriter {
|
||||
return &MultiFieldDeltalogStreamWriter{
|
||||
collectionID: collectionID,
|
||||
partitionID: partitionID,
|
||||
segmentID: segmentID,
|
||||
fieldSchemas: schema,
|
||||
pkType: pkType,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -651,7 +652,7 @@ type MultiFieldDeltalogStreamWriter struct {
|
|||
collectionID UniqueID
|
||||
partitionID UniqueID
|
||||
segmentID UniqueID
|
||||
fieldSchemas []*schemapb.FieldSchema
|
||||
pkType schemapb.DataType
|
||||
|
||||
memorySize int // To be updated on the fly
|
||||
buf bytes.Buffer
|
||||
|
@ -663,17 +664,18 @@ func (dsw *MultiFieldDeltalogStreamWriter) GetRecordWriter() (RecordWriter, erro
|
|||
return dsw.rw, nil
|
||||
}
|
||||
|
||||
fieldIds := make([]FieldID, len(dsw.fieldSchemas))
|
||||
fields := make([]arrow.Field, len(dsw.fieldSchemas))
|
||||
|
||||
for i, fieldSchema := range dsw.fieldSchemas {
|
||||
fieldIds[i] = fieldSchema.FieldID
|
||||
dim, _ := typeutil.GetDim(fieldSchema)
|
||||
fields[i] = arrow.Field{
|
||||
Name: fieldSchema.Name,
|
||||
Type: serdeMap[fieldSchema.DataType].arrowType(int(dim)),
|
||||
Nullable: false, // No nullable check here.
|
||||
}
|
||||
fieldIds := []FieldID{common.RowIDField, common.TimeStampField} // Not used.
|
||||
fields := []arrow.Field{
|
||||
{
|
||||
Name: "pk",
|
||||
Type: serdeMap[dsw.pkType].arrowType(0),
|
||||
Nullable: false,
|
||||
},
|
||||
{
|
||||
Name: "ts",
|
||||
Type: arrow.PrimitiveTypes.Int64,
|
||||
Nullable: false,
|
||||
},
|
||||
}
|
||||
|
||||
rw, err := newMultiFieldRecordWriter(fieldIds, fields, &dsw.buf)
|
||||
|
@ -734,8 +736,7 @@ func (dsw *MultiFieldDeltalogStreamWriter) writeDeltalogHeaders(w io.Writer) err
|
|||
return nil
|
||||
}
|
||||
|
||||
func NewDeltalogMultiFieldWriter(partitionID, segmentID UniqueID, eventWriter *MultiFieldDeltalogStreamWriter, batchSize int,
|
||||
) (*SerializeWriter[*DeleteLog], error) {
|
||||
func newDeltalogMultiFieldWriter(eventWriter *MultiFieldDeltalogStreamWriter, batchSize int) (*SerializeWriter[*DeleteLog], error) {
|
||||
rw, err := eventWriter.GetRecordWriter()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -765,7 +766,7 @@ func NewDeltalogMultiFieldWriter(partitionID, segmentID UniqueID, eventWriter *M
|
|||
for _, vv := range v {
|
||||
pk := vv.Pk.GetValue().(int64)
|
||||
pb.Append(pk)
|
||||
memorySize += uint64(pk)
|
||||
memorySize += 8
|
||||
}
|
||||
case schemapb.DataType_VarChar:
|
||||
pb := builder.Field(0).(*array.StringBuilder)
|
||||
|
@ -780,8 +781,9 @@ func NewDeltalogMultiFieldWriter(partitionID, segmentID UniqueID, eventWriter *M
|
|||
|
||||
for _, vv := range v {
|
||||
builder.Field(1).(*array.Int64Builder).Append(int64(vv.Ts))
|
||||
memorySize += vv.Ts
|
||||
memorySize += 8
|
||||
}
|
||||
eventWriter.memorySize += int(memorySize)
|
||||
|
||||
arr := []arrow.Array{builder.Field(0).NewArray(), builder.Field(1).NewArray()}
|
||||
|
||||
|
@ -797,7 +799,7 @@ func NewDeltalogMultiFieldWriter(partitionID, segmentID UniqueID, eventWriter *M
|
|||
}, batchSize), nil
|
||||
}
|
||||
|
||||
func NewDeltalogMultiFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
func newDeltalogMultiFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
reader, err := newSimpleArrowRecordReader(blobs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -840,11 +842,11 @@ func NewDeltalogMultiFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog],
|
|||
// NewDeltalogDeserializeReader is the entry point for the delta log reader.
|
||||
// It includes NewDeltalogOneFieldReader, which uses the existing log format with only one column in a log file,
|
||||
// and NewDeltalogMultiFieldReader, which uses the new format and supports multiple fields in a log file.
|
||||
func NewDeltalogDeserializeReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
func newDeltalogDeserializeReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
if supportMultiFieldFormat(blobs) {
|
||||
return NewDeltalogMultiFieldReader(blobs)
|
||||
return newDeltalogMultiFieldReader(blobs)
|
||||
}
|
||||
return NewDeltalogOneFieldReader(blobs)
|
||||
return newDeltalogOneFieldReader(blobs)
|
||||
}
|
||||
|
||||
// check delta log description data to see if it is the format with
|
||||
|
@ -852,12 +854,30 @@ func NewDeltalogDeserializeReader(blobs []*Blob) (*DeserializeReader[*DeleteLog]
|
|||
func supportMultiFieldFormat(blobs []*Blob) bool {
|
||||
if len(blobs) > 0 {
|
||||
reader, err := NewBinlogReader(blobs[0].Value)
|
||||
defer reader.Close()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer reader.Close()
|
||||
version := reader.descriptorEventData.Extras[version]
|
||||
return version != nil && version.(string) == MultiField
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func CreateDeltalogReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
|
||||
return newDeltalogDeserializeReader(blobs)
|
||||
}
|
||||
|
||||
func CreateDeltalogWriter(collectionID, partitionID, segmentID UniqueID, pkType schemapb.DataType, batchSize int) (*SerializeWriter[*DeleteLog], func() (*Blob, error), error) {
|
||||
format := paramtable.Get().DataNodeCfg.DeltalogFormat.GetValue()
|
||||
if format == "json" {
|
||||
eventWriter := newDeltalogStreamWriter(collectionID, partitionID, segmentID)
|
||||
writer, err := newDeltalogSerializeWriter(eventWriter, batchSize)
|
||||
return writer, eventWriter.Finalize, err
|
||||
} else if format == "parquet" {
|
||||
eventWriter := newMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID, pkType)
|
||||
writer, err := newDeltalogMultiFieldWriter(eventWriter, batchSize)
|
||||
return writer, eventWriter.Finalize, err
|
||||
}
|
||||
return nil, nil, merr.WrapErrParameterInvalid("unsupported deltalog format %s", format)
|
||||
}
|
||||
|
|
|
@ -253,7 +253,7 @@ func assertTestDeltalogData(t *testing.T, i int, value *DeleteLog) {
|
|||
|
||||
func TestDeltalogDeserializeReader(t *testing.T) {
|
||||
t.Run("test empty data", func(t *testing.T) {
|
||||
reader, err := NewDeltalogDeserializeReader(nil)
|
||||
reader, err := newDeltalogDeserializeReader(nil)
|
||||
assert.NoError(t, err)
|
||||
defer reader.Close()
|
||||
err = reader.Next()
|
||||
|
@ -264,7 +264,7 @@ func TestDeltalogDeserializeReader(t *testing.T) {
|
|||
size := 3
|
||||
blob, err := generateTestDeltalogData(size)
|
||||
assert.NoError(t, err)
|
||||
reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
|
||||
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
|
||||
assert.NoError(t, err)
|
||||
defer reader.Close()
|
||||
|
||||
|
@ -283,7 +283,7 @@ func TestDeltalogDeserializeReader(t *testing.T) {
|
|||
|
||||
func TestDeltalogSerializeWriter(t *testing.T) {
|
||||
t.Run("test empty data", func(t *testing.T) {
|
||||
reader, err := NewDeltalogDeserializeReader(nil)
|
||||
reader, err := newDeltalogDeserializeReader(nil)
|
||||
assert.NoError(t, err)
|
||||
defer reader.Close()
|
||||
err = reader.Next()
|
||||
|
@ -294,13 +294,13 @@ func TestDeltalogSerializeWriter(t *testing.T) {
|
|||
size := 16
|
||||
blob, err := generateTestDeltalogData(size)
|
||||
assert.NoError(t, err)
|
||||
reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
|
||||
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
|
||||
assert.NoError(t, err)
|
||||
defer reader.Close()
|
||||
|
||||
// Copy write the generated data
|
||||
eventWriter := NewDeltalogStreamWriter(0, 0, 0)
|
||||
writer, err := NewDeltalogSerializeWriter(0, 0, eventWriter, 7)
|
||||
eventWriter := newDeltalogStreamWriter(0, 0, 0)
|
||||
writer, err := newDeltalogSerializeWriter(eventWriter, 7)
|
||||
assert.NoError(t, err)
|
||||
|
||||
for i := 0; i < size; i++ {
|
||||
|
@ -323,7 +323,7 @@ func TestDeltalogSerializeWriter(t *testing.T) {
|
|||
assert.NoError(t, err)
|
||||
assert.NotNil(t, newblob)
|
||||
// assert.Equal(t, blobs[0].Value, newblobs[0].Value)
|
||||
reader, err = NewDeltalogDeserializeReader([]*Blob{newblob})
|
||||
reader, err = newDeltalogDeserializeReader([]*Blob{newblob})
|
||||
assert.NoError(t, err)
|
||||
defer reader.Close()
|
||||
for i := 0; i < size; i++ {
|
||||
|
@ -338,8 +338,8 @@ func TestDeltalogSerializeWriter(t *testing.T) {
|
|||
|
||||
func TestDeltalogPkTsSeparateFormat(t *testing.T) {
|
||||
t.Run("test empty data", func(t *testing.T) {
|
||||
eventWriter := NewMultiFieldDeltalogStreamWriter(0, 0, 0, nil)
|
||||
writer, err := NewDeltalogMultiFieldWriter(0, 0, eventWriter, 7)
|
||||
eventWriter := newMultiFieldDeltalogStreamWriter(0, 0, 0, schemapb.DataType_Int64)
|
||||
writer, err := newDeltalogMultiFieldWriter(eventWriter, 7)
|
||||
assert.NoError(t, err)
|
||||
defer writer.Close()
|
||||
err = writer.Close()
|
||||
|
@ -380,7 +380,7 @@ func TestDeltalogPkTsSeparateFormat(t *testing.T) {
|
|||
assert.NoError(t, err)
|
||||
|
||||
// Deserialize data
|
||||
reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
|
||||
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
|
||||
assert.NoError(t, err)
|
||||
defer reader.Close()
|
||||
for i := 0; i < size; i++ {
|
||||
|
@ -424,8 +424,8 @@ func BenchmarkDeltalogFormatWriter(b *testing.B) {
|
|||
b.Run("one string format writer", func(b *testing.B) {
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
eventWriter := NewDeltalogStreamWriter(0, 0, 0)
|
||||
writer, _ := NewDeltalogSerializeWriter(0, 0, eventWriter, size)
|
||||
eventWriter := newDeltalogStreamWriter(0, 0, 0)
|
||||
writer, _ := newDeltalogSerializeWriter(eventWriter, size)
|
||||
var value *DeleteLog
|
||||
for j := 0; j < size; j++ {
|
||||
value = NewDeleteLog(NewInt64PrimaryKey(int64(j)), uint64(j+1))
|
||||
|
@ -448,11 +448,8 @@ func BenchmarkDeltalogFormatWriter(b *testing.B) {
|
|||
|
||||
func writeDeltalogNewFormat(size int, pkType schemapb.DataType, batchSize int) (*Blob, error) {
|
||||
var err error
|
||||
eventWriter := NewMultiFieldDeltalogStreamWriter(0, 0, 0, []*schemapb.FieldSchema{
|
||||
{FieldID: common.RowIDField, Name: "pk", DataType: pkType},
|
||||
{FieldID: common.TimeStampField, Name: "ts", DataType: schemapb.DataType_Int64},
|
||||
})
|
||||
writer, err := NewDeltalogMultiFieldWriter(0, 0, eventWriter, batchSize)
|
||||
eventWriter := newMultiFieldDeltalogStreamWriter(0, 0, 0, pkType)
|
||||
writer, err := newDeltalogMultiFieldWriter(eventWriter, batchSize)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -479,7 +476,7 @@ func writeDeltalogNewFormat(size int, pkType schemapb.DataType, batchSize int) (
|
|||
}
|
||||
|
||||
func readDeltaLog(size int, blob *Blob) error {
|
||||
reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
|
||||
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -82,24 +82,37 @@ func (r *l0Reader) Read() (*storage.DeleteData, error) {
|
|||
return nil, io.EOF
|
||||
}
|
||||
path := r.deltaLogs[r.readIdx]
|
||||
br, err := newBinlogReader(r.ctx, r.cm, path)
|
||||
|
||||
bytes, err := r.cm.Read(r.ctx, path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rowsSet, err := readData(br, storage.DeleteEventType)
|
||||
blobs := []*storage.Blob{{
|
||||
Key: path,
|
||||
Value: bytes,
|
||||
}}
|
||||
// TODO: support multiple delta logs
|
||||
reader, err := storage.CreateDeltalogReader(blobs)
|
||||
if err != nil {
|
||||
log.Error("malformed delta file", zap.Error(err))
|
||||
return nil, err
|
||||
}
|
||||
for _, rows := range rowsSet {
|
||||
for _, row := range rows.([]string) {
|
||||
dl := &storage.DeleteLog{}
|
||||
err = dl.Parse(row)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
defer reader.Close()
|
||||
|
||||
for {
|
||||
err := reader.Next()
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
deleteData.Append(dl.Pk, dl.Ts)
|
||||
log.Error("error on importing L0 segment, fail to read deltalogs", zap.Error(err))
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dl := reader.Value()
|
||||
deleteData.Append(dl.Pk, dl.Ts)
|
||||
}
|
||||
|
||||
r.readIdx++
|
||||
if deleteData.Size() >= int64(r.bufferSize) {
|
||||
break
|
||||
|
|
|
@ -4047,6 +4047,8 @@ type dataNodeConfig struct {
|
|||
ClusteringCompactionWorkerPoolSize ParamItem `refreshable:"true"`
|
||||
|
||||
BloomFilterApplyParallelFactor ParamItem `refreshable:"true"`
|
||||
|
||||
DeltalogFormat ParamItem `refreshable:"false"`
|
||||
}
|
||||
|
||||
func (p *dataNodeConfig) init(base *BaseTable) {
|
||||
|
@ -4393,7 +4395,7 @@ if this parameter <= 0, will set it as 10`,
|
|||
p.ClusteringCompactionWorkerPoolSize.Init(base.mgr)
|
||||
|
||||
p.BloomFilterApplyParallelFactor = ParamItem{
|
||||
Key: "datanode.bloomFilterApplyParallelFactor",
|
||||
Key: "dataNode.bloomFilterApplyParallelFactor",
|
||||
FallbackKeys: []string{"datanode.bloomFilterApplyBatchSize"},
|
||||
Version: "2.4.5",
|
||||
DefaultValue: "4",
|
||||
|
@ -4401,6 +4403,15 @@ if this parameter <= 0, will set it as 10`,
|
|||
Export: true,
|
||||
}
|
||||
p.BloomFilterApplyParallelFactor.Init(base.mgr)
|
||||
|
||||
p.DeltalogFormat = ParamItem{
|
||||
Key: "dataNode.storage.deltalog",
|
||||
Version: "2.5.0",
|
||||
DefaultValue: "json",
|
||||
Doc: "deltalog format, options: [json, parquet]",
|
||||
Export: true,
|
||||
}
|
||||
p.DeltalogFormat.Init(base.mgr)
|
||||
}
|
||||
|
||||
// /////////////////////////////////////////////////////////////////////////////
|
||||
|
|
Loading…
Reference in New Issue