// milvus/internal/storage/serde_events.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"sort"
	"strconv"

	"github.com/apache/arrow/go/v17/arrow"
	"github.com/apache/arrow/go/v17/arrow/array"
	"github.com/apache/arrow/go/v17/arrow/memory"
	"github.com/samber/lo"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/allocator"
	"github.com/milvus-io/milvus/internal/json"
	"github.com/milvus-io/milvus/pkg/v2/common"
	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/etcdpb"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"
	"github.com/milvus-io/milvus/pkg/v2/util/metautil"
	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
	"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)

var _ RecordReader = (*CompositeBinlogRecordReader)(nil)
// ChunkedBlobsReader returns one chunk of blobs per call, or io.EOF when no more data is available.
type ChunkedBlobsReader func() ([]*Blob, error)
type CompositeBinlogRecordReader struct {
BlobsReader ChunkedBlobsReader
brs []*BinlogReader
rrs []array.RecordReader
schema *schemapb.CollectionSchema
index map[FieldID]int16
}
func (crr *CompositeBinlogRecordReader) iterateNextBatch() error {
if crr.brs != nil {
for _, er := range crr.brs {
er.Close()
}
for _, rr := range crr.rrs {
rr.Release()
}
}
blobs, err := crr.BlobsReader()
if err != nil {
return err
}
if crr.rrs == nil {
crr.rrs = make([]array.RecordReader, len(blobs))
crr.brs = make([]*BinlogReader, len(blobs))
crr.index = make(map[FieldID]int16, len(blobs))
}
for i, b := range blobs {
reader, err := NewBinlogReader(b.Value)
if err != nil {
return err
}
er, err := reader.NextEventReader()
if err != nil {
return err
}
rr, err := er.GetArrowRecordReader()
if err != nil {
return err
}
crr.rrs[i] = rr
crr.index[reader.FieldID] = int16(i)
crr.brs[i] = reader
}
return nil
}
func (crr *CompositeBinlogRecordReader) Next() (Record, error) {
if crr.rrs == nil {
if err := crr.iterateNextBatch(); err != nil {
return nil, err
}
}
composeRecord := func() (Record, bool) {
recs := make([]arrow.Array, len(crr.rrs))
for i, rr := range crr.rrs {
if ok := rr.Next(); !ok {
return nil, false
}
recs[i] = rr.Record().Column(0)
}
return &compositeRecord{
index: crr.index,
recs: recs,
}, true
}
	// Try to compose a record from the current batch.
var (
r Record
ok bool
)
r, ok = composeRecord()
if !ok {
		// If composing fails the first time, advance to the next batch (blob set); the error may be io.EOF.
if err := crr.iterateNextBatch(); err != nil {
return nil, err
}
		// If advancing to the next batch succeeded, try composing again.
if r, ok = composeRecord(); !ok {
			// If the next blob set is empty, return io.EOF (rare).
return nil, io.EOF
}
}
return r, nil
}
func (crr *CompositeBinlogRecordReader) Close() error {
if crr.brs != nil {
for _, er := range crr.brs {
if er != nil {
er.Close()
}
}
}
if crr.rrs != nil {
for _, rr := range crr.rrs {
if rr != nil {
rr.Release()
}
}
}
return nil
}
func parseBlobKey(blobKey string) (colId FieldID, logId UniqueID) {
if _, _, _, colId, logId, ok := metautil.ParseInsertLogPath(blobKey); ok {
return colId, logId
}
if colId, err := strconv.ParseInt(blobKey, 10, 64); err == nil {
		// data_codec.go generates the bare field ID as the blob key.
return colId, 0
}
return InvalidUniqueID, InvalidUniqueID
}
func MakeBlobsReader(blobs []*Blob) ChunkedBlobsReader {
blobMap := make(map[FieldID][]*Blob)
for _, blob := range blobs {
colId, _ := parseBlobKey(blob.Key)
if _, exists := blobMap[colId]; !exists {
blobMap[colId] = []*Blob{blob}
} else {
blobMap[colId] = append(blobMap[colId], blob)
}
}
sortedBlobs := make([][]*Blob, 0, len(blobMap))
for _, blobsForField := range blobMap {
sort.Slice(blobsForField, func(i, j int) bool {
_, iLog := parseBlobKey(blobsForField[i].Key)
_, jLog := parseBlobKey(blobsForField[j].Key)
return iLog < jLog
})
sortedBlobs = append(sortedBlobs, blobsForField)
}
chunkPos := 0
return func() ([]*Blob, error) {
if len(sortedBlobs) == 0 || chunkPos >= len(sortedBlobs[0]) {
return nil, io.EOF
}
blobs := make([]*Blob, len(sortedBlobs))
for fieldPos := range blobs {
blobs[fieldPos] = sortedBlobs[fieldPos][chunkPos]
}
chunkPos++
return blobs, nil
}
}
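// newCompositeBinlogRecordReader builds a RecordReader over the per-field insert
// binlogs supplied by blobsReader. A typical reading loop, sketched with assumed
// collSchema and blobs variables supplied by the caller:
//
//	crr, err := newCompositeBinlogRecordReader(collSchema, MakeBlobsReader(blobs))
//	if err != nil {
//		return err
//	}
//	defer crr.Close()
//	for {
//		rec, err := crr.Next()
//		if err == io.EOF {
//			break
//		}
//		if err != nil {
//			return err
//		}
//		_ = rec.Len() // consume the record
//	}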
func newCompositeBinlogRecordReader(schema *schemapb.CollectionSchema, blobsReader ChunkedBlobsReader) (*CompositeBinlogRecordReader, error) {
return &CompositeBinlogRecordReader{
schema: schema,
BlobsReader: blobsReader,
}, nil
}
func ValueDeserializer(r Record, v []*Value, fieldSchema []*schemapb.FieldSchema) error {
pkField := func() *schemapb.FieldSchema {
for _, field := range fieldSchema {
if field.GetIsPrimaryKey() {
return field
}
}
return nil
}()
if pkField == nil {
return merr.WrapErrServiceInternal("no primary key field found")
}
for i := 0; i < r.Len(); i++ {
value := v[i]
if value == nil {
value = &Value{}
value.Value = make(map[FieldID]interface{}, len(fieldSchema))
v[i] = value
}
m := value.Value.(map[FieldID]interface{})
for _, f := range fieldSchema {
j := f.FieldID
dt := f.DataType
if r.Column(j) == nil {
if f.GetDefaultValue() != nil {
m[j] = getDefaultValue(f)
} else {
m[j] = nil
}
continue
}
if r.Column(j).IsNull(i) {
m[j] = nil
} else {
d, ok := serdeMap[dt].deserialize(r.Column(j), i)
if ok {
m[j] = d // TODO: avoid memory copy here.
} else {
return merr.WrapErrServiceInternal(fmt.Sprintf("unexpected type %s", dt))
}
}
}
rowID, ok := m[common.RowIDField].(int64)
if !ok {
return merr.WrapErrIoKeyNotFound("no row id column found")
}
value.ID = rowID
value.Timestamp = m[common.TimeStampField].(int64)
pk, err := GenPrimaryKeyByRawData(m[pkField.FieldID], pkField.DataType)
if err != nil {
return err
}
value.PK = pk
value.IsDeleted = false
value.Value = m
}
return nil
}
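// NewBinlogDeserializeReader wraps a CompositeBinlogRecordReader with ValueDeserializer,
// so each arrow record batch is decoded into *Value rows (a map[FieldID]interface{} per
// row, plus ID, Timestamp and PK). A construction sketch, assuming blobs holds the insert
// binlogs of one segment; iteration goes through the DeserializeReader API defined in serde.go:
//
//	reader, err := NewBinlogDeserializeReader(collSchema, MakeBlobsReader(blobs))
//	if err != nil {
//		return err
//	}
//	_ = reader // iterate values with the DeserializeReader API (serde.go)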
func NewBinlogDeserializeReader(schema *schemapb.CollectionSchema, blobsReader ChunkedBlobsReader) (*DeserializeReader[*Value], error) {
reader, err := newCompositeBinlogRecordReader(schema, blobsReader)
if err != nil {
return nil, err
}
return NewDeserializeReader(reader, func(r Record, v []*Value) error {
return ValueDeserializer(r, v, schema.Fields)
}), nil
}
func newDeltalogOneFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
reader, err := newCompositeBinlogRecordReader(nil, MakeBlobsReader(blobs))
if err != nil {
return nil, err
}
return NewDeserializeReader(reader, func(r Record, v []*DeleteLog) error {
for i := 0; i < r.Len(); i++ {
if v[i] == nil {
v[i] = &DeleteLog{}
}
// retrieve the only field
a := r.(*compositeRecord).recs[0].(*array.String)
strVal := a.Value(i)
if err := v[i].Parse(strVal); err != nil {
return err
}
}
return nil
}), nil
}
type BinlogStreamWriter struct {
collectionID UniqueID
partitionID UniqueID
segmentID UniqueID
fieldSchema *schemapb.FieldSchema
buf bytes.Buffer
rw *singleFieldRecordWriter
}
func (bsw *BinlogStreamWriter) GetRecordWriter() (RecordWriter, error) {
if bsw.rw != nil {
return bsw.rw, nil
}
fid := bsw.fieldSchema.FieldID
dim, _ := typeutil.GetDim(bsw.fieldSchema)
rw, err := newSingleFieldRecordWriter(fid, arrow.Field{
Name: strconv.Itoa(int(fid)),
Type: serdeMap[bsw.fieldSchema.DataType].arrowType(int(dim)),
		Nullable: true, // Always mark the column as nullable; the field's nullable flag is not checked here.
}, &bsw.buf, WithRecordWriterProps(getFieldWriterProps(bsw.fieldSchema)))
if err != nil {
return nil, err
}
bsw.rw = rw
return rw, nil
}
func (bsw *BinlogStreamWriter) Finalize() (*Blob, error) {
if bsw.rw == nil {
return nil, io.ErrUnexpectedEOF
}
bsw.rw.Close()
var b bytes.Buffer
if err := bsw.writeBinlogHeaders(&b); err != nil {
return nil, err
}
if _, err := b.Write(bsw.buf.Bytes()); err != nil {
return nil, err
}
return &Blob{
Key: strconv.Itoa(int(bsw.fieldSchema.FieldID)),
Value: b.Bytes(),
RowNum: int64(bsw.rw.numRows),
MemorySize: int64(bsw.rw.writtenUncompressed),
}, nil
}
func (bsw *BinlogStreamWriter) writeBinlogHeaders(w io.Writer) error {
// Write magic number
if err := binary.Write(w, common.Endian, MagicNumber); err != nil {
return err
}
// Write descriptor
de := NewBaseDescriptorEvent(bsw.collectionID, bsw.partitionID, bsw.segmentID)
de.PayloadDataType = bsw.fieldSchema.DataType
de.FieldID = bsw.fieldSchema.FieldID
de.descriptorEventData.AddExtra(originalSizeKey, strconv.Itoa(int(bsw.rw.writtenUncompressed)))
de.descriptorEventData.AddExtra(nullableKey, bsw.fieldSchema.Nullable)
if err := de.Write(w); err != nil {
return err
}
// Write event header
eh := newEventHeader(InsertEventType)
// Write event data
ev := newInsertEventData()
ev.StartTimestamp = 1
ev.EndTimestamp = 1
eh.EventLength = int32(bsw.buf.Len()) + eh.GetMemoryUsageInBytes() + int32(binary.Size(ev))
// eh.NextPosition = eh.EventLength + w.Offset()
if err := eh.Write(w); err != nil {
return err
}
if err := ev.WriteEventData(w); err != nil {
return err
}
return nil
}
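// NewBinlogStreamWriters creates one BinlogStreamWriter per field of the schema. The
// typical lifecycle, sketched with assumed collectionID/partitionID/segmentID/collSchema
// variables: obtain the per-field RecordWriters, write records through a composite
// writer, then Finalize each stream writer into a Blob carrying the binlog headers and
// payload:
//
//	writers := NewBinlogStreamWriters(collectionID, partitionID, segmentID, collSchema.Fields)
//	rws := make(map[FieldID]RecordWriter, len(writers))
//	for fid, w := range writers {
//		rw, err := w.GetRecordWriter()
//		if err != nil {
//			return err
//		}
//		rws[fid] = rw
//	}
//	crw := NewCompositeRecordWriter(rws)
//	// ... crw.Write(record) for each record ...
//	for _, w := range writers {
//		blob, err := w.Finalize()
//		if err != nil {
//			return err
//		}
//		_ = blob // persist the blob
//	}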
func NewBinlogStreamWriters(collectionID, partitionID, segmentID UniqueID,
schema []*schemapb.FieldSchema,
) map[FieldID]*BinlogStreamWriter {
bws := make(map[FieldID]*BinlogStreamWriter, len(schema))
for _, f := range schema {
bws[f.FieldID] = &BinlogStreamWriter{
collectionID: collectionID,
partitionID: partitionID,
segmentID: segmentID,
fieldSchema: f,
}
}
return bws
}
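// ValueSerializer converts a batch of *Value rows into a single arrow Record. Every
// FieldID present in a row's value map must also appear in fieldSchema, since builders
// and arrow types are created only for schema fields. A minimal sketch, assuming
// collSchema declares an int64 primary key with field ID 100 alongside the row-ID and
// timestamp system fields:
//
//	row := &Value{Value: map[FieldID]interface{}{
//		common.RowIDField:     int64(1),
//		common.TimeStampField: int64(1),
//		100:                   int64(42),
//	}}
//	rec, err := ValueSerializer([]*Value{row}, collSchema.Fields)
//	if err != nil {
//		return err
//	}
//	_ = rec // hand the record to a RecordWriter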
func ValueSerializer(v []*Value, fieldSchema []*schemapb.FieldSchema) (Record, error) {
builders := make(map[FieldID]array.Builder, len(fieldSchema))
types := make(map[FieldID]schemapb.DataType, len(fieldSchema))
for _, f := range fieldSchema {
dim, _ := typeutil.GetDim(f)
builders[f.FieldID] = array.NewBuilder(memory.DefaultAllocator, serdeMap[f.DataType].arrowType(int(dim)))
types[f.FieldID] = f.DataType
}
for _, vv := range v {
m := vv.Value.(map[FieldID]any)
for fid, e := range m {
typeEntry, ok := serdeMap[types[fid]]
if !ok {
panic("unknown type")
}
ok = typeEntry.serialize(builders[fid], e)
if !ok {
return nil, merr.WrapErrServiceInternal(fmt.Sprintf("serialize error on type %s", types[fid]))
}
}
}
arrays := make([]arrow.Array, len(fieldSchema))
fields := make([]arrow.Field, len(fieldSchema))
field2Col := make(map[FieldID]int, len(fieldSchema))
for i, field := range fieldSchema {
builder := builders[field.FieldID]
arrays[i] = builder.NewArray()
builder.Release()
fields[i] = arrow.Field{
Name: field.Name,
Type: arrays[i].DataType(),
Metadata: arrow.NewMetadata([]string{"FieldID"}, []string{strconv.Itoa(int(field.FieldID))}),
}
field2Col[field.FieldID] = i
}
return NewSimpleArrowRecord(array.NewRecord(arrow.NewSchema(fields, nil), arrays, int64(len(v))), field2Col), nil
}
type BinlogRecordWriter interface {
RecordWriter
GetLogs() (
fieldBinlogs map[FieldID]*datapb.FieldBinlog,
statsLog *datapb.FieldBinlog,
bm25StatsLog map[FieldID]*datapb.FieldBinlog,
)
GetRowNum() int64
}
type ChunkedBlobsWriter func([]*Blob) error
type CompositeBinlogRecordWriter struct {
// attributes
collectionID UniqueID
partitionID UniqueID
segmentID UniqueID
schema *schemapb.CollectionSchema
BlobsWriter ChunkedBlobsWriter
allocator allocator.Interface
chunkSize uint64
rootPath string
maxRowNum int64
pkstats *PrimaryKeyStats
bm25Stats map[int64]*BM25Stats
// writers and stats generated at runtime
fieldWriters map[FieldID]*BinlogStreamWriter
rw RecordWriter
tsFrom typeutil.Timestamp
tsTo typeutil.Timestamp
rowNum int64
// results
fieldBinlogs map[FieldID]*datapb.FieldBinlog
statsLog *datapb.FieldBinlog
bm25StatsLog map[FieldID]*datapb.FieldBinlog
}
var _ BinlogRecordWriter = (*CompositeBinlogRecordWriter)(nil)
func (c *CompositeBinlogRecordWriter) Write(r Record) error {
if err := c.initWriters(); err != nil {
return err
}
tsArray := r.Column(common.TimeStampField).(*array.Int64)
rows := r.Len()
for i := 0; i < rows; i++ {
ts := typeutil.Timestamp(tsArray.Value(i))
if ts < c.tsFrom {
c.tsFrom = ts
}
if ts > c.tsTo {
c.tsTo = ts
}
switch schemapb.DataType(c.pkstats.PkType) {
case schemapb.DataType_Int64:
pkArray := r.Column(c.pkstats.FieldID).(*array.Int64)
pk := &Int64PrimaryKey{
Value: pkArray.Value(i),
}
c.pkstats.Update(pk)
case schemapb.DataType_VarChar:
pkArray := r.Column(c.pkstats.FieldID).(*array.String)
pk := &VarCharPrimaryKey{
Value: pkArray.Value(i),
}
c.pkstats.Update(pk)
default:
panic("invalid data type")
}
for fieldID, stats := range c.bm25Stats {
field, ok := r.Column(fieldID).(*array.Binary)
if !ok {
return fmt.Errorf("bm25 field value not found")
}
stats.AppendBytes(field.Value(i))
}
}
if err := c.rw.Write(r); err != nil {
return err
}
c.rowNum += int64(rows)
	// flush once the uncompressed written size reaches the chunk size
if c.rw.GetWrittenUncompressed() >= c.chunkSize {
return c.flushChunk()
}
return nil
}
func (c *CompositeBinlogRecordWriter) initWriters() error {
if c.rw == nil {
c.fieldWriters = NewBinlogStreamWriters(c.collectionID, c.partitionID, c.segmentID, c.schema.Fields)
rws := make(map[FieldID]RecordWriter, len(c.fieldWriters))
for fid, w := range c.fieldWriters {
rw, err := w.GetRecordWriter()
if err != nil {
return err
}
rws[fid] = rw
}
c.rw = NewCompositeRecordWriter(rws)
}
return nil
}
func (c *CompositeBinlogRecordWriter) resetWriters() {
c.fieldWriters = nil
c.rw = nil
c.tsFrom = math.MaxUint64
c.tsTo = 0
}
func (c *CompositeBinlogRecordWriter) Close() error {
if err := c.writeStats(); err != nil {
return err
}
if err := c.writeBm25Stats(); err != nil {
return err
}
if c.rw != nil {
// if rw is not nil, it means there is data to be flushed
if err := c.flushChunk(); err != nil {
return err
}
}
return nil
}
// GetWrittenUncompressed always returns 0 for the composite writer; the per-chunk
// written size is tracked by the underlying RecordWriter and reset on each flush.
func (c *CompositeBinlogRecordWriter) GetWrittenUncompressed() uint64 {
	return 0
}
func (c *CompositeBinlogRecordWriter) flushChunk() error {
if c.fieldWriters == nil {
return nil
}
id, _, err := c.allocator.Alloc(uint32(len(c.fieldWriters)))
if err != nil {
return err
}
blobs := make(map[FieldID]*Blob, len(c.fieldWriters))
for fid, w := range c.fieldWriters {
b, err := w.Finalize()
if err != nil {
return err
}
// assign blob key
b.Key = metautil.BuildInsertLogPath(c.rootPath, c.collectionID, c.partitionID, c.segmentID, fid, id)
blobs[fid] = b
id++
}
if err := c.BlobsWriter(lo.Values(blobs)); err != nil {
return err
}
// attach binlogs
if c.fieldBinlogs == nil {
c.fieldBinlogs = make(map[FieldID]*datapb.FieldBinlog, len(c.fieldWriters))
for fid := range c.fieldWriters {
c.fieldBinlogs[fid] = &datapb.FieldBinlog{
FieldID: fid,
}
}
}
for fid, b := range blobs {
c.fieldBinlogs[fid].Binlogs = append(c.fieldBinlogs[fid].Binlogs, &datapb.Binlog{
LogSize: int64(len(b.Value)),
MemorySize: b.MemorySize,
LogPath: b.Key,
EntriesNum: b.RowNum,
TimestampFrom: c.tsFrom,
TimestampTo: c.tsTo,
})
}
// reset writers
c.resetWriters()
return nil
}
func (c *CompositeBinlogRecordWriter) writeStats() error {
if c.pkstats == nil {
return nil
}
id, err := c.allocator.AllocOne()
if err != nil {
return err
}
codec := NewInsertCodecWithSchema(&etcdpb.CollectionMeta{
ID: c.collectionID,
Schema: c.schema,
})
sblob, err := codec.SerializePkStats(c.pkstats, c.rowNum)
if err != nil {
return err
}
sblob.Key = metautil.BuildStatsLogPath(c.rootPath,
c.collectionID, c.partitionID, c.segmentID, c.pkstats.FieldID, id)
if err := c.BlobsWriter([]*Blob{sblob}); err != nil {
return err
}
c.statsLog = &datapb.FieldBinlog{
FieldID: c.pkstats.FieldID,
Binlogs: []*datapb.Binlog{
{
LogSize: int64(len(sblob.GetValue())),
MemorySize: int64(len(sblob.GetValue())),
LogPath: sblob.Key,
EntriesNum: c.rowNum,
},
},
}
return nil
}
func (c *CompositeBinlogRecordWriter) writeBm25Stats() error {
if len(c.bm25Stats) == 0 {
return nil
}
id, _, err := c.allocator.Alloc(uint32(len(c.bm25Stats)))
if err != nil {
return err
}
if c.bm25StatsLog == nil {
c.bm25StatsLog = make(map[FieldID]*datapb.FieldBinlog)
}
for fid, stats := range c.bm25Stats {
bytes, err := stats.Serialize()
if err != nil {
return err
}
key := metautil.BuildBm25LogPath(c.rootPath,
c.collectionID, c.partitionID, c.segmentID, fid, id)
blob := &Blob{
Key: key,
Value: bytes,
RowNum: stats.NumRow(),
MemorySize: int64(len(bytes)),
}
if err := c.BlobsWriter([]*Blob{blob}); err != nil {
return err
}
fieldLog := &datapb.FieldBinlog{
FieldID: fid,
Binlogs: []*datapb.Binlog{
{
LogSize: int64(len(blob.GetValue())),
MemorySize: int64(len(blob.GetValue())),
LogPath: key,
EntriesNum: c.rowNum,
},
},
}
c.bm25StatsLog[fid] = fieldLog
id++
}
return nil
}
func (c *CompositeBinlogRecordWriter) GetLogs() (
fieldBinlogs map[FieldID]*datapb.FieldBinlog,
statsLog *datapb.FieldBinlog,
bm25StatsLog map[FieldID]*datapb.FieldBinlog,
) {
return c.fieldBinlogs, c.statsLog, c.bm25StatsLog
}
func (c *CompositeBinlogRecordWriter) GetRowNum() int64 {
return c.rowNum
}
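// newCompositeBinlogRecordWriter assembles the chunked insert-binlog writer: it tracks
// primary-key and BM25 statistics while writing, flushes a chunk of per-field binlogs
// through blobsWriter whenever the written size reaches chunkSize, and writes the stats
// logs on Close. A lifecycle sketch, assuming records is a slice of Records and the
// other arguments are provided by the caller:
//
//	w, err := newCompositeBinlogRecordWriter(collectionID, partitionID, segmentID,
//		collSchema, blobsWriter, alloc, chunkSize, rootPath, maxRowNum)
//	if err != nil {
//		return err
//	}
//	for _, rec := range records {
//		if err := w.Write(rec); err != nil {
//			return err
//		}
//	}
//	if err := w.Close(); err != nil {
//		return err
//	}
//	fieldBinlogs, statsLog, bm25StatsLog := w.GetLogs()
//	_, _, _ = fieldBinlogs, statsLog, bm25StatsLog // hand the results back to the caller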
func newCompositeBinlogRecordWriter(collectionID, partitionID, segmentID UniqueID, schema *schemapb.CollectionSchema,
blobsWriter ChunkedBlobsWriter, allocator allocator.Interface, chunkSize uint64, rootPath string, maxRowNum int64,
) (*CompositeBinlogRecordWriter, error) {
pkField, err := typeutil.GetPrimaryFieldSchema(schema)
if err != nil {
log.Warn("failed to get pk field from schema")
return nil, err
}
stats, err := NewPrimaryKeyStats(pkField.GetFieldID(), int64(pkField.GetDataType()), maxRowNum)
if err != nil {
return nil, err
}
bm25FieldIDs := lo.FilterMap(schema.GetFunctions(), func(function *schemapb.FunctionSchema, _ int) (int64, bool) {
if function.GetType() == schemapb.FunctionType_BM25 {
return function.GetOutputFieldIds()[0], true
}
return 0, false
})
bm25Stats := make(map[int64]*BM25Stats, len(bm25FieldIDs))
for _, fid := range bm25FieldIDs {
bm25Stats[fid] = NewBM25Stats()
}
return &CompositeBinlogRecordWriter{
collectionID: collectionID,
partitionID: partitionID,
segmentID: segmentID,
schema: schema,
BlobsWriter: blobsWriter,
allocator: allocator,
chunkSize: chunkSize,
rootPath: rootPath,
maxRowNum: maxRowNum,
pkstats: stats,
bm25Stats: bm25Stats,
}, nil
}
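// NewChunkedBinlogSerializeWriter exposes the chunked writer at the *Value level:
// values are serialized into arrow records in batches of batchSize and flushed as
// binlog chunks of roughly chunkSize bytes. A construction sketch; writing goes
// through the SerializeWriter API defined in serde.go, and the arguments are assumed
// to be supplied by the caller:
//
//	sw, err := NewChunkedBinlogSerializeWriter(collectionID, partitionID, segmentID,
//		collSchema, blobsWriter, alloc, chunkSize, rootPath, maxRowNum, 128)
//	if err != nil {
//		return err
//	}
//	_ = sw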
func NewChunkedBinlogSerializeWriter(collectionID, partitionID, segmentID UniqueID, schema *schemapb.CollectionSchema,
blobsWriter ChunkedBlobsWriter, allocator allocator.Interface, chunkSize uint64, rootPath string, maxRowNum int64, batchSize int,
) (*SerializeWriter[*Value], error) {
rw, err := newCompositeBinlogRecordWriter(collectionID, partitionID, segmentID, schema, blobsWriter, allocator, chunkSize, rootPath, maxRowNum)
if err != nil {
return nil, err
}
return NewSerializeRecordWriter[*Value](rw, func(v []*Value) (Record, error) {
return ValueSerializer(v, schema.Fields)
}, batchSize), nil
}
func NewBinlogSerializeWriter(schema *schemapb.CollectionSchema, partitionID, segmentID UniqueID,
eventWriters map[FieldID]*BinlogStreamWriter, batchSize int,
) (*SerializeWriter[*Value], error) {
rws := make(map[FieldID]RecordWriter, len(eventWriters))
for fid := range eventWriters {
w := eventWriters[fid]
rw, err := w.GetRecordWriter()
if err != nil {
return nil, err
}
rws[fid] = rw
}
compositeRecordWriter := NewCompositeRecordWriter(rws)
return NewSerializeRecordWriter[*Value](compositeRecordWriter, func(v []*Value) (Record, error) {
return ValueSerializer(v, schema.Fields)
}, batchSize), nil
}
type DeltalogStreamWriter struct {
collectionID UniqueID
partitionID UniqueID
segmentID UniqueID
fieldSchema *schemapb.FieldSchema
buf bytes.Buffer
rw *singleFieldRecordWriter
}
func (dsw *DeltalogStreamWriter) GetRecordWriter() (RecordWriter, error) {
if dsw.rw != nil {
return dsw.rw, nil
}
dim, _ := typeutil.GetDim(dsw.fieldSchema)
rw, err := newSingleFieldRecordWriter(dsw.fieldSchema.FieldID, arrow.Field{
Name: dsw.fieldSchema.Name,
Type: serdeMap[dsw.fieldSchema.DataType].arrowType(int(dim)),
Nullable: false,
}, &dsw.buf, WithRecordWriterProps(getFieldWriterProps(dsw.fieldSchema)))
if err != nil {
return nil, err
}
dsw.rw = rw
return rw, nil
}
func (dsw *DeltalogStreamWriter) Finalize() (*Blob, error) {
if dsw.rw == nil {
return nil, io.ErrUnexpectedEOF
}
dsw.rw.Close()
var b bytes.Buffer
if err := dsw.writeDeltalogHeaders(&b); err != nil {
return nil, err
}
if _, err := b.Write(dsw.buf.Bytes()); err != nil {
return nil, err
}
return &Blob{
Value: b.Bytes(),
RowNum: int64(dsw.rw.numRows),
MemorySize: int64(dsw.rw.writtenUncompressed),
}, nil
}
func (dsw *DeltalogStreamWriter) writeDeltalogHeaders(w io.Writer) error {
// Write magic number
if err := binary.Write(w, common.Endian, MagicNumber); err != nil {
return err
}
// Write descriptor
de := NewBaseDescriptorEvent(dsw.collectionID, dsw.partitionID, dsw.segmentID)
de.PayloadDataType = dsw.fieldSchema.DataType
de.descriptorEventData.AddExtra(originalSizeKey, strconv.Itoa(int(dsw.rw.writtenUncompressed)))
if err := de.Write(w); err != nil {
return err
}
// Write event header
eh := newEventHeader(DeleteEventType)
// Write event data
ev := newDeleteEventData()
ev.StartTimestamp = 1
ev.EndTimestamp = 1
eh.EventLength = int32(dsw.buf.Len()) + eh.GetMemoryUsageInBytes() + int32(binary.Size(ev))
// eh.NextPosition = eh.EventLength + w.Offset()
if err := eh.Write(w); err != nil {
return err
}
if err := ev.WriteEventData(w); err != nil {
return err
}
return nil
}
func newDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID) *DeltalogStreamWriter {
return &DeltalogStreamWriter{
collectionID: collectionID,
partitionID: partitionID,
segmentID: segmentID,
fieldSchema: &schemapb.FieldSchema{
FieldID: common.RowIDField,
Name: "delta",
DataType: schemapb.DataType_String,
},
}
}
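// newDeltalogSerializeWriter builds the single-column ("json") deltalog write path:
// each DeleteLog is JSON-encoded into one string cell of a one-field arrow record.
// A sketch of how it pairs with its stream writer; writing and flushing go through
// the SerializeWriter API defined in serde.go:
//
//	ew := newDeltalogStreamWriter(collectionID, partitionID, segmentID)
//	sw, err := newDeltalogSerializeWriter(ew, 128)
//	if err != nil {
//		return err
//	}
//	// ... write DeleteLogs through sw and flush it, then:
//	blob, err := ew.Finalize()
//	if err != nil {
//		return err
//	}
//	_ = blob // persist the finalized deltalog blob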
func newDeltalogSerializeWriter(eventWriter *DeltalogStreamWriter, batchSize int) (*SerializeWriter[*DeleteLog], error) {
rws := make(map[FieldID]RecordWriter, 1)
rw, err := eventWriter.GetRecordWriter()
if err != nil {
return nil, err
}
rws[0] = rw
compositeRecordWriter := NewCompositeRecordWriter(rws)
return NewSerializeRecordWriter[*DeleteLog](compositeRecordWriter, func(v []*DeleteLog) (Record, error) {
builder := array.NewBuilder(memory.DefaultAllocator, arrow.BinaryTypes.String)
for _, vv := range v {
strVal, err := json.Marshal(vv)
if err != nil {
return nil, err
}
builder.AppendValueFromString(string(strVal))
}
arr := []arrow.Array{builder.NewArray()}
field := []arrow.Field{{
Name: "delta",
Type: arrow.BinaryTypes.String,
Nullable: false,
}}
field2Col := map[FieldID]int{
0: 0,
}
return NewSimpleArrowRecord(array.NewRecord(arrow.NewSchema(field, nil), arr, int64(len(v))), field2Col), nil
}, batchSize), nil
}
var _ RecordReader = (*simpleArrowRecordReader)(nil)
type simpleArrowRecordReader struct {
blobs []*Blob
blobPos int
rr array.RecordReader
closer func()
r simpleArrowRecord
}
func (crr *simpleArrowRecordReader) iterateNextBatch() error {
if crr.closer != nil {
crr.closer()
}
crr.blobPos++
if crr.blobPos >= len(crr.blobs) {
return io.EOF
}
reader, err := NewBinlogReader(crr.blobs[crr.blobPos].Value)
if err != nil {
return err
}
er, err := reader.NextEventReader()
if err != nil {
return err
}
rr, err := er.GetArrowRecordReader()
if err != nil {
return err
}
crr.rr = rr
crr.closer = func() {
crr.rr.Release()
er.Close()
reader.Close()
}
return nil
}
func (crr *simpleArrowRecordReader) Next() (Record, error) {
if crr.rr == nil {
if len(crr.blobs) == 0 {
return nil, io.EOF
}
crr.blobPos = -1
crr.r = simpleArrowRecord{
field2Col: make(map[FieldID]int),
}
if err := crr.iterateNextBatch(); err != nil {
return nil, err
}
}
composeRecord := func() bool {
if ok := crr.rr.Next(); !ok {
return false
}
record := crr.rr.Record()
for i := range record.Schema().Fields() {
crr.r.field2Col[FieldID(i)] = i
}
crr.r.r = record
return true
}
if ok := composeRecord(); !ok {
if err := crr.iterateNextBatch(); err != nil {
return nil, err
}
if ok := composeRecord(); !ok {
return nil, io.EOF
}
}
return &crr.r, nil
}
func (crr *simpleArrowRecordReader) Close() error {
if crr.closer != nil {
crr.closer()
}
return nil
}
func newSimpleArrowRecordReader(blobs []*Blob) (*simpleArrowRecordReader, error) {
return &simpleArrowRecordReader{
blobs: blobs,
}, nil
}
func newMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID, pkType schemapb.DataType) *MultiFieldDeltalogStreamWriter {
return &MultiFieldDeltalogStreamWriter{
collectionID: collectionID,
partitionID: partitionID,
segmentID: segmentID,
pkType: pkType,
}
}
type MultiFieldDeltalogStreamWriter struct {
collectionID UniqueID
partitionID UniqueID
segmentID UniqueID
pkType schemapb.DataType
buf bytes.Buffer
rw *multiFieldRecordWriter
}
func (dsw *MultiFieldDeltalogStreamWriter) GetRecordWriter() (RecordWriter, error) {
if dsw.rw != nil {
return dsw.rw, nil
}
fieldIds := []FieldID{common.RowIDField, common.TimeStampField} // Not used.
fields := []arrow.Field{
{
Name: "pk",
Type: serdeMap[dsw.pkType].arrowType(0),
Nullable: false,
},
{
Name: "ts",
Type: arrow.PrimitiveTypes.Int64,
Nullable: false,
},
}
rw, err := newMultiFieldRecordWriter(fieldIds, fields, &dsw.buf)
if err != nil {
return nil, err
}
dsw.rw = rw
return rw, nil
}
func (dsw *MultiFieldDeltalogStreamWriter) Finalize() (*Blob, error) {
if dsw.rw == nil {
return nil, io.ErrUnexpectedEOF
}
dsw.rw.Close()
var b bytes.Buffer
if err := dsw.writeDeltalogHeaders(&b); err != nil {
return nil, err
}
if _, err := b.Write(dsw.buf.Bytes()); err != nil {
return nil, err
}
return &Blob{
Value: b.Bytes(),
RowNum: int64(dsw.rw.numRows),
MemorySize: int64(dsw.rw.writtenUncompressed),
}, nil
}
func (dsw *MultiFieldDeltalogStreamWriter) writeDeltalogHeaders(w io.Writer) error {
// Write magic number
if err := binary.Write(w, common.Endian, MagicNumber); err != nil {
return err
}
// Write descriptor
de := NewBaseDescriptorEvent(dsw.collectionID, dsw.partitionID, dsw.segmentID)
de.PayloadDataType = schemapb.DataType_Int64
de.descriptorEventData.AddExtra(originalSizeKey, strconv.Itoa(int(dsw.rw.writtenUncompressed)))
de.descriptorEventData.AddExtra(version, MultiField)
if err := de.Write(w); err != nil {
return err
}
// Write event header
eh := newEventHeader(DeleteEventType)
// Write event data
ev := newDeleteEventData()
ev.StartTimestamp = 1
ev.EndTimestamp = 1
eh.EventLength = int32(dsw.buf.Len()) + eh.GetMemoryUsageInBytes() + int32(binary.Size(ev))
// eh.NextPosition = eh.EventLength + w.Offset()
if err := eh.Write(w); err != nil {
return err
}
if err := ev.WriteEventData(w); err != nil {
return err
}
return nil
}
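// newDeltalogMultiFieldWriter builds the two-column ("parquet") deltalog write path:
// primary keys and timestamps are written as separate arrow columns rather than JSON
// strings, and the descriptor event is tagged with the MultiField version extra so
// readers can detect the format. Construction mirrors the single-field path; the
// int64 pk type below is an assumption for the sketch:
//
//	ew := newMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID, schemapb.DataType_Int64)
//	sw, err := newDeltalogMultiFieldWriter(ew, 128)
//	if err != nil {
//		return err
//	}
//	_ = sw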
func newDeltalogMultiFieldWriter(eventWriter *MultiFieldDeltalogStreamWriter, batchSize int) (*SerializeWriter[*DeleteLog], error) {
rw, err := eventWriter.GetRecordWriter()
if err != nil {
return nil, err
}
return NewSerializeRecordWriter[*DeleteLog](rw, func(v []*DeleteLog) (Record, error) {
fields := []arrow.Field{
{
Name: "pk",
Type: serdeMap[schemapb.DataType(v[0].PkType)].arrowType(0),
Nullable: false,
},
{
Name: "ts",
Type: arrow.PrimitiveTypes.Int64,
Nullable: false,
},
}
arrowSchema := arrow.NewSchema(fields, nil)
builder := array.NewRecordBuilder(memory.DefaultAllocator, arrowSchema)
defer builder.Release()
pkType := schemapb.DataType(v[0].PkType)
switch pkType {
case schemapb.DataType_Int64:
pb := builder.Field(0).(*array.Int64Builder)
for _, vv := range v {
pk := vv.Pk.GetValue().(int64)
pb.Append(pk)
}
case schemapb.DataType_VarChar:
pb := builder.Field(0).(*array.StringBuilder)
for _, vv := range v {
pk := vv.Pk.GetValue().(string)
pb.Append(pk)
}
default:
return nil, fmt.Errorf("unexpected pk type %v", v[0].PkType)
}
for _, vv := range v {
builder.Field(1).(*array.Int64Builder).Append(int64(vv.Ts))
}
arr := []arrow.Array{builder.Field(0).NewArray(), builder.Field(1).NewArray()}
field2Col := map[FieldID]int{
common.RowIDField: 0,
common.TimeStampField: 1,
}
return NewSimpleArrowRecord(array.NewRecord(arrowSchema, arr, int64(len(v))), field2Col), nil
}, batchSize), nil
}
func newDeltalogMultiFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
reader, err := newSimpleArrowRecordReader(blobs)
if err != nil {
return nil, err
}
return NewDeserializeReader(reader, func(r Record, v []*DeleteLog) error {
rec, ok := r.(*simpleArrowRecord)
if !ok {
return fmt.Errorf("can not cast to simple arrow record")
}
fields := rec.r.Schema().Fields()
switch fields[0].Type.ID() {
case arrow.INT64:
arr := r.Column(0).(*array.Int64)
for j := 0; j < r.Len(); j++ {
if v[j] == nil {
v[j] = &DeleteLog{}
}
v[j].Pk = NewInt64PrimaryKey(arr.Value(j))
}
case arrow.STRING:
arr := r.Column(0).(*array.String)
for j := 0; j < r.Len(); j++ {
if v[j] == nil {
v[j] = &DeleteLog{}
}
v[j].Pk = NewVarCharPrimaryKey(arr.Value(j))
}
default:
return fmt.Errorf("unexpected delta log pkType %v", fields[0].Type.Name())
}
arr := r.Column(1).(*array.Int64)
for j := 0; j < r.Len(); j++ {
v[j].Ts = uint64(arr.Value(j))
}
return nil
}), nil
}
// newDeltalogDeserializeReader is the entry point for the delta log reader.
// It dispatches to newDeltalogOneFieldReader, which handles the existing log format with
// only one column per log file, and to newDeltalogMultiFieldReader, which handles the new
// format that stores multiple fields in a log file.
func newDeltalogDeserializeReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
if supportMultiFieldFormat(blobs) {
return newDeltalogMultiFieldReader(blobs)
}
return newDeltalogOneFieldReader(blobs)
}
// supportMultiFieldFormat checks the delta log descriptor event to determine whether
// the blob uses the new format that stores the pk and ts columns separately.
func supportMultiFieldFormat(blobs []*Blob) bool {
if len(blobs) > 0 {
reader, err := NewBinlogReader(blobs[0].Value)
if err != nil {
return false
}
defer reader.Close()
		ver := reader.descriptorEventData.Extras[version]
		return ver != nil && ver.(string) == MultiField
}
return false
}
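// CreateDeltalogReader and CreateDeltalogWriter (below) form the public deltalog round
// trip: the writer format ("json" or "parquet") is chosen by paramtable's
// DataNodeCfg.DeltalogFormat, and the reader detects the format from the blob's
// descriptor event. A sketch, assuming the IDs are provided by the caller and DeleteLogs
// are written through the SerializeWriter API defined in serde.go:
//
//	sw, finalize, err := CreateDeltalogWriter(collectionID, partitionID, segmentID, schemapb.DataType_Int64, 128)
//	if err != nil {
//		return err
//	}
//	// ... write DeleteLogs through sw and flush it, then:
//	blob, err := finalize()
//	if err != nil {
//		return err
//	}
//	reader, err := CreateDeltalogReader([]*Blob{blob})
//	if err != nil {
//		return err
//	}
//	_ = reader // iterate DeleteLogs with the DeserializeReader API (serde.go)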
func CreateDeltalogReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
return newDeltalogDeserializeReader(blobs)
}
func CreateDeltalogWriter(collectionID, partitionID, segmentID UniqueID, pkType schemapb.DataType, batchSize int) (*SerializeWriter[*DeleteLog], func() (*Blob, error), error) {
format := paramtable.Get().DataNodeCfg.DeltalogFormat.GetValue()
if format == "json" {
eventWriter := newDeltalogStreamWriter(collectionID, partitionID, segmentID)
writer, err := newDeltalogSerializeWriter(eventWriter, batchSize)
return writer, eventWriter.Finalize, err
} else if format == "parquet" {
eventWriter := newMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID, pkType)
writer, err := newDeltalogMultiFieldWriter(eventWriter, batchSize)
return writer, eventWriter.Finalize, err
}
return nil, nil, merr.WrapErrParameterInvalid("unsupported deltalog format %s", format)
}