enhance: add delta log stream new format reader and writer (#34116)

issue: #34123

Benchmark case: The benchmark runs the Go benchmark function
`BenchmarkDeltalogFormat`, included in the files changed. It tests
the performance of serializing and deserializing a dataset of 10 million
delete logs in the two data formats.

Metrics: The benchmarks measure the average time taken per operation
(ns/op), memory allocated per operation (MB/op), and the number of
memory allocations per operation (allocs/op).
| Test Name | Avg Time (ns/op) | Time Comparison | Memory Allocation (MB/op) | Memory Comparison | Allocation Count (allocs/op) | Allocation Comparison |
|----------------------------------|------------------|-----------------|---------------------------|-------------------|------------------------------|------------------------|
| one_string_format_reader | 2,781,990,000 | Baseline | 2,422 | Baseline | 20,336,539 | Baseline |
| pk_ts_separate_format_reader | 480,682,639 | -82.72% | 1,765 | -27.14% | 20,396,958 | +0.30% |
| one_string_format_writer | 5,483,436,041 | Baseline | 13,900 | Baseline | 70,057,473 | Baseline |
| pk_and_ts_separate_format_writer | 798,591,584 | -85.43% | 2,178 | -84.34% | 30,270,488 | -56.78% |

Both read and write operations show significant improvements in speed
and memory allocation.
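
For reference, figures of this shape come from Go's built-in benchmark tooling, e.g. `go test -run='^$' -bench=BenchmarkDeltalog -benchmem ./internal/storage/` (the package path here is an assumption); `-benchmem` adds the per-operation allocation columns (B/op and allocs/op) to the default ns/op timing.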

Signed-off-by: shaoting-huang <shaoting.huang@zilliz.com>
6 changed files with 605 additions and 42 deletions


@ -36,6 +36,11 @@ const (
nullableKey = "nullable"
)
const version = "version"
// MULTI_FIELD marks the new multi-field format; it is set as the version extra when a log file stores more than one field
const MULTI_FIELD = "MULTI_FIELD"
type descriptorEventData struct {
DescriptorEventDataFixPart
ExtraLength int32


@ -212,6 +212,16 @@ func newDescriptorEvent() *descriptorEvent {
}
}
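// NewBaseDescriptorEvent builds a descriptor event pre-filled with the given
// collection, partition, and segment IDs and zeroed start/end timestamps.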
func NewBaseDescriptorEvent(collectionID int64, partitionID int64, segmentID int64) *descriptorEvent {
de := newDescriptorEvent()
de.CollectionID = collectionID
de.PartitionID = partitionID
de.SegmentID = segmentID
de.StartTimestamp = 0
de.EndTimestamp = 0
return de
}
func newInsertEventWriter(dataType schemapb.DataType, nullable bool, dim ...int) (*insertEventWriter, error) {
var payloadWriter PayloadWriterInterface
var err error


@ -35,6 +35,7 @@ import (
type Record interface {
Schema() map[FieldID]schemapb.DataType
ArrowSchema() *arrow.Schema
Column(i FieldID) arrow.Array
Len() int
Release()
@ -83,6 +84,14 @@ func (r *compositeRecord) Schema() map[FieldID]schemapb.DataType {
return r.schema
}
func (r *compositeRecord) ArrowSchema() *arrow.Schema {
var fields []arrow.Field
for _, rec := range r.recs {
fields = append(fields, rec.Schema().Field(0))
}
return arrow.NewSchema(fields, nil)
}
type serdeEntry struct {
// arrowType returns the arrow type for the given dimension
arrowType func(int) arrow.DataType
@ -575,6 +584,10 @@ func (r *selectiveRecord) Schema() map[FieldID]schemapb.DataType {
return r.schema
}
func (r *selectiveRecord) ArrowSchema() *arrow.Schema {
return r.r.ArrowSchema()
}
func (r *selectiveRecord) Column(i FieldID) arrow.Array {
if i == r.selectedFieldId {
return r.r.Column(i)
@ -663,9 +676,10 @@ func (sfw *singleFieldRecordWriter) Close() {
func newSingleFieldRecordWriter(fieldId FieldID, field arrow.Field, writer io.Writer) (*singleFieldRecordWriter, error) {
schema := arrow.NewSchema([]arrow.Field{field}, nil)
// use writer properties as same as payload writer's for now
fw, err := pqarrow.NewFileWriter(schema, writer,
parquet.NewWriterProperties(
parquet.WithMaxRowGroupLength(math.MaxInt64), // No additional grouping for now.
parquet.WithCompression(compress.Codecs.Zstd),
parquet.WithCompressionLevel(3)),
pqarrow.DefaultWriterProps())
@ -679,6 +693,46 @@ func newSingleFieldRecordWriter(fieldId FieldID, field arrow.Field, writer io.Wr
}, nil
}
var _ RecordWriter = (*multiFieldRecordWriter)(nil)
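// multiFieldRecordWriter writes records that span multiple fields into a
// single parquet file, one arrow column per field.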
type multiFieldRecordWriter struct {
fw *pqarrow.FileWriter
fieldIds []FieldID
schema *arrow.Schema
numRows int
}
func (mfw *multiFieldRecordWriter) Write(r Record) error {
mfw.numRows += r.Len()
columns := make([]arrow.Array, len(mfw.fieldIds))
for i, fieldId := range mfw.fieldIds {
columns[i] = r.Column(fieldId)
}
rec := array.NewRecord(mfw.schema, columns, int64(r.Len()))
defer rec.Release()
return mfw.fw.WriteBuffered(rec)
}
func (mfw *multiFieldRecordWriter) Close() {
mfw.fw.Close()
}
func newMultiFieldRecordWriter(fieldIds []FieldID, fields []arrow.Field, writer io.Writer) (*multiFieldRecordWriter, error) {
schema := arrow.NewSchema(fields, nil)
fw, err := pqarrow.NewFileWriter(schema, writer,
parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(math.MaxInt64)), // No additional grouping for now.
pqarrow.DefaultWriterProps())
if err != nil {
return nil, err
}
return &multiFieldRecordWriter{
fw: fw,
fieldIds: fieldIds,
schema: schema,
}, nil
}
type SerializeWriter[T any] struct {
rw RecordWriter
serializer Serializer[T]
@ -773,6 +827,10 @@ func (sr *simpleArrowRecord) Release() {
sr.r.Release()
}
func (sr *simpleArrowRecord) ArrowSchema() *arrow.Schema {
return sr.r.Schema()
}
func newSimpleArrowRecord(r arrow.Record, schema map[FieldID]schemapb.DataType, field2Col map[FieldID]int) *simpleArrowRecord {
return &simpleArrowRecord{
r: r,


@ -28,7 +28,6 @@ import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/util/merr"
@ -231,7 +230,7 @@ func NewBinlogDeserializeReader(blobs []*Blob, PKfieldID UniqueID) (*Deserialize
}), nil
}
func NewDeltalogDeserializeReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
func NewDeltalogOneFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
reader, err := newCompositeBinlogRecordReader(blobs)
if err != nil {
return nil, err
@ -314,14 +313,9 @@ func (bsw *BinlogStreamWriter) writeBinlogHeaders(w io.Writer) error {
return err
}
// Write descriptor
de := newDescriptorEvent()
de := NewBaseDescriptorEvent(bsw.collectionID, bsw.partitionID, bsw.segmentID)
de.PayloadDataType = bsw.fieldSchema.DataType
de.CollectionID = bsw.collectionID
de.PartitionID = bsw.partitionID
de.SegmentID = bsw.segmentID
de.FieldID = bsw.fieldSchema.FieldID
de.StartTimestamp = 0
de.EndTimestamp = 0
de.descriptorEventData.AddExtra(originalSizeKey, strconv.Itoa(bsw.memorySize))
if err := de.Write(w); err != nil {
return err
@ -420,6 +414,7 @@ type DeltalogStreamWriter struct {
collectionID UniqueID
partitionID UniqueID
segmentID UniqueID
fieldSchema *schemapb.FieldSchema
memorySize int // To be updated on the fly
buf bytes.Buffer
@ -430,10 +425,10 @@ func (dsw *DeltalogStreamWriter) GetRecordWriter() (RecordWriter, error) {
if dsw.rw != nil {
return dsw.rw, nil
}
rw, err := newSingleFieldRecordWriter(0, arrow.Field{
Name: "delta",
Type: arrow.BinaryTypes.String,
dim, _ := typeutil.GetDim(dsw.fieldSchema)
rw, err := newSingleFieldRecordWriter(dsw.fieldSchema.FieldID, arrow.Field{
Name: dsw.fieldSchema.Name,
Type: serdeMap[dsw.fieldSchema.DataType].arrowType(int(dim)),
Nullable: false,
}, &dsw.buf)
if err != nil {
@ -469,13 +464,8 @@ func (dsw *DeltalogStreamWriter) writeDeltalogHeaders(w io.Writer) error {
return err
}
// Write descriptor
de := newDescriptorEvent()
de.PayloadDataType = schemapb.DataType_String
de.CollectionID = dsw.collectionID
de.PartitionID = dsw.partitionID
de.SegmentID = dsw.segmentID
de.StartTimestamp = 0
de.EndTimestamp = 0
de := NewBaseDescriptorEvent(dsw.collectionID, dsw.partitionID, dsw.segmentID)
de.PayloadDataType = dsw.fieldSchema.DataType
de.descriptorEventData.AddExtra(originalSizeKey, strconv.Itoa(dsw.memorySize))
if err := de.Write(w); err != nil {
return err
@ -502,6 +492,11 @@ func NewDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID) *Del
collectionID: collectionID,
partitionID: partitionID,
segmentID: segmentID,
fieldSchema: &schemapb.FieldSchema{
FieldID: common.RowIDField,
Name: "delta",
DataType: schemapb.DataType_String,
},
}
}
@ -542,3 +537,326 @@ func NewDeltalogSerializeWriter(partitionID, segmentID UniqueID, eventWriter *De
return newSimpleArrowRecord(array.NewRecord(arrow.NewSchema(field, nil), arr, int64(len(v))), schema, field2Col), memorySize, nil
}, batchSize), nil
}
var _ RecordReader = (*simpleArrowRecordReader)(nil)
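// simpleArrowRecordReader iterates the given blobs in order, surfacing each
// underlying arrow record batch through the Record interface.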
type simpleArrowRecordReader struct {
blobs []*Blob
blobPos int
rr array.RecordReader
closer func()
r simpleArrowRecord
}
func (crr *simpleArrowRecordReader) iterateNextBatch() error {
if crr.closer != nil {
crr.closer()
}
crr.blobPos++
if crr.blobPos >= len(crr.blobs) {
return io.EOF
}
reader, err := NewBinlogReader(crr.blobs[crr.blobPos].Value)
if err != nil {
return err
}
er, err := reader.NextEventReader()
if err != nil {
return err
}
rr, err := er.GetArrowRecordReader()
if err != nil {
return err
}
crr.rr = rr
crr.closer = func() {
crr.rr.Release()
er.Close()
reader.Close()
}
return nil
}
func (crr *simpleArrowRecordReader) Next() error {
if crr.rr == nil {
if len(crr.blobs) == 0 {
return io.EOF
}
crr.blobPos = -1
crr.r = simpleArrowRecord{
schema: make(map[FieldID]schemapb.DataType),
field2Col: make(map[FieldID]int),
}
if err := crr.iterateNextBatch(); err != nil {
return err
}
}
composeRecord := func() bool {
if ok := crr.rr.Next(); !ok {
return false
}
record := crr.rr.Record()
for i := range record.Schema().Fields() {
crr.r.field2Col[FieldID(i)] = i
}
crr.r.r = record
return true
}
if ok := composeRecord(); !ok {
if err := crr.iterateNextBatch(); err != nil {
return err
}
if ok := composeRecord(); !ok {
return io.EOF
}
}
return nil
}
func (crr *simpleArrowRecordReader) Record() Record {
return &crr.r
}
func (crr *simpleArrowRecordReader) Close() {
if crr.closer != nil {
crr.closer()
}
}
func newSimpleArrowRecordReader(blobs []*Blob) (*simpleArrowRecordReader, error) {
return &simpleArrowRecordReader{
blobs: blobs,
}, nil
}
func NewMultiFieldDeltalogStreamWriter(collectionID, partitionID, segmentID UniqueID, schema []*schemapb.FieldSchema) *MultiFieldDeltalogStreamWriter {
return &MultiFieldDeltalogStreamWriter{
collectionID: collectionID,
partitionID: partitionID,
segmentID: segmentID,
fieldSchemas: schema,
}
}
type MultiFieldDeltalogStreamWriter struct {
collectionID UniqueID
partitionID UniqueID
segmentID UniqueID
fieldSchemas []*schemapb.FieldSchema
memorySize int // To be updated on the fly
buf bytes.Buffer
rw *multiFieldRecordWriter
}
func (dsw *MultiFieldDeltalogStreamWriter) GetRecordWriter() (RecordWriter, error) {
if dsw.rw != nil {
return dsw.rw, nil
}
fieldIds := make([]FieldID, len(dsw.fieldSchemas))
fields := make([]arrow.Field, len(dsw.fieldSchemas))
for i, fieldSchema := range dsw.fieldSchemas {
fieldIds[i] = fieldSchema.FieldID
dim, _ := typeutil.GetDim(fieldSchema)
fields[i] = arrow.Field{
Name: fieldSchema.Name,
Type: serdeMap[fieldSchema.DataType].arrowType(int(dim)),
Nullable: false, // No nullable check here.
}
}
rw, err := newMultiFieldRecordWriter(fieldIds, fields, &dsw.buf)
if err != nil {
return nil, err
}
dsw.rw = rw
return rw, nil
}
func (dsw *MultiFieldDeltalogStreamWriter) Finalize() (*Blob, error) {
if dsw.rw == nil {
return nil, io.ErrUnexpectedEOF
}
dsw.rw.Close()
var b bytes.Buffer
if err := dsw.writeDeltalogHeaders(&b); err != nil {
return nil, err
}
if _, err := b.Write(dsw.buf.Bytes()); err != nil {
return nil, err
}
return &Blob{
Value: b.Bytes(),
RowNum: int64(dsw.rw.numRows),
MemorySize: int64(dsw.memorySize),
}, nil
}
func (dsw *MultiFieldDeltalogStreamWriter) writeDeltalogHeaders(w io.Writer) error {
// Write magic number
if err := binary.Write(w, common.Endian, MagicNumber); err != nil {
return err
}
// Write descriptor
de := NewBaseDescriptorEvent(dsw.collectionID, dsw.partitionID, dsw.segmentID)
de.PayloadDataType = schemapb.DataType_Int64
de.descriptorEventData.AddExtra(originalSizeKey, strconv.Itoa(dsw.memorySize))
de.descriptorEventData.AddExtra(version, MULTI_FIELD)
if err := de.Write(w); err != nil {
return err
}
// Write event header
eh := newEventHeader(DeleteEventType)
// Write event data
ev := newDeleteEventData()
ev.StartTimestamp = 1
ev.EndTimestamp = 1
eh.EventLength = int32(dsw.buf.Len()) + eh.GetMemoryUsageInBytes() + int32(binary.Size(ev))
// eh.NextPosition = eh.EventLength + w.Offset()
if err := eh.Write(w); err != nil {
return err
}
if err := ev.WriteEventData(w); err != nil {
return err
}
return nil
}
func NewDeltalogMultiFieldWriter(partitionID, segmentID UniqueID, eventWriter *MultiFieldDeltalogStreamWriter, batchSize int,
) (*SerializeWriter[*DeleteLog], error) {
rw, err := eventWriter.GetRecordWriter()
if err != nil {
return nil, err
}
return NewSerializeRecordWriter[*DeleteLog](rw, func(v []*DeleteLog) (Record, uint64, error) {
fields := []arrow.Field{
{
Name: "pk",
Type: serdeMap[schemapb.DataType(v[0].PkType)].arrowType(0),
Nullable: false,
},
{
Name: "ts",
Type: arrow.PrimitiveTypes.Int64,
Nullable: false,
},
}
arrowSchema := arrow.NewSchema(fields, nil)
builder := array.NewRecordBuilder(memory.DefaultAllocator, arrowSchema)
defer builder.Release()
var memorySize uint64
pkType := schemapb.DataType(v[0].PkType)
switch pkType {
case schemapb.DataType_Int64:
pb := builder.Field(0).(*array.Int64Builder)
for _, vv := range v {
pk := vv.Pk.GetValue().(int64)
pb.Append(pk)
memorySize += uint64(binary.Size(pk))
}
case schemapb.DataType_VarChar:
pb := builder.Field(0).(*array.StringBuilder)
for _, vv := range v {
pk := vv.Pk.GetValue().(string)
pb.Append(pk)
memorySize += uint64(binary.Size(pk))
}
default:
return nil, 0, fmt.Errorf("unexpected pk type %v", v[0].PkType)
}
for _, vv := range v {
builder.Field(1).(*array.Int64Builder).Append(int64(vv.Ts))
memorySize += uint64(binary.Size(vv.Ts))
}
arr := []arrow.Array{builder.Field(0).NewArray(), builder.Field(1).NewArray()}
field2Col := map[FieldID]int{
common.RowIDField: 0,
common.TimeStampField: 1,
}
schema := map[FieldID]schemapb.DataType{
common.RowIDField: pkType,
common.TimeStampField: schemapb.DataType_Int64,
}
return newSimpleArrowRecord(array.NewRecord(arrowSchema, arr, int64(len(v))), schema, field2Col), memorySize, nil
}, batchSize), nil
}
func NewDeltalogMultiFieldReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
reader, err := newSimpleArrowRecordReader(blobs)
if err != nil {
return nil, err
}
return NewDeserializeReader(reader, func(r Record, v []*DeleteLog) error {
rec, ok := r.(*simpleArrowRecord)
if !ok {
return fmt.Errorf("can not cast to simple arrow record")
}
fields := rec.r.Schema().Fields()
switch fields[0].Type.ID() {
case arrow.INT64:
arr := r.Column(0).(*array.Int64)
for j := 0; j < r.Len(); j++ {
if v[j] == nil {
v[j] = &DeleteLog{}
}
v[j].Pk = NewInt64PrimaryKey(arr.Value(j))
}
case arrow.STRING:
arr := r.Column(0).(*array.String)
for j := 0; j < r.Len(); j++ {
if v[j] == nil {
v[j] = &DeleteLog{}
}
v[j].Pk = NewVarCharPrimaryKey(arr.Value(j))
}
default:
return fmt.Errorf("unexpected delta log pkType %v", fields[0].Type.Name())
}
arr := r.Column(1).(*array.Int64)
for j := 0; j < r.Len(); j++ {
v[j].Ts = uint64(arr.Value(j))
}
return nil
}), nil
}
// NewDeltalogDeserializeReader is the entry point for the delta log reader.
// It includes NewDeltalogOneFieldReader, which uses the existing log format with only one column in a log file,
// and NewDeltalogMultiFieldReader, which uses the new format and supports multiple fields in a log file.
func NewDeltalogDeserializeReader(blobs []*Blob) (*DeserializeReader[*DeleteLog], error) {
if supportMultiFieldFormat(blobs) {
return NewDeltalogMultiFieldReader(blobs)
}
return NewDeltalogOneFieldReader(blobs)
}
// supportMultiFieldFormat checks the delta log descriptor data to see whether
// the log uses the new format, which stores the pk and ts columns separately
func supportMultiFieldFormat(blobs []*Blob) bool {
if len(blobs) > 0 {
reader, err := NewBinlogReader(blobs[0].Value)
if err != nil {
return false
}
defer reader.Close()
version := reader.descriptorEventData.Extras[version]
return version != nil && version.(string) == MULTI_FIELD
}
return false
}
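As a usage illustration for the additions in this file, here is a minimal round-trip sketch, assuming it lives in the same storage package; the wrapper function name, IDs, batch size, and sample values are hypothetical:

```go
// A minimal sketch, assuming the storage package context; the wrapper
// function, IDs, batch size, and sample values are illustrative only.
func deltalogRoundTripSketch() error {
	// Write delete logs in the new pk/ts-separate (MULTI_FIELD) format.
	eventWriter := NewMultiFieldDeltalogStreamWriter(0, 0, 0, []*schemapb.FieldSchema{
		{FieldID: common.RowIDField, Name: "pk", DataType: schemapb.DataType_Int64},
		{FieldID: common.TimeStampField, Name: "ts", DataType: schemapb.DataType_Int64},
	})
	writer, err := NewDeltalogMultiFieldWriter(0, 0, eventWriter, 1024)
	if err != nil {
		return err
	}
	if err := writer.Write(NewDeleteLog(NewInt64PrimaryKey(42), 100)); err != nil {
		return err
	}
	if err := writer.Close(); err != nil {
		return err
	}
	blob, err := eventWriter.Finalize()
	if err != nil {
		return err
	}

	// Read it back: NewDeltalogDeserializeReader detects the MULTI_FIELD
	// version extra in the descriptor event and dispatches to the new reader.
	reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
	if err != nil {
		return err
	}
	defer reader.Close()
	for reader.Next() == nil { // Next returns io.EOF when exhausted
		_ = reader.Value() // *DeleteLog with Pk and Ts populated
	}
	return nil
}
```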


@ -20,6 +20,7 @@ import (
"bytes"
"context"
"io"
"strconv"
"testing"
"github.com/apache/arrow/go/v12/arrow"
@ -40,12 +41,6 @@ func TestBinlogDeserializeReader(t *testing.T) {
defer reader.Close()
err = reader.Next()
assert.Equal(t, io.EOF, err)
// blobs := generateTestData(t, 0)
// reader, err = NewBinlogDeserializeReader(blobs, common.RowIDField)
// assert.NoError(t, err)
// err = reader.Next()
// assert.Equal(t, io.EOF, err)
})
t.Run("test deserialize", func(t *testing.T) {
@ -186,24 +181,11 @@ func TestNull(t *testing.T) {
assert.NoError(t, err)
m := make(map[FieldID]any)
for _, fs := range schema.Fields {
m[fs.FieldID] = nil
}
m[common.RowIDField] = int64(0)
m[common.TimeStampField] = int64(0)
m[10] = nil
m[11] = nil
m[12] = nil
m[13] = nil
m[14] = nil
m[15] = nil
m[16] = nil
m[17] = nil
m[18] = nil
m[19] = nil
m[101] = nil
m[102] = nil
m[103] = nil
m[104] = nil
m[105] = nil
m[106] = nil
pk, err := GenPrimaryKeyByRawData(m[common.RowIDField], schemapb.DataType_Int64)
assert.NoError(t, err)
@ -343,3 +325,161 @@ func TestDeltalogSerializeWriter(t *testing.T) {
}
})
}
func TestDeltalogPkTsSeparateFormat(t *testing.T) {
t.Run("test empty data", func(t *testing.T) {
eventWriter := NewMultiFieldDeltalogStreamWriter(0, 0, 0, nil)
writer, err := NewDeltalogMultiFieldWriter(0, 0, eventWriter, 7)
assert.NoError(t, err)
defer writer.Close()
err = writer.Close()
assert.NoError(t, err)
blob, err := eventWriter.Finalize()
assert.NoError(t, err)
assert.NotNil(t, blob)
})
testCases := []struct {
name string
pkType schemapb.DataType
assertPk func(t *testing.T, i int, value *DeleteLog)
}{
{
name: "test int64 pk",
pkType: schemapb.DataType_Int64,
assertPk: func(t *testing.T, i int, value *DeleteLog) {
assert.Equal(t, NewInt64PrimaryKey(int64(i)), value.Pk)
assert.Equal(t, uint64(i+1), value.Ts)
},
},
{
name: "test varchar pk",
pkType: schemapb.DataType_VarChar,
assertPk: func(t *testing.T, i int, value *DeleteLog) {
assert.Equal(t, NewVarCharPrimaryKey(strconv.Itoa(i)), value.Pk)
assert.Equal(t, uint64(i+1), value.Ts)
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
// serialize data
size := 10
blob, err := writeDeltalogNewFormat(size, tc.pkType, 7)
assert.NoError(t, err)
// Deserialize data
reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
assert.NoError(t, err)
defer reader.Close()
for i := 0; i < size; i++ {
err = reader.Next()
assert.NoError(t, err)
value := reader.Value()
tc.assertPk(t, i, value)
}
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
}
}
func BenchmarkDeltalogReader(b *testing.B) {
size := 1000000
blob, err := generateTestDeltalogData(size)
assert.NoError(b, err)
b.Run("one string format reader", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
readDeltaLog(size, blob)
}
})
blob, err = writeDeltalogNewFormat(size, schemapb.DataType_Int64, size)
assert.NoError(b, err)
b.Run("pk ts separate format reader", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
readDeltaLog(size, blob)
}
})
}
func BenchmarkDeltalogFormatWriter(b *testing.B) {
size := 1000000
b.Run("one string format writer", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
eventWriter := NewDeltalogStreamWriter(0, 0, 0)
writer, _ := NewDeltalogSerializeWriter(0, 0, eventWriter, size)
var value *DeleteLog
for j := 0; j < size; j++ {
value = NewDeleteLog(NewInt64PrimaryKey(int64(j)), uint64(j+1))
writer.Write(value)
}
writer.Close()
eventWriter.Finalize()
}
b.ReportAllocs()
})
b.Run("pk and ts separate format writer", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
writeDeltalogNewFormat(size, schemapb.DataType_Int64, size)
}
b.ReportAllocs()
})
}
func writeDeltalogNewFormat(size int, pkType schemapb.DataType, batchSize int) (*Blob, error) {
var err error
eventWriter := NewMultiFieldDeltalogStreamWriter(0, 0, 0, []*schemapb.FieldSchema{
{FieldID: common.RowIDField, Name: "pk", DataType: pkType},
{FieldID: common.TimeStampField, Name: "ts", DataType: schemapb.DataType_Int64},
})
writer, err := NewDeltalogMultiFieldWriter(0, 0, eventWriter, batchSize)
if err != nil {
return nil, err
}
var value *DeleteLog
for i := 0; i < size; i++ {
switch pkType {
case schemapb.DataType_Int64:
value = NewDeleteLog(NewInt64PrimaryKey(int64(i)), uint64(i+1))
case schemapb.DataType_VarChar:
value = NewDeleteLog(NewVarCharPrimaryKey(strconv.Itoa(i)), uint64(i+1))
}
if err = writer.Write(value); err != nil {
return nil, err
}
}
if err = writer.Close(); err != nil {
return nil, err
}
blob, err := eventWriter.Finalize()
if err != nil {
return nil, err
}
return blob, nil
}
func readDeltaLog(size int, blob *Blob) error {
reader, err := NewDeltalogDeserializeReader([]*Blob{blob})
if err != nil {
return err
}
defer reader.Close()
for j := 0; j < size; j++ {
if err := reader.Next(); err != nil {
return err
}
_ = reader.Value()
}
return nil
}


@ -21,6 +21,7 @@ import (
"reflect"
"testing"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/stretchr/testify/assert"
@ -100,6 +101,37 @@ func TestSerDe(t *testing.T) {
}
}
func TestArrowSchema(t *testing.T) {
fields := []arrow.Field{{Name: "1", Type: arrow.BinaryTypes.String, Nullable: true}}
builder := array.NewBuilder(memory.DefaultAllocator, arrow.BinaryTypes.String)
builder.AppendValueFromString("1")
record := array.NewRecord(arrow.NewSchema(fields, nil), []arrow.Array{builder.NewArray()}, 1)
t.Run("test composite record", func(t *testing.T) {
cr := &compositeRecord{
recs: make(map[FieldID]arrow.Record, 1),
schema: make(map[FieldID]schemapb.DataType, 1),
}
cr.recs[0] = record
cr.schema[0] = schemapb.DataType_String
expected := arrow.NewSchema(fields, nil)
assert.Equal(t, expected, cr.ArrowSchema())
})
t.Run("test simple arrow record", func(t *testing.T) {
cr := &simpleArrowRecord{
r: record,
schema: make(map[FieldID]schemapb.DataType, 1),
field2Col: make(map[FieldID]int, 1),
}
cr.schema[0] = schemapb.DataType_String
expected := arrow.NewSchema(fields, nil)
assert.Equal(t, expected, cr.ArrowSchema())
sr := newSelectiveRecord(cr, 0)
assert.Equal(t, expected, sr.ArrowSchema())
})
}
func BenchmarkDeserializeReader(b *testing.B) {
len := 1000000
blobs, err := generateTestData(len)