milvus/internal/storage/serde_events_test.go

493 lines
13 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"bytes"
"context"
"io"
"strconv"
"testing"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/parquet/file"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"github.com/stretchr/testify/assert"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/common"
)
func TestBinlogDeserializeReader(t *testing.T) {
t.Run("test empty data", func(t *testing.T) {
reader, err := NewBinlogDeserializeReader(nil, common.RowIDField)
assert.NoError(t, err)
defer reader.Close()
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
t.Run("test deserialize", func(t *testing.T) {
size := 3
blobs, err := generateTestData(size)
assert.NoError(t, err)
reader, err := NewBinlogDeserializeReader(blobs, common.RowIDField)
assert.NoError(t, err)
defer reader.Close()
for i := 1; i <= size; i++ {
err = reader.Next()
assert.NoError(t, err)
value := reader.Value()
assertTestData(t, i, value)
}
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
}
func TestBinlogStreamWriter(t *testing.T) {
t.Run("test write", func(t *testing.T) {
size := 3
field := arrow.Field{Name: "bool", Type: arrow.FixedWidthTypes.Boolean}
var w bytes.Buffer
rw, err := newSingleFieldRecordWriter(1, field, &w)
assert.NoError(t, err)
builder := array.NewBooleanBuilder(memory.DefaultAllocator)
builder.AppendValues([]bool{true, false, true}, nil)
arr := builder.NewArray()
defer arr.Release()
ar := array.NewRecord(
arrow.NewSchema(
[]arrow.Field{field},
nil,
),
[]arrow.Array{arr},
int64(size),
)
r := newSimpleArrowRecord(ar, map[FieldID]schemapb.DataType{1: schemapb.DataType_Bool}, map[FieldID]int{1: 0})
defer r.Release()
err = rw.Write(r)
assert.NoError(t, err)
rw.Close()
reader, err := file.NewParquetReader(bytes.NewReader(w.Bytes()))
assert.NoError(t, err)
arrowReader, err := pqarrow.NewFileReader(reader, pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator)
assert.NoError(t, err)
rr, err := arrowReader.GetRecordReader(context.Background(), nil, nil)
assert.NoError(t, err)
defer rr.Release()
ok := rr.Next()
assert.True(t, ok)
rec := rr.Record()
defer rec.Release()
assert.Equal(t, int64(size), rec.NumRows())
ok = rr.Next()
assert.False(t, ok)
})
}
func TestBinlogSerializeWriter(t *testing.T) {
t.Run("test empty data", func(t *testing.T) {
reader, err := NewBinlogDeserializeReader(nil, common.RowIDField)
assert.NoError(t, err)
defer reader.Close()
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
t.Run("test serialize", func(t *testing.T) {
size := 16
blobs, err := generateTestData(size)
assert.NoError(t, err)
reader, err := NewBinlogDeserializeReader(blobs, common.RowIDField)
assert.NoError(t, err)
defer reader.Close()
schema := generateTestSchema()
// Copy write the generated data
writers := NewBinlogStreamWriters(0, 0, 0, schema.Fields)
writer, err := NewBinlogSerializeWriter(schema, 0, 0, writers, 7)
assert.NoError(t, err)
for i := 1; i <= size; i++ {
err = reader.Next()
assert.NoError(t, err)
value := reader.Value()
assertTestData(t, i, value)
err := writer.Write(value)
assert.NoError(t, err)
}
for _, f := range schema.Fields {
props := writers[f.FieldID].rw.writerProps
assert.Equal(t, !f.IsPrimaryKey, props.DictionaryEnabled())
}
err = reader.Next()
assert.Equal(t, io.EOF, err)
err = writer.Close()
assert.NoError(t, err)
assert.True(t, writer.WrittenMemorySize() >= 429)
// Read from the written data
newblobs := make([]*Blob, len(writers))
i := 0
for _, w := range writers {
blob, err := w.Finalize()
assert.NoError(t, err)
assert.NotNil(t, blob)
assert.True(t, blob.MemorySize > 0)
newblobs[i] = blob
i++
}
// Both field pk and field 17 are with datatype string and auto id
// in test data. Field pk uses delta byte array encoding, while
// field 17 uses dict encoding.
assert.Less(t, writers[16].buf.Len(), writers[17].buf.Len())
// assert.Equal(t, blobs[0].Value, newblobs[0].Value)
reader, err = NewBinlogDeserializeReader(newblobs, common.RowIDField)
assert.NoError(t, err)
defer reader.Close()
for i := 1; i <= size; i++ {
err = reader.Next()
assert.NoError(t, err, i)
value := reader.Value()
assertTestData(t, i, value)
}
})
}
func TestNull(t *testing.T) {
t.Run("test null", func(t *testing.T) {
schema := generateTestSchema()
// Copy write the generated data
writers := NewBinlogStreamWriters(0, 0, 0, schema.Fields)
writer, err := NewBinlogSerializeWriter(schema, 0, 0, writers, 1024)
assert.NoError(t, err)
m := make(map[FieldID]any)
for _, fs := range schema.Fields {
m[fs.FieldID] = nil
}
m[common.RowIDField] = int64(0)
m[common.TimeStampField] = int64(0)
pk, err := GenPrimaryKeyByRawData(m[common.RowIDField], schemapb.DataType_Int64)
assert.NoError(t, err)
value := &Value{
ID: 0,
PK: pk,
Timestamp: 0,
IsDeleted: false,
Value: m,
}
writer.Write(value)
err = writer.Close()
assert.NoError(t, err)
// Read from the written data
blobs := make([]*Blob, len(writers))
i := 0
for _, w := range writers {
blob, err := w.Finalize()
assert.NoError(t, err)
assert.NotNil(t, blob)
blobs[i] = blob
i++
}
reader, err := NewBinlogDeserializeReader(blobs, common.RowIDField)
assert.NoError(t, err)
defer reader.Close()
err = reader.Next()
assert.NoError(t, err)
readValue := reader.Value()
assert.Equal(t, value, readValue)
})
}
func generateTestDeltalogData(size int) (*Blob, error) {
codec := NewDeleteCodec()
pks := make([]int64, size)
tss := make([]uint64, size)
for i := 0; i < size; i++ {
pks[i] = int64(i)
tss[i] = uint64(i + 1)
}
data := &DeleteData{}
for i := range pks {
data.Append(NewInt64PrimaryKey(pks[i]), tss[i])
}
return codec.Serialize(0, 0, 0, data)
}
func assertTestDeltalogData(t *testing.T, i int, value *DeleteLog) {
assert.Equal(t, &Int64PrimaryKey{int64(i)}, value.Pk)
assert.Equal(t, uint64(i+1), value.Ts)
}
func TestDeltalogDeserializeReader(t *testing.T) {
t.Run("test empty data", func(t *testing.T) {
reader, err := newDeltalogDeserializeReader(nil)
assert.NoError(t, err)
defer reader.Close()
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
t.Run("test deserialize", func(t *testing.T) {
size := 3
blob, err := generateTestDeltalogData(size)
assert.NoError(t, err)
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
assert.NoError(t, err)
defer reader.Close()
for i := 0; i < size; i++ {
err = reader.Next()
assert.NoError(t, err)
value := reader.Value()
assertTestDeltalogData(t, i, value)
}
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
}
func TestDeltalogSerializeWriter(t *testing.T) {
t.Run("test empty data", func(t *testing.T) {
reader, err := newDeltalogDeserializeReader(nil)
assert.NoError(t, err)
defer reader.Close()
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
t.Run("test serialize", func(t *testing.T) {
size := 16
blob, err := generateTestDeltalogData(size)
assert.NoError(t, err)
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
assert.NoError(t, err)
defer reader.Close()
// Copy write the generated data
eventWriter := newDeltalogStreamWriter(0, 0, 0)
writer, err := newDeltalogSerializeWriter(eventWriter, 7)
assert.NoError(t, err)
for i := 0; i < size; i++ {
err = reader.Next()
assert.NoError(t, err)
value := reader.Value()
assertTestDeltalogData(t, i, value)
err := writer.Write(value)
assert.NoError(t, err)
}
err = reader.Next()
assert.Equal(t, io.EOF, err)
err = writer.Close()
assert.NoError(t, err)
// Read from the written data
newblob, err := eventWriter.Finalize()
assert.NoError(t, err)
assert.NotNil(t, newblob)
// assert.Equal(t, blobs[0].Value, newblobs[0].Value)
reader, err = newDeltalogDeserializeReader([]*Blob{newblob})
assert.NoError(t, err)
defer reader.Close()
for i := 0; i < size; i++ {
err = reader.Next()
assert.NoError(t, err, i)
value := reader.Value()
assertTestDeltalogData(t, i, value)
}
})
}
func TestDeltalogPkTsSeparateFormat(t *testing.T) {
t.Run("test empty data", func(t *testing.T) {
eventWriter := newMultiFieldDeltalogStreamWriter(0, 0, 0, schemapb.DataType_Int64)
writer, err := newDeltalogMultiFieldWriter(eventWriter, 7)
assert.NoError(t, err)
defer writer.Close()
err = writer.Close()
assert.NoError(t, err)
blob, err := eventWriter.Finalize()
assert.NoError(t, err)
assert.NotNil(t, blob)
})
testCases := []struct {
name string
pkType schemapb.DataType
assertPk func(t *testing.T, i int, value *DeleteLog)
}{
{
name: "test int64 pk",
pkType: schemapb.DataType_Int64,
assertPk: func(t *testing.T, i int, value *DeleteLog) {
assert.Equal(t, NewInt64PrimaryKey(int64(i)), value.Pk)
assert.Equal(t, uint64(i+1), value.Ts)
},
},
{
name: "test varchar pk",
pkType: schemapb.DataType_VarChar,
assertPk: func(t *testing.T, i int, value *DeleteLog) {
assert.Equal(t, NewVarCharPrimaryKey(strconv.Itoa(i)), value.Pk)
assert.Equal(t, uint64(i+1), value.Ts)
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
// serialize data
size := 10
blob, err := writeDeltalogNewFormat(size, tc.pkType, 7)
assert.NoError(t, err)
// Deserialize data
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
assert.NoError(t, err)
defer reader.Close()
for i := 0; i < size; i++ {
err = reader.Next()
assert.NoError(t, err)
value := reader.Value()
tc.assertPk(t, i, value)
}
err = reader.Next()
assert.Equal(t, io.EOF, err)
})
}
}
func BenchmarkDeltalogReader(b *testing.B) {
size := 1000000
blob, err := generateTestDeltalogData(size)
assert.NoError(b, err)
b.Run("one string format reader", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
readDeltaLog(size, blob)
}
})
blob, err = writeDeltalogNewFormat(size, schemapb.DataType_Int64, size)
assert.NoError(b, err)
b.Run("pk ts separate format reader", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
readDeltaLog(size, blob)
}
})
}
func BenchmarkDeltalogFormatWriter(b *testing.B) {
size := 1000000
b.Run("one string format writer", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
eventWriter := newDeltalogStreamWriter(0, 0, 0)
writer, _ := newDeltalogSerializeWriter(eventWriter, size)
var value *DeleteLog
for j := 0; j < size; j++ {
value = NewDeleteLog(NewInt64PrimaryKey(int64(j)), uint64(j+1))
writer.Write(value)
}
writer.Close()
eventWriter.Finalize()
}
b.ReportAllocs()
})
b.Run("pk and ts separate format writer", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
writeDeltalogNewFormat(size, schemapb.DataType_Int64, size)
}
b.ReportAllocs()
})
}
func writeDeltalogNewFormat(size int, pkType schemapb.DataType, batchSize int) (*Blob, error) {
var err error
eventWriter := newMultiFieldDeltalogStreamWriter(0, 0, 0, pkType)
writer, err := newDeltalogMultiFieldWriter(eventWriter, batchSize)
if err != nil {
return nil, err
}
var value *DeleteLog
for i := 0; i < size; i++ {
switch pkType {
case schemapb.DataType_Int64:
value = NewDeleteLog(NewInt64PrimaryKey(int64(i)), uint64(i+1))
case schemapb.DataType_VarChar:
value = NewDeleteLog(NewVarCharPrimaryKey(strconv.Itoa(i)), uint64(i+1))
}
if err = writer.Write(value); err != nil {
return nil, err
}
}
if err = writer.Close(); err != nil {
return nil, err
}
blob, err := eventWriter.Finalize()
if err != nil {
return nil, err
}
return blob, nil
}
func readDeltaLog(size int, blob *Blob) error {
reader, err := newDeltalogDeserializeReader([]*Blob{blob})
if err != nil {
return err
}
defer reader.Close()
for j := 0; j < size; j++ {
err = reader.Next()
_ = reader.Value()
if err != nil {
return err
}
}
return nil
}