// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"fmt"
	"io"

	"github.com/apache/arrow/go/v17/arrow"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/storagev2/packed"
	"github.com/milvus-io/milvus/pkg/v2/common"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"
)
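
// packedRecordReader reads Arrow record batches back from a set of packed
// (storage v2) files and exposes them through the RecordReader interface,
// using field2Col to map collection field IDs to Arrow column indices.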
type packedRecordReader struct {
	reader *packed.PackedReader

	bufferSize int64
	schema     *schemapb.CollectionSchema
	field2Col  map[FieldID]int
}

var _ RecordReader = (*packedRecordReader)(nil)
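
// Next returns the next record batch wrapped as a Record. It reports io.EOF
// once the underlying reader is exhausted; note that read errors are also
// surfaced as io.EOF rather than propagated.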
func (pr *packedRecordReader) Next() (Record, error) {
	if pr.reader == nil {
		return nil, io.EOF
	}
	rec, err := pr.reader.ReadNext()
	if err != nil || rec == nil {
		return nil, io.EOF
	}
	return NewSimpleArrowRecord(rec, pr.field2Col), nil
}
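
// Close releases the underlying packed reader, if one was opened.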
func (pr *packedRecordReader) Close() error {
	if pr.reader != nil {
		return pr.reader.Close()
	}
	return nil
}
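
// newPackedRecordReader converts the collection schema to its Arrow
// counterpart, opens a packed reader over the given paths, and builds the
// field-ID-to-column mapping used to decode records.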
func newPackedRecordReader(paths []string, schema *schemapb.CollectionSchema, bufferSize int64,
) (*packedRecordReader, error) {
	arrowSchema, err := ConvertToArrowSchema(schema.Fields)
	if err != nil {
		return nil, merr.WrapErrServiceInternal(
			fmt.Sprintf("can not convert collection schema [%s] to arrow schema: %s", schema.Name, err.Error()))
	}
	reader, err := packed.NewPackedReader(paths, arrowSchema, bufferSize)
	if err != nil {
		return nil, merr.WrapErrServiceInternal(
			fmt.Sprintf("can not new packed record reader: %s", err.Error()))
	}
	field2Col := make(map[FieldID]int)
	for i, field := range schema.Fields {
		field2Col[field.FieldID] = i
	}
	return &packedRecordReader{
		reader:     reader,
		schema:     schema,
		bufferSize: bufferSize,
		field2Col:  field2Col,
	}, nil
}
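
// NewPackedDeserializeReader wraps a packedRecordReader in a
// DeserializeReader that materializes each row as a *Value: every field is
// decoded through its serde entry into a map keyed by field ID, and the row
// ID, timestamp, and primary key (pkFieldID) are lifted into the Value
// header fields.
//
// A minimal usage sketch (paths, buffer size, and the PK field ID are
// illustrative):
//
//	reader, err := NewPackedDeserializeReader(paths, collSchema, 32<<20, pkFieldID)
//	if err != nil { ... }
//	defer reader.Close()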
func NewPackedDeserializeReader(paths []string, schema *schemapb.CollectionSchema,
	bufferSize int64, pkFieldID FieldID,
) (*DeserializeReader[*Value], error) {
	reader, err := newPackedRecordReader(paths, schema, bufferSize)
	if err != nil {
		return nil, err
	}

	return NewDeserializeReader(reader, func(r Record, v []*Value) error {
		rec, ok := r.(*simpleArrowRecord)
		if !ok {
			return merr.WrapErrServiceInternal("can not cast to simple arrow record")
		}

		schema := reader.schema
		numFields := len(schema.Fields)
		for i := 0; i < rec.Len(); i++ {
			if v[i] == nil {
				v[i] = &Value{
					Value: make(map[FieldID]interface{}, numFields),
				}
			}
			value := v[i]
			m := value.Value.(map[FieldID]interface{})
			// Decode every field of the row into the field-ID keyed map.
			for _, field := range schema.Fields {
				fieldID := field.FieldID
				column := r.Column(fieldID)
				if column.IsNull(i) {
					m[fieldID] = nil
				} else {
					d, ok := serdeMap[field.DataType].deserialize(column, i)
					if ok {
						m[fieldID] = d
					} else {
						return merr.WrapErrServiceInternal(fmt.Sprintf("can not deserialize field [%s]", field.Name))
					}
				}
			}

			rowID, ok := m[common.RowIDField].(int64)
			if !ok {
				return merr.WrapErrIoKeyNotFound("no row id column found")
			}
			value.ID = rowID
			ts, ok := m[common.TimeStampField].(int64)
			if !ok {
				return merr.WrapErrIoKeyNotFound("no timestamp column found")
			}
			value.Timestamp = ts

			pkCol := rec.field2Col[pkFieldID]
			pk, err := GenPrimaryKeyByRawData(m[pkFieldID], schema.Fields[pkCol].DataType)
			if err != nil {
				return err
			}

			value.PK = pk
			value.IsDeleted = false
			value.Value = m
		}
		return nil
	}), nil
}

var _ RecordWriter = (*packedRecordWriter)(nil)
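
// packedRecordWriter streams Arrow record batches into packed (storage v2)
// files, tracking the number of rows and the uncompressed bytes written.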
type packedRecordWriter struct {
	writer *packed.PackedWriter

	bufferSize          int64
	multiPartUploadSize int64
	columnGroups        [][]int
	paths               []string
	schema              *arrow.Schema

	numRows             int
	writtenUncompressed uint64
}
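
// Write appends one record batch to the packed writer and updates the row
// and uncompressed-size counters. The record is released after writing.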
func (pw *packedRecordWriter) Write(r Record) error {
	rec, ok := r.(*simpleArrowRecord)
	if !ok {
		return merr.WrapErrServiceInternal("can not cast to simple arrow record")
	}
	pw.numRows += r.Len()
	for _, arr := range rec.r.Columns() {
		pw.writtenUncompressed += uint64(calculateArraySize(arr))
	}
	defer rec.Release()
	return pw.writer.WriteRecordBatch(rec.r)
}
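
// GetWrittenUncompressed reports the total uncompressed bytes written so far.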
func (pw *packedRecordWriter) GetWrittenUncompressed() uint64 {
	return pw.writtenUncompressed
}
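
// Close flushes and closes the underlying packed writer, if one was opened.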
func (pw *packedRecordWriter) Close() error {
	if pw.writer != nil {
		return pw.writer.Close()
	}
	return nil
}
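
// NewPackedRecordWriter opens a packed writer over the given paths with the
// provided Arrow schema, buffer size, multi-part upload size, and column
// group layout.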
func NewPackedRecordWriter(paths []string, schema *arrow.Schema, bufferSize int64, multiPartUploadSize int64, columnGroups [][]int) (*packedRecordWriter, error) {
	writer, err := packed.NewPackedWriter(paths, schema, bufferSize, multiPartUploadSize, columnGroups)
	if err != nil {
		return nil, merr.WrapErrServiceInternal(
			fmt.Sprintf("can not new packed record writer %s", err.Error()))
	}
	return &packedRecordWriter{
		writer:              writer,
		schema:              schema,
		bufferSize:          bufferSize,
		multiPartUploadSize: multiPartUploadSize,
		columnGroups:        columnGroups,
		paths:               paths,
	}, nil
}
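
// NewPackedSerializeWriter converts the collection schema to Arrow, opens a
// packed record writer, and wraps it in a SerializeWriter that serializes
// *Value rows (batchSize at a time) through ValueSerializer.
//
// A minimal usage sketch (paths, sizes, and the column grouping are
// illustrative):
//
//	writer, err := NewPackedSerializeWriter(paths, collSchema, 32<<20, 10<<20, [][]int{{0, 1, 2}}, 1024)
//	if err != nil { ... }
//	defer writer.Close()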
func NewPackedSerializeWriter(paths []string, schema *schemapb.CollectionSchema, bufferSize int64, multiPartUploadSize int64, columnGroups [][]int, batchSize int) (*SerializeWriter[*Value], error) {
	arrowSchema, err := ConvertToArrowSchema(schema.Fields)
	if err != nil {
		return nil, merr.WrapErrServiceInternal(
			fmt.Sprintf("can not convert collection schema %s to arrow schema: %s", schema.Name, err.Error()))
	}
	packedRecordWriter, err := NewPackedRecordWriter(paths, arrowSchema, bufferSize, multiPartUploadSize, columnGroups)
	if err != nil {
		return nil, merr.WrapErrServiceInternal(
			fmt.Sprintf("can not new packed record writer %s", err.Error()))
	}
	return NewSerializeRecordWriter[*Value](packedRecordWriter, func(v []*Value) (Record, error) {
		return ValueSerializer(v, schema.Fields)
	}, batchSize), nil
}