milvus/internal/storage/index_data_codec.go

372 lines
10 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"fmt"
"strconv"
"time"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/json"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
type IndexFileBinlogCodec struct{}
// NewIndexFileBinlogCodec is constructor for IndexFileBinlogCodec
func NewIndexFileBinlogCodec() *IndexFileBinlogCodec {
return &IndexFileBinlogCodec{}
}
func (codec *IndexFileBinlogCodec) serializeImpl(
indexBuildID UniqueID,
version int64,
collectionID UniqueID,
partitionID UniqueID,
segmentID UniqueID,
fieldID UniqueID,
indexName string,
indexID UniqueID,
key string,
value []byte,
ts Timestamp,
) (*Blob, error) {
writer := NewIndexFileBinlogWriter(indexBuildID, version, collectionID, partitionID, segmentID, fieldID, indexName, indexID, key)
defer writer.Close()
eventWriter, err := writer.NextIndexFileEventWriter()
if err != nil {
return nil, err
}
defer eventWriter.Close()
err = eventWriter.AddOneStringToPayload(typeutil.UnsafeBytes2str(value), true)
if err != nil {
return nil, err
}
eventWriter.SetEventTimestamp(ts, ts)
writer.SetEventTimeStamp(ts, ts)
// https://github.com/milvus-io/milvus/issues/9620
// len(params) is also not accurate, indexParams is a map
writer.AddExtra(originalSizeKey, fmt.Sprintf("%v", len(value)))
err = writer.Finish()
if err != nil {
return nil, err
}
buffer, err := writer.GetBuffer()
if err != nil {
return nil, err
}
return &Blob{
Key: key,
// Key: strconv.Itoa(len(datas)),
Value: buffer,
}, nil
}
// SerializeIndexParams serilizes index params as blob.
func (codec *IndexFileBinlogCodec) SerializeIndexParams(
indexBuildID UniqueID,
version int64,
collectionID UniqueID,
partitionID UniqueID,
segmentID UniqueID,
fieldID UniqueID,
indexParams map[string]string,
indexName string,
indexID UniqueID,
) (*Blob, error) {
ts := Timestamp(time.Now().UnixNano())
// save index params.
// querycoord will parse index extra info from binlog, better to let this key appear first.
params, _ := json.Marshal(indexParams)
indexParamBlob, err := codec.serializeImpl(indexBuildID, version, collectionID, partitionID, segmentID, fieldID, indexName, indexID, IndexParamsKey, params, ts)
if err != nil {
return nil, err
}
return indexParamBlob, nil
}
// Serialize serilizes data as blobs.
func (codec *IndexFileBinlogCodec) Serialize(
indexBuildID UniqueID,
version int64,
collectionID UniqueID,
partitionID UniqueID,
segmentID UniqueID,
fieldID UniqueID,
indexParams map[string]string,
indexName string,
indexID UniqueID,
datas []*Blob,
) ([]*Blob, error) {
var err error
var blobs []*Blob
ts := Timestamp(time.Now().UnixNano())
// save index params.
// querycoord will parse index extra info from binlog, better to let this key appear first.
indexParamBlob, err := codec.SerializeIndexParams(indexBuildID, version, collectionID, partitionID, segmentID, fieldID, indexParams, indexName, indexID)
if err != nil {
return nil, err
}
blobs = append(blobs, indexParamBlob)
for pos := range datas {
blob, err := codec.serializeImpl(indexBuildID, version, collectionID, partitionID, segmentID, fieldID, indexName, indexID, datas[pos].Key, datas[pos].Value, ts)
if err != nil {
return nil, err
}
blobs = append(blobs, blob)
}
return blobs, nil
}
func (codec *IndexFileBinlogCodec) DeserializeImpl(blobs []*Blob) (
indexBuildID UniqueID,
version int64,
collectionID UniqueID,
partitionID UniqueID,
segmentID UniqueID,
fieldID UniqueID,
indexParams map[string]string,
indexName string,
indexID UniqueID,
datas []*Blob,
err error,
) {
if len(blobs) == 0 {
return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, errors.New("blobs is empty")
}
indexParams = make(map[string]string)
datas = make([]*Blob, 0)
for _, blob := range blobs {
binlogReader, err := NewBinlogReader(blob.Value)
if err != nil {
log.Warn("failed to read binlog",
zap.Error(err))
return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, err
}
dataType := binlogReader.PayloadDataType
//desc, err := binlogReader.readDescriptorEvent()
//if err != nil {
// log.Warn("failed to read descriptor event",
// zap.Error(err))
// return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, err
//}
desc := binlogReader.descriptorEvent
extraBytes := desc.ExtraBytes
extra := make(map[string]interface{})
_ = json.Unmarshal(extraBytes, &extra)
value, _ := strconv.Atoi(extra["indexBuildID"].(string))
indexBuildID = UniqueID(value)
value, _ = strconv.Atoi(extra["version"].(string))
version = int64(value)
collectionID = desc.CollectionID
partitionID = desc.PartitionID
segmentID = desc.SegmentID
fieldID = desc.FieldID
indexName = extra["indexName"].(string)
value, _ = strconv.Atoi(extra["indexID"].(string))
indexID = UniqueID(value)
key := extra["key"].(string)
for {
eventReader, err := binlogReader.NextEventReader()
if err != nil {
log.Warn("failed to get next event reader",
zap.Error(err))
binlogReader.Close()
return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, err
}
if eventReader == nil {
break
}
switch dataType {
// just for backward compatibility
case schemapb.DataType_Int8:
// todo: valid_data may need to check when create index
content, _, err := eventReader.GetByteFromPayload()
if err != nil {
log.Warn("failed to get byte from payload",
zap.Error(err))
eventReader.Close()
binlogReader.Close()
return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, err
}
if key == IndexParamsKey {
_ = json.Unmarshal(content, &indexParams)
} else {
blob := &Blob{Key: key}
blob.Value = content
datas = append(datas, blob)
}
case schemapb.DataType_String:
content, _, err := eventReader.GetStringFromPayload()
if err != nil {
log.Warn("failed to get string from payload", zap.Error(err))
eventReader.Close()
binlogReader.Close()
return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, err
}
// make sure there is one string
if len(content) != 1 {
err := fmt.Errorf("failed to parse index event because content length is not one %d", len(content))
eventReader.Close()
binlogReader.Close()
return 0, 0, 0, 0, 0, 0, nil, "", 0, nil, err
}
contentByte := typeutil.UnsafeStr2bytes(content[0])
if key == IndexParamsKey {
_ = json.Unmarshal(contentByte, &indexParams)
} else {
blob := &Blob{Key: key}
blob.Value = contentByte
datas = append(datas, blob)
}
}
eventReader.Close()
}
binlogReader.Close()
}
return indexBuildID, version, collectionID, partitionID, segmentID, fieldID, indexParams, indexName, indexID, datas, nil
}
func (codec *IndexFileBinlogCodec) Deserialize(blobs []*Blob) (
datas []*Blob,
indexParams map[string]string,
indexName string,
indexID UniqueID,
err error,
) {
_, _, _, _, _, _, indexParams, indexName, indexID, datas, err = codec.DeserializeImpl(blobs)
return datas, indexParams, indexName, indexID, err
}
// IndexCodec can serialize and deserialize index
type IndexCodec struct{}
// NewIndexCodec creates IndexCodec
func NewIndexCodec() *IndexCodec {
return &IndexCodec{}
}
// Serialize serializes index
func (indexCodec *IndexCodec) Serialize(blobs []*Blob, params map[string]string, indexName string, indexID UniqueID) ([]*Blob, error) {
paramsBytes, err := json.Marshal(struct {
Params map[string]string
IndexName string
IndexID UniqueID
}{
Params: params,
IndexName: indexName,
IndexID: indexID,
})
if err != nil {
return nil, err
}
blobs = append(blobs, &Blob{Key: IndexParamsKey, Value: paramsBytes})
return blobs, nil
}
// Deserialize deserializes index
func (indexCodec *IndexCodec) Deserialize(blobs []*Blob) ([]*Blob, map[string]string, string, UniqueID, error) {
var file *Blob
for i := 0; i < len(blobs); i++ {
if blobs[i].Key != IndexParamsKey {
continue
}
file = blobs[i]
blobs = append(blobs[:i], blobs[i+1:]...)
break
}
if file == nil {
return nil, nil, "", InvalidUniqueID, fmt.Errorf("can not find params blob")
}
info := struct {
Params map[string]string
IndexName string
IndexID UniqueID
}{}
if err := json.Unmarshal(file.Value, &info); err != nil {
return nil, nil, "", InvalidUniqueID, fmt.Errorf("json unmarshal error: %s", err.Error())
}
return blobs, info.Params, info.IndexName, info.IndexID, nil
}
// NewIndexFileBinlogWriter returns a new IndexFileBinlogWriter with provided parameters
func NewIndexFileBinlogWriter(
indexBuildID UniqueID,
version int64,
collectionID UniqueID,
partitionID UniqueID,
segmentID UniqueID,
fieldID UniqueID,
indexName string,
indexID UniqueID,
key string,
) *IndexFileBinlogWriter {
descriptorEvent := newDescriptorEvent()
descriptorEvent.CollectionID = collectionID
descriptorEvent.PartitionID = partitionID
descriptorEvent.SegmentID = segmentID
descriptorEvent.FieldID = fieldID
descriptorEvent.PayloadDataType = schemapb.DataType_String
descriptorEvent.AddExtra("indexBuildID", fmt.Sprintf("%d", indexBuildID))
descriptorEvent.AddExtra("version", fmt.Sprintf("%d", version))
descriptorEvent.AddExtra("indexName", indexName)
descriptorEvent.AddExtra("indexID", fmt.Sprintf("%d", indexID))
descriptorEvent.AddExtra("key", key)
w := &IndexFileBinlogWriter{
baseBinlogWriter: baseBinlogWriter{
descriptorEvent: *descriptorEvent,
magicNumber: MagicNumber,
binlogType: IndexFileBinlog,
eventWriters: make([]EventWriter, 0),
buffer: nil,
},
}
return w
}