mirror of https://github.com/milvus-io/milvus.git
209 lines
7.8 KiB
Go
209 lines
7.8 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package binlog
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"path"
|
|
"sort"
|
|
"strconv"
|
|
|
|
"github.com/samber/lo"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/internal/storagecommon"
|
|
"github.com/milvus-io/milvus/pkg/v2/common"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
func readData(reader *storage.BinlogReader, et storage.EventTypeCode) ([]any, [][]bool, error) {
|
|
rowsSet := make([]any, 0)
|
|
validDataRowsSet := make([][]bool, 0)
|
|
for {
|
|
event, err := reader.NextEventReader()
|
|
if err != nil {
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("failed to iterate events reader, error: %v", err))
|
|
}
|
|
if event == nil {
|
|
break // end of the file
|
|
}
|
|
if event.TypeCode != et {
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("wrong binlog type, expect:%s, actual:%s",
|
|
et.String(), event.TypeCode.String()))
|
|
}
|
|
rows, validDataRows, _, err := event.PayloadReaderInterface.GetDataFromPayload()
|
|
if err != nil {
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("failed to read data, error: %v", err))
|
|
}
|
|
rowsSet = append(rowsSet, rows)
|
|
validDataRowsSet = append(validDataRowsSet, validDataRows)
|
|
}
|
|
return rowsSet, validDataRowsSet, nil
|
|
}
|
|
|
|
func newBinlogReader(ctx context.Context, cm storage.ChunkManager, path string) (*storage.BinlogReader, error) {
|
|
bytes, err := cm.Read(ctx, path) // TODO: dyh, checks if the error is a retryable error
|
|
if err != nil {
|
|
return nil, merr.WrapErrImportFailed(fmt.Sprintf("failed to open binlog %s", path))
|
|
}
|
|
var reader *storage.BinlogReader
|
|
reader, err = storage.NewBinlogReader(bytes)
|
|
if err != nil {
|
|
return nil, merr.WrapErrImportFailed(fmt.Sprintf("failed to create reader, binlog:%s, error:%v", path, err))
|
|
}
|
|
return reader, nil
|
|
}
|
|
|
|
func listInsertLogs(ctx context.Context, cm storage.ChunkManager, insertPrefix string) (map[int64][]string, error) {
|
|
insertLogs := make(map[int64][]string)
|
|
var walkErr error
|
|
if err := cm.WalkWithPrefix(ctx, insertPrefix, true, func(insertLog *storage.ChunkObjectInfo) bool {
|
|
fieldPath := path.Dir(insertLog.FilePath)
|
|
fieldStrID := path.Base(fieldPath)
|
|
fieldID, err := strconv.ParseInt(fieldStrID, 10, 64)
|
|
if err != nil {
|
|
walkErr = merr.WrapErrImportFailed(fmt.Sprintf("failed to parse field id from log, error: %v", err))
|
|
return false
|
|
}
|
|
insertLogs[fieldID] = append(insertLogs[fieldID], insertLog.FilePath)
|
|
return true
|
|
}); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if walkErr != nil {
|
|
return nil, walkErr
|
|
}
|
|
|
|
for _, v := range insertLogs {
|
|
sort.Strings(v)
|
|
}
|
|
return insertLogs, nil
|
|
}
|
|
|
|
func createFieldBinlogList(insertLogs map[int64][]string) []*datapb.FieldBinlog {
|
|
binlogFields := make([]*datapb.FieldBinlog, 0)
|
|
for fieldID, logs := range insertLogs {
|
|
binlog := &datapb.FieldBinlog{
|
|
FieldID: fieldID,
|
|
Binlogs: lo.Map(logs, func(path string, _ int) *datapb.Binlog {
|
|
return &datapb.Binlog{
|
|
LogPath: path,
|
|
}
|
|
}),
|
|
}
|
|
binlogFields = append(binlogFields, binlog)
|
|
}
|
|
return binlogFields
|
|
}
|
|
|
|
func verify(schema *schemapb.CollectionSchema, storageVersion int64, insertLogs map[int64][]string) (map[int64][]string, *schemapb.CollectionSchema, error) {
|
|
// check system fields (ts and rowID)
|
|
if storageVersion == storage.StorageV2 {
|
|
if _, ok := insertLogs[storagecommon.DefaultShortColumnGroupID]; !ok {
|
|
return nil, nil, merr.WrapErrImportFailed("no binlog for system fields")
|
|
}
|
|
} else {
|
|
// storage v1
|
|
if _, ok := insertLogs[common.RowIDField]; !ok {
|
|
return nil, nil, merr.WrapErrImportFailed("no binlog for RowID field")
|
|
}
|
|
if _, ok := insertLogs[common.TimeStampField]; !ok {
|
|
return nil, nil, merr.WrapErrImportFailed("no binlog for Timestamp field")
|
|
}
|
|
}
|
|
|
|
// check binlog file count, must be equal for all fields
|
|
for fieldID, logs := range insertLogs {
|
|
if len(logs) != len(insertLogs[common.RowIDField]) {
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("misaligned binlog count, field%d:%d, field%d:%d",
|
|
fieldID, len(logs), common.RowIDField, len(insertLogs[common.RowIDField])))
|
|
}
|
|
}
|
|
|
|
if storageVersion == storage.StorageV2 {
|
|
for _, field := range schema.GetFields() {
|
|
if typeutil.IsVectorType(field.GetDataType()) {
|
|
if _, ok := insertLogs[field.GetFieldID()]; !ok {
|
|
// vector field must be provided
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("no binlog for field:%s", field.GetName()))
|
|
}
|
|
}
|
|
}
|
|
return insertLogs, schema, nil
|
|
}
|
|
|
|
// Goal: support import binlog files to a different schema collection.
|
|
//
|
|
// What we know:
|
|
// - The milvus-backup tool knows the original collection's schema, it has the ability to create
|
|
// a new collection from the old schema, it also supports restore data into an existing collection
|
|
// which could be different schema.
|
|
// - The binlog field ids are parsed from the storage path, here we don't know the original field name.
|
|
// The binlogs are mapped by fieldID, which could lead to type error later.
|
|
//
|
|
// How to do:
|
|
// - Always import pk field into the target collection no matter it is auto-id or not.
|
|
// - Returns an error if the target collection has a Function output field, but the source collection has no this field.
|
|
// - If both collections have the function output field, we do not re-run the Function when restoring from a backup.
|
|
// - For nullable/defaultValue fields, the binlog is optional.
|
|
// - For dynamic field, the binlog is optional.
|
|
// - For other fields, if a field is not in the target collection's schema, just skip the binlog of this field.
|
|
// - If the target collection's schema contains a field(not nullable/defaultValue) that the source collection
|
|
// doesn't have, reports an error
|
|
|
|
// Here we copy the schema for reading part of collection's data. The storage.NewBinlogRecordReader() requires
|
|
// a schema and the schema must be consistent with the binglog files([]*datapb.FieldBinlog)
|
|
cloneSchema := typeutil.Clone(schema)
|
|
cloneSchema.Fields = []*schemapb.FieldSchema{} // the Fields will be reset according to the validInsertLogs
|
|
cloneSchema.EnableDynamicField = false // this flag will be reset
|
|
|
|
// this loop will reset the cloneSchema.Fields and return validInsertLogs
|
|
validInsertLogs := make(map[int64][]string)
|
|
|
|
for _, field := range schema.GetFields() {
|
|
id := field.GetFieldID()
|
|
logs, ok := insertLogs[id]
|
|
if !ok {
|
|
// dynamic field, nullable field, default value field, optional, no need to check
|
|
// they will be filled by AppendNullableDefaultFieldsData
|
|
if field.GetIsDynamic() || field.GetNullable() || field.GetDefaultValue() != nil {
|
|
continue
|
|
}
|
|
// primary key is required
|
|
// function output field is also required
|
|
// the other field must be provided
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("no binlog for field:%s", field.GetName()))
|
|
} else {
|
|
// these binlogs are intend to be imported
|
|
validInsertLogs[id] = logs
|
|
|
|
// reset the cloneSchema.Fields
|
|
cloneSchema.Fields = append(cloneSchema.Fields, field)
|
|
if field.IsDynamic {
|
|
cloneSchema.EnableDynamicField = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return validInsertLogs, cloneSchema, nil
|
|
}
|