mirror of https://github.com/milvus-io/milvus.git
284 lines
11 KiB
Go
284 lines
11 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package importutil
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"path"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/cockroachdb/errors"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/schemapb"
|
|
"github.com/milvus-io/milvus/internal/log"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
type BinlogParser struct {
|
|
ctx context.Context // for canceling parse process
|
|
collectionSchema *schemapb.CollectionSchema // collection schema
|
|
shardNum int32 // sharding number of the collection
|
|
blockSize int64 // maximum size of a read block(unit:byte)
|
|
chunkManager storage.ChunkManager // storage interfaces to browse/read the files
|
|
callFlushFunc ImportFlushFunc // call back function to flush segment
|
|
updateProgressFunc func(percent int64) // update working progress percent value
|
|
|
|
// a timestamp to define the start time point of restore, data before this time point will be ignored
|
|
// set this value to 0, all the data will be imported
|
|
// set this value to math.MaxUint64, all the data will be ignored
|
|
// the tsStartPoint value must be less/equal than tsEndPoint
|
|
tsStartPoint uint64
|
|
|
|
// a timestamp to define the end time point of restore, data after this time point will be ignored
|
|
// set this value to 0, all the data will be ignored
|
|
// set this value to math.MaxUint64, all the data will be imported
|
|
// the tsEndPoint value must be larger/equal than tsStartPoint
|
|
tsEndPoint uint64
|
|
}
|
|
|
|
func NewBinlogParser(ctx context.Context,
|
|
collectionSchema *schemapb.CollectionSchema,
|
|
shardNum int32,
|
|
blockSize int64,
|
|
chunkManager storage.ChunkManager,
|
|
flushFunc ImportFlushFunc,
|
|
updateProgressFunc func(percent int64),
|
|
tsStartPoint uint64,
|
|
tsEndPoint uint64) (*BinlogParser, error) {
|
|
if collectionSchema == nil {
|
|
log.Error("Binlog parser: collection schema is nil")
|
|
return nil, errors.New("collection schema is nil")
|
|
}
|
|
|
|
if chunkManager == nil {
|
|
log.Error("Binlog parser: chunk manager pointer is nil")
|
|
return nil, errors.New("chunk manager pointer is nil")
|
|
}
|
|
|
|
if flushFunc == nil {
|
|
log.Error("Binlog parser: flush function is nil")
|
|
return nil, errors.New("flush function is nil")
|
|
}
|
|
|
|
if tsStartPoint > tsEndPoint {
|
|
log.Error("Binlog parser: the tsStartPoint should be less than tsEndPoint",
|
|
zap.Uint64("tsStartPoint", tsStartPoint), zap.Uint64("tsEndPoint", tsEndPoint))
|
|
return nil, fmt.Errorf("Binlog parser: the tsStartPoint %d should be less than tsEndPoint %d", tsStartPoint, tsEndPoint)
|
|
}
|
|
|
|
v := &BinlogParser{
|
|
ctx: ctx,
|
|
collectionSchema: collectionSchema,
|
|
shardNum: shardNum,
|
|
blockSize: blockSize,
|
|
chunkManager: chunkManager,
|
|
callFlushFunc: flushFunc,
|
|
tsStartPoint: tsStartPoint,
|
|
tsEndPoint: tsEndPoint,
|
|
}
|
|
|
|
return v, nil
|
|
}
|
|
|
|
// constructSegmentHolders builds a list of SegmentFilesHolder, each SegmentFilesHolder represents a segment folder
|
|
// For instance, the insertlogRoot is "backup/bak1/data/insert_log/435978159196147009/435978159196147010".
|
|
// 435978159196147009 is a collection id, 435978159196147010 is a partition id,
|
|
// there is a segment(id is 435978159261483009) under this partition.
|
|
// ListWithPrefix() will return all the insert logs under this partition:
|
|
//
|
|
// "backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/0/435978159903735811"
|
|
// "backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/1/435978159903735812"
|
|
// "backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/100/435978159903735809"
|
|
// "backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/101/435978159903735810"
|
|
//
|
|
// The deltalogRoot is "backup/bak1/data/delta_log/435978159196147009/435978159196147010".
|
|
// Then we get all the delta logs under this partition:
|
|
//
|
|
// "backup/bak1/data/delta_log/435978159196147009/435978159196147010/435978159261483009/434574382554415105"
|
|
//
|
|
// In this function, we will constuct a list of SegmentFilesHolder objects, each SegmentFilesHolder holds the
|
|
// insert logs and delta logs of a segment.
|
|
func (p *BinlogParser) constructSegmentHolders(insertlogRoot string, deltalogRoot string) ([]*SegmentFilesHolder, error) {
|
|
holders := make(map[int64]*SegmentFilesHolder)
|
|
// TODO add context
|
|
insertlogs, _, err := p.chunkManager.ListWithPrefix(context.TODO(), insertlogRoot, true)
|
|
if err != nil {
|
|
log.Error("Binlog parser: list insert logs error", zap.Error(err))
|
|
return nil, fmt.Errorf("failed to list insert logs with root path %s, error: %w", insertlogRoot, err)
|
|
}
|
|
|
|
// collect insert log paths
|
|
log.Info("Binlog parser: list insert logs", zap.Int("logsCount", len(insertlogs)))
|
|
for _, insertlog := range insertlogs {
|
|
log.Info("Binlog parser: mapping insert log to segment", zap.String("insertlog", insertlog))
|
|
filePath := path.Base(insertlog)
|
|
// skip file with prefix '.', such as .success .DS_Store
|
|
if strings.HasPrefix(filePath, ".") {
|
|
log.Debug("file path might not be a real bin log", zap.String("filePath", filePath))
|
|
continue
|
|
}
|
|
fieldPath := path.Dir(insertlog)
|
|
fieldStrID := path.Base(fieldPath)
|
|
fieldID, err := strconv.ParseInt(fieldStrID, 10, 64)
|
|
if err != nil {
|
|
log.Error("Binlog parser: failed to parse field id", zap.String("fieldPath", fieldPath), zap.Error(err))
|
|
return nil, fmt.Errorf("failed to parse field id from insert log path %s, error: %w", insertlog, err)
|
|
}
|
|
|
|
segmentPath := path.Dir(fieldPath)
|
|
segmentStrID := path.Base(segmentPath)
|
|
segmentID, err := strconv.ParseInt(segmentStrID, 10, 64)
|
|
if err != nil {
|
|
log.Error("Binlog parser: failed to parse segment id", zap.String("segmentPath", segmentPath), zap.Error(err))
|
|
return nil, fmt.Errorf("failed to parse segment id from insert log path %s, error: %w", insertlog, err)
|
|
}
|
|
|
|
holder, ok := holders[segmentID]
|
|
if ok {
|
|
holder.fieldFiles[fieldID] = append(holder.fieldFiles[fieldID], insertlog)
|
|
} else {
|
|
holder = &SegmentFilesHolder{
|
|
segmentID: segmentID,
|
|
fieldFiles: make(map[int64][]string),
|
|
deltaFiles: make([]string, 0),
|
|
}
|
|
holder.fieldFiles[fieldID] = make([]string, 0)
|
|
holder.fieldFiles[fieldID] = append(holder.fieldFiles[fieldID], insertlog)
|
|
holders[segmentID] = holder
|
|
}
|
|
}
|
|
|
|
// sort the insert log paths of each field by ascendent sequence
|
|
// there might be several insert logs under a field, for example:
|
|
// 2 insert logs under field a: a_1, a_2
|
|
// 2 insert logs under field b: b_1, b_2
|
|
// the row count of a_1 is equal to b_1, the row count of a_2 is equal to b_2
|
|
// when we read these logs, we firstly read a_1 and b_1, then read a_2 and b_2
|
|
// so, here we must ensure the paths are arranged correctly
|
|
segmentIDs := make([]int64, 0)
|
|
for id, holder := range holders {
|
|
segmentIDs = append(segmentIDs, id)
|
|
for _, v := range holder.fieldFiles {
|
|
sort.Strings(v)
|
|
}
|
|
}
|
|
|
|
// collect delta log paths
|
|
if len(deltalogRoot) > 0 {
|
|
// TODO add context
|
|
deltalogs, _, err := p.chunkManager.ListWithPrefix(context.TODO(), deltalogRoot, true)
|
|
if err != nil {
|
|
log.Error("Binlog parser: failed to list delta logs", zap.Error(err))
|
|
return nil, fmt.Errorf("failed to list delta logs, error: %w", err)
|
|
}
|
|
|
|
log.Info("Binlog parser: list delta logs", zap.Int("logsCount", len(deltalogs)))
|
|
for _, deltalog := range deltalogs {
|
|
log.Info("Binlog parser: mapping delta log to segment", zap.String("deltalog", deltalog))
|
|
segmentPath := path.Dir(deltalog)
|
|
segmentStrID := path.Base(segmentPath)
|
|
segmentID, err := strconv.ParseInt(segmentStrID, 10, 64)
|
|
if err != nil {
|
|
log.Error("Binlog parser: failed to parse segment id", zap.String("segmentPath", segmentPath), zap.Error(err))
|
|
return nil, fmt.Errorf("failed to parse segment id from delta log path %s, error: %w", deltalog, err)
|
|
}
|
|
|
|
// if the segment id doesn't exist, no need to process this deltalog
|
|
holder, ok := holders[segmentID]
|
|
if ok {
|
|
holder.deltaFiles = append(holder.deltaFiles, deltalog)
|
|
}
|
|
}
|
|
}
|
|
|
|
// since the map in golang is not sorted, we sort the segment id array to return holder list with ascending sequence
|
|
sort.Slice(segmentIDs, func(i, j int) bool { return segmentIDs[i] < segmentIDs[j] })
|
|
holdersList := make([]*SegmentFilesHolder, 0)
|
|
for _, id := range segmentIDs {
|
|
holdersList = append(holdersList, holders[id])
|
|
}
|
|
|
|
return holdersList, nil
|
|
}
|
|
|
|
func (p *BinlogParser) parseSegmentFiles(segmentHolder *SegmentFilesHolder) error {
|
|
if segmentHolder == nil {
|
|
log.Error("Binlog parser: segment files holder is nil")
|
|
return errors.New("segment files holder is nil")
|
|
}
|
|
|
|
adapter, err := NewBinlogAdapter(p.ctx, p.collectionSchema, p.shardNum, p.blockSize,
|
|
MaxTotalSizeInMemory, p.chunkManager, p.callFlushFunc, p.tsStartPoint, p.tsEndPoint)
|
|
if err != nil {
|
|
log.Error("Binlog parser: failed to create binlog adapter", zap.Error(err))
|
|
return fmt.Errorf("failed to create binlog adapter, error: %w", err)
|
|
}
|
|
|
|
return adapter.Read(segmentHolder)
|
|
}
|
|
|
|
// Parse requires two paths:
|
|
// 1. the insert log path of a partition
|
|
// 2. the delta log path of a partiion (optional)
|
|
func (p *BinlogParser) Parse(filePaths []string) error {
|
|
if len(filePaths) != 1 && len(filePaths) != 2 {
|
|
log.Error("Binlog parser: illegal paths for binlog import, partition binlog path and delta path are required")
|
|
return errors.New("illegal paths for binlog import, partition binlog path and delta path are required")
|
|
}
|
|
|
|
insertlogPath := filePaths[0]
|
|
deltalogPath := ""
|
|
if len(filePaths) == 2 {
|
|
deltalogPath = filePaths[1]
|
|
}
|
|
log.Info("Binlog parser: target paths",
|
|
zap.String("insertlogPath", insertlogPath),
|
|
zap.String("deltalogPath", deltalogPath))
|
|
|
|
segmentHolders, err := p.constructSegmentHolders(insertlogPath, deltalogPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
updateProgress := func(readBatch int) {
|
|
if p.updateProgressFunc != nil && len(segmentHolders) != 0 {
|
|
percent := (readBatch * ProgressValueForPersist) / len(segmentHolders)
|
|
log.Debug("Binlog parser: working progress", zap.Int("readBatch", readBatch),
|
|
zap.Int("totalBatchCount", len(segmentHolders)), zap.Int("percent", percent))
|
|
p.updateProgressFunc(int64(percent))
|
|
}
|
|
}
|
|
|
|
for i, segmentHolder := range segmentHolders {
|
|
err = p.parseSegmentFiles(segmentHolder)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
updateProgress(i + 1)
|
|
|
|
// trigger gb after each segment finished
|
|
triggerGC()
|
|
}
|
|
|
|
return nil
|
|
}
|