enhance: Tidy import options (#37077) (#37078)

1. Tidy import options.
2. Tidy common import util functions.

issue: https://github.com/milvus-io/milvus/issues/34150

pr: https://github.com/milvus-io/milvus/pull/37077

---------

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
pull/37142/head
yihao.dai 2024-10-25 14:35:45 +08:00 committed by GitHub
parent 6bc8aba17f
commit ca2057c57d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 180 additions and 58 deletions

View File

@ -1636,19 +1636,12 @@ func (s *Server) ImportV2(ctx context.Context, in *internalpb.ImportRequestInter
zap.Int64("collection", in.GetCollectionID()),
zap.Int64s("partitions", in.GetPartitionIDs()),
zap.Strings("channels", in.GetChannelNames()))
log.Info("receive import request", zap.Any("files", in.GetFiles()))
log.Info("receive import request", zap.Any("files", in.GetFiles()), zap.Any("options", in.GetOptions()))
var timeoutTs uint64 = math.MaxUint64
timeoutStr, err := funcutil.GetAttrByKeyFromRepeatedKV("timeout", in.GetOptions())
if err == nil {
// Specifies the timeout duration for import, such as "300s", "1.5h" or "1h45m".
dur, err := time.ParseDuration(timeoutStr)
if err != nil {
resp.Status = merr.Status(merr.WrapErrImportFailed(fmt.Sprint("parse import timeout failed, err=%w", err)))
return resp, nil
}
curTs := tsoutil.GetCurrentTime()
timeoutTs = tsoutil.AddPhysicalDurationOnTs(curTs, dur)
timeoutTs, err := importutilv2.GetTimeoutTs(in.GetOptions())
if err != nil {
resp.Status = merr.Status(merr.WrapErrImportFailed(err.Error()))
return resp, nil
}
files := in.GetFiles()

View File

@ -6163,6 +6163,7 @@ func (node *Proxy) ImportV2(ctx context.Context, req *internalpb.ImportRequest)
zap.String("partition name", req.GetPartitionName()),
zap.Any("files", req.GetFiles()),
zap.String("role", typeutil.ProxyRole),
zap.Any("options", req.GetOptions()),
)
resp := &internalpb.ImportResponse{

View File

@ -78,3 +78,14 @@ func CheckArrayCapacity(arrLength int, maxCapacity int64) error {
}
return nil
}
// EstimateReadCountPerBatch estimates how many rows should be read per batch
// so that a batch of maximum-size records fits into bufferSize bytes.
//
// The per-record size comes from typeutil.EstimateMaxSizePerRecord; any error
// it reports is returned unchanged together with a zero count. On success the
// result is always in the range [1, 1000]:
//   - 1000 when a thousand records fit into the buffer (or when the size
//     estimate is not positive, so no meaningful division is possible);
//   - bufferSize / sizePerRecord otherwise, but never less than 1, so callers
//     never receive a zero or negative batch size.
func EstimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	// Upper bound on the number of rows returned for a single batch.
	const maxCountPerBatch = 1000

	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	// A non-positive estimate must not reach the division below: dividing by
	// zero panics, and a negative divisor yields a meaningless count.
	if sizePerRecord <= 0 || maxCountPerBatch*sizePerRecord <= bufferSize {
		return maxCountPerBatch, nil
	}
	count := int64(bufferSize) / int64(sizePerRecord)
	if count < 1 {
		// The buffer is smaller than one record (or not positive); still read
		// one row at a time instead of returning an unusable batch size.
		return 1, nil
	}
	return count, nil
}

View File

@ -0,0 +1,68 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package common
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/common"
)
// TestUtil_EstimateReadCountPerBatch covers two paths of
// EstimateReadCountPerBatch: a schema for which the expected batch size is
// 1000 rows with a 16 MiB buffer, and a schema whose vector dimension cannot
// be parsed, for which an error is expected.
func TestUtil_EstimateReadCountPerBatch(t *testing.T) {
	const bufferSize = 16 * 1024 * 1024

	pkField := &schemapb.FieldSchema{
		FieldID:      100,
		Name:         "pk",
		IsPrimaryKey: true,
		DataType:     schemapb.DataType_Int64,
	}
	vecField := &schemapb.FieldSchema{
		FieldID:  101,
		Name:     "vec",
		DataType: schemapb.DataType_FloatVector,
		TypeParams: []*commonpb.KeyValuePair{
			{Key: common.DimKey, Value: "128"},
		},
	}
	schema := &schemapb.CollectionSchema{
		Fields: []*schemapb.FieldSchema{pkField, vecField},
	}

	// Valid schema: the estimate must succeed and report 1000 rows.
	rows, err := EstimateReadCountPerBatch(bufferSize, schema)
	assert.NoError(t, err)
	assert.Equal(t, int64(1000), rows)

	// Adding a vector field with a non-numeric dimension must surface an error.
	invalidVecField := &schemapb.FieldSchema{
		FieldID:  102,
		Name:     "vec2",
		DataType: schemapb.DataType_FloatVector,
		TypeParams: []*commonpb.KeyValuePair{
			{Key: common.DimKey, Value: "invalidDim"},
		},
	}
	schema.Fields = append(schema.Fields, invalidVecField)
	_, err = EstimateReadCountPerBatch(bufferSize, schema)
	assert.Error(t, err)
}

View File

@ -27,8 +27,8 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
const (
@ -58,7 +58,7 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co
if err != nil {
return nil, merr.WrapErrImportFailed(fmt.Sprintf("read json file failed, path=%s, err=%s", path, err.Error()))
}
count, err := estimateReadCountPerBatch(bufferSize, schema)
count, err := common.EstimateReadCountPerBatch(bufferSize, schema)
if err != nil {
return nil, err
}
@ -181,14 +181,3 @@ func (j *reader) Size() (int64, error) {
}
// Close is a no-op: its body is empty, so calling it has no effect.
func (j *reader) Close() {}
// estimateReadCountPerBatch estimates the number of rows to read per batch so
// that the batch fits into bufferSize bytes, based on the maximum record size
// reported by typeutil.EstimateMaxSizePerRecord for the schema.
//
// An error from the size estimation is returned as-is with a zero count.
// Otherwise the result is capped at 1000 rows and is never smaller than 1, so
// an undersized buffer still yields a usable batch size.
func estimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	// Skip the division for a non-positive estimate: a zero divisor would
	// panic and a negative one would produce a meaningless count.
	if sizePerRecord <= 0 || 1000*sizePerRecord <= bufferSize {
		return 1000, nil
	}
	rows := int64(bufferSize) / int64(sizePerRecord)
	if rows < 1 {
		// Buffer smaller than a single record: fall back to one row per batch.
		rows = 1
	}
	return rows, nil
}

View File

@ -396,7 +396,9 @@ func (r *rowParser) parseEntity(fieldID int64, obj any) (any, error) {
}
case schemapb.DataType_Array:
arr, ok := obj.([]interface{})
if !ok {
return nil, r.wrapTypeError(obj, fieldID)
}
maxCapacity, err := parameterutil.GetMaxCapacity(r.id2Field[fieldID])
if err != nil {
return nil, err
@ -404,9 +406,6 @@ func (r *rowParser) parseEntity(fieldID int64, obj any) (any, error) {
if err = common.CheckArrayCapacity(len(arr), maxCapacity); err != nil {
return nil, err
}
if !ok {
return nil, r.wrapTypeError(obj, fieldID)
}
scalarFieldData, err := r.arrayToFieldData(arr, r.id2Field[fieldID].GetElementType())
if err != nil {
return nil, err

View File

@ -48,7 +48,7 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co
fields := lo.KeyBy(schema.GetFields(), func(field *schemapb.FieldSchema) int64 {
return field.GetFieldID()
})
count, err := calcRowCount(bufferSize, schema)
count, err := common.EstimateReadCountPerBatch(bufferSize, schema)
if err != nil {
return nil, err
}

View File

@ -30,7 +30,6 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
var (
@ -241,12 +240,3 @@ func validateHeader(npyReader *npy.Reader, field *schemapb.FieldSchema, dim int)
}
return nil
}
// calcRowCount estimates how many rows fit into bufferSize bytes, using the
// maximum record size reported by typeutil.EstimateMaxSizePerRecord.
//
// An error from the size estimation is returned as-is with a zero count.
// Otherwise the returned row count is at least 1.
func calcRowCount(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	// Guard the division: a zero estimate would panic with a divide-by-zero,
	// so a non-positive size is treated as one byte per record.
	if sizePerRecord < 1 {
		sizePerRecord = 1
	}
	rowCount := int64(bufferSize) / int64(sizePerRecord)
	if rowCount < 1 {
		// The buffer is smaller than one record (or not positive); report a
		// single row rather than a zero or negative count.
		rowCount = 1
	}
	return rowCount, nil
}

View File

@ -21,6 +21,7 @@ import (
"math"
"strconv"
"strings"
"time"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus/pkg/util/funcutil"
@ -29,17 +30,45 @@ import (
)
const (
StartTs = "start_ts"
StartTs2 = "startTs"
EndTs = "end_ts"
EndTs2 = "endTs"
// Timeout specifies the timeout duration for import, such as "300s", "1.5h" or "1h45m".
Timeout = "timeout"
// SkipDQC indicates whether to bypass the disk quota check; defaults to false.
SkipDQC = "skip_disk_quota_check"
)
// Options for backup-restore mode.
const (
// BackupFlag indicates whether the import is in backup-restore mode; defaults to false.
BackupFlag = "backup"
L0Import = "l0_import"
SkipDQC = "skip_disk_quota_check"
// L0Import indicates whether to import l0 segments only.
L0Import = "l0_import"
// StartTs, StartTs2, EndTs and EndTs2 are used to filter data during backup-restore import.
StartTs = "start_ts"
StartTs2 = "startTs"
EndTs = "end_ts"
EndTs2 = "endTs"
)
// Options is a list of key-value pairs; GetTimeoutTs and ParseTimeRange read
// their settings from it.
type Options []*commonpb.KeyValuePair
// GetTimeoutTs resolves the Timeout option into a timestamp.
//
// When the Timeout key cannot be fetched from options, math.MaxUint64 is
// returned with a nil error. Otherwise the value is parsed with
// time.ParseDuration (e.g. "300s", "1.5h" or "1h45m") and added to the current
// time via tsoutil. A value that fails to parse yields a zero timestamp and a
// wrapped parse error.
func GetTimeoutTs(options Options) (uint64, error) {
	rawTimeout, lookupErr := funcutil.GetAttrByKeyFromRepeatedKV(Timeout, options)
	if lookupErr != nil {
		// The lookup error is deliberately not propagated; the maximum
		// timestamp is returned instead.
		return math.MaxUint64, nil
	}

	timeout, parseErr := time.ParseDuration(rawTimeout)
	if parseErr != nil {
		return 0, fmt.Errorf("parse timeout failed, err=%w", parseErr)
	}

	now := tsoutil.GetCurrentTime()
	return tsoutil.AddPhysicalDurationOnTs(now, timeout), nil
}
func ParseTimeRange(options Options) (uint64, uint64, error) {
importOptions := funcutil.KeyValuePair2Map(options)
getTimestamp := func(defaultValue uint64, targetKeys ...string) (uint64, error) {

View File

@ -0,0 +1,53 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package importutilv2
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus/pkg/util/tsoutil"
)
// TestOption_GetTimeout verifies that GetTimeoutTs converts valid duration
// strings into a timestamp close to "now + duration" and rejects a value that
// is not a duration.
func TestOption_GetTimeout(t *testing.T) {
	// Tolerance between the expected deadline and the one derived from the
	// returned timestamp.
	const delta = 3 * time.Second

	validCases := []struct {
		value    string
		expected time.Duration
	}{
		{value: "300s", expected: 300 * time.Second},
		{value: "1.5h", expected: 90 * time.Minute},
		{value: "1h45m", expected: 105 * time.Minute},
	}
	for _, tc := range validCases {
		opts := []*commonpb.KeyValuePair{{Key: Timeout, Value: tc.value}}
		ts, err := GetTimeoutTs(opts)
		assert.NoError(t, err)
		deadline := tsoutil.PhysicalTime(ts)
		assert.WithinDuration(t, time.Now().Add(tc.expected), deadline, delta)
	}

	// A value that is not a duration must produce an error.
	invalidOpts := []*commonpb.KeyValuePair{{Key: Timeout, Value: "invalidTime"}}
	_, err := GetTimeoutTs(invalidOpts)
	assert.Error(t, err)
}

View File

@ -74,7 +74,7 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co
if err != nil {
return nil, err
}
count, err := estimateReadCountPerBatch(bufferSize, schema)
count, err := common.EstimateReadCountPerBatch(bufferSize, schema)
if err != nil {
return nil, err
}

View File

@ -250,14 +250,3 @@ func isSchemaEqual(schema *schemapb.CollectionSchema, arrSchema *arrow.Schema) e
}
return nil
}
// estimateReadCountPerBatch estimates the per-batch row count for a buffer of
// bufferSize bytes, using the maximum record size that
// typeutil.EstimateMaxSizePerRecord reports for the schema.
//
// A size-estimation error is returned unchanged with a zero count. On success
// the count is at most 1000 and at least 1.
func estimateReadCountPerBatch(bufferSize int, schema *schemapb.CollectionSchema) (int64, error) {
	const maxRows = 1000

	sizePerRecord, err := typeutil.EstimateMaxSizePerRecord(schema)
	if err != nil {
		return 0, err
	}
	switch {
	case sizePerRecord <= 0:
		// Never divide by a non-positive estimate (a zero divisor panics).
		return maxRows, nil
	case maxRows*sizePerRecord <= bufferSize:
		return maxRows, nil
	}
	if rows := int64(bufferSize) / int64(sizePerRecord); rows >= 1 {
		return rows, nil
	}
	// The buffer cannot hold even one record; read a single row per batch
	// instead of returning a zero or negative count.
	return 1, nil
}