fix: Make the dynamic column optional in parquet import (#32738) (#32802)

issue: https://github.com/milvus-io/milvus/issues/32729

pr: https://github.com/milvus-io/milvus/pull/32738

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
pull/32823/head
yihao.dai 2024-05-07 17:07:31 +08:00 committed by GitHub
parent 46acc07cef
commit 54e00db012
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 70 additions and 43 deletions

View File

@ -0,0 +1,60 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package common
import (
	"fmt"

	"github.com/samber/lo"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)
// FillDynamicData pads the dynamic-field column of data with empty JSON
// objects ("{}") until it has as many rows as getInsertDataRowNum reports
// for the non-dynamic columns.
//
// It does nothing and returns nil when the schema does not enable the dynamic
// field, or when typeutil.GetDynamicField finds no dynamic field in the schema.
//
// It returns an error, leaving data unchanged, when data holds no column for
// the dynamic field or when that column is not a *storage.JSONFieldData.
func FillDynamicData(data *storage.InsertData, schema *schemapb.CollectionSchema) error {
	if !schema.GetEnableDynamicField() {
		return nil
	}
	dynamicField := typeutil.GetDynamicField(schema)
	if dynamicField == nil {
		return nil
	}

	fieldID := dynamicField.GetFieldID()
	// Checked assertion: a missing column (nil interface) or a column of a
	// different type would otherwise panic instead of surfacing as an error.
	jsonFD, ok := data.Data[fieldID].(*storage.JSONFieldData)
	if !ok || jsonFD == nil {
		return fmt.Errorf("dynamic field %d: expected JSON field data, got %T", fieldID, data.Data[fieldID])
	}

	// Every padded row references the same "{}" byte slice.
	emptyJSON := []byte("{}")
	missing := getInsertDataRowNum(data, schema) - jsonFD.RowNum()
	// missing is <= 0 when the column is already long enough; the loop is then skipped.
	for i := 0; i < missing; i++ {
		jsonFD.Data = append(jsonFD.Data, emptyJSON)
	}
	// jsonFD is the pointer already stored in data.Data, so no write-back is needed.
	return nil
}
// getInsertDataRowNum returns the row count of a non-dynamic column in data
// that is not empty, or 0 when every non-dynamic column is empty. Columns
// whose schema field is marked dynamic are ignored.
func getInsertDataRowNum(data *storage.InsertData, schema *schemapb.CollectionSchema) int {
	schemaByID := lo.KeyBy(schema.GetFields(), func(f *schemapb.FieldSchema) int64 {
		return f.GetFieldID()
	})
	for id, column := range data.Data {
		// Test the dynamic flag first so RowNum is never called on the dynamic column.
		if !schemaByID[id].GetIsDynamic() {
			if rows := column.RowNum(); rows != 0 {
				return rows
			}
		}
	}
	return 0
}

View File

@ -28,6 +28,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
"github.com/milvus-io/milvus/pkg/util/merr"
)
@ -93,7 +94,7 @@ func (r *reader) Read() (*storage.InsertData, error) {
return nil, err
}
}
err = fillDynamicData(insertData, r.schema)
err = common.FillDynamicData(insertData, r.schema)
if err != nil {
return nil, err
}

View File

@ -24,13 +24,11 @@ import (
"strconv"
"unicode/utf8"
"github.com/samber/lo"
"github.com/sbinet/npyio"
"github.com/sbinet/npyio/npy"
"golang.org/x/text/encoding/unicode"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -252,38 +250,3 @@ func calcRowCount(bufferSize int, schema *schemapb.CollectionSchema) (int64, err
rowCount := int64(bufferSize) / int64(sizePerRecord)
return rowCount, nil
}
// fillDynamicData pads the dynamic-field column of data with empty JSON
// objects ("{}") until it has as many rows as getInsertDataRowNum reports
// for the non-dynamic columns.
//
// It does nothing and returns nil when the schema does not enable the dynamic
// field, or when typeutil.GetDynamicField finds no dynamic field in the schema.
//
// It returns an import-failed error, leaving data unchanged, when data holds
// no column for the dynamic field or when that column is not a
// *storage.JSONFieldData.
func fillDynamicData(data *storage.InsertData, schema *schemapb.CollectionSchema) error {
	if !schema.GetEnableDynamicField() {
		return nil
	}
	dynamicField := typeutil.GetDynamicField(schema)
	if dynamicField == nil {
		return nil
	}

	// Checked assertion: a missing column (nil interface) or a column of a
	// different type would otherwise panic instead of surfacing as an error.
	jsonFD, ok := data.Data[dynamicField.GetFieldID()].(*storage.JSONFieldData)
	if !ok || jsonFD == nil {
		return merr.WrapErrImportFailed("dynamic field data is missing or is not JSON field data")
	}

	// Every padded row references the same "{}" byte slice.
	emptyJSON := []byte("{}")
	missing := getInsertDataRowNum(data, schema) - jsonFD.RowNum()
	// missing is <= 0 when the column is already long enough; the loop is then skipped.
	for i := 0; i < missing; i++ {
		jsonFD.Data = append(jsonFD.Data, emptyJSON)
	}
	// jsonFD is the pointer already stored in data.Data, so no write-back is needed.
	return nil
}
// getInsertDataRowNum scans the columns of data and returns the first
// non-zero row count found among columns that are not marked dynamic in the
// schema. It returns 0 if no such column has rows.
func getInsertDataRowNum(data *storage.InsertData, schema *schemapb.CollectionSchema) int {
	byID := lo.KeyBy(schema.GetFields(), func(fs *schemapb.FieldSchema) int64 {
		return fs.GetFieldID()
	})
	for id, col := range data.Data {
		// Skip the dynamic column before asking it for a row count.
		if byID[id].GetIsDynamic() {
			continue
		}
		n := col.RowNum()
		if n == 0 {
			continue
		}
		return n
	}
	return 0
}

View File

@ -30,6 +30,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
)
@ -119,6 +120,10 @@ OUTER:
return nil, io.EOF
}
}
err = common.FillDynamicData(insertData, r.schema)
if err != nil {
return nil, err
}
return insertData, nil
}

View File

@ -239,6 +239,9 @@ func isSchemaEqual(schema *schemapb.CollectionSchema, arrSchema *arrow.Schema) e
}
arrField, ok := arrNameToField[field.GetName()]
if !ok {
if field.GetIsDynamic() {
continue
}
return merr.WrapErrImportFailed(fmt.Sprintf("field '%s' not in arrow schema", field.GetName()))
}
toArrDataType, err := convertToArrowDataType(field, false)

View File

@ -120,11 +120,6 @@ func (s *BulkInsertSuite) testImportDynamicField() {
}
case importutilv2.Parquet:
filePath := fmt.Sprintf("/tmp/test_%d.parquet", rand.Int())
schema.Fields = append(schema.Fields, &schemapb.FieldSchema{
FieldID: 102,
Name: "$meta",
DataType: schemapb.DataType_JSON,
})
err = GenerateParquetFile(filePath, schema, rowCount)
s.NoError(err)
defer os.Remove(filePath)