// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package numpy

import (
	"context"
	"fmt"
	"io"
	"path/filepath"
	"strings"

	"github.com/samber/lo"
	"go.uber.org/atomic"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/util/importutilv2/common"
	"github.com/milvus-io/milvus/pkg/util/merr"
)
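
// reader reads column data for a collection from a set of numpy files,
// one file per field.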
type reader struct {
	ctx    context.Context
	cm     storage.ChunkManager
	schema *schemapb.CollectionSchema

	fileSize *atomic.Int64
	paths    []string

	count int64
	frs   map[int64]*FieldReader // fieldID -> FieldReader
}
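
// NewReader creates a reader over the given numpy files. It rejects schemas
// with nullable fields, builds one FieldReader per field, and derives the
// per-batch row count from bufferSize.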
func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.CollectionSchema, paths []string, bufferSize int) (*reader, error) {
	for _, fieldSchema := range schema.Fields {
		if fieldSchema.GetNullable() {
			return nil, merr.WrapErrParameterInvalidMsg(fmt.Sprintf("bulk insert from numpy files is not supported for field(%s) because it is set to nullable == true", fieldSchema.GetName()))
		}
	}
	fields := lo.KeyBy(schema.GetFields(), func(field *schemapb.FieldSchema) int64 {
		return field.GetFieldID()
	})
	count, err := calcRowCount(bufferSize, schema)
	if err != nil {
		return nil, err
	}
	crs := make(map[int64]*FieldReader)
	readers, err := CreateReaders(ctx, cm, schema, paths)
	if err != nil {
		return nil, err
	}
	for fieldID, r := range readers {
		cr, err := NewFieldReader(r, fields[fieldID])
		if err != nil {
			return nil, err
		}
		crs[fieldID] = cr
	}
	return &reader{
		ctx:      ctx,
		cm:       cm,
		schema:   schema,
		fileSize: atomic.NewInt64(0),
		paths:    paths,
		count:    count,
		frs:      crs,
	}, nil
}
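
// Read pulls the next batch of rows from every field reader and assembles
// them into a storage.InsertData, filling the dynamic field if needed.
// It returns io.EOF once any field reader is exhausted.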
func (r *reader) Read() (*storage.InsertData, error) {
	insertData, err := storage.NewInsertData(r.schema)
	if err != nil {
		return nil, err
	}
	for fieldID, cr := range r.frs {
		var data any
		data, err = cr.Next(r.count)
		if err != nil {
			return nil, err
		}
		if data == nil {
			return nil, io.EOF
		}
		err = insertData.Data[fieldID].AppendRows(data, nil)
		if err != nil {
			return nil, err
		}
	}
	err = common.FillDynamicData(insertData, r.schema)
	if err != nil {
		return nil, err
	}
	return insertData, nil
}
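
// Size returns the total size in bytes of the numpy files, caching the
// result after the first lookup.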
func (r *reader) Size() (int64, error) {
	if size := r.fileSize.Load(); size != 0 {
		return size, nil
	}
	size, err := storage.GetFilesSize(r.ctx, r.paths, r.cm)
	if err != nil {
		return 0, err
	}
	r.fileSize.Store(size)
	return size, nil
}
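
// Close releases all field readers.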
func (r *reader) Close() {
	for _, cr := range r.frs {
		cr.Close()
	}
}
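
// CreateReaders opens one io.Reader per field, matching each field by the
// base name (without extension) of the provided file paths. An auto-generated
// primary key must not have a file; a missing file is tolerated only for the
// dynamic field.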
func CreateReaders(ctx context.Context, cm storage.ChunkManager, schema *schemapb.CollectionSchema, paths []string) (map[int64]io.Reader, error) {
	readers := make(map[int64]io.Reader)
	// Map each file's base name without its extension to its full path,
	// e.g. "/tmp/pk.npy" is keyed as "pk".
	nameToPath := lo.SliceToMap(paths, func(path string) (string, string) {
		nameWithExt := filepath.Base(path)
		name := strings.TrimSuffix(nameWithExt, filepath.Ext(nameWithExt))
		return name, path
	})
	for _, field := range schema.GetFields() {
		if field.GetIsPrimaryKey() && field.GetAutoID() {
			if _, ok := nameToPath[field.GetName()]; ok {
				return nil, merr.WrapErrImportFailed(
					fmt.Sprintf("the primary key '%s' is auto-generated, no need to provide a file for it", field.GetName()))
			}
			continue
		}
		if _, ok := nameToPath[field.GetName()]; !ok {
			if field.GetIsDynamic() {
				continue
			}
			return nil, merr.WrapErrImportFailed(
				fmt.Sprintf("no file for field: %s, files: %v", field.GetName(), lo.Values(nameToPath)))
		}
		reader, err := cm.Reader(ctx, nameToPath[field.GetName()])
		if err != nil {
			return nil, merr.WrapErrImportFailed(
				fmt.Sprintf("failed to read the file '%s', error: %s", nameToPath[field.GetName()], err.Error()))
		}
		readers[field.GetFieldID()] = reader
	}
	return readers, nil
}