mirror of https://github.com/milvus-io/milvus.git
80 lines
2.5 KiB
Go
80 lines
2.5 KiB
Go
|
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
||
|
|
||
|
package importutil
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
|
||
|
"github.com/apache/arrow/go/v12/arrow"
|
||
|
"github.com/apache/arrow/go/v12/arrow/array"
|
||
|
"github.com/apache/arrow/go/v12/parquet/pqarrow"
|
||
|
"go.uber.org/zap"
|
||
|
|
||
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||
|
"github.com/milvus-io/milvus/pkg/log"
|
||
|
"github.com/milvus-io/milvus/pkg/util/merr"
|
||
|
)
|
||
|
|
||
|
type ParquetColumnReader struct {
|
||
|
fieldName string
|
||
|
fieldID int64
|
||
|
columnIndex int
|
||
|
// columnSchema *parquet.SchemaElement
|
||
|
dataType schemapb.DataType
|
||
|
elementType schemapb.DataType
|
||
|
columnReader *pqarrow.ColumnReader
|
||
|
dimension int
|
||
|
}
|
||
|
|
||
|
func ReadData[T any](pcr *ParquetColumnReader, count int64, getDataFunc func(chunk arrow.Array) ([]T, error)) ([]T, error) {
|
||
|
chunked, err := pcr.columnReader.NextBatch(count)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
data := make([]T, 0, count)
|
||
|
for _, chunk := range chunked.Chunks() {
|
||
|
chunkData, err := getDataFunc(chunk)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
data = append(data, chunkData...)
|
||
|
}
|
||
|
return data, nil
|
||
|
}
|
||
|
|
||
|
func ReadArrayData[T any](pcr *ParquetColumnReader, count int64, getArrayData func(offsets []int32, array arrow.Array) ([][]T, error)) ([][]T, error) {
|
||
|
chunked, err := pcr.columnReader.NextBatch(count)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
arrayData := make([][]T, 0, count)
|
||
|
for _, chunk := range chunked.Chunks() {
|
||
|
listReader, ok := chunk.(*array.List)
|
||
|
if !ok {
|
||
|
log.Warn("the column data in parquet is not array", zap.String("fieldName", pcr.fieldName))
|
||
|
return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not array of field: %s", pcr.fieldName))
|
||
|
}
|
||
|
offsets := listReader.Offsets()
|
||
|
chunkData, err := getArrayData(offsets, listReader.ListValues())
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
arrayData = append(arrayData, chunkData...)
|
||
|
}
|
||
|
return arrayData, nil
|
||
|
}
|