milvus/internal/util/importutil/parquet_column_reader.go

80 lines
2.5 KiB
Go
Raw Normal View History

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package importutil
import (
"fmt"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"go.uber.org/zap"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
)
// ParquetColumnReader bundles the pqarrow reader for a single parquet column
// with metadata describing the field that the column is read into.
type ParquetColumnReader struct {
// fieldName is the name of the field; ReadArrayData includes it in its log
// and error messages when the column data is not a list.
fieldName string
// fieldID is presumably the schema ID of the field -- TODO confirm against
// the code that constructs this struct.
fieldID int64
// columnIndex is presumably the position of the column inside the parquet
// file -- TODO confirm.
columnIndex int
// NOTE(review): commented-out field below is kept as-is; drop it if it is
// no longer planned.
// columnSchema *parquet.SchemaElement
// dataType is the schemapb data type of the field.
dataType schemapb.DataType
// elementType is presumably the element type used when dataType is an
// array -- TODO confirm.
elementType schemapb.DataType
// columnReader is the underlying reader; ReadData and ReadArrayData call
// NextBatch on it.
columnReader *pqarrow.ColumnReader
// dimension is presumably the vector dimension for vector fields -- TODO
// confirm.
dimension int
}
// ReadData requests the next batch of count values from pcr's column reader,
// converts every chunk of the returned batch with getDataFunc, and returns the
// per-chunk results concatenated in chunk order.
//
// Parameters:
//   - pcr: the column reader wrapper; its columnReader must be non-nil.
//   - count: the batch size passed to NextBatch.
//   - getDataFunc: converts one Arrow chunk into a slice of T.
//
// Any error from NextBatch or getDataFunc is returned unchanged together with
// a nil slice; no partial result is returned.
//
// NOTE(review): the *arrow.Chunked returned by NextBatch is not released here.
// Confirm whether slices returned by getDataFunc may alias Arrow buffers
// before adding a Release call.
func ReadData[T any](pcr *ParquetColumnReader, count int64, getDataFunc func(chunk arrow.Array) ([]T, error)) ([]T, error) {
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, err
	}

	// Use the number of rows actually read as the capacity hint rather than
	// the requested count: make panics on a negative capacity, and the batch
	// can hold fewer rows than requested, which would over-allocate.
	data := make([]T, 0, chunked.Len())
	for _, chunk := range chunked.Chunks() {
		chunkData, err := getDataFunc(chunk)
		if err != nil {
			return nil, err
		}
		data = append(data, chunkData...)
	}
	return data, nil
}
// ReadArrayData requests the next batch of count values from pcr's column
// reader, treats every chunk of the returned batch as an Arrow list array, and
// lets getArrayData convert each chunk's offsets and list values into rows of
// []T. The per-chunk rows are returned concatenated in chunk order.
//
// Parameters:
//   - pcr: the column reader wrapper; its columnReader must be non-nil.
//   - count: the batch size passed to NextBatch.
//   - getArrayData: receives the chunk's Offsets() and ListValues() and
//     returns one []T per row.
//
// Errors: an error from NextBatch or getArrayData is returned unchanged. If a
// chunk is not an *array.List, a warning is logged and an import-failed error
// naming the field is returned. In every error case the returned slice is nil.
//
// NOTE(review): the *arrow.Chunked returned by NextBatch is not released here.
// Confirm whether rows returned by getArrayData may alias Arrow buffers before
// adding a Release call.
func ReadArrayData[T any](pcr *ParquetColumnReader, count int64, getArrayData func(offsets []int32, array arrow.Array) ([][]T, error)) ([][]T, error) {
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, err
	}

	// Use the number of rows actually read as the capacity hint rather than
	// the requested count: make panics on a negative capacity, and the batch
	// can hold fewer rows than requested, which would over-allocate.
	arrayData := make([][]T, 0, chunked.Len())
	for _, chunk := range chunked.Chunks() {
		listReader, ok := chunk.(*array.List)
		if !ok {
			log.Warn("the column data in parquet is not array", zap.String("fieldName", pcr.fieldName))
			return nil, merr.WrapErrImportFailed(fmt.Sprintf("the column data in parquet is not array of field: %s", pcr.fieldName))
		}

		offsets := listReader.Offsets()
		chunkData, err := getArrayData(offsets, listReader.ListValues())
		if err != nil {
			return nil, err
		}
		arrayData = append(arrayData, chunkData...)
	}
	return arrayData, nil
}