2023-11-29 12:52:27 +00:00
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package importutil
import (
"fmt"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"go.uber.org/zap"
2023-12-12 08:14:44 +00:00
"golang.org/x/exp/constraints"
2023-11-29 12:52:27 +00:00
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
2023-12-12 08:14:44 +00:00
"github.com/milvus-io/milvus/pkg/util/typeutil"
2023-11-29 12:52:27 +00:00
)
type ParquetColumnReader struct {
fieldName string
fieldID int64
columnIndex int
// columnSchema *parquet.SchemaElement
dataType schemapb . DataType
elementType schemapb . DataType
columnReader * pqarrow . ColumnReader
dimension int
}
2023-12-12 08:14:44 +00:00
func ReadBoolData ( pcr * ParquetColumnReader , count int64 ) ( [ ] bool , error ) {
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
data := make ( [ ] bool , 0 , count )
for _ , chunk := range chunked . Chunks ( ) {
dataNums := chunk . Data ( ) . Len ( )
chunkData := make ( [ ] bool , dataNums )
boolReader , ok := chunk . ( * array . Boolean )
if ! ok {
log . Warn ( "the column data in parquet is not bool" , zap . String ( "fieldName" , pcr . fieldName ) , zap . String ( "actual type" , chunk . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not bool of field: %s, but: %s" , pcr . fieldName , chunk . DataType ( ) . Name ( ) ) )
}
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = boolReader . Value ( i )
}
data = append ( data , chunkData ... )
}
return data , nil
}
func ReadIntegerOrFloatData [ T constraints . Integer | constraints . Float ] ( pcr * ParquetColumnReader , count int64 ) ( [ ] T , error ) {
2023-11-29 12:52:27 +00:00
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
data := make ( [ ] T , 0 , count )
for _ , chunk := range chunked . Chunks ( ) {
2023-12-12 08:14:44 +00:00
dataNums := chunk . Data ( ) . Len ( )
chunkData := make ( [ ] T , dataNums )
switch chunk . DataType ( ) . ID ( ) {
case arrow . INT8 :
int8Reader := chunk . ( * array . Int8 )
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = T ( int8Reader . Value ( i ) )
}
case arrow . INT16 :
int16Reader := chunk . ( * array . Int16 )
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = T ( int16Reader . Value ( i ) )
}
case arrow . INT32 :
int32Reader := chunk . ( * array . Int32 )
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = T ( int32Reader . Value ( i ) )
}
case arrow . INT64 :
int64Reader := chunk . ( * array . Int64 )
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = T ( int64Reader . Value ( i ) )
}
case arrow . FLOAT32 :
float32Reader := chunk . ( * array . Float32 )
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = T ( float32Reader . Value ( i ) )
}
case arrow . FLOAT64 :
float64Reader := chunk . ( * array . Float64 )
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = T ( float64Reader . Value ( i ) )
}
default :
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data type is not integer, neither float, but: %s" , chunk . DataType ( ) . Name ( ) ) )
}
data = append ( data , chunkData ... )
}
return data , nil
}
func ReadStringData ( pcr * ParquetColumnReader , count int64 ) ( [ ] string , error ) {
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
data := make ( [ ] string , 0 , count )
for _ , chunk := range chunked . Chunks ( ) {
dataNums := chunk . Data ( ) . Len ( )
chunkData := make ( [ ] string , dataNums )
stringReader , ok := chunk . ( * array . String )
if ! ok {
log . Warn ( "the column data in parquet is not string" , zap . String ( "fieldName" , pcr . fieldName ) , zap . String ( "actual type" , chunk . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not string of field: %s, but: %s" , pcr . fieldName , chunk . DataType ( ) . Name ( ) ) )
}
for i := 0 ; i < dataNums ; i ++ {
chunkData [ i ] = stringReader . Value ( i )
2023-11-29 12:52:27 +00:00
}
data = append ( data , chunkData ... )
}
return data , nil
}
2023-12-12 08:14:44 +00:00
func ReadBinaryData ( pcr * ParquetColumnReader , count int64 ) ( [ ] byte , error ) {
2023-11-29 12:52:27 +00:00
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
2023-12-12 08:14:44 +00:00
data := make ( [ ] byte , 0 , count )
for _ , chunk := range chunked . Chunks ( ) {
dataNums := chunk . Data ( ) . Len ( )
switch chunk . DataType ( ) . ID ( ) {
case arrow . BINARY :
binaryReader := chunk . ( * array . Binary )
for i := 0 ; i < dataNums ; i ++ {
data = append ( data , binaryReader . Value ( i ) ... )
}
case arrow . LIST :
listReader := chunk . ( * array . List )
if ! checkVectorIsRegular ( listReader . Offsets ( ) , pcr . dimension , true ) {
log . Warn ( "Parquet parser: binary vector is irregular" , zap . Int ( "dim" , pcr . dimension ) , zap . Int32s ( "offsets" , listReader . Offsets ( ) ) )
return nil , merr . WrapErrImportFailed ( "binary vector is irregular" )
}
uint8Reader , ok := listReader . ListValues ( ) . ( * array . Uint8 )
if ! ok {
log . Warn ( "the column element data of array in parquet is not binary" , zap . String ( "fieldName" , pcr . fieldName ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column element data of array in parquet is not binary: %s" , pcr . fieldName ) )
}
for i := 0 ; i < uint8Reader . Len ( ) ; i ++ {
data = append ( data , uint8Reader . Value ( i ) )
}
default :
log . Warn ( "the column element data of array in parquet is not binary" , zap . String ( "fieldName" , pcr . fieldName ) , zap . String ( "actual data type" , chunk . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column element data of array in parquet is not binary: %s, it's: %s" , pcr . fieldName , chunk . DataType ( ) . Name ( ) ) )
}
}
return data , nil
}
func checkVectorIsRegular ( offsets [ ] int32 , dim int , isBinary bool ) bool {
if len ( offsets ) < 1 {
return false
}
if isBinary {
dim = dim / 8
}
start := offsets [ 0 ]
for i := 1 ; i < len ( offsets ) ; i ++ {
if offsets [ i ] - start != int32 ( dim ) {
return false
}
start = offsets [ i ]
}
return true
}
func ReadBoolArrayData ( pcr * ParquetColumnReader , count int64 ) ( [ ] [ ] bool , error ) {
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
data := make ( [ ] [ ] bool , 0 , count )
2023-11-29 12:52:27 +00:00
for _ , chunk := range chunked . Chunks ( ) {
listReader , ok := chunk . ( * array . List )
if ! ok {
2023-12-12 08:14:44 +00:00
log . Warn ( "the column data in parquet is not list" , zap . String ( "fieldName" , pcr . fieldName ) , zap . String ( "actual type" , chunk . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not list of field: %s, but: %s" , pcr . fieldName , chunk . DataType ( ) . Name ( ) ) )
}
boolReader , ok := listReader . ListValues ( ) . ( * array . Boolean )
if ! ok {
log . Warn ( "the column data in parquet is not bool array" , zap . String ( "fieldName" , pcr . fieldName ) ,
zap . String ( "actual type" , listReader . ListValues ( ) . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not bool array of field: %s, but: %s list" , pcr . fieldName , listReader . ListValues ( ) . DataType ( ) . Name ( ) ) )
2023-11-29 12:52:27 +00:00
}
offsets := listReader . Offsets ( )
2023-12-12 08:14:44 +00:00
for i := 1 ; i < len ( offsets ) ; i ++ {
start , end := offsets [ i - 1 ] , offsets [ i ]
2023-12-27 07:30:46 +00:00
elementData := make ( [ ] bool , 0 , end - start )
2023-12-12 08:14:44 +00:00
for j := start ; j < end ; j ++ {
elementData = append ( elementData , boolReader . Value ( int ( j ) ) )
}
data = append ( data , elementData )
}
}
return data , nil
}
func ReadIntegerOrFloatArrayData [ T constraints . Integer | constraints . Float ] ( pcr * ParquetColumnReader , count int64 ) ( [ ] [ ] T , error ) {
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
data := make ( [ ] [ ] T , 0 , count )
getDataFunc := func ( offsets [ ] int32 , getValue func ( int ) T ) {
for i := 1 ; i < len ( offsets ) ; i ++ {
start , end := offsets [ i - 1 ] , offsets [ i ]
elementData := make ( [ ] T , 0 , end - start )
for j := start ; j < end ; j ++ {
elementData = append ( elementData , getValue ( int ( j ) ) )
}
data = append ( data , elementData )
2023-11-29 12:52:27 +00:00
}
}
2023-12-12 08:14:44 +00:00
for _ , chunk := range chunked . Chunks ( ) {
listReader , ok := chunk . ( * array . List )
if ! ok {
log . Warn ( "the column data in parquet is not list" , zap . String ( "fieldName" , pcr . fieldName ) , zap . String ( "actual type" , chunk . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not list of field: %s, but: %s" , pcr . fieldName , chunk . DataType ( ) . Name ( ) ) )
}
offsets := listReader . Offsets ( )
if typeutil . IsVectorType ( pcr . dataType ) && ! checkVectorIsRegular ( offsets , pcr . dimension , pcr . dataType == schemapb . DataType_BinaryVector ) {
log . Warn ( "Parquet parser: float vector is irregular" , zap . Int ( "dim" , pcr . dimension ) , zap . Int32s ( "offsets" , listReader . Offsets ( ) ) )
return nil , merr . WrapErrImportFailed ( "float vector is irregular" )
}
valueReader := listReader . ListValues ( )
switch valueReader . DataType ( ) . ID ( ) {
case arrow . INT8 :
int8Reader := valueReader . ( * array . Int8 )
getDataFunc ( offsets , func ( i int ) T {
return T ( int8Reader . Value ( i ) )
} )
case arrow . INT16 :
int16Reader := valueReader . ( * array . Int16 )
getDataFunc ( offsets , func ( i int ) T {
return T ( int16Reader . Value ( i ) )
} )
case arrow . INT32 :
int32Reader := valueReader . ( * array . Int32 )
getDataFunc ( offsets , func ( i int ) T {
return T ( int32Reader . Value ( i ) )
} )
case arrow . INT64 :
int64Reader := valueReader . ( * array . Int64 )
getDataFunc ( offsets , func ( i int ) T {
return T ( int64Reader . Value ( i ) )
} )
case arrow . FLOAT32 :
float32Reader := valueReader . ( * array . Float32 )
getDataFunc ( offsets , func ( i int ) T {
return T ( float32Reader . Value ( i ) )
} )
case arrow . FLOAT64 :
float64Reader := valueReader . ( * array . Float64 )
getDataFunc ( offsets , func ( i int ) T {
return T ( float64Reader . Value ( i ) )
} )
default :
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data type is not integer array, neither float array, but: %s" , valueReader . DataType ( ) . Name ( ) ) )
}
}
return data , nil
}
func ReadStringArrayData ( pcr * ParquetColumnReader , count int64 ) ( [ ] [ ] string , error ) {
chunked , err := pcr . columnReader . NextBatch ( count )
if err != nil {
return nil , err
}
data := make ( [ ] [ ] string , 0 , count )
for _ , chunk := range chunked . Chunks ( ) {
listReader , ok := chunk . ( * array . List )
if ! ok {
log . Warn ( "the column data in parquet is not list" , zap . String ( "fieldName" , pcr . fieldName ) , zap . String ( "actual type" , chunk . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not list of field: %s, but: %s" , pcr . fieldName , chunk . DataType ( ) . Name ( ) ) )
}
stringReader , ok := listReader . ListValues ( ) . ( * array . String )
if ! ok {
log . Warn ( "the column data in parquet is not string array" , zap . String ( "fieldName" , pcr . fieldName ) ,
zap . String ( "actual type" , listReader . ListValues ( ) . DataType ( ) . Name ( ) ) )
return nil , merr . WrapErrImportFailed ( fmt . Sprintf ( "the column data in parquet is not string array of field: %s, but: %s list" , pcr . fieldName , listReader . ListValues ( ) . DataType ( ) . Name ( ) ) )
}
offsets := listReader . Offsets ( )
for i := 1 ; i < len ( offsets ) ; i ++ {
start , end := offsets [ i - 1 ] , offsets [ i ]
2023-12-27 07:30:46 +00:00
elementData := make ( [ ] string , 0 , end - start )
2023-12-12 08:14:44 +00:00
for j := start ; j < end ; j ++ {
elementData = append ( elementData , stringReader . Value ( int ( j ) ) )
}
data = append ( data , elementData )
}
}
return data , nil
2023-11-29 12:52:27 +00:00
}