// Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package importutil import ( "bytes" "encoding/binary" "errors" "fmt" "io" "io/ioutil" "os" "reflect" "regexp" "strconv" "unicode/utf8" "github.com/milvus-io/milvus-proto/go-api/schemapb" "github.com/milvus-io/milvus/internal/log" "github.com/sbinet/npyio" "github.com/sbinet/npyio/npy" "go.uber.org/zap" "golang.org/x/text/encoding/unicode" ) var ( reStrPre = regexp.MustCompile(`^[|]*?(\d.*)[Sa]$`) reStrPost = regexp.MustCompile(`^[|]*?[Sa](\d.*)$`) reUniPre = regexp.MustCompile(`^[<|>]*?(\d.*)U$`) reUniPost = regexp.MustCompile(`^[<|>]*?U(\d.*)$`) ) func CreateNumpyFile(path string, data interface{}) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() err = npyio.Write(f, data) if err != nil { return err } return nil } func CreateNumpyData(data interface{}) ([]byte, error) { buf := new(bytes.Buffer) err := npyio.Write(buf, data) if err != nil { return nil, err } return buf.Bytes(), nil } // NumpyAdapter is the class to expand other numpy lib ability // we evaluate two go-numpy lins: github.com/kshedden/gonpy and github.com/sbinet/npyio // the npyio lib read data one by one, the performance is poor, we expand the read methods // to read data in one batch, the performance 
is 100X faster // the gonpy lib also read data in one batch, but it has no method to read bool data, and the ability // to handle different data type is not strong as the npylib, so we choose the npyio lib to expand. type NumpyAdapter struct { reader io.Reader // data source, typically is os.File npyReader *npy.Reader // reader of npyio lib order binary.ByteOrder // LittleEndian or BigEndian readPosition int // how many elements have been read dataType schemapb.DataType // data type parsed from numpy file header } func NewNumpyAdapter(reader io.Reader) (*NumpyAdapter, error) { r, err := npyio.NewReader(reader) if err != nil { log.Error("Numpy adapter: failed to read numpy header", zap.Error(err)) return nil, err } dataType, err := convertNumpyType(r.Header.Descr.Type) if err != nil { log.Error("Numpy adapter: failed to detect data type", zap.Error(err)) return nil, err } adapter := &NumpyAdapter{ reader: reader, npyReader: r, readPosition: 0, dataType: dataType, } adapter.setByteOrder() log.Info("Numpy adapter: numpy header info", zap.Any("shape", r.Header.Descr.Shape), zap.String("dType", r.Header.Descr.Type), zap.Uint8("majorVer", r.Header.Major), zap.Uint8("minorVer", r.Header.Minor), zap.String("ByteOrder", adapter.order.String())) return adapter, nil } // convertNumpyType gets data type converted from numpy header description, for vector field, the type is int8(binary vector) or float32(float vector) func convertNumpyType(typeStr string) (schemapb.DataType, error) { switch typeStr { case "b1", "i1", "int8": return schemapb.DataType_Int8, nil case "i2", "i2", "int16": return schemapb.DataType_Int16, nil case "i4", "i4", "int32": return schemapb.DataType_Int32, nil case "i8", "i8", "int64": return schemapb.DataType_Int64, nil case "f4", "f4", "float32": return schemapb.DataType_Float, nil case "f8", "f8", "float64": return schemapb.DataType_Double, nil default: if isStringType(typeStr) { return schemapb.DataType_VarChar, nil } log.Error("Numpy adapter: the numpy 
file data type is not supported", zap.String("dtype", typeStr)) return schemapb.DataType_None, fmt.Errorf("the numpy file dtype '%s' is not supported", typeStr) } } func stringLen(dtype string) (int, bool, error) { var utf bool switch { case reStrPre.MatchString(dtype), reStrPost.MatchString(dtype): utf = false case reUniPre.MatchString(dtype), reUniPost.MatchString(dtype): utf = true } if m := reStrPre.FindStringSubmatch(dtype); m != nil { v, err := strconv.Atoi(m[1]) if err != nil { return 0, false, err } return v, utf, nil } if m := reStrPost.FindStringSubmatch(dtype); m != nil { v, err := strconv.Atoi(m[1]) if err != nil { return 0, false, err } return v, utf, nil } if m := reUniPre.FindStringSubmatch(dtype); m != nil { v, err := strconv.Atoi(m[1]) if err != nil { return 0, false, err } return v, utf, nil } if m := reUniPost.FindStringSubmatch(dtype); m != nil { v, err := strconv.Atoi(m[1]) if err != nil { return 0, false, err } return v, utf, nil } log.Error("Numpy adapter: the numpy file dtype is not varchar data type", zap.String("dtype", dtype)) return 0, false, fmt.Errorf("dtype '%s' of numpy file is not varchar data type", dtype) } func isStringType(typeStr string) bool { rt := npyio.TypeFrom(typeStr) return rt == reflect.TypeOf((*string)(nil)).Elem() } // setByteOrder sets BigEndian/LittleEndian, the logic of this method is copied from npyio lib func (n *NumpyAdapter) setByteOrder() { var nativeEndian binary.ByteOrder v := uint16(1) switch byte(v >> 8) { case 0: nativeEndian = binary.LittleEndian case 1: nativeEndian = binary.BigEndian } switch n.npyReader.Header.Descr.Type[0] { case '<': n.order = binary.LittleEndian case '>': n.order = binary.BigEndian default: n.order = nativeEndian } } func (n *NumpyAdapter) Reader() io.Reader { return n.reader } func (n *NumpyAdapter) NpyReader() *npy.Reader { return n.npyReader } func (n *NumpyAdapter) GetType() schemapb.DataType { return n.dataType } func (n *NumpyAdapter) GetShape() []int { return 
n.npyReader.Header.Descr.Shape
}

// checkCount clamps a requested element count so reads never run past the end
// of the numpy payload. Returns 0 for an empty file or when everything has
// already been read, otherwise min(count, remaining elements).
func (n *NumpyAdapter) checkCount(count int) int {
	shape := n.GetShape()

	// empty file?
	if len(shape) == 0 {
		return 0
	}

	// total number of elements is the product of all dimensions
	total := 1
	for i := 0; i < len(shape); i++ {
		total *= shape[i]
	}
	if total == 0 {
		return 0
	}

	// overflow?
	if count > (total - n.readPosition) {
		return total - n.readPosition
	}

	return count
}

// ReadBool reads up to count bool elements from the current read position.
// Returns (nil, nil) when the end of the payload has been reached, and an
// error for a non-positive count or when the file is not of bool type.
func (n *NumpyAdapter) ReadBool(count int) ([]bool, error) {
	if count <= 0 {
		log.Error("Numpy adapter: cannot read bool data with a zero or negative count")
		return nil, errors.New("cannot read bool data with a zero or negative count")
	}

	// incorrect type
	if n.dataType != schemapb.DataType_Bool {
		log.Error("Numpy adapter: numpy data is not bool type")
		return nil, errors.New("numpy data is not bool type")
	}

	// avoid read overflow
	readSize := n.checkCount(count)
	if readSize <= 0 {
		// end of file, nothing to read
		log.Info("Numpy adapter: read to end of file, type: bool")
		return nil, nil
	}

	// read data
	data := make([]bool, readSize)
	err := binary.Read(n.reader, n.order, &data)
	if err != nil {
		log.Error("Numpy adapter: failed to read bool data", zap.Int("count", count), zap.Error(err))
		return nil, fmt.Errorf("failed to read bool data with count %d, error: %w", readSize, err)
	}

	// update read position after successfully read
	n.readPosition += readSize

	return data, nil
}

// NOTE(review): the span below is corrupted in this copy of the file — everything between
// a '<' character and the following '>' appears to have been stripped, deleting the bulk
// of ReadUint8 and the other fixed-width Read* methods plus the head of the string reader.
// It is preserved verbatim (it did not compile before either); restore the real
// implementations from version control rather than guessing at them here.
func (n *NumpyAdapter) ReadUint8(count int) ([]uint8, error) {
	if count <= 0 {
		log.Error("Numpy adapter: cannot read uint8 data with a zero or nagative count")
		return nil, errors.New("cannot read uint8 data with a zero or nagative count")
	}
	// incorrect type
	// here we don't use n.dataType to check because currently milvus has no uint8 type
	switch n.npyReader.Header.Descr.Type {
	case "u1", " 0 { buf = buf[:n] } data = append(data, string(buf)) } }
	// update read position after successfully read
	n.readPosition += readSize
	return data, nil
}

// decodeUtf32 decodes a numpy UTF-32 (UCS-4) byte sequence into a UTF-8 string.
// src must have a length that is a multiple of 4; order selects the endianness
// of each 4-byte code unit. Zero code units (numpy padding) are skipped.
func decodeUtf32(src []byte, order binary.ByteOrder) (string, error) {
	if len(src)%4 != 0 {
		log.Error("Numpy adapter: invalid utf32 bytes length, the byte array length should be multiple of 4", zap.Int("byteLen", len(src)))
		return "", fmt.Errorf("invalid utf32 bytes length %d, the byte array length should be multiple of 4", len(src))
	}

	var str string
	for len(src) > 0 {
		// check the high bytes, if high bytes are 0, the UNICODE is less than U+FFFF, we can use unicode.UTF16 to decode
		isUtf16 := false
		var lowbytesPosition int
		uOrder := unicode.LittleEndian
		if order == binary.LittleEndian {
			if src[2] == 0 && src[3] == 0 {
				isUtf16 = true
			}
			lowbytesPosition = 0
		} else {
			if src[0] == 0 && src[1] == 0 {
				isUtf16 = true
			}
			lowbytesPosition = 2
			uOrder = unicode.BigEndian
		}

		if isUtf16 {
			// use unicode.UTF16 to decode the low bytes to utf8
			// utf32 and utf16 is same if the unicode code is less than 65535
			if src[lowbytesPosition] != 0 || src[lowbytesPosition+1] != 0 {
				decoder := unicode.UTF16(uOrder, unicode.IgnoreBOM).NewDecoder()
				res, err := decoder.Bytes(src[lowbytesPosition : lowbytesPosition+2])
				if err != nil {
					log.Error("Numpy adapter: failed to decode utf32 binary bytes", zap.Error(err))
					return "", fmt.Errorf("failed to decode utf32 binary bytes, error: %w", err)
				}
				str += string(res)
			}
		} else {
			// convert the 4 bytes to a unicode and encode to utf8
			// Golang strongly opposes utf32 coding, this kind of encoding has been excluded from standard lib
			x := order.Uint32(src)
			r := rune(x)
			// BUGFIX: the original compared r against utf8.RuneError, which only matches the
			// literal code point U+FFFD and lets truly invalid code points (e.g. > U+10FFFF,
			// for which EncodeRune silently writes the replacement char) slip through.
			// utf8.ValidRune is the correct validity check.
			if !utf8.ValidRune(r) {
				log.Error("Numpy adapter: failed to convert 4 bytes unicode to utf8 rune", zap.Uint32("code", x))
				return "", fmt.Errorf("failed to convert 4 bytes unicode %d to utf8 rune", x)
			}
			utf8Code := make([]byte, 4)
			// BUGFIX: append only the bytes EncodeRune actually wrote; appending the full
			// 4-byte buffer would embed trailing NUL bytes for encodings shorter than 4.
			written := utf8.EncodeRune(utf8Code, r)
			str += string(utf8Code[:written])
		}

		src = src[4:]
	}
	return str, nil
}