feat: introduce third-party milvus-storage (#39418)

related: https://github.com/milvus-io/milvus/issues/39173

Signed-off-by: shaoting-huang <shaoting.huang@zilliz.com>
pull/39586/head
sthuang 2025-01-24 17:21:13 +08:00 committed by GitHub
parent f32830e016
commit c4ae9f4ece
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
73 changed files with 7222 additions and 29 deletions

View File

@ -307,7 +307,8 @@ ${CMAKE_EXTRA_ARGS} \
-DUSE_DYNAMIC_SIMD=${USE_DYNAMIC_SIMD} \
-DCPU_ARCH=${CPU_ARCH} \
-DINDEX_ENGINE=${INDEX_ENGINE} \
-DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} "
-DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} \
-DENABLE_AZURE_FS=${ENABLE_AZURE_FS} "
if [ -z "$BUILD_WITHOUT_AZURE" ]; then
CMAKE_CMD=${CMAKE_CMD}"-DAZURE_BUILD_DIR=${AZURE_BUILD_DIR} \
-DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} "

18
go.mod
View File

@ -18,7 +18,7 @@ require (
github.com/gin-gonic/gin v1.9.1
github.com/go-playground/validator/v10 v10.14.0
github.com/gofrs/flock v0.8.1
github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/protobuf v1.5.4
github.com/google/btree v1.1.2
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/klauspost/compress v1.17.9
@ -101,9 +101,9 @@ require (
github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible // indirect
github.com/alibabacloud-go/debug v0.0.0-20190504072949-9472017b5c68 // indirect
github.com/alibabacloud-go/tea v1.1.8 // indirect
github.com/andybalholm/brotli v1.0.4 // indirect
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/apache/pulsar-client-go v0.6.1-0.20210728062540-29414db801a7 // indirect
github.com/apache/thrift v0.18.1 // indirect
github.com/apache/thrift v0.19.0 // indirect
github.com/ardielle/ardielle-go v1.5.2 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.7 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.21 // indirect
@ -158,7 +158,7 @@ require (
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/mock v1.6.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/flatbuffers v2.0.8+incompatible // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/google/s2a-go v0.1.7 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.5 // indirect
@ -205,7 +205,7 @@ require (
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect
github.com/pierrec/lz4 v2.5.2+incompatible // indirect
github.com/pierrec/lz4/v4 v4.1.18 // indirect
github.com/pierrec/lz4/v4 v4.1.21 // indirect
github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect
github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 // indirect
github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 // indirect
@ -260,13 +260,13 @@ require (
go.opentelemetry.io/proto/otlp v1.0.0 // indirect
go.uber.org/automaxprocs v1.5.3 // indirect
golang.org/x/arch v0.3.0 // indirect
golang.org/x/mod v0.17.0 // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/sys v0.28.0 // indirect
golang.org/x/term v0.27.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
gonum.org/v1/gonum v0.11.0 // indirect
golang.org/x/tools v0.22.0 // indirect
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect
gonum.org/v1/gonum v0.14.0 // indirect
google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect

32
go.sum
View File

@ -104,14 +104,14 @@ github.com/alibabacloud-go/tea v1.1.8 h1:vFF0707fqjGiQTxrtMnIXRjOCvQXf49CuDVRtTo
github.com/alibabacloud-go/tea v1.1.8/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4=
github.com/aliyun/credentials-go v1.2.7 h1:gLtFylxLZ1TWi1pStIt1O6a53GFU1zkNwjtJir2B4ow=
github.com/aliyun/credentials-go v1.2.7/go.mod h1:/KowD1cfGSLrLsH28Jr8W+xwoId0ywIy5lNzDz6O1vw=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
github.com/antihax/optional v0.0.0-20180407024304-ca021399b1a6/go.mod h1:V8iCPQYkqmusNa815XgQio277wI47sdRh1dUOLdyC6Q=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ=
github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw=
github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg=
github.com/apache/thrift v0.18.1/go.mod h1:rdQn/dCcDKEWjjylUeueum4vQEjG2v8v2PqriUnbr+I=
github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk=
github.com/apache/thrift v0.19.0/go.mod h1:SUALL216IiaOw2Oy+5Vs9lboJ/t9g40C+G07Dc0QC1I=
github.com/ardielle/ardielle-go v1.5.2 h1:TilHTpHIQJ27R1Tl/iITBzMwiUGSlVfiVhwDNGM3Zj4=
github.com/ardielle/ardielle-go v1.5.2/go.mod h1:I4hy1n795cUhaVt/ojz83SNVCYIGsAFAONtv2Dr7HUI=
github.com/ardielle/ardielle-tools v1.5.4/go.mod h1:oZN+JRMnqGiIhrzkRN9l26Cej9dEx4jeNG6A+AdkShk=
@ -427,8 +427,8 @@ github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ
github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA=
github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU=
github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/flatbuffers v2.0.8+incompatible h1:ivUb1cGomAB101ZM1T0nOiWz9pSrTMoa9+EiY7igmkM=
github.com/google/flatbuffers v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI=
github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.2.1-0.20190312032427-6f77996f0c42/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
@ -742,8 +742,8 @@ github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2
github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI=
github.com/pierrec/lz4 v2.5.2+incompatible h1:WCjObylUIOlKy/+7Abdn34TLIkXiA4UWUMhxq9m9ZXI=
github.com/pierrec/lz4 v2.5.2+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ=
github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pingcap/errors v0.11.0/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8=
github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c h1:xpW9bvK+HuuTmyFqUwr+jcCvpVkK7sumiz+ko5H9eq4=
@ -1122,8 +1122,8 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@ -1367,19 +1367,19 @@ golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA=
golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU=
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90=
gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo=
gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0=
gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0=
gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E=
gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA=
gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw=
gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc=
gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY=

View File

@ -13,7 +13,7 @@ class MilvusConan(ConanFile):
"lz4/1.9.4#c5afb86edd69ac0df30e3a9e192e43db",
"snappy/1.1.9#0519333fef284acd04806243de7d3070",
"lzo/2.10#9517fc1bcc4d4cc229a79806003a1baa",
"arrow/15.0.0#0456d916ff25d509e0724c5b219b4c45",
"arrow/17.0.0#8cea917a6e06ca17c28411966d6fcdd7",
"openssl/3.1.2#02594c4c0a6e2b4feb3cd15119993597",
"aws-sdk-cpp/1.9.234#28d6d2c175975900ce292bafe8022c88",
"googleapis/cci.20221108#65604e1b3b9a6b363044da625b201a2a",
@ -72,6 +72,7 @@ class MilvusConan(ConanFile):
"aws-sdk-cpp:transfer": False,
"gtest:build_gmock": False,
"boost:without_locale": False,
"boost:without_test": True,
"glog:with_gflags": True,
"glog:shared": True,
"prometheus-cpp:with_pull": False,

View File

@ -32,6 +32,7 @@ include_directories(
${SIMDJSON_INCLUDE_DIR}
${TANTIVY_INCLUDE_DIR}
${CONAN_INCLUDE_DIRS}
${MILVUS_STORAGE_INCLUDE_DIR}
)
add_subdirectory( pb )
@ -73,6 +74,7 @@ set(LINK_TARGETS
simdjson
tantivy_binding
knowhere
milvus-storage
${OpenMP_CXX_FLAGS}
${CONAN_LIBS})

View File

@ -0,0 +1,97 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "segcore/packed_reader_c.h"
#include "milvus-storage/packed/reader.h"
#include "milvus-storage/common/log.h"
#include "milvus-storage/filesystem/fs.h"
#include "milvus-storage/common/config.h"
#include <arrow/c/bridge.h>
#include <arrow/filesystem/filesystem.h>
#include <arrow/status.h>
#include <memory>
// Creates a milvus_storage::PackedRecordBatchReader for `path` that reads
// every field of `schema`, and stores the new reader in `*c_packed_reader`.
//
// path            NUL-terminated path of the packed files to read.
// schema          Arrow C schema describing the data; it is handed to
//                 arrow::ImportSchema.
// buffer_size     Buffer size forwarded to the reader.
// c_packed_reader Receives the reader handle; release it with CloseReader.
//
// Returns 0 on success and -1 on any failure (null argument, schema import
// failure, or an exception). `*c_packed_reader` is only written on success.
int
NewPackedReader(const char* path,
                struct ArrowSchema* schema,
                const int64_t buffer_size,
                CPackedReader* c_packed_reader) {
    // std::string(nullptr) and the pointer dereferences below are undefined
    // behaviour, so reject null arguments up front.
    if (path == nullptr || schema == nullptr || c_packed_reader == nullptr) {
        return -1;
    }
    try {
        auto truePath = std::string(path);
        auto factory = std::make_shared<milvus_storage::FileSystemFactory>();
        auto conf = milvus_storage::StorageConfig();
        // NOTE(review): the storage URI is hard-coded to the local /tmp
        // filesystem — confirm this is intended outside of tests.
        conf.uri = "file:///tmp/";
        // NOTE(review): .value() is called without checking the result —
        // confirm how BuildFileSystem reports failure.
        auto trueFs = factory->BuildFileSystem(conf, &truePath).value();

        // ValueOrDie() aborts the process on failure, which the surrounding
        // try/catch cannot intercept, so check the result explicitly.
        auto schema_result = arrow::ImportSchema(schema);
        if (!schema_result.ok()) {
            return -1;
        }
        auto trueSchema = schema_result.ValueOrDie();

        // Request every column of the schema.
        std::set<int> needed_columns;
        for (int i = 0; i < trueSchema->num_fields(); i++) {
            needed_columns.emplace(i);
        }

        // NOTE(review): the reader receives the original `path`, while
        // NewPackedWriter passes the `truePath` that BuildFileSystem was
        // given a pointer to — confirm which one is intended.
        auto reader = std::make_unique<milvus_storage::PackedRecordBatchReader>(
            *trueFs, path, trueSchema, needed_columns, buffer_size);
        *c_packed_reader = reader.release();
        return 0;
    } catch (...) {
        // No exception may cross the C ABI boundary.
        return -1;
    }
}
// Reads the next record batch from the packed reader and exports it through
// the Arrow C data interface.
//
// c_packed_reader Handle created by NewPackedReader.
// out_array       Receives a heap-allocated ArrowArray holding the batch.
// out_schema      Receives a heap-allocated ArrowSchema describing the batch.
//
// Returns 0 on success and -1 on failure. When the reader is exhausted the
// function returns 0 and sets both outputs to nullptr, so callers can detect
// end of stream without relying on pre-initialised output variables.
// On failure the outputs are left untouched.
int
ReadNext(CPackedReader c_packed_reader,
         CArrowArray* out_array,
         CArrowSchema* out_schema) {
    if (c_packed_reader == nullptr || out_array == nullptr ||
        out_schema == nullptr) {
        return -1;
    }
    try {
        auto packed_reader =
            static_cast<milvus_storage::PackedRecordBatchReader*>(
                c_packed_reader);
        std::shared_ptr<arrow::RecordBatch> record_batch;
        auto read_status = packed_reader->ReadNext(&record_batch);
        if (!read_status.ok()) {
            return -1;
        }
        if (record_batch == nullptr) {
            // end of file
            *out_array = nullptr;
            *out_schema = nullptr;
            return 0;
        }

        // The structs stay owned by the unique_ptrs until the export has
        // succeeded, so a failed export does not leak them.
        auto arr = std::make_unique<ArrowArray>();
        auto schema = std::make_unique<ArrowSchema>();
        auto export_status =
            arrow::ExportRecordBatch(*record_batch, arr.get(), schema.get());
        if (!export_status.ok()) {
            return -1;
        }
        *out_array = arr.release();
        *out_schema = schema.release();
        return 0;
    } catch (...) {
        // No exception may cross the C ABI boundary.
        return -1;
    }
}
// Destroys the packed reader behind `c_packed_reader`.
// Returns 0 once the reader has been deleted, or -1 if a std::exception is
// thrown while deleting it.
int
CloseReader(CPackedReader c_packed_reader) {
    try {
        delete static_cast<milvus_storage::PackedRecordBatchReader*>(
            c_packed_reader);
    } catch (std::exception& e) {
        return -1;
    }
    return 0;
}

View File

@ -0,0 +1,64 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include <arrow/c/abi.h>
// Opaque handle to a packed reader created by NewPackedReader.
typedef void* CPackedReader;
// Pointer to the heap-allocated ArrowArray that ReadNext hands out.
typedef void* CArrowArray;
// Pointer to the heap-allocated ArrowSchema that ReadNext hands out.
typedef void* CArrowSchema;
/**
 * @brief Open a packed reader to read needed columns in the specified path.
 *
 * @param path The root path of the packed files to read.
 * @param schema The original schema of data.
 * @param buffer_size The max buffer size of the packed reader.
 * @param c_packed_reader The output pointer of the packed reader.
 * @return 0 on success, -1 on failure.
 */
int
NewPackedReader(const char* path,
struct ArrowSchema* schema,
const int64_t buffer_size,
CPackedReader* c_packed_reader);
/**
 * @brief Read the next record batch from the packed reader.
 * By default, the maximum return batch is 1024 rows.
 *
 * @param c_packed_reader The packed reader to read.
 * @param out_array The output pointer of the arrow array.
 * @param out_schema The output pointer of the arrow schema.
 * @return 0 on success or when the reader has no more data, -1 on failure.
 */
int
ReadNext(CPackedReader c_packed_reader,
CArrowArray* out_array,
CArrowSchema* out_schema);
/**
 * @brief Close the packed reader and release the resources.
 *
 * @param c_packed_reader The packed reader to close.
 * @return 0 on success, -1 on failure.
 */
int
CloseReader(CPackedReader c_packed_reader);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,81 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "segcore/packed_writer_c.h"
#include "milvus-storage/packed/writer.h"
#include "milvus-storage/common/log.h"
#include "milvus-storage/common/config.h"
#include "milvus-storage/filesystem/fs.h"
#include <arrow/c/bridge.h>
#include <arrow/filesystem/filesystem.h>
// Creates a milvus_storage::PackedRecordBatchWriter for `path` and stores
// it in `*c_packed_writer`.
//
// path            NUL-terminated path the writer is created for.
// schema          Arrow C schema of the batches that will be written; it is
//                 handed to arrow::ImportSchema.
// buffer_size     Buffer size forwarded to the writer.
// c_packed_writer Receives the writer handle; release it with CloseWriter.
//
// Returns 0 on success and -1 on any failure (null argument, schema import
// failure, or an exception). `*c_packed_writer` is only written on success.
int
NewPackedWriter(const char* path,
                struct ArrowSchema* schema,
                const int64_t buffer_size,
                CPackedWriter* c_packed_writer) {
    // std::string(nullptr) and the pointer dereferences below are undefined
    // behaviour, so reject null arguments up front.
    if (path == nullptr || schema == nullptr || c_packed_writer == nullptr) {
        return -1;
    }
    try {
        auto truePath = std::string(path);
        auto factory = std::make_shared<milvus_storage::FileSystemFactory>();
        auto conf = milvus_storage::StorageConfig();
        // NOTE(review): the storage URI is hard-coded to the local /tmp
        // filesystem — confirm this is intended outside of tests.
        conf.uri = "file:///tmp/";
        // NOTE(review): .value() is called without checking the result —
        // confirm how BuildFileSystem reports failure.
        auto trueFs = factory->BuildFileSystem(conf, &truePath).value();

        // ValueOrDie() aborts the process on failure, which the surrounding
        // try/catch cannot intercept, so check the result explicitly.
        auto schema_result = arrow::ImportSchema(schema);
        if (!schema_result.ok()) {
            return -1;
        }
        auto trueSchema = schema_result.ValueOrDie();

        auto writer = std::make_unique<milvus_storage::PackedRecordBatchWriter>(
            buffer_size, trueSchema, trueFs, truePath, conf);
        *c_packed_writer = writer.release();
        return 0;
    } catch (...) {
        // No exception may cross the C ABI boundary.
        return -1;
    }
}
// Imports `array`/`schema` as an Arrow record batch and writes it with the
// packed writer.
//
// c_packed_writer Handle created by NewPackedWriter.
// array           Arrow C array holding the batch data.
// schema          Arrow C schema describing `array`.
//
// Returns 0 on success and -1 on failure (null argument, import failure,
// write failure, or an exception).
int
WriteRecordBatch(CPackedWriter c_packed_writer,
                 struct ArrowArray* array,
                 struct ArrowSchema* schema) {
    if (c_packed_writer == nullptr || array == nullptr || schema == nullptr) {
        return -1;
    }
    try {
        auto packed_writer =
            static_cast<milvus_storage::PackedRecordBatchWriter*>(
                c_packed_writer);

        // ValueOrDie() aborts the process on failure, which the surrounding
        // try/catch cannot intercept, so check the result explicitly.
        auto batch_result = arrow::ImportRecordBatch(array, schema);
        if (!batch_result.ok()) {
            return -1;
        }
        auto record_batch = batch_result.ValueOrDie();

        auto status = packed_writer->Write(record_batch);
        if (!status.ok()) {
            return -1;
        }
        return 0;
    } catch (...) {
        // No exception may cross the C ABI boundary.
        return -1;
    }
}
// Closes the packed writer and destroys it.
//
// c_packed_writer Handle created by NewPackedWriter. A null handle is a
//                 no-op, matching CloseReader (where deleting a null
//                 pointer does nothing).
//
// Returns 0 when Close() succeeded (or the handle was null) and -1 when
// Close() reported an error or threw. The writer is destroyed in every case,
// so the handle must not be used again.
int
CloseWriter(CPackedWriter c_packed_writer) {
    if (c_packed_writer == nullptr) {
        return 0;
    }
    try {
        // Take ownership first so the writer is destroyed even if Close()
        // throws; the original leaked it on that path.
        std::unique_ptr<milvus_storage::PackedRecordBatchWriter> packed_writer(
            static_cast<milvus_storage::PackedRecordBatchWriter*>(
                c_packed_writer));
        auto status = packed_writer->Close();
        return status.ok() ? 0 : -1;
    } catch (...) {
        // No exception may cross the C ABI boundary.
        return -1;
    }
}

View File

@ -0,0 +1,41 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#include <arrow/c/abi.h>
// Opaque handle to a packed writer created by NewPackedWriter.
typedef void* CPackedWriter;
/**
 * @brief Create a packed writer for the specified path.
 *
 * @param path The path the writer is created for.
 * @param schema The Arrow schema of the record batches to write.
 * @param buffer_size The buffer size passed to the packed writer.
 * @param c_packed_writer The output pointer of the packed writer.
 * @return 0 on success, -1 on failure.
 */
int
NewPackedWriter(const char* path,
struct ArrowSchema* schema,
const int64_t buffer_size,
CPackedWriter* c_packed_writer);
/**
 * @brief Import an Arrow C array/schema pair as a record batch and write it.
 *
 * @param c_packed_writer The packed writer to write with.
 * @param array The Arrow C array holding the batch data.
 * @param schema The Arrow C schema describing the array.
 * @return 0 on success, -1 on failure.
 */
int
WriteRecordBatch(CPackedWriter c_packed_writer,
struct ArrowArray* array,
struct ArrowSchema* schema);
/**
 * @brief Close the packed writer and destroy it.
 *
 * @param c_packed_writer The packed writer to close.
 * @return 0 on success, -1 on failure.
 */
int
CloseWriter(CPackedWriter c_packed_writer);
#ifdef __cplusplus
}
#endif

View File

@ -43,7 +43,7 @@ PayloadReader::init(std::shared_ptr<arrow::io::BufferReader> input,
// Configure general Parquet reader settings
auto reader_properties = parquet::ReaderProperties(pool);
reader_properties.set_buffer_size(4096 * 4);
reader_properties.enable_buffered_stream();
// reader_properties.enable_buffered_stream();
// Configure Arrow-specific Parquet reader settings
auto arrow_reader_props = parquet::ArrowReaderProperties();

View File

@ -45,3 +45,4 @@ if (LINUX)
add_subdirectory(jemalloc)
endif()
add_subdirectory(milvus-storage)

View File

@ -0,0 +1,51 @@
#-------------------------------------------------------------------------------
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under the License.
#-------------------------------------------------------------------------------
# Update milvus-storage_VERSION for the first occurrence
milvus_add_pkg_config("milvus-storage")
set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES "")
# Revision of milvus-storage to fetch; passed to FetchContent as GIT_TAG below.
set( milvus-storage_VERSION 7475494 )
set( GIT_REPOSITORY "https://github.com/milvus-io/milvus-storage.git")
message(STATUS "milvus-storage repo: ${GIT_REPOSITORY}")
message(STATUS "milvus-storage version: ${milvus-storage_VERSION}")
# The original message expanded milvus-storage_SOURCE_VER, which is never set
# in this file, so the version part of the message was empty.
message(STATUS "Building milvus-storage-${milvus-storage_VERSION} from source")
message(STATUS ${CMAKE_BUILD_TYPE})
# Forward the ENABLE_AZURE_FS switch to WITH_AZURE_FS as a forced cache entry,
# so it is set before the milvus-storage subdirectory is added below.
if ( ENABLE_AZURE_FS STREQUAL "ON" )
    set(WITH_AZURE_FS ON CACHE BOOL "" FORCE )
else ()
    set(WITH_AZURE_FS OFF CACHE BOOL "" FORCE )
endif ()
set( CMAKE_PREFIX_PATH ${CONAN_BOOST_ROOT} )
FetchContent_Declare(
        milvus-storage
        GIT_REPOSITORY  ${GIT_REPOSITORY}
        GIT_TAG         ${milvus-storage_VERSION}
        SOURCE_DIR      ${CMAKE_CURRENT_BINARY_DIR}/milvus-storage-src
        BINARY_DIR      ${CMAKE_CURRENT_BINARY_DIR}/milvus-storage-build
        SOURCE_SUBDIR   cpp
        DOWNLOAD_DIR    ${THIRDPARTY_DOWNLOAD_PATH} )
FetchContent_GetProperties( milvus-storage )
if ( NOT milvus-storage_POPULATED )
    FetchContent_Populate( milvus-storage )
    # Adding the following target:
    # milvus-storage
    add_subdirectory( ${milvus-storage_SOURCE_DIR}/cpp
                      ${milvus-storage_BINARY_DIR} )
endif()
# Cached so that other directories can add the milvus-storage headers to
# their include paths.
set( MILVUS_STORAGE_INCLUDE_DIR ${milvus-storage_SOURCE_DIR}/cpp/include CACHE INTERNAL "Path to milvus-storage include directory" )

View File

@ -0,0 +1,9 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: Milvus Storage
Description: Storage modules for Milvus
Version: @MILVUS_VERSION@
Libs: -L${libdir} -lmilvus-storage
Cflags: -I${includedir}

View File

@ -16,6 +16,7 @@ include_directories(
${SIMDJSON_INCLUDE_DIR}
${TANTIVY_INCLUDE_DIR}
${CONAN_INCLUDE_DIRS}
${MILVUS_STORAGE_INCLUDE_DIR}
)
add_definitions(-DMILVUS_TEST_SEGCORE_YAML_PATH="${CMAKE_SOURCE_DIR}/unittest/test_utils/test_segcore.yaml")
@ -157,6 +158,7 @@ if (LINUX)
gtest
milvus_core
knowhere
milvus-storage
)
install(TARGETS index_builder_test DESTINATION unittest)
endif()
@ -169,6 +171,7 @@ target_link_libraries(all_tests
gtest
milvus_core
knowhere
milvus-storage
)
install(TARGETS all_tests DESTINATION unittest)

View File

@ -0,0 +1,7 @@
reviewers:
- tedxu
- shaoting-huang
- sunby
approvers:
- maintainers

View File

@ -0,0 +1,80 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package arrowutil
import (
"context"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/parquet/file"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// MakeArrowFileReader opens filePath on the given filesystem, wraps it in a
// parquet reader and returns an Arrow file reader over it. The Arrow reader
// uses constant.ReadBatchSize as its batch size and the default allocator.
// Any error from opening the file or creating either reader is returned as is.
func MakeArrowFileReader(fs fs.Fs, filePath string) (*pqarrow.FileReader, error) {
	handle, err := fs.OpenFile(filePath)
	if err != nil {
		return nil, err
	}

	pqReader, err := file.NewParquetReader(handle)
	if err != nil {
		return nil, err
	}

	readProps := pqarrow.ArrowReadProperties{BatchSize: constant.ReadBatchSize}
	return pqarrow.NewFileReader(pqReader, readProps, memory.DefaultAllocator)
}
// MakeArrowRecordReader builds a record reader over reader that is limited to
// the columns named in opts.Columns plus the columns referenced by
// opts.Filters, and to the row groups that no filter can rule out.
//
// A row group is skipped when, for at least one filter, the column chunk has
// min/max statistics and filter.CheckStatistics reports true for them. Row
// groups without usable statistics are always kept.
//
// Errors from reading column chunk metadata or statistics, and from
// GetRecordReader itself, are returned unchanged.
func MakeArrowRecordReader(reader *pqarrow.FileReader, opts *options.ReadOptions) (array.RecordReader, error) {
	metadata := reader.ParquetReader().MetaData()

	var columnIndices []int
	for _, c := range opts.Columns {
		columnIndices = append(columnIndices, metadata.Schema.ColumnIndexByName(c))
	}
	for _, f := range opts.Filters {
		columnIndices = append(columnIndices, metadata.Schema.ColumnIndexByName(f.GetColumnName()))
	}

	// Deliberately non-nil: GetRecordReader treats a nil row-group slice as
	// "read every row group", so with a nil slice a file whose row groups were
	// all pruned by the filters would be read in full instead of not at all.
	rowGroupsIndices := make([]int, 0, len(metadata.RowGroups))
	for i := 0; i < len(metadata.RowGroups); i++ {
		rg := metadata.RowGroup(i)
		var canIgnored bool
		for _, filter := range opts.Filters {
			columnIndex := rg.Schema.ColumnIndexByName(filter.GetColumnName())
			if columnIndex < 0 {
				// The filter column is not in this row group, so there are no
				// statistics to test and a negative index must not reach
				// ColumnChunk. The same negative index is already in
				// columnIndices; presumably GetRecordReader rejects it.
				continue
			}
			columnChunk, err := rg.ColumnChunk(columnIndex)
			if err != nil {
				return nil, err
			}
			columnStats, err := columnChunk.Statistics()
			if err != nil {
				return nil, err
			}
			if columnStats == nil || !columnStats.HasMinMax() {
				continue
			}
			if filter.CheckStatistics(columnStats) {
				canIgnored = true
				break
			}
		}
		if !canIgnored {
			rowGroupsIndices = append(rowGroupsIndices, i)
		}
	}

	return reader.GetRecordReader(context.TODO(), columnIndices, rowGroupsIndices)
}

View File

@ -0,0 +1,31 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package constant
// Constants shared by the storagev2 packages.
const (
// ReadBatchSize is the batch size used when creating Arrow parquet readers.
ReadBatchSize = 1024
// File-name suffixes and directory names.
// NOTE(review): presumably these describe the on-disk layout of manifests,
// blobs and data files — confirm against the code that uses them.
ManifestTempFileSuffix = ".manifest.tmp"
ManifestFileSuffix = ".manifest"
ManifestDir = "versions"
BlobDir = "blobs"
ParquetDataFileSuffix = ".parquet"
OffsetFieldName = "__offset"
VectorDataDir = "vector"
ScalarDataDir = "scalar"
DeleteDataDir = "delete"
// NOTE(review): -1 looks like a sentinel meaning "latest version" — confirm.
LatestManifestVersion = -1
EndpointOverride = "endpoint_override"
)

View File

@ -0,0 +1,27 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package errors
import "github.com/cockroachdb/errors"
// Sentinel errors of the storagev2 packages, created with
// cockroachdb/errors.New.
var (
ErrSchemaIsNil = errors.New("schema is nil")
ErrBlobAlreadyExist = errors.New("blob already exist")
ErrBlobNotExist = errors.New("blob not exist")
ErrSchemaNotMatch = errors.New("schema not match")
ErrColumnNotExist = errors.New("column not exist")
ErrInvalidPath = errors.New("invalid path")
ErrNoEndpoint = errors.New("no endpoint is specified")
)

View File

@ -0,0 +1,70 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import "go.uber.org/zap"
// Re-exports of zap's field constructors under this package's namespace.
// Each variable is bound to the zap function of the same name.
var (
// not lint
Skip = zap.Skip
Binary = zap.Binary
Bool = zap.Bool
Boolp = zap.Boolp
ByteString = zap.ByteString
Complex128 = zap.Complex128
Complex128p = zap.Complex128p
Complex64 = zap.Complex64
Complex64p = zap.Complex64p
Float64 = zap.Float64
Float64p = zap.Float64p
Float32 = zap.Float32
Float32p = zap.Float32p
Int = zap.Int
Intp = zap.Intp
Int64 = zap.Int64
Int64p = zap.Int64p
Int32 = zap.Int32
Int32p = zap.Int32p
Int16 = zap.Int16
Int16p = zap.Int16p
Int8 = zap.Int8
Int8p = zap.Int8p
String = zap.String
Stringp = zap.Stringp
Uint = zap.Uint
Uintp = zap.Uintp
Uint64 = zap.Uint64
Uint64p = zap.Uint64p
Uint32 = zap.Uint32
Uint32p = zap.Uint32p
Uint16 = zap.Uint16
Uint16p = zap.Uint16p
Uint8 = zap.Uint8
Uint8p = zap.Uint8p
Uintptr = zap.Uintptr
Uintptrp = zap.Uintptrp
Reflect = zap.Reflect
Namespace = zap.Namespace
Stringer = zap.Stringer
Time = zap.Time
Timep = zap.Timep
Stack = zap.Stack
StackSkip = zap.StackSkip
Duration = zap.Duration
Durationp = zap.Durationp
Object = zap.Object
Inline = zap.Inline
Any = zap.Any
)

View File

@ -0,0 +1,106 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import (
"io"
"os"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
// Level is an alias for zapcore.Level.
type Level = zapcore.Level
// Log levels, re-exported from zapcore.
const (
DebugLevel = zapcore.DebugLevel
InfoLevel = zapcore.InfoLevel
WarnLevel = zapcore.WarnLevel
ErrorLevel = zapcore.ErrorLevel
PanicLevel = zapcore.PanicLevel
FatalLevel = zapcore.FatalLevel
)
// Logger wraps a zap.Logger together with the atomic level that controls it.
type Logger struct {
// l is the underlying zap logger that all methods delegate to.
l *zap.Logger
// al is the level handle changed by SetLevel; SetLevel is a no-op when nil.
al *zap.AtomicLevel
}
// New builds a Logger that writes console-encoded entries (zap's development
// encoder config) to out at the given minimum level. A nil out falls back to
// os.Stderr. Caller annotation is enabled with a skip of two frames.
func New(out io.Writer, level Level) *Logger {
	sink := out
	if sink == nil {
		sink = os.Stderr
	}

	atomicLevel := zap.NewAtomicLevelAt(level)
	encoder := zapcore.NewConsoleEncoder(zap.NewDevelopmentEncoderConfig())
	core := zapcore.NewCore(encoder, zapcore.AddSync(sink), atomicLevel)

	zapLogger := zap.New(core, zap.AddCaller(), zap.AddCallerSkip(2))
	return &Logger{l: zapLogger, al: &atomicLevel}
}
// SetLevel changes the minimum level of the logger. It does nothing when the
// logger has no atomic level attached.
func (l *Logger) SetLevel(level Level) {
	if l.al == nil {
		return
	}
	l.al.SetLevel(level)
}
// Field is an alias for zap.Field.
type Field = zap.Field
// Debug logs msg with fields through the underlying zap logger's Debug.
func (l *Logger) Debug(msg string, fields ...Field) {
l.l.Debug(msg, fields...)
}
// Info logs msg with fields through the underlying zap logger's Info.
func (l *Logger) Info(msg string, fields ...Field) {
l.l.Info(msg, fields...)
}
// Warn logs msg with fields through the underlying zap logger's Warn.
func (l *Logger) Warn(msg string, fields ...Field) {
l.l.Warn(msg, fields...)
}
// Error logs msg with fields through the underlying zap logger's Error.
func (l *Logger) Error(msg string, fields ...Field) {
l.l.Error(msg, fields...)
}
// Panic logs msg with fields through the underlying zap logger's Panic.
func (l *Logger) Panic(msg string, fields ...Field) {
l.l.Panic(msg, fields...)
}
// Fatal logs msg with fields through the underlying zap logger's Fatal.
func (l *Logger) Fatal(msg string, fields ...Field) {
l.l.Fatal(msg, fields...)
}
// Sync calls Sync on the underlying zap logger and returns its error.
func (l *Logger) Sync() error {
return l.l.Sync()
}
// std is the default logger used by the package-level helpers. It starts out
// writing to os.Stderr at DebugLevel.
var std = New(os.Stderr, DebugLevel)

// Default returns the current default logger.
func Default() *Logger { return std }

// ReplaceDefault installs l as the default logger. A nil l is ignored:
// storing it would make every later package-level call dereference a nil
// *Logger.
func ReplaceDefault(l *Logger) {
	if l == nil {
		return
	}
	std = l
}
// The functions below forward to the method of the same name on the default
// logger (std).

// SetLevel changes the level of the default logger.
func SetLevel(level Level) { std.SetLevel(level) }
// Debug logs through the default logger's Debug.
func Debug(msg string, fields ...Field) { std.Debug(msg, fields...) }
// Info logs through the default logger's Info.
func Info(msg string, fields ...Field) { std.Info(msg, fields...) }
// Warn logs through the default logger's Warn.
func Warn(msg string, fields ...Field) { std.Warn(msg, fields...) }
// Error logs through the default logger's Error.
func Error(msg string, fields ...Field) { std.Error(msg, fields...) }
// Panic logs through the default logger's Panic.
func Panic(msg string, fields ...Field) { std.Panic(msg, fields...) }
// Fatal logs through the default logger's Fatal.
func Fatal(msg string, fields ...Field) { std.Fatal(msg, fields...) }
// Sync calls Sync on the default logger and returns its error.
func Sync() error { return std.Sync() }

View File

@ -0,0 +1,33 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import (
"testing"
)
// TestLogger calls every package-level helper once. Panic comes last and its
// panic is recovered so the test can finish and the deferred Sync still runs.
func TestLogger(t *testing.T) {
	defer Sync()

	Info("Testing")
	Debug("Testing")
	Warn("Testing")
	Error("Testing")

	defer func() {
		recovered := recover()
		if recovered != nil {
			Debug("logPanic recover")
		}
	}()

	Panic("Testing")
}

View File

@ -0,0 +1,34 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import "go.uber.org/zap"
// Option aliases zap.Option so logger options can be named through this package.
type Option = zap.Option
// Re-exports of zap's option constructors; each variable is the zap function
// of the same name.
var (
WrapCore = zap.WrapCore
Hooks = zap.Hooks
Fields = zap.Fields
ErrorOutput = zap.ErrorOutput
Development = zap.Development
AddCaller = zap.AddCaller
WithCaller = zap.WithCaller
AddCallerSkip = zap.AddCallerSkip
AddStacktrace = zap.AddStacktrace
IncreaseLevel = zap.IncreaseLevel
WithFatalHook = zap.WithFatalHook
WithClock = zap.WithClock
)

View File

@ -0,0 +1,404 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package utils
import (
"fmt"
"path/filepath"
"strconv"
"strings"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/endian"
"github.com/cockroachdb/errors"
"github.com/google/uuid"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)
// ErrInvalidArgument is the sentinel wrapped by the conversion helpers in this
// file when they are handed a type they cannot convert.
var ErrInvalidArgument = errors.New("invalid argument")
// ToProtobufType maps an arrow type ID onto the protobuf LogicType with the
// same numeric value. IDs outside [0, LogicType_MAX_ID) yield LogicType_NA
// together with an error wrapping ErrInvalidArgument.
func ToProtobufType(dataType arrow.Type) (storagev2pb.LogicType, error) {
	id := int(dataType)
	upperBound := int(storagev2pb.LogicType_MAX_ID)
	if id >= 0 && id < upperBound {
		return storagev2pb.LogicType(id), nil
	}
	return storagev2pb.LogicType_NA, fmt.Errorf("parse data type %v: %w", dataType, ErrInvalidArgument)
}
// ToProtobufMetadata copies the keys and values of an arrow metadata object
// into a protobuf KeyValueMetadata message. The returned error is always nil.
func ToProtobufMetadata(metadata *arrow.Metadata) (*storagev2pb.KeyValueMetadata, error) {
	converted := &storagev2pb.KeyValueMetadata{
		Keys:   metadata.Keys(),
		Values: metadata.Values(),
	}
	return converted, nil
}
// ToProtobufDataType converts an arrow data type into its protobuf form:
// type-specific values first (SetTypeValues), then the logic type, then one
// child per nested field reported by GetFields.
func ToProtobufDataType(dataType arrow.DataType) (*storagev2pb.DataType, error) {
	protoType := &storagev2pb.DataType{}
	if err := SetTypeValues(protoType, dataType); err != nil {
		return nil, err
	}

	logicType, err := ToProtobufType(dataType.ID())
	if err != nil {
		return nil, err
	}
	protoType.LogicType = logicType

	children := GetFields(dataType)
	for i := range children {
		child, err := ToProtobufField(&children[i])
		if err != nil {
			return nil, err
		}
		protoType.Children = append(protoType.Children, child)
	}
	return protoType, nil
}
// GetFields returns the child fields of a nested arrow type (list, struct,
// map, fixed-size list) and nil for every other type.
//
// The concrete type is verified before its Fields method is called, so a
// DataType whose ID does not match its Go type (or a typed nil pointer)
// yields nil instead of a nil-pointer panic.
//
// TODO: check more nested types.
func GetFields(dataType arrow.DataType) []arrow.Field {
	switch dataType.ID() {
	case arrow.LIST:
		if listType, ok := dataType.(*arrow.ListType); ok && listType != nil {
			return listType.Fields()
		}
	case arrow.STRUCT:
		if structType, ok := dataType.(*arrow.StructType); ok && structType != nil {
			return structType.Fields()
		}
	case arrow.MAP:
		if mapType, ok := dataType.(*arrow.MapType); ok && mapType != nil {
			return mapType.Fields()
		}
	case arrow.FIXED_SIZE_LIST:
		if listType, ok := dataType.(*arrow.FixedSizeListType); ok && listType != nil {
			return listType.Fields()
		}
	}
	return nil
}
// ToProtobufField converts an arrow field (name, nullability, optional
// metadata and data type) into its protobuf representation. Metadata is only
// attached when the field carries at least one entry.
func ToProtobufField(field *arrow.Field) (*storagev2pb.Field, error) {
	protoField := &storagev2pb.Field{
		Name:     field.Name,
		Nullable: field.Nullable,
	}

	if field.Metadata.Len() != 0 {
		md, err := ToProtobufMetadata(&field.Metadata)
		if err != nil {
			return nil, fmt.Errorf("convert to protobuf field: %w", err)
		}
		protoField.Metadata = md
	}

	protoType, err := ToProtobufDataType(field.Type)
	if err != nil {
		return nil, fmt.Errorf("convert to protobuf field: %w", err)
	}
	protoField.DataType = protoType

	return protoField, nil
}
// SetTypeValues fills protoType.TypeRelatedValues for the arrow types that
// carry extra parameters: fixed-size binary (byte width), fixed-size list
// (list size), dictionary (index type, value type, ordered flag) and map
// (keys-sorted flag). Other types leave protoType untouched. An error wrapping
// ErrInvalidArgument is returned when the type's ID and Go type disagree.
func SetTypeValues(protoType *storagev2pb.DataType, dataType arrow.DataType) error {
	switch dataType.ID() {
	case arrow.FIXED_SIZE_BINARY:
		binaryType, ok := dataType.(*arrow.FixedSizeBinaryType)
		if !ok {
			return fmt.Errorf("convert to fixed size binary type: %w", ErrInvalidArgument)
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_FixedSizeBinaryType{
			FixedSizeBinaryType: &storagev2pb.FixedSizeBinaryType{
				ByteWidth: int32(binaryType.ByteWidth),
			},
		}

	case arrow.FIXED_SIZE_LIST:
		listType, ok := dataType.(*arrow.FixedSizeListType)
		if !ok {
			return fmt.Errorf("convert to fixed size list type: %w", ErrInvalidArgument)
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_FixedSizeListType{
			FixedSizeListType: &storagev2pb.FixedSizeListType{
				ListSize: listType.Len(),
			},
		}

	case arrow.DICTIONARY:
		dictType, ok := dataType.(*arrow.DictionaryType)
		if !ok {
			return fmt.Errorf("convert to dictionary type: %w", ErrInvalidArgument)
		}
		indexType, err := ToProtobufDataType(dictType.IndexType)
		if err != nil {
			return err
		}
		valueType, err := ToProtobufDataType(dictType.ValueType)
		if err != nil {
			return err
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_DictionaryType{
			DictionaryType: &storagev2pb.DictionaryType{
				IndexType: indexType,
				ValueType: valueType,
				Ordered:   dictType.Ordered,
			},
		}

	case arrow.MAP:
		mapType, ok := dataType.(*arrow.MapType)
		if !ok {
			return fmt.Errorf("convert to map type: %w", ErrInvalidArgument)
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_MapType{
			MapType: &storagev2pb.MapType{
				KeysSorted: mapType.KeysSorted,
			},
		}
	}
	return nil
}
// ToProtobufSchema converts an arrow schema (fields, endianness and
// schema-level metadata) into its protobuf representation.
//
// Schema metadata is written into the Metadata message that
// FromProtobufSchema reads back. The previous guard
// (`HasMetadata() && !HasMetadata()`) could never be true, so metadata was
// silently dropped.
func ToProtobufSchema(schema *arrow.Schema) (*storagev2pb.ArrowSchema, error) {
	protoSchema := &storagev2pb.ArrowSchema{}

	for _, field := range schema.Fields() {
		fieldCopy := field
		protoField, err := ToProtobufField(&fieldCopy)
		if err != nil {
			return nil, err
		}
		protoSchema.Fields = append(protoSchema.Fields, protoField)
	}

	if schema.Endianness() == endian.LittleEndian {
		protoSchema.Endianness = storagev2pb.Endianness_Little
	} else if schema.Endianness() == endian.BigEndian {
		protoSchema.Endianness = storagev2pb.Endianness_Big
	}

	if schema.HasMetadata() {
		md := schema.Metadata()
		// Allocate the message explicitly: GetMetadata() on a fresh schema
		// returns nil, so there is nothing to append to.
		protoSchema.Metadata = &storagev2pb.KeyValueMetadata{
			Keys:   md.Keys(),
			Values: md.Values(),
		}
	}
	return protoSchema, nil
}
// FromProtobufSchema rebuilds an arrow schema from its protobuf form: every
// protobuf field is converted in order, then the schema-level metadata.
func FromProtobufSchema(schema *storagev2pb.ArrowSchema) (*arrow.Schema, error) {
	protoFields := schema.Fields
	fields := make([]arrow.Field, len(protoFields))
	for i, protoField := range protoFields {
		converted, err := FromProtobufField(protoField)
		if err != nil {
			return nil, err
		}
		fields[i] = *converted
	}

	md, err := FromProtobufKeyValueMetadata(schema.Metadata)
	if err != nil {
		return nil, err
	}
	return arrow.NewSchema(fields, md), nil
}
// FromProtobufField rebuilds an arrow field from its protobuf form, converting
// the data type first and the field metadata second.
func FromProtobufField(field *storagev2pb.Field) (*arrow.Field, error) {
	fieldType, err := FromProtobufDataType(field.DataType)
	if err != nil {
		return nil, err
	}

	md, err := FromProtobufKeyValueMetadata(field.GetMetadata())
	if err != nil {
		return nil, err
	}

	converted := &arrow.Field{
		Name:     field.Name,
		Type:     fieldType,
		Nullable: field.Nullable,
		Metadata: *md,
	}
	return converted, nil
}
// FromProtobufKeyValueMetadata converts protobuf key/value metadata into arrow
// metadata. A nil message produces empty metadata.
//
// arrow.NewMetadata panics when the two slices differ in length, so a
// malformed message is rejected here with an error wrapping
// ErrInvalidArgument instead.
func FromProtobufKeyValueMetadata(metadata *storagev2pb.KeyValueMetadata) (*arrow.Metadata, error) {
	keys := make([]string, 0)
	values := make([]string, 0)
	if metadata != nil {
		keys = metadata.Keys
		values = metadata.Values
	}
	if len(keys) != len(values) {
		return nil, fmt.Errorf("convert key value metadata: %d keys but %d values: %w",
			len(keys), len(values), ErrInvalidArgument)
	}
	newMetadata := arrow.NewMetadata(keys, values)
	return &newMetadata, nil
}
// protobufChildField converts the index-th child of dataType into an arrow
// field, returning an error wrapping ErrInvalidArgument when that child is
// missing or nil instead of panicking on the index.
func protobufChildField(dataType *storagev2pb.DataType, index int) (*arrow.Field, error) {
	if index >= len(dataType.Children) || dataType.Children[index] == nil {
		return nil, fmt.Errorf("parse protobuf datatype: missing child %d: %w", index, ErrInvalidArgument)
	}
	return FromProtobufField(dataType.Children[index])
}

// FromProtobufDataType rebuilds an arrow data type from its protobuf form.
//
// Malformed input (nil message, missing children, non-positive fixed list
// size, unknown logic type) is reported as an error wrapping
// ErrInvalidArgument rather than a panic.
//
// Dictionary types are read from the DictionaryType payload, which is where
// SetTypeValues stores the index and value types. Messages without that
// payload fall back to the first two children.
func FromProtobufDataType(dataType *storagev2pb.DataType) (arrow.DataType, error) {
	if dataType == nil {
		return nil, fmt.Errorf("parse protobuf datatype: nil data type: %w", ErrInvalidArgument)
	}

	switch dataType.LogicType {
	case storagev2pb.LogicType_NA:
		return &arrow.NullType{}, nil
	case storagev2pb.LogicType_BOOL:
		return &arrow.BooleanType{}, nil
	case storagev2pb.LogicType_UINT8:
		return &arrow.Uint8Type{}, nil
	case storagev2pb.LogicType_INT8:
		return &arrow.Int8Type{}, nil
	case storagev2pb.LogicType_UINT16:
		return &arrow.Uint16Type{}, nil
	case storagev2pb.LogicType_INT16:
		return &arrow.Int16Type{}, nil
	case storagev2pb.LogicType_UINT32:
		return &arrow.Uint32Type{}, nil
	case storagev2pb.LogicType_INT32:
		return &arrow.Int32Type{}, nil
	case storagev2pb.LogicType_UINT64:
		return &arrow.Uint64Type{}, nil
	case storagev2pb.LogicType_INT64:
		return &arrow.Int64Type{}, nil
	case storagev2pb.LogicType_HALF_FLOAT:
		return &arrow.Float16Type{}, nil
	case storagev2pb.LogicType_FLOAT:
		return &arrow.Float32Type{}, nil
	case storagev2pb.LogicType_DOUBLE:
		return &arrow.Float64Type{}, nil
	case storagev2pb.LogicType_STRING:
		return &arrow.StringType{}, nil
	case storagev2pb.LogicType_BINARY:
		return &arrow.BinaryType{}, nil

	case storagev2pb.LogicType_LIST:
		elemField, err := protobufChildField(dataType, 0)
		if err != nil {
			return nil, err
		}
		return arrow.ListOf(elemField.Type), nil

	case storagev2pb.LogicType_STRUCT:
		fields := make([]arrow.Field, 0, len(dataType.Children))
		for i := range dataType.Children {
			field, err := protobufChildField(dataType, i)
			if err != nil {
				return nil, err
			}
			fields = append(fields, *field)
		}
		return arrow.StructOf(fields...), nil

	case storagev2pb.LogicType_DICTIONARY:
		if dict := dataType.GetDictionaryType(); dict != nil {
			indexType, err := FromProtobufDataType(dict.GetIndexType())
			if err != nil {
				return nil, err
			}
			valueType, err := FromProtobufDataType(dict.GetValueType())
			if err != nil {
				return nil, err
			}
			return &arrow.DictionaryType{
				IndexType: indexType,
				ValueType: valueType,
				Ordered:   dict.GetOrdered(),
			}, nil
		}
		// No dictionary payload: keep the previous children-based decoding.
		keyField, err := protobufChildField(dataType, 0)
		if err != nil {
			return nil, err
		}
		valueField, err := protobufChildField(dataType, 1)
		if err != nil {
			return nil, err
		}
		return &arrow.DictionaryType{
			IndexType: keyField.Type,
			ValueType: valueField.Type,
		}, nil

	case storagev2pb.LogicType_MAP:
		keyField, err := protobufChildField(dataType, 0)
		if err != nil {
			return nil, err
		}
		// TODO FIX ME: the first child's type is used for both key and item.
		return arrow.MapOf(keyField.Type, keyField.Type), nil

	case storagev2pb.LogicType_FIXED_SIZE_BINARY:
		// Nil-safe getters: a missing payload yields a zero byte width.
		byteWidth := int(dataType.GetFixedSizeBinaryType().GetByteWidth())
		return &arrow.FixedSizeBinaryType{ByteWidth: byteWidth}, nil

	case storagev2pb.LogicType_FIXED_SIZE_LIST:
		elemField, err := protobufChildField(dataType, 0)
		if err != nil {
			return nil, err
		}
		listSize := dataType.GetFixedSizeListType().GetListSize()
		if listSize <= 0 {
			// arrow.FixedSizeListOf panics on a non-positive size.
			return nil, fmt.Errorf("parse protobuf datatype: fixed size list size %d: %w", listSize, ErrInvalidArgument)
		}
		return arrow.FixedSizeListOf(listSize, elemField.Type), nil

	default:
		return nil, fmt.Errorf("parse protobuf datatype: %w", ErrInvalidArgument)
	}
}
// GetNewParquetFilePath returns a path under path for a new parquet data
// file: a freshly generated UUID followed by the parquet data file suffix.
func GetNewParquetFilePath(path string) string {
	fileName := uuid.New().String() + constant.ParquetDataFileSuffix
	return filepath.Join(path, fileName)
}
// GetManifestFilePath returns the manifest file path for version:
// <path>/<manifest dir>/<version><manifest suffix>.
func GetManifestFilePath(path string, version int64) string {
	fileName := strconv.FormatInt(version, 10) + constant.ManifestFileSuffix
	return filepath.Join(path, constant.ManifestDir, fileName)
}
// GetManifestTmpFilePath returns the temporary manifest file path for version:
// <path>/<manifest dir>/<version><manifest temp suffix>.
func GetManifestTmpFilePath(path string, version int64) string {
	fileName := strconv.FormatInt(version, 10) + constant.ManifestTempFileSuffix
	return filepath.Join(path, constant.ManifestDir, fileName)
}
// GetBlobFilePath returns a path for a new blob file: a freshly generated UUID
// inside the blob directory of path.
func GetBlobFilePath(path string) string {
	fileName := uuid.New().String()
	return filepath.Join(GetBlobDir(path), fileName)
}
// GetManifestDir returns the manifest directory under path.
func GetManifestDir(path string) string {
	return filepath.Join(path, constant.ManifestDir)
}
// GetVectorDataDir returns the vector data directory under path.
func GetVectorDataDir(path string) string {
return filepath.Join(path, constant.VectorDataDir)
}
// GetScalarDataDir returns the scalar data directory under path.
func GetScalarDataDir(path string) string {
return filepath.Join(path, constant.ScalarDataDir)
}
// GetBlobDir returns the blob directory under path.
func GetBlobDir(path string) string {
return filepath.Join(path, constant.BlobDir)
}
// GetDeleteDataDir returns the delete data directory under path.
func GetDeleteDataDir(path string) string {
return filepath.Join(path, constant.DeleteDataDir)
}
// ParseVersionFromFileName extracts the numeric version from a manifest file
// name of the form "<version><manifest suffix>". It returns -1, after logging,
// when the name does not end with the manifest suffix or when the part before
// the first occurrence of the suffix is not a base-10 int64.
func ParseVersionFromFileName(path string) int64 {
	version, _, found := strings.Cut(path, constant.ManifestFileSuffix)
	if !found || !strings.HasSuffix(path, constant.ManifestFileSuffix) {
		log.Warn("manifest file suffix not match", log.String("path", path))
		return -1
	}

	parsed, err := strconv.ParseInt(version, 10, 64)
	if err != nil {
		log.Error("parse version from file name error", log.String("path", path), log.String("version", version))
		return -1
	}
	return parsed
}
// ProjectSchema returns a schema holding only the fields of sc whose name
// appears in columns. Fields keep the order they have in sc and the result
// carries no schema metadata.
func ProjectSchema(sc *arrow.Schema, columns []string) *arrow.Schema {
	wanted := make(map[string]struct{}, len(columns))
	for _, name := range columns {
		wanted[name] = struct{}{}
	}

	var projected []arrow.Field
	for _, field := range sc.Fields() {
		if _, ok := wanted[field.Name]; ok {
			projected = append(projected, field)
		}
	}
	return arrow.NewSchema(projected, nil)
}

View File

@ -0,0 +1,22 @@
**Storage layer interface**: supplies the storage reader/writer, including the read options. It maintains the storage metadata and handles atomic reads/writes across multiple files (possibly in different formats) on disk.
---
**File Reader/Writer interface**: receives data and read options from the upper layer and converts the raw data into our defined data representation.
---
**File Format Reader/Writer**: format-specific reader/writer (e.g. Parquet, raw, or others such as ORC).
---
**File system interface**: supports different file systems (e.g. in-memory, AWS, MinIO, POSIX, Windows).

View File

@ -0,0 +1,41 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package blob
import (
"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)
// Blob is the in-memory counterpart of storagev2pb.Blob; the two are converted
// field by field by ToProtobuf and FromProtobuf.
type Blob struct {
Name string
Size int64
File string
}
// ToProtobuf returns a new protobuf message carrying the blob's name, size
// and file.
func (b Blob) ToProtobuf() *storagev2pb.Blob {
	return &storagev2pb.Blob{
		Name: b.Name,
		Size: b.Size,
		File: b.File,
	}
}
// FromProtobuf builds a Blob from the name, size and file of a protobuf
// message. blob must not be nil.
func FromProtobuf(blob *storagev2pb.Blob) Blob {
	var converted Blob
	converted.Name = blob.Name
	converted.Size = blob.Size
	converted.File = blob.File
	return converted
}

View File

@ -0,0 +1,45 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fragment
import (
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
type (
// pkType is the key type of DeleteFragment.data.
pkType any
// DeleteFragmentVector is a list of delete fragments.
DeleteFragmentVector []DeleteFragment
// DeleteFragment bundles a fragment id with the schema and file system it
// was created for, plus an in-memory map of deletions.
DeleteFragment struct {
id int64
schema *schema.Schema
fs fs.Fs
// data maps a primary key to a list of int64 values
// (presumably versions or timestamps — TODO confirm).
data map[pkType][]int64
}
)
// NewDeleteFragment returns a DeleteFragment with the given id, schema and
// file system and an empty, ready-to-use data map.
func NewDeleteFragment(id int64, schema *schema.Schema, fs fs.Fs) *DeleteFragment {
	frag := new(DeleteFragment)
	frag.id = id
	frag.schema = schema
	frag.fs = fs
	frag.data = make(map[pkType][]int64)
	return frag
}
// Make is a placeholder for building a DeleteFragment from frag. It is not
// implemented yet and always panics.
func Make(f fs.Fs, s *schema.Schema, frag Fragment) DeleteFragment {
// TODO: implement
panic("implement me")
}

View File

@ -0,0 +1,76 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fragment
import "github.com/milvus-io/milvus/pkg/proto/storagev2pb"
// FragmentType distinguishes the kinds of fragments.
type FragmentType int32
// Known fragment kinds.
const (
kUnknown FragmentType = 0
kData FragmentType = 1
kDelete FragmentType = 2
)
// Fragment is an identified group of file paths.
type Fragment struct {
	fragmentId int64
	files      []string
}

// FragmentVector is a list of fragments.
type FragmentVector []Fragment

// ToFilesVector concatenates the files of all fragments, in order, into one
// slice. The result is never nil.
func ToFilesVector(fragments []Fragment) []string {
	total := 0
	for i := range fragments {
		total += len(fragments[i].files)
	}

	files := make([]string, 0, total)
	for i := range fragments {
		files = append(files, fragments[i].files...)
	}
	return files
}

// NewFragment returns a fragment with id 0 and an empty, non-nil file list.
func NewFragment() Fragment {
	return Fragment{files: []string{}}
}

// AddFile appends file to the fragment's file list.
func (f *Fragment) AddFile(file string) {
	f.files = append(f.files, file)
}

// Files returns the fragment's file list (the internal slice, not a copy).
func (f *Fragment) Files() []string {
	return f.files
}

// FragmentId returns the fragment's id.
func (f *Fragment) FragmentId() int64 {
	return f.fragmentId
}

// SetFragmentId sets the fragment's id.
func (f *Fragment) SetFragmentId(fragmentId int64) {
	f.fragmentId = fragmentId
}
// ToProtobuf returns a new protobuf message carrying the fragment's id and a
// copy of its file list.
func (f *Fragment) ToProtobuf() *storagev2pb.Fragment {
	proto := &storagev2pb.Fragment{Id: f.fragmentId}
	proto.Files = append(proto.Files, f.files...)
	return proto
}
// FromProtobuf builds a Fragment from a protobuf message, copying its id and
// file list. fragment must not be nil.
func FromProtobuf(fragment *storagev2pb.Fragment) Fragment {
	converted := NewFragment()
	converted.SetFragmentId(fragment.GetId())
	for _, file := range fragment.Files {
		converted.AddFile(file)
	}
	return converted
}

View File

@ -0,0 +1,84 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package filter
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/parquet/metadata"
"github.com/bits-and-blooms/bitset"
)
// ConjunctionAndFilter combines child filters with AND semantics.
type ConjunctionAndFilter struct {
// filters are the child filters, consulted in order.
filters []Filter
// columnName is what GetColumnName reports.
columnName string
}
// GetColumnName returns the filter's column name.
func (f *ConjunctionAndFilter) GetColumnName() string {
return f.columnName
}
// CheckStatistics reports true as soon as any child filter reports true for
// stats, and false when none does (including when there are no children).
//
// FIXME: should have 3 cases.
// 1. all records satisfy the filter, this group doesn't need to check the filter again.
// 2. no record satisfies the filter.
// 3. some records satisfy the filter, this group should check the filter again.
func (f *ConjunctionAndFilter) CheckStatistics(stats metadata.TypedStatistics) bool {
	for i := range f.filters {
		if f.filters[i].CheckStatistics(stats) {
			return true
		}
	}
	return false
}
// Type identifies this filter as an And filter.
func (f *ConjunctionAndFilter) Type() FilterType {
return And
}
// Apply runs every child filter, in order, against colData with the same
// bitset, so the bits set by all children accumulate in filterBitSet.
func (f *ConjunctionAndFilter) Apply(colData arrow.Array, filterBitSet *bitset.BitSet) {
	for _, child := range f.filters {
		child.Apply(colData, filterBitSet)
	}
}
// ConjunctionOrFilter combines child filters with OR semantics.
//
// NOTE(review): this file defines neither a GetColumnName method nor a
// constructor for this type — confirm it is meant to satisfy Filter.
type ConjunctionOrFilter struct {
// filters are the child filters.
filters []Filter
}
// CheckStatistics reports true only when every child filter reports true for
// stats; a single child reporting false makes the result false. With no
// children the result is true.
func (f *ConjunctionOrFilter) CheckStatistics(stats metadata.TypedStatistics) bool {
	for i := range f.filters {
		if f.filters[i].CheckStatistics(stats) {
			continue
		}
		return false
	}
	return true
}
// Apply sets in filterBitSet the bit of every row of colData that ALL child
// filters mark. A row whose bit at least one child leaves clear stays
// unmarked by this filter. Bits already set in filterBitSet are preserved.
// With no child filters the bitset is left untouched.
//
// The previous implementation skipped the first child, intersected into an
// all-zero accumulator and discarded the results of the non-mutating
// Intersection/Union calls, so it never changed filterBitSet.
func (f *ConjunctionOrFilter) Apply(colData arrow.Array, filterBitSet *bitset.BitSet) {
	if len(f.filters) == 0 {
		return
	}

	var markedByAll *bitset.BitSet
	for _, child := range f.filters {
		// Each child starts from a clean bitset so bits set earlier cannot
		// leak into the intersection.
		marked := bitset.New(filterBitSet.Len())
		child.Apply(colData, marked)
		if markedByAll == nil {
			markedByAll = marked
			continue
		}
		// The in-place variants mutate the receiver; Intersection and Union
		// return a new set instead.
		markedByAll.InPlaceIntersection(marked)
	}
	filterBitSet.InPlaceUnion(markedByAll)
}
// Type identifies this filter as an Or filter.
func (f *ConjunctionOrFilter) Type() FilterType {
return Or
}
// NewConjunctionAndFilter returns an And filter over filters. The columnName
// field is left empty by this constructor.
func NewConjunctionAndFilter(filters ...Filter) *ConjunctionAndFilter {
return &ConjunctionAndFilter{filters: filters}
}

View File

@ -0,0 +1,151 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package filter
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/parquet"
"github.com/apache/arrow/go/v12/parquet/metadata"
"github.com/bits-and-blooms/bitset"
)
// ConstantFilter compares the values of one column against a constant.
type ConstantFilter struct {
// cmpType is the comparison to perform.
cmpType ComparisonType
// value is the constant; its dynamic Go type must match what
// CheckStatistics and Apply assert for the column at hand.
value interface{}
// columnName is the column the filter applies to.
columnName string
}
// GetColumnName returns the name of the column the filter applies to.
func (f *ConstantFilter) GetColumnName() string {
return f.columnName
}
// CheckStatistics evaluates the filter against a column chunk's min/max
// statistics via checkStats. It returns false whenever no decision can be
// made: the statistics carry no min/max, the physical type is not one of
// int32/int64/float/double, or the filter value's Go type does not match the
// statistics' value type.
//
// The value type is checked rather than asserted. Apply needs, for example,
// an int8 value for an int8 column, so an unchecked f.value.(int32) here
// would panic for such filters instead of just declining to decide.
//
// FIXME: value may be int8/uint8/..., the value type should be encapsulated
// so narrower integers can take part in statistics checks too.
func (f *ConstantFilter) CheckStatistics(stats metadata.TypedStatistics) bool {
	switch stats.Type() {
	case parquet.Types.Int32:
		typed, okStats := stats.(*metadata.Int32Statistics)
		value, okValue := f.value.(int32)
		if okStats && okValue && typed.HasMinMax() {
			return checkStats(value, typed.Min(), typed.Max(), f.cmpType)
		}
	case parquet.Types.Int64:
		typed, okStats := stats.(*metadata.Int64Statistics)
		value, okValue := f.value.(int64)
		if okStats && okValue && typed.HasMinMax() {
			return checkStats(value, typed.Min(), typed.Max(), f.cmpType)
		}
	case parquet.Types.Float:
		typed, okStats := stats.(*metadata.Float32Statistics)
		value, okValue := f.value.(float32)
		if okStats && okValue && typed.HasMinMax() {
			return checkStats(value, typed.Min(), typed.Max(), f.cmpType)
		}
	case parquet.Types.Double:
		typed, okStats := stats.(*metadata.Float64Statistics)
		value, okValue := f.value.(float64)
		if okStats && okValue && typed.HasMinMax() {
			return checkStats(value, typed.Min(), typed.Max(), f.cmpType)
		}
	}
	return false
}
// comparableValue lists the value types checkStats can compare.
type comparableValue interface {
	int32 | int64 | float32 | float64
}

// checkStats reports whether the [min, max] range proves that no value in it
// can satisfy "column <cmpType> value":
//
//	Equal              -> value lies outside [min, max]
//	NotEqual           -> min and max both equal value
//	LessThan           -> value <= min
//	LessThanOrEqual    -> value <  min
//	GreaterThan        -> value >= max
//	GreaterThanOrEqual -> value >  max
//
// Unknown comparison types report false.
func checkStats[T comparableValue](value, min, max T, cmpType ComparisonType) bool {
	var excluded bool
	switch cmpType {
	case Equal:
		excluded = value < min || value > max
	case NotEqual:
		excluded = value == min && value == max
	case LessThan:
		excluded = value <= min
	case LessThanOrEqual:
		excluded = value < min
	case GreaterThan:
		excluded = value >= max
	case GreaterThanOrEqual:
		excluded = value > max
	}
	return excluded
}
// Apply compares every value of colData against the filter constant and sets
// the bit of each row for which checkColumn reports true.
//
// f.value must have exactly the Go type matching the column's element type
// (e.g. int8 for *array.Int8); otherwise the type assertion panics. Column
// types not listed below leave filterBitSet untouched.
func (f *ConstantFilter) Apply(colData arrow.Array, filterBitSet *bitset.BitSet) {
switch data := colData.(type) {
case *array.Int8:
filterColumn(f.value.(int8), data.Int8Values(), f.cmpType, filterBitSet)
case *array.Uint8:
filterColumn(f.value.(uint8), data.Uint8Values(), f.cmpType, filterBitSet)
case *array.Int16:
filterColumn(f.value.(int16), data.Int16Values(), f.cmpType, filterBitSet)
case *array.Uint16:
filterColumn(f.value.(uint16), data.Uint16Values(), f.cmpType, filterBitSet)
case *array.Int32:
filterColumn(f.value.(int32), data.Int32Values(), f.cmpType, filterBitSet)
case *array.Uint32:
filterColumn(f.value.(uint32), data.Uint32Values(), f.cmpType, filterBitSet)
case *array.Int64:
filterColumn(f.value.(int64), data.Int64Values(), f.cmpType, filterBitSet)
case *array.Uint64:
filterColumn(f.value.(uint64), data.Uint64Values(), f.cmpType, filterBitSet)
case *array.Float32:
filterColumn(f.value.(float32), data.Float32Values(), f.cmpType, filterBitSet)
case *array.Float64:
filterColumn(f.value.(float64), data.Float64Values(), f.cmpType, filterBitSet)
}
}
// comparableColumnType lists the column element types filterColumn and
// checkColumn can compare.
type comparableColumnType interface {
int8 | uint8 | int16 | uint16 | int32 | uint32 | int64 | uint64 | float32 | float64
}
// filterColumn sets bit i of filterBitSet for every targets[i] for which
// checkColumn(value, targets[i], cmpType) reports true.
func filterColumn[T comparableColumnType](value T, targets []T, cmpType ComparisonType, filterBitSet *bitset.BitSet) {
	for i := range targets {
		if !checkColumn(value, targets[i], cmpType) {
			continue
		}
		filterBitSet.Set(uint(i))
	}
}
// checkColumn reports whether target FAILS the comparison
// "target <cmpType> value":
//
//	Equal              -> value != target
//	NotEqual           -> value == target
//	LessThan           -> value <= target
//	LessThanOrEqual    -> value <  target
//	GreaterThan        -> value >= target
//	GreaterThanOrEqual -> value >  target
//
// Unknown comparison types report false.
func checkColumn[T comparableColumnType](value, target T, cmpType ComparisonType) bool {
	var rejected bool
	switch cmpType {
	case Equal:
		rejected = value != target
	case NotEqual:
		rejected = value == target
	case LessThan:
		rejected = value <= target
	case LessThanOrEqual:
		rejected = value < target
	case GreaterThan:
		rejected = value >= target
	case GreaterThanOrEqual:
		rejected = value > target
	}
	return rejected
}
// Type identifies this filter as a Constant filter.
func (f *ConstantFilter) Type() FilterType {
return Constant
}
// NewConstantFilter returns a filter comparing column columnName against value
// using cmpType. value is stored as given; its Go type is only checked when the
// filter is used.
func NewConstantFilter(cmpType ComparisonType, columnName string, value interface{}) *ConstantFilter {
return &ConstantFilter{
cmpType: cmpType,
columnName: columnName,
value: value,
}
}

View File

@ -0,0 +1,48 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package filter
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/parquet/metadata"
"github.com/bits-and-blooms/bitset"
)
// FilterType identifies the kind of a Filter.
type FilterType int8
// Known filter kinds.
const (
And FilterType = iota
Or
Constant
Range
)
// Filter is a predicate that can be checked against column statistics and
// applied to column data.
type Filter interface {
// CheckStatistics evaluates the filter against a column chunk's statistics.
CheckStatistics(metadata.TypedStatistics) bool
// Type returns the kind of the filter.
Type() FilterType
// Apply evaluates the filter against colData and records the outcome by
// setting bits in filterBitSet.
Apply(colData arrow.Array, filterBitSet *bitset.BitSet)
// GetColumnName returns the name of the column the filter applies to.
GetColumnName() string
}
// ComparisonType selects the comparison a filter performs.
type ComparisonType int8
// Supported comparisons.
const (
Equal ComparisonType = iota
NotEqual
LessThan
LessThanOrEqual
GreaterThan
GreaterThanOrEqual
)

View File

@ -0,0 +1,220 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package parquet
import (
"context"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/parquet/file"
"github.com/apache/arrow/go/v12/parquet/metadata"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"github.com/bits-and-blooms/bitset"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/filter"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// FileReader reads arrow records from a parquet file, honouring the column
// projection and filters in its read options.
type FileReader struct {
// reader is the pqarrow reader over the parquet file.
reader *pqarrow.FileReader
// options holds the columns to read and the filters to apply.
options *options.ReadOptions
// recReader is created lazily by the first Read.
recReader pqarrow.RecordReader
}
// Read returns the next record with the configured filters applied. The
// record reader is created on the first call.
//
// When the Reader reaches the end of the underlying stream, it returns (nil, io.EOF)
func (r *FileReader) Read() (arrow.Record, error) {
	// Lazy initialization of the record reader.
	if r.recReader == nil {
		err := r.initRecReader()
		if err != nil {
			return nil, err
		}
	}

	record, err := r.recReader.Read()
	if err != nil {
		return nil, err
	}

	filtered := applyFilters(record, r.options.Filters)
	return filtered, nil
}
// keptValidity returns the validity flags of the rows of col whose bit in
// filterBitSet is clear, in row order. It returns nil when col has no nulls,
// which the array builders treat as "every value is valid".
func keptValidity(col arrow.Array, filterBitSet *bitset.BitSet) []bool {
	if col.NullN() == 0 {
		return nil
	}
	var valid []bool
	for i := uint(0); i < filterBitSet.Len(); i++ {
		if !filterBitSet.Test(i) {
			valid = append(valid, col.IsValid(int(i)))
		}
	}
	return valid
}

// applyFilters runs every filter against its column of rec and returns a
// record holding only the rows whose bit no filter set. When no bit is set
// the input record is returned unchanged.
//
// Rebuilt columns keep their null slots, and float32/float64 columns are
// supported alongside the integer types. Any other column type panics with
// "unsupported type", and a filter naming a column missing from rec panics
// with "column not found".
func applyFilters(rec arrow.Record, filters map[string]filter.Filter) arrow.Record {
	filterBitSet := bitset.New(uint(rec.NumRows()))
	for col, f := range filters {
		colIndices := rec.Schema().FieldIndices(col)
		if len(colIndices) == 0 {
			panic("column not found")
		}
		f.Apply(rec.Column(colIndices[0]), filterBitSet)
	}

	if filterBitSet.None() {
		return rec
	}

	cols := make([]arrow.Array, 0, int(rec.NumCols()))
	for i := 0; i < int(rec.NumCols()); i++ {
		col := rec.Column(i)
		valid := keptValidity(col, filterBitSet)
		switch t := col.(type) {
		case *array.Int8:
			builder := array.NewInt8Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Int8Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Uint8:
			builder := array.NewUint8Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Uint8Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Int16:
			builder := array.NewInt16Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Int16Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Uint16:
			builder := array.NewUint16Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Uint16Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Int32:
			builder := array.NewInt32Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Int32Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Uint32:
			builder := array.NewUint32Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Uint32Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Int64:
			builder := array.NewInt64Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Int64Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Uint64:
			builder := array.NewUint64Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Uint64Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Float32:
			builder := array.NewFloat32Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Float32Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		case *array.Float64:
			builder := array.NewFloat64Builder(memory.DefaultAllocator)
			builder.AppendValues(filterRecord(t.Float64Values(), filterBitSet), valid)
			cols = append(cols, builder.NewArray())
		default:
			panic("unsupported type")
		}
	}

	return array.NewRecord(rec.Schema(), cols, int64(cols[0].Len()))
}
// comparableColumnType lists the element types filterRecord can handle.
type comparableColumnType interface {
	int8 | uint8 | int16 | uint16 | int32 | uint32 | int64 | uint64 | float32 | float64
}

// filterRecord returns, in order, the elements of targets whose bit in
// filterBitSet is clear. targets must be at least as long as the bitset.
func filterRecord[T comparableColumnType](targets []T, filterBitSet *bitset.BitSet) []T {
	var kept []T
	for i := uint(0); i < filterBitSet.Len(); i++ {
		if filterBitSet.Test(i) {
			continue
		}
		kept = append(kept, targets[i])
	}
	return kept
}
// initRecReader creates the record reader used by Read. It selects the row
// groups that the filters cannot rule out from column statistics and resolves
// the requested column names to indices.
//
// A row group ruled out by any filter is skipped and the scan moves on to the
// next one. The earlier code broke out of the whole loop at that point, which
// dropped every later row group as well.
//
// Panics with "column not found" when a requested column is missing.
func (r *FileReader) initRecReader() error {
	filters := r.options.Filters
	columns := r.options.Columns

	parquetReader := r.reader.ParquetReader()
	rowGroupNum := parquetReader.NumRowGroups()
	fileMetaData := parquetReader.MetaData()

	// rowGroups and colIndices deliberately start as nil slices, exactly as
	// before. NOTE(review): if every row group is ruled out, rowGroups stays
	// nil — confirm how GetRecordReader treats a nil row-group list.
	var rowGroups []int
	var colIndices []int

	// filters check column statistics
rowGroupLoop:
	for i := 0; i < rowGroupNum; i++ {
		rowGroupMetaData := fileMetaData.RowGroup(i)
		for col, f := range filters {
			if checkColumnStats(rowGroupMetaData, col, f) {
				// ignore this row group only
				continue rowGroupLoop
			}
		}
		rowGroups = append(rowGroups, i)
	}

	for _, col := range columns {
		colIndex := fileMetaData.Schema.Root().FieldIndexByName(col)
		if colIndex == -1 {
			panic("column not found")
		}
		colIndices = append(colIndices, colIndex)
	}

	recReader, err := r.reader.GetRecordReader(context.TODO(), colIndices, rowGroups)
	if err != nil {
		return err
	}
	r.recReader = recReader
	return nil
}
// checkColumnStats looks up the statistics of column col in the given row
// group and returns f.CheckStatistics for them. It returns false when the
// statistics are unavailable. It panics when the column does not exist or its
// column chunk metadata cannot be read.
func checkColumnStats(rowGroupMetaData *metadata.RowGroupMetaData, col string, f filter.Filter) bool {
	index := rowGroupMetaData.Schema.Root().FieldIndexByName(col)
	if index == -1 {
		panic("column not found")
	}

	chunk, err := rowGroupMetaData.ColumnChunk(index)
	if err != nil {
		panic(err)
	}

	stats, err := chunk.Statistics()
	if err == nil && stats != nil {
		return f.CheckStatistics(stats)
	}
	return false
}
// Close releases the record reader, if one was created, and closes the
// underlying parquet reader so the file opened by NewFileReader is not leaked.
// It returns the error reported by the parquet reader's Close.
func (r *FileReader) Close() error {
	if r.recReader != nil {
		r.recReader.Release()
		// Drop the reference so a repeated Close does not release twice.
		r.recReader = nil
	}
	if r.reader == nil {
		return nil
	}
	return r.reader.ParquetReader().Close()
}
// NewFileReader opens filePath on fs and wraps it in a parquet/Arrow reader
// configured with constant.ReadBatchSize and the default allocator. The given
// read options are stored on the returned FileReader.
//
// On failure every resource opened so far is closed before the error is
// returned, so nothing is leaked.
func NewFileReader(fs fs.Fs, filePath string, options *options.ReadOptions) (*FileReader, error) {
	f, err := fs.OpenFile(filePath)
	if err != nil {
		return nil, err
	}

	parquetReader, err := file.NewParquetReader(f)
	if err != nil {
		// Best-effort cleanup; the construction error is the one worth reporting.
		_ = f.Close()
		return nil, err
	}

	reader, err := pqarrow.NewFileReader(parquetReader, pqarrow.ArrowReadProperties{BatchSize: constant.ReadBatchSize}, memory.DefaultAllocator)
	if err != nil {
		// Closing the parquet reader also closes the file it wraps.
		_ = parquetReader.Close()
		return nil, err
	}
	return &FileReader{reader: reader, options: options}, nil
}

View File

@ -0,0 +1,61 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package parquet
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/parquet"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"github.com/milvus-io/milvus/internal/storagev2/io/format"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
)
// Compile-time check that FileWriter implements format.Writer.
var _ format.Writer = (*FileWriter)(nil)

// FileWriter writes Arrow records to a single Parquet file and keeps a
// running total of the rows written.
type FileWriter struct {
	writer *pqarrow.FileWriter
	count  int64 // rows successfully written so far
}

// Write appends record to the file. The row counter is only advanced when the
// underlying writer accepted the record.
func (f *FileWriter) Write(record arrow.Record) error {
	if err := f.writer.Write(record); err != nil {
		return err
	}
	f.count += record.NumRows()
	return nil
}

// Count returns the number of rows written so far.
func (f *FileWriter) Count() int64 {
	return f.count
}

// Close closes the underlying Parquet writer and returns its error.
func (f *FileWriter) Close() error {
	return f.writer.Close()
}

// NewFileWriter opens filePath on fs and returns a FileWriter that encodes
// records with the given schema using default Parquet and Arrow writer
// properties. If the Parquet writer cannot be created, the opened file is
// closed before the error is returned.
func NewFileWriter(schema *arrow.Schema, fs fs.Fs, filePath string) (*FileWriter, error) {
	file, err := fs.OpenFile(filePath)
	if err != nil {
		return nil, err
	}
	w, err := pqarrow.NewFileWriter(schema, file, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
	if err != nil {
		// Best-effort cleanup; the construction error is the one worth reporting.
		_ = file.Close()
		return nil, err
	}
	return &FileWriter{writer: w}, nil
}

View File

@ -0,0 +1,24 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package format
import (
"github.com/apache/arrow/go/v12/arrow"
)
// Reader is the format-independent interface for reading stored data back as
// Arrow records.
type Reader interface {
// Read returns an Arrow record or an error.
// NOTE(review): how end-of-data is signalled is not visible in this file —
// confirm against the implementations.
Read() (arrow.Record, error)
// Close is called when the reader is no longer needed.
Close() error
}

View File

@ -0,0 +1,23 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package format
import "github.com/apache/arrow/go/v12/arrow"
// Writer is the format-independent interface for persisting Arrow records.
type Writer interface {
// Write persists one Arrow record.
Write(record arrow.Record) error
// Count returns an int64 counter; presumably the number of rows written so
// far — confirm against the implementations.
Count() int64
// Close is called when no more records will be written.
Close() error
}

View File

@ -0,0 +1,40 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"net/url"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// Factory creates Fs implementations. It carries no state.
type Factory struct{}

// Create returns the Fs implementation selected by fsType. uri is passed to
// the local and S3 implementations and ignored for the in-memory one.
//
// It panics on an unknown fsType.
func (f *Factory) Create(fsType options.FsType, uri *url.URL) (Fs, error) {
	switch fsType {
	case options.InMemory:
		return NewMemoryFs(), nil
	case options.LocalFS:
		return NewLocalFs(uri), nil
	case options.S3:
		minioFs, err := NewMinioFs(uri)
		if err != nil {
			// Return an untyped nil: passing the (*MinioFs)(nil) result through
			// would yield a non-nil Fs interface holding a nil pointer.
			return nil, err
		}
		return minioFs, nil
	default:
		panic("unknown fs type")
	}
}

// NewFsFactory returns a new Factory.
func NewFsFactory() *Factory {
	return &Factory{}
}

View File

@ -0,0 +1,25 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package file
import "io"
// File is the handle type returned by the storage file systems. It combines
// sequential reading and writing, seeking, positional reads and closing into
// one interface.
type File interface {
io.Writer
io.ReaderAt
io.Seeker
io.Reader
io.Closer
}

View File

@ -0,0 +1,52 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package file
import (
"io"
"os"
)
// EOF is an alias of io.EOF exported from this package.
var EOF = io.EOF

// LocalFile adapts an *os.File to the File interface by delegating every
// operation to it.
type LocalFile struct {
	// file is held by pointer: an os.File handle must not be copied, and a nil
	// handle then yields os.ErrInvalid from every method instead of a panic.
	file *os.File
}

// Read reads up to len(p) bytes from the current file offset.
func (l *LocalFile) Read(p []byte) (n int, err error) {
	return l.file.Read(p)
}

// Write writes p at the current file offset.
func (l *LocalFile) Write(p []byte) (n int, err error) {
	return l.file.Write(p)
}

// ReadAt reads len(p) bytes starting at byte offset off.
func (l *LocalFile) ReadAt(p []byte, off int64) (n int, err error) {
	return l.file.ReadAt(p, off)
}

// Seek sets the offset for the next Read or Write and returns the new offset.
func (l *LocalFile) Seek(offset int64, whence int) (int64, error) {
	return l.file.Seek(offset, whence)
}

// Close closes the underlying file.
func (l *LocalFile) Close() error {
	return l.file.Close()
}

// NewLocalFile wraps f. The returned LocalFile shares the handle with f, so
// closing either one closes the file.
func NewLocalFile(f *os.File) *LocalFile {
	return &LocalFile{file: f}
}

View File

@ -0,0 +1,116 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package file
import (
"io"
"github.com/cockroachdb/errors"
)
// errInvalid is returned for negative or non-representable offsets and for
// unknown Seek whence values.
var errInvalid = errors.New("invalid argument")

// MemoryFile is an in-memory file backed by a byte slice.
type MemoryFile struct {
	b []byte // file contents
	i int    // offset used by Read, Write and Seek
}

// Close is a no-op and always returns nil.
func (f *MemoryFile) Close() error {
	return nil
}

// Read copies bytes from the current offset into p and advances the offset.
// It returns io.EOF once the offset is at or past the end of the contents.
func (f *MemoryFile) Read(p []byte) (n int, err error) {
	if f.i >= len(f.b) {
		return 0, io.EOF
	}
	n = copy(p, f.b[f.i:])
	f.i += n
	return n, nil
}

// Write stores b at the current offset, overwriting existing bytes and
// growing the contents as needed, then advances the offset.
func (f *MemoryFile) Write(b []byte) (int, error) {
	n, err := f.writeAt(b, int64(f.i))
	f.i += n
	return n, err
}

// writeAt stores b at byte offset off. A gap between the current end of the
// contents and off is zero-filled first.
func (f *MemoryFile) writeAt(b []byte, off int64) (int, error) {
	if off < 0 || int64(int(off)) < off {
		return 0, errInvalid
	}
	if off > int64(len(f.b)) {
		if err := f.truncate(off); err != nil {
			return 0, err
		}
	}
	// Overwrite what fits inside the existing contents, append the remainder.
	n := copy(f.b[off:], b)
	f.b = append(f.b, b[n:]...)
	return len(b), nil
}

// truncate resizes the contents to exactly n bytes, zero-filling on growth.
func (f *MemoryFile) truncate(n int64) error {
	switch {
	case n < 0 || int64(int(n)) < n:
		return errInvalid
	case n <= int64(len(f.b)):
		f.b = f.b[:n]
		return nil
	default:
		f.b = append(f.b, make([]byte, int(n)-len(f.b))...)
		return nil
	}
}

// ReadAt copies bytes starting at offset off into b. It returns io.EOF when
// fewer than len(b) bytes were available. As required by io.ReaderAt, it does
// not move the offset used by Read, Write and Seek.
func (f *MemoryFile) ReadAt(b []byte, off int64) (n int, err error) {
	if off < 0 || int64(int(off)) < off {
		return 0, errInvalid
	}
	if off > int64(len(f.b)) {
		return 0, io.EOF
	}
	n = copy(b, f.b[off:])
	if n < len(b) {
		return n, io.EOF
	}
	return n, nil
}

// Seek sets the offset for the next Read or Write according to whence
// (io.SeekStart, io.SeekCurrent or io.SeekEnd) and returns the new absolute
// offset. Negative resulting offsets and unknown whence values are rejected.
func (f *MemoryFile) Seek(offset int64, whence int) (int64, error) {
	var abs int64
	switch whence {
	case io.SeekStart:
		abs = offset
	case io.SeekCurrent:
		abs = int64(f.i) + offset
	case io.SeekEnd:
		abs = int64(len(f.b)) + offset
	default:
		return 0, errInvalid
	}
	if abs < 0 {
		return 0, errInvalid
	}
	f.i = int(abs)
	return abs, nil
}

// Bytes returns the contents. The slice is not copied.
func (f *MemoryFile) Bytes() []byte {
	return f.b
}

// NewMemoryFile returns a MemoryFile whose contents are b (not copied) and
// whose offset is 0.
func NewMemoryFile(b []byte) *MemoryFile {
	return &MemoryFile{
		b: b,
	}
}

View File

@ -0,0 +1,73 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package file
import (
"bytes"
"context"
"github.com/minio/minio-go/v7"
)
// Compile-time check that MinioFile implements File.
var _ File = (*MinioFile)(nil)

// MinioFile is a File backed by an object in a MinIO/S3 bucket. Reads and
// seeks are served by the embedded *minio.Object, which is nil when the
// object did not exist at open time. Writes are buffered in memory and
// uploaded by Close.
type MinioFile struct {
	*minio.Object
	writer     *MemoryFile
	client     *minio.Client
	fileName   string
	bucketName string
}

// Write buffers b in memory; nothing is sent to the server until Close.
func (f *MinioFile) Write(b []byte) (int, error) {
	return f.writer.Write(b)
}

// Close uploads the buffered bytes (if any) as the object's new content and
// then closes the read handle, if one was opened. The upload error takes
// precedence over the error from closing the handle.
func (f *MinioFile) Close() error {
	var err error
	if len(f.writer.b) > 0 {
		_, err = f.client.PutObject(context.TODO(), f.bucketName, f.fileName, bytes.NewReader(f.writer.b), int64(len(f.writer.b)), minio.PutObjectOptions{})
	}
	if f.Object != nil {
		// Close the GetObject handle so it is not leaked.
		if cerr := f.Object.Close(); err == nil {
			err = cerr
		}
	}
	return err
}

// NewMinioFile opens fileName in bucketName. If the object exists, a read
// handle to it is attached; if StatObject reports "NoSuchKey", the file is
// returned without a read handle. Any other StatObject error is returned.
func NewMinioFile(client *minio.Client, fileName string, bucketName string) (*MinioFile, error) {
	_, err := client.StatObject(context.TODO(), bucketName, fileName, minio.StatObjectOptions{})
	if err != nil {
		eresp := minio.ToErrorResponse(err)
		if eresp.Code != "NoSuchKey" {
			return nil, err
		}
		return &MinioFile{
			writer:     NewMemoryFile(nil),
			client:     client,
			fileName:   fileName,
			bucketName: bucketName,
		}, nil
	}

	object, err := client.GetObject(context.TODO(), bucketName, fileName, minio.GetObjectOptions{})
	if err != nil {
		return nil, err
	}
	return &MinioFile{
		Object:     object,
		writer:     NewMemoryFile(nil),
		client:     client,
		fileName:   fileName,
		bucketName: bucketName,
	}, nil
}

View File

@ -0,0 +1,34 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"github.com/milvus-io/milvus/internal/storagev2/io/fs/file"
)
// Fs is the file-system abstraction the storage layer works against.
// NOTE(review): the per-method notes below describe the signatures only; the
// exact semantics (e.g. whether OpenFile creates missing files) depend on the
// implementation — confirm there.
type Fs interface {
// OpenFile returns a file handle for path.
OpenFile(path string) (file.File, error)
// Rename moves src to dst.
Rename(src string, dst string) error
// DeleteFile removes the file at path.
DeleteFile(path string) error
// CreateDir creates the directory path.
CreateDir(path string) error
// List returns the entries found under path.
List(path string) ([]FileEntry, error)
// ReadFile returns the full content of the file at path.
ReadFile(path string) ([]byte, error)
// Exist reports whether path exists.
Exist(path string) (bool, error)
// Path returns the path associated with this file system.
Path() string
// MkdirAll creates dir; i is presumably the permission bits — confirm.
MkdirAll(dir string, i int) error
}
// FileEntry is one element of the result of Fs.List.
type FileEntry struct {
// Path is the path of the listed entry.
Path string
}

View File

@ -0,0 +1,42 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"fmt"
"net/url"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// ErrInvalidFsType is wrapped by BuildFileSystem when the URI scheme does not
// map to a supported file system.
var ErrInvalidFsType = errors.New("invalid fs type")

// BuildFileSystem parses uri and creates the file system selected by its
// scheme: "file" for the local file system and "s3" for S3/MinIO. A URI that
// cannot be parsed, or one with any other scheme, yields an error that wraps
// the cause and mentions the offending URI.
func BuildFileSystem(uri string) (Fs, error) {
	parsedURI, err := url.Parse(uri)
	if err != nil {
		return nil, fmt.Errorf("build file system with uri %s: %w", uri, err)
	}

	// Map the scheme to a file-system type first, then build it in one place.
	var fsType options.FsType
	switch parsedURI.Scheme {
	case "file":
		fsType = options.LocalFS
	case "s3":
		fsType = options.S3
	default:
		return nil, fmt.Errorf("build file system with uri %s: %w", uri, ErrInvalidFsType)
	}
	return NewFsFactory().Create(fsType, parsedURI)
}

View File

@ -0,0 +1,95 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"net/url"
"os"
"path/filepath"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/io/fs/file"
)
// LocalFS is an Fs backed by the local operating-system file system.
type LocalFS struct {
	path string // path taken from the URI the file system was created with
}

// MkdirAll creates dir and any missing parents, using i as the permission bits.
func (l *LocalFS) MkdirAll(dir string, i int) error {
	return os.MkdirAll(dir, os.FileMode(i))
}

// OpenFile opens path for reading and writing. Missing parent directories and
// the file itself are created.
func (l *LocalFS) OpenFile(path string) (file.File, error) {
	// Create the containing directory (including all necessary parents).
	if err := os.MkdirAll(filepath.Dir(path), os.ModePerm); err != nil {
		return nil, err
	}
	open, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o666)
	if err != nil {
		return nil, err
	}
	return file.NewLocalFile(open), nil
}

// Rename renames (moves) src to dst. If dst already exists and is not a
// directory, Rename replaces it.
func (l *LocalFS) Rename(src string, dst string) error {
	return os.Rename(src, dst)
}

// DeleteFile removes the file at path.
func (l *LocalFS) DeleteFile(path string) error {
	return os.Remove(path)
}

// CreateDir creates path and any missing parents. An "already exists" error
// is treated as success; any other failure is logged and returned.
func (l *LocalFS) CreateDir(path string) error {
	if err := os.MkdirAll(path, os.ModePerm); err != nil && !os.IsExist(err) {
		log.Error(err.Error())
		return err
	}
	return nil
}

// List returns one entry per directory entry of path, each holding path
// joined with the entry name. It does not descend into subdirectories.
func (l *LocalFS) List(path string) ([]FileEntry, error) {
	entries, err := os.ReadDir(path)
	if err != nil {
		log.Error(err.Error())
		return nil, err
	}
	ret := make([]FileEntry, 0, len(entries))
	for _, entry := range entries {
		ret = append(ret, FileEntry{Path: filepath.Join(path, entry.Name())})
	}
	return ret, nil
}

// ReadFile returns the whole content of the file at path.
func (l *LocalFS) ReadFile(path string) ([]byte, error) {
	return os.ReadFile(path)
}

// Exist reports whether path exists. A "not exist" result from os.Stat is
// reported as (false, nil); any other stat failure is returned as an error.
func (l *LocalFS) Exist(path string) (bool, error) {
	_, err := os.Stat(path)
	switch {
	case err == nil:
		return true, nil
	case os.IsNotExist(err):
		return false, nil
	default:
		return false, err
	}
}

// Path returns the path this file system was created with.
func (l *LocalFS) Path() string {
	return l.path
}

// NewLocalFs returns a LocalFS whose Path is the path component of uri.
func NewLocalFs(uri *url.URL) *LocalFS {
	return &LocalFS{uri.Path}
}

View File

@ -0,0 +1,78 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"github.com/milvus-io/milvus/internal/storagev2/io/fs/file"
)
// MemoryFs is an in-memory Fs that keeps files in a map keyed by path.
// Directories are not modelled. List, ReadFile and Path are not implemented
// and panic.
type MemoryFs struct {
	files map[string]*file.MemoryFile
}

// MkdirAll is a no-op because directories are not modelled (see CreateDir).
func (m *MemoryFs) MkdirAll(dir string, i int) error {
	return nil
}

// List is not implemented and panics.
func (m *MemoryFs) List(path string) ([]FileEntry, error) {
	// TODO implement me
	panic("implement me")
}

// OpenFile returns a handle for path. For an existing file a fresh handle
// positioned at offset 0 over the stored bytes is returned; otherwise a new
// empty file is registered under path and returned.
func (m *MemoryFs) OpenFile(path string) (file.File, error) {
	if f, ok := m.files[path]; ok {
		return file.NewMemoryFile(f.Bytes()), nil
	}
	f := file.NewMemoryFile(nil)
	m.files[path] = f
	return f, nil
}

// Rename moves the file stored under path to path2, replacing any file
// already there. Renaming a path that does not exist is a no-op.
func (m *MemoryFs) Rename(path string, path2 string) error {
	f, ok := m.files[path]
	if !ok {
		return nil
	}
	m.files[path2] = f
	delete(m.files, path)
	return nil
}

// DeleteFile removes path; deleting a missing path is not an error.
func (m *MemoryFs) DeleteFile(path string) error {
	delete(m.files, path)
	return nil
}

// CreateDir is a no-op because directories are not modelled.
func (m *MemoryFs) CreateDir(path string) error {
	return nil
}

// ReadFile is not implemented and panics.
func (m *MemoryFs) ReadFile(path string) ([]byte, error) {
	panic("implement me")
}

// Exist reports whether a file is stored under path.
func (m *MemoryFs) Exist(path string) (bool, error) {
	_, ok := m.files[path]
	return ok, nil
}

// Path is not implemented and panics.
func (m *MemoryFs) Path() string {
	panic("not implemented")
}

// NewMemoryFs returns an empty MemoryFs.
func NewMemoryFs() *MemoryFs {
	return &MemoryFs{
		files: make(map[string]*file.MemoryFile),
	}
}

View File

@ -0,0 +1,201 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"context"
"fmt"
"io"
"net/url"
"path"
"strings"
"github.com/minio/minio-go/v7"
"github.com/minio/minio-go/v7/pkg/credentials"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/common/errors"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/io/fs/file"
)
// MinioFs is an Fs backed by a MinIO/S3-compatible object store.
type MinioFs struct {
// client is the MinIO client used for every request.
client *minio.Client
// bucketName and path are taken from the host and path of the URI passed
// to NewMinioFs; together they form the value returned by Path.
bucketName string
path string
}
// MkdirAll is a no-op, matching CreateDir: this file system never creates
// directory objects, so there is nothing to do. Both arguments are ignored.
func (fs *MinioFs) MkdirAll(dir string, i int) error {
	return nil
}
// OpenFile opens the object addressed by path, which must have the form
// "bucket/key" (see getRealPath).
func (fs *MinioFs) OpenFile(path string) (file.File, error) {
	err, bucket, key := getRealPath(path)
	if err != nil {
		return nil, err
	}
	f, err := file.NewMinioFile(fs.client, key, bucket)
	if err != nil {
		// Return an untyped nil: passing the (*MinioFile)(nil) result through
		// would yield a non-nil file.File holding a nil pointer.
		return nil, err
	}
	return f, nil
}
// Rename copies the object at src to dst and then removes src. Both paths
// must have the form "bucket/key". A failed copy is returned as an error.
// A failed removal of the source is only logged (best effort), so the source
// object may remain after a successful return.
func (fs *MinioFs) Rename(src string, dst string) error {
	err, dstBucket, dst := getRealPath(dst)
	if err != nil {
		return err
	}
	err, srcBucket, src := getRealPath(src)
	if err != nil {
		return err
	}

	_, err = fs.client.CopyObject(context.TODO(), minio.CopyDestOptions{Bucket: dstBucket, Object: dst}, minio.CopySrcOptions{Bucket: srcBucket, Object: src})
	if err != nil {
		return err
	}

	if err = fs.client.RemoveObject(context.TODO(), srcBucket, src, minio.RemoveObjectOptions{}); err != nil {
		// Record the cause so a leftover source object can be diagnosed.
		log.Warn("failed to remove source object", log.String("source", src), zap.Error(err))
	}
	return nil
}
// DeleteFile removes the object addressed by path ("bucket/key").
func (fs *MinioFs) DeleteFile(path string) error {
	err, bucket, key := getRealPath(path)
	if err == nil {
		err = fs.client.RemoveObject(context.TODO(), bucket, key, minio.RemoveObjectOptions{})
	}
	return err
}
// CreateDir does nothing and always returns nil; path is ignored.
func (fs *MinioFs) CreateDir(path string) error {
return nil
}
// List returns every object whose key starts with the key part of prefix
// ("bucket/keyPrefix"), recursively. Each entry's Path is "bucket/key". The
// first listing error aborts the listing and is returned.
func (fs *MinioFs) List(prefix string) ([]FileEntry, error) {
	err, bucket, prefix := getRealPath(prefix)
	if err != nil {
		return nil, err
	}

	// ListObjects feeds its channel from a background goroutine. Cancelling the
	// context on return stops that goroutine when we bail out early on an error
	// instead of leaving it blocked on an undrained channel.
	ctx, cancel := context.WithCancel(context.TODO())
	defer cancel()

	ret := make([]FileEntry, 0)
	for objInfo := range fs.client.ListObjects(ctx, bucket, minio.ListObjectsOptions{Prefix: prefix, Recursive: true}) {
		if objInfo.Err != nil {
			log.Warn("list object error", zap.Error(objInfo.Err))
			return nil, objInfo.Err
		}
		ret = append(ret, FileEntry{Path: path.Join(bucket, objInfo.Key)})
	}
	return ret, nil
}
// ReadFile returns the full content of the object addressed by path
// ("bucket/key"). It fails if fewer bytes than the object's reported size
// could be read.
func (fs *MinioFs) ReadFile(path string) ([]byte, error) {
	err, bucket, path := getRealPath(path)
	if err != nil {
		return nil, err
	}
	obj, err := fs.client.GetObject(context.TODO(), bucket, path, minio.GetObjectOptions{})
	if err != nil {
		return nil, err
	}
	// Release the object handle on every return path.
	defer obj.Close()

	stat, err := obj.Stat()
	if err != nil {
		return nil, err
	}

	buf := make([]byte, stat.Size)
	// A single Read may legally return fewer bytes than requested; ReadFull
	// keeps reading until the buffer is filled or the stream ends.
	n, err := io.ReadFull(obj, buf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return nil, err
	}
	if n != int(stat.Size) {
		return nil, fmt.Errorf("failed to read full file, expect: %d, actual: %d", stat.Size, n)
	}
	return buf, nil
}
// Exist reports whether the object addressed by path ("bucket/key") exists.
// A "NoSuchKey" response is reported as (false, nil); any other StatObject
// failure is returned as an error.
func (fs *MinioFs) Exist(path string) (bool, error) {
	err, bucket, key := getRealPath(path)
	if err != nil {
		return false, err
	}

	_, err = fs.client.StatObject(context.TODO(), bucket, key, minio.StatObjectOptions{})
	switch {
	case err == nil:
		return true, nil
	case minio.ToErrorResponse(err).Code == "NoSuchKey":
		return false, nil
	default:
		return false, err
	}
}
// Path returns the bucket name joined with the configured path, with the
// path's leading "/" removed first.
func (fs *MinioFs) Path() string {
	relative := strings.TrimPrefix(fs.path, "/")
	return path.Join(fs.bucketName, relative)
}
// NewMinioFs creates a MinioFs from uri, which should look like
// s3://username:password@bucket/path?endpoint_override=localhost%3A9000
//
// The user info supplies the static credentials, the host is the bucket name
// and the first endpoint-override query value is the server address. A
// missing endpoint yields errors.ErrNoEndpoint. The bucket is created when it
// does not exist yet.
func NewMinioFs(uri *url.URL) (*MinioFs, error) {
	secretAccessKey, hasSecret := uri.User.Password()
	if !hasSecret {
		log.Warn("secret access key not set")
	}

	endpoints := uri.Query()[constant.EndpointOverride]
	if len(endpoints) == 0 {
		return nil, errors.ErrNoEndpoint
	}
	endpoint := endpoints[0]

	cli, err := minio.New(endpoint, &minio.Options{
		BucketLookup: minio.BucketLookupAuto,
		Creds:        credentials.NewStaticV4(uri.User.Username(), secretAccessKey, ""),
	})
	if err != nil {
		return nil, err
	}

	bucket, rootPath := uri.Host, uri.Path
	log.Info("minio fs infos", zap.String("endpoint", endpoint), zap.String("bucket", bucket), zap.String("path", rootPath))

	exist, err := cli.BucketExists(context.TODO(), bucket)
	if err != nil {
		return nil, err
	}
	if !exist {
		if err = cli.MakeBucket(context.TODO(), bucket, minio.MakeBucketOptions{}); err != nil {
			return nil, err
		}
	}

	return &MinioFs{client: cli, bucketName: bucket, path: rootPath}, nil
}
// getRealPath splits a "bucket/key" path at its first '/' and returns
// (nil, bucket, key). The path must not start with '/' and must contain at
// least one '/'; otherwise an error and two empty strings are returned.
// Note the unusual result order: the error comes first.
func getRealPath(path string) (error, string, string) {
	if strings.HasPrefix(path, "/") {
		return fmt.Errorf("Invalid path, %s should not start with '/'", path), "", ""
	}
	bucket, key, found := strings.Cut(path, "/")
	if !found {
		return fmt.Errorf("Invalid path, %s should contains at least one '/'", path), "", ""
	}
	return nil, bucket, key
}

View File

@ -0,0 +1,95 @@
#pragma once
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef ARROW_C_DATA_INTERFACE
#define ARROW_C_DATA_INTERFACE
// Flag bits; the values are distinct powers of two, presumably so they can
// be OR-ed together in ArrowSchema.flags — confirm against the Arrow C data
// interface specification.
#define ARROW_FLAG_DICTIONARY_ORDERED 1
#define ARROW_FLAG_NULLABLE 2
#define ARROW_FLAG_MAP_KEYS_SORTED 4
// ArrowSchema is the type-description half of the Arrow C data interface.
// The field order and types define the binary layout shared with other
// producers/consumers and must not be changed.
struct ArrowSchema {
// Array type description
const char* format;
const char* name;
const char* metadata;
int64_t flags;
int64_t n_children;
struct ArrowSchema** children;
struct ArrowSchema* dictionary;
// Release callback
void (*release)(struct ArrowSchema*);
// Opaque producer-specific data
void* private_data;
};
// ArrowArray is the data half of the Arrow C data interface. The field order
// and types define the binary layout shared with other producers/consumers
// and must not be changed.
struct ArrowArray {
// Array data description
int64_t length;
int64_t null_count;
int64_t offset;
int64_t n_buffers;
int64_t n_children;
const void** buffers;
struct ArrowArray** children;
struct ArrowArray* dictionary;
// Release callback
void (*release)(struct ArrowArray*);
// Opaque producer-specific data
void* private_data;
};
#endif // ARROW_C_DATA_INTERFACE
#ifndef ARROW_C_STREAM_INTERFACE
#define ARROW_C_STREAM_INTERFACE
// ArrowArrayStream is a pull-style stream of ArrowArray values that all share
// one schema. It consists solely of callbacks plus producer-private data; the
// field order defines the binary layout and must not be changed.
struct ArrowArrayStream {
// Callback to get the stream type
// (will be the same for all arrays in the stream).
//
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
//
// If successful, the ArrowSchema must be released independently from the stream.
int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
// Callback to get the next array
// (if no error and the array is released, the stream has ended)
//
// Return value: 0 if successful, an `errno`-compatible error code otherwise.
//
// If successful, the ArrowArray must be released independently from the stream.
int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
// Callback to get optional detailed error information.
// This must only be called if the last stream operation failed
// with a non-0 return code.
//
// Return value: pointer to a null-terminated character array describing
// the last error, or NULL if no description is available.
//
// The returned pointer is only valid until the next operation on this stream
// (including release).
const char* (*get_last_error)(struct ArrowArrayStream*);
// Release callback: release the stream's own resources.
// Note that arrays returned by `get_next` must be individually released.
void (*release)(struct ArrowArrayStream*);
// Opaque producer-specific data
void* private_data;
};
#endif // ARROW_C_STREAM_INTERFACE
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,115 @@
#pragma once
#include <assert.h>
#include <string.h>
#include "arrow/c/abi.h"
#ifdef __cplusplus
extern "C" {
#endif
/// Query whether the C schema is released.
///
/// Returns non-zero when the schema's release callback is NULL, which is the
/// marker for "released", and 0 otherwise.
static inline int
ArrowSchemaIsReleased(const struct ArrowSchema* schema) {
return schema->release == NULL;
}
/// Mark the C schema released (for use in release callbacks).
///
/// Only clears the release callback; it frees nothing itself.
static inline void
ArrowSchemaMarkReleased(struct ArrowSchema* schema) {
schema->release = NULL;
}
/// Move the C schema from `src` to `dest`
///
/// Note `dest` must *not* point to a valid schema already, otherwise there
/// will be a memory leak.
static inline void
ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) {
assert(dest != src);
assert(!ArrowSchemaIsReleased(src));
memcpy(dest, src, sizeof(struct ArrowSchema));
ArrowSchemaMarkReleased(src);
}
/// Release the C schema, if necessary, by calling its release callback.
///
/// Does nothing for an already released schema. The callback is expected to
/// mark the schema released; this is checked with an assertion.
static inline void
ArrowSchemaRelease(struct ArrowSchema* schema) {
    if (schema->release == NULL) {
        return;
    }
    schema->release(schema);
    assert(schema->release == NULL);
}
/// Query whether the C array is released.
///
/// Returns non-zero when the array's release callback is NULL, which is the
/// marker for "released", and 0 otherwise.
static inline int
ArrowArrayIsReleased(const struct ArrowArray* array) {
return array->release == NULL;
}
/// Mark the C array released (for use in release callbacks).
///
/// Only clears the release callback; it frees nothing itself.
static inline void
ArrowArrayMarkReleased(struct ArrowArray* array) {
array->release = NULL;
}
/// Move the C array from `src` to `dest`
///
/// Note `dest` must *not* point to a valid array already, otherwise there
/// will be a memory leak.
static inline void
ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) {
assert(dest != src);
assert(!ArrowArrayIsReleased(src));
memcpy(dest, src, sizeof(struct ArrowArray));
ArrowArrayMarkReleased(src);
}
/// Release the C array, if necessary, by calling its release callback.
///
/// Does nothing for an already released array. The callback is expected to
/// mark the array released; this is checked with an assertion.
static inline void
ArrowArrayRelease(struct ArrowArray* array) {
    if (array->release == NULL) {
        return;
    }
    array->release(array);
    assert(array->release == NULL);
}
/// Query whether the C array stream is released.
///
/// Returns non-zero when the stream's release callback is NULL, which is the
/// marker for "released", and 0 otherwise.
static inline int
ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) {
return stream->release == NULL;
}
/// Mark the C array stream released (for use in release callbacks).
///
/// Only clears the release callback; it frees nothing itself.
static inline void
ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) {
stream->release = NULL;
}
/// Move the C array stream from `src` to `dest`
///
/// Note `dest` must *not* point to a valid stream already, otherwise there
/// will be a memory leak.
static inline void
ArrowArrayStreamMove(struct ArrowArrayStream* src,
struct ArrowArrayStream* dest) {
assert(dest != src);
assert(!ArrowArrayStreamIsReleased(src));
memcpy(dest, src, sizeof(struct ArrowArrayStream));
ArrowArrayStreamMarkReleased(src);
}
/// Release the C array stream, if necessary, by calling its release callback.
///
/// Does nothing for an already released stream. The callback is expected to
/// mark the stream released; this is checked with an assertion.
static inline void
ArrowArrayStreamRelease(struct ArrowArrayStream* stream) {
    if (stream->release == NULL) {
        return;
    }
    stream->release(stream);
    assert(stream->release == NULL);
}
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,84 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package packed
/*
#cgo pkg-config: milvus_core
#include <stdlib.h>
#include "segcore/packed_reader_c.h"
#include "arrow/c/abi.h"
#include "arrow/c/helpers.h"
*/
import "C"
import (
"fmt"
"unsafe"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/cdata"
"github.com/cockroachdb/errors"
)
// NewPackedReader creates a packed reader over path through the C API.
// schema is exported to its C representation and handed to the C side
// together with bufferSize. A non-zero status from C.NewPackedReader is
// returned as an error that includes the path and the status code.
func NewPackedReader(path string, schema *arrow.Schema, bufferSize int) (*PackedReader, error) {
	var exported cdata.CArrowSchema
	cdata.ExportArrowSchema(schema, &exported)

	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	var handle C.CPackedReader
	status := C.NewPackedReader(
		cPath,
		(*C.struct_ArrowSchema)(unsafe.Pointer(&exported)),
		C.int64_t(bufferSize),
		&handle,
	)
	if status != 0 {
		return nil, fmt.Errorf("failed to new packed reader: %s, status: %d", path, status)
	}
	return &PackedReader{cPackedReader: handle, schema: schema}, nil
}
// ReadNext fetches the next record batch from the C reader and imports it as
// an arrow.Record. It returns (nil, nil) when the C side hands back no array,
// which marks the end of the stream. A non-zero C status or a failed import
// is returned as an error.
func (pr *PackedReader) ReadNext() (arrow.Record, error) {
	var (
		cArr    C.CArrowArray
		cSchema C.CArrowSchema
	)
	if status := C.ReadNext(pr.cPackedReader, &cArr, &cSchema); status != 0 {
		return nil, fmt.Errorf("ReadNext failed with error code %d", status)
	}
	if cArr == nil {
		// End of stream: no more records to read.
		return nil, nil
	}

	// Reinterpret the C structs as cdata types and import them as a Go record.
	recordBatch, err := cdata.ImportCRecordBatch(
		(*cdata.CArrowArray)(unsafe.Pointer(cArr)),
		(*cdata.CArrowSchema)(unsafe.Pointer(cSchema)),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to convert ArrowArray to Record: %w", err)
	}
	return recordBatch, nil
}
// Close closes the underlying C reader and reports a generic error when the
// C side returns a non-zero status.
func (pr *PackedReader) Close() error {
	if status := C.CloseReader(pr.cPackedReader); status != 0 {
		return errors.New("PackedReader: failed to close file")
	}
	return nil
}

View File

@ -0,0 +1,156 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package packed
import (
"testing"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/stretchr/testify/suite"
"golang.org/x/exp/rand"
)
func TestPackedReadAndWrite(t *testing.T) {
suite.Run(t, new(PackedTestSuite))
}
// PackedTestSuite exercises the packed writer and reader round trip.
type PackedTestSuite struct {
suite.Suite
// schema and rec are rebuilt by SetupTest before every test: a three-column
// schema (int32, int64, string) and a three-row record matching it.
schema *arrow.Schema
rec arrow.Record
}
// SetupTest runs before every test. It stores a three-column schema
// (a: int32, b: int64, c: string) and a three-row record built from it on
// the suite.
func (suite *PackedTestSuite) SetupTest() {
	fields := []arrow.Field{
		{Name: "a", Type: arrow.PrimitiveTypes.Int32},
		{Name: "b", Type: arrow.PrimitiveTypes.Int64},
		{Name: "c", Type: arrow.BinaryTypes.String},
	}
	schema := arrow.NewSchema(fields, nil)
	suite.schema = schema

	builder := array.NewRecordBuilder(memory.DefaultAllocator, schema)
	defer builder.Release()

	// Fill the columns in schema order.
	builder.Field(0).(*array.Int32Builder).AppendValues([]int32{1, 2, 3}, nil)
	builder.Field(1).(*array.Int64Builder).AppendValues([]int64{4, 5, 6}, nil)
	builder.Field(2).(*array.StringBuilder).AppendValues([]string{"a", "b", "c"}, nil)

	suite.rec = builder.NewRecord()
}
// TestPackedOneFile writes the three-row fixture record 100 times and checks
// that the first batch read back contains all 300 rows.
func (suite *PackedTestSuite) TestPackedOneFile() {
	batches := 100
	path := "/tmp"
	bufferSize := 10 * 1024 * 1024 // 10MB

	// Require (fatal) assertions guard every value that is dereferenced later,
	// so a failure is reported instead of turning into a nil-pointer panic.
	pw, err := NewPackedWriter(path, suite.schema, bufferSize)
	suite.Require().NoError(err)
	for i := 0; i < batches; i++ {
		err = pw.WriteRecordBatch(suite.rec)
		suite.Require().NoError(err)
	}
	err = pw.Close()
	suite.Require().NoError(err)

	reader, err := NewPackedReader(path, suite.schema, bufferSize)
	suite.Require().NoError(err)
	// Close the reader so the native handle is not leaked. Deferred calls run
	// last-in-first-out, so the record below is released before this runs.
	defer func() { suite.NoError(reader.Close()) }()

	rr, err := reader.ReadNext()
	suite.Require().NoError(err)
	suite.Require().NotNil(rr)
	defer rr.Release()
	suite.Equal(int64(3*batches), rr.NumRows())
}
// TestPackedMultiFiles writes a 30-row record with 1000-character random
// strings `batches` times, then drains the reader and checks that the total
// number of rows read equals the number written.
//
// Each record returned by ReadNext is released once its rows are counted,
// matching TestPackedOneFile; previously they were never released. Setup and
// read errors are fatal (Require) so the test cannot continue with a nil
// writer or reader.
func (suite *PackedTestSuite) TestPackedMultiFiles() {
	batches := 1000
	strLen := 1000
	arrLen := 30

	b := array.NewRecordBuilder(memory.DefaultAllocator, suite.schema)
	defer b.Release()

	int32Values := make([]int32, arrLen)
	int64Values := make([]int64, arrLen)
	strValues := make([]string, arrLen)
	for i := 0; i < arrLen; i++ {
		int32Values[i] = int32(i + 1)
		int64Values[i] = int64(i + 1)
		strValues[i] = randomString(strLen)
	}
	// Column builders are addressed by their position in the schema.
	b.Field(0).(*array.Int32Builder).AppendValues(int32Values, nil)
	b.Field(1).(*array.Int64Builder).AppendValues(int64Values, nil)
	b.Field(2).(*array.StringBuilder).AppendValues(strValues, nil)

	rec := b.NewRecord()
	defer rec.Release()

	path := "/tmp"
	bufferSize := 10 * 1024 * 1024 // 10MB

	pw, err := NewPackedWriter(path, suite.schema, bufferSize)
	suite.Require().NoError(err)
	for i := 0; i < batches; i++ {
		suite.Require().NoError(pw.WriteRecordBatch(rec))
	}
	suite.Require().NoError(pw.Close())

	reader, err := NewPackedReader(path, suite.schema, bufferSize)
	suite.Require().NoError(err)

	var rows int64
	for {
		rr, err := reader.ReadNext()
		suite.Require().NoError(err)
		if rr == nil {
			// end of file
			break
		}
		rows += rr.NumRows()
		rr.Release()
	}
	suite.Equal(int64(arrLen*batches), rows)
}
// randomString returns a string of exactly `length` bytes, each drawn at
// random from the 62 ASCII digits and letters.
func randomString(length int) string {
	const charset = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	buf := make([]byte, 0, length)
	for len(buf) < length {
		buf = append(buf, charset[rand.Intn(len(charset))])
	}
	return string(buf)
}

View File

@ -0,0 +1,77 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package packed
/*
#cgo pkg-config: milvus_core
#include <stdlib.h>
#include "segcore/packed_writer_c.h"
#include "arrow/c/abi.h"
#include "arrow/c/helpers.h"
*/
import "C"
import (
"fmt"
"unsafe"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/cdata"
"github.com/cockroachdb/errors"
)
// NewPackedWriter creates a PackedWriter through the C NewPackedWriter entry
// point.
//
// path is handed to C as a temporary C string (freed on return), schema is
// exported through the Arrow C Data Interface, and bufferSize is forwarded
// as an int64. A non-zero status from C is returned as an error carrying the
// path and the status code.
func NewPackedWriter(path string, schema *arrow.Schema, bufferSize int) (*PackedWriter, error) {
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	var exported cdata.CArrowSchema
	cdata.ExportArrowSchema(schema, &exported)

	var handle C.CPackedWriter
	status := C.NewPackedWriter(
		cPath,
		(*C.struct_ArrowSchema)(unsafe.Pointer(&exported)),
		C.int64_t(bufferSize),
		&handle,
	)
	if status != 0 {
		return nil, fmt.Errorf("failed to new packed writer: %s, status: %d", path, status)
	}
	return &PackedWriter{cPackedWriter: handle}, nil
}
// WriteRecordBatch exports recordBatch (array and schema) through the Arrow
// C Data Interface and passes both to the C WriteRecordBatch function.
// It returns an error when the C call reports a non-zero status.
func (pw *PackedWriter) WriteRecordBatch(recordBatch arrow.Record) error {
	var (
		exportedArr    cdata.CArrowArray
		exportedSchema cdata.CArrowSchema
	)
	cdata.ExportArrowRecordBatch(recordBatch, &exportedArr, &exportedSchema)

	status := C.WriteRecordBatch(
		pw.cPackedWriter,
		(*C.struct_ArrowArray)(unsafe.Pointer(&exportedArr)),
		(*C.struct_ArrowSchema)(unsafe.Pointer(&exportedSchema)),
	)
	if status == 0 {
		return nil
	}
	return errors.New("PackedWriter: failed to write record batch")
}
// Close calls the C CloseWriter function on the underlying writer handle and
// returns an error when it reports a non-zero status.
func (pw *PackedWriter) Close() error {
	if status := C.CloseWriter(pw.cPackedWriter); status != 0 {
		return errors.New("PackedWriter: failed to close file")
	}
	return nil
}

View File

@ -0,0 +1,46 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package packed
/*
#include <stdlib.h>
#include "arrow/c/abi.h"
#include "arrow/c/helpers.h"
#include "segcore/packed_reader_c.h"
#include "segcore/packed_writer_c.h"
*/
import "C"
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/cdata"
)
// PackedWriter is the Go-side wrapper around a C packed-writer handle.
type PackedWriter struct {
// cPackedWriter is the C handle passed to the C writer functions.
cPackedWriter C.CPackedWriter
}
// PackedReader is the Go-side wrapper around a C packed-reader handle.
type PackedReader struct {
// cPackedReader is the C reader handle.
cPackedReader C.CPackedReader
// NOTE(review): presumably the C array of the most recently read batch — confirm against the reader implementation.
arr *cdata.CArrowArray
// NOTE(review): presumably the schema the reader was opened with — confirm against the reader implementation.
schema *arrow.Schema
}
// Go aliases for the Arrow C Data Interface structs.
type (
// CArrowSchema is the C Data Interface for ArrowSchemas
CArrowSchema = C.struct_ArrowSchema
// CArrowArray is the C Data Interface object for Arrow Arrays as defined in abi.h
CArrowArray = C.struct_ArrowArray
)

View File

@ -0,0 +1,65 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package commonreader
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
// DeleteReader wraps a record reader together with the delete fragments,
// schema options and read options it is constructed with.
//
// The type is a placeholder: every method below panics with "implement me".
type DeleteReader struct {
// recordReader is the wrapped upstream reader.
recordReader array.RecordReader
schemaOptions *schema.SchemaOptions
deleteFragments fragment.DeleteFragmentVector
options *options.ReadOptions
}
// Retain is not implemented; it panics.
func (d DeleteReader) Retain() {
// TODO implement me
panic("implement me")
}
// Release is not implemented; it panics.
func (d DeleteReader) Release() {
// TODO implement me
panic("implement me")
}
// Schema is not implemented; it panics.
func (d DeleteReader) Schema() *arrow.Schema {
// TODO implement me
panic("implement me")
}
// Next is not implemented; it panics.
func (d DeleteReader) Next() bool {
// TODO implement me
panic("implement me")
}
// Record is not implemented; it panics.
func (d DeleteReader) Record() arrow.Record {
// TODO implement me
panic("implement me")
}
// Err is not implemented; it panics.
func (d DeleteReader) Err() error {
// TODO implement me
panic("implement me")
}
// NewDeleteReader returns a DeleteReader holding the given arguments; it does
// no other work. The returned reader's methods all panic (see DeleteReader).
func NewDeleteReader(recordReader array.RecordReader, schemaOptions *schema.SchemaOptions, deleteFragments fragment.DeleteFragmentVector, options *options.ReadOptions) *DeleteReader {
return &DeleteReader{recordReader: recordReader, schemaOptions: schemaOptions, deleteFragments: deleteFragments, options: options}
}

View File

@ -0,0 +1,84 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package commonreader
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// FilterReader wraps a record reader together with read options.
//
// The type is a placeholder: Next always returns false and every other
// method panics with "implement me".
type FilterReader struct {
// recordReader is the wrapped upstream reader.
recordReader array.RecordReader
option *options.ReadOptions
// currentFilteredBatchReader is referenced only by the commented-out draft in Next.
currentFilteredBatchReader array.RecordReader
}
// Retain is not implemented; it panics.
func (r *FilterReader) Retain() {
// TODO implement me
panic("implement me")
}
// Release is not implemented; it panics.
func (r *FilterReader) Release() {
// TODO implement me
panic("implement me")
}
// Schema is not implemented; it panics.
func (r *FilterReader) Schema() *arrow.Schema {
// TODO implement me
panic("implement me")
}
// Record is not implemented; it panics.
func (r *FilterReader) Record() arrow.Record {
// TODO implement me
panic("implement me")
}
// Err is not implemented; it panics.
func (r *FilterReader) Err() error {
// TODO implement me
panic("implement me")
}
// MakeFilterReader returns a FilterReader holding recordReader and option.
// currentFilteredBatchReader is left nil.
func MakeFilterReader(recordReader array.RecordReader, option *options.ReadOptions) *FilterReader {
return &FilterReader{
recordReader: recordReader,
option: option,
}
}
// Next currently always returns false, so the reader yields no records.
// The commented-out body is an unfinished draft of the intended loop; it
// refers to identifiers (err, NextFilteredBatchReader) not defined alongside
// this type.
func (r *FilterReader) Next() bool {
//for {
// if r.currentFilteredBatchReader != nil {
// filteredBatch := r.currentFilteredBatchReader.Next()
// if err != nil {
// return false
// }
// if filteredBatch == nil {
// r.currentFilteredBatchReader = nil
// continue
// }
// return filteredBatch, nil
// }
// err := r.NextFilteredBatchReader()
// if err != nil {
// return nil
// }
// if r.currentFilteredBatchReader == nil {
// return nil
// }
//}
return false
}

View File

@ -0,0 +1,35 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package commonreader
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/milvus-io/milvus/internal/storagev2/common/utils"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// ProjectionReader pairs an upstream reader with a projected schema.
//
// It satisfies array.RecordReader only through the embedded interface; it
// declares no methods of its own here.
type ProjectionReader struct {
// Embedded to supply the array.RecordReader method set.
array.RecordReader
// reader is the wrapped upstream reader.
reader array.RecordReader
options *options.ReadOptions
// schema is the projected schema computed by NewProjectionReader.
schema *arrow.Schema
}
// NewProjectionReader projects schema onto options.Columns and returns a
// ProjectionReader holding the upstream reader, the options and the
// projected schema.
//
// NOTE(review): the embedded array.RecordReader field is not set here, so it
// stays nil and calling any RecordReader method on the result will panic
// with a nil dereference. options must be non-nil (options.Columns is read).
func NewProjectionReader(reader array.RecordReader, options *options.ReadOptions, schema *arrow.Schema) array.RecordReader {
projectionSchema := utils.ProjectSchema(schema, options.Columns)
return &ProjectionReader{reader: reader, options: options, schema: projectionSchema}
}

View File

@ -0,0 +1,49 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package recordreader
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
// FilterQueryRecordReader is a placeholder type: it only declares the state a
// filter-query reader would hold and has no methods here.
type FilterQueryRecordReader struct {
// TODO implement me
ref int64
schema *schema.Schema
options *options.ReadOptions
fs fs.Fs
scalarFragment fragment.FragmentVector
vectorFragment fragment.FragmentVector
deleteFragments fragment.DeleteFragmentVector
record arrow.Record
}
// NewFilterQueryReader is not implemented; it panics with "implement me"
// regardless of its arguments.
func NewFilterQueryReader(
s *schema.Schema,
options *options.ReadOptions,
f fs.Fs,
scalarFragment fragment.FragmentVector,
vectorFragment fragment.FragmentVector,
deleteFragments fragment.DeleteFragmentVector,
) array.RecordReader {
// TODO implement me
panic("implement me")
}

View File

@ -0,0 +1,77 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package recordreader
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
// MergeRecordReader declares the state of a reader over scalar, vector and
// delete fragments.
//
// The type is a placeholder: its constructor and every method panic with
// "implement me".
type MergeRecordReader struct {
ref int64
schema *schema.Schema
options *options.ReadOptions
fs fs.Fs
scalarFragments fragment.FragmentVector
vectorFragments fragment.FragmentVector
deleteFragments fragment.DeleteFragmentVector
record arrow.Record
}
// Retain is not implemented; it panics.
func (m MergeRecordReader) Retain() {
// TODO implement me
panic("implement me")
}
// Release is not implemented; it panics.
func (m MergeRecordReader) Release() {
// TODO implement me
panic("implement me")
}
// Schema is not implemented; it panics.
func (m MergeRecordReader) Schema() *arrow.Schema {
// TODO implement me
panic("implement me")
}
// Next is not implemented; it panics.
func (m MergeRecordReader) Next() bool {
// TODO implement me
panic("implement me")
}
// Record is not implemented; it panics.
func (m MergeRecordReader) Record() arrow.Record {
// TODO implement me
panic("implement me")
}
// Err is not implemented; it panics.
func (m MergeRecordReader) Err() error {
// TODO implement me
panic("implement me")
}
// NewMergeRecordReader is not implemented; it panics with "implement me"
// regardless of its arguments.
func NewMergeRecordReader(
s *schema.Schema,
options *options.ReadOptions,
f fs.Fs,
scalarFragment fragment.FragmentVector,
vectorFragment fragment.FragmentVector,
deleteFragments fragment.DeleteFragmentVector,
) *MergeRecordReader {
// TODO implement me
panic("implement me")
}

View File

@ -0,0 +1,119 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package recordreader
import (
"sync/atomic"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/parquet/pqarrow"
"github.com/milvus-io/milvus/internal/storagev2/common/arrowutil"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// MultiFilesSequentialReader reads a list of files one after another,
// exposing them as a single record reader.
type MultiFilesSequentialReader struct {
fs fs.Fs
// schema is the value returned by Schema.
schema *arrow.Schema
// files are the paths to read, in order.
files []string
// nextPos is the index in files of the next file to open.
nextPos int
options *options.ReadOptions
// currReader reads the currently open file; nil when no file is open.
currReader array.RecordReader
// err is the first error recorded by Next/nextReader, returned by Err.
err error
// ref is the reference count adjusted atomically by Retain/Release.
ref int64
}
// Retain atomically increments the reference count.
func (m *MultiFilesSequentialReader) Retain() {
atomic.AddInt64(&m.ref, 1)
}
// Release atomically decrements the reference count. When it reaches zero,
// the currently open per-file reader (if any) is released and cleared.
func (m *MultiFilesSequentialReader) Release() {
	if atomic.AddInt64(&m.ref, -1) != 0 {
		return
	}
	if m.currReader == nil {
		return
	}
	m.currReader.Release()
	m.currReader = nil
}
// Schema returns the schema the reader was constructed with.
func (m *MultiFilesSequentialReader) Schema() *arrow.Schema {
return m.schema
}
// Next advances to the next record, moving on to the next file whenever the
// current one is exhausted.
//
// It returns true when a record is available through Record. It returns
// false when all files are consumed, or when opening or reading a file
// failed; in the failure case the error is available through Err.
func (m *MultiFilesSequentialReader) Next() bool {
	for {
		if m.currReader == nil {
			if m.nextPos >= len(m.files) {
				return false
			}
			if m.nextReader(); m.err != nil {
				return false
			}
			// Only advance once the file at nextPos opened successfully.
			m.nextPos++
		}
		if m.currReader.Next() {
			return true
		}
		if readErr := m.currReader.Err(); readErr != nil {
			m.err = readErr
			return false
		}
		// Current file is exhausted without error: drop it and try the next.
		m.currReader.Release()
		m.currReader = nil
	}
}
// Record returns the current record of the open per-file reader, or nil when
// no file is currently open.
func (m *MultiFilesSequentialReader) Record() arrow.Record {
	if m.currReader == nil {
		return nil
	}
	return m.currReader.Record()
}
// Err returns the error recorded while opening or reading a file, if any.
func (m *MultiFilesSequentialReader) Err() error {
return m.err
}
// nextReader opens the file at m.files[m.nextPos] and installs a record
// reader for it as m.currReader. The outcome is stored in m.err; nextPos is
// not modified here.
func (m *MultiFilesSequentialReader) nextReader() {
	var (
		fileReader *pqarrow.FileReader
		openErr    error
	)
	fileReader, openErr = arrowutil.MakeArrowFileReader(m.fs, m.files[m.nextPos])
	m.err = openErr
	if openErr != nil {
		return
	}
	m.currReader, m.err = arrowutil.MakeArrowRecordReader(fileReader, m.options)
}
// NewMultiFilesSequentialReader flattens the file lists of all fragments, in
// order, and returns a reader positioned before the first file with a
// reference count of 1.
func NewMultiFilesSequentialReader(fs fs.Fs, fragments fragment.FragmentVector, schema *arrow.Schema, options *options.ReadOptions) *MultiFilesSequentialReader {
	paths := make([]string, 0, len(fragments))
	for _, frag := range fragments {
		paths = append(paths, frag.Files()...)
	}

	r := &MultiFilesSequentialReader{ref: 1}
	r.fs = fs
	r.schema = schema
	r.options = options
	r.files = paths
	return r
}

View File

@ -0,0 +1,93 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package recordreader
import (
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/filter"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/manifest"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
// MakeRecordReader picks the reader implementation for a read request.
//
// The "related" columns are options.Columns plus the column of every entry
// in options.Filters.
//   - If they are all non-vector columns, a ScanRecordReader over the scalar
//     fragments is returned (this also covers an empty column list).
//   - Otherwise, if they are all among the vector/primary/version columns, a
//     ScanRecordReader over the vector fragments is returned.
//   - Otherwise, if options.Filters is non-empty and every filter in
//     options.FiltersV2 targets the primary or version column, a
//     MergeRecordReader is returned.
//   - Otherwise a filter-query reader is returned.
//
// NOTE(review): the merge branch checks the length of Filters but inspects
// FiltersV2 — confirm the two are kept in sync by the options type.
func MakeRecordReader(
	m *manifest.Manifest,
	s *schema.Schema,
	f fs.Fs,
	deleteFragments fragment.DeleteFragmentVector,
	options *options.ReadOptions,
) array.RecordReader {
	related := make([]string, 0)
	related = append(related, options.Columns...)
	for _, flt := range options.Filters {
		related = append(related, flt.GetColumnName())
	}

	scalarData, vectorData := m.GetScalarFragments(), m.GetVectorFragments()

	// Scalar takes precedence when both predicates hold.
	switch {
	case onlyContainScalarColumns(s, related):
		return NewScanRecordReader(s, options, f, scalarData, deleteFragments)
	case onlyContainVectorColumns(s, related):
		return NewScanRecordReader(s, options, f, vectorData, deleteFragments)
	}

	if len(options.Filters) > 0 && filtersOnlyContainPKAndVersion(s, options.FiltersV2) {
		return NewMergeRecordReader(s, options, f, scalarData, vectorData, deleteFragments)
	}
	return NewFilterQueryReader(s, options, f, scalarData, vectorData, deleteFragments)
}
// onlyContainVectorColumns reports whether every name in relatedColumns is
// the schema's vector, primary or version column. An empty list yields true.
func onlyContainVectorColumns(schema *schema.Schema, relatedColumns []string) bool {
	for _, name := range relatedColumns {
		opts := schema.Options()
		switch name {
		case opts.VectorColumn, opts.PrimaryColumn, opts.VersionColumn:
			continue
		default:
			return false
		}
	}
	return true
}
// onlyContainScalarColumns reports whether none of relatedColumns is the
// schema's vector column. An empty list yields true.
func onlyContainScalarColumns(schema *schema.Schema, relatedColumns []string) bool {
	for i := range relatedColumns {
		if relatedColumns[i] == schema.Options().VectorColumn {
			return false
		}
	}
	return true
}
// filtersOnlyContainPKAndVersion reports whether every filter targets either
// the schema's primary column or its version column. An empty list yields
// true.
func filtersOnlyContainPKAndVersion(s *schema.Schema, filters []filter.Filter) bool {
	for _, flt := range filters {
		name := flt.GetColumnName()
		if name == s.Options().PrimaryColumn || name == s.Options().VersionColumn {
			continue
		}
		return false
	}
	return true
}
// MakeScanDeleteReader returns a sequential reader over the manifest's delete
// fragments, using the delete schema derived from the manifest's schema and
// freshly created default read options.
func MakeScanDeleteReader(manifest *manifest.Manifest, fs fs.Fs) array.RecordReader {
	deletes := manifest.GetDeleteFragments()
	deleteSchema := manifest.GetSchema().DeleteSchema()
	return NewMultiFilesSequentialReader(fs, deletes, deleteSchema, options.NewReadOptions())
}

View File

@ -0,0 +1,151 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package recordreader
import (
"io"
"sync/atomic"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/common/utils"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/io/format"
"github.com/milvus-io/milvus/internal/storagev2/io/format/parquet"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/reader/commonreader"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
// ScanRecordReader reads the files of a set of data fragments one after
// another and exposes them as a single record reader.
type ScanRecordReader struct {
// ref is the reference count adjusted atomically by Retain/Release.
ref int64
schema *schema.Schema
options *options.ReadOptions
fs fs.Fs
// dataFragments supplies the files that Next iterates over.
dataFragments fragment.FragmentVector
deleteFragments fragment.DeleteFragmentVector
// rec is the record returned by Record; Next releases it before advancing.
rec arrow.Record
// curReader reads the currently open file; nil when no file is open.
curReader format.Reader
// NOTE(review): reader is not assigned or read by any method defined alongside this type.
reader array.RecordReader
// nextPos is the index of the next data file to open.
nextPos int
// err is the error recorded by Next, returned by Err.
err error
}
// NewScanRecordReader returns a ScanRecordReader over dataFragments with a
// reference count of 1. No file is opened until Next is called.
func NewScanRecordReader(
	s *schema.Schema,
	options *options.ReadOptions,
	f fs.Fs,
	dataFragments fragment.FragmentVector,
	deleteFragments fragment.DeleteFragmentVector,
) *ScanRecordReader {
	r := new(ScanRecordReader)
	r.ref = 1
	r.schema = s
	r.options = options
	r.fs = f
	r.dataFragments = dataFragments
	r.deleteFragments = deleteFragments
	return r
}
// Schema returns the reader's Arrow schema projected onto the output columns
// of the read options. The projection is recomputed on every call.
func (r *ScanRecordReader) Schema() *arrow.Schema {
return utils.ProjectSchema(r.schema.Schema(), r.options.OutputColumns())
}
// Retain atomically increments the reference count.
func (r *ScanRecordReader) Retain() {
atomic.AddInt64(&r.ref, 1)
}
// Release atomically decrements the reference count. When it reaches zero,
// the current record is released and the currently open file reader is
// closed; the result of Close is not checked.
func (r *ScanRecordReader) Release() {
	if atomic.AddInt64(&r.ref, -1) != 0 {
		return
	}
	if r.rec != nil {
		r.rec.Release()
		r.rec = nil
	}
	if r.curReader != nil {
		r.curReader.Close()
		r.curReader = nil
	}
}
// Next releases the previous record and advances to the next non-empty
// record, opening data files in order as each one is exhausted.
//
// It returns true when a record is available through Record. It returns
// false when all files are consumed, or when opening or reading a file
// failed; in the failure case the error is available through Err.
func (r *ScanRecordReader) Next() bool {
	files := fragment.ToFilesVector(r.dataFragments)
	log.Debug("ScanRecordReader Next", zap.Any("datafiles", files))

	if r.rec != nil {
		r.rec.Release()
		r.rec = nil
	}

	for {
		if r.curReader == nil {
			if r.nextPos >= len(files) {
				return false
			}
			// FIXME: nil options
			opened, openErr := parquet.NewFileReader(r.fs, files[r.nextPos], r.options)
			if openErr != nil {
				r.err = openErr
				return false
			}
			r.nextPos++
			r.curReader = opened
		}

		batch, readErr := r.curReader.Read()
		switch {
		case readErr == io.EOF:
			// Current file is exhausted: close it and move on to the next.
			r.curReader.Close()
			r.curReader = nil
		case readErr != nil:
			// if error occurs in the middle of reading, return false
			r.curReader.Close()
			r.curReader = nil
			r.err = readErr
			return false
		case batch.NumRows() == 0:
			// Skip empty batches.
		default:
			r.rec = batch
			return true
		}
	}
}
// Record returns the record produced by the last successful Next, or nil if
// there is none.
func (r *ScanRecordReader) Record() arrow.Record {
return r.rec
}
// Err returns the error recorded by Next while opening or reading a file, if any.
func (r *ScanRecordReader) Err() error {
return r.err
}
// MakeInnerReader builds the reader chain
// sequential file reader -> filter -> delete -> projection
// over this reader's data fragments and returns the outermost reader.
func (r *ScanRecordReader) MakeInnerReader() array.RecordReader {
	// TODO implement me
	sequential := NewMultiFilesSequentialReader(r.fs, r.dataFragments, r.Schema(), r.options)
	filtered := commonreader.MakeFilterReader(sequential, r.options)
	withDeletes := commonreader.NewDeleteReader(filtered, r.schema.Options(), r.deleteFragments, r.options)
	return commonreader.NewProjectionReader(withDeletes, r.options, r.schema.Schema())
}

View File

@ -0,0 +1,98 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package lock
import (
"sync"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
)
// LockManager serializes manifest updates: a caller acquires the lock, does
// its work, and then releases the lock, reporting the outcome.
type LockManager interface {
// Acquire the lock, wait until the lock is available, return the version to be modified or use the newest version
Acquire() (version int64, useLatestVersion bool, err error)
// Release the lock, accepts the new allocated manifest version and success state of operations between Acquire and Release as parameters
Release(version int64, success bool) error
}
// EmptyLockManager is a LockManager that performs no locking.
type EmptyLockManager struct{}
// Acquire always succeeds and tells the caller to use the latest manifest version.
func (h *EmptyLockManager) Acquire() (version int64, useLatestVersion bool, err error) {
return constant.LatestManifestVersion, true, nil
}
// Release ignores its arguments and always returns nil.
func (h *EmptyLockManager) Release(_ int64, _ bool) error {
return nil
}
// MemoryLockManager is an in-process LockManager that tracks held locks per
// manifest version.
type MemoryLockManager struct {
// mu guards locks and nextVersion.
mu sync.Mutex
// locks maps a version to whether its lock is currently held.
locks map[int64]bool
// nextVersion is the version the next Acquire hands out.
nextVersion int64
}
// NewMemoryLockManager returns a MemoryLockManager with no locks held and
// nextVersion set to 0.
func NewMemoryLockManager() *MemoryLockManager {
	return &MemoryLockManager{locks: map[int64]bool{}}
}
// Acquire takes the lock for the current nextVersion without blocking.
//
// If that version's lock is already held, it returns the version together
// with an error. Otherwise it marks the lock as held and returns the
// version; useLatestVersion is true exactly when the version equals
// constant.LatestManifestVersion.
func (m *MemoryLockManager) Acquire() (version int64, useLatestVersion bool, err error) {
	m.mu.Lock()
	defer m.mu.Unlock()

	version = m.nextVersion
	if held := m.locks[version]; held {
		log.Warn("lock is already acquired", zap.Int64("version", version))
		return version, false, errors.New("lock is already acquired")
	}

	useLatestVersion = version == constant.LatestManifestVersion
	m.locks[version] = true
	log.Info("acquire lock", zap.Int64("version", version), zap.Bool("useLatestVersion", useLatestVersion))
	return version, useLatestVersion, nil
}
// Release frees the lock recorded under version-1, i.e. it treats `version`
// as the newly allocated manifest version that follows the acquired one.
//
// It returns an error when no lock is held under version-1. On success,
// nextVersion becomes `version`; on failure (success == false) it is reset
// to constant.LatestManifestVersion.
func (m *MemoryLockManager) Release(version int64, success bool) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	heldVersion := version - 1
	if !m.locks[heldVersion] {
		return errors.New("lock is already released or does not exist")
	}
	m.locks[heldVersion] = false
	log.Info("release lock", zap.Int64("version", heldVersion), zap.Bool("success", success))

	m.nextVersion = constant.LatestManifestVersion
	if success {
		m.nextVersion = version
	}
	return nil
}

View File

@ -0,0 +1,80 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package manifest
import (
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/storage/lock"
)
// ManifestCommit collects manifest operations and applies them as one new
// manifest version under a lock.
type ManifestCommit struct {
// ops are the operations applied, in order, by Commit.
ops []ManifestCommitOp
// lock is acquired for the duration of Commit.
lock lock.LockManager
// rw reads the base manifest and writes the new one.
rw ManifestReaderWriter
}
// AddOp appends the given operations to the list applied by Commit.
func (m *ManifestCommit) AddOp(op ...ManifestCommitOp) {
m.ops = append(m.ops, op...)
}
// Commit applies the queued operations on top of a base manifest and writes
// the result as a new manifest version.
//
// Steps:
//  1. Acquire the lock; it decides whether to start from the latest manifest
//     or from a specific version.
//  2. Read the base manifest and assign it the next version number.
//  3. Apply every queued operation, in order. The first operation that fails
//     aborts the commit and its error is returned; nothing is written.
//     (Previously operation errors were discarded, so for example
//     ErrBlobAlreadyExist never reached the caller.)
//  4. Write the new manifest.
//
// The lock is always released on the way out. After a failure it is released
// with (-1, false), and an error from that release replaces the original
// error. After success it is released with the new version, and an error
// from that release is returned alongside the already written manifest.
func (m ManifestCommit) Commit() (manifest *Manifest, err error) {
	ver, latest, err := m.lock.Acquire()
	if err != nil {
		return nil, err
	}

	// version is read by the deferred release; it is set just before Write.
	var version int64
	defer func() {
		if err != nil {
			if err2 := m.lock.Release(-1, false); err2 != nil {
				err = err2
			}
		} else {
			err = m.lock.Release(version, true)
		}
	}()

	var base *Manifest
	if latest {
		base, err = m.rw.Read(constant.LatestManifestVersion)
		if err != nil {
			return nil, err
		}
		base.version++
	} else {
		base, err = m.rw.Read(ver)
		if err != nil {
			return nil, err
		}
		maxVersion, err := m.rw.MaxVersion()
		if err != nil {
			return nil, err
		}
		base.version = maxVersion + 1
	}

	for _, op := range m.ops {
		if err = op.commit(base); err != nil {
			return nil, err
		}
	}

	version = base.version
	err = m.rw.Write(base)
	if err != nil {
		return nil, err
	}
	return base, nil
}
// NewManifestCommit returns a ManifestCommit with no queued operations that
// uses the given lock manager and manifest reader/writer.
func NewManifestCommit(lock lock.LockManager, rw ManifestReaderWriter) ManifestCommit {
	return ManifestCommit{
		lock: lock,
		rw:   rw,
	}
}

View File

@ -0,0 +1,68 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package manifest
import (
"github.com/milvus-io/milvus/internal/storagev2/common/errors"
"github.com/milvus-io/milvus/internal/storagev2/file/blob"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
)
// ManifestCommitOp is a single mutation applied to a manifest during a commit.
type ManifestCommitOp interface {
// commit applies the operation to manifest in place.
commit(manifest *Manifest) error
}
// AddScalarFragmentOp adds a scalar fragment to the manifest.
type AddScalarFragmentOp struct {
ScalarFragment fragment.Fragment
}
// commit sets the fragment's id to the manifest's version and appends it to
// the manifest's scalar fragments. It always returns nil.
func (op AddScalarFragmentOp) commit(manifest *Manifest) error {
op.ScalarFragment.SetFragmentId(manifest.Version())
manifest.AddScalarFragment(op.ScalarFragment)
return nil
}
// AddVectorFragmentOp adds a vector fragment to the manifest.
type AddVectorFragmentOp struct {
VectorFragment fragment.Fragment
}
// commit sets the fragment's id to the manifest's version and appends it to
// the manifest's vector fragments. It always returns nil.
func (op AddVectorFragmentOp) commit(manifest *Manifest) error {
op.VectorFragment.SetFragmentId(manifest.Version())
manifest.AddVectorFragment(op.VectorFragment)
return nil
}
// AddDeleteFragmentOp adds a delete fragment to the manifest.
type AddDeleteFragmentOp struct {
DeleteFragment fragment.Fragment
}
// commit sets the fragment's id to the manifest's version and appends it to
// the manifest's delete fragments. It always returns nil.
func (op AddDeleteFragmentOp) commit(manifest *Manifest) error {
op.DeleteFragment.SetFragmentId(manifest.Version())
manifest.AddDeleteFragment(op.DeleteFragment)
return nil
}
// AddBlobOp adds a blob to the manifest.
type AddBlobOp struct {
// Replace skips the existing-name check in commit when true.
Replace bool
Blob blob.Blob
}
// commit adds the blob to the manifest. When Replace is false and the
// manifest already has a blob with the same name, it returns
// errors.ErrBlobAlreadyExist and leaves the manifest unchanged.
func (op AddBlobOp) commit(manifest *Manifest) error {
if !op.Replace && manifest.HasBlob(op.Blob.Name) {
return errors.ErrBlobAlreadyExist
}
manifest.AddBlob(op.Blob)
return nil
}

View File

@ -0,0 +1,243 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package manifest
import (
"fmt"
"github.com/apache/arrow/go/v12/arrow"
"google.golang.org/protobuf/proto"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/file/blob"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/io/fs/file"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)
// Manifest describes one version of a dataset: its schema, its scalar,
// vector and delete fragments, and its blobs.
type Manifest struct {
schema *schema.Schema
// ScalarFragments is exported, unlike the other fragment lists.
ScalarFragments fragment.FragmentVector
vectorFragments fragment.FragmentVector
deleteFragments fragment.FragmentVector
blobs []blob.Blob
// version is the manifest version number.
version int64
}
// NewManifest returns an empty manifest (no fragments, no blobs, version 0)
// that uses the given schema.
func NewManifest(schema *schema.Schema) *Manifest {
return &Manifest{
schema: schema,
}
}
// Init returns an empty manifest whose schema wraps an Arrow schema with no
// fields and the default schema options.
func Init() *Manifest {
return &Manifest{
schema: schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions()),
}
}
// Copy returns a copy of the manifest whose fragment and blob lists have
// their own backing arrays, so appending to one manifest can never overwrite
// elements seen through the other.
//
// A plain struct copy would leave both manifests sharing the same arrays.
// The schema pointer is still shared, and the elements themselves are copied
// by value only (no deep copy of their contents). Nil lists stay nil.
func (m *Manifest) Copy() *Manifest {
	copied := *m
	copied.ScalarFragments = append(fragment.FragmentVector(nil), m.ScalarFragments...)
	copied.vectorFragments = append(fragment.FragmentVector(nil), m.vectorFragments...)
	copied.deleteFragments = append(fragment.FragmentVector(nil), m.deleteFragments...)
	copied.blobs = append([]blob.Blob(nil), m.blobs...)
	return &copied
}
// GetSchema returns the manifest's schema.
func (m *Manifest) GetSchema() *schema.Schema {
return m.schema
}
// AddScalarFragment appends fragment to the scalar fragment list.
func (m *Manifest) AddScalarFragment(fragment fragment.Fragment) {
m.ScalarFragments = append(m.ScalarFragments, fragment)
}
// AddVectorFragment appends fragment to the vector fragment list.
func (m *Manifest) AddVectorFragment(fragment fragment.Fragment) {
m.vectorFragments = append(m.vectorFragments, fragment)
}
// AddDeleteFragment appends fragment to the delete fragment list.
func (m *Manifest) AddDeleteFragment(fragment fragment.Fragment) {
m.deleteFragments = append(m.deleteFragments, fragment)
}
// GetScalarFragments returns the scalar fragment list itself, not a copy.
func (m *Manifest) GetScalarFragments() fragment.FragmentVector {
return m.ScalarFragments
}
// GetVectorFragments returns the vector fragment list itself, not a copy.
func (m *Manifest) GetVectorFragments() fragment.FragmentVector {
return m.vectorFragments
}
// GetDeleteFragments returns the delete fragment list itself, not a copy.
func (m *Manifest) GetDeleteFragments() fragment.FragmentVector {
return m.deleteFragments
}
// Version returns the manifest version number.
func (m *Manifest) Version() int64 {
return m.version
}
// SetVersion sets the manifest version number.
func (m *Manifest) SetVersion(version int64) {
m.version = version
}
// GetBlobs returns the blob list itself, not a copy.
func (m *Manifest) GetBlobs() []blob.Blob {
return m.blobs
}
// ToProtobuf converts the manifest into its protobuf message: version,
// vector/scalar/delete fragments, blobs and finally the schema. It returns
// the schema conversion error, if any, and no message in that case.
func (m *Manifest) ToProtobuf() (*storagev2pb.Manifest, error) {
	pb := &storagev2pb.Manifest{Version: m.version}

	for _, vf := range m.vectorFragments {
		pb.VectorFragments = append(pb.VectorFragments, vf.ToProtobuf())
	}
	for _, sf := range m.ScalarFragments {
		pb.ScalarFragments = append(pb.ScalarFragments, sf.ToProtobuf())
	}
	for _, df := range m.deleteFragments {
		pb.DeleteFragments = append(pb.DeleteFragments, df.ToProtobuf())
	}
	for _, b := range m.blobs {
		pb.Blobs = append(pb.Blobs, b.ToProtobuf())
	}

	pbSchema, err := m.schema.ToProtobuf()
	if err != nil {
		return nil, err
	}
	pb.Schema = pbSchema
	return pb, nil
}
// FromProtobuf fills the manifest from a protobuf message. The schema is
// loaded first; on failure the error is returned and nothing else is touched.
// Fragments and blobs are appended to whatever the manifest already holds,
// and the version is overwritten last.
func (m *Manifest) FromProtobuf(manifest *storagev2pb.Manifest) error {
	if err := m.schema.FromProtobuf(manifest.Schema); err != nil {
		return err
	}

	for _, pbFragment := range manifest.VectorFragments {
		m.vectorFragments = append(m.vectorFragments, fragment.FromProtobuf(pbFragment))
	}
	for _, pbFragment := range manifest.ScalarFragments {
		m.ScalarFragments = append(m.ScalarFragments, fragment.FromProtobuf(pbFragment))
	}
	for _, pbFragment := range manifest.DeleteFragments {
		m.deleteFragments = append(m.deleteFragments, fragment.FromProtobuf(pbFragment))
	}
	for _, pbBlob := range manifest.Blobs {
		m.blobs = append(m.blobs, blob.FromProtobuf(pbBlob))
	}

	m.version = manifest.Version
	return nil
}
// WriteManifestFile serializes manifest to protobuf bytes, writes them to
// output and closes output.
//
// The function owns output: it is closed on every path, including failures
// (previously an error while converting, marshalling or writing left the file
// open). On the success path the error from Close is returned to the caller;
// on failure paths the original error wins and the Close error is dropped.
func WriteManifestFile(manifest *Manifest, output file.File) error {
	closed := false
	defer func() {
		if !closed {
			// Best effort: the primary error is already being returned.
			_ = output.Close()
		}
	}()

	protoManifest, err := manifest.ToProtobuf()
	if err != nil {
		return err
	}

	bytes, err := proto.Marshal(protoManifest)
	if err != nil {
		return fmt.Errorf("write manifest file: %w", err)
	}

	write, err := output.Write(bytes)
	if err != nil {
		return fmt.Errorf("write manifest file: %w", err)
	}
	if write != len(bytes) {
		return fmt.Errorf("failed to write whole file, expect: %v, actual: %v", len(bytes), write)
	}

	closed = true
	if err = output.Close(); err != nil {
		return err
	}
	return nil
}
// HasBlob reports whether a blob with the given name is registered.
func (m *Manifest) HasBlob(name string) bool {
	for i := range m.blobs {
		if m.blobs[i].Name == name {
			return true
		}
	}
	return false
}
// AddBlob appends blob to the blob list. It does not check whether a blob
// with the same name is already present.
func (m *Manifest) AddBlob(blob blob.Blob) {
m.blobs = append(m.blobs, blob)
}
// RemoveBlobIfExist removes the first blob called name, if there is one.
//
// When no blob matches, the manifest is left untouched (the previous version
// sliced with index -1 and panicked). The remaining blobs are copied into a
// new slice so that backing arrays shared with other manifest copies are not
// modified.
func (m *Manifest) RemoveBlobIfExist(name string) {
	for i := range m.blobs {
		if m.blobs[i].Name != name {
			continue
		}
		remaining := make([]blob.Blob, 0, len(m.blobs)-1)
		remaining = append(remaining, m.blobs[:i]...)
		remaining = append(remaining, m.blobs[i+1:]...)
		m.blobs = remaining
		return
	}
}
// GetBlob returns the first blob called name and true, or a zero Blob and
// false when no blob has that name.
func (m *Manifest) GetBlob(name string) (blob.Blob, bool) {
	for i := range m.blobs {
		if m.blobs[i].Name == name {
			return m.blobs[i], true
		}
	}
	return blob.Blob{}, false
}
// ParseFromFile reads the file at path from f, decodes it as a protobuf
// manifest and returns the resulting Manifest. Read errors and conversion
// errors are returned as-is; unmarshal errors are logged and wrapped.
func ParseFromFile(f fs.Fs, path string) (*Manifest, error) {
	parsed := Init()
	pb := &storagev2pb.Manifest{}

	raw, err := f.ReadFile(path)
	if err != nil {
		return nil, err
	}

	if err = proto.Unmarshal(raw, pb); err != nil {
		log.Error("Failed to unmarshal manifest proto", log.String("err", err.Error()))
		return nil, fmt.Errorf("parse from file: %w", err)
	}

	if err = parsed.FromProtobuf(pb); err != nil {
		return nil, err
	}
	return parsed, nil
}
// TODO REMOVE BELOW CODE
// DataFile describes a data file by its path. The file's own TODO marks this
// type for removal.
type DataFile struct {
	path string
	cols []string
}

// Path returns the path the DataFile was created with.
func (d *DataFile) Path() string {
	return d.path
}

// NewDataFile returns a DataFile for path with no columns set.
func NewDataFile(path string) *DataFile {
	d := new(DataFile)
	d.path = path
	return d
}

View File

@ -0,0 +1,119 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package manifest
import (
"fmt"
"path/filepath"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/common/utils"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
)
// ErrManifestNotFound is returned by Read and MaxVersion when no manifest
// file with a parsable (or the requested) version exists.
var ErrManifestNotFound = errors.New("manifest not found")
// ManifestReaderWriter reads and writes manifest files of the space rooted
// at root on the file system fs.
type ManifestReaderWriter struct {
fs fs.Fs
root string
}
// findAllManifest lists the entries under path. The listing is logged at
// debug level before the error is inspected; on error no entries are returned.
func findAllManifest(fs fs.Fs, path string) ([]fs.FileEntry, error) {
	entries, listErr := fs.List(path)
	log.Debug("list all manifest:", log.Any("files", entries))
	if listErr == nil {
		return entries, nil
	}
	return nil, listErr
}
// Read loads a manifest from the space's manifest directory.
//
// If version equals constant.LatestManifestVersion the manifest with the
// highest parsable version is returned; otherwise the first file whose
// version equals version is returned. Files whose name yields version -1 are
// skipped. ErrManifestNotFound is returned when nothing matches.
func (rw ManifestReaderWriter) Read(version int64) (*Manifest, error) {
	entries, err := findAllManifest(rw.fs, utils.GetManifestDir(rw.root))
	if err != nil {
		return nil, err
	}

	wantLatest := version == constant.LatestManifestVersion
	latestPath := ""
	latestVersion := int64(-1)

	for _, entry := range entries {
		ver := utils.ParseVersionFromFileName(filepath.Base(entry.Path))
		switch {
		case ver == -1:
			// not a manifest file name
			continue
		case !wantLatest:
			if ver == version {
				return ParseFromFile(rw.fs, entry.Path)
			}
		case ver > latestVersion:
			latestVersion = ver
			latestPath = entry.Path
		}
	}

	if latestVersion == -1 {
		return nil, ErrManifestNotFound
	}
	return ParseFromFile(rw.fs, latestPath)
}
// MaxVersion returns the highest version found among the manifest file names,
// or -1 and ErrManifestNotFound when no file name yields a version above -1.
func (rw ManifestReaderWriter) MaxVersion() (int64, error) {
	entries, err := findAllManifest(rw.fs, utils.GetManifestDir(rw.root))
	if err != nil {
		return -1, err
	}

	// latest never drops below -1, so a parse result of -1 (the "no version"
	// marker) can never replace it.
	latest := int64(-1)
	for _, entry := range entries {
		if ver := utils.ParseVersionFromFileName(filepath.Base(entry.Path)); ver > latest {
			latest = ver
		}
	}

	if latest == -1 {
		return -1, ErrManifestNotFound
	}
	return latest, nil
}
// Write stores m under the path derived from its version. The manifest is
// first written to a temporary path and then renamed to the final path.
func (rw ManifestReaderWriter) Write(m *Manifest) error {
	version := m.Version()
	tmpPath := utils.GetManifestTmpFilePath(rw.root, version)
	finalPath := utils.GetManifestFilePath(rw.root, version)
	log.Debug("path", log.String("tmpManifestFilePath", tmpPath), log.String("manifestFilePath", finalPath))

	tmpFile, err := rw.fs.OpenFile(tmpPath)
	if err != nil {
		return fmt.Errorf("open file error: %w", err)
	}

	if err := WriteManifestFile(m, tmpFile); err != nil {
		return err
	}

	if err := rw.fs.Rename(tmpPath, finalPath); err != nil {
		return fmt.Errorf("rename file error: %w", err)
	}

	log.Debug("save manifest file success", log.String("path", finalPath))
	return nil
}
// NewManifestReaderWriter returns a ManifestReaderWriter for the space rooted
// at root on fs.
func NewManifestReaderWriter(fs fs.Fs, root string) ManifestReaderWriter {
	return ManifestReaderWriter{
		fs:   fs,
		root: root,
	}
}

View File

@ -0,0 +1,144 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package options
import (
"math"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/filter"
"github.com/milvus-io/milvus/internal/storagev2/storage/lock"
"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
)
// Options configures how a space is opened.
type Options struct {
Schema *schema.Schema // optional
Version int64 // optional
LockManager lock.LockManager // optional, no lock manager as default
}
// SpaceOptionsBuilder builds an Options value step by step. Setters return
// the builder so calls can be chained.
type SpaceOptionsBuilder struct {
	options Options
}

// SetSchema sets the schema option.
func (b *SpaceOptionsBuilder) SetSchema(schema *schema.Schema) *SpaceOptionsBuilder {
	b.options.Schema = schema
	return b
}

// SetVersion sets the manifest version option.
func (b *SpaceOptionsBuilder) SetVersion(version int64) *SpaceOptionsBuilder {
	b.options.Version = version
	return b
}

// SetLockManager sets the lock manager option.
func (b *SpaceOptionsBuilder) SetLockManager(lockManager lock.LockManager) *SpaceOptionsBuilder {
	b.options.LockManager = lockManager
	return b
}

// Reset restores the builder to the state NewSpaceOptionBuilder creates:
// no schema, the latest manifest version and an empty lock manager.
// Previously Reset left Version at 0, which differs from the constructor
// default of constant.LatestManifestVersion.
func (b *SpaceOptionsBuilder) Reset() {
	b.options = Options{
		Version:     constant.LatestManifestVersion,
		LockManager: &lock.EmptyLockManager{},
	}
}

// Build returns a copy of the options accumulated so far.
func (b *SpaceOptionsBuilder) Build() Options { return b.options }
// NewSpaceOptionBuilder returns a builder preset to the latest manifest
// version and an empty lock manager.
func NewSpaceOptionBuilder() *SpaceOptionsBuilder {
	b := new(SpaceOptionsBuilder)
	b.options.Version = constant.LatestManifestVersion
	b.options.LockManager = &lock.EmptyLockManager{}
	return b
}
// DefaultOptions returns a zero-valued Options: nil Schema, Version 0 and a
// nil LockManager.
// NOTE(review): this differs from NewSpaceOptionBuilder, which presets
// constant.LatestManifestVersion and an EmptyLockManager — confirm callers
// expect the zero values here.
func DefaultOptions() *Options {
return &Options{}
}
// WriteOptions controls how records are written.
type WriteOptions struct {
	// MaxRecordPerFile is the writer count at which the current file is
	// closed and a new one is started.
	MaxRecordPerFile int64
}

// DefaultWriteOptions is a ready-made WriteOptions with MaxRecordPerFile 1024.
var DefaultWriteOptions = WriteOptions{MaxRecordPerFile: 1024}

// NewWriteOption returns a new WriteOptions with MaxRecordPerFile 1024.
func NewWriteOption() *WriteOptions {
	opts := new(WriteOptions)
	opts.MaxRecordPerFile = 1024
	return opts
}
// FsType enumerates the file system kinds a space can use.
type FsType int8
// File system kinds, numbered from 0 in declaration order.
const (
InMemory FsType = iota
LocalFS
S3
)
// SpaceOptions holds a file system kind and a list of vector column names.
type SpaceOptions struct {
Fs FsType
VectorColumns []string
}
// FilterSet is an ordered list of filters.
// TODO: switch ReadOptions.Filters over to FilterSet.
type FilterSet []filter.Filter
// version is a package-level default set to math.MaxInt64.
// NOTE(review): no code in this file reads it — confirm it is still needed.
var version int64 = math.MaxInt64
// ReadOptions describes what a read should return.
type ReadOptions struct {
// Filters maps a column name to the filter last added for that column.
Filters map[string]filter.Filter
// FiltersV2 keeps every added filter in insertion order.
FiltersV2 FilterSet
// Columns lists the columns returned by OutputColumns.
Columns []string
// ManifestVersion selects the manifest version to read.
ManifestVersion int64
// version is accessed through SetVersion/GetVersion.
version int64
}
// NewReadOptions returns ReadOptions with empty (non-nil) filter and column
// collections, the latest manifest version and version math.MaxInt64.
func NewReadOptions() *ReadOptions {
	opts := &ReadOptions{}
	opts.Filters = map[string]filter.Filter{}
	opts.FiltersV2 = FilterSet{}
	opts.Columns = []string{}
	opts.ManifestVersion = constant.LatestManifestVersion
	opts.version = math.MaxInt64
	return opts
}
// AddFilter registers f under its column name in Filters (replacing any
// earlier filter for that column) and appends it to FiltersV2.
//
// Filters is created on demand so that a ReadOptions built as a struct
// literal, rather than through NewReadOptions, does not panic on the map
// write. The parameter is named f so the filter package stays usable here.
func (o *ReadOptions) AddFilter(f filter.Filter) {
	if o.Filters == nil {
		o.Filters = make(map[string]filter.Filter)
	}
	o.Filters[f.GetColumnName()] = f
	o.FiltersV2 = append(o.FiltersV2, f)
}
// AddColumn appends column to the output column list.
func (o *ReadOptions) AddColumn(column string) {
o.Columns = append(o.Columns, column)
}
// SetColumns replaces the output column list with columns (the slice is not copied).
func (o *ReadOptions) SetColumns(columns []string) {
o.Columns = columns
}
// SetVersion sets the unexported version field; ManifestVersion is not changed.
func (o *ReadOptions) SetVersion(version int64) {
o.version = version
}
// GetVersion returns the unexported version field.
func (o *ReadOptions) GetVersion() int64 {
return o.version
}
// OutputColumns returns the output column list (not a copy).
func (o *ReadOptions) OutputColumns() []string {
return o.Columns
}

View File

@ -0,0 +1,150 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package schema
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/common/utils"
"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)
// Schema is a wrapper of arrow schema. Besides the full schema it caches
// three derived schemas that are filled in by the Build*Schema methods.
type Schema struct {
// schema is the full arrow schema.
schema *arrow.Schema
// scalarSchema is set by BuildScalarSchema.
scalarSchema *arrow.Schema
// vectorSchema is set by BuildVectorSchema.
vectorSchema *arrow.Schema
// deleteSchema is set by BuildDeleteSchema.
deleteSchema *arrow.Schema
// options names the primary, version and vector columns.
options *SchemaOptions
}
// Schema returns the full arrow schema.
func (s *Schema) Schema() *arrow.Schema {
return s.schema
}
// Options returns the schema options (not a copy).
func (s *Schema) Options() *SchemaOptions {
return s.options
}
// NewSchema wraps an arrow schema together with its options. The derived
// scalar/vector/delete schemas stay nil until they are built.
func NewSchema(schema *arrow.Schema, options *SchemaOptions) *Schema {
	s := new(Schema)
	s.schema = schema
	s.options = options
	return s
}
// Validate checks the options against the arrow schema and then builds the
// scalar, vector and delete schemas in that order, returning the first error.
func (s *Schema) Validate() error {
	if err := s.options.Validate(s.schema); err != nil {
		return err
	}

	builders := []func() error{
		s.BuildScalarSchema,
		s.BuildVectorSchema,
		s.BuildDeleteSchema,
	}
	for _, build := range builders {
		if err := build(); err != nil {
			return err
		}
	}
	return nil
}
// ScalarSchema returns the schema set by BuildScalarSchema (nil before that).
func (s *Schema) ScalarSchema() *arrow.Schema {
return s.scalarSchema
}
// VectorSchema returns the schema set by BuildVectorSchema (nil before that).
func (s *Schema) VectorSchema() *arrow.Schema {
return s.vectorSchema
}
// DeleteSchema returns the schema set by BuildDeleteSchema (nil before that).
func (s *Schema) DeleteSchema() *arrow.Schema {
return s.deleteSchema
}
// FromProtobuf replaces the arrow schema and the options with the contents of
// the protobuf message and rebuilds the derived schemas. Only a failure to
// convert the arrow schema is reported.
func (s *Schema) FromProtobuf(schema *storagev2pb.Schema) error {
	arrowSchema, err := utils.FromProtobufSchema(schema.ArrowSchema)
	if err != nil {
		return err
	}
	s.schema = arrowSchema
	s.options.FromProtobuf(schema.GetSchemaOptions())

	// Errors from the builders are discarded here, unlike in Validate.
	_ = s.BuildScalarSchema()
	_ = s.BuildVectorSchema()
	_ = s.BuildDeleteSchema()
	return nil
}
// ToProtobuf converts the arrow schema and the options into a protobuf
// Schema message. The arrow conversion error, if any, is returned.
func (s *Schema) ToProtobuf() (*storagev2pb.Schema, error) {
	pb := &storagev2pb.Schema{}

	pbArrow, err := utils.ToProtobufSchema(s.schema)
	if err != nil {
		return nil, err
	}

	pb.ArrowSchema, pb.SchemaOptions = pbArrow, s.options.ToProtobuf()
	return pb, nil
}
// BuildScalarSchema sets the scalar schema: every field except the vector
// column, followed by an int64 offset field named constant.OffsetFieldName.
// It always returns nil.
func (s *Schema) BuildScalarSchema() error {
	all := s.schema.Fields()
	scalarFields := make([]arrow.Field, 0, len(all))
	for _, f := range all {
		if f.Name != s.options.VectorColumn {
			scalarFields = append(scalarFields, f)
		}
	}
	scalarFields = append(scalarFields, arrow.Field{
		Name: constant.OffsetFieldName,
		Type: arrow.DataType(&arrow.Int64Type{}),
	})
	s.scalarSchema = arrow.NewSchema(scalarFields, nil)
	return nil
}
// BuildVectorSchema sets the vector schema: the fields named as vector,
// primary or version column, kept in their original order. It always
// returns nil.
func (s *Schema) BuildVectorSchema() error {
	all := s.schema.Fields()
	vectorFields := make([]arrow.Field, 0, len(all))
	for _, f := range all {
		switch f.Name {
		case s.options.VectorColumn, s.options.PrimaryColumn, s.options.VersionColumn:
			vectorFields = append(vectorFields, f)
		}
	}
	s.vectorSchema = arrow.NewSchema(vectorFields, nil)
	return nil
}
// BuildDeleteSchema sets the delete schema: the primary column, followed by
// the version column when one is configured.
//
// The version column is optional in SchemaOptions.Validate, so an empty
// VersionColumn is accepted here as well (previously it produced
// ErrVersionColumnNotFound, which made Schema.Validate fail for schemas
// without a version column). A configured but missing version column is
// still an error.
func (s *Schema) BuildDeleteSchema() error {
	pkColumn, ok := s.schema.FieldsByName(s.options.PrimaryColumn)
	if !ok {
		return ErrPrimaryColumnNotFound
	}
	fields := make([]arrow.Field, 0, 2)
	fields = append(fields, pkColumn[0])

	if s.options.VersionColumn != "" {
		versionField, ok := s.schema.FieldsByName(s.options.VersionColumn)
		if !ok {
			return ErrVersionColumnNotFound
		}
		fields = append(fields, versionField[0])
	}

	s.deleteSchema = arrow.NewSchema(fields, nil)
	return nil
}

View File

@ -0,0 +1,97 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package schema
import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)
// Errors reported while validating schema options or building derived schemas.
var (
ErrPrimaryColumnNotFound = errors.New("primary column not found")
ErrPrimaryColumnType = errors.New("primary column is not int64 or string")
ErrPrimaryColumnEmpty = errors.New("primary column is empty")
ErrVersionColumnNotFound = errors.New("version column not found")
ErrVersionColumnType = errors.New("version column is not int64")
ErrVectorColumnNotFound = errors.New("vector column not found")
ErrVectorColumnType = errors.New("vector column is not fixed size binary or fixed size list")
ErrVectorColumnEmpty = errors.New("vector column is empty")
)
// SchemaOptions names the columns with a special role. Validate requires
// PrimaryColumn and VectorColumn to be non-empty; VersionColumn may be empty.
type SchemaOptions struct {
PrimaryColumn string
VersionColumn string
VectorColumn string
}
// DefaultSchemaOptions returns options with all three column names empty.
func DefaultSchemaOptions() *SchemaOptions {
	// The zero value already has empty column names.
	return new(SchemaOptions)
}
// ToProtobuf copies the three column names into a new protobuf message.
func (o *SchemaOptions) ToProtobuf() *storagev2pb.SchemaOptions {
	return &storagev2pb.SchemaOptions{
		PrimaryColumn: o.PrimaryColumn,
		VersionColumn: o.VersionColumn,
		VectorColumn:  o.VectorColumn,
	}
}
// FromProtobuf overwrites the three column names with those in options.
//
// The generated getters are used instead of direct field access because they
// are nil-safe: Schema.FromProtobuf passes the result of GetSchemaOptions(),
// which is nil when the message carries no options. A nil message now yields
// empty column names instead of a nil-pointer panic.
func (o *SchemaOptions) FromProtobuf(options *storagev2pb.SchemaOptions) {
	o.PrimaryColumn = options.GetPrimaryColumn()
	o.VersionColumn = options.GetVersionColumn()
	o.VectorColumn = options.GetVectorColumn()
}
// Validate checks the options against an arrow schema, in this order:
//   - the primary column must be set, exist, and be string or int64;
//   - the version column, if set, must exist and be int64;
//   - the vector column must be set, exist, and be fixed size binary or
//     fixed size list.
//
// The first violated rule determines the returned error.
func (o *SchemaOptions) Validate(schema *arrow.Schema) error {
	if o.PrimaryColumn == "" {
		return ErrPrimaryColumnEmpty
	}
	primary, found := schema.FieldsByName(o.PrimaryColumn)
	if !found {
		return ErrPrimaryColumnNotFound
	}
	if id := primary[0].Type.ID(); id != arrow.STRING && id != arrow.INT64 {
		return ErrPrimaryColumnType
	}

	if o.VersionColumn != "" {
		version, found := schema.FieldsByName(o.VersionColumn)
		if !found {
			return ErrVersionColumnNotFound
		}
		if version[0].Type.ID() != arrow.INT64 {
			return ErrVersionColumnType
		}
	}

	if o.VectorColumn == "" {
		return ErrVectorColumnEmpty
	}
	vector, found := schema.FieldsByName(o.VectorColumn)
	if !found {
		return ErrVectorColumnNotFound
	}
	if id := vector[0].Type.ID(); id != arrow.FIXED_SIZE_BINARY && id != arrow.FIXED_SIZE_LIST {
		return ErrVectorColumnType
	}
	return nil
}
// HasVersionColumn reports whether a version column name is configured.
func (o *SchemaOptions) HasVersionColumn() bool {
	return len(o.VersionColumn) > 0
}

View File

@ -0,0 +1,53 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package schema
import (
"testing"
"github.com/apache/arrow/go/v12/arrow"
"github.com/stretchr/testify/assert"
)
// TestBuildSchema checks that a schema with an int64 primary column, an
// int64 version column and a fixed-size-binary vector column validates.
func TestBuildSchema(t *testing.T) {
	fields := []arrow.Field{
		{Name: "pk_field", Type: arrow.DataType(&arrow.Int64Type{}), Nullable: false},
		{Name: "vs_field", Type: arrow.DataType(&arrow.Int64Type{}), Nullable: false},
		{Name: "vec_field", Type: arrow.DataType(&arrow.FixedSizeBinaryType{ByteWidth: 16}), Nullable: false},
	}
	opts := &SchemaOptions{
		PrimaryColumn: "pk_field",
		VersionColumn: "vs_field",
		VectorColumn:  "vec_field",
	}

	sc := NewSchema(arrow.NewSchema(fields, nil), opts)
	assert.NoError(t, sc.Validate())
}

View File

@ -0,0 +1,220 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"math"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/milvus-io/milvus/internal/storagev2/common/errors"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/common/utils"
"github.com/milvus-io/milvus/internal/storagev2/file/blob"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/filter"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/reader/recordreader"
"github.com/milvus-io/milvus/internal/storagev2/storage/lock"
"github.com/milvus-io/milvus/internal/storagev2/storage/manifest"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
"github.com/milvus-io/milvus/internal/storagev2/storage/transaction"
)
// Space is an opened storage space: a root path on a file system together
// with the currently loaded manifest, the delete fragments and a lock manager.
type Space struct {
path string
fs fs.Fs
// deleteFragments is passed to the record reader by Read.
deleteFragments fragment.DeleteFragmentVector
// manifest is the currently loaded manifest; Read may replace it.
manifest *manifest.Manifest
lockManager lock.LockManager
}
// init builds one delete fragment per delete fragment listed in the manifest
// and appends it to s.deleteFragments. It always returns nil.
// NOTE(review): Open in this file does not call init — confirm whether the
// delete fragments are expected to be loaded when a space is opened.
func (s *Space) init() error {
	spaceSchema := s.manifest.GetSchema()
	for _, listed := range s.manifest.GetDeleteFragments() {
		s.deleteFragments = append(s.deleteFragments, fragment.Make(s.fs, spaceSchema, listed))
	}
	return nil
}
// NewSpace assembles a Space from its parts, with an empty (non-nil) delete
// fragment vector. It performs no I/O.
func NewSpace(f fs.Fs, path string, m *manifest.Manifest, lockManager lock.LockManager) *Space {
	space := new(Space)
	space.fs = f
	space.path = path
	space.manifest = m
	space.deleteFragments = fragment.DeleteFragmentVector{}
	space.lockManager = lockManager
	return space
}
// NewTransaction starts a new concurrent write transaction on this space.
func (s *Space) NewTransaction() transaction.Transaction {
return transaction.NewConcurrentWriteTransaction(s)
}
// Write queues a write of reader's records in a fresh transaction and commits it.
func (s *Space) Write(reader array.RecordReader, options *options.WriteOptions) error {
return transaction.NewConcurrentWriteTransaction(s).Write(reader, options).Commit()
}
// Delete queues a delete of reader's records in a fresh transaction and commits it.
func (s *Space) Delete(reader array.RecordReader) error {
return transaction.NewConcurrentWriteTransaction(s).Delete(reader).Commit()
}
// Open opens the space at uri, creating it if it does not exist yet.
//
// The manifest, scalar, vector, blob and delete directories are created
// first. If no manifest can be found, opt.Schema must be non-nil and valid;
// it is used to write the first manifest with version 0. If the space exists,
// the manifest selected by opt.Version is loaded (see
// ManifestReaderWriter.Read for how the latest version is selected).
func Open(uri string, opt options.Options) (*Space, error) {
	var f fs.Fs
	f, err := fs.BuildFileSystem(uri)
	if err != nil {
		return nil, err
	}

	path := f.Path()
	log.Debug("open space", log.String("path", path))
	manifestDir := utils.GetManifestDir(path)
	log.Debug(manifestDir)

	// create if not exist
	requiredDirs := []string{
		manifestDir,
		utils.GetScalarDataDir(path),
		utils.GetVectorDataDir(path),
		utils.GetBlobDir(path),
		utils.GetDeleteDataDir(path),
	}
	for _, dir := range requiredDirs {
		if err = f.CreateDir(dir); err != nil {
			return nil, err
		}
	}

	rw := manifest.NewManifestReaderWriter(f, path)
	m, err := rw.Read(opt.Version)
	switch {
	case err == nil:
		// an existing manifest was loaded
	case err == manifest.ErrManifestNotFound:
		// create the first manifest file
		if opt.Schema == nil {
			log.Error("schema is nil")
			return nil, errors.ErrSchemaIsNil
		}
		if err = opt.Schema.Validate(); err != nil {
			return nil, err
		}
		m = manifest.NewManifest(opt.Schema)
		m.SetVersion(0) // TODO: check if this is necessary
		if err = rw.Write(m); err != nil {
			return nil, err
		}
	default:
		return nil, err
	}

	return NewSpace(f, path, m, opt.LockManager), nil
}
// readManifest loads the manifest for version and makes it the space's
// current manifest. On error the current manifest is left unchanged.
func (s *Space) readManifest(version int64) error {
	loaded, err := manifest.NewManifestReaderWriter(s.fs, s.path).Read(version)
	if err != nil {
		return err
	}
	s.manifest = loaded
	return nil
}
// Read returns a record reader over the space.
//
// The manifest is reloaded when none is loaded or when its version differs
// from readOptions.ManifestVersion. If the schema has a version column, a
// "<= math.MaxInt64" filter on it and the column itself are added to
// readOptions; the caller's readOptions value is modified.
func (s *Space) Read(readOptions *options.ReadOptions) (array.RecordReader, error) {
	needsReload := s.manifest == nil || readOptions.ManifestVersion != s.manifest.Version()
	if needsReload {
		if err := s.readManifest(readOptions.ManifestVersion); err != nil {
			return nil, err
		}
	}

	if schemaOpts := s.manifest.GetSchema().Options(); schemaOpts.HasVersionColumn() {
		versionFilter := filter.NewConstantFilter(filter.LessThanOrEqual, schemaOpts.VersionColumn, int64(math.MaxInt64))
		readOptions.AddFilter(versionFilter)
		readOptions.AddColumn(schemaOpts.VersionColumn)
	}

	log.Debug("read", log.Any("readOption", readOptions))

	reader := recordreader.MakeRecordReader(s.manifest, s.manifest.GetSchema(), s.fs, s.deleteFragments, readOptions)
	return reader, nil
}
// WriteBlob queues a blob write named name in a fresh transaction and commits
// it. replace is forwarded to the blob write operation.
func (s *Space) WriteBlob(content []byte, name string, replace bool) error {
return transaction.NewConcurrentWriteTransaction(s).WriteBlob(content, name, replace).Commit()
}
// ReadBlob reads the blob called name into output with a single Read call and
// returns that call's byte count and error.
//
// It returns -1 and errors.ErrBlobNotExist when the manifest has no such
// blob, and -1 with the open error when the blob file cannot be opened. The
// file is closed before returning (previously it was leaked).
func (s *Space) ReadBlob(name string, output []byte) (int, error) {
	b, ok := s.manifest.GetBlob(name)
	if !ok {
		return -1, errors.ErrBlobNotExist
	}

	f, err := s.fs.OpenFile(b.File)
	if err != nil {
		return -1, err
	}
	// Read-only use: a Close failure cannot lose data, so it is ignored.
	defer f.Close()

	return f.Read(output)
}
// GetBlobByteSize returns the size recorded in the manifest for the blob
// called name, or -1 and errors.ErrBlobNotExist when there is no such blob.
func (s *Space) GetBlobByteSize(name string) (int64, error) {
	if entry, found := s.manifest.GetBlob(name); found {
		return entry.Size, nil
	}
	return -1, errors.ErrBlobNotExist
}
// GetCurrentVersion returns the version of the currently loaded manifest.
func (s *Space) GetCurrentVersion() int64 {
return s.manifest.Version()
}
// ScanDelete returns a reader created by recordreader.MakeScanDeleteReader
// for the current manifest; the error is always nil.
func (s *Space) ScanDelete() (array.RecordReader, error) {
return recordreader.MakeScanDeleteReader(s.manifest, s.fs), nil
}
// Path returns the root path of the space.
func (s *Space) Path() string {
return s.path
}
// Fs returns the file system the space lives on.
func (s *Space) Fs() fs.Fs {
return s.fs
}
// Manifest returns the currently loaded manifest (not a copy).
func (s *Space) Manifest() *manifest.Manifest {
return s.manifest
}
// SetManifest replaces the currently loaded manifest.
func (s *Space) SetManifest(manifest *manifest.Manifest) {
s.manifest = manifest
}
// LockManager returns the lock manager of the space.
func (s *Space) LockManager() lock.LockManager {
return s.lockManager
}
// SetLockManager replaces the lock manager of the space.
func (s *Space) SetLockManager(lockManager lock.LockManager) {
s.lockManager = lockManager
}
// StatisticsBlobs returns the blobs listed in the current manifest.
func (s *Space) StatisticsBlobs() []blob.Blob {
return s.manifest.GetBlobs()
}

View File

@ -0,0 +1,327 @@
// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package transaction
import (
"fmt"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/milvus-io/milvus/internal/storagev2/common/errors"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/internal/storagev2/common/utils"
"github.com/milvus-io/milvus/internal/storagev2/file/blob"
"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
"github.com/milvus-io/milvus/internal/storagev2/io/format"
"github.com/milvus-io/milvus/internal/storagev2/io/format/parquet"
"github.com/milvus-io/milvus/internal/storagev2/io/fs"
"github.com/milvus-io/milvus/internal/storagev2/storage/lock"
"github.com/milvus-io/milvus/internal/storagev2/storage/manifest"
"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// SpaceMeta is the view of a space that transactions need: its location,
// file system, current manifest and lock manager, plus a way to install the
// manifest produced by a commit.
type SpaceMeta interface {
Path() string
Fs() fs.Fs
Manifest() *manifest.Manifest
LockManager() lock.LockManager
SetManifest(manifest *manifest.Manifest)
}
// Transaction collects operations and applies them with Commit. Write, Delete
// and WriteBlob return a Transaction so calls can be chained.
type Transaction interface {
Write(reader array.RecordReader, options *options.WriteOptions) Transaction
Delete(reader array.RecordReader) Transaction
WriteBlob(content []byte, name string, replace bool) Transaction
Commit() error
}
// ConcurrentWriteTransaction is the Transaction implementation in this file.
type ConcurrentWriteTransaction struct {
// operations are executed in order by Commit.
operations []Operation
// commit collects the manifest ops added by the executed operations.
commit manifest.ManifestCommit
// space is the space the transaction works on.
space SpaceMeta
}
// Write queues a write operation for reader's records and returns the
// transaction. Nothing is written until Commit.
func (t *ConcurrentWriteTransaction) Write(reader array.RecordReader, options *options.WriteOptions) Transaction {
	t.operations = append(t.operations, &WriteOperation{
		reader:      reader,
		options:     options,
		space:       t.space,
		transaction: t,
	})
	return t
}
// Delete queues a delete operation for reader's records and returns the
// transaction. Nothing is written until Commit.
func (t *ConcurrentWriteTransaction) Delete(reader array.RecordReader) Transaction {
	t.operations = append(t.operations, &DeleteOperation{
		reader:      reader,
		space:       t.space,
		transaction: t,
	})
	return t
}
// WriteBlob queues a blob write operation and returns the transaction.
// Nothing is written until Commit.
func (t *ConcurrentWriteTransaction) WriteBlob(content []byte, name string, replace bool) Transaction {
	t.operations = append(t.operations, &WriteBlobOperation{
		content:     content,
		name:        name,
		replace:     replace,
		space:       t.space,
		transaction: t,
	})
	return t
}
// Commit executes the queued operations in order, commits the manifest ops
// they produced and installs the resulting manifest on the space.
//
// The first operation that fails aborts the commit and its error is
// returned; no manifest is committed in that case. Previously errors from
// Execute were discarded, so a failed operation (for example a schema
// mismatch or an already existing blob) was reported as success.
func (t *ConcurrentWriteTransaction) Commit() error {
	for _, op := range t.operations {
		if err := op.Execute(); err != nil {
			return err
		}
	}

	nxtManifest, err := t.commit.Commit()
	if err != nil {
		return err
	}
	t.space.SetManifest(nxtManifest)
	return nil
}
// NewConcurrentWriteTransaction returns an empty transaction on space whose
// manifest commit uses the space's lock manager, file system and path.
func NewConcurrentWriteTransaction(space SpaceMeta) *ConcurrentWriteTransaction {
	readerWriter := manifest.NewManifestReaderWriter(space.Fs(), space.Path())
	return &ConcurrentWriteTransaction{
		operations: []Operation{},
		commit:     manifest.NewManifestCommit(space.LockManager(), readerWriter),
		space:      space,
	}
}
// Operation is one unit of work queued in a transaction.
type Operation interface {
Execute() error
}
// WriteOperation writes the records of reader into the space's scalar and
// vector data files.
type WriteOperation struct {
reader array.RecordReader
options *options.WriteOptions
space SpaceMeta
// transaction receives the manifest ops produced by Execute.
transaction *ConcurrentWriteTransaction
}
// Execute writes every non-empty record of the reader to a scalar and a
// vector file set and, if anything was written, adds one scalar and one
// vector fragment op to the transaction's commit.
//
// It returns errors.ErrSchemaNotMatch when the reader's schema differs from
// the space schema, and otherwise the first write or close error.
func (w *WriteOperation) Execute() error {
	if !w.space.Manifest().GetSchema().Schema().Equal(w.reader.Schema()) {
		return errors.ErrSchemaNotMatch
	}

	spaceSchema := w.space.Manifest().GetSchema()
	scalarSchema := spaceSchema.ScalarSchema()
	vectorSchema := w.space.Manifest().GetSchema().VectorSchema()

	var scalarWriter, vectorWriter format.Writer
	scalarFragment, vectorFragment := fragment.NewFragment(), fragment.NewFragment()

	wroteAny := false
	for w.reader.Next() {
		rec := w.reader.Record()
		if rec.NumRows() == 0 {
			continue
		}

		var err error
		if scalarWriter, err = w.write(scalarSchema, rec, scalarWriter, &scalarFragment, w.options, true); err != nil {
			return err
		}
		if vectorWriter, err = w.write(vectorSchema, rec, vectorWriter, &vectorFragment, w.options, false); err != nil {
			return err
		}
		wroteAny = true
	}

	// Close whichever writers are still open, scalar first.
	for _, open := range []format.Writer{scalarWriter, vectorWriter} {
		if open == nil {
			continue
		}
		if err := open.Close(); err != nil {
			return err
		}
	}

	if !wroteAny {
		return nil
	}

	w.transaction.commit.AddOp(
		manifest.AddScalarFragmentOp{ScalarFragment: scalarFragment},
		manifest.AddVectorFragmentOp{VectorFragment: vectorFragment},
	)
	return nil
}
// write projects rec onto schema and writes it with writer, opening a new
// parquet file (and registering it in fragment) when writer is nil.
//
// For scalar data an int64 offset column holding 0..NumRows-1 for this record
// is appended. When the writer's count reaches opt.MaxRecordPerFile the
// writer is closed and nil is returned so that the next call opens a new
// file. On error the returned writer is nil.
func (w *WriteOperation) write(
	schema *arrow.Schema,
	rec arrow.Record,
	writer format.Writer,
	fragment *fragment.Fragment,
	opt *options.WriteOptions,
	isScalar bool,
) (format.Writer, error) {
	// Keep only the record columns that the target schema knows by name.
	var columns []arrow.Array
	for idx, col := range rec.Columns() {
		if _, known := schema.FieldsByName(rec.ColumnName(idx)); known {
			columns = append(columns, col)
		}
	}

	var rootPath string
	if !isScalar {
		rootPath = utils.GetVectorDataDir(w.space.Path())
	} else {
		// add offset column for scalar
		offsets := make([]int64, rec.NumRows())
		for i := range offsets {
			offsets[i] = int64(i)
		}
		offsetBuilder := array.NewInt64Builder(memory.DefaultAllocator)
		offsetBuilder.AppendValues(offsets, nil)
		columns = append(columns, offsetBuilder.NewArray())
		rootPath = utils.GetScalarDataDir(w.space.Path())
	}

	projected := array.NewRecord(schema, columns, rec.NumRows())

	if writer == nil {
		filePath := utils.GetNewParquetFilePath(rootPath)
		newWriter, err := parquet.NewFileWriter(schema, w.space.Fs(), filePath)
		if err != nil {
			return nil, err
		}
		writer = newWriter
		fragment.AddFile(filePath)
	}

	if err := writer.Write(projected); err != nil {
		return nil, err
	}

	if writer.Count() < opt.MaxRecordPerFile {
		return writer, nil
	}

	// The file is full: close it and let the next call start a new one.
	log.Debug("close writer", log.Any("count", writer.Count()))
	if err := writer.Close(); err != nil {
		return nil, err
	}
	return nil, nil
}
// DeleteOperation writes the records of reader into a delete data file of
// the space.
type DeleteOperation struct {
reader array.RecordReader
space SpaceMeta
// transaction receives the manifest op produced by Execute.
transaction *ConcurrentWriteTransaction
}
// Execute writes every non-empty record of the reader into one new parquet
// file under the delete data directory, using the space's delete schema. The
// file is created lazily on the first non-empty record. If a file was
// written, it is closed and a delete fragment op is added to the
// transaction's commit.
func (o *DeleteOperation) Execute() error {
	deleteSchema := o.space.Manifest().GetSchema().DeleteSchema()
	frag := fragment.NewFragment()

	var writer format.Writer
	for o.reader.Next() {
		rec := o.reader.Record()
		if rec.NumRows() == 0 {
			continue
		}

		if writer == nil {
			deleteFile := utils.GetNewParquetFilePath(utils.GetDeleteDataDir(o.space.Path()))
			newWriter, err := parquet.NewFileWriter(deleteSchema, o.space.Fs(), deleteFile)
			if err != nil {
				return err
			}
			writer = newWriter
			frag.AddFile(deleteFile)
		}

		if err := writer.Write(rec); err != nil {
			return err
		}
	}

	if writer == nil {
		// nothing to delete
		return nil
	}
	if err := writer.Close(); err != nil {
		return err
	}
	o.transaction.commit.AddOp(manifest.AddDeleteFragmentOp{DeleteFragment: frag})
	return nil
}
// WriteBlobOperation stores content as a blob called name.
type WriteBlobOperation struct {
content []byte
name string
// replace allows the operation to proceed when the name already exists.
replace bool
space SpaceMeta
// transaction receives the manifest op produced by Execute.
transaction *ConcurrentWriteTransaction
}
// Execute writes the blob content to a new blob file and adds an AddBlobOp
// for it to the transaction's commit.
//
// It returns errors.ErrBlobAlreadyExist when replace is false and the
// manifest already has a blob with that name. The blob file is closed on
// every path; previously a write error or a short write left it open.
func (o *WriteBlobOperation) Execute() error {
	if !o.replace && o.space.Manifest().HasBlob(o.name) {
		return errors.ErrBlobAlreadyExist
	}

	blobFile := utils.GetBlobFilePath(o.space.Path())
	f, err := o.space.Fs().OpenFile(blobFile)
	if err != nil {
		return err
	}

	n, err := f.Write(o.content)
	if err != nil {
		// Best effort: the write error is the one worth reporting.
		_ = f.Close()
		return err
	}
	if n != len(o.content) {
		_ = f.Close()
		return fmt.Errorf("blob not written completely, written %d but expect %d", n, len(o.content))
	}

	if err = f.Close(); err != nil {
		return err
	}

	op := manifest.AddBlobOp{
		Replace: o.replace,
		Blob: blob.Blob{
			Name: o.name,
			Size: int64(len(o.content)),
			File: blobFile,
		},
	}
	o.transaction.commit.AddOp(op)
	return nil
}

131
pkg/proto/storagev2.proto Normal file
View File

@ -0,0 +1,131 @@
syntax = "proto3";
package milvus.proto.storagev2;
option go_package = "github.com/milvus-io/milvus/pkg/proto/storagev2pb";
// LogicType identifies the logical type of a field. Numeric values are not
// contiguous: the commented-out entries reserve the numbers of types that are
// not supported yet.
// NOTE(review): the numbering appears to mirror Apache Arrow's type ids —
// confirm before adding or renumbering entries.
enum LogicType {
NA = 0;
BOOL = 1;
UINT8 = 2;
INT8 = 3;
UINT16 = 4;
INT16 = 5;
UINT32 = 6;
INT32 = 7;
UINT64 = 8;
INT64 = 9;
HALF_FLOAT = 10;
FLOAT = 11;
DOUBLE = 12;
STRING = 13;
BINARY = 14;
FIXED_SIZE_BINARY = 15;
// DATE32 = 16;
// DATE64 = 17;
// TIMESTAMP = 18;
// TIME32 = 19;
// TIME64 = 20;
// INTERVAL_MONTHS = 21;
// INTERVAL_DAY_TIME = 22;
// DECIMAL128 = 23;
// option allow_alias = true;
// DECIMAL = 23; // DECIMAL==DECIMAL128
// DECIMAL256 = 24;
LIST = 25;
STRUCT = 26;
// SPARSE_UNION = 27;
// DENSE_UNION = 28;
DICTIONARY = 29;
MAP = 30;
// EXTENSION = 31;
FIXED_SIZE_LIST = 32;
// DURATION = 33;
// LARGE_STRING = 34;
// LARGE_BINARY = 35;
// LARGE_LIST = 36;
// INTERVAL_MONTH_DAY_NANO = 37;
// RUN_END_ENCODED = 38;
MAX_ID = 39;
}
// Endianness is the byte order recorded on an ArrowSchema.
// Little is the zero value, so it is what an unset field reads as.
enum Endianness {
Little = 0;
Big = 1;
}
// FixedSizeBinaryType carries the extra parameter of a FIXED_SIZE_BINARY
// DataType: the width of each value (presumably in bytes, per the field name).
message FixedSizeBinaryType { int32 byte_width = 1; }
// FixedSizeListType carries the extra parameter of a FIXED_SIZE_LIST
// DataType: the number of elements in each list.
message FixedSizeListType { int32 list_size = 1; }
// DictionaryType carries the extra parameters of a DICTIONARY DataType.
message DictionaryType {
// Type of the dictionary indices.
DataType index_type = 1;
// Type of the dictionary values.
DataType value_type = 2;
// Whether the dictionary is flagged as ordered.
bool ordered = 3;
}
// MapType carries the extra parameter of a MAP DataType: whether the keys are
// flagged as sorted.
message MapType { bool keys_sorted = 1; }
// DataType describes the type of a Field: a logical type id, optional
// type-specific parameters, and child fields.
message DataType {
// At most one of these is set; each holds the parameters for one specific
// kind of type.
oneof type_related_values {
FixedSizeBinaryType fixed_size_binary_type = 1;
FixedSizeListType fixed_size_list_type = 2;
DictionaryType dictionary_type = 3;
MapType map_type = 4;
}
// Fields shared by every type are numbered from 100, leaving the low numbers
// to the oneof members.
LogicType logic_type = 100;
// Child fields; presumably used by nested types such as LIST or STRUCT -
// confirm against the schema conversion code.
repeated Field children = 101;
}
// KeyValueMetadata stores string metadata as two lists.
// NOTE(review): presumably keys[i] pairs with values[i], so both lists are
// expected to have the same length - confirm with the reader/writer code.
message KeyValueMetadata {
repeated string keys = 1;
repeated string values = 2;
}
// Field describes one named column or nested child: its name, nullability,
// data type, and optional key/value metadata.
message Field {
string name = 1;
bool nullable = 2;
DataType data_type = 3;
KeyValueMetadata metadata = 4;
}
// SchemaOptions names the columns that play a special role in a Schema.
// NOTE(review): the values are presumably names of fields in the accompanying
// ArrowSchema - confirm.
message SchemaOptions {
string primary_column = 1;
string version_column = 2;
string vector_column = 3;
}
// ArrowSchema is the serialized form of a schema: its top-level fields, byte
// order, and schema-level key/value metadata.
message ArrowSchema {
repeated Field fields = 1;
Endianness endianness = 2;
KeyValueMetadata metadata = 3;
}
// Schema pairs the column layout (ArrowSchema) with the options that name its
// special columns (SchemaOptions).
message Schema {
ArrowSchema arrow_schema = 1;
SchemaOptions schema_options = 2;
}
// Options holds the settings stored in a Manifest; currently only a URI.
// NOTE(review): presumably the storage location of the space - confirm.
message Options { string uri = 1; }
// Manifest is the versioned metadata record: the options and schema plus the
// lists of scalar, vector and delete fragments and the registered blobs.
message Manifest {
int64 version = 1;
Options options = 2;
Schema schema = 3;
repeated Fragment scalar_fragments = 4;
repeated Fragment vector_fragments = 5;
repeated Fragment delete_fragments = 6;
repeated Blob blobs = 7;
}
// Fragment is an identified group of files referenced from a Manifest.
message Fragment {
int64 id = 1;
// Paths of the files that make up this fragment.
repeated string files = 2;
}
// Blob is a named blob entry referenced from a Manifest.
message Blob {
string name = 1;
// Size of the blob; presumably the content length in bytes - confirm.
int64 size = 2;
// Path of the file holding the blob content.
string file = 3;
}

File diff suppressed because it is too large Load Diff

View File

@ -101,6 +101,7 @@ USE_ASAN="OFF"
USE_DYNAMIC_SIMD="ON"
USE_OPENDAL="OFF"
INDEX_ENGINE="KNOWHERE"
ENABLE_AZURE_FS="OFF"
: "${ENABLE_GCP_NATIVE:="OFF"}"
while getopts "p:d:t:s:f:n:i:y:a:x:o:ulrcghzmebZ" arg; do
@ -257,7 +258,8 @@ ${CMAKE_EXTRA_ARGS} \
-DCPU_ARCH=${CPU_ARCH} \
-DUSE_OPENDAL=${USE_OPENDAL} \
-DINDEX_ENGINE=${INDEX_ENGINE} \
-DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} "
-DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} \
-DENABLE_AZURE_FS=${ENABLE_AZURE_FS} "
if [ -z "$BUILD_WITHOUT_AZURE" ]; then
CMAKE_CMD=${CMAKE_CMD}"-DAZURE_BUILD_DIR=${AZURE_BUILD_DIR} \
-DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} "

View File

@ -62,10 +62,12 @@ mkdir -p ./planpb
mkdir -p ./workerpb
mkdir -p ./messagespb
mkdir -p ./streamingpb
mkdir -p ./storagev2pb
mkdir -p $ROOT_DIR/cmd/tools/migration/legacy/legacypb
protoc_opt="${PROTOC_BIN} --proto_path=${API_PROTO_DIR} --proto_path=."
${protoc_opt} --go_out=paths=source_relative:./storagev2pb --go-grpc_out=require_unimplemented_servers=false,paths=source_relative:./storagev2pb storagev2.proto || { echo 'generate storagev2.proto failed'; exit 1; }
${protoc_opt} --go_out=paths=source_relative:./etcdpb --go-grpc_out=require_unimplemented_servers=false,paths=source_relative:./etcdpb etcd_meta.proto || { echo 'generate etcd_meta.proto failed'; exit 1; }
${protoc_opt} --go_out=paths=source_relative:./indexcgopb --go-grpc_out=require_unimplemented_servers=false,paths=source_relative:./indexcgopb index_cgo_msg.proto || { echo 'generate index_cgo_msg failed '; exit 1; }
${protoc_opt} --go_out=paths=source_relative:./cgopb --go-grpc_out=require_unimplemented_servers=false,paths=source_relative:./cgopb cgo_msg.proto || { echo 'generate cgo_msg failed '; exit 1; }