Support google cloud for C++ chunk manager (#21100) (#22449)

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
pull/22474/head
zhagnlu 2023-02-28 14:17:47 +08:00 committed by GitHub
parent 4b521bef3f
commit 0ebaf51380
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 352 additions and 19 deletions

View File

@ -51,7 +51,7 @@ add_library(milvus_storage SHARED ${STORAGE_FILES})
find_package(Boost REQUIRED COMPONENTS filesystem)
if(BUILD_DISK_ANN STREQUAL "ON")
target_link_libraries(milvus_storage PUBLIC milvus_common Boost::filesystem aws-cpp-sdk-s3 pthread)
target_link_libraries(milvus_storage PUBLIC milvus_common Boost::filesystem aws-cpp-sdk-s3 google_cloud_cpp_storage google_cloud_cpp_common google_cloud_cpp_rest_internal pthread)
else()
target_link_libraries(milvus_storage PUBLIC milvus_common Boost::filesystem pthread)
endif()

View File

@ -69,10 +69,18 @@ ConvertFromAwsString(const Aws::String& aws_str) {
}
/**
 * @brief Initialise the AWS SDK; only the call that observes init_count_ == 0 invokes Aws::InitAPI.
 *
 * Takes client_mutex_, post-increments init_count_, and runs Aws::InitAPI(sdk_options_) only when the
 * previous count was zero.
 *
 * @param type Storage backend being initialised. For STORAGE_GOOGLE_CLOUD the HTTP client factory in
 *             sdk_options_ is replaced with one that creates a GoogleHttpClientFactory before InitAPI runs.
 *
 * NOTE(review): the Google factory is installed only inside the "first call" branch, so a Google Cloud
 * manager constructed after the SDK was already initialised by another manager does not appear to get
 * it installed — confirm whether mixed S3/Google usage in one process is expected.
 */
void
MinioChunkManager::InitSDKAPI() {
MinioChunkManager::InitSDKAPI(RemoteStorageType type) {
std::scoped_lock lock{client_mutex_};
// Post-increment: initCount is the number of InitSDKAPI calls made before this one.
const size_t initCount = init_count_++;
if (initCount == 0) {
if (type == STORAGE_GOOGLE_CLOUD) {
// Have the SDK build its HTTP clients/requests through GoogleHttpClientFactory, which is handed the
// credentials object created below.
sdk_options_.httpOptions.httpClientFactory_create_fn = []() {
// auto credentials = google::cloud::oauth2_internal::GOOGLE_CLOUD_CPP_NS::GoogleDefaultCredentials();
// NOTE(review): ComputeEngineCredentials presumably only works on GCE-hosted machines — confirm.
auto credentials =
std::make_shared<google::cloud::oauth2_internal::GOOGLE_CLOUD_CPP_NS::ComputeEngineCredentials>();
return Aws::MakeShared<GoogleHttpClientFactory>(GOOGLE_CLIENT_FACTORY_ALLOCATION_TAG, credentials);
};
}
Aws::InitAPI(sdk_options_);
}
}
@ -86,20 +94,8 @@ MinioChunkManager::ShutdownSDKAPI() {
}
}
MinioChunkManager::MinioChunkManager(const StorageConfig& storage_config)
: default_bucket_name_(storage_config.bucket_name) {
InitSDKAPI();
Aws::Client::ClientConfiguration config;
config.endpointOverride = ConvertToAwsString(storage_config.address);
if (storage_config.useSSL) {
config.scheme = Aws::Http::Scheme::HTTPS;
config.verifySSL = true;
} else {
config.scheme = Aws::Http::Scheme::HTTP;
config.verifySSL = false;
}
void
MinioChunkManager::BuildS3Client(const StorageConfig& storage_config, const Aws::Client::ClientConfiguration& config) {
if (storage_config.useIAM) {
auto provider = std::make_shared<Aws::Auth::DefaultAWSCredentialsProviderChain>();
auto aws_credentials = provider->GetAWSCredentials();
@ -118,6 +114,46 @@ MinioChunkManager::MinioChunkManager(const StorageConfig& storage_config)
ConvertToAwsString(storage_config.access_key_value)),
config, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, false);
}
}
/**
 * @brief Create the client used to talk to Google Cloud Storage.
 *
 * Only IAM mode is supported: when storage_config.useIAM is false a std::runtime_error is thrown.
 *
 * @param storage_config Storage settings; only useIAM is consulted here.
 * @param config         AWS client configuration forwarded to the S3 client.
 * @throws std::runtime_error if storage_config.useIAM is false.
 */
void
MinioChunkManager::BuildGoogleCloudClient(const StorageConfig& storage_config,
                                          const Aws::Client::ClientConfiguration& config) {
    // Reject non-IAM configurations up front.
    if (!storage_config.useIAM) {
        throw std::runtime_error("google cloud only support iam mode now");
    }

    // An S3 client is used rather than a Google client because the protocol is compatible.
    const auto signingPolicy = Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never;
    client_ = std::make_shared<Aws::S3::S3Client>(config, signingPolicy, false);
}
MinioChunkManager::MinioChunkManager(const StorageConfig& storage_config)
: default_bucket_name_(storage_config.bucket_name) {
RemoteStorageType storageType;
if (storage_config.address.find("google") != std::string::npos) {
storageType = STORAGE_GOOGLE_CLOUD;
} else {
storageType = STORAGE_S3;
}
InitSDKAPI(storageType);
Aws::Client::ClientConfiguration config;
config.endpointOverride = ConvertToAwsString(storage_config.address);
if (storage_config.useSSL) {
config.scheme = Aws::Http::Scheme::HTTPS;
config.verifySSL = true;
} else {
config.scheme = Aws::Http::Scheme::HTTP;
config.verifySSL = false;
}
if (storageType == STORAGE_S3) {
BuildS3Client(storage_config, config);
} else if (storageType == STORAGE_GOOGLE_CLOUD) {
BuildGoogleCloudClient(storage_config, config);
}
// TODO: BucketExists and CreateBucket do not work here and should be fixed.
// The index node has already tried to create the bucket, if it does not exist, when it receives an index task.

View File

@ -17,7 +17,19 @@
#pragma once
#include <aws/core/Aws.h>
#include <aws/core/http/HttpClientFactory.h>
#include <aws/core/http/HttpRequest.h>
#include <aws/core/http/HttpTypes.h>
#include <aws/core/http/URI.h>
#include <aws/core/http/curl/CurlHttpClient.h>
#include <aws/core/http/standard/StandardHttpRequest.h>
#include <aws/s3/S3Client.h>
#include <google/cloud/credentials.h>
#include <google/cloud/internal/oauth2_credentials.h>
#include <google/cloud/internal/oauth2_google_credentials.h>
#include <google/cloud/storage/oauth2/compute_engine_credentials.h>
#include <google/cloud/storage/oauth2/google_credentials.h>
#include <google/cloud/status_or.h>
#include <map>
#include <memory>
#include <string>
#include <utility>
@ -30,6 +42,9 @@
namespace milvus::storage {
/**
 * @brief Remote object-storage backend selected for a chunk manager.
 */
enum RemoteStorageType {
    STORAGE_S3 = 0,            // S3 client built from the storage config
    STORAGE_GOOGLE_CLOUD = 1,  // Google Cloud Storage
};
typedef enum RemoteStorageType RemoteStorageType;
/**
* @brief MinioChunkManager is responsible for reading and writing files in S3.
*/
@ -113,12 +128,16 @@ class MinioChunkManager : public RemoteChunkManager {
std::vector<std::string>
ListObjects(const char* bucket_name, const char* prefix = NULL);
void
InitSDKAPI();
InitSDKAPI(RemoteStorageType type);
void
ShutdownSDKAPI();
void
BuildS3Client(const StorageConfig& storage_config, const Aws::Client::ClientConfiguration& config);
void
BuildGoogleCloudClient(const StorageConfig& storage_config, const Aws::Client::ClientConfiguration& config);
private:
const Aws::SDKOptions sdk_options_;
Aws::SDKOptions sdk_options_;
static std::atomic<size_t> init_count_;
static std::mutex client_mutex_;
std::shared_ptr<Aws::S3::S3Client> client_;
@ -127,4 +146,50 @@ class MinioChunkManager : public RemoteChunkManager {
using MinioChunkManagerPtr = std::unique_ptr<MinioChunkManager>;
// Allocation tag passed to Aws::MakeShared for GoogleHttpClientFactory and the objects it creates.
static const char* GOOGLE_CLIENT_FACTORY_ALLOCATION_TAG = "GoogleHttpClientFactory";
/**
 * @brief AWS SDK HTTP client factory that stamps a Google authorization header on every request it creates.
 *
 * HTTP clients are plain Aws::Http::CurlHttpClient instances. Requests are StandardHttpRequest objects
 * to which the header returned by Credentials::AuthorizationHeader() is added.
 */
class GoogleHttpClientFactory : public Aws::Http::HttpClientFactory {
 public:
    /**
     * @param credentials Credentials used to obtain the authorization header for each request.
     */
    explicit GoogleHttpClientFactory(std::shared_ptr<google::cloud::oauth2_internal::Credentials> credentials)
        : credentials_(std::move(credentials)) {
    }

    /**
     * @brief Replace the credentials used for subsequently created requests.
     */
    void
    SetCredentials(std::shared_ptr<google::cloud::oauth2_internal::Credentials> credentials) {
        // The parameter is already a private copy; move it instead of bumping the refcount again.
        credentials_ = std::move(credentials);
    }

    /**
     * @brief Create a curl-based HTTP client for the given client configuration.
     */
    std::shared_ptr<Aws::Http::HttpClient>
    CreateHttpClient(const Aws::Client::ClientConfiguration& clientConfiguration) const override {
        return Aws::MakeShared<Aws::Http::CurlHttpClient>(GOOGLE_CLIENT_FACTORY_ALLOCATION_TAG, clientConfiguration);
    }

    /**
     * @brief String-URI overload; forwards to the Aws::Http::URI overload.
     */
    std::shared_ptr<Aws::Http::HttpRequest>
    CreateHttpRequest(const Aws::String& uri,
                      Aws::Http::HttpMethod method,
                      const Aws::IOStreamFactory& streamFactory) const override {
        return CreateHttpRequest(Aws::Http::URI(uri), method, streamFactory);
    }

    /**
     * @brief Create a request carrying the authorization header obtained from the credentials.
     *
     * @throws std::runtime_error if no credentials are set or the authorization header cannot be obtained.
     */
    std::shared_ptr<Aws::Http::HttpRequest>
    CreateHttpRequest(const Aws::Http::URI& uri,
                      Aws::Http::HttpMethod method,
                      const Aws::IOStreamFactory& streamFactory) const override {
        // Fail with a clear error instead of dereferencing a null pointer.
        if (!credentials_) {
            throw std::runtime_error("get authorization failed, credentials not set");
        }

        auto request = Aws::MakeShared<Aws::Http::Standard::StandardHttpRequest>(GOOGLE_CLIENT_FACTORY_ALLOCATION_TAG,
                                                                                 uri, method);
        request->SetResponseStreamFactory(streamFactory);

        auto auth_header = credentials_->AuthorizationHeader();
        if (!auth_header.ok()) {
            throw std::runtime_error("get authorization failed, errcode:" +
                                     StatusCodeToString(auth_header.status().code()));
        }
        // auth_header holds a (header name, header value) pair.
        request->SetHeaderValue(auth_header->first.c_str(), auth_header->second.c_str());
        return request;
    }

 private:
    std::shared_ptr<google::cloud::oauth2_internal::Credentials> credentials_;
};
} // namespace milvus::storage

View File

@ -69,6 +69,11 @@ if ( LINUX AND BUILD_DISK_ANN STREQUAL "ON" )
add_subdirectory( aws_sdk )
endif()
# ******************************* Thirdparty google cloud sdk ********************************
if ( LINUX AND BUILD_DISK_ANN STREQUAL "ON" )
add_subdirectory( google_cloud_sdk )
endif()
# ******************************* Thirdparty marisa ********************************
# TODO: support win.
if ( LINUX OR APPLE)

View File

@ -36,7 +36,8 @@ macro(build_aws_sdk_s3)
# BINARY_DIR aws-s3-bin
PREFIX ${CMAKE_BINARY_DIR}/3rdparty_download/aws-sdk-subbuild
BUILD_IN_SOURCE 1
#PATCH_COMMAND sh prefetch_crt_dependency.sh
# PATCH_COMMAND sh prefetch_crt_dependency.sh
LIST_SEPARATOR "|"
BUILD_COMMAND ${AWS_SDK_BUILD_COMMAND}
INSTALL_COMMAND ${AWS_SDK_INSTALL_COMMAND}

View File

@ -0,0 +1,142 @@
# -------------------------------------------------------------------------------
# Copyright (C) 2019-2020 Zilliz. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under the License.
# -------------------------------------------------------------------------------
# Upstream versions (used as git tags) for the Google Cloud C++ SDK and the dependencies built with it.
set(GOOGLE_SDK_VERSION "v2.5.0")
set(GOOGLE_CRC32_VERSION "1.1.2")
# NOTE(review): GOOGLE_GRPC_VERSION is not referenced by build_google_sdk_s3 below — confirm it is still needed.
set(GOOGLE_GRPC_VERSION "v1.50.1")
set(GOOGLE_ABSEIL_VERSION "20220623.1")
# Builds the Google Cloud C++ storage SDK from source via ExternalProject, together with the
# dependencies it is built against here (abseil, crc32c, nlohmann/json), and exposes the resulting
# shared libraries as the imported targets google_cloud_cpp_storage, google_cloud_cpp_common and
# google_cloud_cpp_rest_internal. Everything is installed into ${CMAKE_INSTALL_PREFIX}.
macro(build_google_sdk_s3)
    # ---------------------------------- abseil ----------------------------------
    message(STATUS "Building GOOGLE_ABSEIL-${GOOGLE_ABSEIL_VERSION} from source")

    set(GOOGLE_ABSEIL_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
    set(GOOGLE_ABSEIL_BUILD_COMMAND make -j $(nproc))
    set(GOOGLE_ABSEIL_INSTALL_COMMAND make install)
    # Rewrites every "#define ABSL_OPTION_USE_<X> 2" in absl/base/options.h to "... 0".
    # NOTE(review): presumably this forces Abseil's own types instead of auto-detected std:: ones — confirm.
    set(GOOGLE_PATCH_COMMAND sed -i "s/^#define ABSL_OPTION_USE_\\(.*\\) 2/#define ABSL_OPTION_USE_\\1 0/" "absl/base/options.h")

    set(GOOGLE_ABSEIL_CMAKE_ARGS
        "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
        "-DABSL_BUILD_TESTING=OFF"
        "-DBUILD_SHARED_LIBS=yes")

    ExternalProject_Add(google_abseil_ep
        GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git
        GIT_TAG ${GOOGLE_ABSEIL_VERSION}
        PREFIX ${CMAKE_BINARY_DIR}/3rdparty_download/google-abseil-subbuild
        PATCH_COMMAND ${GOOGLE_PATCH_COMMAND}
        BUILD_COMMAND ${GOOGLE_ABSEIL_BUILD_COMMAND}
        INSTALL_COMMAND ${GOOGLE_ABSEIL_INSTALL_COMMAND}
        CMAKE_ARGS ${GOOGLE_ABSEIL_CMAKE_ARGS}
        )

    # ---------------------------------- crc32c ----------------------------------
    message(STATUS "Building GOOGLE_CRC32-${GOOGLE_CRC32_VERSION} from source")

    set(GOOGLE_CRC32_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
    set(GOOGLE_CRC32_BUILD_COMMAND make -j $(nproc))
    set(GOOGLE_CRC32_INSTALL_COMMAND make install)

    set(GOOGLE_CRC32_CMAKE_ARGS
        "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
        "-DBUILD_SHARED_LIBS=yes"
        "-DCRC32C_BUILD_TESTS=OFF"
        "-DCRC32C_BUILD_BENCHMARKS=OFF"
        "-DCRC32C_USE_GLOG=OFF")

    ExternalProject_Add(google_crc32_ep
        GIT_REPOSITORY https://github.com/google/crc32c.git
        GIT_TAG ${GOOGLE_CRC32_VERSION}
        PREFIX ${CMAKE_BINARY_DIR}/3rdparty_download/google-crc32-subbuild
        BUILD_COMMAND ${GOOGLE_CRC32_BUILD_COMMAND}
        INSTALL_COMMAND ${GOOGLE_CRC32_INSTALL_COMMAND}
        CMAKE_ARGS ${GOOGLE_CRC32_CMAKE_ARGS}
        )

    # ------------------------------ nlohmann/json -------------------------------
    # GOOGLE_JSON_VERSION was never defined, which left GIT_TAG empty (an unpinned checkout) and printed
    # an empty version in the status message. Pin a release tag unless the caller already chose one.
    if(NOT GOOGLE_JSON_VERSION)
        set(GOOGLE_JSON_VERSION "v3.11.2")
    endif()
    message(STATUS "Building GOOGLE_JSON-${GOOGLE_JSON_VERSION} from source")

    set(GOOGLE_JSON_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
    set(GOOGLE_JSON_BUILD_COMMAND make -j $(nproc))
    set(GOOGLE_JSON_INSTALL_COMMAND make install)

    set(GOOGLE_JSON_CMAKE_ARGS
        "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
        "-DBUILD_SHARED_LIBS=yes"
        "-DBUILD_TESTING=OFF"
        "-DJSON_BuildTests=OFF")

    ExternalProject_Add(google_json_ep
        GIT_REPOSITORY https://github.com/nlohmann/json.git
        GIT_TAG ${GOOGLE_JSON_VERSION}
        PREFIX ${CMAKE_BINARY_DIR}/3rdparty_download/google-json-subbuild
        BUILD_COMMAND ${GOOGLE_JSON_BUILD_COMMAND}
        INSTALL_COMMAND ${GOOGLE_JSON_INSTALL_COMMAND}
        CMAKE_ARGS ${GOOGLE_JSON_CMAKE_ARGS}
        )

    # ------------------------------ google-cloud-cpp ----------------------------
    message(STATUS "Building GOOGLE_SDK-${GOOGLE_SDK_VERSION} from source")

    set(GOOGLE_SDK_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
    set(GOOGLE_SDK_BUILD_COMMAND make)
    set(GOOGLE_SDK_INSTALL_COMMAND make install)

    # Only the storage component is enabled; tests and examples are off.
    set(GOOGLE_SDK_CMAKE_ARGS
        "-DCMAKE_BUILD_TYPE=Release"
        "-DBUILD_DEPS=OFF"
        "-DBUILD_TESTING=OFF"
        "-DBUILD_SHARED_LIBS=ON"
        "-DGOOGLE_CLOUD_CPP_ENABLE=storage"
        "-DGOOGLE_CLOUD_CPP_ENABLE_EXAMPLES=OFF"
        "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}")

    ExternalProject_Add(google_sdk_ep
        GIT_REPOSITORY https://github.com/googleapis/google-cloud-cpp.git
        GIT_TAG ${GOOGLE_SDK_VERSION}
        PREFIX ${CMAKE_BINARY_DIR}/3rdparty_download/google-sdk-subbuild
        BUILD_IN_SOURCE 1
        BUILD_COMMAND ${GOOGLE_SDK_BUILD_COMMAND}
        INSTALL_COMMAND ${GOOGLE_SDK_INSTALL_COMMAND}
        CMAKE_ARGS ${GOOGLE_SDK_CMAKE_ARGS}
        )

    # The SDK must be configured after its dependencies have been built and installed.
    add_dependencies(google_sdk_ep google_abseil_ep)
    add_dependencies(google_sdk_ep google_json_ep)
    add_dependencies(google_sdk_ep google_crc32_ep)

    # --------------------------- imported library targets -----------------------
    add_library(google_cloud_cpp_storage SHARED IMPORTED)
    set_target_properties(google_cloud_cpp_storage
        PROPERTIES
        IMPORTED_GLOBAL TRUE
        IMPORTED_LOCATION ${GOOGLE_SDK_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}google_cloud_cpp_storage${CMAKE_SHARED_LIBRARY_SUFFIX}
        INTERFACE_INCLUDE_DIRECTORIES ${GOOGLE_SDK_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
    add_dependencies(google_cloud_cpp_storage google_sdk_ep)

    add_library(google_cloud_cpp_common SHARED IMPORTED)
    set_target_properties(google_cloud_cpp_common
        PROPERTIES
        IMPORTED_GLOBAL TRUE
        IMPORTED_LOCATION ${GOOGLE_SDK_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}google_cloud_cpp_common${CMAKE_SHARED_LIBRARY_SUFFIX}
        INTERFACE_INCLUDE_DIRECTORIES ${GOOGLE_SDK_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
    add_dependencies(google_cloud_cpp_common google_sdk_ep)

    add_library(google_cloud_cpp_rest_internal SHARED IMPORTED)
    set_target_properties(google_cloud_cpp_rest_internal
        PROPERTIES
        IMPORTED_GLOBAL TRUE
        IMPORTED_LOCATION ${GOOGLE_SDK_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}google_cloud_cpp_rest_internal${CMAKE_SHARED_LIBRARY_SUFFIX}
        INTERFACE_INCLUDE_DIRECTORIES ${GOOGLE_SDK_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
    add_dependencies(google_cloud_cpp_rest_internal google_sdk_ep)
endmacro()
build_google_sdk_s3()

View File

@ -37,6 +37,36 @@ class MinioChunkManagerTest : public testing::Test {
MinioChunkManagerPtr chunk_manager_;
};
/**
 * Build the StorageConfig used by the Google Cloud Storage tests: the public GCS endpoint over SSL,
 * IAM enabled, empty access key/value, and the "gcp-zilliz-infra-test" bucket.
 */
StorageConfig
get_google_cloud_storage_config() {
    return StorageConfig{
        "storage.googleapis.com:443",  // endpoint
        "gcp-zilliz-infra-test",       // bucket name
        "",                            // access key
        "",                            // access value
        "files",                       // root path
        "minio",                       // NOTE(review): presumably the storage-type field — confirm
        "",                            // IAM endpoint
        true,                          // use SSL
        true,                          // use IAM
    };
}
class GoogleChunkManagerTest : public testing::Test {
public:
GoogleChunkManagerTest() {
}
~GoogleChunkManagerTest() {
}
virtual void
SetUp() {
chunk_manager_ = std::make_unique<MinioChunkManager>(get_google_cloud_storage_config());
}
protected:
MinioChunkManagerPtr chunk_manager_;
};
TEST_F(MinioChunkManagerTest, BucketPositive) {
string testBucketName = "test-bucket";
chunk_manager_->SetBucketName(testBucketName);
@ -218,3 +248,52 @@ TEST_F(MinioChunkManagerTest, ListWithPrefixPositive) {
chunk_manager_->Remove(path3);
chunk_manager_->DeleteBucket(testBucketName);
}
// Writes a 5-byte object, checks Exist/Size, reads it back fully and partially, overwrites it with a
// payload containing a zero byte, reads that back, then removes the object.
TEST_F(GoogleChunkManagerTest, ReadPositive) {
    const string bucket = "gcp-zilliz-infra-test";
    chunk_manager_->SetBucketName(bucket);
    EXPECT_EQ(chunk_manager_->GetBucketName(), bucket);

    // The bucket is not created here; the BucketExists/CreateBucket calls are left disabled.

    const string path = "1/4/6";
    uint8_t payload[5] = {0x17, 0x32, 0x45, 0x34, 0x23};
    chunk_manager_->Write(path, payload, sizeof(payload));

    bool found = chunk_manager_->Exist(path);
    EXPECT_EQ(found, true);
    auto objectSize = chunk_manager_->Size(path);
    EXPECT_EQ(objectSize, 5);

    // Full read into a larger buffer returns exactly the stored bytes.
    uint8_t buffer[20] = {0};
    auto bytesRead = chunk_manager_->Read(path, buffer, 20);
    EXPECT_EQ(bytesRead, 5);
    for (size_t i = 0; i < 5; ++i) {
        EXPECT_EQ(buffer[i], payload[i]);
    }

    // Partial read returns only the requested prefix.
    bytesRead = chunk_manager_->Read(path, buffer, 3);
    EXPECT_EQ(bytesRead, 3);
    for (size_t i = 0; i < 3; ++i) {
        EXPECT_EQ(buffer[i], payload[i]);
    }

    // Overwrite with a payload that has an embedded zero byte; it must round-trip unchanged.
    uint8_t payloadWithNull[] = {0x17, 0x32, 0x00, 0x34, 0x23};
    chunk_manager_->Write(path, payloadWithNull, sizeof(payloadWithNull));

    found = chunk_manager_->Exist(path);
    EXPECT_EQ(found, true);
    objectSize = chunk_manager_->Size(path);
    EXPECT_EQ(objectSize, 5);

    bytesRead = chunk_manager_->Read(path, buffer, 20);
    EXPECT_EQ(bytesRead, 5);
    for (size_t i = 0; i < 5; ++i) {
        EXPECT_EQ(buffer[i], payloadWithNull[i]);
    }

    chunk_manager_->Remove(path);
}

View File

@ -29,6 +29,7 @@ ROOT_DIR="$( cd -P "$( dirname "$SOURCE" )/.." && pwd )"
MILVUS_CORE_DIR="${ROOT_DIR}/internal/core/"
MILVUS_CORE_UNITTEST_DIR="${MILVUS_CORE_DIR}/output/unittest/"
MILVUS_CORE_LIB_DIR="${MILVUS_CORE_DIR}/output/lib/"
echo "ROOT_DIR = ${ROOT_DIR}"
echo "MILVUS_CORE_DIR = ${MILVUS_CORE_DIR}"
@ -58,6 +59,10 @@ if [ $? -ne 0 ]; then
exit -1
fi
if [ -d "${MILVUS_CORE_LIB_DIR}" ]; then
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MILVUS_CORE_LIB_DIR}
fi
# run unittest
for test in `ls ${MILVUS_CORE_UNITTEST_DIR}`; do
echo "Running cpp unittest: ${MILVUS_CORE_UNITTEST_DIR}/$test"