Server down during loading data (#2807)

* Server down during loading data

Signed-off-by: fishpenguin <kun.yu@zilliz.com>

* Fix test_config.cpp

Signed-off-by: fishpenguin <kun.yu@zilliz.com>

* ci retry

Signed-off-by: fishpenguin <kun.yu@zilliz.com>

* Change limit_in_bytes init value to max

Signed-off-by: fishpenguin <kun.yu@zilliz.com>

Co-authored-by: Jin Hai <hai.jin@zilliz.com>
pull/2793/head^2
yukun 2020-07-14 00:40:20 +08:00 committed by GitHub
parent 585c8ea018
commit a474ff269a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 106 additions and 64 deletions

View File

@ -20,6 +20,7 @@ Please mark all change in change log and use the issue from GitHub
- \#2752 Milvus formats vectors data to double-precision and return to http client
- \#2767 Fix a bug of getting wrong nprobe limitation in knowhere on GPU version
- \#2768 After building the index, the number of vectors increases
- \#2774 Server down during loading data
- \#2776 Fix too many data copies during creating IVF index
- \#2813 To implement RNSG IP

View File

@ -16,6 +16,7 @@
#include <chrono>
#include <fstream>
#include <iostream>
#include <limits>
#include <regex>
#include <string>
#include <thread>
@ -1333,6 +1334,15 @@ Config::CheckCacheConfigCpuCacheCapacity(const std::string& value) {
int64_t total_mem = 0, free_mem = 0;
CommonUtil::GetSystemMemInfo(total_mem, free_mem);
int64_t cgroup_limit_mem = std::numeric_limits<int64_t>::max();
CommonUtil::GetSysCgroupMemLimit(cgroup_limit_mem);
if (cgroup_limit_mem < total_mem && cache_size >= cgroup_limit_mem) {
std::string msg = "Invalid cpu cache size: " + value +
". Possible reason: cache.cache_size exceeds system cgroup memory.";
return Status{SERVER_INVALID_ARGUMENT, msg};
}
if (cache_size >= total_mem) {
std::string msg =
"Invalid cpu cache size: " + value + ". Possible reason: cache.cache_size exceeds system memory.";

View File

@ -53,6 +53,19 @@ CommonUtil::GetSystemMemInfo(int64_t& total_mem, int64_t& free_mem) {
return ret == 0; // succeed 0, failed -1
}
/**
 * Read the memory limit from /sys/fs/cgroup/memory/memory.limit_in_bytes.
 *
 * @param limit_in_bytes  Out-parameter. Overwritten with the parsed value only
 *                        when the file was opened and an integer was read from
 *                        it; on any failure it keeps the value the caller
 *                        passed in, so a caller-supplied default survives.
 * @return true if a limit was read, false otherwise.
 */
bool
CommonUtil::GetSysCgroupMemLimit(int64_t& limit_in_bytes) {
    try {
        std::ifstream file("/sys/fs/cgroup/memory/memory.limit_in_bytes");

        // Parse into a temporary: a failed numeric extraction on an opened
        // stream stores 0 in its target (C++11 onward), which would otherwise
        // be indistinguishable from a real 0-byte limit in the caller's
        // variable.
        int64_t parsed_limit = 0;
        if (!(file >> parsed_limit)) {
            // std::ifstream does not throw by default, so a missing or
            // unparsable file is reported here via the stream state rather
            // than in the catch block below. Not logged: the file is
            // legitimately absent on hosts without this cgroup hierarchy.
            return false;
        }

        limit_in_bytes = parsed_limit;
        return true;
    } catch (const std::exception& ex) {
        std::string msg =
            "Failed to read /sys/fs/cgroup/memory/memory.limit_in_bytes, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
        return false;
    }
}
bool
CommonUtil::GetSystemAvailableThreads(int64_t& thread_count) {
// threadCnt = std::thread::hardware_concurrency();

View File

@ -24,6 +24,8 @@ class CommonUtil {
static bool
GetSystemMemInfo(int64_t& total_mem, int64_t& free_mem);
static bool
GetSysCgroupMemLimit(int64_t& limit_in_bytes);
static bool
GetSystemAvailableThreads(int64_t& thread_count);
static bool

View File

@ -294,8 +294,15 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
// #2564
int64_t total_mem = 0, free_mem = 0;
milvus::server::CommonUtil::GetSystemMemInfo(total_mem, free_mem);
int64_t cgroup_limit_size = 0;
milvus::server::CommonUtil::GetSysCgroupMemLimit(cgroup_limit_size);
ASSERT_TRUE(config.SetCacheConfigInsertBufferSize("1GB").ok());
int64_t cache_cpu_cache_size = total_mem / 2;
int64_t cache_cpu_cache_size = 0;
if (cgroup_limit_size < total_mem) {
cache_cpu_cache_size = cgroup_limit_size / 2;
} else {
cache_cpu_cache_size = total_mem / 2;
}
float cache_cpu_cache_threshold = 0.7;
ASSERT_TRUE(config.SetCacheConfigCpuCacheThreshold(std::to_string(cache_cpu_cache_threshold)).ok());
ASSERT_TRUE(config.SetCacheConfigCpuCacheCapacity(std::to_string(cache_cpu_cache_size)).ok());
@ -306,14 +313,20 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
{
int64_t total_mem = 0, free_mem = 0;
milvus::server::CommonUtil::GetSystemMemInfo(total_mem, free_mem);
int64_t cgroup_limit_size = 0;
milvus::server::CommonUtil::GetSysCgroupMemLimit(cgroup_limit_size);
ASSERT_TRUE(config.SetCacheConfigInsertBufferSize("1GB").ok());
int64_t cache_cpu_cache_size = total_mem - 1073741824 - 1; // total_size - 1GB - 1
int64_t cache_cpu_cache_size = 0;
if (cgroup_limit_size < total_mem) {
cache_cpu_cache_size = cgroup_limit_size - 1073741824 - 1;
} else {
cache_cpu_cache_size = total_mem - 1073741824 - 1; // total_size - 1GB - 1
}
ASSERT_TRUE(config.SetCacheConfigCpuCacheCapacity(std::to_string(cache_cpu_cache_size)).ok());
ASSERT_TRUE(config.GetCacheConfigCpuCacheCapacity(int64_val).ok());
ASSERT_TRUE(int64_val == cache_cpu_cache_size);
}
/* engine config */
int64_t engine_use_blas_threshold = 50;
ASSERT_TRUE(config.SetEngineConfigUseBlasThreshold(std::to_string(engine_use_blas_threshold)).ok());
@ -498,8 +511,6 @@ TEST_F(ConfigTest, SERVER_CONFIG_CLI_TEST) {
s = config.ProcessConfigCli(result, get_cmd);
ASSERT_TRUE(s.ok());
/* cache config */
std::string cache_cpu_cache_capacity = "1";
get_cmd = gen_get_command(ms::CONFIG_CACHE, ms::CONFIG_CACHE_CPU_CACHE_CAPACITY);
@ -681,7 +692,6 @@ TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) {
ASSERT_FALSE(config.SetDBConfigArchiveDaysThreshold("0x10").ok());
/* storage config */
ASSERT_FALSE(config.SetStorageConfigPath("").ok());
ASSERT_FALSE(config.SetStorageConfigPath("./milvus").ok());
@ -1288,8 +1298,8 @@ TEST_F(ConfigTest, SERVER_CONFIG_UPDATE_TEST) {
std::string reply_set, reply_get;
std::string cmd_set, cmd_get;
auto lambda = [&conf_file](const std::string& key, const std::string& child_key,
const std::string& default_value, std::string& value) {
auto lambda = [&conf_file](const std::string& key, const std::string& child_key, const std::string& default_value,
std::string& value) {
auto* ymgr = milvus::server::YamlConfigMgr::GetInstance();
auto status = ymgr->LoadConfigFile(conf_file);
@ -1310,52 +1320,58 @@ TEST_F(ConfigTest, SERVER_CONFIG_UPDATE_TEST) {
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_CACHE, ms::CONFIG_CACHE_INSERT_BUFFER_SIZE,
ms::CONFIG_CACHE_INSERT_BUFFER_SIZE_DEFAULT, yaml_value).ok());
ms::CONFIG_CACHE_INSERT_BUFFER_SIZE_DEFAULT, yaml_value)
.ok());
ASSERT_EQ("2", yaml_value);
// test boolean config value
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "True");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("true", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "On");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("true", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "False");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("false", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "Off");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR,
ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT,
yaml_value)
.ok());
ASSERT_EQ("false", yaml_value);
// test path
cmd_set = gen_set_command(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, "/tmp/milvus_config_unittest");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH,
ms::CONFIG_STORAGE_PATH_DEFAULT, yaml_value).ok());
ASSERT_TRUE(lambda(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, ms::CONFIG_STORAGE_PATH_DEFAULT, yaml_value).ok());
ASSERT_EQ("/tmp/milvus_config_unittest", yaml_value);
#ifdef MILVUS_GPU_VERSION
cmd_set = gen_set_command(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, "gpu0");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES,
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value).ok());
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value)
.ok());
ASSERT_EQ("gpu0", yaml_value);
cmd_set = gen_set_command(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, "GPU0");
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok());
ASSERT_TRUE(lambda(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES,
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value).ok());
ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value)
.ok());
ASSERT_EQ("gpu0", yaml_value);
#endif
}