From a474ff269ae737232b396d471088b5ba6b3808e1 Mon Sep 17 00:00:00 2001 From: yukun Date: Tue, 14 Jul 2020 00:40:20 +0800 Subject: [PATCH] Server down during loading data (#2807) * Server down during loading data Signed-off-by: fishpenguin * Fix test_config.cpp Signed-off-by: fishpenguin * ci retry Signed-off-by: fishpenguin * Change limit_in_bytes init value to max Signed-off-by: fishpenguin Co-authored-by: Jin Hai --- CHANGELOG.md | 1 + core/src/config/Config.cpp | 10 ++ core/src/utils/CommonUtil.cpp | 13 +++ core/src/utils/CommonUtil.h | 2 + core/unittest/server/test_config.cpp | 144 +++++++++++++++------------ 5 files changed, 106 insertions(+), 64 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2bb28076f..d1c4b9cb56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Please mark all change in change log and use the issue from GitHub - \#2752 Milvus formats vectors data to double-precision and return to http client - \#2767 Fix a bug of getting wrong nprobe limitation in knowhere on GPU version - \#2768 After building the index,the number of vectors increases +- \#2774 Server down during loading data - \#2776 Fix too many data copies during creating IVF index - \#2813 To implemente RNSG IP diff --git a/core/src/config/Config.cpp b/core/src/config/Config.cpp index 619474c1d2..a86230d7fa 100644 --- a/core/src/config/Config.cpp +++ b/core/src/config/Config.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1333,6 +1334,15 @@ Config::CheckCacheConfigCpuCacheCapacity(const std::string& value) { int64_t total_mem = 0, free_mem = 0; CommonUtil::GetSystemMemInfo(total_mem, free_mem); + + int64_t cgroup_limit_mem = std::numeric_limits::max(); + CommonUtil::GetSysCgroupMemLimit(cgroup_limit_mem); + if (cgroup_limit_mem < total_mem && cache_size >= cgroup_limit_mem) { + std::string msg = "Invalid cpu cache size: " + value + + ". 
Possible reason: cache.cache_size exceeds system cgroup memory.";
+            return Status{SERVER_INVALID_ARGUMENT, msg};
+        }
+
         if (cache_size >= total_mem) {
             std::string msg = "Invalid cpu cache size: " + value +
                               ". Possible reason: cache.cache_size exceeds system memory.";
diff --git a/core/src/utils/CommonUtil.cpp b/core/src/utils/CommonUtil.cpp
index bb60bd2410..210e6603e9 100644
--- a/core/src/utils/CommonUtil.cpp
+++ b/core/src/utils/CommonUtil.cpp
@@ -53,6 +53,31 @@ CommonUtil::GetSystemMemInfo(int64_t& total_mem, int64_t& free_mem) {
     return ret == 0;  // succeed 0, failed -1
 }
 
+// Reads the memory limit, in bytes, from /sys/fs/cgroup/memory/memory.limit_in_bytes.
+// Returns true and stores the value in limit_in_bytes on success. On any failure it logs,
+// returns false and leaves limit_in_bytes untouched, so the caller's initial value stays in effect.
+bool
+CommonUtil::GetSysCgroupMemLimit(int64_t& limit_in_bytes) {
+    const std::string path = "/sys/fs/cgroup/memory/memory.limit_in_bytes";
+    try {
+        std::ifstream file(path);
+        int64_t limit = 0;
+        // std::ifstream reports failures through its state flags rather than by throwing,
+        // so the extraction result has to be tested explicitly.
+        if (!(file >> limit)) {
+            std::string msg = "Failed to read " + path + ", reason: file is missing or not a valid number";
+            LOG_SERVER_ERROR_ << msg;
+            return false;
+        }
+        limit_in_bytes = limit;
+        return true;
+    } catch (const std::exception& ex) {
+        std::string msg = "Failed to read " + path + ", reason: " + std::string(ex.what());
+        LOG_SERVER_ERROR_ << msg;
+        return false;
+    }
+}
+
 bool
 CommonUtil::GetSystemAvailableThreads(int64_t& thread_count) {
     // threadCnt = std::thread::hardware_concurrency();
diff --git a/core/src/utils/CommonUtil.h b/core/src/utils/CommonUtil.h
index 649957d29a..e81b402a27 100644
--- a/core/src/utils/CommonUtil.h
+++ b/core/src/utils/CommonUtil.h
@@ -24,6 +24,8 @@ class CommonUtil {
     static bool
     GetSystemMemInfo(int64_t& total_mem, int64_t& free_mem);
     static bool
+    GetSysCgroupMemLimit(int64_t& limit_in_bytes);
+    static bool
     GetSystemAvailableThreads(int64_t& thread_count);
 
     static bool
diff --git a/core/unittest/server/test_config.cpp b/core/unittest/server/test_config.cpp
index d02ddd201d..ea39e3d0b0 100644
--- a/core/unittest/server/test_config.cpp
+++ b/core/unittest/server/test_config.cpp
@@ -223,35 +223,35 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
     ASSERT_TRUE(config.GetStorageConfigPath(str_val).ok());
     ASSERT_TRUE(str_val == storage_primary_path);
 
-//    bool storage_s3_enable = true;
-//    ASSERT_TRUE(config.SetStorageConfigS3Enable(std::to_string(storage_s3_enable)).ok());
-// ASSERT_TRUE(config.GetStorageConfigS3Enable(bool_val).ok()); -// ASSERT_TRUE(bool_val == storage_s3_enable); -// -// std::string storage_s3_addr = "192.168.1.100"; -// ASSERT_TRUE(config.SetStorageConfigS3Address(storage_s3_addr).ok()); -// ASSERT_TRUE(config.GetStorageConfigS3Address(str_val).ok()); -// ASSERT_TRUE(str_val == storage_s3_addr); -// -// std::string storage_s3_port = "12345"; -// ASSERT_TRUE(config.SetStorageConfigS3Port(storage_s3_port).ok()); -// ASSERT_TRUE(config.GetStorageConfigS3Port(str_val).ok()); -// ASSERT_TRUE(str_val == storage_s3_port); -// -// std::string storage_s3_access_key = "minioadmin"; -// ASSERT_TRUE(config.SetStorageConfigS3AccessKey(storage_s3_access_key).ok()); -// ASSERT_TRUE(config.GetStorageConfigS3AccessKey(str_val).ok()); -// ASSERT_TRUE(str_val == storage_s3_access_key); -// -// std::string storage_s3_secret_key = "minioadmin"; -// ASSERT_TRUE(config.SetStorageConfigS3SecretKey(storage_s3_secret_key).ok()); -// ASSERT_TRUE(config.GetStorageConfigS3SecretKey(str_val).ok()); -// ASSERT_TRUE(str_val == storage_s3_secret_key); -// -// std::string storage_s3_bucket = "s3bucket"; -// ASSERT_TRUE(config.SetStorageConfigS3Bucket(storage_s3_bucket).ok()); -// ASSERT_TRUE(config.GetStorageConfigS3Bucket(str_val).ok()); -// ASSERT_TRUE(str_val == storage_s3_bucket); + // bool storage_s3_enable = true; + // ASSERT_TRUE(config.SetStorageConfigS3Enable(std::to_string(storage_s3_enable)).ok()); + // ASSERT_TRUE(config.GetStorageConfigS3Enable(bool_val).ok()); + // ASSERT_TRUE(bool_val == storage_s3_enable); + // + // std::string storage_s3_addr = "192.168.1.100"; + // ASSERT_TRUE(config.SetStorageConfigS3Address(storage_s3_addr).ok()); + // ASSERT_TRUE(config.GetStorageConfigS3Address(str_val).ok()); + // ASSERT_TRUE(str_val == storage_s3_addr); + // + // std::string storage_s3_port = "12345"; + // ASSERT_TRUE(config.SetStorageConfigS3Port(storage_s3_port).ok()); + // ASSERT_TRUE(config.GetStorageConfigS3Port(str_val).ok()); + // 
ASSERT_TRUE(str_val == storage_s3_port); + // + // std::string storage_s3_access_key = "minioadmin"; + // ASSERT_TRUE(config.SetStorageConfigS3AccessKey(storage_s3_access_key).ok()); + // ASSERT_TRUE(config.GetStorageConfigS3AccessKey(str_val).ok()); + // ASSERT_TRUE(str_val == storage_s3_access_key); + // + // std::string storage_s3_secret_key = "minioadmin"; + // ASSERT_TRUE(config.SetStorageConfigS3SecretKey(storage_s3_secret_key).ok()); + // ASSERT_TRUE(config.GetStorageConfigS3SecretKey(str_val).ok()); + // ASSERT_TRUE(str_val == storage_s3_secret_key); + // + // std::string storage_s3_bucket = "s3bucket"; + // ASSERT_TRUE(config.SetStorageConfigS3Bucket(storage_s3_bucket).ok()); + // ASSERT_TRUE(config.GetStorageConfigS3Bucket(str_val).ok()); + // ASSERT_TRUE(str_val == storage_s3_bucket); /* metric config */ bool metric_enable_monitor = false; @@ -294,8 +294,15 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) { // #2564 int64_t total_mem = 0, free_mem = 0; milvus::server::CommonUtil::GetSystemMemInfo(total_mem, free_mem); + int64_t cgroup_limit_size = 0; + milvus::server::CommonUtil::GetSysCgroupMemLimit(cgroup_limit_size); ASSERT_TRUE(config.SetCacheConfigInsertBufferSize("1GB").ok()); - int64_t cache_cpu_cache_size = total_mem / 2; + int64_t cache_cpu_cache_size = 0; + if (cgroup_limit_size < total_mem) { + cache_cpu_cache_size = cgroup_limit_size / 2; + } else { + cache_cpu_cache_size = total_mem / 2; + } float cache_cpu_cache_threshold = 0.7; ASSERT_TRUE(config.SetCacheConfigCpuCacheThreshold(std::to_string(cache_cpu_cache_threshold)).ok()); ASSERT_TRUE(config.SetCacheConfigCpuCacheCapacity(std::to_string(cache_cpu_cache_size)).ok()); @@ -306,14 +313,20 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) { { int64_t total_mem = 0, free_mem = 0; milvus::server::CommonUtil::GetSystemMemInfo(total_mem, free_mem); + int64_t cgroup_limit_size = 0; + milvus::server::CommonUtil::GetSysCgroupMemLimit(cgroup_limit_size); 
ASSERT_TRUE(config.SetCacheConfigInsertBufferSize("1GB").ok()); - int64_t cache_cpu_cache_size = total_mem - 1073741824 - 1; // total_size - 1GB - 1 + int64_t cache_cpu_cache_size = 0; + if (cgroup_limit_size < total_mem) { + cache_cpu_cache_size = cgroup_limit_size - 1073741824 - 1; + } else { + cache_cpu_cache_size = total_mem - 1073741824 - 1; // total_size - 1GB - 1 + } ASSERT_TRUE(config.SetCacheConfigCpuCacheCapacity(std::to_string(cache_cpu_cache_size)).ok()); ASSERT_TRUE(config.GetCacheConfigCpuCacheCapacity(int64_val).ok()); ASSERT_TRUE(int64_val == cache_cpu_cache_size); } - /* engine config */ int64_t engine_use_blas_threshold = 50; ASSERT_TRUE(config.SetEngineConfigUseBlasThreshold(std::to_string(engine_use_blas_threshold)).ok()); @@ -389,7 +402,7 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) { ASSERT_TRUE(config.GetWalConfigRecoveryErrorIgnore(bool_val).ok()); ASSERT_TRUE(bool_val == wal_recovery_ignore); - int64_t wal_buffer_size = 128 * 1024 * 1024; // 128 M + int64_t wal_buffer_size = 128 * 1024 * 1024; // 128 M ASSERT_TRUE(config.SetWalConfigBufferSize(std::to_string(wal_buffer_size)).ok()); ASSERT_TRUE(config.GetWalConfigBufferSize(int64_val).ok()); ASSERT_TRUE(int64_val == wal_buffer_size); @@ -419,7 +432,7 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) { auto s = config.SetLogsMaxLogFileSize(logs_max_log_file_size); ASSERT_TRUE(s.ok()) << s.message(); ASSERT_TRUE(config.GetLogsMaxLogFileSize(int64_val).ok()); - ASSERT_TRUE(int64_val == 1000 * 1024 * 1024); // 1000MB + ASSERT_TRUE(int64_val == 1000 * 1024 * 1024); // 1000MB int64_t logs_log_rotate_num = 100; ASSERT_TRUE(config.SetLogsLogRotateNum(std::to_string(logs_log_rotate_num)).ok()); @@ -498,8 +511,6 @@ TEST_F(ConfigTest, SERVER_CONFIG_CLI_TEST) { s = config.ProcessConfigCli(result, get_cmd); ASSERT_TRUE(s.ok()); - - /* cache config */ std::string cache_cpu_cache_capacity = "1"; get_cmd = gen_get_command(ms::CONFIG_CACHE, ms::CONFIG_CACHE_CPU_CACHE_CAPACITY); @@ -681,7 +692,6 @@ 
TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) { ASSERT_FALSE(config.SetDBConfigArchiveDaysThreshold("0x10").ok()); - /* storage config */ ASSERT_FALSE(config.SetStorageConfigPath("").ok()); ASSERT_FALSE(config.SetStorageConfigPath("./milvus").ok()); @@ -691,18 +701,18 @@ TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) { ASSERT_FALSE(config.SetStorageConfigAutoFlushInterval("0.1").ok()); -// ASSERT_FALSE(config.SetStorageConfigS3Enable("10").ok()); -// -// ASSERT_FALSE(config.SetStorageConfigS3Address("127.0.0").ok()); -// -// ASSERT_FALSE(config.SetStorageConfigS3Port("100").ok()); -// ASSERT_FALSE(config.SetStorageConfigS3Port("100000").ok()); -// -// ASSERT_FALSE(config.SetStorageConfigS3AccessKey("").ok()); -// -// ASSERT_FALSE(config.SetStorageConfigS3SecretKey("").ok()); -// -// ASSERT_FALSE(config.SetStorageConfigS3Bucket("").ok()); + // ASSERT_FALSE(config.SetStorageConfigS3Enable("10").ok()); + // + // ASSERT_FALSE(config.SetStorageConfigS3Address("127.0.0").ok()); + // + // ASSERT_FALSE(config.SetStorageConfigS3Port("100").ok()); + // ASSERT_FALSE(config.SetStorageConfigS3Port("100000").ok()); + // + // ASSERT_FALSE(config.SetStorageConfigS3AccessKey("").ok()); + // + // ASSERT_FALSE(config.SetStorageConfigS3SecretKey("").ok()); + // + // ASSERT_FALSE(config.SetStorageConfigS3Bucket("").ok()); /* metric config */ ASSERT_FALSE(config.SetMetricConfigEnableMonitor("Y").ok()); @@ -1288,8 +1298,8 @@ TEST_F(ConfigTest, SERVER_CONFIG_UPDATE_TEST) { std::string reply_set, reply_get; std::string cmd_set, cmd_get; - auto lambda = [&conf_file](const std::string& key, const std::string& child_key, - const std::string& default_value, std::string& value) { + auto lambda = [&conf_file](const std::string& key, const std::string& child_key, const std::string& default_value, + std::string& value) { auto* ymgr = milvus::server::YamlConfigMgr::GetInstance(); auto status = ymgr->LoadConfigFile(conf_file); @@ -1310,52 +1320,58 @@ TEST_F(ConfigTest, SERVER_CONFIG_UPDATE_TEST) { 
ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); ASSERT_TRUE(lambda(ms::CONFIG_CACHE, ms::CONFIG_CACHE_INSERT_BUFFER_SIZE, - ms::CONFIG_CACHE_INSERT_BUFFER_SIZE_DEFAULT, yaml_value).ok()); + ms::CONFIG_CACHE_INSERT_BUFFER_SIZE_DEFAULT, yaml_value) + .ok()); ASSERT_EQ("2", yaml_value); // test boolean config value cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "True"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); - ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, - ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok()); + ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, + yaml_value) + .ok()); ASSERT_EQ("true", yaml_value); cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "On"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); - ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, - ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok()); + ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, + yaml_value) + .ok()); ASSERT_EQ("true", yaml_value); cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "False"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); - ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, - ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok()); + ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, + yaml_value) + .ok()); ASSERT_EQ("false", yaml_value); cmd_set = gen_set_command(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, "Off"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); - ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, - ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, yaml_value).ok()); + 
ASSERT_TRUE(lambda(ms::CONFIG_METRIC, ms::CONFIG_METRIC_ENABLE_MONITOR, ms::CONFIG_METRIC_ENABLE_MONITOR_DEFAULT, + yaml_value) + .ok()); ASSERT_EQ("false", yaml_value); // test path cmd_set = gen_set_command(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, "/tmp/milvus_config_unittest"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); - ASSERT_TRUE(lambda(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, - ms::CONFIG_STORAGE_PATH_DEFAULT, yaml_value).ok()); + ASSERT_TRUE(lambda(ms::CONFIG_STORAGE, ms::CONFIG_STORAGE_PATH, ms::CONFIG_STORAGE_PATH_DEFAULT, yaml_value).ok()); ASSERT_EQ("/tmp/milvus_config_unittest", yaml_value); #ifdef MILVUS_GPU_VERSION cmd_set = gen_set_command(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, "gpu0"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); ASSERT_TRUE(lambda(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, - ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value).ok()); + ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value) + .ok()); ASSERT_EQ("gpu0", yaml_value); cmd_set = gen_set_command(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, "GPU0"); ASSERT_TRUE(config.ProcessConfigCli(reply_set, cmd_set).ok()); ASSERT_TRUE(lambda(ms::CONFIG_GPU_RESOURCE, ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES, - ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value).ok()); + ms::CONFIG_GPU_RESOURCE_BUILD_INDEX_RESOURCES_DEFAULT, yaml_value) + .ok()); ASSERT_EQ("gpu0", yaml_value); #endif }