mirror of https://github.com/milvus-io/milvus.git
247 lines
11 KiB
Python
247 lines
11 KiB
Python
import threading
|
|
import pytest
|
|
import os
|
|
import time
|
|
import json
|
|
from time import sleep
|
|
|
|
from pymilvus import connections
|
|
from chaos.checker import (CreateChecker, InsertChecker, FlushChecker,
|
|
SearchChecker, QueryChecker, IndexChecker, DeleteChecker, Op)
|
|
from common.cus_resource_opts import CustomResourceOperations as CusResource
|
|
from utils.util_log import test_log as log
|
|
from utils.util_k8s import wait_pods_ready, get_pod_list
|
|
from utils.util_common import findkeys
|
|
from chaos import chaos_commons as cc
|
|
from chaos.chaos_commons import assert_statistic
|
|
from common.common_type import CaseLabel
|
|
from chaos import constants
|
|
from delayed_assert import assert_expectations
|
|
|
|
|
|
def check_cluster_nodes(chaos_config):
    """Return the number of pods matched by the chaos experiment's selector.

    Args:
        chaos_config: parsed chaos experiment; ``metadata.name`` is read, and
            the first ``selector`` entry found in it must provide
            ``namespaces`` and ``labelSelectors``.

    Returns:
        ``1`` when the experiment name contains ``"all"``, otherwise the
        length of the pod list returned for the selector's first namespace
        and its label selectors.
    """
    # When every pod is affected the expectation is that all operations fail.
    # Replicas > 1 cannot provide HA in that situation, so it is treated as a
    # single-node cluster.
    if "all" in chaos_config["metadata"]["name"]:
        return 1

    found_selectors = list(findkeys(chaos_config, "selector"))
    log.info(f"chaos target selector: {found_selectors}")
    # The chaos yaml file must place the affected pod selector first.
    target = found_selectors[0]
    namespace = target["namespaces"][0]
    # Build the comma-separated "key=value" label selector string.
    label_query = ",".join(
        key + "=" + value for key, value in target["labelSelectors"].items()
    )
    return len(get_pod_list(namespace, label_query))
|
|
|
|
|
|
def record_results(checkers):
    """Render one report line per checker.

    Args:
        checkers: mapping of operation key to an object exposing
            ``check_result()``.

    Returns:
        A string with one newline-terminated line per entry: the key
        (converted with ``str`` and padded to 10 characters), a space, and
        the value of ``check_result()``. Empty when ``checkers`` is empty.
    """
    return "".join(
        f"{str(op):10} {checker.check_result()}\n"
        for op, checker in checkers.items()
    )
|
|
|
|
|
|
class TestChaosBase:
    """Shared defaults and expectation parsing for chaos test classes."""

    # Expected result of each operation; every one defaults to success and
    # may be overridden per instance by parser_testcase_config().
    expect_create = constants.SUCC
    expect_insert = constants.SUCC
    expect_flush = constants.SUCC
    expect_index = constants.SUCC
    expect_search = constants.SUCC
    expect_query = constants.SUCC
    # Default connection endpoint.
    host = '127.0.0.1'
    port = 19530
    # Chaos experiment config; None until a test stores one.
    _chaos_config = None
    # Class-level default. It is a mutable object shared by every instance
    # that does not assign its own mapping.
    health_checkers = {}

    def parser_testcase_config(self, chaos_yaml, chaos_config):
        """Load the expectations for ``chaos_yaml`` from ``testcases.yaml``.

        The first entry under ``Collections`` whose ``testcase.chaos`` value
        is contained in ``chaos_yaml`` is used. Its ``cluster_n_node``
        expectations apply when ``check_cluster_nodes(chaos_config)`` is
        greater than 1, otherwise its ``cluster_1_node`` expectations.
        Operations missing from the expectations default to
        ``constants.SUCC``.

        Args:
            chaos_yaml: the chaos yaml identifier to look up.
            chaos_config: parsed chaos experiment, used to count the
                targeted pods.

        Returns:
            True if a matching testcase was found and the ``expect_*``
            attributes were set on this instance, False otherwise.
        """
        cluster_nodes = check_cluster_nodes(chaos_config)
        tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
        tests_config = cc.gen_experiment_config(tests_yaml)
        # A missing or empty 'Collections' section means "no testcases"
        # rather than a crash while iterating None.
        test_collections = tests_config.get('Collections', None) or []
        expectation_key = 'cluster_n_node' if cluster_nodes > 1 else 'cluster_1_node'
        for t in test_collections:
            testcase = t.get('testcase', {})
            test_chaos = testcase.get('chaos')
            # Skip entries without a usable chaos name: a dict/None on the
            # left of `in` raises TypeError against a string, and an empty
            # string would match every yaml.
            if not test_chaos:
                continue
            if test_chaos in chaos_yaml:
                # A null expectation section is treated as "no overrides".
                expects = testcase.get('expectation', {}).get(expectation_key, {}) or {}
                log.info(f"yaml.expects: {expects}")
                self.expect_create = expects.get(Op.create.value, constants.SUCC)
                self.expect_insert = expects.get(Op.insert.value, constants.SUCC)
                self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
                self.expect_index = expects.get(Op.index.value, constants.SUCC)
                self.expect_search = expects.get(Op.search.value, constants.SUCC)
                self.expect_query = expects.get(Op.query.value, constants.SUCC)
                log.info(f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                         f"flush:{self.expect_flush}, index:{self.expect_index}, "
                         f"search:{self.expect_search}, query:{self.expect_query}")
                return True

        return False
|
|
|
|
|
|
class TestChaos(TestChaosBase):
    """Inject a chaos experiment and check the health checkers around it."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Connect to the server under the ``default`` alias for each test.

        ``host`` and ``port`` are requested as pytest fixtures (presumably
        supplied by a conftest outside this file -- verify).

        Raises:
            Exception: if no ``default`` connection exists after connecting.
        """
        connections.add_connection(default={"host": host, "port": port})
        connections.connect(alias='default')

        if connections.has_connection("default") is False:
            raise Exception("no connections")
        self.host = host
        self.port = port

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self):
        """Create a fresh checker per operation and store them on the instance."""
        checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertChecker(),
            Op.flush: FlushChecker(),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker(),
            Op.delete: DeleteChecker()
        }
        self.health_checkers = checkers

    def teardown(self):
        """Best-effort removal of the chaos object created by the test.

        NOTE(review): pytest 8 removed nose-style ``teardown`` methods;
        confirm the pinned pytest version still calls this hook, or rename
        it to ``teardown_method``.
        """
        chaos_config = self._chaos_config
        # The test may fail before it caches a chaos config; in that case
        # there is nothing to delete and subscripting None would only add a
        # second error on top of the real failure.
        if chaos_config is not None:
            chaos_res = CusResource(kind=chaos_config['kind'],
                                    group=constants.CHAOS_GROUP,
                                    version=constants.CHAOS_VERSION,
                                    namespace=constants.CHAOS_NAMESPACE)
            meta_name = chaos_config.get('metadata', {}).get('name', None)
            if meta_name is not None:
                chaos_res.delete(meta_name, raise_ex=False)
        sleep(2)
        log.info(f'Alive threads: {threading.enumerate()}')

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos(self, chaos_yaml):
        """Run one chaos experiment and assert checker statistics four times.

        Sequence: start the monitor threads, assert before chaos, create the
        chaos object and assert against the parsed expectations, delete the
        chaos object, wait for the pods, assert again, then restart the
        checker threads and assert once more. Each of the first three
        assertions is also appended to ``./reports/<name>.log``.

        Args:
            chaos_yaml: chaos experiment yaml supplied by parametrization.
        """
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(f"chaos_yaml: {chaos_yaml}")
        log.info(connections.get_connection_addr('default'))
        cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        release_name = constants.RELEASE_NAME
        log.info(f"release_name: {release_name}")
        kind = chaos_config['kind']
        # Retarget the experiment by replacing the "milvus-chaos" placeholder
        # with the release name across the serialized config.
        chaos_config_str = json.dumps(chaos_config)
        chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
        chaos_config = json.loads(chaos_config_str)
        # Set the object name AFTER the replacement: if the release name
        # itself contains "milvus-chaos", replacing afterwards would rewrite
        # the name and it would no longer match meta_name used for
        # create/delete below.
        chaos_config['metadata']['name'] = release_name
        meta_name = chaos_config['metadata']['name']
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False, "Fail to get the testcase info in testcases.yaml"

        # init report
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(dir_name, exist_ok=True)
        # wait two WAIT_PER_OP intervals before the first assertion
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistic:all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        # Report writing is best-effort: a failure is logged, not raised.
        try:
            with open(file_name, "a+") as f:
                ts = time.strftime("%Y-%m-%d %H:%M:%S")
                f.write(f"{meta_name}-{ts}\n")
                f.write("1st assert before chaos:\n")
                f.write(record_results(self.health_checkers))
        except Exception as e:
            log.info(f"Fail to write to file: {e}")
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        # verify the chaos is injected
        log.info(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
        os.system(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # let the chaos run for CHAOS_DURATION
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })
        try:
            with open(file_name, "a+") as f:
                f.write("2nd assert after chaos injected:\n")
                f.write(record_results(self.health_checkers))
        except Exception as e:
            log.error(f"Fail to write the report: {e}")
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        # verify the chaos is deleted
        log.info(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
        os.system(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
        log.info(f'Alive threads: {threading.enumerate()}')
        sleep(2)
        # wait all pods ready
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait five WAIT_PER_OP intervals (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        try:
            with open(file_name, "a+") as f:
                f.write("3rd assert after chaos deleted:\n")
                f.write(record_results(self.health_checkers))
        except Exception as e:
            log.info(f"Fail to write the report: {e}")
        # terminate and restart threads
        for checker in self.health_checkers.values():
            checker.terminate()
        sleep(5)
        log.info(f'Alive threads: {threading.enumerate()}')
        cc.start_monitor_threads(self.health_checkers)
        sleep(constants.WAIT_PER_OP * 2)
        log.info("******4th assert after chaos deleted: ")
        assert_statistic(self.health_checkers)

        # assert all expectations
        assert_expectations()

        log.info("*********************Chaos Test Completed**********************")