milvus/tests/python_client/chaos/test_chaos.py

247 lines
11 KiB
Python

import threading
import pytest
import os
import time
import json
from time import sleep
from pymilvus import connections
from chaos.checker import (CreateChecker, InsertChecker, FlushChecker,
SearchChecker, QueryChecker, IndexChecker, DeleteChecker, Op)
from common.cus_resource_opts import CustomResourceOperations as CusResource
from utils.util_log import test_log as log
from utils.util_k8s import wait_pods_ready, get_pod_list
from utils.util_common import findkeys
from chaos import chaos_commons as cc
from chaos.chaos_commons import assert_statistic
from common.common_type import CaseLabel
from chaos import constants
from delayed_assert import assert_expectations
def check_cluster_nodes(chaos_config):
# if all pods will be effected, the expect is all fail.
# Even though the replicas is greater than 1, it can not provide HA, so cluster_nodes is set as 1 for this situation.
if "all" in chaos_config["metadata"]["name"]:
return 1
selector = findkeys(chaos_config, "selector")
selector = list(selector)
log.info(f"chaos target selector: {selector}")
# assert len(selector) == 1
selector = selector[0] # chaos yaml file must place the effected pod selector in the first position
namespace = selector["namespaces"][0]
labels_dict = selector["labelSelectors"]
labels_list = []
for k,v in labels_dict.items():
labels_list.append(k+"="+v)
labels_str = ",".join(labels_list)
pods = get_pod_list(namespace, labels_str)
return len(pods)
def record_results(checkers):
res = ""
for k in checkers.keys():
check_result = checkers[k].check_result()
res += f"{str(k):10} {check_result}\n"
return res
class TestChaosBase:
expect_create = constants.SUCC
expect_insert = constants.SUCC
expect_flush = constants.SUCC
expect_index = constants.SUCC
expect_search = constants.SUCC
expect_query = constants.SUCC
host = '127.0.0.1'
port = 19530
_chaos_config = None
health_checkers = {}
def parser_testcase_config(self, chaos_yaml, chaos_config):
cluster_nodes = check_cluster_nodes(chaos_config)
tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
tests_config = cc.gen_experiment_config(tests_yaml)
test_collections = tests_config.get('Collections', None)
for t in test_collections:
test_chaos = t.get('testcase', {}).get('chaos', {})
if test_chaos in chaos_yaml:
expects = t.get('testcase', {}).get('expectation', {}).get('cluster_1_node', {})
# for the cluster_n_node
if cluster_nodes > 1:
expects = t.get('testcase', {}).get('expectation', {}).get('cluster_n_node', {})
log.info(f"yaml.expects: {expects}")
self.expect_create = expects.get(Op.create.value, constants.SUCC)
self.expect_insert = expects.get(Op.insert.value, constants.SUCC)
self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
self.expect_index = expects.get(Op.index.value, constants.SUCC)
self.expect_search = expects.get(Op.search.value, constants.SUCC)
self.expect_query = expects.get(Op.query.value, constants.SUCC)
log.info(f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
f"flush:{self.expect_flush}, index:{self.expect_index}, "
f"search:{self.expect_search}, query:{self.expect_query}")
return True
return False
class TestChaos(TestChaosBase):
@pytest.fixture(scope="function", autouse=True)
def connection(self, host, port):
connections.add_connection(default={"host": host, "port": port})
connections.connect(alias='default')
if connections.has_connection("default") is False:
raise Exception("no connections")
self.host = host
self.port = port
@pytest.fixture(scope="function", autouse=True)
def init_health_checkers(self):
checkers = {
Op.create: CreateChecker(),
Op.insert: InsertChecker(),
Op.flush: FlushChecker(),
Op.index: IndexChecker(),
Op.search: SearchChecker(),
Op.query: QueryChecker(),
Op.delete: DeleteChecker()
}
self.health_checkers = checkers
def teardown(self):
chaos_res = CusResource(kind=self._chaos_config['kind'],
group=constants.CHAOS_GROUP,
version=constants.CHAOS_VERSION,
namespace=constants.CHAOS_NAMESPACE)
meta_name = self._chaos_config.get('metadata', None).get('name', None)
chaos_res.delete(meta_name, raise_ex=False)
sleep(2)
log.info(f'Alive threads: {threading.enumerate()}')
@pytest.mark.tags(CaseLabel.L3)
@pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
def test_chaos(self, chaos_yaml):
# start the monitor threads to check the milvus ops
log.info("*********************Chaos Test Start**********************")
log.info(f"chaos_yaml: {chaos_yaml}")
log.info(connections.get_connection_addr('default'))
cc.start_monitor_threads(self.health_checkers)
# parse chaos object
chaos_config = cc.gen_experiment_config(chaos_yaml)
release_name = constants.RELEASE_NAME
log.info(f"release_name: {release_name}")
chaos_config['metadata']['name'] = release_name
kind = chaos_config['kind']
meta_name = chaos_config.get('metadata', None).get('name', None)
chaos_config_str = json.dumps(chaos_config)
chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
chaos_config = json.loads(chaos_config_str)
self._chaos_config = chaos_config # cache the chaos config for tear down
log.info(f"chaos_config: {chaos_config}")
# parse the test expectations in testcases.yaml
if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
log.error("Fail to get the testcase info in testcases.yaml")
assert False
# init report
dir_name = "./reports"
file_name = f"./reports/{meta_name}.log"
if not os.path.exists(dir_name):
os.makedirs(dir_name)
# wait 20s
sleep(constants.WAIT_PER_OP * 2)
# assert statistic:all ops 100% succ
log.info("******1st assert before chaos: ")
assert_statistic(self.health_checkers)
try:
with open(file_name, "a+") as f:
ts = time.strftime("%Y-%m-%d %H:%M:%S")
f.write(f"{meta_name}-{ts}\n")
f.write("1st assert before chaos:\n")
f.write(record_results(self.health_checkers))
except Exception as e:
log.info(f"Fail to write to file: {e}")
# apply chaos object
chaos_res = CusResource(kind=chaos_config['kind'],
group=constants.CHAOS_GROUP,
version=constants.CHAOS_VERSION,
namespace=constants.CHAOS_NAMESPACE)
chaos_res.create(chaos_config)
log.info("chaos injected")
# verify the chaos is injected
log.info(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
os.system(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
sleep(constants.WAIT_PER_OP * 2)
# reset counting
cc.reset_counting(self.health_checkers)
# wait 120s
sleep(constants.CHAOS_DURATION)
log.info(f'Alive threads: {threading.enumerate()}')
# assert statistic
log.info("******2nd assert after chaos injected: ")
assert_statistic(self.health_checkers,
expectations={Op.create: self.expect_create,
Op.insert: self.expect_insert,
Op.flush: self.expect_flush,
Op.index: self.expect_index,
Op.search: self.expect_search,
Op.query: self.expect_query
})
try:
with open(file_name, "a+") as f:
f.write("2nd assert after chaos injected:\n")
f.write(record_results(self.health_checkers))
except Exception as e:
log.error(f"Fail to write the report: {e}")
# delete chaos
chaos_res.delete(meta_name)
log.info("chaos deleted")
# verify the chaos is deleted
log.info(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
os.system(f"kubectl get {kind} {meta_name} -n {constants.CHAOS_NAMESPACE}")
log.info(f'Alive threads: {threading.enumerate()}')
sleep(2)
# wait all pods ready
log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
log.info("all pods are ready")
# reconnect if needed
sleep(constants.WAIT_PER_OP * 2)
cc.reconnect(connections, alias='default')
# reset counting again
cc.reset_counting(self.health_checkers)
# wait 50s (varies by feature)
sleep(constants.WAIT_PER_OP * 5)
# assert statistic: all ops success again
log.info("******3rd assert after chaos deleted: ")
assert_statistic(self.health_checkers)
try:
with open(file_name, "a+") as f:
f.write("3rd assert after chaos deleted:\n")
f.write(record_results(self.health_checkers))
except Exception as e:
log.info(f"Fail to write the report: {e}")
# terminate and restart threads
for k, checker in self.health_checkers.items():
checker.terminate()
sleep(5)
log.info(f'Alive threads: {threading.enumerate()}')
cc.start_monitor_threads(self.health_checkers)
sleep(constants.WAIT_PER_OP * 2)
log.info("******4th assert after chaos deleted: ")
assert_statistic(self.health_checkers)
# assert all expectations
assert_expectations()
log.info("*********************Chaos Test Completed**********************")