[test] Add test case for multi replicas (#16668)

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
pull/16675/head
zhuwenxing 2022-04-26 18:35:45 +08:00 committed by GitHub
parent 60f7fef3b3
commit 337338685a
4 changed files with 362 additions and 11 deletions

@@ -110,12 +110,12 @@ class ApiUtilityWrapper:
timeout=timeout, using=using).run()
return res, check_result
- def load_balance(self, src_node_id, dst_node_ids, sealed_segment_ids, timeout=None,
+ def load_balance(self, collection_name, src_node_id, dst_node_ids, sealed_segment_ids, timeout=None,
using="default", check_task=None, check_items=None):
timeout = TIMEOUT if timeout is None else timeout
func_name = sys._getframe().f_code.co_name
- res, is_succ = api_request([self.ut.load_balance, src_node_id, dst_node_ids,
+ res, is_succ = api_request([self.ut.load_balance, collection_name, src_node_id, dst_node_ids,
sealed_segment_ids, timeout, using])
check_result = ResponseChecker(res, func_name, check_task, check_items, is_succ,
timeout=timeout, using=using).run()
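With this change the wrapper takes collection_name as its first positional argument, matching the updated utility.load_balance API. A minimal usage sketch against the new signature (the collection name, node IDs, and segment IDs below are placeholders, not values from a real cluster):

# Sketch only: illustrative arguments for the new wrapper signature.
res, check_result = self.utility_wrap.load_balance(
    "demo_collection",              # collection_name now comes first
    src_node_id=1,                  # query node to move sealed segments from
    dst_node_ids=[2, 3],            # query nodes that receive the segments
    sealed_segment_ids=[101, 102])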

@@ -0,0 +1,17 @@
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
  name: test-querynode-pod-failure
  namespace: chaos-testing
spec:
  selector:
    pods:
      chaos-testing:
        - milvus-multi-querynode-querynode-bcdc595d9-7vmcj
        - milvus-multi-querynode-querynode-bcdc595d9-ccxls
        - milvus-multi-querynode-querynode-bcdc595d9-dpwgp
  mode: all
  action: pod-failure
  duration: 2m
  gracePeriod: 0
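The pod names above are placeholders: the chaos test below loads this manifest as a template and overwrites the pod list with the query node pods it selects at runtime. Roughly (a sketch mirroring the test code, assuming this file is the pod-failure-by-pod-list.yaml template it references):

# Sketch: load the template and patch in the pods chosen for injection.
chaos_config = cc.gen_experiment_config(
    "chaos/chaos_objects/template/pod-failure-by-pod-list.yaml")
chaos_config['metadata']['name'] = f"test-multi-replicase-{int(time.time())}"
chaos_config['spec']['selector']['pods']['chaos-testing'] = target_pod_list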

@@ -0,0 +1,252 @@
import threading
import pytest
import os
import time
import json
from time import sleep
from pymilvus import connections
from chaos.checker import (CreateChecker, InsertFlushChecker,
SearchChecker, QueryChecker, IndexChecker, Op)
from common.cus_resource_opts import CustomResourceOperations as CusResource
from common.milvus_sys import MilvusSys
from utils.util_log import test_log as log
from utils.util_k8s import wait_pods_ready, get_pod_list, get_pod_ip_name_pairs
from utils.util_common import findkeys
from chaos import chaos_commons as cc
from common.common_type import CaseLabel
from chaos import constants
from delayed_assert import expect, assert_expectations
def assert_statistic(checkers, expectations={}):
for k in checkers.keys():
# expect succ if no expectations
succ_rate = checkers[k].succ_rate()
total = checkers[k].total()
average_time = checkers[k].average_time
if expectations.get(k, '') == constants.FAIL:
log.info(
f"Expect Fail: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
expect(succ_rate < 0.49 or total < 2,
f"Expect Fail: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
else:
log.info(
f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
expect(succ_rate > 0.90 and total > 2,
f"Expect Succ: {str(k)} succ rate {succ_rate}, total: {total}, average time: {average_time:.4f}")
def check_cluster_nodes(chaos_config):
# if all pods are affected, the expectation is that all ops fail.
# Even if the number of replicas is greater than 1, it cannot provide HA, so cluster_nodes is set to 1 in this situation.
if "all" in chaos_config["metadata"]["name"]:
return 1
selector = findkeys(chaos_config, "selector")
selector = list(selector)
log.info(f"chaos target selector: {selector}")
# assert len(selector) == 1
# the chaos yaml file must place the affected pod selector in the first position
selector = selector[0]
namespace = selector["namespaces"][0]
labels_dict = selector["labelSelectors"]
labels_list = []
for k, v in labels_dict.items():
labels_list.append(k+"="+v)
labels_str = ",".join(labels_list)
pods = get_pod_list(namespace, labels_str)
return len(pods)
def record_results(checkers):
res = ""
for k in checkers.keys():
check_result = checkers[k].check_result()
res += f"{str(k):10} {check_result}\n"
return res
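record_results is not invoked in this diff; a typical use would be logging a per-checker summary at the end of a run (a sketch):

# Sketch: dump one line per checker, e.g. from a teardown hook.
log.info("\n" + record_results(self.health_checkers))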
class TestChaosBase:
expect_create = constants.SUCC
expect_insert = constants.SUCC
expect_flush = constants.SUCC
expect_index = constants.SUCC
expect_search = constants.SUCC
expect_query = constants.SUCC
host = '127.0.0.1'
port = 19530
_chaos_config = None
health_checkers = {}
def parser_testcase_config(self, chaos_yaml, chaos_config):
cluster_nodes = check_cluster_nodes(chaos_config)
tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
tests_config = cc.gen_experiment_config(tests_yaml)
test_collections = tests_config.get('Collections', None)
for t in test_collections:
test_chaos = t.get('testcase', {}).get('chaos', {})
if test_chaos in chaos_yaml:
expects = t.get('testcase', {}).get(
'expectation', {}).get('cluster_1_node', {})
# for the cluster_n_node
if cluster_nodes > 1:
expects = t.get('testcase', {}).get(
'expectation', {}).get('cluster_n_node', {})
log.info(f"yaml.expects: {expects}")
self.expect_create = expects.get(
Op.create.value, constants.SUCC)
self.expect_insert = expects.get(
Op.insert.value, constants.SUCC)
self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
self.expect_index = expects.get(Op.index.value, constants.SUCC)
self.expect_search = expects.get(
Op.search.value, constants.SUCC)
self.expect_query = expects.get(Op.query.value, constants.SUCC)
log.info(f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
f"flush:{self.expect_flush}, index:{self.expect_index}, "
f"search:{self.expect_search}, query:{self.expect_query}")
return True
return False
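The lookups above imply a testcases.yaml shaped roughly as follows once loaded; this is an inferred sketch, not the authoritative schema, and the chaos file name and expectation values are illustrative:

# Assumed shape of tests_config after cc.gen_experiment_config(tests_yaml):
tests_config = {
    'Collections': [
        {'testcase': {
            'chaos': 'pod_failure_by_pod_list.yaml',
            'expectation': {
                'cluster_1_node': {'search': 'fail', 'query': 'fail'},
                'cluster_n_node': {'search': 'succ', 'query': 'succ'},
            }}},
    ]
}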
class TestChaos(TestChaosBase):
@pytest.fixture(scope="function", autouse=True)
def connection(self, host, port):
connections.add_connection(default={"host": host, "port": port})
connections.connect(alias='default')
if connections.has_connection("default") is False:
raise Exception("no connections")
self.host = host
self.port = port
@pytest.fixture(scope="function", autouse=True)
def init_health_checkers(self):
checkers = {
Op.search: SearchChecker(replica_number=2),
Op.query: QueryChecker(replica_number=2)
}
self.health_checkers = checkers
def teardown(self):
chaos_res = CusResource(kind=self._chaos_config['kind'],
group=constants.CHAOS_GROUP,
version=constants.CHAOS_VERSION,
namespace=constants.CHAOS_NAMESPACE)
meta_name = self._chaos_config.get('metadata', None).get('name', None)
chaos_res.delete(meta_name, raise_ex=False)
sleep(2)
log.info(f'Alive threads: {threading.enumerate()}')
@pytest.mark.tags(CaseLabel.L3)
# @pytest.mark.parametrize('chaos_yaml', "chaos/chaos_objects/template/pod-failure-by-pod-list.yaml")
def test_chaos(self):
# start the monitor threads to check the milvus ops
log.info("*********************Chaos Test Start**********************")
# log.info(f"chaos_yaml: {chaos_yaml}")
log.info(connections.get_connection_addr('default'))
cc.start_monitor_threads(self.health_checkers)
# get replicas info
release_name = "milvus-multi-querynode"
replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
querynode_ip_pod_pair = get_pod_ip_name_pairs(
"chaos-testing", "app.kubernetes.io/instance=milvus-multi-querynode, component=querynode")
querynode_id_pod_pair = {}
ms = MilvusSys()
for node in ms.query_nodes:
ip = node["infos"]['hardware_infos']["ip"].split(":")[0]
querynode_id_pod_pair[node["identifier"]
] = querynode_ip_pod_pair[ip]
group_list = []
for g in replicas_info.groups:
group_list.append(list(g.group_nodes))
# keep only one group in healthy status and make all other groups unhealthy by injecting chaos:
# in each affected group, one pod is put into pod-failure status
target_pod_list = []
for g in group_list[1:]:
pod = querynode_id_pod_pair[g[0]]
target_pod_list.append(pod)
chaos_config = cc.gen_experiment_config("chaos/chaos_objects/template/pod-failure-by-pod-list.yaml")
chaos_config['metadata']['name'] = f"test-multi-replicase-{int(time.time())}"
kind = chaos_config['kind']
meta_name = chaos_config.get('metadata', None).get('name', None)
chaos_config['spec']['selector']['pods']['chaos-testing'] = target_pod_list
self._chaos_config = chaos_config # cache the chaos config for tear down
log.info(f"chaos_config: {chaos_config}")
# wait 20s
sleep(constants.WAIT_PER_OP * 2)
# replicas info
replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
log.info(f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
log.info(f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")
# assert statistic: expect all ops to succeed
log.info("******1st assert before chaos: ")
assert_statistic(self.health_checkers)
# apply chaos object
chaos_res = CusResource(kind=chaos_config['kind'],
group=constants.CHAOS_GROUP,
version=constants.CHAOS_VERSION,
namespace=constants.CHAOS_NAMESPACE)
chaos_res.create(chaos_config)
log.info("chaos injected")
sleep(constants.WAIT_PER_OP * 2)
# reset counting
cc.reset_counting(self.health_checkers)
# wait 120s
sleep(constants.CHAOS_DURATION)
log.info(f'Alive threads: {threading.enumerate()}')
# replicas info
replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
log.info(f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
log.info(f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")
# assert statistic
log.info("******2nd assert after chaos injected: ")
assert_statistic(self.health_checkers,
expectations={
Op.search: constants.SUCC,
Op.query: constants.SUCC
})
# delete chaos
chaos_res.delete(meta_name)
log.info("chaos deleted")
sleep(2)
# wait all pods ready
log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}")
wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={release_name}")
log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}")
wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
log.info("all pods are ready")
# reconnect if needed
sleep(constants.WAIT_PER_OP * 2)
cc.reconnect(connections, alias='default')
# reset counting again
cc.reset_counting(self.health_checkers)
# wait 50s (varies by feature)
sleep(constants.WAIT_PER_OP * 5)
# replicas info
replicas_info, _ = self.health_checkers[Op.search].c_wrap.get_replicas()
log.info(f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}")
replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
log.info(f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}")
# assert statistic: all ops succeed again
log.info("******3rd assert after chaos deleted: ")
assert_statistic(self.health_checkers)
# assert all expectations
assert_expectations()
log.info("*********************Chaos Test Completed**********************")

@@ -1458,8 +1458,7 @@ class TestUtilityAdvanced(TestcaseBase):
cnt += r.num_rows
assert cnt == nb
- @pytest.mark.skip(reason="wait for zhuwenxing to update")
- @pytest.mark.tags(CaseLabel.Loadbalance)
+ @pytest.mark.tags(CaseLabel.L3)
def test_load_balance_normal(self):
"""
target: test load balance of collection
@@ -1490,7 +1489,7 @@ class TestUtilityAdvanced(TestcaseBase):
des_node_ids = all_querynodes[1:]
sealed_segment_ids = segment_distribution[src_node_id]["sealed"]
# load balance
- self.utility_wrap.load_balance(src_node_id, des_node_ids, sealed_segment_ids)
+ self.utility_wrap.load_balance(collection_w.name, src_node_id, des_node_ids, sealed_segment_ids)
# get segments distribution after load balance
res, _ = self.utility_wrap.get_query_segment_info(c_name)
segment_distribution = cf.get_segment_distribution(res)
@@ -1500,7 +1499,6 @@ class TestUtilityAdvanced(TestcaseBase):
# assert sealed_segment_ids is subset of des_sealed_segment_ids
assert set(sealed_segment_ids).issubset(des_sealed_segment_ids)
- @pytest.mark.skip(reason="wait for zhuwenxing to update")
@pytest.mark.tags(CaseLabel.L1)
def test_load_balance_with_src_node_not_exist(self):
"""
@@ -1533,11 +1531,10 @@ class TestUtilityAdvanced(TestcaseBase):
dst_node_ids = all_querynodes[1:]
sealed_segment_ids = segment_distribution[src_node_id]["sealed"]
# load balance
- self.utility_wrap.load_balance(invalid_src_node_id, dst_node_ids, sealed_segment_ids,
+ self.utility_wrap.load_balance(collection_w.name, invalid_src_node_id, dst_node_ids, sealed_segment_ids,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: "is not exist to balance"})
- @pytest.mark.skip(reason="wait for zhuwenxing to update")
@pytest.mark.tags(CaseLabel.L1)
def test_load_balance_with_all_dst_node_not_exist(self):
"""
@@ -1569,11 +1566,10 @@ class TestUtilityAdvanced(TestcaseBase):
dst_node_ids = [node["identifier"] for node in ms.index_nodes]
sealed_segment_ids = segment_distribution[src_node_id]["sealed"]
# load balance
- self.utility_wrap.load_balance(src_node_id, dst_node_ids, sealed_segment_ids,
+ self.utility_wrap.load_balance(collection_w.name, src_node_id, dst_node_ids, sealed_segment_ids,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: "no available queryNode to allocate"})
- @pytest.mark.skip(reason="wait for zhuwenxing to update")
@pytest.mark.tags(CaseLabel.L1)
def test_load_balance_with_one_sealed_segment_id_not_exist(self):
"""
@@ -1610,6 +1606,92 @@ class TestUtilityAdvanced(TestcaseBase):
else:
sealed_segment_ids.append(max(segment_distribution[src_node_id]["sealed"]) + 1)
# load balance
- self.utility_wrap.load_balance(src_node_id, dst_node_ids, sealed_segment_ids,
+ self.utility_wrap.load_balance(collection_w.name, src_node_id, dst_node_ids, sealed_segment_ids,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: "is not exist"})
@pytest.mark.tags(CaseLabel.L3)
def test_load_balance_in_one_group(self):
"""
target: test load balance of collection in one group
method: init a collection, load it with multiple replicas, and balance load among the querynodes within one group
expected: load balance successfully
"""
# init a collection
c_name = cf.gen_unique_str(prefix)
collection_w = self.init_collection_wrap(name=c_name)
ms = MilvusSys()
nb = 3000
df = cf.gen_default_dataframe_data(nb)
collection_w.insert(df)
# get sealed segments (accessing num_entities triggers a flush, which seals them)
collection_w.num_entities
collection_w.load(replica_number=2)
# get growing segments
collection_w.insert(df)
# get replicas information
res, _ = collection_w.get_replicas()
# prepare load balance params
# find a group that has multiple nodes
group_nodes = []
for g in res.groups:
if len(g.group_nodes) >= 2:
group_nodes = list(g.group_nodes)
break
src_node_id = group_nodes[0]
dst_node_ids = group_nodes[1:]
res, _ = self.utility_wrap.get_query_segment_info(c_name)
segment_distribution = cf.get_segment_distribution(res)
sealed_segment_ids = segment_distribution[src_node_id]["sealed"]
# load balance
self.utility_wrap.load_balance(collection_w.name, src_node_id, dst_node_ids, sealed_segment_ids)
# get segments distribution after load balance
res, _ = self.utility_wrap.get_query_segment_info(c_name)
segment_distribution = cf.get_segment_distribution(res)
sealed_segment_ids_after_load_balance = segment_distribution[src_node_id]["sealed"]
# assert the src node has no sealed segments after load balance
assert sealed_segment_ids_after_load_balance == []
des_sealed_segment_ids = []
for des_node_id in dst_node_ids:
des_sealed_segment_ids += segment_distribution[des_node_id]["sealed"]
# assert sealed_segment_ids is subset of des_sealed_segment_ids
assert set(sealed_segment_ids).issubset(des_sealed_segment_ids)
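The assertions above assume cf.get_segment_distribution returns a per-node mapping of segment IDs grouped by state, roughly like this (node and segment IDs are illustrative):

# Assumed shape (illustrative IDs): query node id -> segments by state.
segment_distribution = {
    4: {"sealed": [431, 432]},   # src node: sealed segments to move
    5: {"sealed": []},           # dst node before load balance
}
sealed_segment_ids = segment_distribution[4]["sealed"]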
@pytest.mark.tags(CaseLabel.L3)
def test_load_balance_not_in_one_group(self):
"""
target: test load balance of collection across replica groups
method: init a collection, load it with multiple replicas, and attempt load balance among querynodes in different groups
expected: load balance fails
"""
# init a collection
c_name = cf.gen_unique_str(prefix)
collection_w = self.init_collection_wrap(name=c_name)
ms = MilvusSys()
nb = 3000
df = cf.gen_default_dataframe_data(nb)
collection_w.insert(df)
# get sealed segments (accessing num_entities triggers a flush, which seals them)
collection_w.num_entities
collection_w.load(replica_number=2)
# get growing segments
collection_w.insert(df)
# get replicas information
res, _ = collection_w.get_replicas()
# prepare load balance params
all_querynodes = [node["identifier"] for node in ms.query_nodes]
# find a group that has multiple nodes
group_nodes = []
for g in res.groups:
if len(g.group_nodes) >= 2:
group_nodes = list(g.group_nodes)
break
src_node_id = group_nodes[0]
dst_node_ids = list(set(all_querynodes) - set(group_nodes))
res, _ = self.utility_wrap.get_query_segment_info(c_name)
segment_distribution = cf.get_segment_distribution(res)
sealed_segment_ids = segment_distribution[src_node_id]["sealed"]
# load balance
self.utility_wrap.load_balance(collection_w.name, src_node_id, dst_node_ids, sealed_segment_ids,
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: "must be in the same replica group"})