[skip ci] Add chaos tests for etcd and minio (#6159)

* [skip ci] Add chaos tests for etcd and minio

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>

* Update timeout for nightly

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>

* Add more minutes for CI and nightly to work around #6164

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
pull/6168/head
yanliang567 authored 2021-06-28 14:14:13 +08:00, committed by GitHub
parent c468ff30cf
commit 6f4ad331c8
10 changed files with 175 additions and 20 deletions

View File

@@ -13,7 +13,7 @@ pipeline {
     }
     options {
         timestamps()
-        timeout(time: 1, unit: 'HOURS')
+        timeout(time: 90, unit: 'MINUTES')
         buildDiscarder logRotator(artifactDaysToKeepStr: '30')
         // parallelsAlwaysFailFast()
     }

View File

@@ -15,7 +15,7 @@ pipeline {
     options {
         timestamps()
-        timeout(time: 30, unit: 'MINUTES')
+        timeout(time: 36, unit: 'MINUTES')
         // parallelsAlwaysFailFast()
     }

View File

@@ -0,0 +1,16 @@
+apiVersion: chaos-mesh.org/v1alpha1
+kind: PodChaos
+metadata:
+  name: test-etcd-podkill
+  namespace: chaos-testing
+spec:
+  action: pod-kill
+  mode: one
+  selector:
+    namespaces:
+      - chaos-testing  # target namespace of milvus deployment
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      app.kubernetes.io/name: etcd
+  scheduler:
+    cron: '@every 5s'

View File

@@ -0,0 +1,16 @@
+apiVersion: chaos-mesh.org/v1alpha1
+kind: PodChaos
+metadata:
+  name: test-minio-podkill
+  namespace: chaos-testing
+spec:
+  action: pod-kill
+  mode: one
+  selector:
+    namespaces:
+      - chaos-testing  # target namespace of milvus deployment
+    labelSelectors:
+      release: milvus-chaos
+      app: minio
+  scheduler:
+    cron: '@every 5s'
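
Note: both new PodChaos objects kill one matching pod every 5 seconds while the experiment is active. A minimal sketch of how such an object can be applied from Python with the official kubernetes client, which is presumably what ChaosOpt.create_chaos_object wraps; the local file path here is hypothetical:

import yaml
from kubernetes import client, config

config.load_kube_config()  # or load_incluster_config() when running inside the cluster
api = client.CustomObjectsApi()

with open("chaos_etcd_podkill.yaml") as f:  # hypothetical path to the object above
    body = yaml.safe_load(f)

# PodChaos is served by the chaos-mesh.org/v1alpha1 CRD; its resource plural is "podchaos"
api.create_namespaced_custom_object(
    group="chaos-mesh.org",
    version="v1alpha1",
    namespace=body["metadata"]["namespace"],
    plural="podchaos",
    body=body,
)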

View File

@@ -14,4 +14,4 @@ spec:
       app.kubernetes.io/name: milvus
       component: standalone
   scheduler:
-    cron: '@every 10s'
+    cron: '@every 3s'

View File

@@ -33,8 +33,8 @@ class ChaosOpt(object):
                                                                      metadata_name)
             log.debug(f"delete chaos response: {data}")
         except ApiException as e:
-            log.error("Exception when calling CustomObjectsApi->delete_namespaced_custom_object: %s\n" % e)
             if raise_ex:
+                log.error("Exception when calling CustomObjectsApi->delete_namespaced_custom_object: %s\n" % e)
                 raise Exception(str(e))

     def list_chaos_object(self):
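
Note: after this change the API error is only logged when the caller asked for exceptions to be raised. ChaosOpt is only partially visible in this hunk; below is a standalone sketch of the same delete call and the new raise_ex behaviour, with the group and plural assumed from the PodChaos objects above:

from kubernetes import client
from kubernetes.client.rest import ApiException

def delete_pod_chaos(name, namespace="chaos-testing", raise_ex=True):
    """Delete a PodChaos custom object; only surface failures when raise_ex is set."""
    api = client.CustomObjectsApi()
    try:
        data = api.delete_namespaced_custom_object(group="chaos-mesh.org",
                                                   version="v1alpha1",
                                                   namespace=namespace,
                                                   plural="podchaos",
                                                   name=name)
        print(f"delete chaos response: {data}")
    except ApiException as e:
        if raise_ex:
            print(f"delete_namespaced_custom_object failed: {e}")
            raise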

View File

@@ -141,14 +141,13 @@ class QueryChecker(Checker):
             int_values = []
             for _ in range(5):
                 int_values.append(randint(0, constants.ENTITIES_FOR_SEARCH))
-            # term_expr = f'{ct.default_int64_field_name} in {int_values}'
-            # _, result = self.c_wrap.query(term_expr, check_task='check_nothing')
-            result = False
-            sleep(constants.WAIT_PER_OP/10)
+            term_expr = f'{ct.default_int64_field_name} in {int_values}'
+            _, result = self.c_wrap.query(term_expr, check_task='check_nothing')
             if result:
                 self._succ += 1
             else:
                 self._fail += 1
+            sleep(constants.WAIT_PER_OP / 10)

 #
 # if __name__ == '__main__':
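
Note: with this change QueryChecker actually issues the query instead of hard-coding result = False. The Checker base class is not part of this diff; below is a minimal sketch of the pattern it appears to follow, using hypothetical names (MiniChecker, succ_rate) and assuming each checker is a thread that keeps retrying one operation and counts _succ/_fail:

import threading
from time import sleep

class MiniChecker(threading.Thread):
    """Run one operation in a loop and count successes/failures, so a chaos test
    can compare the ratios before, during and after a fault."""

    def __init__(self, op, interval=1.0):
        super().__init__(daemon=True)
        self._op = op              # callable returning True on success
        self._interval = interval
        self._succ = 0
        self._fail = 0
        self._running = True

    def run(self):
        while self._running:
            try:
                ok = self._op()
            except Exception:
                ok = False
            if ok:
                self._succ += 1
            else:
                self._fail += 1
            sleep(self._interval)

    def terminate(self):
        self._running = False

    def succ_rate(self):
        total = self._succ + self._fail
        return self._succ / total if total else 0.0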

View File

@@ -6,7 +6,6 @@ from checker import CreateChecker, InsertFlushChecker, \
     SearchChecker, QueryChecker, IndexChecker, Op
 from chaos_opt import ChaosOpt
 from utils.util_log import test_log as log
-from base.collection_wrapper import ApiCollectionWrapper
 from common import common_func as cf
 from chaos_commons import *
 from common.common_type import CaseLabel
@@ -122,14 +121,14 @@ class TestChaos(TestChaosBase):
         log.debug("******1st assert before chaos: ")
         assert_statistic(self.health_checkers)
-        # reset counting
-        reset_counting(self.health_checkers)
         # apply chaos object
         chaos_opt = ChaosOpt(chaos_config['kind'])
         chaos_opt.create_chaos_object(chaos_config)
         log.debug("chaos injected")
+        # reset counting
+        reset_counting(self.health_checkers)
         # wait 120s
         sleep(constants.WAIT_PER_OP*4)
@@ -155,19 +154,17 @@ class TestChaos(TestChaosBase):
             log.debug(f"Thread {k} is_alive(): {t.is_alive()}")
         sleep(2)
         # reconnect if needed
-        sleep(constants.WAIT_PER_OP)
+        sleep(constants.WAIT_PER_OP*2)
         reconnect(connections, self.host, self.port)
         # reset counting again
         reset_counting(self.health_checkers)
         # wait 300s (varies by feature)
-        sleep(constants.WAIT_PER_OP*1.5)
+        sleep(constants.WAIT_PER_OP*2.5)
         # assert statistic: all ops success again
         log.debug("******3rd assert after chaos deleted: ")
         assert_statistic(self.health_checkers)
         log.debug("*********************Chaos Test Completed**********************")

View File

@@ -0,0 +1,127 @@
+import pytest
+import datetime
+from time import sleep
+from pymilvus_orm import connections, utility
+from base.collection_wrapper import ApiCollectionWrapper
+from chaos_opt import ChaosOpt
+from common import common_func as cf
+from common import common_type as ct
+from chaos_commons import *
+from common.common_type import CaseLabel, CheckTasks
+import constants
+
+
+def reboot_pod(chaos_yaml):
+    # parse chaos object
+    chaos_config = gen_experiment_config(chaos_yaml)
+    log.debug(chaos_config)
+    # inject chaos
+    chaos_opt = ChaosOpt(chaos_config['kind'])
+    chaos_opt.create_chaos_object(chaos_config)
+    log.debug("chaos injected")
+    sleep(1)
+    # delete chaos
+    meta_name = chaos_config.get('metadata', None).get('name', None)
+    chaos_opt.delete_chaos_object(meta_name)
+    log.debug("chaos deleted")
+
+
+class TestChaosData:
+
+    host = 'localhost'
+    port = 19530
+
+    @pytest.fixture(scope="function", autouse=True)
+    def connection(self, host, port):
+        connections.add_connection(default={"host": host, "port": port})
+        conn = connections.connect(alias='default')
+        if conn is None:
+            raise Exception("no connections")
+        self.host = host
+        self.port = port
+        return conn
+
+    @pytest.mark.tags(CaseLabel.L3)
+    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
+    def test_chaos_data_consist(self, connection, chaos_yaml):
+        c_name = cf.gen_unique_str('chaos_collection_')
+        nb = 5000
+        i_name = cf.gen_unique_str('chaos_index_')
+        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}}
+
+        # create
+        t0 = datetime.datetime.now()
+        collection_w = ApiCollectionWrapper()
+        collection_w.init_collection(name=c_name,
+                                     schema=cf.gen_default_collection_schema())
+        tt = datetime.datetime.now() - t0
+        log.debug(f"assert create: {tt}")
+        assert collection_w.name == c_name
+
+        # insert
+        data = cf.gen_default_list_data(nb=nb)
+        t0 = datetime.datetime.now()
+        _, res = collection_w.insert(data)
+        tt = datetime.datetime.now() - t0
+        log.debug(f"assert insert: {tt}")
+        assert res
+
+        # flush
+        t0 = datetime.datetime.now()
+        assert collection_w.num_entities == nb
+        tt = datetime.datetime.now() - t0
+        log.debug(f"assert flush: {tt}")
+
+        # search
+        collection_w.load()
+        search_vectors = cf.gen_vectors(1, ct.default_dim)
+        t0 = datetime.datetime.now()
+        search_res, _ = collection_w.search(data=search_vectors,
+                                            anns_field=ct.default_float_vec_field_name,
+                                            param={"nprobe": 16}, limit=1)
+        tt = datetime.datetime.now() - t0
+        log.debug(f"assert search: {tt}")
+        assert len(search_res) == 1
+
+        # index
+        t0 = datetime.datetime.now()
+        index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
+                                             index_params=index_params,
+                                             name=i_name)
+        tt = datetime.datetime.now() - t0
+        log.debug(f"assert index: {tt}")
+        assert len(collection_w.indexes) == 1
+
+        # query
+        term_expr = f'{ct.default_int64_field_name} in [3001,4001,4999,2999]'
+        t0 = datetime.datetime.now()
+        query_res, _ = collection_w.query(term_expr)
+        tt = datetime.datetime.now() - t0
+        log.debug(f"assert query: {tt}")
+        assert len(query_res) == 4
+
+        # reboot a pod
+        reboot_pod(chaos_yaml)
+
+        # reconnect if needed
+        sleep(constants.WAIT_PER_OP * 4)
+        reconnect(connections, self.host, self.port)
+
+        # verify collection persists
+        assert utility.has_collection(c_name)
+        log.debug("assert collection persists")
+
+        collection_w2 = ApiCollectionWrapper()
+        collection_w2.init_collection(c_name)
+
+        # verify data persist
+        assert collection_w2.num_entities == nb
+        log.debug("assert data persists")
+
+        # verify index persists
+        assert collection_w2.has_index(i_name)
+        log.debug("assert index persists")
+
+        # verify search results persist
+        # verify query results persist
+        query_res2, _ = collection_w2.query(term_expr)
+        assert query_res2 == query_res
+        log.debug("assert query result persists")

View File

@@ -37,7 +37,7 @@ class TestPartitionParams(TestcaseBase):
         assert collection_w.has_partition(partition_name)[0]

     @pytest.mark.tags(CaseLabel.L1)
-    @pytest.mark.xfail(reason="issue #5375")
+    # @pytest.mark.xfail(reason="issue #5375")
     @pytest.mark.parametrize("partition_name", [""])
     def test_partition_empty_name(self, partition_name):
         """
@@ -455,7 +455,7 @@ class TestPartitionOperations(TestcaseBase):
         # verify that drop the partition again with exception
         partition_w.drop(check_task=CheckTasks.err_res,
-                         check_items={ct.err_code: 1, ct.err_msg: "None Type"})
+                         check_items={ct.err_code: 1, ct.err_msg: "Partition doesn't exist"})

     @pytest.mark.tags(CaseLabel.L2)
     @pytest.mark.parametrize("partition_name", [cf.gen_unique_str(prefix)])
@@ -578,7 +578,7 @@ class TestPartitionOperations(TestcaseBase):
         # release the dropped partition and check err response
         partition_w.release(check_task=CheckTasks.err_res,
-                            check_items={ct.err_code: 1, ct.err_msg: "None Type"})
+                            check_items={ct.err_code: 1, ct.err_msg: "Partition doesn't exist"})

     @pytest.mark.tags(CaseLabel.L1)
     @pytest.mark.parametrize("partition_name", [cf.gen_unique_str(prefix)])
@@ -690,7 +690,7 @@ class TestPartitionOperations(TestcaseBase):
         # insert data to partition
         partition_w.insert(cf.gen_default_dataframe_data(),
                            check_task=CheckTasks.err_res,
-                           check_items={ct.err_code: 1, ct.err_msg: "can not be find"})
+                           check_items={ct.err_code: 1, ct.err_msg: "Partition doesn't exist"})

     # TODO: update the assert error
     @pytest.mark.tags(CaseLabel.L1)
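
Note: the expected error strings above are what CheckTasks.err_res compares against. The response checker itself is not part of this diff; a hedged sketch of what such a check plausibly does, with illustrative names and keys rather than the project's actual API:

def check_err_res(error, check_items):
    # illustrative only: verify the error code and that the expected fragment
    # appears somewhere in the server's error message
    assert error.code == check_items.get('err_code', error.code)
    assert check_items['err_msg'] in str(error)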