[skip ci] Add part chaos experiment pod failure and container kill (#7024)

* fix chaos checker

Signed-off-by: ThreadDao <yufen.zong@zilliz.com>

* add chaos queryNode pod failure

Signed-off-by: ThreadDao <yufen.zong@zilliz.com>

* add datanode pod failure chaos experiment

add export pods logs func

Signed-off-by: ThreadDao <yufen.zong@zilliz.com>
pull/7063/head
ThreadDao 2021-08-12 10:22:08 +08:00 committed by GitHub
parent ecfebff801
commit 8bf4524f6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 115 additions and 13 deletions

View File

@ -0,0 +1,18 @@
# Chaos Mesh PodChaos object named test-datanode-container-kill.
# It requests the container-kill action against the container named 'datanode'
# in pods matched by the selector below.
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
name: test-datanode-container-kill
namespace: chaos-testing
spec:
action: container-kill
# NOTE(review): 'one' presumably limits the action to a single matching pod -- confirm against Chaos Mesh docs.
mode: one
containerName: 'datanode'
# Target pods: namespace chaos-testing, labelled as the datanode component of the milvus-chaos release.
selector:
namespaces:
- chaos-testing # target namespace of milvus deployment
labelSelectors:
app.kubernetes.io/instance: milvus-chaos
app.kubernetes.io/name: milvus
component: datanode
# NOTE(review): presumably re-triggers the kill every 2 seconds -- confirm scheduler semantics.
scheduler:
cron: '@every 2s'

View File

@ -0,0 +1,19 @@
# Chaos Mesh PodChaos object named test-datanode-pod-failure.
# It requests the pod-failure action against pods matched by the selector below.
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
name: test-datanode-pod-failure
namespace: chaos-testing
spec:
action: pod-failure
# NOTE(review): 'one' presumably limits the action to a single matching pod; 'value' is left empty -- confirm against Chaos Mesh docs.
mode: one
value: ''
# NOTE(review): presumably each injected failure lasts 20 seconds -- confirm.
duration: '20s'
# Target pods: namespace chaos-testing, labelled as the datanode component of the milvus-chaos release.
selector:
namespaces:
- chaos-testing # target namespace of milvus deployment
labelSelectors:
app.kubernetes.io/instance: milvus-chaos
app.kubernetes.io/name: milvus
component: datanode
# NOTE(review): presumably re-triggers the failure every 30 seconds -- confirm scheduler semantics.
scheduler:
cron: '@every 30s'

View File

@ -0,0 +1,19 @@
# Chaos Mesh PodChaos object named test-querynode-pod-failure.
# It requests the pod-failure action against pods matched by the selector below.
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
name: test-querynode-pod-failure
namespace: chaos-testing
spec:
action: pod-failure
# NOTE(review): 'one' presumably limits the action to a single matching pod; 'value' is left empty -- confirm against Chaos Mesh docs.
mode: one
value: ''
# NOTE(review): presumably each injected failure lasts 20 seconds -- confirm.
duration: '20s'
# Target pods: namespace chaos-testing, labelled as the querynode component of the milvus-chaos release.
selector:
namespaces:
- chaos-testing # target namespace of milvus deployment
labelSelectors:
app.kubernetes.io/instance: milvus-chaos
app.kubernetes.io/name: milvus
component: querynode
# NOTE(review): presumably re-triggers the failure every 30 seconds -- confirm scheduler semantics.
scheduler:
cron: '@every 30s'

View File

@ -140,4 +140,26 @@ Collections:
testcase:
name: test_querynode_network_isolation
chaos: chaos_querynode_network_isolation.yaml
# and 10 more for the other pods
# and 10 more for the other pods
-
testcase:
name: test_datanode_container_kill
chaos: chaos_datanode_container_kill.yaml
expectation:
cluster_1_node:
insert: succ
flush: fail
cluster_n_nodes:
insert: degrade
-
testcase:
name: test_datanode_pod_failure
chaos: chaos_datanode_pod_failure.yaml
expectation:
cluster_1_node:
insert: succ
flush: fail
cluster_n_nodes:
insert: degrade

View File

@ -34,7 +34,7 @@ class Checker:
schema=cf.gen_default_collection_schema(),
timeout=timeout)
self.c_wrap.insert(data=cf.gen_default_list_data(nb=constants.ENTITIES_FOR_SEARCH),
timeout=timeout, check_task='check_nothing')
timeout=timeout)
self.initial_entities = self.c_wrap.num_entities # do as a flush
def total(self):
@ -63,7 +63,7 @@ class SearchChecker(Checker):
data=search_vec,
anns_field=ct.default_float_vec_field_name,
param={"nprobe": 32},
limit=1, timeout=timeout, check_task='check_nothing'
limit=1, timeout=timeout
)
if result:
self._succ += 1
@ -82,7 +82,7 @@ class InsertFlushChecker(Checker):
while self._running:
_, insert_result = \
self.c_wrap.insert(data=cf.gen_default_list_data(nb=constants.DELTA_PER_INS),
timeout=timeout, check_task='check_nothing')
timeout=timeout)
if not self._flush:
if insert_result:
self._succ += 1
@ -106,11 +106,11 @@ class CreateChecker(Checker):
_, result = self.c_wrap.init_collection(
name=cf.gen_unique_str("CreateChecker_"),
schema=cf.gen_default_collection_schema(),
timeout=timeout, check_task='check_nothing'
timeout=timeout
)
if result:
self._succ += 1
self.c_wrap.drop(timeout=timeout, check_task="check_nothing")
self.c_wrap.drop(timeout=timeout)
else:
self._fail += 1
sleep(constants.WAIT_PER_OP / 10)
@ -120,7 +120,7 @@ class IndexChecker(Checker):
def __init__(self):
super().__init__()
self.c_wrap.insert(data=cf.gen_default_list_data(nb=5*constants.ENTITIES_FOR_SEARCH),
timeout=timeout, check_task='check_nothing')
timeout=timeout)
log.debug(f"Index ready entities: {self.c_wrap.num_entities }") # do as a flush before indexing
def keep_running(self):
@ -128,10 +128,10 @@ class IndexChecker(Checker):
_, result = self.c_wrap.create_index(ct.default_float_vec_field_name,
constants.DEFAULT_INDEX_PARAM,
name=cf.gen_unique_str('index_'),
timeout=timeout, check_task='check_nothing')
timeout=timeout)
if result:
self._succ += 1
self.c_wrap.drop_index(timeout=timeout, check_task='check_nothing')
self.c_wrap.drop_index(timeout=timeout)
else:
self._fail += 1
@ -147,7 +147,7 @@ class QueryChecker(Checker):
for _ in range(5):
int_values.append(randint(0, constants.ENTITIES_FOR_SEARCH))
term_expr = f'{ct.default_int64_field_name} in {int_values}'
_, result = self.c_wrap.query(term_expr, timeout=timeout, check_task='check_nothing')
_, result = self.c_wrap.query(term_expr, timeout=timeout)
if result:
self._succ += 1
else:

View File

@ -129,7 +129,7 @@ class TestChaos(TestChaosBase):
chaos_opt = ChaosOpt(chaos_config['kind'])
chaos_opt.create_chaos_object(chaos_config)
log.debug("chaos injected")
sleep(constants.WAIT_PER_OP * 2.1)
# reset counting
reset_counting(self.health_checkers)

View File

@ -12,3 +12,4 @@ QUERY_NODE = "queryNode"
# my values.yaml path
MILVUS_CHART_ENV = 'MILVUS_CHART_ENV'
MILVUS_CHART_PATH = '/home/zong/milvus-helm/charts/milvus'
MILVUS_LOGS_PATH = '/tmp/milvus'

View File

@ -95,14 +95,37 @@ class HelmEnv:
service = v1.read_namespaced_service(f'{self.release_name}-milvus', constants.NAMESPACE)
return service.status.load_balancer.ingress[0].ip
def export_all_logs(self):
    """
    Export the logs of every pod returned by ``list_all_pods`` into
    ``constants.MILVUS_LOGS_PATH``, writing one ``<pod>.log`` file per pod.
    Minio pod logs are temporarily missing from the export.

    Both stdout and stderr of ``kubectl logs`` are captured in the file, so a
    failed export leaves the error message in the corresponding log file.

    :return: None
    """
    # Local import, matching the function-scope import style used in this file.
    import subprocess

    # The redirect target must exist, otherwise no log file can be written.
    os.makedirs(constants.MILVUS_LOGS_PATH, exist_ok=True)
    for pod in self.list_all_pods():
        log_file = os.path.join(constants.MILVUS_LOGS_PATH, f'{pod}.log')
        with open(log_file, 'w') as f:
            try:
                # Pods are listed from constants.NAMESPACE, so logs must be read
                # from that same namespace rather than the kubectl default one.
                # An argument list (no shell) avoids interpolating names into a
                # shell command line.
                subprocess.run(['kubectl', 'logs', pod, '-n', constants.NAMESPACE],
                               stdout=f, stderr=subprocess.STDOUT, check=False)
            except OSError as e:
                # Keep the export best-effort (e.g. kubectl not installed):
                # record the failure in the log file and continue.
                f.write(f'failed to run kubectl logs for {pod}: {e}\n')
def list_all_pods(self):
    """
    List the names of the pods in ``constants.NAMESPACE`` whose
    ``app.kubernetes.io/instance`` label equals this environment's release name.

    The kube config is loaded on every call before querying the API.

    :return: list of pod names (str)
    """
    from kubernetes import client, config

    config.load_kube_config()
    core_api = client.CoreV1Api()
    selector = f'app.kubernetes.io/instance={self.release_name}'
    pod_list = core_api.list_namespaced_pod(namespace=constants.NAMESPACE, label_selector=selector)
    return [item.metadata.name for item in pod_list.items]
if __name__ == '__main__':
# default deploy q replicas
release_name = "scale-test"
release_name = "milvus-chaos"
env = HelmEnv(release_name=release_name)
# host = env.get_svc_external_ip()
# log.debug(host)
# env.helm_install_cluster_milvus()
# env.helm_upgrade_cluster_milvus(queryNode=2)
env.helm_uninstall_cluster_milvus()
sleep(5)
# sleep(5)
# env.export_all_logs()