diff --git a/.github/workflows/mem-stress-chaos-test.yaml b/.github/workflows/mem-stress-chaos-test.yaml
new file mode 100644
index 0000000000..2e1393ff58
--- /dev/null
+++ b/.github/workflows/mem-stress-chaos-test.yaml
@@ -0,0 +1,197 @@
+name: Memory Stress Chaos Test
+
+on:
+  workflow_dispatch:
+jobs:
+
+  test-memory-stress-chaos:
+
+    runs-on: ubuntu-latest
+    timeout-minutes: 40
+    strategy:
+      fail-fast: false
+      matrix:
+        pod: [datanode, indexnode, proxy, pulsar, querynode, etcd, minio]
+
+    steps:
+
+      - name: Set env param
+        run: |
+          echo "RELEASE=test-${{ matrix.pod }}-memory-stress" >> $GITHUB_ENV
+
+      - name: Create kind cluster
+        uses: helm/kind-action@v1.2.0
+
+      - name: Print cluster information
+        run: |
+          kubectl config view
+          kubectl cluster-info
+          kubectl get nodes
+          kubectl get pods -n kube-system
+          helm version
+          kubectl version
+
+      - uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: Install dependency
+        uses: nick-invision/retry@v2
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_on: error
+          shell: bash
+          command: |
+            pip install -r tests/python_client/requirements.txt --trusted-host test.pypi.org
+            pip install --upgrade protobuf
+
+      - name: Deploy Chaos Mesh
+        shell: bash
+        run: |
+          helm repo add chaos-mesh https://charts.chaos-mesh.org
+          helm search repo chaos-mesh
+          kubectl create ns chaos-testing
+          helm install --wait --timeout 360s chaos-mesh chaos-mesh/chaos-mesh --namespace=chaos-testing --version v2.0.3 --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock
+          kubectl get po -n chaos-testing
+
+      - name: Deploy Milvus
+        shell: bash
+        working-directory: tests/python_client/chaos
+        run: |
+          echo "latest tag:"
+          bash ../../../scripts/docker_image_find_tag.sh -n milvusdb/milvus-dev -t master-latest -f master- -F -L -q
+          helm repo add milvus https://milvus-io.github.io/milvus-helm
+          helm repo update
+          if [[ ${{ matrix.pod }} != *"standalone"* ]]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
+          if [[ ${{ matrix.pod }} == *"standalone"* ]]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f standalone-values.yaml -n=chaos-testing; fi
+          kubectl get pods -n chaos-testing
+          sleep 20s
+          kubectl get pods -n chaos-testing
+          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
+          sleep 20s
+          # check whether the port-forward succeeded
+          nc -vz 127.0.0.1 19530
+          # check whether the Milvus server is healthy
+          python scripts/hello_milvus.py
+
+      - name: Chaos Test
+        timeout-minutes: 15
+        shell: bash
+        working-directory: tests/python_client/chaos
+        run: |
+          # point constants.py at the memory-stress chaos objects
+          sed -i "s/TESTS_CONFIG_LOCATION =.*/TESTS_CONFIG_LOCATION = \'chaos_objects\/mem_stress\/\'/g" constants.py
+          sed -i "s/ALL_CHAOS_YAMLS =.*/ALL_CHAOS_YAMLS = \'chaos_${{ matrix.pod }}_mem_stress.yaml\'/g" constants.py
+          sed -i "s/RELEASE_NAME =.*/RELEASE_NAME = \'${{ env.RELEASE }}\'/g" constants.py
+          cat constants.py
+          timeout 14m pytest -s -v test_chaos.py --host 127.0.0.1 --log-cli-level=INFO --capture=no || echo "chaos test failed"
+
+      - name: Result Analysis
+        timeout-minutes: 15
+        shell: bash
+        working-directory: tests/python_client/chaos/reports
+        run: |
+          echo "result analysis"
+          cat ${{ env.RELEASE }}.log || echo "no log file"
+
+      - name: Milvus E2E Test
+        timeout-minutes: 10
+        if: ${{ always() }}
+        shell: bash
+        working-directory: tests/python_client
+        run: |
+          kubectl get stresschaos -n chaos-testing
+          kubectl get pod -n chaos-testing
+          # wait for all pods to be ready
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${{ env.RELEASE }} -n chaos-testing --timeout=360s
+          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
+          kubectl get pod -n chaos-testing
+          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs kill -9
+          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
+          sleep 20s
+          nc -vz 127.0.0.1 19530
+
+          pytest -s -v testcases/test_e2e.py --host 127.0.0.1 --log-cli-level=INFO --capture=no
+          python chaos/scripts/hello_milvus.py --host 127.0.0.1
+
+      - name: Export logs
+        if: ${{ always() }}
+        shell: bash
+        working-directory: tests/python_client/chaos
+        run: |
+          # in this step, check the pods' age to verify whether they were killed
+          kubectl get po -n chaos-testing
+          # export k8s logs for Chaos Mesh and Milvus
+          bash ../../scripts/export_log_k8s.sh chaos-testing ${{ env.RELEASE }} k8s_logs/chaos-test
+
+      - name: Deploy Milvus Again If Previous E2E Test Failed
+        timeout-minutes: 15
+        if: ${{ failure() }}
+        shell: bash
+        working-directory: tests/python_client/chaos
+        run: |
+          kubectl config set-context --current --namespace=chaos-testing
+          bash scripts/uninstall_milvus.sh ${{ env.RELEASE }}
+          if [ ${{ matrix.pod }} != "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
+          if [ ${{ matrix.pod }} == "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus --set cluster.enabled=false --set etcd.replicaCount=1 --set minio.mode=standalone --set pulsar.enabled=false -n=chaos-testing; fi
+          kubectl get pods -n chaos-testing
+          sleep 20s
+          kubectl get pods -n chaos-testing
+          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs kill -9
+          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
+          sleep 20s
+          # check whether the port-forward succeeded
+          nc -vz 127.0.0.1 19530
+          # check whether the Milvus server is healthy
+          python scripts/hello_milvus.py
+
+      - name: Data Consistency Test
+        timeout-minutes: 5
+        if: ${{ always() }}
+        shell: bash
+        working-directory: tests/python_client/chaos
+        run: |
+          pytest -s -v test_chaos_data_consist.py --host 127.0.0.1 --log-cli-level=INFO --capture=no || echo "data consistency chaos test failed"
+
+      - name: Milvus E2E Test
+        timeout-minutes: 10
+        if: ${{ always() }}
+        shell: bash
+        working-directory: tests/python_client
+        run: |
+          kubectl get pod -n chaos-testing
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${{ env.RELEASE }} -n chaos-testing --timeout=360s
+          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
+          kubectl get pod -n chaos-testing
+          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs kill -9
+          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
+          sleep 20s
+          nc -vz 127.0.0.1 19530
+
+          pytest -s -v testcases/test_e2e.py --host 127.0.0.1 --log-cli-level=INFO --capture=no
+          python chaos/scripts/hello_milvus.py --host 127.0.0.1
+
+      - name: Export logs
+        if: ${{ always() }}
+        shell: bash
+        working-directory: tests/python_client/chaos
+        run: |
+          # in this step, check the pods' age to verify whether they were killed
+          kubectl get po -n chaos-testing
+          # export k8s logs for Chaos Mesh and Milvus
+          bash ../../scripts/export_log_k8s.sh chaos-testing ${{ env.RELEASE }} k8s_logs/data-consist-test
+          bash ../../scripts/export_log_k8s.sh chaos-testing chaos-daemon k8s_logs/chaos-mesh-daemon
+
+      - name: Upload logs
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: logs-${{ matrix.pod }}
+          path: |
+            tests/python_client/chaos/k8s_logs
+            tests/python_client/chaos/reports
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_datanode_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_datanode_mem_stress.yaml
new file mode 100644
index 0000000000..3af33f98b9
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_datanode_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-datanode-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      component: datanode
+  mode: all
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_etcd_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_etcd_mem_stress.yaml
new file mode 100644
index 0000000000..5fdc6b0981
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_etcd_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-etcd-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      app.kubernetes.io/name: etcd
+  mode: all
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_indexnode_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_indexnode_mem_stress.yaml
new file mode 100644
index 0000000000..34914e1061
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_indexnode_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-indexnode-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      component: indexnode
+  mode: one
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_minio_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_minio_mem_stress.yaml
new file mode 100644
index 0000000000..157d38c90b
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_minio_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-minio-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      release: milvus-chaos
+      app: minio
+  mode: one
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_proxy_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_proxy_mem_stress.yaml
new file mode 100644
index 0000000000..8ac653bfcd
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_proxy_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-proxy-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      component: proxy
+  mode: one
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_pulsar_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_pulsar_mem_stress.yaml
new file mode 100644
index 0000000000..7be3fffcf2
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_pulsar_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-pulsar-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      release: milvus-chaos
+      app: pulsar
+  mode: one
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_querynode_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_querynode_mem_stress.yaml
new file mode 100644
index 0000000000..38d5a20fea
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_querynode_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-querynode-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      component: querynode
+  mode: one
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/chaos_standalone_mem_stress.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_standalone_mem_stress.yaml
new file mode 100644
index 0000000000..4336760dd2
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/chaos_standalone_mem_stress.yaml
@@ -0,0 +1,21 @@
+kind: StressChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  name: test-standalone-memory-stress
+  namespace: chaos-testing
+spec:
+  selector:
+    namespaces:
+      - chaos-testing
+    labelSelectors:
+      app.kubernetes.io/instance: milvus-chaos
+      component: standalone
+  mode: one
+  stressors:
+    cpu:
+      workers: 4
+      load: 80
+    memory:
+      workers: 4
+      size: 2048Mi
+  duration: 5m
\ No newline at end of file
diff --git a/tests/python_client/chaos/chaos_objects/mem_stress/testcases.yaml b/tests/python_client/chaos/chaos_objects/mem_stress/testcases.yaml
new file mode 100644
index 0000000000..0d3b382b2e
--- /dev/null
+++ b/tests/python_client/chaos/chaos_objects/mem_stress/testcases.yaml
@@ -0,0 +1,85 @@
+# Memory Stress Testcases All-in-one
+# memory stress
+#   standalone
+#     todo
+#   cluster-1-node
+#     11 pods (querynode, datanode, indexnode, pulsar, etcd, minio)
+#   cluster-n-nodes
+#     todo
+
+Collections:
+  -
+    testcase:
+      name: test_querynode_mem_stress
+      chaos: chaos_querynode_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          search: fail
+          query: fail
+        cluster_n_nodes:
+          search: degrade
+          query: degrade
+  -
+    testcase:
+      name: test_datanode_mem_stress
+      chaos: chaos_datanode_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
+  -
+    testcase:
+      name: test_proxy_mem_stress
+      chaos: chaos_proxy_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
+  -
+    testcase:
+      name: test_indexnode_mem_stress
+      chaos: chaos_indexnode_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
+
+  -
+    testcase:
+      name: test_pulsar_mem_stress
+      chaos: chaos_pulsar_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
+  -
+    testcase:
+      name: test_minio_mem_stress
+      chaos: chaos_minio_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
+  -
+    testcase:
+      name: test_etcd_mem_stress
+      chaos: chaos_etcd_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
+  -
+    testcase:
+      name: test_standalone_mem_stress
+      chaos: chaos_standalone_mem_stress.yaml
+      expectation:
+        cluster_1_node:
+          flush: fail
+        cluster_n_nodes:
+          flush: degrade
\ No newline at end of file
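
For reference, a minimal sketch (not part of the patch) of how one of the StressChaos manifests above can be applied and inspected by hand, assuming a cluster where Chaos Mesh is already installed in the chaos-testing namespace, as the workflow's "Deploy Chaos Mesh" step does:

    # apply the querynode memory-stress chaos object added by this patch
    kubectl apply -f tests/python_client/chaos/chaos_objects/mem_stress/chaos_querynode_mem_stress.yaml
    # list StressChaos objects, then inspect status and events of the new one
    kubectl get stresschaos -n chaos-testing
    kubectl describe stresschaos test-querynode-memory-stress -n chaos-testing
    # the stressor stops after the configured 5m duration; delete early if needed
    kubectl delete -f tests/python_client/chaos/chaos_objects/mem_stress/chaos_querynode_mem_stress.yaml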