# Pod-kill chaos test: for each Milvus component in the matrix, deploy Milvus
# and Chaos Mesh into a kind cluster, run the pod-kill chaos scenario, then
# verify that the deployment recovers (e2e, hello_milvus, data consistency).
name: Pod Kill Chaos Test

on:
  workflow_dispatch:
    inputs:
      image_tag:
        description: The image tag to use for the chaos test
        required: true
        default: 'master-latest'
  schedule:
    - cron: "30 18 * * *"

jobs:
  test-pod-kill-chaos:
    runs-on: ubuntu-latest
    timeout-minutes: 40
    strategy:
      # Keep the other matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        pod:
          - allstandalone
          - allcluster
          - standalone
          - datacoord
          - datanode
          - indexcoord
          - indexnode
          - proxy
          - pulsar
          - querycoord
          - querynode
          - rootcoord
          - etcd
          - minio
    steps:
      # RELEASE and IMAGE_TAG are exported to GITHUB_ENV so that every later
      # step can read them through the `env` context.
      - name: Set env param
        env:
          DEFAULT_IMAGE_TAG: master-latest
        run: |
          echo "RELEASE=test-${{ matrix.pod }}-pod-kill" >> $GITHUB_ENV
          echo "IMAGE_TAG=${{ github.event.inputs.image_tag || env.DEFAULT_IMAGE_TAG}}" >> $GITHUB_ENV

      - name: Creating kind cluster
        uses: helm/kind-action@v1.2.0

      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          helm version
          kubectl version

      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Quoted so the version is a string rather than a YAML float.
          python-version: '3.8'

      - name: Install dependency
        uses: nick-invision/retry@v2
        with:
          timeout_minutes: 5
          max_attempts: 3
          retry_on: error
          shell: bash
          # NOTE(review): pip's --trusted-host takes a host or host:port, not
          # a URL -- confirm whether `test.pypi.org` was intended here.
          command: |
            pip install -r tests/python_client/requirements.txt --trusted-host https://test.pypi.org
            pip install --upgrade protobuf

      # NOTE(review): the 2-minute step limit is shorter than the 360s helm
      # wait below, so the step limit is the effective timeout -- confirm.
      - name: Deploy Chaos Mesh
        timeout-minutes: 2
        shell: bash
        run: |
          helm repo add chaos-mesh https://charts.chaos-mesh.org
          helm search repo chaos-mesh
          kubectl create ns chaos-testing
          helm install --wait --timeout 360s chaos-mesh chaos-mesh/chaos-mesh \
            --namespace=chaos-testing --version v2.0.3 \
            --set chaosDaemon.runtime=containerd \
            --set chaosDaemon.socketPath=/run/containerd/containerd.sock
          kubectl get po -n chaos-testing

      - name: Deploy Milvus
        timeout-minutes: 15
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          echo "latest tag:"
          bash ../../../scripts/docker_image_find_tag.sh -n milvusdb/milvus-dev -t master-latest -f master- -F -L -q
          helm repo add milvus https://milvus-io.github.io/milvus-helm
          helm repo update
          # Any pod name containing "standalone" (standalone, allstandalone)
          # is deployed in standalone mode; everything else as a cluster.
          if [[ ${{ matrix.pod }} == *"standalone"* ]]; then
            helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus \
              --set image.all.tag=${{ env.IMAGE_TAG }} \
              -f standalone-values.yaml -n=chaos-testing
          else
            helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus \
              --set image.all.tag=${{ env.IMAGE_TAG }} \
              -f cluster-values.yaml -n=chaos-testing
          fi
          kubectl get pods -n chaos-testing
          sleep 20s
          kubectl get pods -n chaos-testing
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          # check whether port-forward success
          nc -vz 127.0.0.1 19530
          # check whether milvus server is healthy
          python scripts/hello_milvus.py

      - name: Chaos Test
        timeout-minutes: 15
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          # replace chaos object
          sed -i "s/TESTS_CONFIG_LOCATION =.*/TESTS_CONFIG_LOCATION = \'chaos_objects\/pod_kill\/'/g" constants.py
          sed -i "s/ALL_CHAOS_YAMLS =.*/ALL_CHAOS_YAMLS = \'chaos_${{ matrix.pod }}_pod_kill.yaml\'/g" constants.py
          sed -i "s/RELEASE_NAME =.*/RELEASE_NAME = \'${{ env.RELEASE }}\'/g" constants.py
          cat constants.py
          # A pytest failure is only reported here; it does not fail the step.
          timeout 14m pytest -s -v test_chaos.py --host 127.0.0.1 --log-cli-level=INFO --capture=no || echo "chaos test failed"

      - name: Result Analysis
        timeout-minutes: 1
        shell: bash
        working-directory: tests/python_client/chaos/reports
        run: |
          echo "result analysis"
          cat ${{ env.RELEASE }}.log || echo "no log file"

      # NOTE(review): the 5-minute step limit is shorter than the two 360s
      # kubectl waits below -- confirm the intended budget.
      - name: Wait all pods ready
        timeout-minutes: 5
        shell: bash
        working-directory: tests/python_client
        run: |
          kubectl get pod -n chaos-testing
          # wait all pod to be ready
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl get pod -n chaos-testing
          # Best-effort cleanup of the old port-forward: `-r` skips kill when
          # nothing matched, and `|| true` keeps `bash -eo pipefail` from
          # aborting the step when no such process is left.
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          nc -vz 127.0.0.1 19530

      - name: Run e2e test after chaos
        timeout-minutes: 5
        shell: bash
        working-directory: tests/python_client
        run: |
          pytest -s -v testcases/test_e2e.py --host 127.0.0.1 --log-cli-level=INFO --capture=no

      - name: Run hello_milvus after chaos
        timeout-minutes: 5
        shell: bash
        working-directory: tests/python_client
        run: |
          python chaos/scripts/hello_milvus.py --host 127.0.0.1

      - name: Verify all collections after chaos
        timeout-minutes: 15
        shell: bash
        working-directory: tests/python_client
        run: |
          python chaos/scripts/verify_all_collections.py --host 127.0.0.1

      - name: Export logs
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          #in this step, verify whether pod has been killed by pod's age
          kubectl get po -n chaos-testing
          # export k8s log for chaos mesh and milvus
          bash ../../scripts/export_log_k8s.sh chaos-testing ${{ env.RELEASE }} k8s_logs/chaos-test

      # Runs only when an earlier step failed: reinstall Milvus so the
      # follow-up tests below run against a fresh deployment.
      - name: Deploy Milvus Again If Previous E2E Test Failed
        timeout-minutes: 15
        if: ${{ failure() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          kubectl config set-context --current --namespace=chaos-testing
          bash scripts/uninstall_milvus.sh ${{ env.RELEASE }}
          # Use the same mode selection and image tag as the initial
          # "Deploy Milvus" step, so `allstandalone` is redeployed as
          # standalone and the image under test is the one reinstalled.
          if [[ ${{ matrix.pod }} == *"standalone"* ]]; then
            helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus \
              --set image.all.tag=${{ env.IMAGE_TAG }} \
              --set cluster.enabled=false --set etcd.replicaCount=1 \
              --set minio.mode=standalone --set pulsar.enabled=false \
              -n=chaos-testing
          else
            helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus \
              --set image.all.tag=${{ env.IMAGE_TAG }} \
              -f cluster-values.yaml -n=chaos-testing
          fi
          kubectl get pods -n chaos-testing
          sleep 20s
          kubectl get pods -n chaos-testing
          # Best-effort cleanup; must not fail when no port-forward is running.
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          # check whether port-forward success
          nc -vz 127.0.0.1 19530
          # check whether milvus server is healthy
          python scripts/hello_milvus.py

      - name: Data Consist Test
        timeout-minutes: 5
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          pytest -s -v test_chaos_data_consist.py --host 127.0.0.1 --log-cli-level=INFO --capture=no

      - name: Milvus E2E Test
        timeout-minutes: 10
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client
        run: |
          kubectl get pod -n chaos-testing
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl get pod -n chaos-testing
          # Best-effort cleanup; must not fail when no port-forward is running.
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          nc -vz 127.0.0.1 19530
          pytest -s -v testcases/test_e2e.py --host 127.0.0.1 --log-cli-level=INFO --capture=no
          python chaos/scripts/hello_milvus.py --host 127.0.0.1

      - name: Export logs
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          #in this step, verify whether pod has been killed by pod's age
          kubectl get po -n chaos-testing
          # export k8s log for chaos mesh and milvus
          bash ../../scripts/export_log_k8s.sh chaos-testing ${{ env.RELEASE }} k8s_logs/data-consist-test
          bash ../../scripts/export_log_k8s.sh chaos-testing chaos-daemon k8s_logs/chaos-mesh-daemon

      - name: Upload logs
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: logs-${{ matrix.pod }}
          path: |
            tests/python_client/chaos/k8s_logs
            tests/python_client/chaos/reports