# Pod Kill Chaos Test
#
# For each entry in the `pod` matrix this workflow creates a kind cluster,
# installs Chaos Mesh and Milvus into the `chaos-testing` namespace, runs the
# pod-kill chaos test for that pod, then runs the E2E and data-consistency
# tests and uploads the collected logs.
#
# Triggers: manual dispatch, and the cron schedule below (GitHub evaluates
# cron schedules in UTC).
#
# NOTE(review): several commands below look like they have lost an argument
# (URLs, script/test file names, host addresses, a label key). They are kept
# exactly as found and marked with NOTE(review); confirm each one against the
# repository history before relying on this workflow.
name: Pod Kill Chaos Test

on:
  workflow_dispatch:
  schedule:
    - cron: "30 18 * * *"

jobs:
  test-pod-kill-chaos:
    runs-on: ubuntu-latest
    timeout-minutes: 40
    strategy:
      # Keep the remaining matrix entries running when one pod's test fails.
      fail-fast: false
      matrix:
        pod:
          - standalone
          - datacoord
          - datanode
          - indexcoord
          - indexnode
          - proxy
          - pulsar
          - querycoord
          - querynode
          - rootcoord
          - etcd
          - minio
    steps:
      # Export the Helm release name used by every later step.
      - name: Set env param
        run: |
          echo "RELEASE=test-${{ matrix.pod }}-pod-kill" >> $GITHUB_ENV

      - name: Creating kind cluster
        uses: helm/kind-action@v1.2.0

      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          helm version
          kubectl version

      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Quoted so the version is passed as a string, not parsed as a float.
          python-version: "3.8"

      # NOTE(review): `--trusted-host` appears to be missing its host argument.
      - name: Install dependency
        uses: nick-invision/retry@v2
        with:
          timeout_minutes: 5
          max_attempts: 3
          retry_on: error
          shell: bash
          command: |
            pip install -r tests/python_client/requirements.txt --trusted-host
            pip install --upgrade protobuf

      # NOTE(review): `helm repo add chaos-mesh` appears to be missing the
      # repository URL.
      - name: Deploy Chaos Mesh
        shell: bash
        run: |
          helm repo add chaos-mesh
          helm search repo chaos-mesh
          kubectl create ns chaos-testing
          helm install --wait --timeout 360s chaos-mesh chaos-mesh/chaos-mesh --namespace=chaos-testing --version v2.0.3 --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock
          kubectl get po -n chaos-testing

      # Installs Milvus (cluster or standalone values, depending on the matrix
      # pod), forwards port 19530 in the background and probes the server.
      # NOTE(review): the `bash ../../../scripts/` and `python scripts/`
      # commands appear to be missing a script file name, `helm repo add milvus`
      # a repository URL, and `nc -vz 19530` a host.
      - name: Deploy Milvus
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          echo "latest tag:"
          bash ../../../scripts/ -n milvusdb/milvus-dev -t master-latest -f master- -F -L -q
          helm repo add milvus
          helm repo update
          if [ ${{ matrix.pod }} != "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
          if [ ${{ matrix.pod }} == "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f standalone-values.yaml -n=chaos-testing; fi
          kubectl get pods -n chaos-testing
          sleep 20s
          kubectl get pods -n chaos-testing
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          # check whether port-forward success
          nc -vz 19530
          # check whether milvus server is healthy
          python scripts/

      # Points the test configuration at the pod-kill chaos object for this
      # matrix pod and runs the chaos test. A failing test is only echoed, so
      # the later steps still run.
      # NOTE(review): both `sed -i` commands and `cat` appear to be missing the
      # target file, `pytest` a test path, and `--host` its value.
      - name: Chaos Test
        timeout-minutes: 15
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          # replace chaos object
          sed -i "s/TESTS_CONFIG_LOCATION =.*/TESTS_CONFIG_LOCATION = \'chaos_objects\/pod_kill\/'/g"
          sed -i "s/ALL_CHAOS_YAMLS =.*/ALL_CHAOS_YAMLS = \'chaos_${{ matrix.pod }}_pod_kill.yaml\'/g"
          cat
          timeout 14m pytest -s -v --host --log-cli-level=INFO --capture=no || echo "chaos test failed"

      # Waits for the release's pods to become Ready, restarts the port-forward
      # and runs the E2E test cases.
      # NOTE(review): the first `kubectl wait` selector (`-l${{ env.RELEASE }}`)
      # looks like it has lost its label key; `nc -vz 19530` and both `--host`
      # flags appear to be missing a host, and `python chaos/scripts/` a script
      # file name.
      - name: Milvus E2E Test
        timeout-minutes: 10
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client
        run: |
          kubectl get pod -n chaos-testing
          # wait all pod to be ready
          kubectl wait --for=condition=Ready pod -l${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl get pod -n chaos-testing
          # `shell: bash` runs with `-eo pipefail`; without `-r` / `|| true` this
          # cleanup aborts the step when no port-forward process is left.
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          nc -vz 19530
          pytest -s -v testcases/ --host --log-cli-level=INFO --capture=no
          python chaos/scripts/ --host

      # Runs only when an earlier step failed: reinstalls Milvus and restarts
      # the port-forward.
      # NOTE(review): `bash scripts/` and `python scripts/` appear to be missing
      # a script file name, and `nc -vz 19530` a host. The standalone install
      # here uses `--set` flags, unlike the values file used in "Deploy Milvus";
      # confirm that difference is intended.
      - name: Deploy Milvus Again If Previous E2E Test Failed
        timeout-minutes: 15
        if: ${{ failure() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          kubectl config set-context --current --namespace=chaos-testing
          bash scripts/
          if [ ${{ matrix.pod }} != "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus -f cluster-values.yaml -n=chaos-testing; fi
          if [ ${{ matrix.pod }} == "standalone" ]; then helm install --wait --timeout 720s ${{ env.RELEASE }} milvus/milvus --set cluster.enabled=false --set etcd.replicaCount=1 --set minio.mode=standalone --set pulsar.enabled=false -n=chaos-testing; fi
          kubectl get pods -n chaos-testing
          sleep 20s
          kubectl get pods -n chaos-testing
          # `shell: bash` runs with `-eo pipefail`; without `-r` / `|| true` this
          # cleanup aborts the step when no port-forward process is left.
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          # check whether port-forward success
          nc -vz 19530
          # check whether milvus server is healthy
          python scripts/

      # A failing test is only echoed, so the later steps still run.
      # NOTE(review): `pytest` appears to be missing a test path and `--host`
      # its value.
      - name: Data Consist Test
        timeout-minutes: 5
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          pytest -s -v --host --log-cli-level=INFO --capture=no || echo "data consist chaos test failed"

      # Second E2E pass, run after the data-consistency test.
      # NOTE(review): same apparent gaps as the first "Milvus E2E Test" step
      # (label key in the first `kubectl wait`, host for `nc` and `--host`,
      # script file name for `python chaos/scripts/`).
      - name: Milvus E2E Test
        timeout-minutes: 10
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client
        run: |
          kubectl get pod -n chaos-testing
          kubectl wait --for=condition=Ready pod -l${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl wait --for=condition=Ready pod -l release=${{ env.RELEASE }} -n chaos-testing --timeout=360s
          kubectl get pod -n chaos-testing
          # `shell: bash` runs with `-eo pipefail`; without `-r` / `|| true` this
          # cleanup aborts the step when no port-forward process is left.
          ps aux|grep forward|grep -v grep|awk '{print $2}'|xargs -r kill -9 || true
          kubectl port-forward service/${{ env.RELEASE }}-milvus 19530 -n chaos-testing >/dev/null 2>&1 &
          sleep 20s
          nc -vz 19530
          pytest -s -v testcases/ --host --log-cli-level=INFO --capture=no
          python chaos/scripts/ --host

      # NOTE(review): both `bash ../../scripts/` commands appear to be missing
      # a script file name.
      - name: Export logs
        if: ${{ always() }}
        shell: bash
        working-directory: tests/python_client/chaos
        run: |
          #in this step, verify whether pod has been killed by pod's age
          kubectl get po -n chaos-testing
          # export k8s log for chaos mesh and milvus
          bash ../../scripts/ chaos-testing ${{ env.RELEASE }}
          bash ../../scripts/ chaos-testing chaos-daemon

      # One artifact per matrix pod, so names do not collide across jobs.
      - name: Upload logs
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: logs-${{ matrix.pod }}
          path: |
            tests/python_client/chaos/k8s_logs
            tests/python_client/chaos/reports