milvus/tests/python_client/chaos/chaos_test.sh

#!/bin/bash

# Set PS4 prompt to display line number, function name and timestamp
export PS4='+(${BASH_SOURCE}:${LINENO}):${FUNCNAME[0]:+${FUNCNAME[0]}(): }'

set -e  # Exit immediately if a command exits with a non-zero status
set -x  # Print commands and their arguments as they are executed

# Store the initial directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Function to log important steps
declare -A colors=(
    ["INFO"]=$'\033[32m'    # Green
    ["BOLD"]=$'\033[1m'     # Bold
    ["TIME"]=$'\033[36m'    # Cyan
    ["RESET"]=$'\033[0m'    # Reset
)

log_step() {
    # Check if stdout is a terminal
    if [ -t 1 ]; then
        local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
        echo -e "${colors[INFO]}${colors[BOLD]}===> STEP [${colors[TIME]}${timestamp}${colors[INFO]}]: $1${colors[RESET]}"
    else
        # If not terminal (e.g., redirected to file), don't use colors
        local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
        echo "===> STEP [${timestamp}]: $1"
    fi
}

# Cleanup function to ensure resources are properly released
cleanup() {
    local exit_code=$?
    log_step "Performing cleanup (exit code: $exit_code)"

    # Make sure we're in the correct directory for cleanup
    cd "${SCRIPT_DIR}" || true

    # Export logs
    cur_time=$(date +%Y-%m-%d-%H-%M-%S)
    if [ -f "../../scripts/export_log_k8s.sh" ]; then
        bash ../../scripts/export_log_k8s.sh ${ns} ${release} k8s_log/${target_component}-${chaos_type}-${cur_time} || true
    else
        echo "Warning: export_log_k8s.sh not found in expected location"
    fi

    # Uninstall Milvus
    if [ -f "./scripts/uninstall_milvus.sh" ]; then
        bash ./scripts/uninstall_milvus.sh ${release} ${ns} || true
    else
        echo "Warning: uninstall_milvus.sh not found in expected location"
    fi

    exit $exit_code
}

# Set up trap to catch exits
trap cleanup EXIT

# Initialize basic variables
ns="chaos-testing"
cur_time=$(date +%H-%M-%S)
target_component=${1:-"standalone"}
chaos_type=${2:-"pod_kill"}
node_num=${3:-1}

log_step "Initializing with parameters: target_component=${target_component}, chaos_type=${chaos_type}, node_num=${node_num}"

# Generate release name
release_name="test"-${target_component}-${chaos_type/_/-}-${cur_time}
release=${RELEASE_NAME:-"${release_name}"}

# Normalize chaos type format
chaos_type=${chaos_type/-/_}
log_step "Configured chaos_type: ${chaos_type}"

# Change to scripts directory
pushd ./scripts || exit 1
log_step "Uninstalling existing Milvus instance if any"
bash uninstall_milvus.sh ${release} ${ns} || true

# Map component names
declare -A target_component_map=(["querynode"]="queryNode" ["indexnode"]="indexNode" ["datanode"]="dataNode" ["proxy"]="proxy")
log_step "Installing Milvus"

# Install cluster configuration if not standalone
if [[ ${target_component} != *"standalone"* ]]; then
    log_step "Installing cluster configuration"
    helm repo add milvus https://zilliztech.github.io/milvus-helm/
    helm repo update milvus
    helm install --wait --debug --timeout 360s ${release} milvus/milvus \
         --set ${target_component_map[${target_component}]}.replicas=$node_num \
         -f ../cluster-values.yaml -n=${ns}
fi

# Install standalone configuration
if [[ ${target_component} == *"standalone"* ]]; then
    log_step "Installing standalone configuration"
    helm install --wait --debug --timeout 360s ${release} milvus/milvus \
         -f ../standalone-values.yaml -n=${ns}
fi

# Wait for all pods to be ready
log_step "Waiting for pods to be ready"
kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${release} -n ${ns} --timeout=360s
kubectl wait --for=condition=Ready pod -l release=${release} -n ${ns} --timeout=360s
kubectl get pod -o wide -l app.kubernetes.io/instance=${release} -n ${ns}
popd || exit 1

# Configure service and get LoadBalancer IP
log_step "Starting chaos testing"
kubectl patch svc ${release}-milvus -p='{"spec":{"type":"LoadBalancer"}}' -n ${ns}
loadbalancer_ip=$(kubectl get svc ${release}-milvus -n ${ns} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
host=${loadbalancer_ip}

# Run initial e2e tests
log_step "Running initial e2e tests"
pytest -s -v ../testcases/test_e2e.py --host "$host" --log-cli-level=INFO --capture=no
python3 scripts/hello_milvus.py --host "$host"

# Run parallel chaos and request tests
log_step "Starting parallel chaos and request tests"
pytest test_chaos_apply.py --milvus_ns ${ns} --chaos_type ${chaos_type} \
      --target_component ${target_component} --host "$host" \
      --log-cli-level=INFO --capture=no &
pytest testcases/test_single_request_operation.py --host "$host" \
      --request_duration 15m --log-cli-level=INFO --capture=no &
wait

# Wait for system recovery after chaos tests
log_step "Waiting for pods to be ready after chaos tests"
kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${release} -n ${ns} --timeout=360s
kubectl wait --for=condition=Ready pod -l release=${release} -n ${ns} --timeout=360s

# Run final verification tests
log_step "Running final e2e tests"
pytest -s -v ../testcases/test_e2e.py --host "$host" --log-cli-level=INFO --capture=no || echo "e2e test fail"
python3 scripts/hello_milvus.py --host "$host" || echo "e2e test fail"