diff --git a/.github/workflows/benchmark-ci.yml b/.github/workflows/benchmark-ci.yml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/benchmark-ci.yml
@@ -0,0 +1,349 @@
+name: Benchmark CI
+
+on:
+  # Manual runs; the `agents` input selects which agents to benchmark.
+  workflow_dispatch:
+    inputs:
+      agents:
+        description: 'Agents to run (comma-separated)'
+        required: false
+        default: 'gpt-engineer,smol-developer,Auto-GPT,mini-agi,beebot,BabyAGI,PolyGPT,Turbo' # Default agents if none are specified
+  schedule:
+    - cron: '0 8 * * *'
+  push:
+    branches: [master, ci-test*]
+    paths:
+      - 'benchmark/**'
+      - '!benchmark/reports/**'
+  pull_request:
+    branches: [stable, master, release-*]
+
+jobs:
+  # Static checks for the benchmark package: flake8, black, isort, autoflake.
+  lint:
+    runs-on: ubuntu-latest
+    env:
+      min-python-version: '3.10'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+
+      - name: Set up Python ${{ env.min-python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.min-python-version }}
+
+      - id: get_date
+        name: Get date
+        working-directory: ./benchmark/
+        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+      - name: Install Poetry
+        working-directory: ./benchmark/
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Install dependencies
+        working-directory: ./benchmark/
+        run: |
+          export POETRY_VIRTUALENVS_IN_PROJECT=true
+          poetry install -vvv
+
+      - name: Lint with flake8
+        working-directory: ./benchmark/
+        run: poetry run flake8
+
+      # `success() || failure()` lets the remaining checks run even when an
+      # earlier check failed, so one run reports every formatting problem.
+      - name: Check black formatting
+        working-directory: ./benchmark/
+        run: poetry run black . --exclude test.py --check
+        if: success() || failure()
+
+      - name: Check isort formatting
+        working-directory: ./benchmark/
+        run: poetry run isort . --check
+        if: success() || failure()
+
+      - name: Check for unused imports and pass statements
+        working-directory: ./benchmark/
+        run: |
+          cmd="poetry run autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark"
+          $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1)
+        if: success() || failure()
+
+  # Picks the deployment environment and the agent list for the `tests` matrix.
+  matrix-setup:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      env-name: ${{ steps.set-matrix.outputs.env-name }}
+    steps:
+      - id: set-matrix
+        env:
+          # Passed through the environment so the user-supplied input is never
+          # spliced into the shell script text.
+          EVENT_NAME: ${{ github.event_name }}
+          REQUESTED_AGENTS: ${{ github.event.inputs.agents }}
+          ALL_AGENTS: 'gpt-engineer,smol-developer,Auto-GPT,mini-agi,beebot,BabyAGI,PolyGPT,Turbo'
+        run: |
+          # Print a comma-separated list as a compact JSON array, ignoring
+          # whitespace and empty entries.
+          to_json_array() {
+            jq -cn --arg agents "${1//[[:space:]]/}" '$agents | split(",") | map(select(. != ""))'
+          }
+
+          if [ "$EVENT_NAME" == "schedule" ]; then
+            env_name="production"
+            matrix="$(to_json_array "$ALL_AGENTS")"
+          elif [ "$EVENT_NAME" == "workflow_dispatch" ]; then
+            env_name="production"
+            # Fall back to the full list when the input is empty.
+            matrix="$(to_json_array "${REQUESTED_AGENTS:-$ALL_AGENTS}")"
+          else
+            env_name="develop"
+            matrix='["mini-agi"]'
+          fi
+
+          # The `::set-output` command is deprecated; write to $GITHUB_OUTPUT.
+          echo "env-name=$env_name" >> "$GITHUB_OUTPUT"
+          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"
+
+  # Installs the agent selected by the matrix and runs the benchmark against it.
+  tests:
+    environment:
+      name: '${{ needs.matrix-setup.outputs.env-name }}'
+    needs: matrix-setup
+    env:
+      min-python-version: '3.10'
+    name: '${{ matrix.agent-name }}'
+    runs-on: ubuntu-latest
+    timeout-minutes: 50
+    strategy:
+      fail-fast: false
+      matrix:
+        agent-name: ${{fromJson(needs.matrix-setup.outputs.matrix)}}
+    steps:
+      - name: Print Environment Name
+        run: |
+          echo "Matrix Setup Environment Name: ${{ needs.matrix-setup.outputs.env-name }}"
+
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          token: ${{ secrets.PAT_REVIEW }}
+
+      - name: Set up Python ${{ env.min-python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.min-python-version }}
+
+      - id: get_date
+        name: Get date
+        working-directory: ./benchmark/
+        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Install dependencies
+        working-directory: ./benchmark/
+        run: |
+          poetry install -vvv
+          poetry build
+
+      - name: Run regression tests
+        working-directory: ./benchmark/
+        run: |
+          # Clone the agent selected by the matrix into agent/$AGENT_NAME.
+          # Repositories and branches are the ones benchmark/.gitmodules used.
+          case "$AGENT_NAME" in
+            Auto-GPT) agent_repo="https://github.com/Significant-Gravitas/Auto-GPT" agent_branch="master" ;;
+            gpt-engineer) agent_repo="https://github.com/merwanehamadi/gpt-engineer.git" agent_branch="benchmark-integration" ;;
+            mini-agi) agent_repo="https://github.com/SilenNaihin/mini-agi" agent_branch="benchmark-integration" ;;
+            smol-developer) agent_repo="https://github.com/e2b-dev/smol-developer.git" agent_branch="benchmarks" ;;
+            SuperAGI) agent_repo="https://github.com/SilenNaihin/SuperAGI.git" agent_branch="benchmark-integration" ;;
+            BabyAGI) agent_repo="https://github.com/SilenNaihin/babyagi.git" agent_branch="benchmark-integration" ;;
+            beebot) agent_repo="https://github.com/AutoPackAI/beebot.git" agent_branch="main" ;;
+            PolyGPT) agent_repo="https://github.com/polywrap/PolyGPT.git" agent_branch="nerfzael-use-local-wrap-library" ;;
+            Turbo) agent_repo="https://github.com/lc0rp/Auto-GPT-Turbo.git" agent_branch="main" ;;
+            *)
+              echo "Unknown agent name: $AGENT_NAME"
+              exit 1
+              ;;
+          esac
+
+          mkdir agent
+          cd agent
+          git clone "$agent_repo" -b "$agent_branch" "$AGENT_NAME"
+          cd "$AGENT_NAME"
+
+          prefix=""
+          if [ "$AGENT_NAME" == "gpt-engineer" ]; then
+            make install
+            source venv/bin/activate
+          elif [ "$AGENT_NAME" == "Auto-GPT" ]; then
+            python -m venv venv
+            source venv/bin/activate
+            pip install -r requirements.txt
+            pip uninstall agbenchmark -y
+          elif [ "$AGENT_NAME" == "mini-agi" ]; then
+            python -m venv venv
+            source venv/bin/activate
+            pip install -r requirements.txt
+            cp .env_example .env
+          elif [ "$AGENT_NAME" == "smol-developer" ]; then
+            python -m venv venv
+            source venv/bin/activate
+            pip install -r requirements.txt
+          elif [ "$AGENT_NAME" == "BabyAGI" ]; then
+            python -m venv venv
+            source venv/bin/activate
+            pip install -r requirements.txt
+          elif [ "$AGENT_NAME" == "SuperAGI" ]; then
+            cp config_template.yaml config.yaml
+            # The key comes from the step's env instead of being templated into the script.
+            sed -i "s|OPENAI_API_KEY:.*|OPENAI_API_KEY: \"${OPENAI_API_KEY}\"|" config.yaml
+            docker-compose up -d --build
+          elif [ "$AGENT_NAME" == "beebot" ]; then
+            poetry install
+            poetry run playwright install
+            poetry run uvicorn beebot.initiator.api:create_app --factory --timeout-graceful-shutdown=1 &
+            prefix="poetry run "
+          elif [ "$AGENT_NAME" == "PolyGPT" ]; then
+            cp .env.template .env
+            curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.1/install.sh | bash
+            export NVM_DIR=$HOME/.nvm
+            source $NVM_DIR/nvm.sh
+            nvm install && nvm use
+            yarn install
+            export NODE_TLS_REJECT_UNAUTHORIZED=0
+          elif [ "$AGENT_NAME" == "Turbo" ]; then
+            python -m venv venv
+            source venv/bin/activate
+            pip install -r requirements.txt
+            cp .env.template .env
+            sed -i "s|your-openai-api-key|${OPENAI_API_KEY}|g" .env
+          else
+            echo "Unknown agent name: $AGENT_NAME"
+            exit 1
+          fi
+
+          pip install ../../dist/*.whl
+
+          bash -c "$(curl -fsSL https://raw.githubusercontent.com/merwanehamadi/helicone/b7ab4bc53e51d8ab29fff19ce5986ab7720970c6/mitmproxy.sh)" -s start
+
+          cd ../..
+          if [ "${GITHUB_EVENT_NAME}" == "pull_request" ] || [ "${GITHUB_EVENT_NAME}" == "push" ]; then
+            set +e # Ignore non-zero exit codes and continue execution
+            echo "Running the following command: ${prefix}agbenchmark start --maintain --mock"
+            ${prefix}agbenchmark start --maintain --mock
+            EXIT_CODE=$?
+            set -e # Stop ignoring non-zero exit codes
+            # The status of the --maintain run is not propagated; exit code 5 is
+            # only reported as an empty regression_tests.json.
+            if [ $EXIT_CODE -eq 5 ]; then
+              echo "regression_tests.json is empty."
+            fi
+
+            echo "Running the following command: ${prefix}agbenchmark start --mock"
+            ${prefix}agbenchmark start --mock
+
+            echo "Running the following command: ${prefix}agbenchmark start --mock --category=retrieval"
+            ${prefix}agbenchmark start --mock --category=retrieval
+
+            echo "Running the following command: ${prefix}agbenchmark start --mock --category=interface"
+            ${prefix}agbenchmark start --mock --category=interface
+
+            echo "Running the following command: ${prefix}agbenchmark start --mock --category=code"
+            ${prefix}agbenchmark start --mock --category=code
+
+            echo "Running the following command: ${prefix}agbenchmark start --mock --category=memory"
+            ${prefix}agbenchmark start --mock --category=memory
+
+            echo "Running the following command: ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval"
+            ${prefix}agbenchmark start --mock --suite TestRevenueRetrieval
+
+            echo "Running the following command: ${prefix}agbenchmark start --test=TestWriteFile"
+            ${prefix}agbenchmark start --test=TestWriteFile
+
+            poetry install
+            poetry run uvicorn server:app --reload &
+            sleep 5
+            export AGENT_NAME=mini-agi
+          else
+            echo "${prefix}agbenchmark start"
+            ${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved."
+          fi
+        env:
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          AGENT_NAME: ${{ matrix.agent-name }}
+          PROMPT_USER: 'false' # For mini-agi. TODO: Remove this and put it in benchmarks.py
+          HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }}
+          BASERUN_API_KEY: ${{ secrets.BASERUN_API_KEY }}
+          REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt
+          HELICONE_CACHE_ENABLED: 'false'
+          HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }}
+          REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }}
+          WOLFRAM_ALPHA_APPID: ${{ secrets.WOLFRAM_ALPHA_APPID }}
+          SERPER_API_KEY: ${{ secrets.SERPER_API_KEY }}
+          BING_SUBSCRIPTION_KEY: ${{ secrets.BING_SUBSCRIPTION_KEY }}
+
+      - name: Upload reports
+        if: always()
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.agent-name }}
+          path: reports/${{ matrix.agent-name }}
+
+      - name: Authenticate and Push to Branch
+        working-directory: ./benchmark/
+        if: (success() || failure()) && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+        run: |
+          git config --global user.email "github-bot@agpt.co"
+          git config --global user.name "Auto-GPT-Bot"
+
+          git add reports/* || echo "nothing to commit"
+          # Nothing staged means there is no report to publish.
+          if git diff --cached --quiet; then
+            echo "No report changes to push."
+            exit 0
+          fi
+
+          commit_message="${AGENT_NAME}-$(date +'%Y%m%d%H%M%S')"
+          git commit -m "${commit_message}"
+          git stash
+          current_branch="$GITHUB_REF_NAME"
+          attempts=0
+          max_attempts=3
+
+          while [ $attempts -lt $max_attempts ]; do
+            git fetch origin "$current_branch"
+            git rebase "origin/$current_branch"
+            if git push origin HEAD; then
+              echo "Success!"
+              poetry run python reports/send_to_googledrive.py || echo "Failed to upload to Google Drive"
+              exit 0
+            else
+              echo "Attempt $(($attempts + 1)) failed. Retrying..."
+              attempts=$(($attempts + 1))
+            fi
+          done
+
+          echo "Failed after $max_attempts attempts."
+          # Fail the step so an unpublished report does not go unnoticed.
+          exit 1
+        env:
+          GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }}
+          GITHUB_REF_NAME: ${{ github.ref_name }}
+          AGENT_NAME: ${{ matrix.agent-name }}
diff --git a/benchmark/.gitmodules b/benchmark/.gitmodules
deleted file mode 100644
index a8a544a54..000000000
--- a/benchmark/.gitmodules
+++ /dev/null
@@ -1,39 +0,0 @@
-[submodule "agent/Auto-GPT"]
-	path = agent/Auto-GPT
-	url = https://github.com/Significant-Gravitas/Auto-GPT
-	branch = master
-[submodule "agent/gpt-engineer"]
-	path = agent/gpt-engineer
-	url = https://github.com/merwanehamadi/gpt-engineer.git
-	branch = benchmark-integration
-[submodule "agent/mini-agi"]
-	path = agent/mini-agi
-	url = https://github.com/SilenNaihin/mini-agi.git
-	branch = benchmark-integration
-[submodule "agent/smol-developer"]
-	path = agent/smol-developer
-	url = https://github.com/e2b-dev/smol-developer.git
-	branch = benchmarks
-[submodule "agent/SuperAGI"]
-	path = agent/SuperAGI
-	url = https://github.com/SilenNaihin/SuperAGI.git
-	branch = benchmark-integration
-[submodule "agent/BabyAGI"]
-	path = agent/BabyAGI
-	url = https://github.com/SilenNaihin/babyagi.git
-	branch = benchmark-integration
-[submodule "agent/beebot"]
-	path = agent/beebot
-	url = https://github.com/AutoPackAI/beebot.git
-	branch = main
-[submodule "agent/PolyGPT"]
-	path = agent/PolyGPT
-	url = https://github.com/polywrap/PolyGPT.git
-	branch = nerfzael-use-local-wrap-library
-[submodule "frontend"]
-	path = frontend
-	url = https://github.com/agbenchmark/agbenchmark-frontend.git
-[submodule "agent/Turbo"]
-	path = agent/Turbo
-	url = https://github.com/lc0rp/Auto-GPT-Turbo.git
-	branch = main
diff --git a/benchmark/agbenchmark/__init__.py b/benchmark/agbenchmark/__init__.py
index 2fc9970ce..e69de29bb 100644
--- a/benchmark/agbenchmark/__init__.py
+++ b/benchmark/agbenchmark/__init__.py
@@ -1,5 +0,0 @@
-import pydevd_pycharm
-
-pydevd_pycharm.settrace(
-    "localhost", port=9739, stdoutToServer=True, stderrToServer=True
-)
diff --git a/benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json b/benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json
similarity index 100%
rename from benchmark/agbenchmark/challenges/library/ethereum/check_price/data.json
rename to benchmark/agbenchmark/challenges/library/ethereum/check_price/data_draft.json
diff --git a/benchmark/paper/agent_action_regex.py b/benchmark/paper/agent_action_regex.py
index 6bd55f9d1..abe4a8fdd 100644
--- a/benchmark/paper/agent_action_regex.py
+++ b/benchmark/paper/agent_action_regex.py
@@ -1,5 +1,5 @@
-import re
 import json
+import re
 
 
 def is_action_auto_gpt(log):