# CI for the `benchmark` package (agbenchmark).
#
# Jobs:
#   test                  - runs the pytest suite with coverage on Ubuntu, macOS
#                           (Intel + arm64) and Windows, then uploads to Codecov.
#   self-test-with-agent  - starts an agent and runs agbenchmark against it in
#                           mock mode, then checks that no generated files
#                           were left uncommitted.
name: AGBenchmark CI

on:
  push:
    branches: [master, development, 'ci-test*']
    paths:
      - 'benchmark/**'
      - .github/workflows/benchmark-ci.yml
      - '!benchmark/reports/**'
  pull_request:
    branches: [master, development, 'release-*']
    paths:
      - 'benchmark/**'
      - '!benchmark/reports/**'
      - .github/workflows/benchmark-ci.yml

# One concurrency group per pull request (event name + PR number) or, when
# there is no head_ref, per commit SHA. In-progress runs are only cancelled
# for pull_request* events.
concurrency:
  group: ${{ format('benchmark-ci-{0}', github.head_ref && format('{0}-{1}', github.event_name, github.event.pull_request.number) || github.sha) }}
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}

defaults:
  run:
    shell: bash

env:
  # Quoted so that YAML does not read it as the float 3.1
  min-python-version: '3.10'

jobs:
  test:
    permissions:
      contents: read
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.10']
        platform-os: [ubuntu, macos, macos-arm64, windows]
    # `macos-arm64` maps to the `macos-14` runner label; every other entry
    # maps to `<platform-os>-latest`.
    runs-on: ${{ matrix.platform-os != 'macos-arm64' && format('{0}-latest', matrix.platform-os) || 'macos-14' }}
    defaults:
      run:
        shell: bash
        working-directory: benchmark

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up Python dependency cache
        # On Windows, unpacking cached dependencies takes longer than just installing them
        if: runner.os != 'Windows'
        uses: actions/cache@v4
        with:
          path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
          key: poetry-${{ runner.os }}-${{ hashFiles('benchmark/poetry.lock') }}

      - name: Install Poetry (Unix)
        if: runner.os != 'Windows'
        run: |
          curl -sSL https://install.python-poetry.org | python3 -

          # On macOS, expose the installer's bin directory to this step (PATH)
          # and to all later steps (GITHUB_PATH).
          if [ "${{ runner.os }}" = "macOS" ]; then
            PATH="$HOME/.local/bin:$PATH"
            echo "$HOME/.local/bin" >> "$GITHUB_PATH"
          fi

      - name: Install Poetry (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          (Invoke-WebRequest -Uri https://install.python-poetry.org -UseBasicParsing).Content | python -

          $env:PATH += ";$env:APPDATA\Python\Scripts"
          echo "$env:APPDATA\Python\Scripts" >> $env:GITHUB_PATH

      - name: Install Python dependencies
        run: poetry install

      - name: Run pytest with coverage
        run: |
          poetry run pytest -vv \
            --cov=agbenchmark --cov-branch --cov-report term-missing --cov-report xml \
            --durations=10 \
            tests
        env:
          CI: 'true'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          flags: agbenchmark,${{ runner.os }}

  self-test-with-agent:
    runs-on: ubuntu-latest
    # Least privilege, matching the `test` job: this job only reads the repo.
    permissions:
      contents: read
    strategy:
      matrix:
        agent-name: [forge]
      fail-fast: false
    timeout-minutes: 20

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: true

      - name: Set up Python ${{ env.min-python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.min-python-version }}

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python -

      - name: Run regression tests
        working-directory: .
        run: |
          ./run agent start ${{ matrix.agent-name }}
          cd ${{ matrix.agent-name }}

          set +e # Ignore non-zero exit codes and continue execution
          echo "Running the following command: poetry run agbenchmark --maintain --mock"
          poetry run agbenchmark --maintain --mock
          EXIT_CODE=$?
          set -e # Stop ignoring non-zero exit codes
          # Any exit code from the --maintain run is tolerated; exit code 5 is
          # additionally reported with the message below.
          if [ "$EXIT_CODE" -eq 5 ]; then
            echo "regression_tests.json is empty."
          fi

          echo "Running the following command: poetry run agbenchmark --mock"
          poetry run agbenchmark --mock

          echo "Running the following command: poetry run agbenchmark --mock --category=data"
          poetry run agbenchmark --mock --category=data

          echo "Running the following command: poetry run agbenchmark --mock --category=coding"
          poetry run agbenchmark --mock --category=coding

          echo "Running the following command: poetry run agbenchmark --test=WriteFile"
          poetry run agbenchmark --test=WriteFile

          cd ../benchmark
          poetry install
          echo "Adding the BUILD_SKILL_TREE environment variable. This will attempt to add new elements in the skill tree. If new elements are added, the CI fails because they should have been pushed"
          export BUILD_SKILL_TREE=true

          poetry run agbenchmark --mock

          # Fail if the run above modified any tracked file matching the pattern.
          # `|| echo` keeps `set -e`/pipefail from aborting when grep finds nothing.
          # NOTE(review): the dots in `../frontend/assets` are regex wildcards
          # (any two characters, then `/frontend/assets`) - confirm this is intended.
          CHANGED=$(git diff --name-only | grep -E '(agbenchmark/challenges)|(../frontend/assets)') || echo "No diffs"
          if [ -n "$CHANGED" ]; then
            echo "There are unstaged changes please run agbenchmark and commit those changes since they are needed."
            echo "$CHANGED"
            exit 1
          else
            echo "No unstaged changes."
          fi
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          TELEMETRY_ENVIRONMENT: autogpt-benchmark-ci
          TELEMETRY_OPT_IN: ${{ github.ref_name == 'master' }}