From e25f6103443b83f017c4d0bd3a7be9c98cf7e83a Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Tue, 4 Jul 2023 13:23:00 -0400
Subject: [PATCH] local runs, home_path config, submodule miniagi (#50)

---
 .github/workflows/mini-agi.yml      |  63 +++++++++
 .gitmodules                         |   4 +
 README.md                           | 128 +-----------------
 agbenchmark/README.md               | 126 +++++++++++++++++
 agbenchmark/agent_interface.py      |  11 +-
 agbenchmark/start_benchmark.py      |   2 +-
 .../write_file/write_file_test.py   |   2 +
 agent/Auto-GPT                      |   2 +-
 agent/benchmarks.py                 |  15 --
 agent/benchmarks_example.py         |  35 +++++
 agent/config_example.json           |   6 +
 agent/gpt-engineer                  |   2 +-
 agent/mini-agi                      |   1 +
 agent/regression_tests_example.json |   7 +
 agent/smol-developer                |   2 +-
 config.json                         |   3 +-
 16 files changed, 262 insertions(+), 147 deletions(-)
 create mode 100644 .github/workflows/mini-agi.yml
 create mode 100644 agbenchmark/README.md
 delete mode 100644 agent/benchmarks.py
 create mode 100644 agent/benchmarks_example.py
 create mode 100644 agent/config_example.json
 create mode 160000 agent/mini-agi
 create mode 100644 agent/regression_tests_example.json

diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml
new file mode 100644
index 000000000..92980572a
--- /dev/null
+++ b/.github/workflows/mini-agi.yml
@@ -0,0 +1,63 @@
+name: mini-agi Regression Test
+
+on:
+  workflow_dispatch:
+    branches: [master]
+  push:
+    branches: [stable, master, ci-test*]
+
+jobs:
+  regression-tests:
+    permissions:
+      pull-requests: write
+      contents: write
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        python-version: ['3.10']
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          submodules: true
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - id: get_date
+        name: Get date
+        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Set up Poetry cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cache/pypoetry
+            .venv
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }}
+
+      - name: Set up venv and install Python dependencies
+        run: |
+          poetry install --only main
+          poetry build
+
+      - name: Run regression tests
+        run: |
+          cd agent/mini-agi
+          make install
+          source venv/bin/activate
+          pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl
+          agbenchmark start --reg
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.gitmodules b/.gitmodules
index b45a16ada..5af445f7a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,6 +6,10 @@
 	path = agent/gpt-engineer
 	url = https://github.com/merwanehamadi/gpt-engineer.git
 	branch = benchmark-integration
+[submodule "agent/mini-agi"]
+	path = agent/mini-agi
+	url = https://github.com/SilenNaihin/mini-agi.git
+	branch = benchmark-integration
 [submodule "agent/smol-developer"]
 	path = agent/smol-developer
 	url = https://github.com/merwanehamadi/developer.git
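For context on what the workflow above expects inside the new submodule: the "Run regression tests" step installs the freshly built agbenchmark wheel into mini-agi's virtualenv and runs `agbenchmark start --reg`, which launches the agent through the entry point configured in the agent's `config.json` (see `agent/benchmarks_example.py` and `agent/config_example.json` later in this patch). A minimal sketch of such an entry point is below; it is hypothetical and not part of this diff - the real file lives in the mini-agi submodule, and the `python miniagi.py <task>` invocation is only an assumption.

```python
# Hypothetical agent/mini-agi/benchmarks.py - illustrative only, not part of this patch.
# Assumes mini-agi is started as `python miniagi.py "<objective>"`.
import subprocess
import sys
from typing import Tuple


def run_specific_agent(task: str) -> Tuple[str, int]:
    # Run the agent once for the given task and capture everything it prints.
    result = subprocess.run(
        [sys.executable, "miniagi.py", task],
        capture_output=True,
        text=True,
    )
    return result.stdout, result.returncode


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python benchmarks.py <task>")
        sys.exit(1)
    output, status = run_specific_agent(sys.argv[1])
    print(output)
    sys.exit(status)
```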
diff --git a/README.md b/README.md
index c0f67f153..ed348b5ab 100644
--- a/README.md
+++ b/README.md
@@ -2,127 +2,13 @@
 A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work
 
-## As a user
+### Scores:
 
-1. `pip install auto-gpt-benchmarks`
-2. Add boilerplate code to run and kill agent
-3. `agbenchmark start`
-   - `--category challenge_category` to run tests in a specific category
-   - `--mock` to only run mock tests if they exists for each test
-   - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests
-4. We call boilerplate code for your agent
-5. Show pass rate of tests, logs, and any other metrics
+Scoring of agents will go here. Both overall and by category.
 
-## Contributing
+### Integrated Agents
 
-##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
-
-### To run the existing mocks
-
-1. clone the repo `auto-gpt-benchmarks`
-2. `pip install poetry`
-3. `poetry shell`
-4. `poetry install`
-5. `cp .env_example .env`
-6. `agbenchmark start --mock`
-   Keep config the same and watch the logs :)
-
-### To run with mini-agi
-
-1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
-2. `pip install -r requirements.txt`
-3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Sset `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10^ installed
-4. Make sure to follow the commands above, and remove mock flag `agbenchmark start`
-
-- To add requirements `poetry add requirement`.
-
-Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access.
-
-If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to last working commit
-
-Let people know what beautiful code you write does, document everything well
-
-Share your progress :)
-
-### Pytest
-
-an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic
-
-```python
-import pytest
-from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
-import os
-
-
-class TestWriteFile(BasicChallenge):
-    """Testing if LLM can write to a file"""
-
-    def get_file_path(self) -> str:  # all tests must implement this method
-        return os.path.join(os.path.dirname(__file__), "w_file_data.json")
-
-    @pytest.mark.depends(on=[], name="basic_write_file")
-    def test_method(self, workspace):
-        # implement scoring logic by looking at workspace
-```
-
-All challenges will inherit from parent class which has the mark and any specific methods for their category
-
-```python
-@pytest.mark.basic
-class BasicChallenge(Challenge):
-    pass
-```
-
-Add the below to create a file in the workspace prior to running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test.
-```python
-@pytest.fixture(
-    scope="module", autouse=True
-    )  # this is specific to setting up a file for the test, not all tests have this
-    def setup_module(self, workspace):
-        Challenge.write_to_file(
-            workspace, self.data.ground.files[0], "this is how we're doing"
-        )
-```
-
-#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
-
-## Workspace
-
-If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config
-
-#### Dataset
-
-Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/
-
-## Repo
-
-```
-|-- auto-gpt-benchmarks/ **main project directory**
-| |-- metrics.py **combining scores, metrics, final evaluation**
-| |-- start_benchmark.py **entry point from cli**
-| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization**
-| |-- Challenge.py **easy challenge creation class**
-| |-- config.json **workspace folder**
-| |-- challenges/ **challenges across different domains**
-| | |-- adaptability/
-| | |-- basic_abilities/
-| | |-- code/
-| | |-- memory/
-| | |-- retrieval/
-| | |-- web_navigation/
-| | |-- writing/
-| |-- tests/
-| | |-- basic_abilities/ **every llm should pass these challenges**
-| | |-- regression/ **challenges that already passed**
-```
-
-## How to add new agents to agbenchmark ?
-Example with smol developer.
-
-1- Create a github branch with your agent following the same pattern as this example:
-
-https://github.com/smol-ai/developer/pull/114/files
-
-2- Create the submodule and the github workflow by following the same pattern as this example:
-
-https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files
+- Auto-GPT
+- gpt-engineer
+- mini-agi
+- smol-developer
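The quick-start that was removed above reappears in `agbenchmark/README.md` below. For readers who prefer to drive the documented `agbenchmark start` flags from a script rather than a shell, a small wrapper could look like the following. This is a convenience sketch only: the flag names (`--category`, `--mock`, `--noreg`) come from the README below, everything else is illustrative.

```python
import subprocess
from typing import Optional


def run_benchmark(category: Optional[str] = None, mock: bool = False, noreg: bool = False) -> int:
    """Invoke the agbenchmark CLI with the flags documented in agbenchmark/README.md."""
    cmd = ["agbenchmark", "start"]
    if category:
        cmd += ["--category", category]
    if mock:
        cmd.append("--mock")
    if noreg:
        cmd.append("--noreg")
    return subprocess.run(cmd).returncode


if __name__ == "__main__":
    # Run only the mocked versions of the challenges, as in the quick-start.
    raise SystemExit(run_benchmark(mock=True))
```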
diff --git a/agbenchmark/README.md b/agbenchmark/README.md
new file mode 100644
index 000000000..a478f83f3
--- /dev/null
+++ b/agbenchmark/README.md
@@ -0,0 +1,126 @@
+## As a user
+
+1. `pip install auto-gpt-benchmarks`
+2. Add boilerplate code to run and kill the agent
+3. `agbenchmark start`
+   - `--category challenge_category` to run tests in a specific category
+   - `--mock` to only run mock tests if they exist for each test
+   - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previously passing challenge fails, it will no longer be counted as a regression test
+4. We call the boilerplate code for your agent
+5. Show pass rate of tests, logs, and any other metrics
+
+## Contributing
+
+##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
+
+### To run the existing mocks
+
+1. Clone the repo `auto-gpt-benchmarks`
+2. `pip install poetry`
+3. `poetry shell`
+4. `poetry install`
+5. `cp .env_example .env`
+6. `agbenchmark start --mock`
+   Keep the config the same and watch the logs :)
+
+### To run with mini-agi
+
+1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
+2. `pip install -r requirements.txt`
+3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
+4. Make sure to follow the commands above, and remove the mock flag: `agbenchmark start`
+
+- To add requirements, use `poetry add requirement`.
+
+Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.
+
+If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to the last working commit
+
+Let people know what the beautiful code you write does, and document everything well
+
+Share your progress :)
+
+### Pytest
+
+An example of a test is below. Use it as a template and change the class name, the .json file name, what the test depends on and its name, and the scoring logic
+
+```python
+import pytest
+from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
+import os
+
+
+class TestWriteFile(BasicChallenge):
+    """Testing if LLM can write to a file"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "w_file_data.json")
+
+    @pytest.mark.depends(on=[], name="basic_write_file")
+    def test_method(self, workspace):
+        # implement scoring logic by looking at workspace
+```
+
+All challenges will inherit from a parent class which has the pytest mark and any category-specific methods
+
+```python
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
+```
+
+Add the below to create a file in the workspace prior to running a challenge. Only use this when a file needs to exist in the workspace before a test runs, such as with the read_file_test.
+
+```python
+@pytest.fixture(
+    scope="module", autouse=True
+    )  # this is specific to setting up a file for the test, not all tests have this
+    def setup_module(self, workspace):
+        Challenge.write_to_file(
+            workspace, self.data.ground.files[0], "this is how we're doing"
+        )
+```
+
+#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
+
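To make the `# implement scoring logic by looking at workspace` placeholder above concrete, here is one minimal way such a check could be written. It uses plain file access against the `workspace` path rather than any particular `Challenge` helper (those helpers live in `agbenchmark/Challenge.py` and are not shown in this patch), and the file name and expected text are illustrative only.

```python
import os


def score_workspace_file(workspace: str) -> None:
    # Illustrative scoring logic: assert that the agent produced the expected file.
    # "file_to_check.txt" and the expected text are made-up values for this sketch.
    path = os.path.join(workspace, "file_to_check.txt")
    assert os.path.exists(path), "expected the agent to create file_to_check.txt"
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    assert "Washington" in content, "file exists but does not contain the expected text"
```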
+## Workspace
+
+If the `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be set automatically in the config
+
+#### Dataset
+
+Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
+
+## Repo
+
+```
+|-- auto-gpt-benchmarks/ **main project directory**
+| |-- metrics.py **combining scores, metrics, final evaluation**
+| |-- start_benchmark.py **entry point from cli**
+| |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
+| |-- Challenge.py **easy challenge creation class**
+| |-- config.json **workspace folder**
+| |-- challenges/ **challenges across different domains**
+| | |-- adaptability/
+| | |-- basic_abilities/
+| | |-- code/
+| | |-- memory/
+| | |-- retrieval/
+| | |-- web_navigation/
+| | |-- writing/
+| |-- tests/
+| | |-- basic_abilities/ **every LLM should pass these challenges**
+| | |-- regression/ **challenges that already passed**
+```
+
+## How to add new agents to agbenchmark?
+
+Example with smol developer.
+
+1- Create a GitHub branch with your agent following the same pattern as this example:
+
+https://github.com/smol-ai/developer/pull/114/files
+
+2- Create the submodule and the GitHub workflow by following the same pattern as this example:
+
+https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index bd75f8dbb..993aa242a 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,4 +1,3 @@
-import importlib
 import os
 import subprocess
 import sys
@@ -29,18 +28,18 @@ def run_agent(
         mock_manager.delegate(mock_func)
     else:
         timeout = config["cutoff"]
-        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+        print(
+            f"Running Python function '{config['entry_path']}' with timeout {timeout}"
+        )
 
         # Get the current working directory
         cwd = os.getcwd()
 
         # Add current directory to Python's import path
         sys.path.append(cwd)
+        sys.path.append(os.path.join(cwd, config["home_path"]))
 
-        module_name = config["func_path"].replace("/", ".").rstrip(".py")
-        module = importlib.import_module(module_name)
-
-        command = [sys.executable, "benchmarks.py", str(task)]
+        command = [sys.executable, config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
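The `cutoff` read above is the per-challenge time budget, and the README's "add boilerplate code to run and kill the agent" step exists so the benchmark can stop an agent that overruns it. Below is a hedged sketch of how a launched entry script could be bounded by that cutoff; the real enforcement inside agbenchmark may differ - this only illustrates the idea with the same `subprocess` machinery the diff uses, and `run_with_cutoff` is an invented helper name.

```python
import subprocess
import sys


def run_with_cutoff(entry_path: str, task: str, cutoff: int) -> int:
    # Launch the agent's entry script the same way agent_interface.py does,
    # then kill it if it runs longer than the configured cutoff (in seconds).
    process = subprocess.Popen(
        [sys.executable, entry_path, task],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    try:
        process.communicate(timeout=cutoff)
    except subprocess.TimeoutExpired:
        process.kill()  # the "kill the agent" half of the boilerplate
        process.communicate()
    return process.returncode
```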
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 7489aa309..8ef01d3c5 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -38,7 +38,7 @@ def start(category: str, reg: bool, mock: bool) -> int:
             default=os.path.join(Path.home(), "workspace"),
         )
 
-        config["func_path"] = click.prompt(
+        config["entry_path"] = click.prompt(
             "Please enter a the path to your run_specific_agent function implementation",
             default="/benchmarks.py",
         )
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 05db09657..c59e03ccf 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Any, Dict
+import pytest
 
 from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge
 
@@ -11,6 +12,7 @@ class TestWriteFile(BasicChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")
 
+    @pytest.mark.depends(name="basic_write_file")
     def test_method(self, config: Dict[str, Any]) -> None:
         self.setup_challenge(config)
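The `@pytest.mark.depends(name="basic_write_file")` marker added above gives this challenge a name that other challenges can declare as a prerequisite. A hypothetical follow-on test could depend on it like this - the class name, data file, and test name below are invented for illustration and do not exist in this patch:

```python
import os

import pytest

from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge


class TestReadFile(BasicChallenge):
    """Hypothetical follow-on challenge: only meaningful once file writing passes."""

    def get_file_path(self) -> str:  # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "r_file_data.json")

    @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
    def test_method(self, config) -> None:
        self.setup_challenge(config)
```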
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index 2e5eac51d..dd65cc256 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit 2e5eac51d06d495919d720d370c4d9efd49f4784
+Subproject commit dd65cc256ca72cb199fe8c5d6ae31c23a7acee62
diff --git a/agent/benchmarks.py b/agent/benchmarks.py
deleted file mode 100644
index eb66412c1..000000000
--- a/agent/benchmarks.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# import subprocess
-
-
-def run_specific_agent(task, conn):
-    cycle_count = 0
-    while (
-        not conn.poll()
-    ):  # Check if there's a termination signal from the main process
-        response = run_agent(task)  # run the agent and get the response and cycle count
-
-        if response:
-            cycle_count += 1
-
-        # Send response and cycle count back to the main process
-        conn.send((response, cycle_count))
diff --git a/agent/benchmarks_example.py b/agent/benchmarks_example.py
new file mode 100644
index 000000000..0c35aa9bb
--- /dev/null
+++ b/agent/benchmarks_example.py
@@ -0,0 +1,35 @@
+import os
+import sys
+from typing import Tuple
+import pexpect
+
+
+def run_specific_agent(task: str) -> Tuple[str, int]:
+    # Ensure the directory for the project exists
+    os.makedirs("workspace_path", exist_ok=True)
+
+    # Run the agent command
+    child = pexpect.spawn(f"python example.py {task}")
+
+    # Create a loop to continuously read output
+    while True:
+        try:
+            child.expect("\n")  # This waits until a newline appears
+            print(child.before.decode())  # This prints the line
+        except pexpect.EOF:
+            break  # No more output, break the loop
+
+    # Check the exit status
+    child.close()  # Close the child process
+
+    # Return child process's exit status and any error messages
+    return child.before.decode(), child.exitstatus
+
+
+if __name__ == "__main__":
+    # The first argument is the script name itself, second is the task
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <task>")
+        sys.exit(1)
+    task = sys.argv[1]
+    run_specific_agent(task)
diff --git a/agent/config_example.json b/agent/config_example.json
new file mode 100644
index 000000000..ba2ec0b80
--- /dev/null
+++ b/agent/config_example.json
@@ -0,0 +1,6 @@
+{
+  "workspace": "projects/my-new-project/workspace",
+  "entry_path": "benchmarks.py",
+  "home_path": "",
+  "cutoff": 60
+}
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index f91ac66b8..155ea895e 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit f91ac66b8e8210760aaa0047f2ca11c52e55aaa5
+Subproject commit 155ea895eb5f7e44ed8647b335d90a03b5ffb06d
diff --git a/agent/mini-agi b/agent/mini-agi
new file mode 160000
index 000000000..70bd3f035
--- /dev/null
+++ b/agent/mini-agi
@@ -0,0 +1 @@
+Subproject commit 70bd3f035e7d898221cdb0fc2912d20037fec901
diff --git a/agent/regression_tests_example.json b/agent/regression_tests_example.json
new file mode 100644
index 000000000..a0c76dc55
--- /dev/null
+++ b/agent/regression_tests_example.json
@@ -0,0 +1,7 @@
+{
+  "TestWriteFile": {
+    "difficulty": "basic",
+    "dependencies": [],
+    "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
+  }
+}
diff --git a/agent/smol-developer b/agent/smol-developer
index 896198af5..5a3ad4310 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit 896198af51dd86dc3cfc2e258c3479948844e283
+Subproject commit 5a3ad43103b238b9c8f2a2acceff250888be263e
diff --git a/config.json b/config.json
index 652618e4b..ba2ec0b80 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,6 @@
 {
   "workspace": "projects/my-new-project/workspace",
-  "func_path": "benchmarks.py",
+  "entry_path": "benchmarks.py",
+  "home_path": "",
   "cutoff": 60
 }
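Taken together, `agent/config_example.json` tells agbenchmark where the agent's workspace and entry point live, and `agent/benchmarks_example.py` shows the entry-point contract: the task arrives as the single command-line argument and the script runs the agent and reports its output and exit status. A quick way to smoke-test that boilerplate before wiring up a CI workflow could be to call it directly; the task string below is just an example, and the import assumes the snippet sits next to `agent/benchmarks_example.py`.

```python
# Hypothetical smoke test for an agent's benchmarks boilerplate - illustrative only.
from benchmarks_example import run_specific_agent

if __name__ == "__main__":
    output, status = run_specific_agent("Write the word 'hello' to a file named output.txt")
    print(f"agent exited with status {status}")
    print(output)
```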