Integrate one challenge to auto gpt (#44)

parent 0f33416b0e
commit 2062844fa6
@@ -0,0 +1,62 @@
name: Auto-GPT Regression Test

on:
  workflow_dispatch:

jobs:
  regression-tests:
    permissions:
      pull-requests: write
      contents: write
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      matrix:
        python-version: ["3.10"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          submodules: true

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - id: get_date
        name: Get date
        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python -

      - name: Set up Poetry cache
        uses: actions/cache@v2
        with:
          path: |
            ~/.cache/pypoetry
            .venv
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }}

      - name: Set up venv and install Python dependencies
        run: |
          python -m venv venv
          source venv/bin/activate
          poetry install

      - name: Build project
        run: |
          source venv/bin/activate
          poetry build
          cd agent/Auto-GPT
          pip install -r requirements.txt
          pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl
          agbenchmark start --reg
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -157,4 +157,6 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
.DS_Store
@@ -0,0 +1,4 @@
[submodule "Auto-GPT"]
    path = agent/Auto-GPT
    url = https://github.com/Significant-Gravitas/Auto-GPT.git
    branch = benchmark-integration
@@ -1,9 +1,10 @@
import os
import importlib
import time
from agbenchmark.mocks.MockManager import MockManager
from multiprocessing import Process, Pipe

from agbenchmark.mocks.MockManager import MockManager
import os
import sys
import subprocess
import time
from dotenv import load_dotenv

load_dotenv()
@@ -26,45 +27,44 @@ def run_agent(task, mock_func, config):
    timeout = config["cutoff"]
    print(f"Running Python function '{config['func_path']}' with timeout {timeout}")

    parent_conn, child_conn = Pipe()
    # Get the current working directory
    cwd = os.getcwd()

    # Add current directory to Python's import path
    sys.path.append(cwd)


    # Import the specific agent dynamically
    module_name = config["func_path"].replace("/", ".").rstrip(".py")
    module = importlib.import_module(module_name)
    run_specific_agent = getattr(module, "run_specific_agent")

    process = Process(target=run_specific_agent, args=(task, child_conn))
    process.start()

    command = [sys.executable, "benchmarks.py", str(task)]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, cwd=cwd)

    start_time = time.time()
    timeout = config["cutoff"]

    while True:
        if (
            parent_conn.poll()
        ): # Check if there's a new message from the child process
            response, cycle_count = parent_conn.recv()
            print(f"Cycle {cycle_count}: {response}")
        output = process.stdout.readline()
        print(output.strip())

            if cycle_count >= config["cutoff"]:
                print(
                    f"Cycle count has reached the limit of {config['cutoff']}. Terminating."
                )
                child_conn.send("terminate")
                break

        if time.time() - start_time > timeout:
            print(
                "The Python function has exceeded the time limit and was terminated."
            )
            child_conn.send(
                "terminate"
            ) # Send a termination signal to the child process
            break

        if not process.is_alive():
        # Check if process has ended
        if process.poll() is not None:
            print("The Python function has finished running.")
            break

    process.join()
        # Check if process has exceeded timeout
        if time.time() - start_time > timeout:
            print("The Python function has exceeded the time limit and was terminated.")
            process.terminate()
            break

        # Optional: sleep for a while
        time.sleep(0.1)

    # Wait for process to terminate, then get return code
    process.wait()



ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
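The reworked run_agent above shells out to `python benchmarks.py <task>` in the agent directory instead of spawning `run_specific_agent` through multiprocessing. The Auto-GPT side of that contract is not part of this diff; a minimal sketch of the entry point the harness appears to assume might look like the following (the `run_specific_agent` name and the argument handling are assumptions, not the submodule's actual code):

# benchmarks.py -- hypothetical sketch of the entry point invoked by the
# harness as `python benchmarks.py <task>`; the real file lives in the
# Auto-GPT submodule on the benchmark-integration branch and may differ.
import sys


def run_specific_agent(task: str) -> None:
    # Placeholder: a real implementation would hand the task to the agent
    # and print each cycle's output so the harness can read it from stdout.
    print(f"Received task: {task}")


if __name__ == "__main__":
    # The harness passes the task as the first command-line argument.
    run_specific_agent(sys.argv[1])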
@@ -1,5 +1,5 @@
{
    "workspace": "C:\\Users\\silen\\miniagi",
    "func_path": "agent/benchmarks.py",
    "workspace": "autogpt/workspace/auto_gpt_workspace",
    "func_path": "benchmarks.py",
    "cutoff": 60
}
@@ -1,15 +1,18 @@
import json
import os
from pathlib import Path

import pytest
import shutil
from agbenchmark.tests.regression.RegressionManager import RegressionManager
from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH


@pytest.fixture(scope="module")
def config(request):
    config_file = os.path.abspath("agbenchmark/config.json")
    print(f"Config file: {config_file}")
    with open(config_file, "r") as f:

    print(f"Config file: {CONFIG_PATH}")
    with open(CONFIG_PATH, "r") as f:
        config = json.load(f)

    if request.config.getoption("--mock"):
@@ -36,10 +39,7 @@ def workspace(config):
def pytest_addoption(parser):
    parser.addoption("--mock", action="store_true", default=False)


regression_json = "agbenchmark/tests/regression/regression_tests.json"

regression_manager = RegressionManager(regression_json)
regression_manager = RegressionManager(REGRESSION_TESTS_PATH)


# this is to get the challenge_data from every test
@@ -53,13 +53,16 @@ def pytest_runtest_makereport(item, call):
    challenge_data = item.funcargs.get("challenge_data", None)
    difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
    dependencies = challenge_data.dependencies if challenge_data else []

    parts = item.nodeid.split("::")[0].split("/")
    agbenchmark_index = parts.index("agbenchmark")
    file_path = "/".join(parts[agbenchmark_index:])
    test_details = {
        "difficulty": difficulty,
        "dependencies": dependencies,
        "test": item.nodeid,
        "test": file_path,
    }


    print("pytest_runtest_makereport", test_details)
    if call.excinfo is None:
        regression_manager.add_test(item.nodeid.split("::")[1], test_details)
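RegressionManager itself is not touched by this diff; judging from how conftest.py uses it above (constructed with the regression JSON path, `add_test(name, details)` called when a test passes), a minimal stand-in could look roughly like this (everything beyond the constructor and `add_test` is an assumption):

# Hypothetical sketch of a RegressionManager compatible with the conftest
# usage above; the real class in agbenchmark.tests.regression may differ.
import json
import os


class RegressionManager:
    def __init__(self, filename: str) -> None:
        self.filename = filename
        # Load existing regression tests, tolerating a missing or empty file.
        if os.path.exists(filename) and os.path.getsize(filename) > 0:
            with open(filename) as f:
                self.tests = json.load(f)
        else:
            self.tests = {}

    def add_test(self, test_name: str, test_details: dict) -> None:
        # Record the passing test and persist the registry immediately.
        self.tests[test_name] = test_details
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)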
@@ -7,6 +7,13 @@ from dotenv import load_dotenv, set_key

load_dotenv()

CURRENT_DIRECTORY = Path(__file__).resolve().parent

new_path = CURRENT_DIRECTORY / "config.json"

CONFIG_PATH = str(new_path.resolve())

REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json")

@click.group()
def cli():
@@ -15,16 +22,12 @@ def cli():

@cli.command()
@click.option("--category", default=None, help="Specific category to run")
@click.option("--noreg", is_flag=True, help="Skip regression tests")
@click.option("--reg", is_flag=True, help="Runs only regression tests")
@click.option("--mock", is_flag=True, help="Run with mock")
def start(category, noreg, mock):
def start(category, reg, mock):
    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
    config_file = "agbenchmark/config.json"

    config_dir = os.path.abspath(config_file)

    # Check if configuration file exists and is not empty
    if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0:
    if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
        config = {}

        config["workspace"] = click.prompt(
@@ -42,11 +45,11 @@ def start(category, noreg, mock):
            default="60",
        )

        with open(config_dir, "w") as f:
        with open(CONFIG_PATH, "w") as f:
            json.dump(config, f)
    else:
        # If the configuration file exists and is not empty, load it
        with open(config_dir, "r") as f:
        with open(CONFIG_PATH, "r") as f:
            config = json.load(f)

    set_key(".env", "MOCK_TEST", "True" if mock else "False")
@@ -58,11 +61,9 @@ def start(category, noreg, mock):
    if not os.path.exists(workspace_path):
        os.makedirs(workspace_path, exist_ok=True)

    regression_path = os.path.abspath(
        "agbenchmark/tests/regression/regression_tests.json"
    )
    if not os.path.exists(regression_path):
        with open(regression_path, "a"):

    if not os.path.exists(REGRESSION_TESTS_PATH):
        with open(REGRESSION_TESTS_PATH, "a"):
            pass

    print("Current configuration:")
@@ -70,31 +71,40 @@ def start(category, noreg, mock):
        print(f"{key}: {value}")

    print("Starting benchmark tests...", category)
    pytest_args = ["agbenchmark", "-vs"]
    tests_to_run = []
    pytest_args = ["-vs"]
    if category:
        pytest_args.extend(
            ["-m", category]
        ) # run categorys that are of a specific marker
        if noreg:
            pytest_args.extend(
                ["-k", "not regression"]
            ) # run categorys that are of a specific marker but don't include regression categorys
        print(f"Running {'non-regression' + category if noreg else category} categorys")
        )
    else:
        if noreg:
            print("Running all non-regression categorys")
            pytest_args.extend(
                ["-k", "not regression"]
            ) # run categorys that are not regression categorys
        if reg:
            print("Running all regression tests")
            tests_to_run = get_regression_tests()
        else:
            print("Running all categorys") # run all categorys
            print("Running all categories")

    if mock:
        pytest_args.append("--mock")

    # Run pytest with the constructed arguments
    if not tests_to_run:
        tests_to_run = [str(CURRENT_DIRECTORY)]
    pytest_args.extend(tests_to_run)
    pytest.main(pytest_args)


def get_regression_tests():
    if not Path(REGRESSION_TESTS_PATH).exists():
        with open(REGRESSION_TESTS_PATH, 'w') as file:
            json.dump({}, file)

    with open(REGRESSION_TESTS_PATH, 'r') as file:
        data = json.load(file)

    regression_tests = [str(CURRENT_DIRECTORY / ".." / value['test']) for key, value in data.items()]

    return regression_tests

if __name__ == "__main__":
    start()
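With the changes above, --reg swaps the default test directory for the file paths recorded in regression_tests.json. As a rough illustration of the pytest invocation that results from `agbenchmark start --reg --mock` (the path below is an example, not output from the tool):

# Illustrative only: how the final pytest.main() call is assembled when
# running `agbenchmark start --reg --mock` with one recorded regression test.
import pytest

pytest_args = ["-vs", "--mock"]
tests_to_run = [
    # Produced by get_regression_tests() from regression_tests.json entries.
    "/path/to/agbenchmark/../agbenchmark/tests/basic_abilities/write_file/write_file_test.py",
]
pytest_args.extend(tests_to_run)
pytest.main(pytest_args)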
@@ -1,3 +1,5 @@
from pathlib import Path

import pytest
from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
import os
@@ -9,10 +11,11 @@ class TestWriteFile(BasicChallenge):
    def get_file_path(self) -> str: # all tests must implement this method
        return os.path.join(os.path.dirname(__file__), "w_file_data.json")

    @pytest.mark.depends(on=[], name="basic_write_file")
    def test_method(self, config):
        self.setup_challenge(config)
        files_contents = self.open_files(config["workspace"], self.data.ground.files)

        workspace = Path(os.getcwd()) / config['workspace']
        files_contents = self.open_files(workspace, self.data.ground.files)

        scores = []
        for file_content in files_contents:
@@ -1 +0,0 @@
{}
@@ -0,0 +1 @@
Subproject commit c29ec925fd9e24f219ef0f2884b08908cd66239b
@@ -1 +0,0 @@
Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b
@@ -0,0 +1,7 @@
{
    "TestWriteFile": {
        "difficulty": "basic",
        "dependencies": [],
        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
    }
}