From 15c5469bb1aabf291864b5ba11981948b7b64fb2 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Thu, 22 Jun 2023 08:18:22 -0400
Subject: [PATCH] Add automatic regression markers (#38)

---
 README.md                                     |  2 +-
 agbenchmark/conftest.py                       | 33 ++++++++++
 agbenchmark/start_benchmark.py                | 71 +++++++++++-------
 .../tests/regression/RegressionManager.py     | 22 ++++++
 .../tests/regression/regression_tests.txt     |  0
 pyproject.toml                                |  1 +
 6 files changed, 102 insertions(+), 27 deletions(-)
 create mode 100644 agbenchmark/tests/regression/RegressionManager.py
 create mode 100644 agbenchmark/tests/regression/regression_tests.txt

diff --git a/README.md b/README.md
index 216f1202c..b46562d2d 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Share your progress :)
 
 1. `pip install auto-gpt-benchmarks`
 2. Add boilerplate code to start webserver to your agent (run loop and stop condition)
-3. `agbenchmark start --challenge challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory
+3. `agbenchmark start --category challenge_category` (omit the `--category` flag to run all tests); set hostname, port, and workspace directory in the config
 4. We call the server to run the agent for each test
 5. Show pass rate of tests, logs, and any other metrics
 
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index b3ca086d8..55f5ca82d 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -3,6 +3,7 @@ import os
 import pytest
 import shutil
 from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval
+from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
 
 
@@ -41,3 +42,35 @@ def server_response(request, config):
     #     response.status_code == 200
     # ), f"Request failed with status code {response.status_code}"
     mock_retrieval(task, config["workspace"])
+
+
+regression_txt = "agbenchmark/tests/regression/regression_tests.txt"
+
+regression_manager = RegressionManager(regression_txt)
+
+
+def pytest_runtest_makereport(item, call):
+    """Called for each test report. Generated for each stage
+    of a test run (setup, call, teardown)."""
+    if call.when == "call":
+        if (
+            call.excinfo is None
+        ):  # if no error in the call stage, record it as a regression test
+            regression_manager.add_test(item.nodeid)
+        else:  # otherwise, remove it from the regression list
+            regression_manager.remove_test(item.nodeid)
+
+
+def pytest_collection_modifyitems(items):
+    """Called once all test items are collected. Used
+    to add the regression marker to collected test items."""
+    for item in items:
+        print("pytest_collection_modifyitems", item.nodeid)
+        if item.nodeid + "\n" in regression_manager.tests:
+            print(regression_manager.tests)
+            item.add_marker(pytest.mark.regression)
+
+
+def pytest_sessionfinish():
+    """Called at the end of the session to save regression tests"""
+    regression_manager.save()
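
Note on the flow above: pytest_runtest_makereport records every test whose
call phase passes and evicts any that fails, pytest_collection_modifyitems
re-applies the regression marker to the recorded tests at the next collection,
and pytest_sessionfinish writes the list back to disk. Below is a minimal
sketch of the same round trip outside pytest, using a hypothetical scratch
path and node id (illustration only, not part of the patch):

    from agbenchmark.tests.regression.RegressionManager import RegressionManager

    path = "/tmp/regression_tests.txt"  # hypothetical scratch file
    open(path, "a").close()  # load() opens with "r", so the file must exist

    manager = RegressionManager(path)
    node_id = "agbenchmark/tests/some_test.py::test_example"  # hypothetical id

    manager.add_test(node_id)  # what pytest_runtest_makereport does on a pass
    manager.save()             # what pytest_sessionfinish does

    reloaded = RegressionManager(path)
    assert node_id + "\n" in reloaded.tests  # the check the collection hook performs

Because load() uses readlines(), each stored id keeps its trailing newline,
which is why both the manager and the collection hook compare against
node_id + "\n".
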
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 79f308435..b7a116ebc 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -10,38 +10,57 @@ def cli():
 
 
 @cli.command()
-@click.option("--challenge", default=None, help="Specific challenge to run")
-def start(challenge):
-    """Start the benchmark tests. If a challenge flag is is provided, run the challenges with that mark."""
-    with open("agbenchmark/config.json", "r") as f:
-        config = json.load(f)
+@click.option("--category", default=None, help="Specific category to run")
+@click.option("--noreg", is_flag=True, help="Skip regression tests")
+def start(category, noreg):
+    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
+    config_file = "agbenchmark/config.json"
+
+    # Check if the configuration file exists and is not empty
+    if not os.path.exists(config_file) or os.stat(config_file).st_size == 0:
+        config = {}
+
+        config["hostname"] = click.prompt(
+            "\nPlease enter a new hostname", default="localhost"
+        )
+        config["port"] = click.prompt("Please enter a new port", default=8080)
+        config["workspace"] = click.prompt(
+            "Please enter a new workspace path", default="/path/to/workspace"
+        )
+
+        with open(config_file, "w") as f:
+            json.dump(config, f)
+    else:
+        # If the configuration file exists and is not empty, load it
+        with open(config_file, "r") as f:
+            config = json.load(f)
 
     print("Current configuration:")
     for key, value in config.items():
         print(f"{key}: {value}")
 
-    update_config = click.confirm(
-        "\nDo you want to update these parameters?", default=False
-    )
-    if update_config:
-        config["hostname"] = click.prompt(
-            "\nPlease enter a new hostname", default=config["hostname"]
-        )
-        config["port"] = click.prompt("Please enter a new port", default=config["port"])
-        config["workspace"] = click.prompt(
-            "Please enter a new workspace path", default=config["workspace"]
-        )
-
-        with open("agbenchmark/config.json", "w") as f:
-            json.dump(config, f)
-
-    print("Starting benchmark tests...", challenge)
-    if challenge:
-        print(f"Running {challenge} challenges")
-        pytest.main(["agbenchmark", "-m", challenge, "-vs"])
+    print("Starting benchmark tests...", category)
+    pytest_args = ["agbenchmark", "-vs"]
+    if category:
+        pytest_args.extend(
+            ["-m", category]
+        )  # run only tests with the given category marker
+        if noreg:
+            pytest_args.extend(
+                ["-k", "not regression"]
+            )  # within that category, also exclude regression tests
+        print(f"Running {'non-regression ' + category if noreg else category} categories")
     else:
-        print("Running all challenges")
-        pytest.main(["agbenchmark", "-vs"])
+        if noreg:
+            print("Running all non-regression categories")
+            pytest_args.extend(
+                ["-k", "not regression"]
+            )  # run everything except regression tests
+        else:
+            print("Running all categories")
+
+    # Run pytest with the constructed arguments
+    pytest.main(pytest_args)
 
 
 if __name__ == "__main__":
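
For reference, the flag handling above only composes standard pytest selection
arguments; pytest.main() ends up receiving, for example:

    agbenchmark start                               -> ["agbenchmark", "-vs"]
    agbenchmark start --category retrieval          -> ["agbenchmark", "-vs", "-m", "retrieval"]
    agbenchmark start --category retrieval --noreg  -> ["agbenchmark", "-vs", "-m", "retrieval", "-k", "not regression"]
    agbenchmark start --noreg                       -> ["agbenchmark", "-vs", "-k", "not regression"]

Deselecting with -k "not regression" works because a test's marker names are
part of its keyword set; -m "not regression" would be the marker-native
equivalent.
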
diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py
new file mode 100644
index 000000000..9117d53f1
--- /dev/null
+++ b/agbenchmark/tests/regression/RegressionManager.py
@@ -0,0 +1,22 @@
+class RegressionManager:
+    """Abstracts interaction with the regression tests file"""
+
+    def __init__(self, filename: str):
+        self.filename = filename
+        self.load()
+
+    def load(self) -> None:
+        with open(self.filename, "r") as f:
+            self.tests = f.readlines()
+
+    def save(self) -> None:
+        with open(self.filename, "w") as f:
+            f.writelines(self.tests)
+
+    def add_test(self, test_id) -> None:
+        if f"{test_id}\n" not in self.tests:
+            self.tests.append(f"{test_id}\n")
+
+    def remove_test(self, test_id) -> None:
+        if f"{test_id}\n" in self.tests:
+            self.tests.remove(f"{test_id}\n")
diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/pyproject.toml b/pyproject.toml
index f88821cf2..5498381a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ testpaths = [
 ]
 markers = [
     "retrieval",
+    "regression"
 ]
 
 [tool.poetry.scripts]
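
With both markers registered here, pytest will not warn about unknown marks,
and the regression set can also be selected directly, independent of the
start command, e.g. to re-run only the tests that passed in an earlier
session:

    pytest agbenchmark -m regression -vs

regression_tests.txt is checked in empty and rewritten by the conftest.py
hooks on every run, so it always holds each executed test's most recent
outcome.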