From ef2107d9c2e78e2ff2a5e694f200ca813bdcd3f8 Mon Sep 17 00:00:00 2001 From: SwiftyOS Date: Mon, 11 Sep 2023 17:24:23 +0200 Subject: [PATCH] Fixing benchmark code --- benchmark/backend/main.py | 3 +- benchmark/benchmark/__main__.py | 29 ++++++++---- benchmark/benchmark/agent_api_interface.py | 1 - benchmark/benchmark/agent_interface.py | 1 - .../4_url_shortener/artifacts_out/test.py | 13 ++++-- .../artifacts_out/url_shortener.py | 8 +++- .../artifacts_out/tic_tac_toe.py | 40 ++++++++++++---- .../code/5_tic_tac_toe/custom_python/test.py | 24 ++++++---- .../6_battleship/artifacts_in/conftest.py | 3 +- .../artifacts_in/test_negative.py | 7 +-- .../6_battleship/artifacts_out/battleship.py | 8 +--- .../6_battleship/artifacts_out/conftest.py | 3 +- .../artifacts_out/test_negative.py | 7 +-- benchmark/benchmark/conftest.py | 27 +++++++---- benchmark/benchmark/generate_test.py | 10 ++-- benchmark/benchmark/reports/ReportManager.py | 19 ++++---- benchmark/benchmark/reports/reports.py | 46 +++++++++++-------- benchmark/benchmark/start_benchmark.py | 18 ++++---- benchmark/benchmark/utils/data_types.py | 25 +++++++--- .../benchmark/utils/dependencies/__init__.py | 1 - benchmark/benchmark/utils/utils.py | 3 -- 21 files changed, 177 insertions(+), 119 deletions(-) diff --git a/benchmark/backend/main.py b/benchmark/backend/main.py index 03880f0ed..c0c2bf2df 100644 --- a/benchmark/backend/main.py +++ b/benchmark/backend/main.py @@ -9,11 +9,10 @@ from typing import Any sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from agbenchmark.utils.utils import find_absolute_benchmark_path from fastapi import FastAPI, Query from fastapi.middleware.cors import CORSMiddleware -from agbenchmark.utils.utils import find_absolute_benchmark_path - app = FastAPI() origins = ["http://localhost:3000"] diff --git a/benchmark/benchmark/__main__.py b/benchmark/benchmark/__main__.py index f7f0a77fa..64eae925f 100644 --- a/benchmark/benchmark/__main__.py +++ b/benchmark/benchmark/__main__.py @@ -5,10 +5,10 @@ import sys from datetime import datetime, timezone from pathlib import Path from typing import Any, Optional -import toml import click import pytest +import toml from helicone.lock import HeliconeLockManager from benchmark.utils.data_types import AgentBenchmarkConfig @@ -72,7 +72,9 @@ def run_benchmark( try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") return 1 @@ -96,9 +98,10 @@ def run_benchmark( "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite." ) return 1 - - assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \ - "Error: host needs to be added to the config if api_mode is set to True." + + assert not ( + agent_benchmark_config.api_mode and not agent_benchmark_config.host + ), "Error: host needs to be added to the config if api_mode is set to True." 
print("Current configuration:") for key, value in vars(agent_benchmark_config).items(): @@ -200,7 +203,12 @@ def cli() -> None: ) @click.option("--nc", is_flag=True, help="Run without cutoff") @click.option("--cutoff", help="Set or override tests cutoff (seconds)") -@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True) +@click.option( + "--agent-config", + type=click.Path(exists=True), + help="Path to the agent benchmark_config.json file,", + required=True, +) def start( maintain: bool, improve: bool, @@ -220,8 +228,9 @@ def start( original_stdout = sys.stdout # Save the original standard output exit_code = None - - assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided" + assert ( + "benchmark_config.json" in agent_config + ), "benchmark_config.json must be provided" if backend: with open("backend/backend_stdout.txt", "w") as f: @@ -266,7 +275,9 @@ def start( def version(): """Print the version of the benchmark tool.""" current_directory = Path(__file__).resolve().parent - version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"] + version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][ + "version" + ] print(f"Benchmark Tool Version {version}") diff --git a/benchmark/benchmark/agent_api_interface.py b/benchmark/benchmark/agent_api_interface.py index 17dbd7308..6bd76de86 100644 --- a/benchmark/benchmark/agent_api_interface.py +++ b/benchmark/benchmark/agent_api_interface.py @@ -51,7 +51,6 @@ async def run_api_agent( artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id) for artifact in artifacts: - if artifact.relative_path: folder_path = os.path.join(config["workspace"], artifact.relative_path) else: diff --git a/benchmark/benchmark/agent_interface.py b/benchmark/benchmark/agent_interface.py index e7c6ac4dd..e79899717 100644 --- a/benchmark/benchmark/agent_interface.py +++ b/benchmark/benchmark/agent_interface.py @@ -12,7 +12,6 @@ from typing import Any, List import psutil from dotenv import load_dotenv - load_dotenv() helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS") diff --git a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py index 94fcac027..c3daffa80 100644 --- a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py +++ b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py @@ -1,15 +1,22 @@ import unittest -from url_shortener import shorten_url, retrieve_url + +from url_shortener import retrieve_url, shorten_url + class TestURLShortener(unittest.TestCase): def test_url_retrieval(self): # Shorten the URL to get its shortened form - shortened_url = shorten_url('https://www.example.com') + shortened_url = shorten_url("https://www.example.com") # Retrieve the original URL using the shortened URL directly retrieved_url = retrieve_url(shortened_url) - self.assertEqual(retrieved_url, 'https://www.example.com', "Retrieved URL does not match the original!") + self.assertEqual( + retrieved_url, + "https://www.example.com", + "Retrieved URL does not match the original!", + ) + if __name__ == "__main__": unittest.main() diff --git a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py 
index 8fe0d315d..89a73a82b 100644 --- a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py +++ b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py @@ -3,6 +3,7 @@ import base64 URL_MAPPING = {} + def shorten_url(url): # Convert the URL to base64 encoded_url = base64.b64encode(url.encode()).decode() @@ -12,13 +13,15 @@ def shorten_url(url): URL_MAPPING[short_url] = url return short_url + def retrieve_url(short_url): return URL_MAPPING.get(short_url, "URL not found") + def main(): parser = argparse.ArgumentParser(description="URL Shortener") - parser.add_argument('-s', '--shorten', type=str, help="URL to be shortened") - parser.add_argument('-r', '--retrieve', type=str, help="Short URL to be retrieved") + parser.add_argument("-s", "--shorten", type=str, help="URL to be shortened") + parser.add_argument("-r", "--retrieve", type=str, help="Short URL to be retrieved") args = parser.parse_args() @@ -32,5 +35,6 @@ def main(): else: print("No valid arguments provided.") + if __name__ == "__main__": main() diff --git a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py index 0caa903fa..e0163220a 100644 --- a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py +++ b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py @@ -1,38 +1,45 @@ import pprint + def column(matrix, i): return [row[i] for row in matrix] + def check(list): if len(set(list)) <= 1: if list[0] != 0: return list[0] return None + def checkDiagLeft(board): - if (board[0][0] == board[1][1] and board[1][1] == board[2][2]): + if board[0][0] == board[1][1] and board[1][1] == board[2][2]: if board[0][0] != 0: return board[0][0] return None + def checkDiagRight(board): - if (board[2][0] == board[1][1] and board[1][1] == board[0][2]): + if board[2][0] == board[1][1] and board[1][1] == board[0][2]: if board[2][0] != 0: return board[2][0] return None + def placeItem(row, column, board, current_player): if board[row][column] != 0: return None else: board[row][column] = current_player + def swapPlayers(player): - if (player == 2): + if player == 2: return 1 else: return 2 + def winner(board): for rowIndex in board: if check(rowIndex) is not None: @@ -46,23 +53,35 @@ def winner(board): return checkDiagRight(board) return 0 + def getLocation(): - location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ") + location = input( + "Choose where to play. Enter two numbers separated by a comma, for example: 1,1 " + ) print(f"\nYou picked {location}") - coordinates = [int(x) for x in location.split(',')] - while (len(coordinates) != 2 or coordinates[0] < 0 or coordinates[0] > 2 or coordinates[1] < 0 or coordinates[1] > 2): + coordinates = [int(x) for x in location.split(",")] + while ( + len(coordinates) != 2 + or coordinates[0] < 0 + or coordinates[0] > 2 + or coordinates[1] < 0 + or coordinates[1] > 2 + ): print("You inputted a location in an invalid format") - location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ") - coordinates = [int(x) for x in location.split(',')] + location = input( + "Choose where to play. 
Enter two numbers separated by a comma, for example: 1,1 " + ) + coordinates = [int(x) for x in location.split(",")] return coordinates + def gamePlay(): num_moves = 0 pp = pprint.PrettyPrinter(width=20) current_player = 1 board = [[0 for x in range(3)] for x in range(3)] - while (num_moves < 9 and winner(board) == 0): + while num_moves < 9 and winner(board) == 0: print("This is the current board: ") pp.pprint(board) coordinates = getLocation() @@ -75,5 +94,6 @@ def gamePlay(): if winner(board) == 0: print("Draw") -if __name__ == '__main__': + +if __name__ == "__main__": gamePlay() diff --git a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py index 6fa522513..94b778208 100644 --- a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py +++ b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py @@ -1,18 +1,20 @@ import subprocess + import pytest + def run_game_with_inputs(inputs): # Start the game process process = subprocess.Popen( - ['python', 'tic_tac_toe.py'], + ["python", "tic_tac_toe.py"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True + text=True, ) # Send the input moves one by one - output, errors = process.communicate('\n'.join(inputs)) + output, errors = process.communicate("\n".join(inputs)) # Print the inputs and outputs print("Inputs:\n", "\n".join(inputs)) @@ -22,14 +24,18 @@ def run_game_with_inputs(inputs): return output -@pytest.mark.parametrize("inputs, expected_output", [ - (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"), - (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"), - (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw") -]) +@pytest.mark.parametrize( + "inputs, expected_output", + [ + (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"), + (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"), + (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"), + ], +) def test_game(inputs, expected_output): output = run_game_with_inputs(inputs) assert expected_output in output -if __name__ == '__main__': + +if __name__ == "__main__": pytest.main() diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py index f1e984576..a1412966b 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py @@ -1,9 +1,8 @@ import pytest - from abstract_class import ShipPlacement, Turn - from battleship import Battleship + @pytest.fixture def battleship_game(): return Battleship() diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py index 484ae3509..34bed48b4 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py @@ -1,7 +1,6 @@ import pytest -from pydantic import ValidationError - from abstract_class import ShipPlacement, Turn +from pydantic import ValidationError def test_ship_placement_out_of_bounds(battleship_game): @@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game): 
def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id): - game = battleship_game.get_game( - initialized_game_id - ) + game = battleship_game.get_game(initialized_game_id) additional_ship = ShipPlacement( ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal" ) diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py index 5d87181f6..ad7dc83f9 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py @@ -86,9 +86,7 @@ class Battleship(AbstractBattleship): game.turns.append(turn) if hit_ship == "hit": - return TurnResponse( - result="miss", ship_type=None - ) + return TurnResponse(result="miss", ship_type=None) if hit_ship: ship_placement = next(sp for sp in game.ships if sp.ship_type == hit_ship) @@ -133,9 +131,7 @@ class Battleship(AbstractBattleship): ) if hits == total_ships_length: - return GameStatus( - is_game_over=True, winner="player" - ) + return GameStatus(is_game_over=True, winner="player") else: return GameStatus(is_game_over=False, winner=None) diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py index f1e984576..a1412966b 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py @@ -1,9 +1,8 @@ import pytest - from abstract_class import ShipPlacement, Turn - from battleship import Battleship + @pytest.fixture def battleship_game(): return Battleship() diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py index 484ae3509..34bed48b4 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py @@ -1,7 +1,6 @@ import pytest -from pydantic import ValidationError - from abstract_class import ShipPlacement, Turn +from pydantic import ValidationError def test_ship_placement_out_of_bounds(battleship_game): @@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game): def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id): - game = battleship_game.get_game( - initialized_game_id - ) + game = battleship_game.get_game(initialized_game_id) additional_ship = ShipPlacement( ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal" ) diff --git a/benchmark/benchmark/conftest.py b/benchmark/benchmark/conftest.py index f1e6ad8bf..a93867e49 100644 --- a/benchmark/benchmark/conftest.py +++ b/benchmark/benchmark/conftest.py @@ -16,7 +16,7 @@ from benchmark.reports.reports import ( generate_single_call_report, session_finish, ) -from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig +from benchmark.utils.data_types import AgentBenchmarkConfig, SuiteConfig GLOBAL_TIMEOUT = ( 1500 # The tests will stop after 25 minutes so we can send the reports. 
@@ -31,16 +31,15 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig: try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) return agent_benchmark_config except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise - - - def resolve_workspace(workspace: str) -> str: if workspace.startswith("${") and workspace.endswith("}"): # Extract the string inside ${...} @@ -65,7 +64,9 @@ def config(request: Any) -> Any: try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise @@ -73,8 +74,12 @@ def config(request: Any) -> Any: if isinstance(config["workspace"], str): config["workspace"] = resolve_workspace(agent_benchmark_config.workspace) else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input") - config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output") + config["workspace"]["input"] = resolve_workspace( + agent_benchmark_config.workspace / "input" + ) + config["workspace"]["output"] = resolve_workspace( + agent_benchmark_config.workspace / "output" + ) return config @@ -238,9 +243,11 @@ def scores(request: Any) -> None: # this is adding the dependency marker and category markers automatically from the json def pytest_collection_modifyitems(items: Any, config: Any) -> None: try: - with open(config.getoption('--agent_config_path'), "r") as f: + with open(config.getoption("--agent_config_path"), "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path') + agent_benchmark_config.agent_benchmark_config_path = config.getoption( + "--agent_config_path" + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise diff --git a/benchmark/benchmark/generate_test.py b/benchmark/benchmark/generate_test.py index 1180119bf..fd81058b3 100644 --- a/benchmark/benchmark/generate_test.py +++ b/benchmark/benchmark/generate_test.py @@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, Optional import pytest from benchmark.utils.challenge import Challenge -from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig +from benchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, SuiteConfig from benchmark.utils.utils import get_test_path DATA_CATEGORY = {} @@ -222,7 +222,7 @@ def create_challenge( def generate_tests() -> None: # sourcery skip: invert-any-all print("Generating tests...") - challenges_path = os.path.join(os.path.dirname(__file__), 'challenges') + challenges_path = os.path.join(os.path.dirname(__file__), "challenges") json_files = deque( glob.glob( @@ -239,14 +239,16 @@ def generate_tests() -> None: # sourcery skip: invert-any-all try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path 
= agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise regression_reports_path = agent_benchmark_config.get_regression_reports_path() if regression_reports_path and os.path.exists(regression_reports_path): - with open(regression_reports_path, 'r') as f: + with open(regression_reports_path, "r") as f: regression_tests = json.load(f) else: regression_tests = {} diff --git a/benchmark/benchmark/reports/ReportManager.py b/benchmark/benchmark/reports/ReportManager.py index 991dd7cf1..7138f77f9 100644 --- a/benchmark/benchmark/reports/ReportManager.py +++ b/benchmark/benchmark/reports/ReportManager.py @@ -6,12 +6,13 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict +from benchmark.__main__ import BENCHMARK_START_TIME from benchmark.reports.processing.graphs import save_single_radar_chart from benchmark.reports.processing.process_report import get_agent_category from benchmark.reports.processing.report_types import Report -from benchmark.utils.utils import get_highest_success_difficulty from benchmark.utils.data_types import AgentBenchmarkConfig -from benchmark.__main__ import BENCHMARK_START_TIME +from benchmark.utils.utils import get_highest_success_difficulty + class ReportManager: """Abstracts interaction with the regression tests file""" @@ -24,9 +25,9 @@ class ReportManager: def load(self) -> None: if not os.path.exists(self.filename): os.makedirs(os.path.dirname(self.filename), exist_ok=True) - with open(self.filename, 'w') as f: + with open(self.filename, "w") as f: pass - + try: with open(self.filename, "r") as f: file_content = ( @@ -62,13 +63,12 @@ class ReportManager: self.save() def end_info_report(self, config: AgentBenchmarkConfig) -> None: - command = " ".join(sys.argv) self.tests = { "command": command.split(os.sep)[-1], - "benchmark_git_commit_sha": '---', - "agent_git_commit_sha": '---', + "benchmark_git_commit_sha": "---", + "agent_git_commit_sha": "---", "completion_time": datetime.now(timezone.utc).strftime( "%Y-%m-%dT%H:%M:%S+00:00" ), @@ -79,7 +79,9 @@ class ReportManager: "total_cost": self.get_total_costs(), }, "tests": self.tests, - "config": {k: v for k, v in json.loads(config.json()).items() if v is not None}, + "config": { + k: v for k, v in json.loads(config.json()).items() if v is not None + }, } converted_data = Report.parse_obj(self.tests) @@ -88,7 +90,6 @@ class ReportManager: save_single_radar_chart( agent_categories, - config.get_reports_path() / "radar_chart.png", ) diff --git a/benchmark/benchmark/reports/reports.py b/benchmark/benchmark/reports/reports.py index 1cb81fd32..fed23110e 100644 --- a/benchmark/benchmark/reports/reports.py +++ b/benchmark/benchmark/reports/reports.py @@ -4,7 +4,13 @@ import sys from pathlib import Path from typing import Any, Dict -from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig +from benchmark.reports.ReportManager import ReportManager +from benchmark.utils.data_types import ( + DIFFICULTY_MAP, + AgentBenchmarkConfig, + DifficultyLevel, + SuiteConfig, +) from benchmark.utils.get_data_from_helicone import get_data_from_helicone from benchmark.utils.utils import ( calculate_success_percentage, @@ -12,8 +18,6 @@ from benchmark.utils.utils import ( get_test_path, replace_backslash, ) -from benchmark.reports.ReportManager import ReportManager - def 
get_agent_benchmark_config() -> AgentBenchmarkConfig: @@ -24,23 +28,32 @@ def get_agent_benchmark_config() -> AgentBenchmarkConfig: try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) return agent_benchmark_config except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise + def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]: agent_benchmark_config = get_agent_benchmark_config() # tests that consistently pass are considered regression tests - REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path()) + REGRESSION_MANAGER = ReportManager( + agent_benchmark_config.get_regression_reports_path() + ) # print(f"Using {REPORTS_PATH} for reports") # user facing reporting information - INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json")) + INFO_MANAGER = ReportManager( + str(agent_benchmark_config.get_reports_path() / "report.json") + ) # internal db step in replacement track pass/fail rate - INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path()) + INTERNAL_INFO_MANAGER = ReportManager( + agent_benchmark_config.get_success_rate_path() + ) return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER @@ -110,7 +123,7 @@ def generate_combined_suite_report( ) tests[test_name] = test_info_details - + info_details: Any = { "data_path": challenge_location, "task": challenge_data["task"], @@ -132,16 +145,12 @@ def get_previous_test_results( agent_tests: dict[str, list[bool]] = {} mock = "--mock" in sys.argv # Check if --mock is in sys.argv - prev_test_results = INTERNAL_INFO_MANAGER.tests.get( - test_name, [] - ) + prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, []) if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - INTERNAL_INFO_MANAGER.add_test( - test_name, prev_test_results - ) + INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( @@ -199,8 +208,8 @@ def generate_single_call_report( }, "answers": answers, } - if 'metadata' in challenge_data: - info_details['metadata'] = challenge_data['metadata'] + if "metadata" in challenge_data: + info_details["metadata"] = challenge_data["metadata"] mock = "--mock" in sys.argv # Check if --mock is in sys.argv @@ -298,9 +307,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None: } for name in suite_file_datum: - test_data = INFO_MANAGER.tests[ - name - ] # get the individual test reports + test_data = INFO_MANAGER.tests[name] # get the individual test reports data[name] = test_data # this is for calculating highest difficulty INFO_MANAGER.remove_test(name) @@ -330,7 +337,6 @@ def session_finish(suite_reports: dict) -> None: agent_benchmark_config = get_agent_benchmark_config() - INTERNAL_INFO_MANAGER.save() INFO_MANAGER.end_info_report(agent_benchmark_config) REGRESSION_MANAGER.save() diff --git a/benchmark/benchmark/start_benchmark.py b/benchmark/benchmark/start_benchmark.py index 77044b5c4..ce23606af 100644 --- a/benchmark/benchmark/start_benchmark.py +++ b/benchmark/benchmark/start_benchmark.py @@ -9,14 +9,13 @@ from typing import Any, Optional 
import click import pytest from helicone.lock import HeliconeLockManager -import sys -sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark') + +sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark") from agbenchmark.reports.ReportManager import ReportManager -from agbenchmark.utils.utils import ( +from agbenchmark.utils.utils import ( # get_git_commit_sha, AGENT_NAME, calculate_dynamic_paths, - # get_git_commit_sha, ) CURRENT_DIRECTORY = Path(__file__).resolve().parent @@ -34,8 +33,8 @@ if os.environ.get("HELICONE_API_KEY"): SUCCESS_RATE_PATH, CHALLENGES_PATH, ) = calculate_dynamic_paths() -BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..") -AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY) +BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..") +AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY) # open a file in the challenges/optional_categories with open( Path(__file__).resolve().parent / "challenges" / "optional_categories.json" @@ -334,13 +333,16 @@ def get_regression_data() -> Any: return data + @cli.command() def version(): """Print the version of the benchmark tool.""" import toml - version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"]["version"] - print(f"Benchmark Tool Version {version}") + version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][ + "version" + ] + print(f"Benchmark Tool Version {version}") # def run_from_backend( diff --git a/benchmark/benchmark/utils/data_types.py b/benchmark/benchmark/utils/data_types.py index e5d9e9876..57a327cf1 100644 --- a/benchmark/benchmark/utils/data_types.py +++ b/benchmark/benchmark/utils/data_types.py @@ -1,11 +1,14 @@ import glob import json +import sys +from datetime import datetime, timezone from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional -import sys + from pydantic import BaseModel, root_validator, validator -from datetime import datetime, timezone + + class DifficultyLevel(Enum): interface = "interface" basic = "basic" @@ -29,6 +32,7 @@ DIFFICULTY_MAP = { STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel} + def calculate_info_test_path(base_path: Path) -> Path: """ Calculates the path to the directory where the test report will be saved. @@ -69,6 +73,7 @@ def calculate_info_test_path(base_path: Path) -> Path: return report_path + class AgentBenchmarkConfig(BaseModel): """ This class represents the configuration for the Agent Benchmark. @@ -79,6 +84,7 @@ class AgentBenchmarkConfig(BaseModel): - api_mode: A boolean indicating whether the benchmark is run in API mode. - host: The host where the benchmark is run. """ + agent_benchmark_config_path: Path | None = None entry_path: Path workspace: Path @@ -88,19 +94,24 @@ class AgentBenchmarkConfig(BaseModel): def get_reports_location(self) -> Path: if not self.reports_folder: - self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve() + self.reports_folder = ( + self.agent_benchmark_config_path + / self.entry_path.parent + / ".." 
+ / "reports" + ).resolve() return self.reports_folder - + def get_reports_path(self) -> Path: return calculate_info_test_path(self.get_reports_location()) - - def get_regression_reports_path(self) -> Path: + def get_regression_reports_path(self) -> Path: return self.get_reports_location() / "regression_tests.json" - + def get_success_rate_path(self) -> Path: return self.get_reports_location() / "success_rate.json" + class Info(BaseModel): difficulty: DifficultyLevel description: str diff --git a/benchmark/benchmark/utils/dependencies/__init__.py b/benchmark/benchmark/utils/dependencies/__init__.py index 596c47609..12668daec 100644 --- a/benchmark/benchmark/utils/dependencies/__init__.py +++ b/benchmark/benchmark/utils/dependencies/__init__.py @@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None: for action in group.options: current_options += action._short_opts + action._long_opts - group = parser.getgroup("depends") # Add a flag to list all names + the tests they resolve to diff --git a/benchmark/benchmark/utils/utils.py b/benchmark/benchmark/utils/utils.py index ebfdb0305..8f9dc2055 100644 --- a/benchmark/benchmark/utils/utils.py +++ b/benchmark/benchmark/utils/utils.py @@ -16,9 +16,6 @@ AGENT_NAME = os.getenv("AGENT_NAME") REPORT_LOCATION = os.getenv("REPORT_LOCATION", None) - - - def replace_backslash(value: Any) -> Any: if isinstance(value, str): return re.sub(
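
Reviewer note, outside the patch: several hunks above (in __main__.py, conftest.py, generate_test.py and reports.py) repeat the same AgentBenchmarkConfig loading pattern that this change reformats. The sketch below is purely illustrative and not part of the diff; the helper name load_agent_benchmark_config is hypothetical, while the module path, field name and error handling follow what the hunks show.

# Illustrative sketch only (not part of the patch): the repeated config-loading pattern.
import json
from pathlib import Path

from benchmark.utils.data_types import AgentBenchmarkConfig  # module path as used in the diff


def load_agent_benchmark_config(config_path: Path) -> AgentBenchmarkConfig:
    """Load benchmark_config.json and remember where it was loaded from."""
    try:
        with open(config_path, "r") as f:
            config = AgentBenchmarkConfig(**json.load(f))
        # The patch stores the source path back on the model so later code can
        # resolve report/workspace paths relative to it.
        config.agent_benchmark_config_path = config_path
        return config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise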