From ef2107d9c2e78e2ff2a5e694f200ca813bdcd3f8 Mon Sep 17 00:00:00 2001 From: SwiftyOS Date: Mon, 11 Sep 2023 17:24:23 +0200 Subject: [PATCH] Fixing benchmark code --- benchmark/backend/main.py | 3 +- benchmark/benchmark/__main__.py | 29 ++++++++---- benchmark/benchmark/agent_api_interface.py | 1 - benchmark/benchmark/agent_interface.py | 1 - .../4_url_shortener/artifacts_out/test.py | 13 ++++-- .../artifacts_out/url_shortener.py | 8 +++- .../artifacts_out/tic_tac_toe.py | 40 ++++++++++++---- .../code/5_tic_tac_toe/custom_python/test.py | 24 ++++++---- .../6_battleship/artifacts_in/conftest.py | 3 +- .../artifacts_in/test_negative.py | 7 +-- .../6_battleship/artifacts_out/battleship.py | 8 +--- .../6_battleship/artifacts_out/conftest.py | 3 +- .../artifacts_out/test_negative.py | 7 +-- benchmark/benchmark/conftest.py | 27 +++++++---- benchmark/benchmark/generate_test.py | 10 ++-- benchmark/benchmark/reports/ReportManager.py | 19 ++++---- benchmark/benchmark/reports/reports.py | 46 +++++++++++-------- benchmark/benchmark/start_benchmark.py | 18 ++++---- benchmark/benchmark/utils/data_types.py | 25 +++++++--- .../benchmark/utils/dependencies/__init__.py | 1 - benchmark/benchmark/utils/utils.py | 3 -- 21 files changed, 177 insertions(+), 119 deletions(-) diff --git a/benchmark/backend/main.py b/benchmark/backend/main.py index 03880f0ed..c0c2bf2df 100644 --- a/benchmark/backend/main.py +++ b/benchmark/backend/main.py @@ -9,11 +9,10 @@ from typing import Any sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from agbenchmark.utils.utils import find_absolute_benchmark_path from fastapi import FastAPI, Query from fastapi.middleware.cors import CORSMiddleware -from agbenchmark.utils.utils import find_absolute_benchmark_path - app = FastAPI() origins = ["http://localhost:3000"] diff --git a/benchmark/benchmark/__main__.py b/benchmark/benchmark/__main__.py index f7f0a77fa..64eae925f 100644 --- a/benchmark/benchmark/__main__.py +++ b/benchmark/benchmark/__main__.py @@ -5,10 +5,10 @@ import sys from datetime import datetime, timezone from pathlib import Path from typing import Any, Optional -import toml import click import pytest +import toml from helicone.lock import HeliconeLockManager from benchmark.utils.data_types import AgentBenchmarkConfig @@ -72,7 +72,9 @@ def run_benchmark( try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") return 1 @@ -96,9 +98,10 @@ def run_benchmark( "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite." ) return 1 - - assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \ - "Error: host needs to be added to the config if api_mode is set to True." + + assert not ( + agent_benchmark_config.api_mode and not agent_benchmark_config.host + ), "Error: host needs to be added to the config if api_mode is set to True." 
print("Current configuration:") for key, value in vars(agent_benchmark_config).items(): @@ -200,7 +203,12 @@ def cli() -> None: ) @click.option("--nc", is_flag=True, help="Run without cutoff") @click.option("--cutoff", help="Set or override tests cutoff (seconds)") -@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True) +@click.option( + "--agent-config", + type=click.Path(exists=True), + help="Path to the agent benchmark_config.json file,", + required=True, +) def start( maintain: bool, improve: bool, @@ -220,8 +228,9 @@ def start( original_stdout = sys.stdout # Save the original standard output exit_code = None - - assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided" + assert ( + "benchmark_config.json" in agent_config + ), "benchmark_config.json must be provided" if backend: with open("backend/backend_stdout.txt", "w") as f: @@ -266,7 +275,9 @@ def start( def version(): """Print the version of the benchmark tool.""" current_directory = Path(__file__).resolve().parent - version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"] + version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][ + "version" + ] print(f"Benchmark Tool Version {version}") diff --git a/benchmark/benchmark/agent_api_interface.py b/benchmark/benchmark/agent_api_interface.py index 17dbd7308..6bd76de86 100644 --- a/benchmark/benchmark/agent_api_interface.py +++ b/benchmark/benchmark/agent_api_interface.py @@ -51,7 +51,6 @@ async def run_api_agent( artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id) for artifact in artifacts: - if artifact.relative_path: folder_path = os.path.join(config["workspace"], artifact.relative_path) else: diff --git a/benchmark/benchmark/agent_interface.py b/benchmark/benchmark/agent_interface.py index e7c6ac4dd..e79899717 100644 --- a/benchmark/benchmark/agent_interface.py +++ b/benchmark/benchmark/agent_interface.py @@ -12,7 +12,6 @@ from typing import Any, List import psutil from dotenv import load_dotenv - load_dotenv() helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS") diff --git a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py index 94fcac027..c3daffa80 100644 --- a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py +++ b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/test.py @@ -1,15 +1,22 @@ import unittest -from url_shortener import shorten_url, retrieve_url + +from url_shortener import retrieve_url, shorten_url + class TestURLShortener(unittest.TestCase): def test_url_retrieval(self): # Shorten the URL to get its shortened form - shortened_url = shorten_url('https://www.example.com') + shortened_url = shorten_url("https://www.example.com") # Retrieve the original URL using the shortened URL directly retrieved_url = retrieve_url(shortened_url) - self.assertEqual(retrieved_url, 'https://www.example.com', "Retrieved URL does not match the original!") + self.assertEqual( + retrieved_url, + "https://www.example.com", + "Retrieved URL does not match the original!", + ) + if __name__ == "__main__": unittest.main() diff --git a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py 
index 8fe0d315d..89a73a82b 100644 --- a/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py +++ b/benchmark/benchmark/challenges/verticals/code/4_url_shortener/artifacts_out/url_shortener.py @@ -3,6 +3,7 @@ import base64 URL_MAPPING = {} + def shorten_url(url): # Convert the URL to base64 encoded_url = base64.b64encode(url.encode()).decode() @@ -12,13 +13,15 @@ def shorten_url(url): URL_MAPPING[short_url] = url return short_url + def retrieve_url(short_url): return URL_MAPPING.get(short_url, "URL not found") + def main(): parser = argparse.ArgumentParser(description="URL Shortener") - parser.add_argument('-s', '--shorten', type=str, help="URL to be shortened") - parser.add_argument('-r', '--retrieve', type=str, help="Short URL to be retrieved") + parser.add_argument("-s", "--shorten", type=str, help="URL to be shortened") + parser.add_argument("-r", "--retrieve", type=str, help="Short URL to be retrieved") args = parser.parse_args() @@ -32,5 +35,6 @@ def main(): else: print("No valid arguments provided.") + if __name__ == "__main__": main() diff --git a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py index 0caa903fa..e0163220a 100644 --- a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py +++ b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/artifacts_out/tic_tac_toe.py @@ -1,38 +1,45 @@ import pprint + def column(matrix, i): return [row[i] for row in matrix] + def check(list): if len(set(list)) <= 1: if list[0] != 0: return list[0] return None + def checkDiagLeft(board): - if (board[0][0] == board[1][1] and board[1][1] == board[2][2]): + if board[0][0] == board[1][1] and board[1][1] == board[2][2]: if board[0][0] != 0: return board[0][0] return None + def checkDiagRight(board): - if (board[2][0] == board[1][1] and board[1][1] == board[0][2]): + if board[2][0] == board[1][1] and board[1][1] == board[0][2]: if board[2][0] != 0: return board[2][0] return None + def placeItem(row, column, board, current_player): if board[row][column] != 0: return None else: board[row][column] = current_player + def swapPlayers(player): - if (player == 2): + if player == 2: return 1 else: return 2 + def winner(board): for rowIndex in board: if check(rowIndex) is not None: @@ -46,23 +53,35 @@ def winner(board): return checkDiagRight(board) return 0 + def getLocation(): - location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ") + location = input( + "Choose where to play. Enter two numbers separated by a comma, for example: 1,1 " + ) print(f"\nYou picked {location}") - coordinates = [int(x) for x in location.split(',')] - while (len(coordinates) != 2 or coordinates[0] < 0 or coordinates[0] > 2 or coordinates[1] < 0 or coordinates[1] > 2): + coordinates = [int(x) for x in location.split(",")] + while ( + len(coordinates) != 2 + or coordinates[0] < 0 + or coordinates[0] > 2 + or coordinates[1] < 0 + or coordinates[1] > 2 + ): print("You inputted a location in an invalid format") - location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ") - coordinates = [int(x) for x in location.split(',')] + location = input( + "Choose where to play. 
Enter two numbers separated by a comma, for example: 1,1 " + ) + coordinates = [int(x) for x in location.split(",")] return coordinates + def gamePlay(): num_moves = 0 pp = pprint.PrettyPrinter(width=20) current_player = 1 board = [[0 for x in range(3)] for x in range(3)] - while (num_moves < 9 and winner(board) == 0): + while num_moves < 9 and winner(board) == 0: print("This is the current board: ") pp.pprint(board) coordinates = getLocation() @@ -75,5 +94,6 @@ def gamePlay(): if winner(board) == 0: print("Draw") -if __name__ == '__main__': + +if __name__ == "__main__": gamePlay() diff --git a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py index 6fa522513..94b778208 100644 --- a/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py +++ b/benchmark/benchmark/challenges/verticals/code/5_tic_tac_toe/custom_python/test.py @@ -1,18 +1,20 @@ import subprocess + import pytest + def run_game_with_inputs(inputs): # Start the game process process = subprocess.Popen( - ['python', 'tic_tac_toe.py'], + ["python", "tic_tac_toe.py"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True + text=True, ) # Send the input moves one by one - output, errors = process.communicate('\n'.join(inputs)) + output, errors = process.communicate("\n".join(inputs)) # Print the inputs and outputs print("Inputs:\n", "\n".join(inputs)) @@ -22,14 +24,18 @@ def run_game_with_inputs(inputs): return output -@pytest.mark.parametrize("inputs, expected_output", [ - (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"), - (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"), - (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw") -]) +@pytest.mark.parametrize( + "inputs, expected_output", + [ + (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"), + (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"), + (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"), + ], +) def test_game(inputs, expected_output): output = run_game_with_inputs(inputs) assert expected_output in output -if __name__ == '__main__': + +if __name__ == "__main__": pytest.main() diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py index f1e984576..a1412966b 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/conftest.py @@ -1,9 +1,8 @@ import pytest - from abstract_class import ShipPlacement, Turn - from battleship import Battleship + @pytest.fixture def battleship_game(): return Battleship() diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py index 484ae3509..34bed48b4 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_in/test_negative.py @@ -1,7 +1,6 @@ import pytest -from pydantic import ValidationError - from abstract_class import ShipPlacement, Turn +from pydantic import ValidationError def test_ship_placement_out_of_bounds(battleship_game): @@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game): 
def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id): - game = battleship_game.get_game( - initialized_game_id - ) + game = battleship_game.get_game(initialized_game_id) additional_ship = ShipPlacement( ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal" ) diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py index 5d87181f6..ad7dc83f9 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/battleship.py @@ -86,9 +86,7 @@ class Battleship(AbstractBattleship): game.turns.append(turn) if hit_ship == "hit": - return TurnResponse( - result="miss", ship_type=None - ) + return TurnResponse(result="miss", ship_type=None) if hit_ship: ship_placement = next(sp for sp in game.ships if sp.ship_type == hit_ship) @@ -133,9 +131,7 @@ class Battleship(AbstractBattleship): ) if hits == total_ships_length: - return GameStatus( - is_game_over=True, winner="player" - ) + return GameStatus(is_game_over=True, winner="player") else: return GameStatus(is_game_over=False, winner=None) diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py index f1e984576..a1412966b 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/conftest.py @@ -1,9 +1,8 @@ import pytest - from abstract_class import ShipPlacement, Turn - from battleship import Battleship + @pytest.fixture def battleship_game(): return Battleship() diff --git a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py index 484ae3509..34bed48b4 100644 --- a/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py +++ b/benchmark/benchmark/challenges/verticals/code/6_battleship/artifacts_out/test_negative.py @@ -1,7 +1,6 @@ import pytest -from pydantic import ValidationError - from abstract_class import ShipPlacement, Turn +from pydantic import ValidationError def test_ship_placement_out_of_bounds(battleship_game): @@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game): def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id): - game = battleship_game.get_game( - initialized_game_id - ) + game = battleship_game.get_game(initialized_game_id) additional_ship = ShipPlacement( ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal" ) diff --git a/benchmark/benchmark/conftest.py b/benchmark/benchmark/conftest.py index f1e6ad8bf..a93867e49 100644 --- a/benchmark/benchmark/conftest.py +++ b/benchmark/benchmark/conftest.py @@ -16,7 +16,7 @@ from benchmark.reports.reports import ( generate_single_call_report, session_finish, ) -from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig +from benchmark.utils.data_types import AgentBenchmarkConfig, SuiteConfig GLOBAL_TIMEOUT = ( 1500 # The tests will stop after 25 minutes so we can send the reports. 
@@ -31,16 +31,15 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig: try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) return agent_benchmark_config except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise - - - def resolve_workspace(workspace: str) -> str: if workspace.startswith("${") and workspace.endswith("}"): # Extract the string inside ${...} @@ -65,7 +64,9 @@ def config(request: Any) -> Any: try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise @@ -73,8 +74,12 @@ def config(request: Any) -> Any: if isinstance(config["workspace"], str): config["workspace"] = resolve_workspace(agent_benchmark_config.workspace) else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input") - config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output") + config["workspace"]["input"] = resolve_workspace( + agent_benchmark_config.workspace / "input" + ) + config["workspace"]["output"] = resolve_workspace( + agent_benchmark_config.workspace / "output" + ) return config @@ -238,9 +243,11 @@ def scores(request: Any) -> None: # this is adding the dependency marker and category markers automatically from the json def pytest_collection_modifyitems(items: Any, config: Any) -> None: try: - with open(config.getoption('--agent_config_path'), "r") as f: + with open(config.getoption("--agent_config_path"), "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path') + agent_benchmark_config.agent_benchmark_config_path = config.getoption( + "--agent_config_path" + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise diff --git a/benchmark/benchmark/generate_test.py b/benchmark/benchmark/generate_test.py index 1180119bf..fd81058b3 100644 --- a/benchmark/benchmark/generate_test.py +++ b/benchmark/benchmark/generate_test.py @@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, Optional import pytest from benchmark.utils.challenge import Challenge -from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig +from benchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, SuiteConfig from benchmark.utils.utils import get_test_path DATA_CATEGORY = {} @@ -222,7 +222,7 @@ def create_challenge( def generate_tests() -> None: # sourcery skip: invert-any-all print("Generating tests...") - challenges_path = os.path.join(os.path.dirname(__file__), 'challenges') + challenges_path = os.path.join(os.path.dirname(__file__), "challenges") json_files = deque( glob.glob( @@ -239,14 +239,16 @@ def generate_tests() -> None: # sourcery skip: invert-any-all try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path 
= agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise regression_reports_path = agent_benchmark_config.get_regression_reports_path() if regression_reports_path and os.path.exists(regression_reports_path): - with open(regression_reports_path, 'r') as f: + with open(regression_reports_path, "r") as f: regression_tests = json.load(f) else: regression_tests = {} diff --git a/benchmark/benchmark/reports/ReportManager.py b/benchmark/benchmark/reports/ReportManager.py index 991dd7cf1..7138f77f9 100644 --- a/benchmark/benchmark/reports/ReportManager.py +++ b/benchmark/benchmark/reports/ReportManager.py @@ -6,12 +6,13 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict +from benchmark.__main__ import BENCHMARK_START_TIME from benchmark.reports.processing.graphs import save_single_radar_chart from benchmark.reports.processing.process_report import get_agent_category from benchmark.reports.processing.report_types import Report -from benchmark.utils.utils import get_highest_success_difficulty from benchmark.utils.data_types import AgentBenchmarkConfig -from benchmark.__main__ import BENCHMARK_START_TIME +from benchmark.utils.utils import get_highest_success_difficulty + class ReportManager: """Abstracts interaction with the regression tests file""" @@ -24,9 +25,9 @@ class ReportManager: def load(self) -> None: if not os.path.exists(self.filename): os.makedirs(os.path.dirname(self.filename), exist_ok=True) - with open(self.filename, 'w') as f: + with open(self.filename, "w") as f: pass - + try: with open(self.filename, "r") as f: file_content = ( @@ -62,13 +63,12 @@ class ReportManager: self.save() def end_info_report(self, config: AgentBenchmarkConfig) -> None: - command = " ".join(sys.argv) self.tests = { "command": command.split(os.sep)[-1], - "benchmark_git_commit_sha": '---', - "agent_git_commit_sha": '---', + "benchmark_git_commit_sha": "---", + "agent_git_commit_sha": "---", "completion_time": datetime.now(timezone.utc).strftime( "%Y-%m-%dT%H:%M:%S+00:00" ), @@ -79,7 +79,9 @@ class ReportManager: "total_cost": self.get_total_costs(), }, "tests": self.tests, - "config": {k: v for k, v in json.loads(config.json()).items() if v is not None}, + "config": { + k: v for k, v in json.loads(config.json()).items() if v is not None + }, } converted_data = Report.parse_obj(self.tests) @@ -88,7 +90,6 @@ class ReportManager: save_single_radar_chart( agent_categories, - config.get_reports_path() / "radar_chart.png", ) diff --git a/benchmark/benchmark/reports/reports.py b/benchmark/benchmark/reports/reports.py index 1cb81fd32..fed23110e 100644 --- a/benchmark/benchmark/reports/reports.py +++ b/benchmark/benchmark/reports/reports.py @@ -4,7 +4,13 @@ import sys from pathlib import Path from typing import Any, Dict -from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig +from benchmark.reports.ReportManager import ReportManager +from benchmark.utils.data_types import ( + DIFFICULTY_MAP, + AgentBenchmarkConfig, + DifficultyLevel, + SuiteConfig, +) from benchmark.utils.get_data_from_helicone import get_data_from_helicone from benchmark.utils.utils import ( calculate_success_percentage, @@ -12,8 +18,6 @@ from benchmark.utils.utils import ( get_test_path, replace_backslash, ) -from benchmark.reports.ReportManager import ReportManager - def 
get_agent_benchmark_config() -> AgentBenchmarkConfig: @@ -24,23 +28,32 @@ def get_agent_benchmark_config() -> AgentBenchmarkConfig: try: with open(agent_benchmark_config_path, "r") as f: agent_benchmark_config = AgentBenchmarkConfig(**json.load(f)) - agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path + agent_benchmark_config.agent_benchmark_config_path = ( + agent_benchmark_config_path + ) return agent_benchmark_config except json.JSONDecodeError: print("Error: benchmark_config.json is not a valid JSON file.") raise + def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]: agent_benchmark_config = get_agent_benchmark_config() # tests that consistently pass are considered regression tests - REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path()) + REGRESSION_MANAGER = ReportManager( + agent_benchmark_config.get_regression_reports_path() + ) # print(f"Using {REPORTS_PATH} for reports") # user facing reporting information - INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json")) + INFO_MANAGER = ReportManager( + str(agent_benchmark_config.get_reports_path() / "report.json") + ) # internal db step in replacement track pass/fail rate - INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path()) + INTERNAL_INFO_MANAGER = ReportManager( + agent_benchmark_config.get_success_rate_path() + ) return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER @@ -110,7 +123,7 @@ def generate_combined_suite_report( ) tests[test_name] = test_info_details - + info_details: Any = { "data_path": challenge_location, "task": challenge_data["task"], @@ -132,16 +145,12 @@ def get_previous_test_results( agent_tests: dict[str, list[bool]] = {} mock = "--mock" in sys.argv # Check if --mock is in sys.argv - prev_test_results = INTERNAL_INFO_MANAGER.tests.get( - test_name, [] - ) + prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, []) if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - INTERNAL_INFO_MANAGER.add_test( - test_name, prev_test_results - ) + INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( @@ -199,8 +208,8 @@ def generate_single_call_report( }, "answers": answers, } - if 'metadata' in challenge_data: - info_details['metadata'] = challenge_data['metadata'] + if "metadata" in challenge_data: + info_details["metadata"] = challenge_data["metadata"] mock = "--mock" in sys.argv # Check if --mock is in sys.argv @@ -298,9 +307,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None: } for name in suite_file_datum: - test_data = INFO_MANAGER.tests[ - name - ] # get the individual test reports + test_data = INFO_MANAGER.tests[name] # get the individual test reports data[name] = test_data # this is for calculating highest difficulty INFO_MANAGER.remove_test(name) @@ -330,7 +337,6 @@ def session_finish(suite_reports: dict) -> None: agent_benchmark_config = get_agent_benchmark_config() - INTERNAL_INFO_MANAGER.save() INFO_MANAGER.end_info_report(agent_benchmark_config) REGRESSION_MANAGER.save() diff --git a/benchmark/benchmark/start_benchmark.py b/benchmark/benchmark/start_benchmark.py index 77044b5c4..ce23606af 100644 --- a/benchmark/benchmark/start_benchmark.py +++ b/benchmark/benchmark/start_benchmark.py @@ -9,14 +9,13 @@ from typing import Any, Optional 
import click import pytest from helicone.lock import HeliconeLockManager -import sys -sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark') + +sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark") from agbenchmark.reports.ReportManager import ReportManager -from agbenchmark.utils.utils import ( +from agbenchmark.utils.utils import ( # get_git_commit_sha, AGENT_NAME, calculate_dynamic_paths, - # get_git_commit_sha, ) CURRENT_DIRECTORY = Path(__file__).resolve().parent @@ -34,8 +33,8 @@ if os.environ.get("HELICONE_API_KEY"): SUCCESS_RATE_PATH, CHALLENGES_PATH, ) = calculate_dynamic_paths() -BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..") -AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY) +BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..") +AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY) # open a file in the challenges/optional_categories with open( Path(__file__).resolve().parent / "challenges" / "optional_categories.json" @@ -334,13 +333,16 @@ def get_regression_data() -> Any: return data + @cli.command() def version(): """Print the version of the benchmark tool.""" import toml - version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"]["version"] - print(f"Benchmark Tool Version {version}") + version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][ + "version" + ] + print(f"Benchmark Tool Version {version}") # def run_from_backend( diff --git a/benchmark/benchmark/utils/data_types.py b/benchmark/benchmark/utils/data_types.py index e5d9e9876..57a327cf1 100644 --- a/benchmark/benchmark/utils/data_types.py +++ b/benchmark/benchmark/utils/data_types.py @@ -1,11 +1,14 @@ import glob import json +import sys +from datetime import datetime, timezone from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional -import sys + from pydantic import BaseModel, root_validator, validator -from datetime import datetime, timezone + + class DifficultyLevel(Enum): interface = "interface" basic = "basic" @@ -29,6 +32,7 @@ DIFFICULTY_MAP = { STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel} + def calculate_info_test_path(base_path: Path) -> Path: """ Calculates the path to the directory where the test report will be saved. @@ -69,6 +73,7 @@ def calculate_info_test_path(base_path: Path) -> Path: return report_path + class AgentBenchmarkConfig(BaseModel): """ This class represents the configuration for the Agent Benchmark. @@ -79,6 +84,7 @@ class AgentBenchmarkConfig(BaseModel): - api_mode: A boolean indicating whether the benchmark is run in API mode. - host: The host where the benchmark is run. """ + agent_benchmark_config_path: Path | None = None entry_path: Path workspace: Path @@ -88,19 +94,24 @@ class AgentBenchmarkConfig(BaseModel): def get_reports_location(self) -> Path: if not self.reports_folder: - self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve() + self.reports_folder = ( + self.agent_benchmark_config_path + / self.entry_path.parent + / ".." 
+ / "reports" + ).resolve() return self.reports_folder - + def get_reports_path(self) -> Path: return calculate_info_test_path(self.get_reports_location()) - - def get_regression_reports_path(self) -> Path: + def get_regression_reports_path(self) -> Path: return self.get_reports_location() / "regression_tests.json" - + def get_success_rate_path(self) -> Path: return self.get_reports_location() / "success_rate.json" + class Info(BaseModel): difficulty: DifficultyLevel description: str diff --git a/benchmark/benchmark/utils/dependencies/__init__.py b/benchmark/benchmark/utils/dependencies/__init__.py index 596c47609..12668daec 100644 --- a/benchmark/benchmark/utils/dependencies/__init__.py +++ b/benchmark/benchmark/utils/dependencies/__init__.py @@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None: for action in group.options: current_options += action._short_opts + action._long_opts - group = parser.getgroup("depends") # Add a flag to list all names + the tests they resolve to diff --git a/benchmark/benchmark/utils/utils.py b/benchmark/benchmark/utils/utils.py index ebfdb0305..8f9dc2055 100644 --- a/benchmark/benchmark/utils/utils.py +++ b/benchmark/benchmark/utils/utils.py @@ -16,9 +16,6 @@ AGENT_NAME = os.getenv("AGENT_NAME") REPORT_LOCATION = os.getenv("REPORT_LOCATION", None) - - - def replace_backslash(value: Any) -> Any: if isinstance(value, str): return re.sub(
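
Reviewer note, outside the patch: several hunks above (in __main__.py, conftest.py, generate_test.py and reports.py) repeat the same AgentBenchmarkConfig loading pattern that this change reformats. The sketch below is purely illustrative and not part of the diff; the helper name load_agent_benchmark_config is hypothetical, while the module path, field name and error handling follow what the hunks show.

# Illustrative sketch only (not part of the patch): the repeated config-loading pattern.
import json
from pathlib import Path

from benchmark.utils.data_types import AgentBenchmarkConfig  # module path as used in the diff


def load_agent_benchmark_config(config_path: Path) -> AgentBenchmarkConfig:
    """Load benchmark_config.json and remember where it was loaded from."""
    try:
        with open(config_path, "r") as f:
            config = AgentBenchmarkConfig(**json.load(f))
        # The patch stores the source path back on the model so later code can
        # resolve report/workspace paths relative to it.
        config.agent_benchmark_config_path = config_path
        return config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise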