Fixing benchmark code

pull/5194/head
SwiftyOS 2023-09-11 17:24:23 +02:00 committed by Merwane Hamadi
parent c73e90c4e6
commit ef2107d9c2
21 changed files with 177 additions and 119 deletions

View File

@@ -9,11 +9,10 @@ from typing import Any

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from agbenchmark.utils.utils import find_absolute_benchmark_path
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
+from agbenchmark.utils.utils import find_absolute_benchmark_path

 app = FastAPI()

 origins = ["http://localhost:3000"]

View File

@@ -5,10 +5,10 @@ import sys
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Optional

-import toml
 import click
 import pytest
+import toml
 from helicone.lock import HeliconeLockManager

 from benchmark.utils.data_types import AgentBenchmarkConfig

@@ -72,7 +72,9 @@ def run_benchmark(
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         return 1

@@ -97,8 +99,9 @@ def run_benchmark(
         )
         return 1

-    assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
-        "Error: host needs to be added to the config if api_mode is set to True."
+    assert not (
+        agent_benchmark_config.api_mode and not agent_benchmark_config.host
+    ), "Error: host needs to be added to the config if api_mode is set to True."

     print("Current configuration:")
     for key, value in vars(agent_benchmark_config).items():

@@ -200,7 +203,12 @@ def cli() -> None:
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
-@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
+@click.option(
+    "--agent-config",
+    type=click.Path(exists=True),
+    help="Path to the agent benchmark_config.json file,",
+    required=True,
+)
 def start(
     maintain: bool,
     improve: bool,

@@ -220,8 +228,9 @@ def start(
     original_stdout = sys.stdout  # Save the original standard output
     exit_code = None

-    assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"
+    assert (
+        "benchmark_config.json" in agent_config
+    ), "benchmark_config.json must be provided"

     if backend:
         with open("backend/backend_stdout.txt", "w") as f:

@@ -266,7 +275,9 @@ def start(
 def version():
     """Print the version of the benchmark tool."""
     current_directory = Path(__file__).resolve().parent
-    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
+    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][
+        "version"
+    ]
     print(f"Benchmark Tool Version {version}")

View File

@@ -51,7 +51,6 @@ async def run_api_agent(
         artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
         for artifact in artifacts:
             if artifact.relative_path:
                 folder_path = os.path.join(config["workspace"], artifact.relative_path)
             else:

View File

@@ -12,7 +12,6 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv

 load_dotenv()

 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")

View File

@@ -1,15 +1,22 @@
 import unittest

-from url_shortener import shorten_url, retrieve_url
+from url_shortener import retrieve_url, shorten_url


 class TestURLShortener(unittest.TestCase):
     def test_url_retrieval(self):
         # Shorten the URL to get its shortened form
-        shortened_url = shorten_url('https://www.example.com')
+        shortened_url = shorten_url("https://www.example.com")
         # Retrieve the original URL using the shortened URL directly
         retrieved_url = retrieve_url(shortened_url)
-        self.assertEqual(retrieved_url, 'https://www.example.com', "Retrieved URL does not match the original!")
+        self.assertEqual(
+            retrieved_url,
+            "https://www.example.com",
+            "Retrieved URL does not match the original!",
+        )


 if __name__ == "__main__":
     unittest.main()

View File

@@ -3,6 +3,7 @@ import base64
 URL_MAPPING = {}


 def shorten_url(url):
     # Convert the URL to base64
     encoded_url = base64.b64encode(url.encode()).decode()

@@ -12,13 +13,15 @@ def shorten_url(url):
     URL_MAPPING[short_url] = url
     return short_url


 def retrieve_url(short_url):
     return URL_MAPPING.get(short_url, "URL not found")


 def main():
     parser = argparse.ArgumentParser(description="URL Shortener")
-    parser.add_argument('-s', '--shorten', type=str, help="URL to be shortened")
-    parser.add_argument('-r', '--retrieve', type=str, help="Short URL to be retrieved")
+    parser.add_argument("-s", "--shorten", type=str, help="URL to be shortened")
+    parser.add_argument("-r", "--retrieve", type=str, help="Short URL to be retrieved")
     args = parser.parse_args()

@@ -32,5 +35,6 @@ def main():
     else:
         print("No valid arguments provided.")

 if __name__ == "__main__":
     main()
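
For reference, the round trip that the unit test in the previous file exercises can be reproduced in isolation. The sketch below is not part of the commit: it condenses url_shortener as shown in the hunks above, and because the diff elides the line that derives the short key from the encoded URL, the truncation used here (first 10 characters) is an assumption.

# Illustrative sketch, not part of the commit. The key-derivation step
# (encoded_url[:10]) is assumed; the diff does not show that line.
import base64

URL_MAPPING = {}


def shorten_url(url):
    # Convert the URL to base64 and keep a short prefix as the lookup key (assumed)
    encoded_url = base64.b64encode(url.encode()).decode()
    short_url = encoded_url[:10]
    URL_MAPPING[short_url] = url
    return short_url


def retrieve_url(short_url):
    return URL_MAPPING.get(short_url, "URL not found")


if __name__ == "__main__":
    short = shorten_url("https://www.example.com")
    print(short, "->", retrieve_url(short))  # the round trip checked by the test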

View File

@@ -1,38 +1,45 @@
 import pprint


 def column(matrix, i):
     return [row[i] for row in matrix]


 def check(list):
     if len(set(list)) <= 1:
         if list[0] != 0:
             return list[0]
     return None


 def checkDiagLeft(board):
-    if (board[0][0] == board[1][1] and board[1][1] == board[2][2]):
+    if board[0][0] == board[1][1] and board[1][1] == board[2][2]:
         if board[0][0] != 0:
             return board[0][0]
     return None


 def checkDiagRight(board):
-    if (board[2][0] == board[1][1] and board[1][1] == board[0][2]):
+    if board[2][0] == board[1][1] and board[1][1] == board[0][2]:
         if board[2][0] != 0:
             return board[2][0]
     return None


 def placeItem(row, column, board, current_player):
     if board[row][column] != 0:
         return None
     else:
         board[row][column] = current_player


 def swapPlayers(player):
-    if (player == 2):
+    if player == 2:
         return 1
     else:
         return 2


 def winner(board):
     for rowIndex in board:
         if check(rowIndex) is not None:

@@ -46,23 +53,35 @@ def winner(board):
         return checkDiagRight(board)
     return 0


 def getLocation():
-    location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
+    location = input(
+        "Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
+    )
     print(f"\nYou picked {location}")
-    coordinates = [int(x) for x in location.split(',')]
-    while (len(coordinates) != 2 or coordinates[0] < 0 or coordinates[0] > 2 or coordinates[1] < 0 or coordinates[1] > 2):
+    coordinates = [int(x) for x in location.split(",")]
+    while (
+        len(coordinates) != 2
+        or coordinates[0] < 0
+        or coordinates[0] > 2
+        or coordinates[1] < 0
+        or coordinates[1] > 2
+    ):
         print("You inputted a location in an invalid format")
-        location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
-        coordinates = [int(x) for x in location.split(',')]
+        location = input(
+            "Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
+        )
+        coordinates = [int(x) for x in location.split(",")]
     return coordinates


 def gamePlay():
     num_moves = 0
     pp = pprint.PrettyPrinter(width=20)
     current_player = 1
     board = [[0 for x in range(3)] for x in range(3)]
-    while (num_moves < 9 and winner(board) == 0):
+    while num_moves < 9 and winner(board) == 0:
         print("This is the current board: ")
         pp.pprint(board)
         coordinates = getLocation()

@@ -75,5 +94,6 @@ def gamePlay():
     if winner(board) == 0:
         print("Draw")

-if __name__ == '__main__':
+if __name__ == "__main__":
     gamePlay()
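
The hunks above show winner()'s helpers but elide most of its body. As a point of reference only, not the file's actual code, here is one way those helpers compose into a full winner check; the column and diagonal loop structure is an assumption.

# Illustrative sketch, not part of the commit: one plausible composition of the
# helpers shown above. The real winner() body is elided by the diff.
def column(matrix, i):
    return [row[i] for row in matrix]


def check(line):
    # A line is decided when all three cells match and are non-zero.
    if len(set(line)) <= 1 and line[0] != 0:
        return line[0]
    return None


def winner(board):
    for row in board:  # rows
        if check(row) is not None:
            return check(row)
    for i in range(3):  # columns
        if check(column(board, i)) is not None:
            return check(column(board, i))
    # the two diagonals, mirroring checkDiagLeft / checkDiagRight above
    for diag in (
        [board[0][0], board[1][1], board[2][2]],
        [board[2][0], board[1][1], board[0][2]],
    ):
        if check(diag) is not None:
            return check(diag)
    return 0  # 0 means nobody has won yet


if __name__ == "__main__":
    assert winner([[1, 1, 1], [0, 2, 0], [2, 0, 2]]) == 1
    assert winner([[1, 2, 0], [1, 2, 0], [0, 2, 1]]) == 2
    assert winner([[0, 0, 0], [0, 0, 0], [0, 0, 0]]) == 0
    print("winner() sketch OK")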

View File

@@ -1,18 +1,20 @@
 import subprocess

 import pytest


 def run_game_with_inputs(inputs):
     # Start the game process
     process = subprocess.Popen(
-        ['python', 'tic_tac_toe.py'],
+        ["python", "tic_tac_toe.py"],
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
-        text=True
+        text=True,
     )

     # Send the input moves one by one
-    output, errors = process.communicate('\n'.join(inputs))
+    output, errors = process.communicate("\n".join(inputs))

     # Print the inputs and outputs
     print("Inputs:\n", "\n".join(inputs))

@@ -22,14 +24,18 @@ def run_game_with_inputs(inputs):
     return output


-@pytest.mark.parametrize("inputs, expected_output", [
-    (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
-    (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
-    (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw")
-])
+@pytest.mark.parametrize(
+    "inputs, expected_output",
+    [
+        (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
+        (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
+        (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"),
+    ],
+)
 def test_game(inputs, expected_output):
     output = run_game_with_inputs(inputs)
     assert expected_output in output


-if __name__ == '__main__':
+if __name__ == "__main__":
     pytest.main()

View File

@@ -1,9 +1,8 @@
 import pytest

 from abstract_class import ShipPlacement, Turn
 from battleship import Battleship


 @pytest.fixture
 def battleship_game():
     return Battleship()

View File

@@ -1,7 +1,6 @@
 import pytest
-from pydantic import ValidationError

 from abstract_class import ShipPlacement, Turn
+from pydantic import ValidationError

@@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
 def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
-    game = battleship_game.get_game(
-        initialized_game_id
-    )
+    game = battleship_game.get_game(initialized_game_id)
     additional_ship = ShipPlacement(
         ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
     )

View File

@@ -86,9 +86,7 @@ class Battleship(AbstractBattleship):
         game.turns.append(turn)

         if hit_ship == "hit":
-            return TurnResponse(
-                result="miss", ship_type=None
-            )
+            return TurnResponse(result="miss", ship_type=None)

         if hit_ship:
             ship_placement = next(sp for sp in game.ships if sp.ship_type == hit_ship)

@@ -133,9 +131,7 @@ class Battleship(AbstractBattleship):
         )

         if hits == total_ships_length:
-            return GameStatus(
-                is_game_over=True, winner="player"
-            )
+            return GameStatus(is_game_over=True, winner="player")
         else:
             return GameStatus(is_game_over=False, winner=None)

View File

@@ -1,9 +1,8 @@
 import pytest

 from abstract_class import ShipPlacement, Turn
 from battleship import Battleship


 @pytest.fixture
 def battleship_game():
     return Battleship()

View File

@@ -1,7 +1,6 @@
 import pytest
-from pydantic import ValidationError

 from abstract_class import ShipPlacement, Turn
+from pydantic import ValidationError

@@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
 def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
-    game = battleship_game.get_game(
-        initialized_game_id
-    )
+    game = battleship_game.get_game(initialized_game_id)
     additional_ship = ShipPlacement(
         ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
     )

View File

@@ -16,7 +16,7 @@ from benchmark.reports.reports import (
     generate_single_call_report,
     session_finish,
 )
-from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig
+from benchmark.utils.data_types import AgentBenchmarkConfig, SuiteConfig

 GLOBAL_TIMEOUT = (
     1500  # The tests will stop after 25 minutes so we can send the reports.

@@ -31,16 +31,15 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
             return agent_benchmark_config
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise


 def resolve_workspace(workspace: str) -> str:
     if workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}

@@ -65,7 +64,9 @@ def config(request: Any) -> Any:
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

@@ -73,8 +74,12 @@ def config(request: Any) -> Any:
     if isinstance(config["workspace"], str):
         config["workspace"] = resolve_workspace(agent_benchmark_config.workspace)
     else:  # it's a input output dict
-        config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input")
-        config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output")
+        config["workspace"]["input"] = resolve_workspace(
+            agent_benchmark_config.workspace / "input"
+        )
+        config["workspace"]["output"] = resolve_workspace(
+            agent_benchmark_config.workspace / "output"
+        )

     return config

@@ -238,9 +243,11 @@ def scores(request: Any) -> None:
 # this is adding the dependency marker and category markers automatically from the json
 def pytest_collection_modifyitems(items: Any, config: Any) -> None:
     try:
-        with open(config.getoption('--agent_config_path'), "r") as f:
+        with open(config.getoption("--agent_config_path"), "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path')
+            agent_benchmark_config.agent_benchmark_config_path = config.getoption(
+                "--agent_config_path"
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

View File

@@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, Optional
 import pytest

 from benchmark.utils.challenge import Challenge
-from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig
+from benchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, SuiteConfig
 from benchmark.utils.utils import get_test_path

 DATA_CATEGORY = {}

@@ -222,7 +222,7 @@
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")

-    challenges_path = os.path.join(os.path.dirname(__file__), 'challenges')
+    challenges_path = os.path.join(os.path.dirname(__file__), "challenges")

     json_files = deque(
         glob.glob(

@@ -239,14 +239,16 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

     regression_reports_path = agent_benchmark_config.get_regression_reports_path()
     if regression_reports_path and os.path.exists(regression_reports_path):
-        with open(regression_reports_path, 'r') as f:
+        with open(regression_reports_path, "r") as f:
             regression_tests = json.load(f)
     else:
         regression_tests = {}

View File

@@ -6,12 +6,13 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict

+from benchmark.__main__ import BENCHMARK_START_TIME
 from benchmark.reports.processing.graphs import save_single_radar_chart
 from benchmark.reports.processing.process_report import get_agent_category
 from benchmark.reports.processing.report_types import Report
-from benchmark.utils.utils import get_highest_success_difficulty
 from benchmark.utils.data_types import AgentBenchmarkConfig
-from benchmark.__main__ import BENCHMARK_START_TIME
+from benchmark.utils.utils import get_highest_success_difficulty


 class ReportManager:
     """Abstracts interaction with the regression tests file"""

@@ -24,7 +25,7 @@ class ReportManager:
     def load(self) -> None:
         if not os.path.exists(self.filename):
             os.makedirs(os.path.dirname(self.filename), exist_ok=True)
-            with open(self.filename, 'w') as f:
+            with open(self.filename, "w") as f:
                 pass

         try:

@@ -62,13 +63,12 @@ class ReportManager:
         self.save()

     def end_info_report(self, config: AgentBenchmarkConfig) -> None:
         command = " ".join(sys.argv)

         self.tests = {
             "command": command.split(os.sep)[-1],
-            "benchmark_git_commit_sha": '---',
-            "agent_git_commit_sha": '---',
+            "benchmark_git_commit_sha": "---",
+            "agent_git_commit_sha": "---",
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),

@@ -79,7 +79,9 @@ class ReportManager:
                 "total_cost": self.get_total_costs(),
             },
             "tests": self.tests,
-            "config": {k: v for k, v in json.loads(config.json()).items() if v is not None},
+            "config": {
+                k: v for k, v in json.loads(config.json()).items() if v is not None
+            },
         }

         converted_data = Report.parse_obj(self.tests)

@@ -88,7 +90,6 @@ class ReportManager:
         save_single_radar_chart(
             agent_categories,
             config.get_reports_path() / "radar_chart.png",
         )

View File

@@ -4,7 +4,13 @@ import sys
 from pathlib import Path
 from typing import Any, Dict

-from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig
+from benchmark.reports.ReportManager import ReportManager
+from benchmark.utils.data_types import (
+    DIFFICULTY_MAP,
+    AgentBenchmarkConfig,
+    DifficultyLevel,
+    SuiteConfig,
+)
 from benchmark.utils.get_data_from_helicone import get_data_from_helicone
 from benchmark.utils.utils import (
     calculate_success_percentage,

@@ -12,8 +18,6 @@ from benchmark.utils.utils import (
     get_test_path,
     replace_backslash,
 )
-from benchmark.reports.ReportManager import ReportManager


 def get_agent_benchmark_config() -> AgentBenchmarkConfig:

@@ -24,23 +28,32 @@ def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
             return agent_benchmark_config
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise


 def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path())
+    REGRESSION_MANAGER = ReportManager(
+        agent_benchmark_config.get_regression_reports_path()
+    )

     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
-    INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json"))
+    INFO_MANAGER = ReportManager(
+        str(agent_benchmark_config.get_reports_path() / "report.json")
+    )

     # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path())
+    INTERNAL_INFO_MANAGER = ReportManager(
+        agent_benchmark_config.get_success_rate_path()
+    )

     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER

@@ -132,16 +145,12 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

-    prev_test_results = INTERNAL_INFO_MANAGER.tests.get(
-        test_name, []
-    )
+    prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, [])

     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        INTERNAL_INFO_MANAGER.add_test(
-            test_name, prev_test_results
-        )
+        INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results)

     # can calculate success rate regardless of mock
     info_details["metrics"]["success_%"] = calculate_success_percentage(

@@ -199,8 +208,8 @@ def generate_single_call_report(
         },
         "answers": answers,
     }
-    if 'metadata' in challenge_data:
-        info_details['metadata'] = challenge_data['metadata']
+    if "metadata" in challenge_data:
+        info_details["metadata"] = challenge_data["metadata"]

     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

@@ -298,9 +307,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
         }

         for name in suite_file_datum:
-            test_data = INFO_MANAGER.tests[
-                name
-            ]  # get the individual test reports
+            test_data = INFO_MANAGER.tests[name]  # get the individual test reports
             data[name] = test_data  # this is for calculating highest difficulty

             INFO_MANAGER.remove_test(name)

@@ -330,7 +337,6 @@ def session_finish(suite_reports: dict) -> None:
     agent_benchmark_config = get_agent_benchmark_config()
     INTERNAL_INFO_MANAGER.save()
     INFO_MANAGER.end_info_report(agent_benchmark_config)
     REGRESSION_MANAGER.save()

View File

@@ -9,14 +9,13 @@ from typing import Any, Optional
 import click
 import pytest
 from helicone.lock import HeliconeLockManager

-import sys
-sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark')
+sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark")
 from agbenchmark.reports.ReportManager import ReportManager
-from agbenchmark.utils.utils import (
+from agbenchmark.utils.utils import (  # get_git_commit_sha,
     AGENT_NAME,
     calculate_dynamic_paths,
-    # get_git_commit_sha,
 )

 CURRENT_DIRECTORY = Path(__file__).resolve().parent

@@ -34,8 +33,8 @@ if os.environ.get("HELICONE_API_KEY"):
     SUCCESS_RATE_PATH,
     CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
 BENCHMARK_GIT_COMMIT_SHA = "---"  # get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
 AGENT_GIT_COMMIT_SHA = "---"  # get_git_commit_sha(HOME_DIRECTORY)
 # open a file in the challenges/optional_categories
 with open(
     Path(__file__).resolve().parent / "challenges" / "optional_categories.json"

@@ -334,13 +333,16 @@ def get_regression_data() -> Any:
     return data


 @cli.command()
 def version():
     """Print the version of the benchmark tool."""
     import toml

-    version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
+    version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][
+        "version"
+    ]
     print(f"Benchmark Tool Version {version}")


 # def run_from_backend(

View File

@@ -1,11 +1,14 @@
 import glob
 import json
+import sys
+from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-import sys

 from pydantic import BaseModel, root_validator, validator
-from datetime import datetime, timezone


 class DifficultyLevel(Enum):
     interface = "interface"
     basic = "basic"

@@ -29,6 +32,7 @@ DIFFICULTY_MAP = {
 STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}


 def calculate_info_test_path(base_path: Path) -> Path:
     """
     Calculates the path to the directory where the test report will be saved.

@@ -69,6 +73,7 @@ def calculate_info_test_path(base_path: Path) -> Path:
     return report_path


 class AgentBenchmarkConfig(BaseModel):
     """
     This class represents the configuration for the Agent Benchmark.

@@ -79,6 +84,7 @@ class AgentBenchmarkConfig(BaseModel):
     - api_mode: A boolean indicating whether the benchmark is run in API mode.
     - host: The host where the benchmark is run.
     """

     agent_benchmark_config_path: Path | None = None
     entry_path: Path
     workspace: Path

@@ -88,19 +94,24 @@
     def get_reports_location(self) -> Path:
         if not self.reports_folder:
-            self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve()
+            self.reports_folder = (
+                self.agent_benchmark_config_path
+                / self.entry_path.parent
+                / ".."
+                / "reports"
+            ).resolve()
         return self.reports_folder

     def get_reports_path(self) -> Path:
         return calculate_info_test_path(self.get_reports_location())

     def get_regression_reports_path(self) -> Path:
         return self.get_reports_location() / "regression_tests.json"

     def get_success_rate_path(self) -> Path:
         return self.get_reports_location() / "success_rate.json"


 class Info(BaseModel):
     difficulty: DifficultyLevel
     description: str
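
Several hunks in this commit repeat the same pattern for loading this configuration from benchmark_config.json. The sketch below is not part of the commit: it uses a stand-in model that mirrors the fields documented above (the real class is benchmark.utils.data_types.AgentBenchmarkConfig), and the field defaults and helper names here are assumptions.

# Illustrative sketch, not part of the commit: the benchmark_config.json load
# pattern repeated throughout this diff, against a stand-in pydantic model.
import json
from pathlib import Path

from pydantic import BaseModel


class AgentBenchmarkConfigSketch(BaseModel):
    # Field names follow the docstring above; the defaults are assumptions.
    agent_benchmark_config_path: Path | None = None
    entry_path: Path
    workspace: Path
    api_mode: bool = False
    host: str | None = None


def load_agent_benchmark_config(path: Path) -> AgentBenchmarkConfigSketch:
    # Same shape as the try/except blocks repeated in several files of this commit.
    try:
        with open(path, "r") as f:
            config = AgentBenchmarkConfigSketch(**json.load(f))
            config.agent_benchmark_config_path = path
            return config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise


# e.g. load_agent_benchmark_config(Path("benchmark_config.json"))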

View File

@@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None:
         for action in group.options:
             current_options += action._short_opts + action._long_opts

     group = parser.getgroup("depends")

     # Add a flag to list all names + the tests they resolve to

View File

@@ -16,9 +16,6 @@ AGENT_NAME = os.getenv("AGENT_NAME")
 REPORT_LOCATION = os.getenv("REPORT_LOCATION", None)


 def replace_backslash(value: Any) -> Any:
     if isinstance(value, str):
         return re.sub(