Fixing benchmark code

pull/5194/head
SwiftyOS 2023-09-11 17:24:23 +02:00 committed by Merwane Hamadi
parent c73e90c4e6
commit ef2107d9c2
21 changed files with 177 additions and 119 deletions

View File

@@ -9,11 +9,10 @@ from typing import Any

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

-from agbenchmark.utils.utils import find_absolute_benchmark_path
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
+from agbenchmark.utils.utils import find_absolute_benchmark_path

 app = FastAPI()

 origins = ["http://localhost:3000"]

View File

@@ -5,10 +5,10 @@ import sys
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Optional

-import toml
 import click
 import pytest
+import toml
 from helicone.lock import HeliconeLockManager

 from benchmark.utils.data_types import AgentBenchmarkConfig

@@ -72,7 +72,9 @@ def run_benchmark(
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         return 1

@@ -97,8 +99,9 @@ def run_benchmark(
         )
         return 1

-    assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
-        "Error: host needs to be added to the config if api_mode is set to True."
+    assert not (
+        agent_benchmark_config.api_mode and not agent_benchmark_config.host
+    ), "Error: host needs to be added to the config if api_mode is set to True."

     print("Current configuration:")
     for key, value in vars(agent_benchmark_config).items():

@@ -200,7 +203,12 @@ def cli() -> None:
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
-@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
+@click.option(
+    "--agent-config",
+    type=click.Path(exists=True),
+    help="Path to the agent benchmark_config.json file,",
+    required=True,
+)
 def start(
     maintain: bool,
     improve: bool,

@@ -220,8 +228,9 @@ def start(
     original_stdout = sys.stdout  # Save the original standard output
     exit_code = None

-    assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"
+    assert (
+        "benchmark_config.json" in agent_config
+    ), "benchmark_config.json must be provided"

     if backend:
         with open("backend/backend_stdout.txt", "w") as f:

@@ -266,7 +275,9 @@ def start(
 def version():
     """Print the version of the benchmark tool."""
     current_directory = Path(__file__).resolve().parent
-    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
+    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][
+        "version"
+    ]
     print(f"Benchmark Tool Version {version}")

View File

@@ -51,7 +51,6 @@ async def run_api_agent(
         artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
         for artifact in artifacts:
             if artifact.relative_path:
                 folder_path = os.path.join(config["workspace"], artifact.relative_path)
             else:

View File

@@ -12,7 +12,6 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv

 load_dotenv()

 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")

View File

@@ -1,15 +1,22 @@
 import unittest

-from url_shortener import shorten_url, retrieve_url
+from url_shortener import retrieve_url, shorten_url


 class TestURLShortener(unittest.TestCase):
     def test_url_retrieval(self):
         # Shorten the URL to get its shortened form
-        shortened_url = shorten_url('https://www.example.com')
+        shortened_url = shorten_url("https://www.example.com")
         # Retrieve the original URL using the shortened URL directly
         retrieved_url = retrieve_url(shortened_url)
-        self.assertEqual(retrieved_url, 'https://www.example.com', "Retrieved URL does not match the original!")
+        self.assertEqual(
+            retrieved_url,
+            "https://www.example.com",
+            "Retrieved URL does not match the original!",
+        )


 if __name__ == "__main__":
     unittest.main()

View File

@@ -3,6 +3,7 @@ import base64
 URL_MAPPING = {}


 def shorten_url(url):
     # Convert the URL to base64
     encoded_url = base64.b64encode(url.encode()).decode()

@@ -12,13 +13,15 @@ def shorten_url(url):
     URL_MAPPING[short_url] = url
     return short_url


 def retrieve_url(short_url):
     return URL_MAPPING.get(short_url, "URL not found")


 def main():
     parser = argparse.ArgumentParser(description="URL Shortener")
-    parser.add_argument('-s', '--shorten', type=str, help="URL to be shortened")
-    parser.add_argument('-r', '--retrieve', type=str, help="Short URL to be retrieved")
+    parser.add_argument("-s", "--shorten", type=str, help="URL to be shortened")
+    parser.add_argument("-r", "--retrieve", type=str, help="Short URL to be retrieved")
     args = parser.parse_args()

@@ -32,5 +35,6 @@ def main():
     else:
         print("No valid arguments provided.")

 if __name__ == "__main__":
     main()
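
For reference, the round trip that the unit test in the previous file exercises can be reproduced in isolation. The sketch below is not part of the commit: it condenses url_shortener as shown in the hunks above, and because the diff elides the line that derives the short key from the encoded URL, the truncation used here (first 10 characters) is an assumption.

# Illustrative sketch, not part of the commit. The key-derivation step
# (encoded_url[:10]) is assumed; the diff does not show that line.
import base64

URL_MAPPING = {}


def shorten_url(url):
    # Convert the URL to base64 and keep a short prefix as the lookup key (assumed)
    encoded_url = base64.b64encode(url.encode()).decode()
    short_url = encoded_url[:10]
    URL_MAPPING[short_url] = url
    return short_url


def retrieve_url(short_url):
    return URL_MAPPING.get(short_url, "URL not found")


if __name__ == "__main__":
    short = shorten_url("https://www.example.com")
    print(short, "->", retrieve_url(short))  # the round trip checked by the test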

View File

@@ -1,38 +1,45 @@
 import pprint


 def column(matrix, i):
     return [row[i] for row in matrix]


 def check(list):
     if len(set(list)) <= 1:
         if list[0] != 0:
             return list[0]
     return None


 def checkDiagLeft(board):
-    if (board[0][0] == board[1][1] and board[1][1] == board[2][2]):
+    if board[0][0] == board[1][1] and board[1][1] == board[2][2]:
         if board[0][0] != 0:
             return board[0][0]
     return None


 def checkDiagRight(board):
-    if (board[2][0] == board[1][1] and board[1][1] == board[0][2]):
+    if board[2][0] == board[1][1] and board[1][1] == board[0][2]:
         if board[2][0] != 0:
             return board[2][0]
     return None


 def placeItem(row, column, board, current_player):
     if board[row][column] != 0:
         return None
     else:
         board[row][column] = current_player


 def swapPlayers(player):
-    if (player == 2):
+    if player == 2:
         return 1
     else:
         return 2


 def winner(board):
     for rowIndex in board:
         if check(rowIndex) is not None:

@@ -46,23 +53,35 @@ def winner(board):
         return checkDiagRight(board)
     return 0


 def getLocation():
-    location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
+    location = input(
+        "Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
+    )
     print(f"\nYou picked {location}")
-    coordinates = [int(x) for x in location.split(',')]
-    while (len(coordinates) != 2 or coordinates[0] < 0 or coordinates[0] > 2 or coordinates[1] < 0 or coordinates[1] > 2):
+    coordinates = [int(x) for x in location.split(",")]
+    while (
+        len(coordinates) != 2
+        or coordinates[0] < 0
+        or coordinates[0] > 2
+        or coordinates[1] < 0
+        or coordinates[1] > 2
+    ):
         print("You inputted a location in an invalid format")
-        location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
-        coordinates = [int(x) for x in location.split(',')]
+        location = input(
+            "Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
+        )
+        coordinates = [int(x) for x in location.split(",")]
     return coordinates


 def gamePlay():
     num_moves = 0
     pp = pprint.PrettyPrinter(width=20)
     current_player = 1
     board = [[0 for x in range(3)] for x in range(3)]
-    while (num_moves < 9 and winner(board) == 0):
+    while num_moves < 9 and winner(board) == 0:
         print("This is the current board: ")
         pp.pprint(board)
         coordinates = getLocation()

@@ -75,5 +94,6 @@ def gamePlay():
     if winner(board) == 0:
         print("Draw")

-if __name__ == '__main__':
+if __name__ == "__main__":
     gamePlay()
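
The hunks above show winner()'s helpers but elide most of its body. As a point of reference only, not the file's actual code, here is one way those helpers compose into a full winner check; the column and diagonal loop structure is an assumption.

# Illustrative sketch, not part of the commit: one plausible composition of the
# helpers shown above. The real winner() body is elided by the diff.
def column(matrix, i):
    return [row[i] for row in matrix]


def check(line):
    # A line is decided when all three cells match and are non-zero.
    if len(set(line)) <= 1 and line[0] != 0:
        return line[0]
    return None


def winner(board):
    for row in board:  # rows
        if check(row) is not None:
            return check(row)
    for i in range(3):  # columns
        if check(column(board, i)) is not None:
            return check(column(board, i))
    # the two diagonals, mirroring checkDiagLeft / checkDiagRight above
    for diag in (
        [board[0][0], board[1][1], board[2][2]],
        [board[2][0], board[1][1], board[0][2]],
    ):
        if check(diag) is not None:
            return check(diag)
    return 0  # 0 means nobody has won yet


if __name__ == "__main__":
    assert winner([[1, 1, 1], [0, 2, 0], [2, 0, 2]]) == 1
    assert winner([[1, 2, 0], [1, 2, 0], [0, 2, 1]]) == 2
    assert winner([[0, 0, 0], [0, 0, 0], [0, 0, 0]]) == 0
    print("winner() sketch OK")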

View File

@@ -1,18 +1,20 @@
 import subprocess

 import pytest


 def run_game_with_inputs(inputs):
     # Start the game process
     process = subprocess.Popen(
-        ['python', 'tic_tac_toe.py'],
+        ["python", "tic_tac_toe.py"],
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
-        text=True
+        text=True,
     )

     # Send the input moves one by one
-    output, errors = process.communicate('\n'.join(inputs))
+    output, errors = process.communicate("\n".join(inputs))

     # Print the inputs and outputs
     print("Inputs:\n", "\n".join(inputs))

@@ -22,14 +24,18 @@ def run_game_with_inputs(inputs):
     return output


-@pytest.mark.parametrize("inputs, expected_output", [
-    (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
-    (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
-    (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw")
-])
+@pytest.mark.parametrize(
+    "inputs, expected_output",
+    [
+        (["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
+        (["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
+        (["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"),
+    ],
+)
 def test_game(inputs, expected_output):
     output = run_game_with_inputs(inputs)
     assert expected_output in output


-if __name__ == '__main__':
+if __name__ == "__main__":
     pytest.main()

View File

@@ -1,9 +1,8 @@
 import pytest

 from abstract_class import ShipPlacement, Turn
 from battleship import Battleship


 @pytest.fixture
 def battleship_game():
     return Battleship()

View File

@@ -1,7 +1,6 @@
 import pytest
-from pydantic import ValidationError

 from abstract_class import ShipPlacement, Turn
+from pydantic import ValidationError

@@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
 def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
-    game = battleship_game.get_game(
-        initialized_game_id
-    )
+    game = battleship_game.get_game(initialized_game_id)
     additional_ship = ShipPlacement(
         ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
     )

View File

@@ -86,9 +86,7 @@ class Battleship(AbstractBattleship):
         game.turns.append(turn)

         if hit_ship == "hit":
-            return TurnResponse(
-                result="miss", ship_type=None
-            )
+            return TurnResponse(result="miss", ship_type=None)

         if hit_ship:
             ship_placement = next(sp for sp in game.ships if sp.ship_type == hit_ship)

@@ -133,9 +131,7 @@ class Battleship(AbstractBattleship):
         )

         if hits == total_ships_length:
-            return GameStatus(
-                is_game_over=True, winner="player"
-            )
+            return GameStatus(is_game_over=True, winner="player")
         else:
             return GameStatus(is_game_over=False, winner=None)

View File

@@ -1,9 +1,8 @@
 import pytest

 from abstract_class import ShipPlacement, Turn
 from battleship import Battleship


 @pytest.fixture
 def battleship_game():
     return Battleship()

View File

@@ -1,7 +1,6 @@
 import pytest
-from pydantic import ValidationError

 from abstract_class import ShipPlacement, Turn
+from pydantic import ValidationError

@@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
 def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
-    game = battleship_game.get_game(
-        initialized_game_id
-    )
+    game = battleship_game.get_game(initialized_game_id)
     additional_ship = ShipPlacement(
         ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
     )

View File

@@ -16,7 +16,7 @@ from benchmark.reports.reports import (
     generate_single_call_report,
     session_finish,
 )
-from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig
+from benchmark.utils.data_types import AgentBenchmarkConfig, SuiteConfig

 GLOBAL_TIMEOUT = (
     1500  # The tests will stop after 25 minutes so we can send the reports.

@@ -31,16 +31,15 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
             return agent_benchmark_config
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise


 def resolve_workspace(workspace: str) -> str:
     if workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}

@@ -65,7 +64,9 @@ def config(request: Any) -> Any:
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

@@ -73,8 +74,12 @@ def config(request: Any) -> Any:
     if isinstance(config["workspace"], str):
         config["workspace"] = resolve_workspace(agent_benchmark_config.workspace)
     else:  # it's a input output dict
-        config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input")
-        config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output")
+        config["workspace"]["input"] = resolve_workspace(
+            agent_benchmark_config.workspace / "input"
+        )
+        config["workspace"]["output"] = resolve_workspace(
+            agent_benchmark_config.workspace / "output"
+        )

     return config

@@ -238,9 +243,11 @@ def scores(request: Any) -> None:
 # this is adding the dependency marker and category markers automatically from the json
 def pytest_collection_modifyitems(items: Any, config: Any) -> None:
     try:
-        with open(config.getoption('--agent_config_path'), "r") as f:
+        with open(config.getoption("--agent_config_path"), "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path')
+            agent_benchmark_config.agent_benchmark_config_path = config.getoption(
+                "--agent_config_path"
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

View File

@@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, Optional
 import pytest

 from benchmark.utils.challenge import Challenge
-from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig
+from benchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, SuiteConfig
 from benchmark.utils.utils import get_test_path

 DATA_CATEGORY = {}

@@ -222,7 +222,7 @@
 def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")

-    challenges_path = os.path.join(os.path.dirname(__file__), 'challenges')
+    challenges_path = os.path.join(os.path.dirname(__file__), "challenges")

     json_files = deque(
         glob.glob(

@@ -239,14 +239,16 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

     regression_reports_path = agent_benchmark_config.get_regression_reports_path()
     if regression_reports_path and os.path.exists(regression_reports_path):
-        with open(regression_reports_path, 'r') as f:
+        with open(regression_reports_path, "r") as f:
             regression_tests = json.load(f)
     else:
         regression_tests = {}

View File

@@ -6,12 +6,13 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict

+from benchmark.__main__ import BENCHMARK_START_TIME
 from benchmark.reports.processing.graphs import save_single_radar_chart
 from benchmark.reports.processing.process_report import get_agent_category
 from benchmark.reports.processing.report_types import Report
-from benchmark.utils.utils import get_highest_success_difficulty
 from benchmark.utils.data_types import AgentBenchmarkConfig
-from benchmark.__main__ import BENCHMARK_START_TIME
+from benchmark.utils.utils import get_highest_success_difficulty


 class ReportManager:
     """Abstracts interaction with the regression tests file"""

@@ -24,7 +25,7 @@ class ReportManager:
     def load(self) -> None:
         if not os.path.exists(self.filename):
             os.makedirs(os.path.dirname(self.filename), exist_ok=True)
-            with open(self.filename, 'w') as f:
+            with open(self.filename, "w") as f:
                 pass

         try:

@@ -62,13 +63,12 @@ class ReportManager:
         self.save()

     def end_info_report(self, config: AgentBenchmarkConfig) -> None:
         command = " ".join(sys.argv)

         self.tests = {
             "command": command.split(os.sep)[-1],
-            "benchmark_git_commit_sha": '---',
-            "agent_git_commit_sha": '---',
+            "benchmark_git_commit_sha": "---",
+            "agent_git_commit_sha": "---",
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),

@@ -79,7 +79,9 @@ class ReportManager:
                 "total_cost": self.get_total_costs(),
             },
             "tests": self.tests,
-            "config": {k: v for k, v in json.loads(config.json()).items() if v is not None},
+            "config": {
+                k: v for k, v in json.loads(config.json()).items() if v is not None
+            },
         }

         converted_data = Report.parse_obj(self.tests)

@@ -88,7 +90,6 @@ class ReportManager:
         save_single_radar_chart(
             agent_categories,
             config.get_reports_path() / "radar_chart.png",
         )

View File

@@ -4,7 +4,13 @@ import sys
 from pathlib import Path
 from typing import Any, Dict

-from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig
+from benchmark.reports.ReportManager import ReportManager
+from benchmark.utils.data_types import (
+    DIFFICULTY_MAP,
+    AgentBenchmarkConfig,
+    DifficultyLevel,
+    SuiteConfig,
+)
 from benchmark.utils.get_data_from_helicone import get_data_from_helicone
 from benchmark.utils.utils import (
     calculate_success_percentage,

@@ -12,8 +18,6 @@ from benchmark.utils.utils import (
     get_test_path,
     replace_backslash,
 )
-from benchmark.reports.ReportManager import ReportManager


 def get_agent_benchmark_config() -> AgentBenchmarkConfig:

@@ -24,23 +28,32 @@ def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
             return agent_benchmark_config
     except json.JSONDecodeError:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise


 def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path())
+    REGRESSION_MANAGER = ReportManager(
+        agent_benchmark_config.get_regression_reports_path()
+    )

     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
-    INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json"))
+    INFO_MANAGER = ReportManager(
+        str(agent_benchmark_config.get_reports_path() / "report.json")
+    )

     # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path())
+    INTERNAL_INFO_MANAGER = ReportManager(
+        agent_benchmark_config.get_success_rate_path()
+    )

     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER

@@ -132,16 +145,12 @@ def get_previous_test_results(
     agent_tests: dict[str, list[bool]] = {}
     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

-    prev_test_results = INTERNAL_INFO_MANAGER.tests.get(
-        test_name, []
-    )
+    prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, [])

     if not mock:
         # only add if it's an actual test
         prev_test_results.append(info_details["metrics"]["success"])
-        INTERNAL_INFO_MANAGER.add_test(
-            test_name, prev_test_results
-        )
+        INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results)

     # can calculate success rate regardless of mock
     info_details["metrics"]["success_%"] = calculate_success_percentage(

@@ -199,8 +208,8 @@ def generate_single_call_report(
         },
         "answers": answers,
     }
-    if 'metadata' in challenge_data:
-        info_details['metadata'] = challenge_data['metadata']
+    if "metadata" in challenge_data:
+        info_details["metadata"] = challenge_data["metadata"]

     mock = "--mock" in sys.argv  # Check if --mock is in sys.argv

@@ -298,9 +307,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
         }

         for name in suite_file_datum:
-            test_data = INFO_MANAGER.tests[
-                name
-            ]  # get the individual test reports
+            test_data = INFO_MANAGER.tests[name]  # get the individual test reports
             data[name] = test_data  # this is for calculating highest difficulty

             INFO_MANAGER.remove_test(name)

@@ -330,7 +337,6 @@ def session_finish(suite_reports: dict) -> None:
     agent_benchmark_config = get_agent_benchmark_config()
     INTERNAL_INFO_MANAGER.save()
     INFO_MANAGER.end_info_report(agent_benchmark_config)
     REGRESSION_MANAGER.save()

View File

@@ -9,14 +9,13 @@ from typing import Any, Optional
 import click
 import pytest
 from helicone.lock import HeliconeLockManager

-import sys
-sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark')
+sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark")
 from agbenchmark.reports.ReportManager import ReportManager
-from agbenchmark.utils.utils import (
+from agbenchmark.utils.utils import (  # get_git_commit_sha,
     AGENT_NAME,
     calculate_dynamic_paths,
-    # get_git_commit_sha,
 )

 CURRENT_DIRECTORY = Path(__file__).resolve().parent

@@ -34,8 +33,8 @@ if os.environ.get("HELICONE_API_KEY"):
     SUCCESS_RATE_PATH,
     CHALLENGES_PATH,
 ) = calculate_dynamic_paths()
 BENCHMARK_GIT_COMMIT_SHA = "---"  # get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
 AGENT_GIT_COMMIT_SHA = "---"  # get_git_commit_sha(HOME_DIRECTORY)
 # open a file in the challenges/optional_categories
 with open(
     Path(__file__).resolve().parent / "challenges" / "optional_categories.json"

@@ -334,13 +333,16 @@ def get_regression_data() -> Any:
     return data


 @cli.command()
 def version():
     """Print the version of the benchmark tool."""
     import toml

-    version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
+    version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][
+        "version"
+    ]
     print(f"Benchmark Tool Version {version}")


 # def run_from_backend(

View File

@@ -1,11 +1,14 @@
 import glob
 import json
+import sys
+from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-import sys

 from pydantic import BaseModel, root_validator, validator
-from datetime import datetime, timezone


 class DifficultyLevel(Enum):
     interface = "interface"
     basic = "basic"

@@ -29,6 +32,7 @@ DIFFICULTY_MAP = {
 STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}


 def calculate_info_test_path(base_path: Path) -> Path:
     """
     Calculates the path to the directory where the test report will be saved.

@@ -69,6 +73,7 @@ def calculate_info_test_path(base_path: Path) -> Path:
     return report_path


 class AgentBenchmarkConfig(BaseModel):
     """
     This class represents the configuration for the Agent Benchmark.

@@ -79,6 +84,7 @@ class AgentBenchmarkConfig(BaseModel):
     - api_mode: A boolean indicating whether the benchmark is run in API mode.
     - host: The host where the benchmark is run.
     """

     agent_benchmark_config_path: Path | None = None
     entry_path: Path
     workspace: Path

@@ -88,19 +94,24 @@
     def get_reports_location(self) -> Path:
         if not self.reports_folder:
-            self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve()
+            self.reports_folder = (
+                self.agent_benchmark_config_path
+                / self.entry_path.parent
+                / ".."
+                / "reports"
+            ).resolve()
         return self.reports_folder

     def get_reports_path(self) -> Path:
         return calculate_info_test_path(self.get_reports_location())

     def get_regression_reports_path(self) -> Path:
         return self.get_reports_location() / "regression_tests.json"

     def get_success_rate_path(self) -> Path:
         return self.get_reports_location() / "success_rate.json"


 class Info(BaseModel):
     difficulty: DifficultyLevel
     description: str
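
Several hunks in this commit repeat the same pattern for loading this configuration from benchmark_config.json. The sketch below is not part of the commit: it uses a stand-in model that mirrors the fields documented above (the real class is benchmark.utils.data_types.AgentBenchmarkConfig), and the field defaults and helper names here are assumptions.

# Illustrative sketch, not part of the commit: the benchmark_config.json load
# pattern repeated throughout this diff, against a stand-in pydantic model.
import json
from pathlib import Path

from pydantic import BaseModel


class AgentBenchmarkConfigSketch(BaseModel):
    # Field names follow the docstring above; the defaults are assumptions.
    agent_benchmark_config_path: Path | None = None
    entry_path: Path
    workspace: Path
    api_mode: bool = False
    host: str | None = None


def load_agent_benchmark_config(path: Path) -> AgentBenchmarkConfigSketch:
    # Same shape as the try/except blocks repeated in several files of this commit.
    try:
        with open(path, "r") as f:
            config = AgentBenchmarkConfigSketch(**json.load(f))
            config.agent_benchmark_config_path = path
            return config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise


# e.g. load_agent_benchmark_config(Path("benchmark_config.json"))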

View File

@@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None:
         for action in group.options:
             current_options += action._short_opts + action._long_opts

     group = parser.getgroup("depends")

     # Add a flag to list all names + the tests they resolve to

View File

@@ -16,9 +16,6 @@ AGENT_NAME = os.getenv("AGENT_NAME")
 REPORT_LOCATION = os.getenv("REPORT_LOCATION", None)


 def replace_backslash(value: Any) -> Any:
     if isinstance(value, str):
         return re.sub(