Fixing benchmark code
parent
c73e90c4e6
commit
ef2107d9c2
|
@ -9,11 +9,10 @@ from typing import Any
|
|||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
||||
from agbenchmark.utils.utils import find_absolute_benchmark_path
|
||||
from fastapi import FastAPI, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
from agbenchmark.utils.utils import find_absolute_benchmark_path
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
origins = ["http://localhost:3000"]
|
||||
|
|
|
@ -5,10 +5,10 @@ import sys
|
|||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
import toml
|
||||
|
||||
import click
|
||||
import pytest
|
||||
import toml
|
||||
from helicone.lock import HeliconeLockManager
|
||||
|
||||
from benchmark.utils.data_types import AgentBenchmarkConfig
|
||||
|
@ -72,7 +72,9 @@ def run_benchmark(
|
|||
try:
|
||||
with open(agent_benchmark_config_path, "r") as f:
|
||||
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
|
||||
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
|
||||
agent_benchmark_config.agent_benchmark_config_path = (
|
||||
agent_benchmark_config_path
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
print("Error: benchmark_config.json is not a valid JSON file.")
|
||||
return 1
|
||||
|
@ -97,8 +99,9 @@ def run_benchmark(
|
|||
)
|
||||
return 1
|
||||
|
||||
assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
|
||||
"Error: host needs to be added to the config if api_mode is set to True."
|
||||
assert not (
|
||||
agent_benchmark_config.api_mode and not agent_benchmark_config.host
|
||||
), "Error: host needs to be added to the config if api_mode is set to True."
|
||||
|
||||
print("Current configuration:")
|
||||
for key, value in vars(agent_benchmark_config).items():
|
||||
|
@ -200,7 +203,12 @@ def cli() -> None:
|
|||
)
|
||||
@click.option("--nc", is_flag=True, help="Run without cutoff")
|
||||
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
|
||||
@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
|
||||
@click.option(
|
||||
"--agent-config",
|
||||
type=click.Path(exists=True),
|
||||
help="Path to the agent benchmark_config.json file,",
|
||||
required=True,
|
||||
)
|
||||
def start(
|
||||
maintain: bool,
|
||||
improve: bool,
|
||||
|
@ -220,8 +228,9 @@ def start(
|
|||
original_stdout = sys.stdout # Save the original standard output
|
||||
exit_code = None
|
||||
|
||||
|
||||
assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"
|
||||
assert (
|
||||
"benchmark_config.json" in agent_config
|
||||
), "benchmark_config.json must be provided"
|
||||
|
||||
if backend:
|
||||
with open("backend/backend_stdout.txt", "w") as f:
|
||||
|
@ -266,7 +275,9 @@ def start(
|
|||
def version():
|
||||
"""Print the version of the benchmark tool."""
|
||||
current_directory = Path(__file__).resolve().parent
|
||||
version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
|
||||
version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][
|
||||
"version"
|
||||
]
|
||||
print(f"Benchmark Tool Version {version}")
|
||||
|
||||
|
||||
|
|
|
@ -51,7 +51,6 @@ async def run_api_agent(
|
|||
|
||||
artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
|
||||
for artifact in artifacts:
|
||||
|
||||
if artifact.relative_path:
|
||||
folder_path = os.path.join(config["workspace"], artifact.relative_path)
|
||||
else:
|
||||
|
|
|
@ -12,7 +12,6 @@ from typing import Any, List
|
|||
import psutil
|
||||
from dotenv import load_dotenv
|
||||
|
||||
|
||||
load_dotenv()
|
||||
|
||||
helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
|
||||
|
|
|
@ -1,15 +1,22 @@
|
|||
import unittest
|
||||
from url_shortener import shorten_url, retrieve_url
|
||||
|
||||
from url_shortener import retrieve_url, shorten_url
|
||||
|
||||
|
||||
class TestURLShortener(unittest.TestCase):
|
||||
def test_url_retrieval(self):
|
||||
# Shorten the URL to get its shortened form
|
||||
shortened_url = shorten_url('https://www.example.com')
|
||||
shortened_url = shorten_url("https://www.example.com")
|
||||
|
||||
# Retrieve the original URL using the shortened URL directly
|
||||
retrieved_url = retrieve_url(shortened_url)
|
||||
|
||||
self.assertEqual(retrieved_url, 'https://www.example.com', "Retrieved URL does not match the original!")
|
||||
self.assertEqual(
|
||||
retrieved_url,
|
||||
"https://www.example.com",
|
||||
"Retrieved URL does not match the original!",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
@ -3,6 +3,7 @@ import base64
|
|||
|
||||
URL_MAPPING = {}
|
||||
|
||||
|
||||
def shorten_url(url):
|
||||
# Convert the URL to base64
|
||||
encoded_url = base64.b64encode(url.encode()).decode()
|
||||
|
@ -12,13 +13,15 @@ def shorten_url(url):
|
|||
URL_MAPPING[short_url] = url
|
||||
return short_url
|
||||
|
||||
|
||||
def retrieve_url(short_url):
|
||||
return URL_MAPPING.get(short_url, "URL not found")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="URL Shortener")
|
||||
parser.add_argument('-s', '--shorten', type=str, help="URL to be shortened")
|
||||
parser.add_argument('-r', '--retrieve', type=str, help="Short URL to be retrieved")
|
||||
parser.add_argument("-s", "--shorten", type=str, help="URL to be shortened")
|
||||
parser.add_argument("-r", "--retrieve", type=str, help="Short URL to be retrieved")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
@ -32,5 +35,6 @@ def main():
|
|||
else:
|
||||
print("No valid arguments provided.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
@ -1,38 +1,45 @@
|
|||
import pprint
|
||||
|
||||
|
||||
def column(matrix, i):
|
||||
return [row[i] for row in matrix]
|
||||
|
||||
|
||||
def check(list):
|
||||
if len(set(list)) <= 1:
|
||||
if list[0] != 0:
|
||||
return list[0]
|
||||
return None
|
||||
|
||||
|
||||
def checkDiagLeft(board):
|
||||
if (board[0][0] == board[1][1] and board[1][1] == board[2][2]):
|
||||
if board[0][0] == board[1][1] and board[1][1] == board[2][2]:
|
||||
if board[0][0] != 0:
|
||||
return board[0][0]
|
||||
return None
|
||||
|
||||
|
||||
def checkDiagRight(board):
|
||||
if (board[2][0] == board[1][1] and board[1][1] == board[0][2]):
|
||||
if board[2][0] == board[1][1] and board[1][1] == board[0][2]:
|
||||
if board[2][0] != 0:
|
||||
return board[2][0]
|
||||
return None
|
||||
|
||||
|
||||
def placeItem(row, column, board, current_player):
|
||||
if board[row][column] != 0:
|
||||
return None
|
||||
else:
|
||||
board[row][column] = current_player
|
||||
|
||||
|
||||
def swapPlayers(player):
|
||||
if (player == 2):
|
||||
if player == 2:
|
||||
return 1
|
||||
else:
|
||||
return 2
|
||||
|
||||
|
||||
def winner(board):
|
||||
for rowIndex in board:
|
||||
if check(rowIndex) is not None:
|
||||
|
@ -46,23 +53,35 @@ def winner(board):
|
|||
return checkDiagRight(board)
|
||||
return 0
|
||||
|
||||
|
||||
def getLocation():
|
||||
location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
|
||||
location = input(
|
||||
"Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
|
||||
)
|
||||
print(f"\nYou picked {location}")
|
||||
coordinates = [int(x) for x in location.split(',')]
|
||||
while (len(coordinates) != 2 or coordinates[0] < 0 or coordinates[0] > 2 or coordinates[1] < 0 or coordinates[1] > 2):
|
||||
coordinates = [int(x) for x in location.split(",")]
|
||||
while (
|
||||
len(coordinates) != 2
|
||||
or coordinates[0] < 0
|
||||
or coordinates[0] > 2
|
||||
or coordinates[1] < 0
|
||||
or coordinates[1] > 2
|
||||
):
|
||||
print("You inputted a location in an invalid format")
|
||||
location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
|
||||
coordinates = [int(x) for x in location.split(',')]
|
||||
location = input(
|
||||
"Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
|
||||
)
|
||||
coordinates = [int(x) for x in location.split(",")]
|
||||
return coordinates
|
||||
|
||||
|
||||
def gamePlay():
|
||||
num_moves = 0
|
||||
pp = pprint.PrettyPrinter(width=20)
|
||||
current_player = 1
|
||||
board = [[0 for x in range(3)] for x in range(3)]
|
||||
|
||||
while (num_moves < 9 and winner(board) == 0):
|
||||
while num_moves < 9 and winner(board) == 0:
|
||||
print("This is the current board: ")
|
||||
pp.pprint(board)
|
||||
coordinates = getLocation()
|
||||
|
@ -75,5 +94,6 @@ def gamePlay():
|
|||
if winner(board) == 0:
|
||||
print("Draw")
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
gamePlay()
|
||||
|
|
|
@ -1,18 +1,20 @@
|
|||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def run_game_with_inputs(inputs):
|
||||
# Start the game process
|
||||
process = subprocess.Popen(
|
||||
['python', 'tic_tac_toe.py'],
|
||||
["python", "tic_tac_toe.py"],
|
||||
stdin=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Send the input moves one by one
|
||||
output, errors = process.communicate('\n'.join(inputs))
|
||||
output, errors = process.communicate("\n".join(inputs))
|
||||
|
||||
# Print the inputs and outputs
|
||||
print("Inputs:\n", "\n".join(inputs))
|
||||
|
@ -22,14 +24,18 @@ def run_game_with_inputs(inputs):
|
|||
return output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("inputs, expected_output", [
|
||||
@pytest.mark.parametrize(
|
||||
"inputs, expected_output",
|
||||
[
|
||||
(["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
|
||||
(["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
|
||||
(["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw")
|
||||
])
|
||||
(["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"),
|
||||
],
|
||||
)
|
||||
def test_game(inputs, expected_output):
|
||||
output = run_game_with_inputs(inputs)
|
||||
assert expected_output in output
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main()
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
import pytest
|
||||
|
||||
from abstract_class import ShipPlacement, Turn
|
||||
|
||||
from battleship import Battleship
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def battleship_game():
|
||||
return Battleship()
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from abstract_class import ShipPlacement, Turn
|
||||
from pydantic import ValidationError
|
||||
|
||||
|
||||
def test_ship_placement_out_of_bounds(battleship_game):
|
||||
|
@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
|
|||
|
||||
|
||||
def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
|
||||
game = battleship_game.get_game(
|
||||
initialized_game_id
|
||||
)
|
||||
game = battleship_game.get_game(initialized_game_id)
|
||||
additional_ship = ShipPlacement(
|
||||
ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
|
||||
)
|
||||
|
|
|
@ -86,9 +86,7 @@ class Battleship(AbstractBattleship):
|
|||
game.turns.append(turn)
|
||||
|
||||
if hit_ship == "hit":
|
||||
return TurnResponse(
|
||||
result="miss", ship_type=None
|
||||
)
|
||||
return TurnResponse(result="miss", ship_type=None)
|
||||
|
||||
if hit_ship:
|
||||
ship_placement = next(sp for sp in game.ships if sp.ship_type == hit_ship)
|
||||
|
@ -133,9 +131,7 @@ class Battleship(AbstractBattleship):
|
|||
)
|
||||
|
||||
if hits == total_ships_length:
|
||||
return GameStatus(
|
||||
is_game_over=True, winner="player"
|
||||
)
|
||||
return GameStatus(is_game_over=True, winner="player")
|
||||
else:
|
||||
return GameStatus(is_game_over=False, winner=None)
|
||||
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
import pytest
|
||||
|
||||
from abstract_class import ShipPlacement, Turn
|
||||
|
||||
from battleship import Battleship
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def battleship_game():
|
||||
return Battleship()
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from abstract_class import ShipPlacement, Turn
|
||||
from pydantic import ValidationError
|
||||
|
||||
|
||||
def test_ship_placement_out_of_bounds(battleship_game):
|
||||
|
@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
|
|||
|
||||
|
||||
def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
|
||||
game = battleship_game.get_game(
|
||||
initialized_game_id
|
||||
)
|
||||
game = battleship_game.get_game(initialized_game_id)
|
||||
additional_ship = ShipPlacement(
|
||||
ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
|
||||
)
|
||||
|
|
|
@ -16,7 +16,7 @@ from benchmark.reports.reports import (
|
|||
generate_single_call_report,
|
||||
session_finish,
|
||||
)
|
||||
from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig
|
||||
from benchmark.utils.data_types import AgentBenchmarkConfig, SuiteConfig
|
||||
|
||||
GLOBAL_TIMEOUT = (
|
||||
1500 # The tests will stop after 25 minutes so we can send the reports.
|
||||
|
@ -31,16 +31,15 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
|
|||
try:
|
||||
with open(agent_benchmark_config_path, "r") as f:
|
||||
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
|
||||
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
|
||||
agent_benchmark_config.agent_benchmark_config_path = (
|
||||
agent_benchmark_config_path
|
||||
)
|
||||
return agent_benchmark_config
|
||||
except json.JSONDecodeError:
|
||||
print("Error: benchmark_config.json is not a valid JSON file.")
|
||||
raise
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def resolve_workspace(workspace: str) -> str:
|
||||
if workspace.startswith("${") and workspace.endswith("}"):
|
||||
# Extract the string inside ${...}
|
||||
|
@ -65,7 +64,9 @@ def config(request: Any) -> Any:
|
|||
try:
|
||||
with open(agent_benchmark_config_path, "r") as f:
|
||||
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
|
||||
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
|
||||
agent_benchmark_config.agent_benchmark_config_path = (
|
||||
agent_benchmark_config_path
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
print("Error: benchmark_config.json is not a valid JSON file.")
|
||||
raise
|
||||
|
@ -73,8 +74,12 @@ def config(request: Any) -> Any:
|
|||
if isinstance(config["workspace"], str):
|
||||
config["workspace"] = resolve_workspace(agent_benchmark_config.workspace)
|
||||
else: # it's a input output dict
|
||||
config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input")
|
||||
config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output")
|
||||
config["workspace"]["input"] = resolve_workspace(
|
||||
agent_benchmark_config.workspace / "input"
|
||||
)
|
||||
config["workspace"]["output"] = resolve_workspace(
|
||||
agent_benchmark_config.workspace / "output"
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
|
@ -238,9 +243,11 @@ def scores(request: Any) -> None:
|
|||
# this is adding the dependency marker and category markers automatically from the json
|
||||
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
|
||||
try:
|
||||
with open(config.getoption('--agent_config_path'), "r") as f:
|
||||
with open(config.getoption("--agent_config_path"), "r") as f:
|
||||
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
|
||||
agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path')
|
||||
agent_benchmark_config.agent_benchmark_config_path = config.getoption(
|
||||
"--agent_config_path"
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
print("Error: benchmark_config.json is not a valid JSON file.")
|
||||
raise
|
||||
|
|
|
@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, Optional
|
|||
import pytest
|
||||
|
||||
from benchmark.utils.challenge import Challenge
|
||||
from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig
|
||||
from benchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, SuiteConfig
|
||||
from benchmark.utils.utils import get_test_path
|
||||
|
||||
DATA_CATEGORY = {}
|
||||
|
@ -222,7 +222,7 @@ def create_challenge(
|
|||
def generate_tests() -> None: # sourcery skip: invert-any-all
|
||||
print("Generating tests...")
|
||||
|
||||
challenges_path = os.path.join(os.path.dirname(__file__), 'challenges')
|
||||
challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
|
||||
|
||||
json_files = deque(
|
||||
glob.glob(
|
||||
|
@ -239,14 +239,16 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
|
|||
try:
|
||||
with open(agent_benchmark_config_path, "r") as f:
|
||||
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
|
||||
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
|
||||
agent_benchmark_config.agent_benchmark_config_path = (
|
||||
agent_benchmark_config_path
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
print("Error: benchmark_config.json is not a valid JSON file.")
|
||||
raise
|
||||
|
||||
regression_reports_path = agent_benchmark_config.get_regression_reports_path()
|
||||
if regression_reports_path and os.path.exists(regression_reports_path):
|
||||
with open(regression_reports_path, 'r') as f:
|
||||
with open(regression_reports_path, "r") as f:
|
||||
regression_tests = json.load(f)
|
||||
else:
|
||||
regression_tests = {}
|
||||
|
|
|
@ -6,12 +6,13 @@ from datetime import datetime, timezone
|
|||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
from benchmark.__main__ import BENCHMARK_START_TIME
|
||||
from benchmark.reports.processing.graphs import save_single_radar_chart
|
||||
from benchmark.reports.processing.process_report import get_agent_category
|
||||
from benchmark.reports.processing.report_types import Report
|
||||
from benchmark.utils.utils import get_highest_success_difficulty
|
||||
from benchmark.utils.data_types import AgentBenchmarkConfig
|
||||
from benchmark.__main__ import BENCHMARK_START_TIME
|
||||
from benchmark.utils.utils import get_highest_success_difficulty
|
||||
|
||||
|
||||
class ReportManager:
|
||||
"""Abstracts interaction with the regression tests file"""
|
||||
|
@ -24,7 +25,7 @@ class ReportManager:
|
|||
def load(self) -> None:
|
||||
if not os.path.exists(self.filename):
|
||||
os.makedirs(os.path.dirname(self.filename), exist_ok=True)
|
||||
with open(self.filename, 'w') as f:
|
||||
with open(self.filename, "w") as f:
|
||||
pass
|
||||
|
||||
try:
|
||||
|
@ -62,13 +63,12 @@ class ReportManager:
|
|||
self.save()
|
||||
|
||||
def end_info_report(self, config: AgentBenchmarkConfig) -> None:
|
||||
|
||||
command = " ".join(sys.argv)
|
||||
|
||||
self.tests = {
|
||||
"command": command.split(os.sep)[-1],
|
||||
"benchmark_git_commit_sha": '---',
|
||||
"agent_git_commit_sha": '---',
|
||||
"benchmark_git_commit_sha": "---",
|
||||
"agent_git_commit_sha": "---",
|
||||
"completion_time": datetime.now(timezone.utc).strftime(
|
||||
"%Y-%m-%dT%H:%M:%S+00:00"
|
||||
),
|
||||
|
@ -79,7 +79,9 @@ class ReportManager:
|
|||
"total_cost": self.get_total_costs(),
|
||||
},
|
||||
"tests": self.tests,
|
||||
"config": {k: v for k, v in json.loads(config.json()).items() if v is not None},
|
||||
"config": {
|
||||
k: v for k, v in json.loads(config.json()).items() if v is not None
|
||||
},
|
||||
}
|
||||
|
||||
converted_data = Report.parse_obj(self.tests)
|
||||
|
@ -88,7 +90,6 @@ class ReportManager:
|
|||
|
||||
save_single_radar_chart(
|
||||
agent_categories,
|
||||
|
||||
config.get_reports_path() / "radar_chart.png",
|
||||
)
|
||||
|
||||
|
|
|
@ -4,7 +4,13 @@ import sys
|
|||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig
|
||||
from benchmark.reports.ReportManager import ReportManager
|
||||
from benchmark.utils.data_types import (
|
||||
DIFFICULTY_MAP,
|
||||
AgentBenchmarkConfig,
|
||||
DifficultyLevel,
|
||||
SuiteConfig,
|
||||
)
|
||||
from benchmark.utils.get_data_from_helicone import get_data_from_helicone
|
||||
from benchmark.utils.utils import (
|
||||
calculate_success_percentage,
|
||||
|
@ -12,8 +18,6 @@ from benchmark.utils.utils import (
|
|||
get_test_path,
|
||||
replace_backslash,
|
||||
)
|
||||
from benchmark.reports.ReportManager import ReportManager
|
||||
|
||||
|
||||
|
||||
def get_agent_benchmark_config() -> AgentBenchmarkConfig:
|
||||
|
@ -24,23 +28,32 @@ def get_agent_benchmark_config() -> AgentBenchmarkConfig:
|
|||
try:
|
||||
with open(agent_benchmark_config_path, "r") as f:
|
||||
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
|
||||
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
|
||||
agent_benchmark_config.agent_benchmark_config_path = (
|
||||
agent_benchmark_config_path
|
||||
)
|
||||
return agent_benchmark_config
|
||||
except json.JSONDecodeError:
|
||||
print("Error: benchmark_config.json is not a valid JSON file.")
|
||||
raise
|
||||
|
||||
|
||||
def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
|
||||
agent_benchmark_config = get_agent_benchmark_config()
|
||||
# tests that consistently pass are considered regression tests
|
||||
REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path())
|
||||
REGRESSION_MANAGER = ReportManager(
|
||||
agent_benchmark_config.get_regression_reports_path()
|
||||
)
|
||||
|
||||
# print(f"Using {REPORTS_PATH} for reports")
|
||||
# user facing reporting information
|
||||
INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json"))
|
||||
INFO_MANAGER = ReportManager(
|
||||
str(agent_benchmark_config.get_reports_path() / "report.json")
|
||||
)
|
||||
|
||||
# internal db step in replacement track pass/fail rate
|
||||
INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path())
|
||||
INTERNAL_INFO_MANAGER = ReportManager(
|
||||
agent_benchmark_config.get_success_rate_path()
|
||||
)
|
||||
|
||||
return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
|
||||
|
||||
|
@ -132,16 +145,12 @@ def get_previous_test_results(
|
|||
agent_tests: dict[str, list[bool]] = {}
|
||||
mock = "--mock" in sys.argv # Check if --mock is in sys.argv
|
||||
|
||||
prev_test_results = INTERNAL_INFO_MANAGER.tests.get(
|
||||
test_name, []
|
||||
)
|
||||
prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, [])
|
||||
|
||||
if not mock:
|
||||
# only add if it's an actual test
|
||||
prev_test_results.append(info_details["metrics"]["success"])
|
||||
INTERNAL_INFO_MANAGER.add_test(
|
||||
test_name, prev_test_results
|
||||
)
|
||||
INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results)
|
||||
|
||||
# can calculate success rate regardless of mock
|
||||
info_details["metrics"]["success_%"] = calculate_success_percentage(
|
||||
|
@ -199,8 +208,8 @@ def generate_single_call_report(
|
|||
},
|
||||
"answers": answers,
|
||||
}
|
||||
if 'metadata' in challenge_data:
|
||||
info_details['metadata'] = challenge_data['metadata']
|
||||
if "metadata" in challenge_data:
|
||||
info_details["metadata"] = challenge_data["metadata"]
|
||||
|
||||
mock = "--mock" in sys.argv # Check if --mock is in sys.argv
|
||||
|
||||
|
@ -298,9 +307,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
|
|||
}
|
||||
|
||||
for name in suite_file_datum:
|
||||
test_data = INFO_MANAGER.tests[
|
||||
name
|
||||
] # get the individual test reports
|
||||
test_data = INFO_MANAGER.tests[name] # get the individual test reports
|
||||
data[name] = test_data # this is for calculating highest difficulty
|
||||
INFO_MANAGER.remove_test(name)
|
||||
|
||||
|
@ -330,7 +337,6 @@ def session_finish(suite_reports: dict) -> None:
|
|||
|
||||
agent_benchmark_config = get_agent_benchmark_config()
|
||||
|
||||
|
||||
INTERNAL_INFO_MANAGER.save()
|
||||
INFO_MANAGER.end_info_report(agent_benchmark_config)
|
||||
REGRESSION_MANAGER.save()
|
||||
|
|
|
@ -9,14 +9,13 @@ from typing import Any, Optional
|
|||
import click
|
||||
import pytest
|
||||
from helicone.lock import HeliconeLockManager
|
||||
import sys
|
||||
sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark')
|
||||
|
||||
sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark")
|
||||
|
||||
from agbenchmark.reports.ReportManager import ReportManager
|
||||
from agbenchmark.utils.utils import (
|
||||
from agbenchmark.utils.utils import ( # get_git_commit_sha,
|
||||
AGENT_NAME,
|
||||
calculate_dynamic_paths,
|
||||
# get_git_commit_sha,
|
||||
)
|
||||
|
||||
CURRENT_DIRECTORY = Path(__file__).resolve().parent
|
||||
|
@ -334,13 +333,16 @@ def get_regression_data() -> Any:
|
|||
|
||||
return data
|
||||
|
||||
|
||||
@cli.command()
|
||||
def version():
|
||||
"""Print the version of the benchmark tool."""
|
||||
import toml
|
||||
version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
|
||||
print(f"Benchmark Tool Version {version}")
|
||||
|
||||
version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][
|
||||
"version"
|
||||
]
|
||||
print(f"Benchmark Tool Version {version}")
|
||||
|
||||
|
||||
# def run_from_backend(
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
import glob
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
import sys
|
||||
|
||||
from pydantic import BaseModel, root_validator, validator
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
class DifficultyLevel(Enum):
|
||||
interface = "interface"
|
||||
basic = "basic"
|
||||
|
@ -29,6 +32,7 @@ DIFFICULTY_MAP = {
|
|||
|
||||
STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}
|
||||
|
||||
|
||||
def calculate_info_test_path(base_path: Path) -> Path:
|
||||
"""
|
||||
Calculates the path to the directory where the test report will be saved.
|
||||
|
@ -69,6 +73,7 @@ def calculate_info_test_path(base_path: Path) -> Path:
|
|||
|
||||
return report_path
|
||||
|
||||
|
||||
class AgentBenchmarkConfig(BaseModel):
|
||||
"""
|
||||
This class represents the configuration for the Agent Benchmark.
|
||||
|
@ -79,6 +84,7 @@ class AgentBenchmarkConfig(BaseModel):
|
|||
- api_mode: A boolean indicating whether the benchmark is run in API mode.
|
||||
- host: The host where the benchmark is run.
|
||||
"""
|
||||
|
||||
agent_benchmark_config_path: Path | None = None
|
||||
entry_path: Path
|
||||
workspace: Path
|
||||
|
@ -88,19 +94,24 @@ class AgentBenchmarkConfig(BaseModel):
|
|||
|
||||
def get_reports_location(self) -> Path:
|
||||
if not self.reports_folder:
|
||||
self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve()
|
||||
self.reports_folder = (
|
||||
self.agent_benchmark_config_path
|
||||
/ self.entry_path.parent
|
||||
/ ".."
|
||||
/ "reports"
|
||||
).resolve()
|
||||
return self.reports_folder
|
||||
|
||||
def get_reports_path(self) -> Path:
|
||||
return calculate_info_test_path(self.get_reports_location())
|
||||
|
||||
def get_regression_reports_path(self) -> Path:
|
||||
|
||||
return self.get_reports_location() / "regression_tests.json"
|
||||
|
||||
def get_success_rate_path(self) -> Path:
|
||||
return self.get_reports_location() / "success_rate.json"
|
||||
|
||||
|
||||
class Info(BaseModel):
|
||||
difficulty: DifficultyLevel
|
||||
description: str
|
||||
|
|
|
@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None:
|
|||
for action in group.options:
|
||||
current_options += action._short_opts + action._long_opts
|
||||
|
||||
|
||||
group = parser.getgroup("depends")
|
||||
|
||||
# Add a flag to list all names + the tests they resolve to
|
||||
|
|
|
@ -16,9 +16,6 @@ AGENT_NAME = os.getenv("AGENT_NAME")
|
|||
REPORT_LOCATION = os.getenv("REPORT_LOCATION", None)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def replace_backslash(value: Any) -> Any:
|
||||
if isinstance(value, str):
|
||||
return re.sub(
|
||||
|
|
Loading…
Reference in New Issue