Fixing benchmark code

pull/5194/head
SwiftyOS 2023-09-11 17:24:23 +02:00 committed by Merwane Hamadi
parent c73e90c4e6
commit ef2107d9c2
21 changed files with 177 additions and 119 deletions

View File

@@ -9,11 +9,10 @@ from typing import Any
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from agbenchmark.utils.utils import find_absolute_benchmark_path
from fastapi import FastAPI, Query
from fastapi.middleware.cors import CORSMiddleware
from agbenchmark.utils.utils import find_absolute_benchmark_path
app = FastAPI()
origins = ["http://localhost:3000"]

View File

@@ -5,10 +5,10 @@ import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional
import toml
import click
import pytest
import toml
from helicone.lock import HeliconeLockManager
from benchmark.utils.data_types import AgentBenchmarkConfig
@@ -72,7 +72,9 @@ def run_benchmark(
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
return 1
@@ -97,8 +99,9 @@ def run_benchmark(
)
return 1
assert not(agent_benchmark_config.api_mode and not agent_benchmark_config.host), \
"Error: host needs to be added to the config if api_mode is set to True."
assert not (
agent_benchmark_config.api_mode and not agent_benchmark_config.host
), "Error: host needs to be added to the config if api_mode is set to True."
print("Current configuration:")
for key, value in vars(agent_benchmark_config).items():
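The load-and-annotate steps in this hunk recur nearly verbatim in several hunks below. For reference, a minimal standalone sketch of the pattern as a helper; the helper name is hypothetical and not something this commit introduces:

import json
from pathlib import Path

from benchmark.utils.data_types import AgentBenchmarkConfig


def load_agent_benchmark_config(path: Path) -> AgentBenchmarkConfig:
    # Parse benchmark_config.json into the pydantic model and remember
    # the path it was loaded from, mirroring the repeated pattern above.
    with open(path, "r") as f:
        config = AgentBenchmarkConfig(**json.load(f))
    config.agent_benchmark_config_path = path
    return config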
@@ -200,7 +203,12 @@ def cli() -> None:
)
@click.option("--nc", is_flag=True, help="Run without cutoff")
@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
@click.option("--agent-config", type=click.Path(exists=True), help="Path to the agent benchmark_config.json file,", required=True)
@click.option(
"--agent-config",
type=click.Path(exists=True),
help="Path to the agent benchmark_config.json file,",
required=True,
)
def start(
maintain: bool,
improve: bool,
@@ -220,8 +228,9 @@ def start(
original_stdout = sys.stdout # Save the original standard output
exit_code = None
assert "benchmark_config.json" in agent_config, "benchmark_config.json must be provided"
assert (
"benchmark_config.json" in agent_config
), "benchmark_config.json must be provided"
if backend:
with open("backend/backend_stdout.txt", "w") as f:
@@ -266,7 +275,9 @@ def start(
def version():
"""Print the version of the benchmark tool."""
current_directory = Path(__file__).resolve().parent
version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][
"version"
]
print(f"Benchmark Tool Version {version}")

View File

@@ -51,7 +51,6 @@ async def run_api_agent(
artifacts = await api_instance.list_agent_task_artifacts(task_id=task_id)
for artifact in artifacts:
if artifact.relative_path:
folder_path = os.path.join(config["workspace"], artifact.relative_path)
else:

View File

@@ -12,7 +12,6 @@ from typing import Any, List
import psutil
from dotenv import load_dotenv
load_dotenv()
helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")

View File

@@ -1,15 +1,22 @@
import unittest
from url_shortener import shorten_url, retrieve_url
from url_shortener import retrieve_url, shorten_url
class TestURLShortener(unittest.TestCase):
def test_url_retrieval(self):
# Shorten the URL to get its shortened form
shortened_url = shorten_url('https://www.example.com')
shortened_url = shorten_url("https://www.example.com")
# Retrieve the original URL using the shortened URL directly
retrieved_url = retrieve_url(shortened_url)
self.assertEqual(retrieved_url, 'https://www.example.com', "Retrieved URL does not match the original!")
self.assertEqual(
retrieved_url,
"https://www.example.com",
"Retrieved URL does not match the original!",
)
if __name__ == "__main__":
unittest.main()

View File

@@ -3,6 +3,7 @@ import base64
URL_MAPPING = {}
def shorten_url(url):
# Convert the URL to base64
encoded_url = base64.b64encode(url.encode()).decode()
@@ -12,13 +13,15 @@ def shorten_url(url):
URL_MAPPING[short_url] = url
return short_url
def retrieve_url(short_url):
return URL_MAPPING.get(short_url, "URL not found")
def main():
parser = argparse.ArgumentParser(description="URL Shortener")
parser.add_argument('-s', '--shorten', type=str, help="URL to be shortened")
parser.add_argument('-r', '--retrieve', type=str, help="Short URL to be retrieved")
parser.add_argument("-s", "--shorten", type=str, help="URL to be shortened")
parser.add_argument("-r", "--retrieve", type=str, help="Short URL to be retrieved")
args = parser.parse_args()
@@ -32,5 +35,6 @@ def main():
else:
print("No valid arguments provided.")
if __name__ == "__main__":
main()
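The hunk above cuts off how short_url is derived from the base64 encoding. A self-contained sketch of one plausible scheme, assuming the key is a truncated prefix of the encoding (the prefix length is made up):

import base64

URL_MAPPING = {}


def shorten_url(url):
    # Encode the URL and keep a short prefix as the lookup key (assumed derivation).
    encoded_url = base64.b64encode(url.encode()).decode()
    short_url = encoded_url[:8]
    URL_MAPPING[short_url] = url
    return short_url


def retrieve_url(short_url):
    # Fall back to the sentinel string shown in the hunk above when the key is unknown.
    return URL_MAPPING.get(short_url, "URL not found")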

View File

@@ -1,38 +1,45 @@
import pprint
def column(matrix, i):
return [row[i] for row in matrix]
def check(list):
if len(set(list)) <= 1:
if list[0] != 0:
return list[0]
return None
def checkDiagLeft(board):
if (board[0][0] == board[1][1] and board[1][1] == board[2][2]):
if board[0][0] == board[1][1] and board[1][1] == board[2][2]:
if board[0][0] != 0:
return board[0][0]
return None
def checkDiagRight(board):
if (board[2][0] == board[1][1] and board[1][1] == board[0][2]):
if board[2][0] == board[1][1] and board[1][1] == board[0][2]:
if board[2][0] != 0:
return board[2][0]
return None
def placeItem(row, column, board, current_player):
if board[row][column] != 0:
return None
else:
board[row][column] = current_player
def swapPlayers(player):
if (player == 2):
if player == 2:
return 1
else:
return 2
def winner(board):
for rowIndex in board:
if check(rowIndex) is not None:
@@ -46,23 +53,35 @@ def winner(board):
return checkDiagRight(board)
return 0
def getLocation():
location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
location = input(
"Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
)
print(f"\nYou picked {location}")
coordinates = [int(x) for x in location.split(',')]
while (len(coordinates) != 2 or coordinates[0] < 0 or coordinates[0] > 2 or coordinates[1] < 0 or coordinates[1] > 2):
coordinates = [int(x) for x in location.split(",")]
while (
len(coordinates) != 2
or coordinates[0] < 0
or coordinates[0] > 2
or coordinates[1] < 0
or coordinates[1] > 2
):
print("You inputted a location in an invalid format")
location = input("Choose where to play. Enter two numbers separated by a comma, for example: 1,1 ")
coordinates = [int(x) for x in location.split(',')]
location = input(
"Choose where to play. Enter two numbers separated by a comma, for example: 1,1 "
)
coordinates = [int(x) for x in location.split(",")]
return coordinates
def gamePlay():
num_moves = 0
pp = pprint.PrettyPrinter(width=20)
current_player = 1
board = [[0 for x in range(3)] for x in range(3)]
while (num_moves < 9 and winner(board) == 0):
while num_moves < 9 and winner(board) == 0:
print("This is the current board: ")
pp.pprint(board)
coordinates = getLocation()
@@ -75,5 +94,6 @@ def gamePlay():
if winner(board) == 0:
print("Draw")
if __name__ == '__main__':
if __name__ == "__main__":
gamePlay()

View File

@@ -1,18 +1,20 @@
import subprocess
import pytest
def run_game_with_inputs(inputs):
# Start the game process
process = subprocess.Popen(
['python', 'tic_tac_toe.py'],
["python", "tic_tac_toe.py"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
text=True,
)
# Send the input moves one by one
output, errors = process.communicate('\n'.join(inputs))
output, errors = process.communicate("\n".join(inputs))
# Print the inputs and outputs
print("Inputs:\n", "\n".join(inputs))
@@ -22,14 +24,18 @@ def run_game_with_inputs(inputs):
return output
@pytest.mark.parametrize("inputs, expected_output", [
(["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
(["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
(["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw")
])
@pytest.mark.parametrize(
"inputs, expected_output",
[
(["0,0", "1,0", "0,1", "1,1", "0,2"], "Player 1 won!"),
(["1,0", "0,0", "1,1", "0,1", "2,0", "0,2"], "Player 2 won!"),
(["0,0", "0,1", "0,2", "1,1", "1,0", "1,2", "2,1", "2,0", "2,2"], "Draw"),
],
)
def test_game(inputs, expected_output):
output = run_game_with_inputs(inputs)
assert expected_output in output
if __name__ == '__main__':
if __name__ == "__main__":
pytest.main()

View File

@@ -1,9 +1,8 @@
import pytest
from abstract_class import ShipPlacement, Turn
from battleship import Battleship
@pytest.fixture
def battleship_game():
return Battleship()

View File

@@ -1,7 +1,6 @@
import pytest
from pydantic import ValidationError
from abstract_class import ShipPlacement, Turn
from pydantic import ValidationError
def test_ship_placement_out_of_bounds(battleship_game):
@@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
game = battleship_game.get_game(
initialized_game_id
)
game = battleship_game.get_game(initialized_game_id)
additional_ship = ShipPlacement(
ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
)

View File

@@ -86,9 +86,7 @@ class Battleship(AbstractBattleship):
game.turns.append(turn)
if hit_ship == "hit":
return TurnResponse(
result="miss", ship_type=None
)
return TurnResponse(result="miss", ship_type=None)
if hit_ship:
ship_placement = next(sp for sp in game.ships if sp.ship_type == hit_ship)
@@ -133,9 +131,7 @@ class Battleship(AbstractBattleship):
)
if hits == total_ships_length:
return GameStatus(
is_game_over=True, winner="player"
)
return GameStatus(is_game_over=True, winner="player")
else:
return GameStatus(is_game_over=False, winner=None)

View File

@@ -1,9 +1,8 @@
import pytest
from abstract_class import ShipPlacement, Turn
from battleship import Battleship
@pytest.fixture
def battleship_game():
return Battleship()

View File

@@ -1,7 +1,6 @@
import pytest
from pydantic import ValidationError
from abstract_class import ShipPlacement, Turn
from pydantic import ValidationError
def test_ship_placement_out_of_bounds(battleship_game):
@@ -51,9 +50,7 @@ def test_cant_hit_before_ships_placed(battleship_game):
def test_cant_place_ship_after_all_ships_placed(battleship_game, initialized_game_id):
game = battleship_game.get_game(
initialized_game_id
)
game = battleship_game.get_game(initialized_game_id)
additional_ship = ShipPlacement(
ship_type="carrier", start={"row": 2, "column": "E"}, direction="horizontal"
)

View File

@@ -16,7 +16,7 @@ from benchmark.reports.reports import (
generate_single_call_report,
session_finish,
)
from benchmark.utils.data_types import SuiteConfig, AgentBenchmarkConfig
from benchmark.utils.data_types import AgentBenchmarkConfig, SuiteConfig
GLOBAL_TIMEOUT = (
1500 # The tests will stop after 25 minutes so we can send the reports.
@@ -31,16 +31,15 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
return agent_benchmark_config
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise
def resolve_workspace(workspace: str) -> str:
if workspace.startswith("${") and workspace.endswith("}"):
# Extract the string inside ${...}
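The body of resolve_workspace is cut off at the hunk boundary. A minimal sketch of one way such expansion could work, assuming the ${...} placeholder names an environment variable (this is an assumption, not necessarily the benchmark's actual logic):

import os


def resolve_workspace(workspace: str) -> str:
    if workspace.startswith("${") and workspace.endswith("}"):
        # Assumed behaviour: extract NAME from "${NAME}" and look it up in the environment.
        env_name = workspace[2:-1]
        return os.environ.get(env_name, workspace)
    return workspace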
@@ -65,7 +64,9 @@ def config(request: Any) -> Any:
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise
@@ -73,8 +74,12 @@
if isinstance(config["workspace"], str):
config["workspace"] = resolve_workspace(agent_benchmark_config.workspace)
else: # it's a input output dict
config["workspace"]["input"] = resolve_workspace(agent_benchmark_config.workspace / "input")
config["workspace"]["output"] = resolve_workspace(agent_benchmark_config.workspace / "output")
config["workspace"]["input"] = resolve_workspace(
agent_benchmark_config.workspace / "input"
)
config["workspace"]["output"] = resolve_workspace(
agent_benchmark_config.workspace / "output"
)
return config
@@ -238,9 +243,11 @@ def scores(request: Any) -> None:
# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
try:
with open(config.getoption('--agent_config_path'), "r") as f:
with open(config.getoption("--agent_config_path"), "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = config.getoption('--agent_config_path')
agent_benchmark_config.agent_benchmark_config_path = config.getoption(
"--agent_config_path"
)
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise
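The comment above this hunk notes that the hook adds dependency and category markers automatically from the challenge JSON. A hedged sketch of what such a hook can look like; the CATEGORIES attribute is illustrative, not the benchmark's real field:

import pytest


def pytest_collection_modifyitems(items, config) -> None:
    for item in items:
        challenge_cls = getattr(item, "cls", None)
        # Hypothetical: category markers read off the collected test class.
        for category in getattr(challenge_cls, "CATEGORIES", []):
            item.add_marker(category)  # pytest accepts a plain string as a marker name
        # Hypothetical: a dependency marker keyed by the test's own name.
        item.add_marker(pytest.mark.depends(name=item.name))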

View File

@@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, Optional
import pytest
from benchmark.utils.challenge import Challenge
from benchmark.utils.data_types import ChallengeData, SuiteConfig, AgentBenchmarkConfig
from benchmark.utils.data_types import AgentBenchmarkConfig, ChallengeData, SuiteConfig
from benchmark.utils.utils import get_test_path
DATA_CATEGORY = {}
@@ -222,7 +222,7 @@ def create_challenge(
def generate_tests() -> None: # sourcery skip: invert-any-all
print("Generating tests...")
challenges_path = os.path.join(os.path.dirname(__file__), 'challenges')
challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
json_files = deque(
glob.glob(
@@ -239,14 +239,16 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise
regression_reports_path = agent_benchmark_config.get_regression_reports_path()
if regression_reports_path and os.path.exists(regression_reports_path):
with open(regression_reports_path, 'r') as f:
with open(regression_reports_path, "r") as f:
regression_tests = json.load(f)
else:
regression_tests = {}

View File

@@ -6,12 +6,13 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict
from benchmark.__main__ import BENCHMARK_START_TIME
from benchmark.reports.processing.graphs import save_single_radar_chart
from benchmark.reports.processing.process_report import get_agent_category
from benchmark.reports.processing.report_types import Report
from benchmark.utils.utils import get_highest_success_difficulty
from benchmark.utils.data_types import AgentBenchmarkConfig
from benchmark.__main__ import BENCHMARK_START_TIME
from benchmark.utils.utils import get_highest_success_difficulty
class ReportManager:
"""Abstracts interaction with the regression tests file"""
@@ -24,7 +25,7 @@ class ReportManager:
def load(self) -> None:
if not os.path.exists(self.filename):
os.makedirs(os.path.dirname(self.filename), exist_ok=True)
with open(self.filename, 'w') as f:
with open(self.filename, "w") as f:
pass
try:
@@ -62,13 +63,12 @@ class ReportManager:
self.save()
def end_info_report(self, config: AgentBenchmarkConfig) -> None:
command = " ".join(sys.argv)
self.tests = {
"command": command.split(os.sep)[-1],
"benchmark_git_commit_sha": '---',
"agent_git_commit_sha": '---',
"benchmark_git_commit_sha": "---",
"agent_git_commit_sha": "---",
"completion_time": datetime.now(timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S+00:00"
),
@@ -79,7 +79,9 @@ class ReportManager:
"total_cost": self.get_total_costs(),
},
"tests": self.tests,
"config": {k: v for k, v in json.loads(config.json()).items() if v is not None},
"config": {
k: v for k, v in json.loads(config.json()).items() if v is not None
},
}
converted_data = Report.parse_obj(self.tests)
@@ -88,7 +90,6 @@ class ReportManager:
save_single_radar_chart(
agent_categories,
config.get_reports_path() / "radar_chart.png",
)

View File

@@ -4,7 +4,13 @@ import sys
from pathlib import Path
from typing import Any, Dict
from benchmark.utils.data_types import DIFFICULTY_MAP, DifficultyLevel, SuiteConfig, AgentBenchmarkConfig
from benchmark.reports.ReportManager import ReportManager
from benchmark.utils.data_types import (
DIFFICULTY_MAP,
AgentBenchmarkConfig,
DifficultyLevel,
SuiteConfig,
)
from benchmark.utils.get_data_from_helicone import get_data_from_helicone
from benchmark.utils.utils import (
calculate_success_percentage,
@@ -12,8 +18,6 @@ from benchmark.utils.utils import (
get_test_path,
replace_backslash,
)
from benchmark.reports.ReportManager import ReportManager
def get_agent_benchmark_config() -> AgentBenchmarkConfig:
@@ -24,23 +28,32 @@ def get_agent_benchmark_config() -> AgentBenchmarkConfig:
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
agent_benchmark_config.agent_benchmark_config_path = agent_benchmark_config_path
agent_benchmark_config.agent_benchmark_config_path = (
agent_benchmark_config_path
)
return agent_benchmark_config
except json.JSONDecodeError:
print("Error: benchmark_config.json is not a valid JSON file.")
raise
def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
agent_benchmark_config = get_agent_benchmark_config()
# tests that consistently pass are considered regression tests
REGRESSION_MANAGER = ReportManager(agent_benchmark_config.get_regression_reports_path())
REGRESSION_MANAGER = ReportManager(
agent_benchmark_config.get_regression_reports_path()
)
# print(f"Using {REPORTS_PATH} for reports")
# user facing reporting information
INFO_MANAGER = ReportManager(str(agent_benchmark_config.get_reports_path() / "report.json"))
INFO_MANAGER = ReportManager(
str(agent_benchmark_config.get_reports_path() / "report.json")
)
# internal db step in replacement track pass/fail rate
INTERNAL_INFO_MANAGER = ReportManager(agent_benchmark_config.get_success_rate_path())
INTERNAL_INFO_MANAGER = ReportManager(
agent_benchmark_config.get_success_rate_path()
)
return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
@@ -132,16 +145,12 @@ def get_previous_test_results(
agent_tests: dict[str, list[bool]] = {}
mock = "--mock" in sys.argv # Check if --mock is in sys.argv
prev_test_results = INTERNAL_INFO_MANAGER.tests.get(
test_name, []
)
prev_test_results = INTERNAL_INFO_MANAGER.tests.get(test_name, [])
if not mock:
# only add if it's an actual test
prev_test_results.append(info_details["metrics"]["success"])
INTERNAL_INFO_MANAGER.add_test(
test_name, prev_test_results
)
INTERNAL_INFO_MANAGER.add_test(test_name, prev_test_results)
# can calculate success rate regardless of mock
info_details["metrics"]["success_%"] = calculate_success_percentage(
@@ -199,8 +208,8 @@ def generate_single_call_report(
},
"answers": answers,
}
if 'metadata' in challenge_data:
info_details['metadata'] = challenge_data['metadata']
if "metadata" in challenge_data:
info_details["metadata"] = challenge_data["metadata"]
mock = "--mock" in sys.argv # Check if --mock is in sys.argv
@@ -298,9 +307,7 @@ def generate_separate_suite_reports(suite_reports: dict) -> None:
}
for name in suite_file_datum:
test_data = INFO_MANAGER.tests[
name
] # get the individual test reports
test_data = INFO_MANAGER.tests[name] # get the individual test reports
data[name] = test_data # this is for calculating highest difficulty
INFO_MANAGER.remove_test(name)
@@ -330,7 +337,6 @@ def session_finish(suite_reports: dict) -> None:
agent_benchmark_config = get_agent_benchmark_config()
INTERNAL_INFO_MANAGER.save()
INFO_MANAGER.end_info_report(agent_benchmark_config)
REGRESSION_MANAGER.save()

View File

@@ -9,14 +9,13 @@ from typing import Any, Optional
import click
import pytest
from helicone.lock import HeliconeLockManager
import sys
sys.path.append('/Users/swifty/dev/Auto-GPT/benchmark')
sys.path.append("/Users/swifty/dev/Auto-GPT/benchmark")
from agbenchmark.reports.ReportManager import ReportManager
from agbenchmark.utils.utils import (
from agbenchmark.utils.utils import ( # get_git_commit_sha,
AGENT_NAME,
calculate_dynamic_paths,
# get_git_commit_sha,
)
CURRENT_DIRECTORY = Path(__file__).resolve().parent
@@ -34,8 +33,8 @@ if os.environ.get("HELICONE_API_KEY"):
SUCCESS_RATE_PATH,
CHALLENGES_PATH,
) = calculate_dynamic_paths()
BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY)
BENCHMARK_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY / ".." / "..")
AGENT_GIT_COMMIT_SHA = "---" # get_git_commit_sha(HOME_DIRECTORY)
# open a file in the challenges/optional_categories
with open(
Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
@@ -334,13 +333,16 @@ def get_regression_data() -> Any:
return data
@cli.command()
def version():
"""Print the version of the benchmark tool."""
import toml
version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"]["version"]
print(f"Benchmark Tool Version {version}")
version = toml.load(CURRENT_DIRECTORY / ".." / "pyproject.toml")["tool"]["poetry"][
"version"
]
print(f"Benchmark Tool Version {version}")
# def run_from_backend(

View File

@@ -1,11 +1,14 @@
import glob
import json
import sys
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
import sys
from pydantic import BaseModel, root_validator, validator
from datetime import datetime, timezone
class DifficultyLevel(Enum):
interface = "interface"
basic = "basic"
@@ -29,6 +32,7 @@ DIFFICULTY_MAP = {
STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}
def calculate_info_test_path(base_path: Path) -> Path:
"""
Calculates the path to the directory where the test report will be saved.
@@ -69,6 +73,7 @@ def calculate_info_test_path(base_path: Path) -> Path:
return report_path
class AgentBenchmarkConfig(BaseModel):
"""
This class represents the configuration for the Agent Benchmark.
@@ -79,6 +84,7 @@ class AgentBenchmarkConfig(BaseModel):
- api_mode: A boolean indicating whether the benchmark is run in API mode.
- host: The host where the benchmark is run.
"""
agent_benchmark_config_path: Path | None = None
entry_path: Path
workspace: Path
@@ -88,19 +94,24 @@ def get_reports_location(self) -> Path:
def get_reports_location(self) -> Path:
if not self.reports_folder:
self.reports_folder = (self.agent_benchmark_config_path / self.entry_path.parent / ".." / "reports").resolve()
self.reports_folder = (
self.agent_benchmark_config_path
/ self.entry_path.parent
/ ".."
/ "reports"
).resolve()
return self.reports_folder
def get_reports_path(self) -> Path:
return calculate_info_test_path(self.get_reports_location())
def get_regression_reports_path(self) -> Path:
return self.get_reports_location() / "regression_tests.json"
def get_success_rate_path(self) -> Path:
return self.get_reports_location() / "success_rate.json"
class Info(BaseModel):
difficulty: DifficultyLevel
description: str
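To make the path helpers above concrete, a short sketch of how the report locations compose; the input paths are hypothetical placeholders:

from pathlib import Path

# Hypothetical stand-ins for the config fields above.
agent_benchmark_config_path = Path("agent/benchmark_config.json")
entry_path = Path("agbenchmark/benchmarks.py")

# Mirrors get_reports_location(): <config path> / <entry_path parent> / ".." / "reports", resolved.
reports_folder = (agent_benchmark_config_path / entry_path.parent / ".." / "reports").resolve()

regression_reports_path = reports_folder / "regression_tests.json"  # get_regression_reports_path()
success_rate_path = reports_folder / "success_rate.json"  # get_success_rate_path()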

View File

@@ -67,7 +67,6 @@ def pytest_addoption(parser: Parser) -> None:
for action in group.options:
current_options += action._short_opts + action._long_opts
group = parser.getgroup("depends")
# Add a flag to list all names + the tests they resolve to

View File

@@ -16,9 +16,6 @@ AGENT_NAME = os.getenv("AGENT_NAME")
REPORT_LOCATION = os.getenv("REPORT_LOCATION", None)
def replace_backslash(value: Any) -> Any:
if isinstance(value, str):
return re.sub(