more benchmark fixes

parent d3067c83d3
commit 1281a264f5
@@ -3,3 +3,48 @@
 # pydevd_pycharm.settrace(
 #     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
 # )
+from .utils.data_types import AgentBenchmarkConfig
+import sys
+import json
+from .reports.ReportManager import ReportManager
+
+
+def get_agent_benchmark_config() -> AgentBenchmarkConfig:
+    if "--agent-config" in sys.argv:
+        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
+    else:
+        print(sys.argv)
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
+            return agent_benchmark_config
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
+
+
+def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
+    agent_benchmark_config = get_agent_benchmark_config()
+    # tests that consistently pass are considered regression tests
+    REGRESSION_MANAGER = ReportManager(
+        agent_benchmark_config.get_regression_reports_path()
+    )
+
+    # print(f"Using {REPORTS_PATH} for reports")
+    # user facing reporting information
+    INFO_MANAGER = ReportManager(
+        str(agent_benchmark_config.get_reports_path() / "report.json")
+    )
+
+    # internal db step in replacement track pass/fail rate
+    INTERNAL_INFO_MANAGER = ReportManager(
+        agent_benchmark_config.get_success_rate_path()
+    )
+
+    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
+
+
+(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
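The loader above is driven by a --agent-config argument pointing at the agent's benchmark config JSON. A rough illustration follows; the field names come from the AgentBenchmarkConfig changes further down in this diff, while the file name, values, and invocation are made-up examples:

    import json
    from pathlib import Path

    # A minimal config the loader above would accept; judging by the fields
    # shown later in this diff, only entry_path and workspace lack defaults.
    example_config = {
        "entry_path": "benchmark_entry.py",  # agent's benchmark entry point, relative to this file
        "workspace": "workspace",            # directory the agent writes its artifacts into
    }
    Path("benchmark_config.json").write_text(json.dumps(example_config, indent=2))
    # then e.g.: python -m benchmark --agent-config ./benchmark_config.json  (module name assumed)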
@@ -12,6 +12,8 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv
+
+from benchmark.utils.data_types import AgentBenchmarkConfig

 load_dotenv()

 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
@@ -72,20 +74,21 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
         process.terminate()


-def run_agent(task: str, timeout: int) -> None:
+def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
     """Calling to get a response"""

-    entry_path = "benchmark.benchmarks"
+    entry_path = agent_config.get_agent_entry_path()
     print(f"Running '{entry_path}' with timeout {timeout}")

-    command = [sys.executable, "-m", entry_path, str(task)]
+    command = [sys.executable, entry_path, str(task)]


     process = subprocess.Popen(
         command,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=agent_config.get_agent_directory(),
         bufsize=1,
     )
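For clarity, the agent invocation changes roughly as follows; the path and task text below are made-up examples, and the real values come from the AgentBenchmarkConfig instance:

    import sys

    entry_path = "/home/user/my_agent/benchmark_entry.py"  # assumed result of get_agent_entry_path()
    task = "Write the word 'Washington' to a .txt file"    # example task text

    # old command: [sys.executable, "-m", "benchmark.benchmarks", task], run from the benchmark's home directory
    # new command: [sys.executable, entry_path, task],                  run from the agent's own directory
    command = [sys.executable, entry_path, task]
    print(command)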
@@ -51,7 +51,7 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
         raise


-def resolve_workspace(workspace: str) -> str:
+def resolve_workspace(workspace: Path) -> Path:
     """
     This function resolves the workspace path.
@@ -64,7 +64,7 @@ def resolve_workspace(workspace: str) -> str:
     Raises:
         ValueError: If the workspace path expression is invalid.
     """
-    if workspace.startswith("${") and workspace.endswith("}"):
+    if isinstance(workspace, str) and workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}
         path_expr = workspace[2:-1]
@@ -77,8 +77,10 @@ def resolve_workspace(workspace: str) -> str:
             return path_value
         else:
             raise ValueError("Invalid workspace path expression.")
+    elif isinstance(workspace, Path):
+        return os.path.abspath(workspace)
     else:
-        return os.path.abspath(Path(os.getcwd()) / workspace)
+        raise ValueError("Invalid workspace type. Expected str or Path.")


 @pytest.fixture(scope="module")
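In isolation, the new type dispatch behaves roughly like the simplified stand-in below; it is not the real helper, and the ${...} expression branch is stubbed out because its body lies outside this hunk:

    import os
    from pathlib import Path
    from typing import Union

    def _resolve_sketch(workspace: Union[str, Path]) -> Path:
        # strings are only accepted as "${...}" path expressions
        if isinstance(workspace, str) and workspace.startswith("${") and workspace.endswith("}"):
            raise NotImplementedError("expression handling lives in the real resolve_workspace")
        elif isinstance(workspace, Path):
            return Path(os.path.abspath(workspace))
        raise ValueError("Invalid workspace type. Expected str or Path.")

    print(_resolve_sketch(Path("workspace")))  # absolute path to ./workspace
    # _resolve_sketch(42) would raise ValueError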
@@ -98,6 +100,7 @@ def config(request: Any) -> Any:
         json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
     """
     agent_benchmark_config_path = request.config.getoption("--agent_config_path")
+    config = {'workspace': {}}
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
@@ -186,19 +189,19 @@ def pytest_addoption(parser: Any) -> None:
     Args:
         parser (Any): The parser object to which the command-line options are added.
     """
-    parser.addoption("--agent_config_path", action="store_true", default=False)
+    parser.addoption("--agent_config_path", action="store", default=False)
     parser.addoption("--no_dep", action="store_true", default=False)
     parser.addoption("--suite", action="store_true", default=False)
     parser.addoption("--mock", action="store_true", default=False)
     parser.addoption("--api_mode", action="store_true", default=False)
     parser.addoption("--host", action="store_true", default=None)
     parser.addoption("--category", action="store_true", default=False)
     parser.addoption("--nc", action="store_true", default=False)
     parser.addoption("--cutoff", action="store_true", default=False)
     parser.addoption("--category", action="store_true", default=False)
     parser.addoption("--test", action="store_true", default=None)
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
     parser.addoption("--test", action="store_true", default=None)
     parser.addoption("--no_dep", action="store_true", default=False)
     parser.addoption("--suite", action="store_true", default=False)


 @pytest.fixture(autouse=True)
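With action="store", --agent_config_path now consumes a path value instead of acting as a boolean flag. A hypothetical programmatic invocation, assuming it is run from the benchmark repository root so this conftest registers the options; the config path is a made-up example:

    import pytest

    pytest.main(
        [
            "--agent_config_path", "agent/benchmark_config.json",
            "--mock",
            "--category",
        ]
    )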
@@ -433,7 +436,8 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

-    data = json.loads(agent_benchmark_config.get_regression_reports_path())
+    regression_file = agent_benchmark_config.get_regression_reports_path()
+    data = json.loads(open(regression_file, 'r').read()) if os.path.exists(regression_file) else {}

     for item in items:
         # Assuming item.cls is your test class
@@ -444,19 +448,19 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:

             # Then you can access your properties
             name = item.parent.cls.__name__
-            dependencies = test_class_instance.data.dependencies
+            # dependencies = test_class_instance.data.dependencies

             # Filter dependencies if they exist in regression data if its an improvement test
-            if config.getoption("--improve") or config.getoption(
-                "--category"
-            ):  # TODO: same task suite
-                dependencies = [dep for dep in dependencies if not data.get(dep, None)]
-            if (  # TODO: separate task suite
-                config.getoption("--test")
-                or config.getoption("--no_dep")
-                or config.getoption("--maintain")
-            ):
-                dependencies = []
+            # if config.getoption("--improve") or config.getoption(
+            #     "--category"
+            # ):  # TODO: same task suite
+            #     dependencies = [dep for dep in dependencies if not data.get(dep, None)]
+            # if (  # TODO: separate task suite
+            #     config.getoption("--test")
+            #     or config.getoption("--no_dep")
+            #     or config.getoption("--maintain")
+            # ):
+            dependencies = []

             # Add depends marker dynamically
             item.add_marker(pytest.mark.depends(on=dependencies, name=name))
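For context, the depends marker attached here appears to come from the pytest-depends plugin. A tiny standalone illustration of the name=/on= pairing, with made-up test names:

    import pytest

    @pytest.mark.depends(name="TestWriteFile")
    def test_write_file() -> None:
        assert True

    @pytest.mark.depends(on=["TestWriteFile"])
    def test_read_file() -> None:
        # skipped automatically by the plugin if TestWriteFile fails
        assert True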
@@ -72,8 +72,8 @@ def create_single_test(
     # Define test class dynamically
     challenge_class = types.new_class(data["name"], (Challenge,))
+    print(challenge_location)
-    clean_challenge_location = get_test_path(challenge_location)
-    setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location)
+    # clean_challenge_location = get_test_path(challenge_location)
+    setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)

     # in the case of a suite
     if isinstance(challenge_data, ChallengeData):
@@ -86,7 +86,7 @@ def create_single_test(
     setattr(
         challenge_class,
         "_data_cache",
-        {clean_challenge_location: challenge_data},
+        {challenge_location: challenge_data},
     )

     setattr(
@@ -161,6 +161,7 @@ def create_challenge(
     json_files: deque,
 ) -> deque:
     path = Path(json_file).resolve()
+    print("Creating challenge for", path)
     if suite_config is not None:
         grandparent_dir = path.parent.parent
@@ -212,6 +213,7 @@ def create_challenge(

     else:
         create_single_test(data, str(path))
+    print("Creation complete for", path)

     return json_files
@@ -223,6 +225,7 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")

     challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
+    print(f"Looking for challenges in {challenges_path}...")

     json_files = deque(
         glob.glob(
@@ -231,7 +234,8 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
         )
     )

-    agent_benchmark_config_path = Path.cwd() / "agbenchmark" / "config.json"
+    print(f"Found {len(json_files)} challenges.")
+    print(f"Sample path: {json_files[0]}")

     if "--agent-config" in sys.argv:
         agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
@@ -266,7 +270,7 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
         suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))

         commands = sys.argv
         # --category flag
         # --by flag
         if "--category" in commands:
             categories = data.get("category", [])
             commands_set = set(commands)
@@ -317,6 +321,7 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
             print(f"Generated suite for {suite_config.prefix}.")
         else:
             print(f"Generated test for {data['name']}.")
+    print("Test generation complete.")


 def challenge_should_be_ignored(json_file):
@@ -18,47 +18,7 @@ from benchmark.utils.utils import (
     get_test_path,
     replace_backslash,
 )
-
-
-def get_agent_benchmark_config() -> AgentBenchmarkConfig:
-    if "--agent-config" in sys.argv:
-        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
-    else:
-        print(sys.argv)
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-            return agent_benchmark_config
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
-
-
-def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
-    agent_benchmark_config = get_agent_benchmark_config()
-    # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
-    )
-
-    # print(f"Using {REPORTS_PATH} for reports")
-    # user facing reporting information
-    INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
-    )
-
-    # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
-    )
-
-    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
-
-
-(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
+from benchmark import REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER


 def generate_combined_suite_report(
@@ -11,7 +11,7 @@ import openai
 import pytest

 from benchmark.agent_api_interface import run_api_agent
-from benchmark.utils.data_types import ChallengeData, Ground
+from benchmark.utils.data_types import ChallengeData, Ground, AgentBenchmarkConfig
 from benchmark.utils.prompts import (
     END_PROMPT,
     FEW_SHOT_EXAMPLES,
@@ -74,7 +74,8 @@ class Challenge(ABC):
                 config["workspace"], "artifacts_out", path
             )
         else:
-            run_agent(self.task, cutoff)
+            agent_benchmark_config: AgentBenchmarkConfig = config["AgentBenchmarkConfig"]
+            run_agent(self.task, cutoff, agent_config=agent_benchmark_config)

         # hidden files are added after the agent runs. Hidden files can be python test files.
         # We copy them in the workspace to make it easy to import the code produced by the agent
@@ -1,6 +1,7 @@
 import glob
 import json
+import sys
 import os
 from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
@@ -68,9 +69,9 @@ def calculate_info_test_path(base_path: Path) -> Path:
     # Create the full new directory path with ISO standard UTC date-time stamp
     report_path = base_path / f"{date_stamp}_{run_name}"


     # Ensure the new directory is created
     report_path.mkdir(exist_ok=True)

     return report_path
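A rough sketch of the report-folder naming that the comment above describes; the date_stamp format and run_name value are assumptions, since both are set earlier in this function, outside the hunk:

    from datetime import datetime, timezone
    from pathlib import Path

    date_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")  # assumed format
    run_name = "full_run"                                                   # assumed value
    report_path = Path("reports") / f"{date_stamp}_{run_name}"
    report_path.mkdir(parents=True, exist_ok=True)
    print(report_path)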
@@ -78,6 +79,8 @@ class AgentBenchmarkConfig(BaseModel):
     """
     This class represents the configuration for the Agent Benchmark.
     It includes the following attributes:
+    - agent_benchmark_config_path: The path to the agent benchmark config that this object was created from.
+    - entry_path: The path to the entry point of the benchmark for the agent, relative to the agent_benchmark_config_path.
     - workspace: The path to the workspace where the benchmark will be run.
     - reports_folder: The path to the folder where the benchmark reports will be stored.
     - api_mode: A boolean indicating whether the benchmark is run in API mode.
@@ -85,6 +88,7 @@ class AgentBenchmarkConfig(BaseModel):
     """

     agent_benchmark_config_path: Path | None = None
     entry_path: str
     workspace: Path
     reports_folder: Path | None = None
     api_mode: bool = False
@@ -93,7 +97,7 @@ class AgentBenchmarkConfig(BaseModel):
     def get_reports_location(self) -> Path:
         if not self.reports_folder:
             self.reports_folder = (
-                self.agent_benchmark_config_path / "reports"
+                Path(self.agent_benchmark_config_path).parent / "reports"
             ).resolve()
         return self.reports_folder
@@ -105,7 +109,12 @@ class AgentBenchmarkConfig(BaseModel):

     def get_success_rate_path(self) -> Path:
         return self.get_reports_location() / "success_rate.json"

+    def get_agent_home_directory(self) -> Path:
+        return Path(self.agent_benchmark_config_path).resolve().parent
+
+    def get_agent_entry_path(self) -> Path:
+        return (self.get_agent_home_directory() / self.entry_path).resolve()
+

 class Info(BaseModel):
     difficulty: DifficultyLevel
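A quick illustration of how the two new helpers resolve paths, using a made-up config location:

    from pathlib import Path

    # Suppose the agent's config lives at /home/user/my_agent/benchmark_config.json
    # and its entry_path field is "benchmark_entry.py".
    agent_benchmark_config_path = Path("/home/user/my_agent/benchmark_config.json")
    entry_path = "benchmark_entry.py"

    home = agent_benchmark_config_path.resolve().parent  # what get_agent_home_directory() returns
    entry = (home / entry_path).resolve()                # what get_agent_entry_path() returns
    print(home)   # /home/user/my_agent
    print(entry)  # /home/user/my_agent/benchmark_entry.py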