more benchmark fixes

pull/5200/head
SwiftyOS 2023-09-12 12:47:06 +02:00
parent d3067c83d3
commit 1281a264f5
7 changed files with 102 additions and 75 deletions

@@ -3,3 +3,48 @@
# pydevd_pycharm.settrace(
# "localhost", port=9739, stdoutToServer=True, stderrToServer=True
# )
from .utils.data_types import AgentBenchmarkConfig
import sys
import json
from .reports.ReportManager import ReportManager


def get_agent_benchmark_config() -> AgentBenchmarkConfig:
    if "--agent-config" in sys.argv:
        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
    else:
        print(sys.argv)
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
            return agent_benchmark_config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise


def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
    agent_benchmark_config = get_agent_benchmark_config()
    # tests that consistently pass are considered regression tests
    REGRESSION_MANAGER = ReportManager(
        agent_benchmark_config.get_regression_reports_path()
    )

    # print(f"Using {REPORTS_PATH} for reports")
    # user facing reporting information
    INFO_MANAGER = ReportManager(
        str(agent_benchmark_config.get_reports_path() / "report.json")
    )

    # internal db step in replacement track pass/fail rate
    INTERNAL_INFO_MANAGER = ReportManager(
        agent_benchmark_config.get_success_rate_path()
    )

    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER


(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
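
The block above pulls the path that follows the --agent-config flag straight out of sys.argv and builds an AgentBenchmarkConfig from the JSON it points to. Below is a minimal standalone sketch of the same pattern, assuming a hypothetical BenchConfig dataclass in place of the project's model and failing fast when the flag is missing (the else branch above only prints sys.argv and would leave the path variable unbound):

import json
import sys
from dataclasses import dataclass
from pathlib import Path

@dataclass
class BenchConfig:  # hypothetical stand-in for AgentBenchmarkConfig
    entry_path: str
    workspace: str
    config_path: Path

def load_bench_config(argv: list[str]) -> BenchConfig:
    if "--agent-config" not in argv:
        # fail fast instead of leaving the path variable unbound
        raise SystemExit("usage: benchmark --agent-config <path to config.json>")
    config_path = Path(argv[argv.index("--agent-config") + 1])
    with open(config_path, "r") as f:
        raw = json.load(f)  # raises json.JSONDecodeError on malformed JSON
    return BenchConfig(
        entry_path=raw["entry_path"],
        workspace=raw["workspace"],
        config_path=config_path,
    )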

@@ -12,6 +12,8 @@ from typing import Any, List
import psutil
from dotenv import load_dotenv
from benchmark.utils.data_types import AgentBenchmarkConfig
load_dotenv()
helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
@@ -72,20 +74,21 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
process.terminate()
def run_agent(task: str, timeout: int) -> None:
def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
"""Calling to get a response"""
entry_path = "benchmark.benchmarks"
entry_path = agent_config.get_agent_entry_path()
print(f"Running '{entry_path}' with timeout {timeout}")
command = [sys.executable, "-m", entry_path, str(task)]
command = [sys.executable, entry_path, str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=benchmark.start_benchmark.HOME_DIRECTORY,
cwd=agent_config.get_agent_directory(),
bufsize=1,
)
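
The change above stops invoking a fixed module with -m and instead runs the entry script named by the agent's own config, with the subprocess rooted in the agent's directory. A rough self-contained sketch of that invocation pattern; the output-streaming loop and the omission of timeout handling are simplifications, not the project's exact code:

import subprocess
import sys
from pathlib import Path

def run_entry_script(entry_path: Path, task: str, agent_dir: Path) -> int:
    # run the agent's entry script from its own working directory
    process = subprocess.Popen(
        [sys.executable, str(entry_path), task],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        cwd=str(agent_dir),
        bufsize=1,
    )
    assert process.stdout is not None
    for line in process.stdout:  # stream the agent's combined stdout/stderr
        print(line, end="")
    return process.wait()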

@@ -51,7 +51,7 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
raise
def resolve_workspace(workspace: str) -> str:
def resolve_workspace(workspace: Path) -> Path:
"""
This function resolves the workspace path.
@@ -64,7 +64,7 @@ def resolve_workspace(workspace: str) -> str:
Raises:
ValueError: If the workspace path expression is invalid.
"""
if workspace.startswith("${") and workspace.endswith("}"):
if isinstance(workspace, str) and workspace.startswith("${") and workspace.endswith("}"):
# Extract the string inside ${...}
path_expr = workspace[2:-1]
@@ -77,8 +77,10 @@ def resolve_workspace(workspace: str) -> str:
return path_value
else:
raise ValueError("Invalid workspace path expression.")
elif isinstance(workspace, Path):
return os.path.abspath(workspace)
else:
return os.path.abspath(Path(os.getcwd()) / workspace)
raise ValueError("Invalid workspace type. Expected str or Path.")
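
resolve_workspace now accepts either a ${...} string expression or a Path and rejects anything else. A simplified sketch of that branching, assuming for illustration that the name inside ${...} refers to an environment variable (the real helper evaluates a path expression instead):

import os
from pathlib import Path
from typing import Union

def resolve_workspace_sketch(workspace: Union[str, Path]) -> Path:
    if isinstance(workspace, str) and workspace.startswith("${") and workspace.endswith("}"):
        # assumption: treat the name inside ${...} as an environment variable
        value = os.environ.get(workspace[2:-1])
        if value is None:
            raise ValueError("Invalid workspace path expression.")
        return Path(value).resolve()
    if isinstance(workspace, Path):
        return Path(os.path.abspath(workspace))
    raise ValueError("Invalid workspace type. Expected str or Path.")
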
@pytest.fixture(scope="module")
@@ -98,6 +100,7 @@ def config(request: Any) -> Any:
json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
"""
agent_benchmark_config_path = request.config.getoption("--agent_config_path")
config = {'workspace': {}}
try:
with open(agent_benchmark_config_path, "r") as f:
agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
@@ -186,19 +189,19 @@ def pytest_addoption(parser: Any) -> None:
Args:
parser (Any): The parser object to which the command-line options are added.
"""
parser.addoption("--agent_config_path", action="store_true", default=False)
parser.addoption("--agent_config_path", action="store", default=False)
parser.addoption("--no_dep", action="store_true", default=False)
parser.addoption("--suite", action="store_true", default=False)
parser.addoption("--mock", action="store_true", default=False)
parser.addoption("--api_mode", action="store_true", default=False)
parser.addoption("--host", action="store_true", default=None)
parser.addoption("--category", action="store_true", default=False)
parser.addoption("--nc", action="store_true", default=False)
parser.addoption("--cutoff", action="store_true", default=False)
parser.addoption("--category", action="store_true", default=False)
parser.addoption("--test", action="store_true", default=None)
parser.addoption("--improve", action="store_true", default=False)
parser.addoption("--maintain", action="store_true", default=False)
parser.addoption("--explore", action="store_true", default=False)
parser.addoption("--test", action="store_true", default=None)
parser.addoption("--no_dep", action="store_true", default=False)
parser.addoption("--suite", action="store_true", default=False)
@pytest.fixture(autouse=True)
@@ -433,7 +436,8 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
print("Error: benchmark_config.json is not a valid JSON file.")
raise
data = json.loads(agent_benchmark_config.get_regression_reports_path())
regression_file = agent_benchmark_config.get_regression_reports_path()
data = json.loads(open(regression_file, 'r').read()) if os.path.exists(regression_file) else {}
for item in items:
# Assuming item.cls is your test class
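
Rather than json.loads-ing the report path string itself, the new line reads the regression report when the file exists and falls back to an empty dict. The same guard, sketched with a context manager so the file handle is closed explicitly (the one-liner above leaves that to garbage collection):

import json
import os

def load_regression_data(regression_file: str) -> dict:
    if not os.path.exists(regression_file):
        return {}
    with open(regression_file, "r") as f:
        return json.load(f)
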
@@ -444,19 +448,19 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
# Then you can access your properties
name = item.parent.cls.__name__
dependencies = test_class_instance.data.dependencies
# dependencies = test_class_instance.data.dependencies
# Filter dependencies if they exist in regression data if its an improvement test
if config.getoption("--improve") or config.getoption(
"--category"
): # TODO: same task suite
dependencies = [dep for dep in dependencies if not data.get(dep, None)]
if ( # TODO: separate task suite
config.getoption("--test")
or config.getoption("--no_dep")
or config.getoption("--maintain")
):
dependencies = []
# if config.getoption("--improve") or config.getoption(
# "--category"
# ): # TODO: same task suite
# dependencies = [dep for dep in dependencies if not data.get(dep, None)]
# if ( # TODO: separate task suite
# config.getoption("--test")
# or config.getoption("--no_dep")
# or config.getoption("--maintain")
# ):
dependencies = []
# Add depends marker dynamically
item.add_marker(pytest.mark.depends(on=dependencies, name=name))
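
With the filtering above commented out, every collected test gets an empty dependency list before the pytest-depends marker is attached, so nothing is skipped because a prerequisite failed. A minimal sketch of attaching that marker dynamically; the class-name lookup is simplified relative to the hunk above:

import pytest

def pytest_collection_modifyitems(items, config):
    for item in items:
        cls = getattr(item.parent, "cls", None)
        name = cls.__name__ if cls is not None else item.name
        # depends(on=..., name=...) is the pytest-depends marker the code above relies on;
        # an empty "on" list registers the test under name without prerequisites
        item.add_marker(pytest.mark.depends(on=[], name=name))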

@@ -72,8 +72,8 @@ def create_single_test(
# Define test class dynamically
challenge_class = types.new_class(data["name"], (Challenge,))
print(challenge_location)
clean_challenge_location = get_test_path(challenge_location)
setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location)
# clean_challenge_location = get_test_path(challenge_location)
setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)
# in the case of a suite
if isinstance(challenge_data, ChallengeData):
@@ -86,7 +86,7 @@ def create_single_test(
setattr(
challenge_class,
"_data_cache",
{clean_challenge_location: challenge_data},
{challenge_location: challenge_data},
)
setattr(
@@ -161,6 +161,7 @@ def create_challenge(
json_files: deque,
) -> deque:
path = Path(json_file).resolve()
print("Creating challenge for", path)
if suite_config is not None:
grandparent_dir = path.parent.parent
@@ -212,6 +213,7 @@
else:
create_single_test(data, str(path))
print("Creation complete for", path)
return json_files
@@ -223,6 +225,7 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
print("Generating tests...")
challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
print(f"Looking for challenges in {challenges_path}...")
json_files = deque(
glob.glob(
@@ -231,7 +234,8 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
)
)
agent_benchmark_config_path = Path.cwd() / "agbenchmark" / "config.json"
print(f"Found {len(json_files)} challenges.")
print(f"Sample path: {json_files[0]}")
if "--agent-config" in sys.argv:
agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
@@ -266,7 +270,7 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))
commands = sys.argv
# --category flag
# --by flag
if "--category" in commands:
categories = data.get("category", [])
commands_set = set(commands)
@@ -317,6 +321,7 @@ def generate_tests() -> None: # sourcery skip: invert-any-all
print(f"Generated suite for {suite_config.prefix}.")
else:
print(f"Generated test for {data['name']}.")
print("Test generation complete.")
def challenge_should_be_ignored(json_file):
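
The added prints trace the generation flow: collect the challenge definition files under the challenges directory into a deque, then generate a test class per definition. A standalone sketch of the discovery step, assuming the definitions are JSON files named data.json (the glob pattern itself is not shown in the hunk above):

import glob
import os
from collections import deque

def find_challenge_files(challenges_path: str) -> deque:
    # assumption: each challenge ships a data.json describing it
    pattern = os.path.join(challenges_path, "**", "data.json")
    json_files = deque(glob.glob(pattern, recursive=True))
    print(f"Found {len(json_files)} challenges.")
    if json_files:
        print(f"Sample path: {json_files[0]}")
    return json_files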

@@ -18,47 +18,7 @@ from benchmark.utils.utils import (
get_test_path,
replace_backslash,
)
def get_agent_benchmark_config() -> AgentBenchmarkConfig:
    if "--agent-config" in sys.argv:
        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
    else:
        print(sys.argv)
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
            return agent_benchmark_config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise


def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
    agent_benchmark_config = get_agent_benchmark_config()
    # tests that consistently pass are considered regression tests
    REGRESSION_MANAGER = ReportManager(
        agent_benchmark_config.get_regression_reports_path()
    )

    # print(f"Using {REPORTS_PATH} for reports")
    # user facing reporting information
    INFO_MANAGER = ReportManager(
        str(agent_benchmark_config.get_reports_path() / "report.json")
    )

    # internal db step in replacement track pass/fail rate
    INTERNAL_INFO_MANAGER = ReportManager(
        agent_benchmark_config.get_success_rate_path()
    )

    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER


(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
from benchmark import REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
def generate_combined_suite_report(

@@ -11,7 +11,7 @@ import openai
import pytest
from benchmark.agent_api_interface import run_api_agent
from benchmark.utils.data_types import ChallengeData, Ground
from benchmark.utils.data_types import ChallengeData, Ground, AgentBenchmarkConfig
from benchmark.utils.prompts import (
END_PROMPT,
FEW_SHOT_EXAMPLES,
@@ -74,7 +74,8 @@ class Challenge(ABC):
config["workspace"], "artifacts_out", path
)
else:
run_agent(self.task, cutoff)
agent_benchmark_config: AgentBenchmarkConfig = config["AgentBenchmarkConfig"]
run_agent(self.task, cutoff, agent_config=agent_benchmark_config)
# hidden files are added after the agent runs. Hidden files can be python test files.
# We copy them in the workspace to make it easy to import the code produced by the agent
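
The comment above describes the post-run step: hidden Python test files are copied into the workspace so they can import the code the agent produced. A rough sketch of that copy; the directory names are purely illustrative:

import shutil
from pathlib import Path

def copy_hidden_tests(hidden_dir: Path, workspace: Path) -> None:
    workspace.mkdir(parents=True, exist_ok=True)
    for test_file in hidden_dir.glob("*.py"):
        # place the hidden test next to the agent's output so imports resolve
        shutil.copy(test_file, workspace / test_file.name)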

@@ -1,6 +1,7 @@
import glob
import json
import sys
import os
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
@@ -68,9 +69,9 @@ def calculate_info_test_path(base_path: Path) -> Path:
# Create the full new directory path with ISO standard UTC date-time stamp
report_path = base_path / f"{date_stamp}_{run_name}"
# Ensure the new directory is created
report_path.mkdir(exist_ok=True)
return report_path
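
calculate_info_test_path now creates the timestamped report directory itself via mkdir(exist_ok=True). A sketch of the whole helper with an illustrative run name; passing parents=True is an assumption added here so a missing reports folder does not raise:

from datetime import datetime, timezone
from pathlib import Path

def make_report_path(base_path: Path, run_name: str = "benchmark_run") -> Path:
    # ISO-style UTC date-time stamp, as in the comment above
    date_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S_UTC")
    report_path = base_path / f"{date_stamp}_{run_name}"
    report_path.mkdir(parents=True, exist_ok=True)
    return report_path
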
@@ -78,6 +79,8 @@ class AgentBenchmarkConfig(BaseModel):
"""
This class represents the configuration for the Agent Benchmark.
It includes the following attributes:
- agent_benchmark_config_path: The path to the agent benchmark config that this object was created from.
- entry_path: The path to the entry point of the benchmark for the agent, relative to the agent_benchmark_config_path.
- workspace: The path to the workspace where the benchmark will be run.
- reports_folder: The path to the folder where the benchmark reports will be stored.
- api_mode: A boolean indicating whether the benchmark is run in API mode.
@@ -85,6 +88,7 @@ class AgentBenchmarkConfig(BaseModel):
"""
agent_benchmark_config_path: Path | None = None
entry_path: str
workspace: Path
reports_folder: Path | None = None
api_mode: bool = False
@@ -93,7 +97,7 @@
def get_reports_location(self) -> Path:
if not self.reports_folder:
self.reports_folder = (
self.agent_benchmark_config_path / "reports"
Path(self.agent_benchmark_config_path).parent / "reports"
).resolve()
return self.reports_folder
@@ -105,7 +109,12 @@ class AgentBenchmarkConfig(BaseModel):
def get_success_rate_path(self) -> Path:
return self.get_reports_location() / "success_rate.json"
def get_agent_home_directory(self) -> Path:
return Path(self.agent_benchmark_config_path).resolve().parent
def get_agent_entry_path(self) -> Path:
return (self.get_agent_home_directory() / self.entry_path).resolve()
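
The two new helpers anchor agent-relative paths to the directory that holds the benchmark config file: get_agent_home_directory is that directory, and get_agent_entry_path joins entry_path onto it. A standalone sketch of the same resolution, with example paths that are purely illustrative:

from pathlib import Path

def agent_home_directory(config_path: Path) -> Path:
    # the agent's home is the folder containing its benchmark config file
    return config_path.resolve().parent

def agent_entry_path(config_path: Path, entry_path: str) -> Path:
    return (agent_home_directory(config_path) / entry_path).resolve()

# e.g. agent_entry_path(Path("my_agent/config.json"), "benchmarks.py")
#      resolves to <absolute path>/my_agent/benchmarks.py
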
class Info(BaseModel):
difficulty: DifficultyLevel