more benchmark fixes

parent d3067c83d3
commit 1281a264f5
@@ -3,3 +3,48 @@
 # pydevd_pycharm.settrace(
 #     "localhost", port=9739, stdoutToServer=True, stderrToServer=True
 # )
+from .utils.data_types import AgentBenchmarkConfig
+import sys
+import json
+from .reports.ReportManager import ReportManager
+
+
+def get_agent_benchmark_config() -> AgentBenchmarkConfig:
+    if "--agent-config" in sys.argv:
+        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
+    else:
+        print(sys.argv)
+    try:
+        with open(agent_benchmark_config_path, "r") as f:
+            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
+            agent_benchmark_config.agent_benchmark_config_path = (
+                agent_benchmark_config_path
+            )
+            return agent_benchmark_config
+    except json.JSONDecodeError:
+        print("Error: benchmark_config.json is not a valid JSON file.")
+        raise
+
+
+def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
+    agent_benchmark_config = get_agent_benchmark_config()
+    # tests that consistently pass are considered regression tests
+    REGRESSION_MANAGER = ReportManager(
+        agent_benchmark_config.get_regression_reports_path()
+    )
+
+    # print(f"Using {REPORTS_PATH} for reports")
+    # user facing reporting information
+    INFO_MANAGER = ReportManager(
+        str(agent_benchmark_config.get_reports_path() / "report.json")
+    )
+
+    # internal db step in replacement track pass/fail rate
+    INTERNAL_INFO_MANAGER = ReportManager(
+        agent_benchmark_config.get_success_rate_path()
+    )
+
+    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
+
+
+(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
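The loader above is driven by a --agent-config argument pointing at the agent's benchmark config JSON. A rough illustration follows; the field names come from the AgentBenchmarkConfig changes further down in this diff, while the file name, values, and invocation are made-up examples:

    import json
    from pathlib import Path

    # A minimal config the loader above would accept; judging by the fields
    # shown later in this diff, only entry_path and workspace lack defaults.
    example_config = {
        "entry_path": "benchmark_entry.py",  # agent's benchmark entry point, relative to this file
        "workspace": "workspace",            # directory the agent writes its artifacts into
    }
    Path("benchmark_config.json").write_text(json.dumps(example_config, indent=2))
    # then e.g.: python -m benchmark --agent-config ./benchmark_config.json  (module name assumed)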
@@ -12,6 +12,8 @@ from typing import Any, List
 import psutil
 from dotenv import load_dotenv
+
+from benchmark.utils.data_types import AgentBenchmarkConfig

 load_dotenv()

 helicone_graphql_logs = os.getenv("HELICONE_GRAPHQL_LOGS")
@@ -72,20 +74,21 @@ def run_windows_env(process: Any, start_time: float, timeout: float) -> None:
         process.terminate()


-def run_agent(task: str, timeout: int) -> None:
+def run_agent(task: str, timeout: int, agent_config: AgentBenchmarkConfig) -> None:
     """Calling to get a response"""

-    entry_path = "benchmark.benchmarks"
+    entry_path = agent_config.get_agent_entry_path()
     print(f"Running '{entry_path}' with timeout {timeout}")

-    command = [sys.executable, "-m", entry_path, str(task)]
+    command = [sys.executable, entry_path, str(task)]


     process = subprocess.Popen(
         command,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         universal_newlines=True,
-        cwd=benchmark.start_benchmark.HOME_DIRECTORY,
+        cwd=agent_config.get_agent_directory(),
         bufsize=1,
     )
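For clarity, the agent invocation changes roughly as follows; the path and task text below are made-up examples, and the real values come from the AgentBenchmarkConfig instance:

    import sys

    entry_path = "/home/user/my_agent/benchmark_entry.py"  # assumed result of get_agent_entry_path()
    task = "Write the word 'Washington' to a .txt file"    # example task text

    # old command: [sys.executable, "-m", "benchmark.benchmarks", task], run from the benchmark's home directory
    # new command: [sys.executable, entry_path, task],                  run from the agent's own directory
    command = [sys.executable, entry_path, task]
    print(command)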
@@ -51,7 +51,7 @@ def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
         raise


-def resolve_workspace(workspace: str) -> str:
+def resolve_workspace(workspace: Path) -> Path:
     """
     This function resolves the workspace path.
@@ -64,7 +64,7 @@ def resolve_workspace(workspace: str) -> str:
     Raises:
         ValueError: If the workspace path expression is invalid.
     """
-    if workspace.startswith("${") and workspace.endswith("}"):
+    if isinstance(workspace, str) and workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}
         path_expr = workspace[2:-1]
@@ -77,8 +77,10 @@ def resolve_workspace(workspace: str) -> str:
             return path_value
         else:
             raise ValueError("Invalid workspace path expression.")
+    elif isinstance(workspace, Path):
+        return os.path.abspath(workspace)
     else:
-        return os.path.abspath(Path(os.getcwd()) / workspace)
+        raise ValueError("Invalid workspace type. Expected str or Path.")


 @pytest.fixture(scope="module")
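In isolation, the new type dispatch behaves roughly like the simplified stand-in below; it is not the real helper, and the ${...} expression branch is stubbed out because its body lies outside this hunk:

    import os
    from pathlib import Path
    from typing import Union

    def _resolve_sketch(workspace: Union[str, Path]) -> Path:
        # strings are only accepted as "${...}" path expressions
        if isinstance(workspace, str) and workspace.startswith("${") and workspace.endswith("}"):
            raise NotImplementedError("expression handling lives in the real resolve_workspace")
        elif isinstance(workspace, Path):
            return Path(os.path.abspath(workspace))
        raise ValueError("Invalid workspace type. Expected str or Path.")

    print(_resolve_sketch(Path("workspace")))  # absolute path to ./workspace
    # _resolve_sketch(42) would raise ValueError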
@@ -98,6 +100,7 @@ def config(request: Any) -> Any:
         json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
     """
     agent_benchmark_config_path = request.config.getoption("--agent_config_path")
+    config = {'workspace': {}}
     try:
         with open(agent_benchmark_config_path, "r") as f:
             agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
@@ -186,19 +189,19 @@ def pytest_addoption(parser: Any) -> None:
     Args:
         parser (Any): The parser object to which the command-line options are added.
     """
-    parser.addoption("--agent_config_path", action="store_true", default=False)
+    parser.addoption("--agent_config_path", action="store", default=False)
     parser.addoption("--no_dep", action="store_true", default=False)
     parser.addoption("--suite", action="store_true", default=False)
     parser.addoption("--mock", action="store_true", default=False)
     parser.addoption("--api_mode", action="store_true", default=False)
     parser.addoption("--host", action="store_true", default=None)
     parser.addoption("--category", action="store_true", default=False)
     parser.addoption("--nc", action="store_true", default=False)
     parser.addoption("--cutoff", action="store_true", default=False)
     parser.addoption("--category", action="store_true", default=False)
     parser.addoption("--test", action="store_true", default=None)
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
     parser.addoption("--test", action="store_true", default=None)
     parser.addoption("--no_dep", action="store_true", default=False)
     parser.addoption("--suite", action="store_true", default=False)


 @pytest.fixture(autouse=True)
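With action="store", --agent_config_path now consumes a path value instead of acting as a boolean flag. A hypothetical programmatic invocation, assuming it is run from the benchmark repository root so this conftest registers the options; the config path is a made-up example:

    import pytest

    pytest.main(
        [
            "--agent_config_path", "agent/benchmark_config.json",
            "--mock",
            "--category",
        ]
    )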
@@ -433,7 +436,8 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:
         print("Error: benchmark_config.json is not a valid JSON file.")
         raise

-    data = json.loads(agent_benchmark_config.get_regression_reports_path())
+    regression_file = agent_benchmark_config.get_regression_reports_path()
+    data = json.loads(open(regression_file, 'r').read()) if os.path.exists(regression_file) else {}

     for item in items:
         # Assuming item.cls is your test class
@@ -444,19 +448,19 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None:

             # Then you can access your properties
             name = item.parent.cls.__name__
-            dependencies = test_class_instance.data.dependencies
+            # dependencies = test_class_instance.data.dependencies

             # Filter dependencies if they exist in regression data if its an improvement test
-            if config.getoption("--improve") or config.getoption(
-                "--category"
-            ):  # TODO: same task suite
-                dependencies = [dep for dep in dependencies if not data.get(dep, None)]
-            if (  # TODO: separate task suite
-                config.getoption("--test")
-                or config.getoption("--no_dep")
-                or config.getoption("--maintain")
-            ):
-                dependencies = []
+            # if config.getoption("--improve") or config.getoption(
+            #     "--category"
+            # ):  # TODO: same task suite
+            #     dependencies = [dep for dep in dependencies if not data.get(dep, None)]
+            # if (  # TODO: separate task suite
+            #     config.getoption("--test")
+            #     or config.getoption("--no_dep")
+            #     or config.getoption("--maintain")
+            # ):
+            dependencies = []

             # Add depends marker dynamically
             item.add_marker(pytest.mark.depends(on=dependencies, name=name))
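For context, the depends marker attached here appears to come from the pytest-depends plugin. A tiny standalone illustration of the name=/on= pairing, with made-up test names:

    import pytest

    @pytest.mark.depends(name="TestWriteFile")
    def test_write_file() -> None:
        assert True

    @pytest.mark.depends(on=["TestWriteFile"])
    def test_read_file() -> None:
        # skipped automatically by the plugin if TestWriteFile fails
        assert True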
@@ -72,8 +72,8 @@ def create_single_test(
     # Define test class dynamically
     challenge_class = types.new_class(data["name"], (Challenge,))
+    print(challenge_location)
-    clean_challenge_location = get_test_path(challenge_location)
-    setattr(challenge_class, "CHALLENGE_LOCATION", clean_challenge_location)
+    # clean_challenge_location = get_test_path(challenge_location)
+    setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)

     # in the case of a suite
     if isinstance(challenge_data, ChallengeData):
@@ -86,7 +86,7 @@ def create_single_test(
     setattr(
         challenge_class,
         "_data_cache",
-        {clean_challenge_location: challenge_data},
+        {challenge_location: challenge_data},
     )

     setattr(
@@ -161,6 +161,7 @@ def create_challenge(
     json_files: deque,
 ) -> deque:
     path = Path(json_file).resolve()
+    print("Creating challenge for", path)
     if suite_config is not None:
         grandparent_dir = path.parent.parent
@@ -212,6 +213,7 @@ def create_challenge(

     else:
         create_single_test(data, str(path))
+    print("Creation complete for", path)

     return json_files
@@ -223,6 +225,7 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
     print("Generating tests...")

     challenges_path = os.path.join(os.path.dirname(__file__), "challenges")
+    print(f"Looking for challenges in {challenges_path}...")

     json_files = deque(
         glob.glob(
@@ -231,7 +234,8 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
         )
     )

-    agent_benchmark_config_path = Path.cwd() / "agbenchmark" / "config.json"
+    print(f"Found {len(json_files)} challenges.")
+    print(f"Sample path: {json_files[0]}")

     if "--agent-config" in sys.argv:
         agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
@@ -266,7 +270,7 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
         suite_config = SuiteConfig.suite_data_if_suite(Path(json_file))

         commands = sys.argv
         # --category flag
         # --by flag
         if "--category" in commands:
             categories = data.get("category", [])
             commands_set = set(commands)
@@ -317,6 +321,7 @@ def generate_tests() -> None:  # sourcery skip: invert-any-all
             print(f"Generated suite for {suite_config.prefix}.")
         else:
             print(f"Generated test for {data['name']}.")
+    print("Test generation complete.")


 def challenge_should_be_ignored(json_file):
@@ -18,47 +18,7 @@ from benchmark.utils.utils import (
     get_test_path,
     replace_backslash,
 )
-
-
-def get_agent_benchmark_config() -> AgentBenchmarkConfig:
-    if "--agent-config" in sys.argv:
-        agent_benchmark_config_path = sys.argv[sys.argv.index("--agent-config") + 1]
-    else:
-        print(sys.argv)
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-            return agent_benchmark_config
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
-
-
-def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
-    agent_benchmark_config = get_agent_benchmark_config()
-    # tests that consistently pass are considered regression tests
-    REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
-    )
-
-    # print(f"Using {REPORTS_PATH} for reports")
-    # user facing reporting information
-    INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
-    )
-
-    # internal db step in replacement track pass/fail rate
-    INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
-    )
-
-    return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
-
-
-(REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER) = get_report_managers()
+from benchmark import REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER


 def generate_combined_suite_report(
@@ -11,7 +11,7 @@ import openai
 import pytest

 from benchmark.agent_api_interface import run_api_agent
-from benchmark.utils.data_types import ChallengeData, Ground
+from benchmark.utils.data_types import ChallengeData, Ground, AgentBenchmarkConfig
 from benchmark.utils.prompts import (
     END_PROMPT,
     FEW_SHOT_EXAMPLES,
@@ -74,7 +74,8 @@ class Challenge(ABC):
                 config["workspace"], "artifacts_out", path
             )
         else:
-            run_agent(self.task, cutoff)
+            agent_benchmark_config: AgentBenchmarkConfig = config["AgentBenchmarkConfig"]
+            run_agent(self.task, cutoff, agent_config=agent_benchmark_config)

         # hidden files are added after the agent runs. Hidden files can be python test files.
         # We copy them in the workspace to make it easy to import the code produced by the agent
@@ -1,6 +1,7 @@
 import glob
 import json
+import sys
 import os
 from datetime import datetime, timezone
 from enum import Enum
 from pathlib import Path
@@ -68,9 +69,9 @@ def calculate_info_test_path(base_path: Path) -> Path:
     # Create the full new directory path with ISO standard UTC date-time stamp
     report_path = base_path / f"{date_stamp}_{run_name}"


     # Ensure the new directory is created
     report_path.mkdir(exist_ok=True)

     return report_path
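A rough sketch of the report-folder naming that the comment above describes; the date_stamp format and run_name value are assumptions, since both are set earlier in this function, outside the hunk:

    from datetime import datetime, timezone
    from pathlib import Path

    date_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")  # assumed format
    run_name = "full_run"                                                   # assumed value
    report_path = Path("reports") / f"{date_stamp}_{run_name}"
    report_path.mkdir(parents=True, exist_ok=True)
    print(report_path)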
@@ -78,6 +79,8 @@ class AgentBenchmarkConfig(BaseModel):
     """
     This class represents the configuration for the Agent Benchmark.
     It includes the following attributes:
+    - agent_benchmark_config_path: The path to the agent benchmark config that this object was created from.
+    - entry_path: The path to the entry point of the benchmark for the agent, relative to the agent_benchmark_config_path.
     - workspace: The path to the workspace where the benchmark will be run.
     - reports_folder: The path to the folder where the benchmark reports will be stored.
     - api_mode: A boolean indicating whether the benchmark is run in API mode.
@@ -85,6 +88,7 @@ class AgentBenchmarkConfig(BaseModel):
     """

     agent_benchmark_config_path: Path | None = None
     entry_path: str
     workspace: Path
     reports_folder: Path | None = None
     api_mode: bool = False
@@ -93,7 +97,7 @@ class AgentBenchmarkConfig(BaseModel):
     def get_reports_location(self) -> Path:
         if not self.reports_folder:
             self.reports_folder = (
-                self.agent_benchmark_config_path / "reports"
+                Path(self.agent_benchmark_config_path).parent / "reports"
             ).resolve()
         return self.reports_folder
@@ -105,7 +109,12 @@ class AgentBenchmarkConfig(BaseModel):

     def get_success_rate_path(self) -> Path:
         return self.get_reports_location() / "success_rate.json"

+    def get_agent_home_directory(self) -> Path:
+        return Path(self.agent_benchmark_config_path).resolve().parent
+
+    def get_agent_entry_path(self) -> Path:
+        return (self.get_agent_home_directory() / self.entry_path).resolve()
+

 class Info(BaseModel):
     difficulty: DifficultyLevel
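A quick illustration of how the two new helpers resolve paths, using a made-up config location:

    from pathlib import Path

    # Suppose the agent's config lives at /home/user/my_agent/benchmark_config.json
    # and its entry_path field is "benchmark_entry.py".
    agent_benchmark_config_path = Path("/home/user/my_agent/benchmark_config.json")
    entry_path = "benchmark_entry.py"

    home = agent_benchmark_config_path.resolve().parent  # what get_agent_home_directory() returns
    entry = (home / entry_path).resolve()                # what get_agent_entry_path() returns
    print(home)   # /home/user/my_agent
    print(entry)  # /home/user/my_agent/benchmark_entry.py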