Added ability to keep answers

pull/5177/head^2
SwiftyOS 2023-09-13 11:56:31 +02:00
parent bacd0e5e4e
commit d44a4f591d
8 changed files with 33 additions and 14 deletions

View File

@@ -1,10 +1,12 @@
-from pathlib import Path
 import json
+from datetime import datetime, timezone
+from pathlib import Path
 from .reports.ReportManager import ReportManager
 from .utils.data_types import AgentBenchmarkConfig
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
 def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
@@ -24,18 +26,19 @@ def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
     REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
+        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME
     )
     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
     INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
+        str(agent_benchmark_config.get_reports_path() / "report.json"),
+        BENCHMARK_START_TIME,
     )
     # internal db step in replacement track pass/fail rate
     INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
+        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME
     )
     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
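
Note: BENCHMARK_START_TIME is now computed once at package import time and handed to every ReportManager. For reference, a standalone sketch of the same expression, showing the timestamp shape it produces (the printed value is only an example):

    from datetime import datetime, timezone

    # Same format string as BENCHMARK_START_TIME above: ISO-8601 UTC with a fixed +00:00 offset.
    start_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
    print(start_time)  # e.g. "2023-09-13T09:56:31+00:00"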

View File

@@ -11,10 +11,9 @@ import pytest
 import toml
 from helicone.lock import HeliconeLockManager
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.utils.data_types import AgentBenchmarkConfig
-BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
 if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
@@ -58,6 +57,7 @@ def run_benchmark(
     mock: bool = False,
     no_dep: bool = False,
     nc: bool = False,
+    keep_answers: bool = False,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -98,6 +98,9 @@ def run_benchmark(
         print(f"{key}: {value}")
     pytest_args = ["-vs"]
+    if keep_answers:
+        pytest_args.append("--keep-answers")
     if test:
         print("Running specific test:", test)
         pytest_args.extend(["-k", test, "--test"])
@@ -187,6 +190,7 @@ def cli() -> None:
     help="Run without dependencies",
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--keep-answers", is_flag=True, help="Keep answers")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
 def start(
     maintain: bool,
@@ -195,6 +199,7 @@ def start(
     mock: bool,
     no_dep: bool,
     nc: bool,
+    keep_answers: bool,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -215,6 +220,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
@@ -231,6 +237,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
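
The flag only has to travel from Click to pytest: start() receives keep_answers and run_benchmark appends "--keep-answers" to pytest_args. A minimal sketch of that hand-off pattern outside agbenchmark (the run() command here is hypothetical, and the option must still be registered with pytest, as the conftest change below does):

    import click
    import pytest


    @click.command()
    @click.option("--keep-answers", is_flag=True, help="Keep answers")
    def run(keep_answers: bool) -> None:
        # Mirror of run_benchmark above: translate the CLI flag into a pytest option.
        pytest_args = ["-vs"]
        if keep_answers:
            pytest_args.append("--keep-answers")
        pytest.main(pytest_args)


    if __name__ == "__main__":
        run()

Assuming the usual agbenchmark entry point, a run would then look something like: agbenchmark start --keep-answers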

View File

@@ -186,6 +186,7 @@ def pytest_addoption(parser: Any) -> None:
     The "--explore" option is used to run the tests in exploration mode.
     The "--test" option is used to run a specific test.
     The "--no_dep" option is used to run the tests without dependencies.
+    The "--keep_answers" option is used to keep the answers of the tests.
     Args:
         parser (Any): The parser object to which the command-line options are added.
@@ -201,6 +202,7 @@
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
+    parser.addoption("--keep-answers", action="store_true", default=False)
 @pytest.fixture(autouse=True)
@@ -313,7 +315,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     )
     if call.when == "call":
-        answers = getattr(item, 'answers', None)
+        answers = getattr(item, "answers", None)
         generate_single_call_report(item, call, challenge_data, answers)
     if call.when == "teardown":
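
Because "--keep-answers" is registered through pytest_addoption, it could also be read back via pytest's config object instead of sys.argv. A hedged sketch of that alternative (not what this commit does; the next file checks sys.argv directly):

    import pytest


    @pytest.fixture
    def keep_answers(request: pytest.FixtureRequest) -> bool:
        # Options registered in pytest_addoption are exposed via config.getoption().
        return bool(request.config.getoption("--keep-answers"))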

View File

@@ -77,7 +77,9 @@ def create_single_test(
         await self.setup_challenge(config, timeout)
         scores = self.get_scores(config)
-        request.node.answers = scores["answers"]  # store answers in request.node
+        request.node.answers = (
+            scores["answers"] if "--keep-answers" in sys.argv else None
+        )
         del scores["answers"]  # remove answers from scores
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]
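
The answers round trip relies on pytest's item object: the test body stashes a value on request.node, and the pytest_runtest_makereport hook in conftest reads it back with getattr. A self-contained sketch of that pattern (file names and values are illustrative only):

    # conftest.py (sketch)
    from typing import Any

    import pytest


    @pytest.hookimpl(hookwrapper=True)
    def pytest_runtest_makereport(item: Any, call: Any):
        yield  # let pytest build the report first
        if call.when == "call":
            # Anything the test attached to its node is visible here.
            print("collected answers:", getattr(item, "answers", None))


    # test_example.py (sketch)
    def test_stash(request: pytest.FixtureRequest) -> None:
        request.node.answers = {"question": "2 + 2", "answer": "4"}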

View File

@@ -4,7 +4,6 @@ import sys
 import time
 from datetime import datetime, timezone
-from agbenchmark.__main__ import BENCHMARK_START_TIME
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
 from agbenchmark.reports.processing.report_types import Report
@@ -15,9 +14,11 @@ from agbenchmark.utils.utils import get_highest_success_difficulty
 class ReportManager:
     """Abstracts interaction with the regression tests file"""
-    def __init__(self, filename: str):
+    def __init__(self, filename: str, benchmark_start_time: str):
         self.filename = filename
         self.start_time = time.time()
+        self.benchmark_start_time = benchmark_start_time
         self.load()
     def load(self) -> None:
@@ -70,7 +71,7 @@
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),
-            "benchmark_start_time": BENCHMARK_START_TIME,
+            "benchmark_start_time": self.benchmark_start_time,
             "metrics": {
                 "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
                 "highest_difficulty": get_highest_success_difficulty(self.tests),

View File

@@ -89,7 +89,7 @@ def generate_single_call_report(
     }
     if answers:
         info_details["answers"] = answers
     if "metadata" in challenge_data:
         info_details["metadata"] = challenge_data["metadata"]

View File

@@ -17,9 +17,12 @@ class DifficultyLevel(Enum):
     expert = "expert"
     human = "human"
+class Workspace(BaseModel):
+    input: str
+    output: str
 # map from enum to difficulty level (numeric)
 DIFFICULTY_MAP = {
     DifficultyLevel.interface: 1,
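
The new Workspace model appears to be a plain pydantic BaseModel (the module's other models use pydantic) with string input and output paths. A small usage sketch with made-up values:

    from pydantic import BaseModel


    class Workspace(BaseModel):
        input: str
        output: str


    # Hypothetical paths; the benchmark supplies these from its own config.
    ws = Workspace(input="agent/workspace/input", output="agent/workspace/output")
    print(ws.output)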

View File

@@ -4,6 +4,7 @@ from typing import Optional
 import requests
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS
@@ -30,7 +31,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
             "name": "agent",
         },
         {
-            "value": {"equals": agbenchmark.start_agbenchmark.BENCHMARK_START_TIME},
+            "value": {"equals": BENCHMARK_START_TIME},
             "name": "benchmark_start_time",
         },
         {"value": {"equals": challenge}, "name": "challenge"},