Added ability to keep answers

parent bacd0e5e4e
commit d44a4f591d

@@ -1,10 +1,12 @@
-from pathlib import Path
+import json
+from datetime import datetime, timezone
+from pathlib import Path
 
 from .reports.ReportManager import ReportManager
 from .utils.data_types import AgentBenchmarkConfig
 
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
 
 
 def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")

@@ -24,18 +26,19 @@ def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
     REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
+        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME
     )
 
     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
     INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
+        str(agent_benchmark_config.get_reports_path() / "report.json"),
+        BENCHMARK_START_TIME,
     )
 
     # internal db step in replacement track pass/fail rate
     INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
+        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME
     )
 
     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
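
In short: a single UTC timestamp is captured once at import time and handed to every ReportManager. A minimal sketch of that shared identifier (the printed value is only an example):

    from datetime import datetime, timezone

    # One run-wide identifier, captured once at import time, e.g. "2023-08-04T21:01:10+00:00"
    BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
    print(BENCHMARK_START_TIME)  # every ReportManager above now receives this same string
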
@@ -11,10 +11,9 @@ import pytest
 import toml
 from helicone.lock import HeliconeLockManager
 
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.utils.data_types import AgentBenchmarkConfig
 
-BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
-
 if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME

@@ -58,6 +57,7 @@ def run_benchmark(
     mock: bool = False,
     no_dep: bool = False,
     nc: bool = False,
+    keep_answers: bool = False,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,

@@ -98,6 +98,9 @@ def run_benchmark(
         print(f"{key}: {value}")
 
     pytest_args = ["-vs"]
+    if keep_answers:
+        pytest_args.append("--keep-answers")
+
     if test:
         print("Running specific test:", test)
         pytest_args.extend(["-k", test, "--test"])

@@ -187,6 +190,7 @@ def cli() -> None:
     help="Run without dependencies",
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--keep-answers", is_flag=True, help="Keep answers")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
 def start(
     maintain: bool,

@@ -195,6 +199,7 @@ def start(
     mock: bool,
     no_dep: bool,
     nc: bool,
+    keep_answers: bool,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,

@@ -215,6 +220,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,

@@ -231,6 +237,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
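
The new flag simply rides along from click into the pytest invocation. A self-contained sketch of the same forwarding pattern, assuming a standalone script rather than the project's actual cli/start entry points:

    import click
    import pytest

    @click.command()
    @click.option("--keep-answers", is_flag=True, help="Keep answers")
    def run(keep_answers: bool) -> None:
        pytest_args = ["-vs"]
        if keep_answers:
            # Forwarded to the custom option registered via pytest_addoption in conftest.py
            pytest_args.append("--keep-answers")
        pytest.main(pytest_args)

    if __name__ == "__main__":
        run()
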
@@ -186,6 +186,7 @@ def pytest_addoption(parser: Any) -> None:
     The "--explore" option is used to run the tests in exploration mode.
     The "--test" option is used to run a specific test.
     The "--no_dep" option is used to run the tests without dependencies.
+    The "--keep_answers" option is used to keep the answers of the tests.
 
     Args:
         parser (Any): The parser object to which the command-line options are added.

@@ -201,6 +202,7 @@ def pytest_addoption(parser: Any) -> None:
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
+    parser.addoption("--keep-answers", action="store_true", default=False)
 
 
 @pytest.fixture(autouse=True)

@@ -313,7 +315,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     )
 
     if call.when == "call":
-        answers = getattr(item, 'answers', None)
+        answers = getattr(item, "answers", None)
         generate_single_call_report(item, call, challenge_data, answers)
 
     if call.when == "teardown":
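
On the pytest side, --keep-answers is registered like the existing agbenchmark flags. A minimal conftest.py sketch; the keep_answers fixture is an illustrative way to read the option back and is not part of this commit:

    from typing import Any

    import pytest

    def pytest_addoption(parser: Any) -> None:
        # Same registration style as the other agbenchmark flags
        parser.addoption("--keep-answers", action="store_true", default=False)

    @pytest.fixture
    def keep_answers(request: Any) -> bool:
        # Illustrative: the option can also be read back via config.getoption
        return request.config.getoption("--keep-answers")
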
@@ -77,7 +77,9 @@ def create_single_test(
         await self.setup_challenge(config, timeout)
 
         scores = self.get_scores(config)
-        request.node.answers = scores["answers"]  # store answers in request.node
+        request.node.answers = (
+            scores["answers"] if "--keep-answers" in sys.argv else None
+        )
         del scores["answers"]  # remove answers from scores
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]
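
Note that the guard checks sys.argv directly instead of querying the pytest option. A tiny self-contained sketch of that check, with a made-up helper name and sample data:

    import sys
    from typing import Optional

    def select_answers(scores: dict) -> Optional[dict]:
        # Illustrative helper (not from the commit): keep answers only when the flag was passed.
        return scores["answers"] if "--keep-answers" in sys.argv else None

    print(select_answers({"answers": {"q1": "42"}, "values": [1]}))
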
@@ -4,7 +4,6 @@ import sys
 import time
 from datetime import datetime, timezone
 
-from agbenchmark.__main__ import BENCHMARK_START_TIME
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
 from agbenchmark.reports.processing.report_types import Report

@@ -15,9 +14,11 @@ from agbenchmark.utils.utils import get_highest_success_difficulty
 class ReportManager:
     """Abstracts interaction with the regression tests file"""
 
-    def __init__(self, filename: str):
+    def __init__(self, filename: str, benchmark_start_time: str):
         self.filename = filename
         self.start_time = time.time()
+        self.benchmark_start_time = benchmark_start_time
 
         self.load()
 
     def load(self) -> None:

@@ -70,7 +71,7 @@ class ReportManager:
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),
-            "benchmark_start_time": BENCHMARK_START_TIME,
+            "benchmark_start_time": self.benchmark_start_time,
             "metrics": {
                 "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
                 "highest_difficulty": get_highest_success_difficulty(self.tests),
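
This is plain constructor injection: the manager now receives the run timestamp instead of importing a global from __main__. A stand-in sketch under that reading (class name and demo values are illustrative):

    import time

    class ReportManagerSketch:
        """Illustrative stand-in for the new ReportManager constructor."""

        def __init__(self, filename: str, benchmark_start_time: str) -> None:
            self.filename = filename
            self.start_time = time.time()                     # when this manager was created
            self.benchmark_start_time = benchmark_start_time  # shared run identifier, injected

    manager = ReportManagerSketch("report.json", "2023-08-04T21:01:10+00:00")
    print(manager.benchmark_start_time)
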
@@ -89,7 +89,7 @@ def generate_single_call_report(
     }
+    if answers:
+        info_details["answers"] = answers
 
-
     if "metadata" in challenge_data:
         info_details["metadata"] = challenge_data["metadata"]
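
Answers only land in the per-test report when they were collected. A small illustrative sketch of that conditional; the other info_details fields here are made up:

    from typing import Optional

    def build_info_details(answers: Optional[dict]) -> dict:
        info_details = {"metrics": {"success": True}}  # illustrative fields only
        if answers:
            info_details["answers"] = answers
        return info_details

    print(build_info_details({"write_file": "illustrative answer"}))
    print(build_info_details(None))  # without --keep-answers the key is simply absent
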
@@ -17,9 +17,12 @@ class DifficultyLevel(Enum):
     expert = "expert"
     human = "human"
 
 
+class Workspace(BaseModel):
+    input: str
+    output: str
 
 
 # map from enum to difficulty level (numeric)
 DIFFICULTY_MAP = {
     DifficultyLevel.interface: 1,
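
Workspace pairs an input and an output location. A usage sketch, assuming BaseModel is pydantic's as elsewhere in this module; the paths are illustrative:

    from pydantic import BaseModel

    class Workspace(BaseModel):
        input: str
        output: str

    ws = Workspace(input="agent/workspace/input", output="agent/workspace/output")  # illustrative paths
    print(ws.input, ws.output)
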
@@ -4,6 +4,7 @@ from typing import Optional
 
 import requests
 
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS
 
 

@@ -30,7 +31,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
             "name": "agent",
         },
         {
-            "value": {"equals": agbenchmark.start_agbenchmark.BENCHMARK_START_TIME},
+            "value": {"equals": BENCHMARK_START_TIME},
             "name": "benchmark_start_time",
         },
         {"value": {"equals": challenge}, "name": "challenge"},
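
The Helicone query now filters on the shared timestamp instead of a stale module path. A sketch of just the property-filter payload; the agent and challenge values are illustrative and the GraphQL query itself is omitted:

    from agbenchmark import BENCHMARK_START_TIME

    properties = [
        {"value": {"equals": "my-agent"}, "name": "agent"},  # illustrative agent name
        {"value": {"equals": BENCHMARK_START_TIME}, "name": "benchmark_start_time"},
        {"value": {"equals": "TestWriteFile"}, "name": "challenge"},  # illustrative challenge
    ]
    print(properties)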