Added ability to keep answers

pull/5177/head^2
SwiftyOS 2023-09-13 11:56:31 +02:00
parent bacd0e5e4e
commit d44a4f591d
8 changed files with 33 additions and 14 deletions

View File

@@ -1,10 +1,12 @@
-from pathlib import Path
 import json
+from datetime import datetime, timezone
+from pathlib import Path
 from .reports.ReportManager import ReportManager
 from .utils.data_types import AgentBenchmarkConfig
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
 def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
@@ -24,18 +26,19 @@ def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
     REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
+        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME
     )
     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
     INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
+        str(agent_benchmark_config.get_reports_path() / "report.json"),
+        BENCHMARK_START_TIME,
     )
     # internal db step in replacement track pass/fail rate
     INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
+        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME
     )
     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
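
Note: BENCHMARK_START_TIME is now computed once at package import time and handed to every ReportManager. For reference, a standalone sketch of the same expression, showing the timestamp shape it produces (the printed value is only an example):

    from datetime import datetime, timezone

    # Same format string as BENCHMARK_START_TIME above: ISO-8601 UTC with a fixed +00:00 offset.
    start_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
    print(start_time)  # e.g. "2023-09-13T09:56:31+00:00"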

View File

@@ -11,10 +11,9 @@ import pytest
 import toml
 from helicone.lock import HeliconeLockManager
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.utils.data_types import AgentBenchmarkConfig
-BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
 if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
@@ -58,6 +57,7 @@ def run_benchmark(
     mock: bool = False,
     no_dep: bool = False,
     nc: bool = False,
+    keep_answers: bool = False,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -98,6 +98,9 @@ def run_benchmark(
         print(f"{key}: {value}")
     pytest_args = ["-vs"]
+    if keep_answers:
+        pytest_args.append("--keep-answers")
     if test:
         print("Running specific test:", test)
         pytest_args.extend(["-k", test, "--test"])
@@ -187,6 +190,7 @@ def cli() -> None:
     help="Run without dependencies",
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--keep-answers", is_flag=True, help="Keep answers")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
 def start(
     maintain: bool,
@@ -195,6 +199,7 @@ def start(
     mock: bool,
     no_dep: bool,
     nc: bool,
+    keep_answers: bool,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -215,6 +220,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
@@ -231,6 +237,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
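
The flag only has to travel from Click to pytest: start() receives keep_answers and run_benchmark appends "--keep-answers" to pytest_args. A minimal sketch of that hand-off pattern outside agbenchmark (the run() command here is hypothetical, and the option must still be registered with pytest, as the conftest change below does):

    import click
    import pytest


    @click.command()
    @click.option("--keep-answers", is_flag=True, help="Keep answers")
    def run(keep_answers: bool) -> None:
        # Mirror of run_benchmark above: translate the CLI flag into a pytest option.
        pytest_args = ["-vs"]
        if keep_answers:
            pytest_args.append("--keep-answers")
        pytest.main(pytest_args)


    if __name__ == "__main__":
        run()

Assuming the usual agbenchmark entry point, a run would then look something like: agbenchmark start --keep-answers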

View File

@@ -186,6 +186,7 @@ def pytest_addoption(parser: Any) -> None:
     The "--explore" option is used to run the tests in exploration mode.
     The "--test" option is used to run a specific test.
     The "--no_dep" option is used to run the tests without dependencies.
+    The "--keep_answers" option is used to keep the answers of the tests.
     Args:
         parser (Any): The parser object to which the command-line options are added.
@@ -201,6 +202,7 @@
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
+    parser.addoption("--keep-answers", action="store_true", default=False)
 @pytest.fixture(autouse=True)
@@ -313,7 +315,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     )
     if call.when == "call":
-        answers = getattr(item, 'answers', None)
+        answers = getattr(item, "answers", None)
         generate_single_call_report(item, call, challenge_data, answers)
     if call.when == "teardown":
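
Because "--keep-answers" is registered through pytest_addoption, it could also be read back via pytest's config object instead of sys.argv. A hedged sketch of that alternative (not what this commit does; the next file checks sys.argv directly):

    import pytest


    @pytest.fixture
    def keep_answers(request: pytest.FixtureRequest) -> bool:
        # Options registered in pytest_addoption are exposed via config.getoption().
        return bool(request.config.getoption("--keep-answers"))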

View File

@@ -77,7 +77,9 @@ def create_single_test(
         await self.setup_challenge(config, timeout)
         scores = self.get_scores(config)
-        request.node.answers = scores["answers"]  # store answers in request.node
+        request.node.answers = (
+            scores["answers"] if "--keep-answers" in sys.argv else None
+        )
         del scores["answers"]  # remove answers from scores
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]
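
The answers round trip relies on pytest's item object: the test body stashes a value on request.node, and the pytest_runtest_makereport hook in conftest reads it back with getattr. A self-contained sketch of that pattern (file names and values are illustrative only):

    # conftest.py (sketch)
    from typing import Any

    import pytest


    @pytest.hookimpl(hookwrapper=True)
    def pytest_runtest_makereport(item: Any, call: Any):
        yield  # let pytest build the report first
        if call.when == "call":
            # Anything the test attached to its node is visible here.
            print("collected answers:", getattr(item, "answers", None))


    # test_example.py (sketch)
    def test_stash(request: pytest.FixtureRequest) -> None:
        request.node.answers = {"question": "2 + 2", "answer": "4"}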

View File

@@ -4,7 +4,6 @@ import sys
 import time
 from datetime import datetime, timezone
-from agbenchmark.__main__ import BENCHMARK_START_TIME
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
 from agbenchmark.reports.processing.report_types import Report
@@ -15,9 +14,11 @@ from agbenchmark.utils.utils import get_highest_success_difficulty
 class ReportManager:
     """Abstracts interaction with the regression tests file"""
-    def __init__(self, filename: str):
+    def __init__(self, filename: str, benchmark_start_time: str):
         self.filename = filename
         self.start_time = time.time()
+        self.benchmark_start_time = benchmark_start_time
         self.load()
     def load(self) -> None:
@@ -70,7 +71,7 @@
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),
-            "benchmark_start_time": BENCHMARK_START_TIME,
+            "benchmark_start_time": self.benchmark_start_time,
             "metrics": {
                 "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
                 "highest_difficulty": get_highest_success_difficulty(self.tests),

View File

@@ -89,7 +89,7 @@ def generate_single_call_report(
     }
     if answers:
         info_details["answers"] = answers
     if "metadata" in challenge_data:
         info_details["metadata"] = challenge_data["metadata"]

View File

@@ -17,9 +17,12 @@ class DifficultyLevel(Enum):
     expert = "expert"
     human = "human"
+class Workspace(BaseModel):
+    input: str
+    output: str
 # map from enum to difficulty level (numeric)
 DIFFICULTY_MAP = {
     DifficultyLevel.interface: 1,
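
The new Workspace model appears to be a plain pydantic BaseModel (the module's other models use pydantic) with string input and output paths. A small usage sketch with made-up values:

    from pydantic import BaseModel


    class Workspace(BaseModel):
        input: str
        output: str


    # Hypothetical paths; the benchmark supplies these from its own config.
    ws = Workspace(input="agent/workspace/input", output="agent/workspace/output")
    print(ws.output)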

View File

@@ -4,6 +4,7 @@ from typing import Optional
 import requests
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS
@@ -30,7 +31,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
             "name": "agent",
         },
         {
-            "value": {"equals": agbenchmark.start_agbenchmark.BENCHMARK_START_TIME},
+            "value": {"equals": BENCHMARK_START_TIME},
             "name": "benchmark_start_time",
         },
         {"value": {"equals": challenge}, "name": "challenge"},