Added ability to keep answers
parent bacd0e5e4e
commit d44a4f591d

@@ -1,10 +1,12 @@
-from pathlib import Path
-
 import json
+from datetime import datetime, timezone
+from pathlib import Path
 
 from .reports.ReportManager import ReportManager
 from .utils.data_types import AgentBenchmarkConfig
 
+BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
+
 
 def get_agent_benchmark_config() -> AgentBenchmarkConfig:
     agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
@@ -24,18 +26,19 @@ def get_report_managers() -> tuple[ReportManager, ReportManager, ReportManager]:
     agent_benchmark_config = get_agent_benchmark_config()
     # tests that consistently pass are considered regression tests
     REGRESSION_MANAGER = ReportManager(
-        agent_benchmark_config.get_regression_reports_path()
+        agent_benchmark_config.get_regression_reports_path(), BENCHMARK_START_TIME
     )
 
     # print(f"Using {REPORTS_PATH} for reports")
     # user facing reporting information
     INFO_MANAGER = ReportManager(
-        str(agent_benchmark_config.get_reports_path() / "report.json")
+        str(agent_benchmark_config.get_reports_path() / "report.json"),
+        BENCHMARK_START_TIME,
     )
 
     # internal db step in replacement track pass/fail rate
     INTERNAL_INFO_MANAGER = ReportManager(
-        agent_benchmark_config.get_success_rate_path()
+        agent_benchmark_config.get_success_rate_path(), BENCHMARK_START_TIME
     )
 
     return REGRESSION_MANAGER, INFO_MANAGER, INTERNAL_INFO_MANAGER
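Note: the new module-level constant is just a UTC timestamp frozen once, when the package is imported, and every ReportManager built in get_report_managers() receives that same string (later hunks import it as `from agbenchmark import BENCHMARK_START_TIME`). A standalone snippet showing the format it produces; the printed value is only an example:

    from datetime import datetime, timezone

    # Evaluated once at import time; the same string is then passed to each
    # ReportManager explicitly instead of being re-imported from the CLI module.
    BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")

    print(BENCHMARK_START_TIME)  # e.g. 2023-09-20T14:03:11+00:00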
@@ -11,10 +11,9 @@ import pytest
 import toml
 from helicone.lock import HeliconeLockManager
 
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.utils.data_types import AgentBenchmarkConfig
 
-BENCHMARK_START_TIME = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
-
 if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
@@ -58,6 +57,7 @@ def run_benchmark(
     mock: bool = False,
     no_dep: bool = False,
     nc: bool = False,
+    keep_answers: bool = False,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -98,6 +98,9 @@ def run_benchmark(
         print(f"{key}: {value}")
 
     pytest_args = ["-vs"]
+    if keep_answers:
+        pytest_args.append("--keep-answers")
+
     if test:
         print("Running specific test:", test)
         pytest_args.extend(["-k", test, "--test"])
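Note: the flag simply rides along with the pytest arguments that run_benchmark assembles. A rough sketch of what the argument list ends up looking like; the test name is a made-up placeholder:

    pytest_args = ["-vs"]
    keep_answers = True
    test = "TestExample"  # placeholder test name, for illustration only

    if keep_answers:
        pytest_args.append("--keep-answers")
    if test:
        pytest_args.extend(["-k", test, "--test"])

    print(pytest_args)  # ['-vs', '--keep-answers', '-k', 'TestExample', '--test']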
@@ -187,6 +190,7 @@ def cli() -> None:
     help="Run without dependencies",
 )
 @click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--keep-answers", is_flag=True, help="Keep answers")
 @click.option("--cutoff", help="Set or override tests cutoff (seconds)")
 def start(
     maintain: bool,
@@ -195,6 +199,7 @@ def start(
     mock: bool,
     no_dep: bool,
     nc: bool,
+    keep_answers: bool,
     category: Optional[list[str]] = None,
     skip_category: Optional[list[str]] = None,
     test: Optional[str] = None,
@@ -215,6 +220,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
@@ -231,6 +237,7 @@ def start(
         mock=mock,
         no_dep=no_dep,
         nc=nc,
+        keep_answers=keep_answers,
         category=category,
         skip_category=skip_category,
         test=test,
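Note: the wiring is plain Click: the flag is declared on the start command and handed through to both run_benchmark call sites as a keyword argument. A trimmed-down, self-contained sketch of that shape; the function bodies below are stand-ins, not agbenchmark's real ones:

    import click


    def run_benchmark(keep_answers: bool = False) -> None:
        # stand-in for agbenchmark's run_benchmark; just echoes the flag
        print("keep answers:", keep_answers)


    @click.command()
    @click.option("--keep-answers", is_flag=True, help="Keep answers")
    def start(keep_answers: bool) -> None:
        # Click maps --keep-answers to the keep_answers parameter
        run_benchmark(keep_answers=keep_answers)


    if __name__ == "__main__":
        start()  # e.g. `python sketch.py --keep-answers`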
@@ -186,6 +186,7 @@ def pytest_addoption(parser: Any) -> None:
     The "--explore" option is used to run the tests in exploration mode.
     The "--test" option is used to run a specific test.
     The "--no_dep" option is used to run the tests without dependencies.
+    The "--keep_answers" option is used to keep the answers of the tests.
 
     Args:
         parser (Any): The parser object to which the command-line options are added.
@@ -201,6 +202,7 @@ def pytest_addoption(parser: Any) -> None:
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
     parser.addoption("--explore", action="store_true", default=False)
+    parser.addoption("--keep-answers", action="store_true", default=False)
 
 
 @pytest.fixture(autouse=True)
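Note: the docstring line uses an underscore, but the option pytest actually registers is the hyphenated --keep-answers. Registering it through addoption also means it could be read via pytest's own config instead of sys.argv; a small sketch of that standard mechanism, shown for reference only (the change itself checks sys.argv directly, as the next hunks show):

    import pytest


    @pytest.fixture
    def keep_answers(request: pytest.FixtureRequest) -> bool:
        # pytest resolves "--keep-answers" to the stored option value
        return request.config.getoption("--keep-answers")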
@@ -313,7 +315,7 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     )
 
     if call.when == "call":
-        answers = getattr(item, 'answers', None)
+        answers = getattr(item, "answers", None)
         generate_single_call_report(item, call, challenge_data, answers)
 
     if call.when == "teardown":

@@ -77,7 +77,9 @@ def create_single_test(
         await self.setup_challenge(config, timeout)
 
         scores = self.get_scores(config)
-        request.node.answers = scores["answers"]  # store answers in request.node
+        request.node.answers = (
+            scores["answers"] if "--keep-answers" in sys.argv else None
+        )
         del scores["answers"]  # remove answers from scores
         request.node.scores = scores  # store scores in request.node
         assert 1 in scores["values"]
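Note: the answers travel on the pytest item itself. The generated test stores them on request.node only when --keep-answers is present in sys.argv, and pytest_runtest_makereport later reads them back off the item with getattr before handing them to generate_single_call_report. A self-contained sketch of that hand-off pattern, separate from agbenchmark's code:

    import sys
    from typing import Any


    def pytest_runtest_makereport(item: Any, call: Any) -> None:  # lives in conftest.py
        if call.when == "call":
            # None unless the test stashed answers on its own node
            answers = getattr(item, "answers", None)
            print("collected answers:", answers)


    def test_example(request: Any) -> None:  # lives in any test module
        scores = {"answers": {"q1": "42"}, "values": [1]}
        # keep the answers only when the flag was passed on the command line
        request.node.answers = scores["answers"] if "--keep-answers" in sys.argv else None
        del scores["answers"]
        assert 1 in scores["values"]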
@@ -4,7 +4,6 @@ import sys
 import time
 from datetime import datetime, timezone
 
-from agbenchmark.__main__ import BENCHMARK_START_TIME
 from agbenchmark.reports.processing.graphs import save_single_radar_chart
 from agbenchmark.reports.processing.process_report import get_agent_category
 from agbenchmark.reports.processing.report_types import Report
@@ -15,9 +14,11 @@ from agbenchmark.utils.utils import get_highest_success_difficulty
 class ReportManager:
     """Abstracts interaction with the regression tests file"""
 
-    def __init__(self, filename: str):
+    def __init__(self, filename: str, benchmark_start_time: str):
         self.filename = filename
         self.start_time = time.time()
+        self.benchmark_start_time = benchmark_start_time
+
         self.load()
 
     def load(self) -> None:
@@ -70,7 +71,7 @@ class ReportManager:
             "completion_time": datetime.now(timezone.utc).strftime(
                 "%Y-%m-%dT%H:%M:%S+00:00"
             ),
-            "benchmark_start_time": BENCHMARK_START_TIME,
+            "benchmark_start_time": self.benchmark_start_time,
             "metrics": {
                 "run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
                 "highest_difficulty": get_highest_success_difficulty(self.tests),
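Note: injecting the timestamp through the constructor removes the import of BENCHMARK_START_TIME from agbenchmark.__main__, so the reports module no longer depends on the CLI module just to read one constant. A sketch of the report entry this code builds; the values are illustrative and fields outside this hunk are omitted:

    import time
    from datetime import datetime, timezone

    benchmark_start_time = "2023-09-20T14:03:11+00:00"  # example of the injected value
    start_time = time.time()

    report_entry = {
        "completion_time": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00"),
        "benchmark_start_time": benchmark_start_time,
        "metrics": {
            # "highest_difficulty" is derived from the collected test results in the real code
            "run_time": str(round(time.time() - start_time, 2)) + " seconds",
        },
    }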
@@ -17,9 +17,12 @@ class DifficultyLevel(Enum):
     expert = "expert"
     human = "human"
 
+
 class Workspace(BaseModel):
     input: str
     output: str
+
+
 # map from enum to difficulty level (numeric)
 DIFFICULTY_MAP = {
     DifficultyLevel.interface: 1,
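Note: the Workspace model that appears as context here is a plain pydantic model with two string fields; only the surrounding blank lines change in this hunk. A minimal usage sketch with made-up values:

    from pydantic import BaseModel


    class Workspace(BaseModel):
        input: str
        output: str


    ws = Workspace(input="workspace/input", output="workspace/output")
    print(ws.input, ws.output)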
@@ -4,6 +4,7 @@ from typing import Optional
 
 import requests
 
+from agbenchmark import BENCHMARK_START_TIME
 from agbenchmark.agent_interface import HELICONE_GRAPHQL_LOGS
 
 
@@ -30,7 +31,7 @@ query ExampleQuery($properties: [PropertyFilter!]){
             "name": "agent",
         },
         {
-            "value": {"equals": agbenchmark.start_agbenchmark.BENCHMARK_START_TIME},
+            "value": {"equals": BENCHMARK_START_TIME},
             "name": "benchmark_start_time",
         },
         {"value": {"equals": challenge}, "name": "challenge"},
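Note: the old filter reached for BENCHMARK_START_TIME through an agbenchmark.start_agbenchmark attribute path, while the Helicone property written at startup and this query filter now both import the constant from the package root, so they read the same value within a run. A sketch of how the property filters line up; the challenge name is a placeholder and the agent filter above this hunk is omitted:

    from agbenchmark import BENCHMARK_START_TIME

    challenge = "TestExampleChallenge"  # placeholder

    properties = [
        # the real query also filters on the agent name (not shown in this hunk)
        {"value": {"equals": BENCHMARK_START_TIME}, "name": "benchmark_start_time"},
        {"value": {"equals": challenge}, "name": "challenge"},
    ]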