import logging
import os
from pathlib import Path
from typing import Optional, Sequence

from dotenv import load_dotenv

from agbenchmark.challenges import get_unique_categories
from agbenchmark.config import AgentBenchmarkConfig

load_dotenv()

logger = logging.getLogger(__name__)


def run_benchmark(
    config: AgentBenchmarkConfig,
    maintain: bool = False,
    improve: bool = False,
    explore: bool = False,
    tests: tuple[str, ...] = tuple(),
    categories: tuple[str, ...] = tuple(),
    skip_categories: tuple[str, ...] = tuple(),
    attempts_per_challenge: int = 1,
    mock: bool = False,
    no_dep: bool = False,
    no_cutoff: bool = False,
    cutoff: Optional[int] = None,
    keep_answers: bool = False,
    server: bool = False,
) -> int:
    """
    Starts the benchmark. If a category flag is provided, only challenges with the
    corresponding mark will be run.
    """
    import pytest

    from agbenchmark.reports.ReportManager import SingletonReportManager

    validate_args(
        maintain=maintain,
        improve=improve,
        explore=explore,
        tests=tests,
        categories=categories,
        skip_categories=skip_categories,
        no_cutoff=no_cutoff,
        cutoff=cutoff,
    )

    # Set up the singleton that collects challenge reports for this run
    SingletonReportManager()

    for key, value in vars(config).items():
        logger.debug(f"config.{key} = {repr(value)}")

    # Base pytest args: -v for verbose output, -s to disable output capturing
    pytest_args = ["-vs"]

    if tests:
        logger.info(f"Running specific test(s): {' '.join(tests)}")
        pytest_args += [f"--test={t}" for t in tests]
    else:
        all_categories = get_unique_categories()

        if categories or skip_categories:
            categories_to_run = set(categories) or all_categories
            if skip_categories:
                categories_to_run = categories_to_run.difference(set(skip_categories))
            assert categories_to_run, "Error: You can't skip all categories"
            pytest_args += [f"--category={c}" for c in categories_to_run]
            logger.info(f"Running tests of category: {categories_to_run}")
        else:
            logger.info("Running all categories")

        if maintain:
            logger.info("Running only regression tests")
        elif improve:
            logger.info("Running only non-regression tests")
        elif explore:
            logger.info("Only attempt challenges that have never been beaten")

    if mock:
        # TODO: unhack
        os.environ[
            "IS_MOCK"
        ] = "True"  # ugly hack to make the mock work when calling from API

    # Pass through flags
    for flag, active in {
        "--maintain": maintain,
        "--improve": improve,
        "--explore": explore,
        "--no-dep": no_dep,
        "--mock": mock,
        "--nc": no_cutoff,
        "--keep-answers": keep_answers,
    }.items():
        if active:
            pytest_args.append(flag)

    if attempts_per_challenge > 1:
        pytest_args.append(f"--attempts={attempts_per_challenge}")

    if cutoff:
        pytest_args.append(f"--cutoff={cutoff}")
        logger.debug(f"Setting cutoff override to {cutoff} seconds.")

    # Point pytest at the module that generates the challenge tests
    current_dir = Path(__file__).resolve().parent
    pytest_args.append(str(current_dir / "generate_test.py"))

    pytest_args.append("--cache-clear")
    logger.debug(f"Running Pytest with args: {pytest_args}")
    exit_code = pytest.main(pytest_args)

    SingletonReportManager.clear_instance()

    return exit_code


class InvalidInvocationError(ValueError):
    pass


def validate_args(
    maintain: bool,
    improve: bool,
    explore: bool,
    tests: Sequence[str],
    categories: Sequence[str],
    skip_categories: Sequence[str],
    no_cutoff: bool,
    cutoff: Optional[int],
) -> None:
    if categories:
        all_categories = get_unique_categories()
        invalid_categories = set(categories) - all_categories
        if invalid_categories:
            raise InvalidInvocationError(
                "One or more invalid categories were specified: "
                f"{', '.join(invalid_categories)}.\n"
                f"Valid categories are: {', '.join(all_categories)}."
            )

    if (maintain + improve + explore) > 1:
        raise InvalidInvocationError(
            "You can't use --maintain, --improve or --explore at the same time. "
            "Please choose one."
        )

    if tests and (categories or skip_categories or maintain or improve or explore):
        raise InvalidInvocationError(
            "If you're running a specific test make sure no other options are "
            "selected. Please just pass the --test."
        )

    if no_cutoff and cutoff:
        raise InvalidInvocationError(
            "You can't use both --nc and --cutoff at the same time. "
            "Please choose one."
        )
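

# Illustrative usage sketch (a hedged example, not part of this module's public
# surface): one way run_benchmark() might be driven programmatically. The
# AgentBenchmarkConfig.load() call is an assumption about how the config object
# is constructed; substitute however your setup builds an AgentBenchmarkConfig.
#
#   config = AgentBenchmarkConfig.load()  # hypothetical loader
#   exit_code = run_benchmark(
#       config,
#       categories=("coding",),        # restrict the run to a single category
#       attempts_per_challenge=2,      # attempt each challenge twice
#   )
#   raise SystemExit(exit_code)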