From 752bac099bb977a0e9e106a1b92ee4d4141d525f Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Fri, 16 Feb 2024 17:53:19 +0100
Subject: [PATCH] feat(benchmark/report): Add and record `TestResult.n_steps`

- Added `n_steps` attribute to `TestResult` type
- Added logic to record the number of steps to `BuiltinChallenge.test_method`,
  `WebArenaChallenge.test_method`, and `.reports.add_test_result_to_report`
---
 benchmark/agbenchmark/challenges/builtin.py              | 3 +++
 benchmark/agbenchmark/challenges/webarena.py             | 3 +++
 benchmark/agbenchmark/reports/processing/report_types.py | 2 ++
 benchmark/agbenchmark/reports/reports.py                 | 1 +
 4 files changed, 9 insertions(+)

diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index fd28dc3ee..5b616e449 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -173,6 +173,7 @@ class BuiltinChallenge(BaseChallenge):
             timeout = int(cutoff)  # type: ignore
 
         task_id = ""
+        n_steps = 0
         timed_out = None
         try:
             async for step in self.run_challenge(
@@ -180,9 +181,11 @@
             ):
                 if not task_id:
                     task_id = step.task_id
+                n_steps += 1
             timed_out = False
         except TimeoutError:
             timed_out = True
+        request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
 
         agent_client_config = ClientConfig(host=config.host)
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 395b5a6ee..3cec1f956 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -393,6 +393,7 @@ class WebArenaChallenge(BaseChallenge):
         elif cutoff := request.config.getoption("--cutoff"):
             timeout = int(cutoff)
 
+        n_steps = 0
         timed_out = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
@@ -402,6 +403,7 @@
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
+                n_steps += 1
                 step_eval_results = self.evaluate_step_result(
                     step, mock=request.config.getoption("--mock")
                 )
@@ -419,6 +421,7 @@
             timed_out = False
         except TimeoutError:
             timed_out = True
+        request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
 
         # Get the column aggregate (highest score for each Eval)
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index b6deef021..2ed4acf3b 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -20,6 +20,8 @@ class TestResult(BaseModel):
     """If applicable, the reason why the run was not successful"""
     reached_cutoff: bool | None = None  # None if in progress
     """Whether the run had to be stopped due to reaching the timeout"""
+    n_steps: int | None = None
+    """The number of steps executed by the agent"""
     cost: float | None = None
     """The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
 
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 728d19fd9..4844f5bfe 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -92,6 +92,7 @@ def add_test_result_to_report(
             run_time=f"{str(round(call.duration, 3))} seconds",
             fail_reason=str(call.excinfo.value) if call.excinfo else None,
             reached_cutoff=user_properties.get("timed_out", False),
+            n_steps=user_properties.get("n_steps"),
         )
     )
     test_report.metrics.success_percentage = (
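
Reviewer note (not part of the patch): the step count travels from each
challenge's `test_method` to `add_test_result_to_report` via pytest's
`user_properties`, which is a plain list of `(key, value)` tuples on the test
item that the reporting side reads back as a dict. A minimal, self-contained
sketch of that round trip — the values and the `props` name are illustrative,
not taken from the codebase:

    # What test_method effectively does after counting steps:
    user_properties: list[tuple[str, object]] = []
    n_steps = 12  # hypothetical step count accumulated in the async for loop
    user_properties.append(("n_steps", n_steps))
    user_properties.append(("timed_out", False))

    # What add_test_result_to_report effectively does on the other side:
    props = dict(user_properties)
    assert props.get("n_steps") == 12
    # A key that was never appended yields None, which matches the
    # `n_steps: int | None = None` default on the TestResult model:
    assert props.get("cost") is None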