diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 694d10184..71e61bad4 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -10,7 +10,12 @@ from pathlib import Path
 from typing import Any, ClassVar, Iterator, Literal, Optional
 
 import pytest
-from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
+from agent_protocol_client import (
+    AgentApi,
+    ApiClient,
+    Configuration as ClientConfig,
+    Step,
+)
 from colorama import Fore, Style
 from openai import _load_client as get_openai_client
 from pydantic import BaseModel, constr, Field, validator
@@ -176,6 +181,7 @@ class BuiltinChallenge(BaseChallenge):
         n_steps = 0
         timed_out = None
         agent_task_cost = None
+        steps: list[Step] = []
         try:
             async for step in self.run_challenge(
                 config, timeout, mock=request.config.getoption("--mock")
@@ -184,6 +190,7 @@ class BuiltinChallenge(BaseChallenge):
                     task_id = step.task_id
 
                 n_steps += 1
+                steps.append(step.copy())
                 if step.additional_output:
                     agent_task_cost = step.additional_output.get(
                         "task_total_cost",
@@ -192,6 +199,7 @@ class BuiltinChallenge(BaseChallenge):
             timed_out = False
         except TimeoutError:
             timed_out = True
+        request.node.user_properties.append(("steps", steps))
         request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
         request.node.user_properties.append(("agent_task_cost", agent_task_cost))
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 2e51ab2be..9f44ac8f4 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -396,6 +396,7 @@ class WebArenaChallenge(BaseChallenge):
         n_steps = 0
         timed_out = None
         agent_task_cost = None
+        steps: list[Step] = []
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
             async for step in self.run_challenge(
@@ -406,6 +407,7 @@ class WebArenaChallenge(BaseChallenge):
                     continue
 
                 n_steps += 1
+                steps.append(step)
                 if step.additional_output:
                     agent_task_cost = step.additional_output.get(
                         "task_total_cost",
@@ -429,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
             timed_out = False
         except TimeoutError:
             timed_out = True
+        request.node.user_properties.append(("steps", steps))
         request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
         request.node.user_properties.append(("agent_task_cost", agent_task_cost))
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index 0475455a7..ea2ad840f 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -5,6 +5,7 @@ Model definitions used internally and for reports generated during command-line
 import logging
 from typing import Any, Dict, List
 
+from agent_protocol_client import Step
 from pydantic import BaseModel, Field, constr, validator
 
 datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
@@ -24,6 +25,8 @@ class TestResult(BaseModel):
     """Whether the run had to be stopped due to reaching the timeout"""
     n_steps: int | None = None
     """The number of steps executed by the agent"""
+    steps: list[Step] = []
+    """The steps generated by the agent"""
     cost: float | None = None
     """The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
 
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 2068e86f2..431f4ba6e 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -97,6 +97,7 @@ def add_test_result_to_report(
             ),
             reached_cutoff=user_properties.get("timed_out", False),
             n_steps=user_properties.get("n_steps"),
+            steps=user_properties.get("steps", []),
             cost=user_properties.get("agent_task_cost"),
         )
     )
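
Note (not part of the patch): a minimal sketch of how the new "steps" data could be read back from a generated report. The report path and the tests -> results JSON layout are assumptions based on agbenchmark's Report/Test models, so treat this as illustrative only:

    # Hypothetical consumer of the new `steps` field; the report path is an
    # assumed default, not something this patch defines.
    import json
    from pathlib import Path

    report = json.loads(
        Path("agbenchmark_config/reports/report.json").read_text()
    )
    for test_name, test in report.get("tests", {}).items():
        for result in test.get("results", []):
            for step in result.get("steps", []):
                # each entry is a serialized agent_protocol_client.Step
                print(test_name, step.get("step_id"), step.get("status"))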