feat(benchmark): Include Steps in Report

2024-02-19 17:08:24 +01:00 · 2024-02-19 17:08:24 +01:00 · 3a17011129
parent c339c6b54f
commit 3a17011129
4 changed files with 16 additions and 1 deletion
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -10,7 +10,12 @@ from pathlib import Path
 from typing import Any, ClassVar, Iterator, Literal, Optional

 import pytest
-from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
+from agent_protocol_client import (
+    AgentApi,
+    ApiClient,
+    Configuration as ClientConfig,
+    Step,
+)
 from colorama import Fore, Style
 from openai import _load_client as get_openai_client
 from pydantic import BaseModel, constr, Field, validator
@@ -176,6 +181,7 @@ class BuiltinChallenge(BaseChallenge):
        n_steps = 0
        timed_out = None
        agent_task_cost = None
+        steps: list[Step] = []
        try:
            async for step in self.run_challenge(
                config, timeout, mock=request.config.getoption("--mock")
@@ -184,6 +190,7 @@ class BuiltinChallenge(BaseChallenge):
                    task_id = step.task_id

                n_steps += 1
+                steps.append(step.copy())
                if step.additional_output:
                    agent_task_cost = step.additional_output.get(
                        "task_total_cost",
@@ -192,6 +199,7 @@ class BuiltinChallenge(BaseChallenge):
            timed_out = False
        except TimeoutError:
            timed_out = True
+        request.node.user_properties.append(("steps", steps))
        request.node.user_properties.append(("n_steps", n_steps))
        request.node.user_properties.append(("timed_out", timed_out))
        request.node.user_properties.append(("agent_task_cost", agent_task_cost))
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -396,6 +396,7 @@ class WebArenaChallenge(BaseChallenge):
        n_steps = 0
        timed_out = None
        agent_task_cost = None
+        steps: list[Step] = []
        eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
        try:
            async for step in self.run_challenge(
@@ -406,6 +407,7 @@ class WebArenaChallenge(BaseChallenge):
                    continue

                n_steps += 1
+                steps.append(step)
                if step.additional_output:
                    agent_task_cost = step.additional_output.get(
                        "task_total_cost",
@@ -429,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
            timed_out = False
        except TimeoutError:
            timed_out = True
+        request.node.user_properties.append(("steps", steps))
        request.node.user_properties.append(("n_steps", n_steps))
        request.node.user_properties.append(("timed_out", timed_out))
        request.node.user_properties.append(("agent_task_cost", agent_task_cost))
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -5,6 +5,7 @@ Model definitions used internally and for reports generated during command-line
 import logging
 from typing import Any, Dict, List

+from agent_protocol_client import Step
 from pydantic import BaseModel, Field, constr, validator

 datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
@@ -24,6 +25,8 @@ class TestResult(BaseModel):
    """Whether the run had to be stopped due to reaching the timeout"""
    n_steps: int | None = None
    """The number of steps executed by the agent"""
+    steps: list[Step] = []
+    """The steps generated by the agent"""
    cost: float | None = None
    """The (known) cost incurred by the run, e.g. from using paid LLM APIs"""

--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -97,6 +97,7 @@ def add_test_result_to_report(
                ),
                reached_cutoff=user_properties.get("timed_out", False),
                n_steps=user_properties.get("n_steps"),
+                steps=user_properties.get("steps", []),
                cost=user_properties.get("agent_task_cost"),
            )
        )