feat(benchmark): Include Steps in Report

pull/6875/head
Reinier van der Leer 2024-02-19 17:08:24 +01:00
parent c339c6b54f
commit 3a17011129
No known key found for this signature in database
GPG Key ID: CDC1180FDAE06193
4 changed files with 16 additions and 1 deletion

View File

@ -10,7 +10,12 @@ from pathlib import Path
from typing import Any, ClassVar, Iterator, Literal, Optional
import pytest
from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
from agent_protocol_client import (
AgentApi,
ApiClient,
Configuration as ClientConfig,
Step,
)
from colorama import Fore, Style
from openai import _load_client as get_openai_client
from pydantic import BaseModel, constr, Field, validator
@ -176,6 +181,7 @@ class BuiltinChallenge(BaseChallenge):
n_steps = 0
timed_out = None
agent_task_cost = None
steps: list[Step] = []
try:
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
@ -184,6 +190,7 @@ class BuiltinChallenge(BaseChallenge):
task_id = step.task_id
n_steps += 1
steps.append(step.copy())
if step.additional_output:
agent_task_cost = step.additional_output.get(
"task_total_cost",
@ -192,6 +199,7 @@ class BuiltinChallenge(BaseChallenge):
timed_out = False
except TimeoutError:
timed_out = True
request.node.user_properties.append(("steps", steps))
request.node.user_properties.append(("n_steps", n_steps))
request.node.user_properties.append(("timed_out", timed_out))
request.node.user_properties.append(("agent_task_cost", agent_task_cost))

View File

@ -396,6 +396,7 @@ class WebArenaChallenge(BaseChallenge):
n_steps = 0
timed_out = None
agent_task_cost = None
steps: list[Step] = []
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
async for step in self.run_challenge(
@ -406,6 +407,7 @@ class WebArenaChallenge(BaseChallenge):
continue
n_steps += 1
steps.append(step)
if step.additional_output:
agent_task_cost = step.additional_output.get(
"task_total_cost",
@ -429,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
timed_out = False
except TimeoutError:
timed_out = True
request.node.user_properties.append(("steps", steps))
request.node.user_properties.append(("n_steps", n_steps))
request.node.user_properties.append(("timed_out", timed_out))
request.node.user_properties.append(("agent_task_cost", agent_task_cost))

View File

@ -5,6 +5,7 @@ Model definitions used internally and for reports generated during command-line
import logging
from typing import Any, Dict, List
from agent_protocol_client import Step
from pydantic import BaseModel, Field, constr, validator
datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
@ -24,6 +25,8 @@ class TestResult(BaseModel):
"""Whether the run had to be stopped due to reaching the timeout"""
n_steps: int | None = None
"""The number of steps executed by the agent"""
steps: list[Step] = []
"""The steps generated by the agent"""
cost: float | None = None
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""

View File

@ -97,6 +97,7 @@ def add_test_result_to_report(
),
reached_cutoff=user_properties.get("timed_out", False),
n_steps=user_properties.get("n_steps"),
steps=user_properties.get("steps", []),
cost=user_properties.get("agent_task_cost"),
)
)