feat(benchmark): Include Steps in Report
parent
c339c6b54f
commit
3a17011129
|
@ -10,7 +10,12 @@ from pathlib import Path
|
|||
from typing import Any, ClassVar, Iterator, Literal, Optional
|
||||
|
||||
import pytest
|
||||
from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
|
||||
from agent_protocol_client import (
|
||||
AgentApi,
|
||||
ApiClient,
|
||||
Configuration as ClientConfig,
|
||||
Step,
|
||||
)
|
||||
from colorama import Fore, Style
|
||||
from openai import _load_client as get_openai_client
|
||||
from pydantic import BaseModel, constr, Field, validator
|
||||
|
@ -176,6 +181,7 @@ class BuiltinChallenge(BaseChallenge):
|
|||
n_steps = 0
|
||||
timed_out = None
|
||||
agent_task_cost = None
|
||||
steps: list[Step] = []
|
||||
try:
|
||||
async for step in self.run_challenge(
|
||||
config, timeout, mock=request.config.getoption("--mock")
|
||||
|
@ -184,6 +190,7 @@ class BuiltinChallenge(BaseChallenge):
|
|||
task_id = step.task_id
|
||||
|
||||
n_steps += 1
|
||||
steps.append(step.copy())
|
||||
if step.additional_output:
|
||||
agent_task_cost = step.additional_output.get(
|
||||
"task_total_cost",
|
||||
|
@ -192,6 +199,7 @@ class BuiltinChallenge(BaseChallenge):
|
|||
timed_out = False
|
||||
except TimeoutError:
|
||||
timed_out = True
|
||||
request.node.user_properties.append(("steps", steps))
|
||||
request.node.user_properties.append(("n_steps", n_steps))
|
||||
request.node.user_properties.append(("timed_out", timed_out))
|
||||
request.node.user_properties.append(("agent_task_cost", agent_task_cost))
|
||||
|
|
|
@ -396,6 +396,7 @@ class WebArenaChallenge(BaseChallenge):
|
|||
n_steps = 0
|
||||
timed_out = None
|
||||
agent_task_cost = None
|
||||
steps: list[Step] = []
|
||||
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
|
||||
try:
|
||||
async for step in self.run_challenge(
|
||||
|
@ -406,6 +407,7 @@ class WebArenaChallenge(BaseChallenge):
|
|||
continue
|
||||
|
||||
n_steps += 1
|
||||
steps.append(step)
|
||||
if step.additional_output:
|
||||
agent_task_cost = step.additional_output.get(
|
||||
"task_total_cost",
|
||||
|
@ -429,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
|
|||
timed_out = False
|
||||
except TimeoutError:
|
||||
timed_out = True
|
||||
request.node.user_properties.append(("steps", steps))
|
||||
request.node.user_properties.append(("n_steps", n_steps))
|
||||
request.node.user_properties.append(("timed_out", timed_out))
|
||||
request.node.user_properties.append(("agent_task_cost", agent_task_cost))
|
||||
|
|
|
@ -5,6 +5,7 @@ Model definitions used internally and for reports generated during command-line
|
|||
import logging
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from agent_protocol_client import Step
|
||||
from pydantic import BaseModel, Field, constr, validator
|
||||
|
||||
datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
|
||||
|
@ -24,6 +25,8 @@ class TestResult(BaseModel):
|
|||
"""Whether the run had to be stopped due to reaching the timeout"""
|
||||
n_steps: int | None = None
|
||||
"""The number of steps executed by the agent"""
|
||||
steps: list[Step] = []
|
||||
"""The steps generated by the agent"""
|
||||
cost: float | None = None
|
||||
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
|
||||
|
||||
|
|
|
@ -97,6 +97,7 @@ def add_test_result_to_report(
|
|||
),
|
||||
reached_cutoff=user_properties.get("timed_out", False),
|
||||
n_steps=user_properties.get("n_steps"),
|
||||
steps=user_properties.get("steps", []),
|
||||
cost=user_properties.get("agent_task_cost"),
|
||||
)
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue