feat(benchmark/report): Add and record `TestResult.n_steps`
- Added `n_steps` attribute to `TestResult` type
- Added logic to record the number of steps to `BuiltinChallenge.test_method`, `WebArenaChallenge.test_method`, and `.reports.add_test_result_to_report`
pull/6857/head
parent
a5de79beb6
commit
752bac099b
|
@ -173,6 +173,7 @@ class BuiltinChallenge(BaseChallenge):
|
||||||
timeout = int(cutoff) # type: ignore
|
timeout = int(cutoff) # type: ignore
|
||||||
|
|
||||||
task_id = ""
|
task_id = ""
|
||||||
|
n_steps = 0
|
||||||
timed_out = None
|
timed_out = None
|
||||||
try:
|
try:
|
||||||
async for step in self.run_challenge(
|
async for step in self.run_challenge(
|
||||||
|
@ -180,9 +181,11 @@ class BuiltinChallenge(BaseChallenge):
|
||||||
):
|
):
|
||||||
if not task_id:
|
if not task_id:
|
||||||
task_id = step.task_id
|
task_id = step.task_id
|
||||||
|
n_steps += 1
|
||||||
timed_out = False
|
timed_out = False
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
timed_out = True
|
timed_out = True
|
||||||
|
request.node.user_properties.append(("n_steps", n_steps))
|
||||||
request.node.user_properties.append(("timed_out", timed_out))
|
request.node.user_properties.append(("timed_out", timed_out))
|
||||||
|
|
||||||
agent_client_config = ClientConfig(host=config.host)
|
agent_client_config = ClientConfig(host=config.host)
|
||||||
|
|
|
@ -393,6 +393,7 @@ class WebArenaChallenge(BaseChallenge):
|
||||||
elif cutoff := request.config.getoption("--cutoff"):
|
elif cutoff := request.config.getoption("--cutoff"):
|
||||||
timeout = int(cutoff)
|
timeout = int(cutoff)
|
||||||
|
|
||||||
|
n_steps = 0
|
||||||
timed_out = None
|
timed_out = None
|
||||||
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
|
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
|
||||||
try:
|
try:
|
||||||
|
@ -402,6 +403,7 @@ class WebArenaChallenge(BaseChallenge):
|
||||||
if not step.output:
|
if not step.output:
|
||||||
logger.warn(f"Step has no output: {step}")
|
logger.warn(f"Step has no output: {step}")
|
||||||
continue
|
continue
|
||||||
|
n_steps += 1
|
||||||
step_eval_results = self.evaluate_step_result(
|
step_eval_results = self.evaluate_step_result(
|
||||||
step, mock=request.config.getoption("--mock")
|
step, mock=request.config.getoption("--mock")
|
||||||
)
|
)
|
||||||
|
@ -419,6 +421,7 @@ class WebArenaChallenge(BaseChallenge):
|
||||||
timed_out = False
|
timed_out = False
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
timed_out = True
|
timed_out = True
|
||||||
|
request.node.user_properties.append(("n_steps", n_steps))
|
||||||
request.node.user_properties.append(("timed_out", timed_out))
|
request.node.user_properties.append(("timed_out", timed_out))
|
||||||
|
|
||||||
# Get the column aggregate (highest score for each Eval)
|
# Get the column aggregate (highest score for each Eval)
|
||||||
|
|
|
@ -20,6 +20,8 @@ class TestResult(BaseModel):
|
||||||
"""If applicable, the reason why the run was not successful"""
|
"""If applicable, the reason why the run was not successful"""
|
||||||
reached_cutoff: bool | None = None # None if in progress
|
reached_cutoff: bool | None = None # None if in progress
|
||||||
"""Whether the run had to be stopped due to reaching the timeout"""
|
"""Whether the run had to be stopped due to reaching the timeout"""
|
||||||
|
n_steps: int | None = None
|
||||||
|
"""The number of steps executed by the agent"""
|
||||||
cost: float | None = None
|
cost: float | None = None
|
||||||
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
|
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
|
||||||
|
|
||||||
|
|
|
@ -92,6 +92,7 @@ def add_test_result_to_report(
|
||||||
run_time=f"{str(round(call.duration, 3))} seconds",
|
run_time=f"{str(round(call.duration, 3))} seconds",
|
||||||
fail_reason=str(call.excinfo.value) if call.excinfo else None,
|
fail_reason=str(call.excinfo.value) if call.excinfo else None,
|
||||||
reached_cutoff=user_properties.get("timed_out", False),
|
reached_cutoff=user_properties.get("timed_out", False),
|
||||||
|
n_steps=user_properties.get("n_steps"),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
test_report.metrics.success_percentage = (
|
test_report.metrics.success_percentage = (
|
||||||
|
|
Loading…
Reference in New Issue