#!/usr/bin/env python3

from pathlib import Path

import click

from agbenchmark.reports.processing.report_types import Report


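# Usage sketch (the script name below is an assumption; substitute the actual
# path of this module in your checkout):
#
#   ./format.py path/to/report.json > report.md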
@click.command()
@click.argument(
    "report_json_file", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
def print_markdown_report(report_json_file: Path):
    """
    Generates a Markdown report from a given report.json file and prints it
    to stdout.

    :param report_json_file: Path to the report.json file.
    """
    report = Report.model_validate_json(report_json_file.read_text())

    # Header and metadata
    click.echo("# Benchmark Report")
    click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`")
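    # [:16] keeps the ISO timestamp up to minute precision; swapping the 'T'
    # separator for '` `' renders the date and time as two separate code spans.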
    click.echo(
        f"  - **Started at:** `{report.benchmark_start_time[:16].replace('T', '` `')}`"
    )
    if report.completion_time:
        click.echo(
            f"  - **Completed at:** `{report.completion_time[:16].replace('T', '` `')}`"
        )
    if report.metrics.total_cost:
        click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`")
    click.echo(
        f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`"
    )
    click.echo(f"- ⚙️ **Command:** `{report.command}`")

    click.echo()  # spacing

    # Aggregate information
    successful, failed, unreliable = [], [], []
    for test in report.tests.values():
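        # success_percentage may be missing from the raw report; if so, derive
        # it from the individual attempt results.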
        test.metrics.success_percentage = (
            rsp
            if (rsp := test.metrics.success_percentage) is not None
            else sum(float(r.success or 0) for r in test.results)
            * 100
            / len(test.results)
        )
        if test.metrics.success_percentage == 100.0:
            successful.append(test)
        elif test.metrics.success_percentage == 0.0:
            failed.append(test)
        else:
            unreliable.append(test)

    # Summary
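    # One emoji per test, so each summary line doubles as a quick visual tally.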
click.echo("## Summary")
|
|
click.echo(f"- **`{len(successful)}` passed** {'✅'*len(successful)}")
|
|
click.echo(f"- **`{len(failed)}` failed** {'❌'*len(failed)}")
|
|
click.echo(f"- **`{len(unreliable)}` unreliable** {'⚠️'*len(unreliable)}")
|
|
|
|
click.echo() # spacing
|
|
|
|
# Test results
|
|
click.echo("## Challenges")
|
|
for test_name, test in report.tests.items():
|
|
click.echo() # spacing
|
|
|
|
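        # Pick a per-test indicator: ✅ all attempts passed, ⚠️ mixed results,
        # ❌ none passed; ❔ (below) marks tests that were never attempted.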
        result_indicator = (
            "✅"
            if test.metrics.success_percentage == 100.0
            else "⚠️"
            if test.metrics.success_percentage > 0
            else "❌"
        )
        click.echo(
            f"### {test_name} {result_indicator if test.metrics.attempted else '❔'}"
        )
        click.echo(f"{test.description}")

        click.echo()  # spacing

        click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}")
        click.echo(
            f"- **Success rate:** {round(test.metrics.success_percentage)}% "
            f"({len([r for r in test.results if r.success])}/{len(test.results)})"
        )
        click.echo(f"- **Difficulty:** `{test.difficulty}`")
        click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`")
        click.echo(
            f"<details>\n<summary><strong>Task</strong> (click to expand)</summary>\n\n"
            f"{indent('> ', test.task)}\n\n"
            f"Reference answer:\n{indent('> ', test.answer)}\n"
            "</details>"
        )

        click.echo()  # spacing

        click.echo("\n#### Attempts")
        for i, attempt in enumerate(test.results, 1):
            click.echo(
                f"\n{i}. **{'✅ Passed' if attempt.success else '❌ Failed'}** "
                f"in **{attempt.run_time}** "
                f"and **{quantify('step', attempt.n_steps)}**\n"
            )
            if attempt.cost is not None:
                click.echo(f"   - **Cost:** `${round(attempt.cost, 3)}`")
            if attempt.fail_reason:
                click.echo(
                    "   - **Failure reason:**\n"
                    + indent("     > ", attempt.fail_reason)
                    + "\n"
                )
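            # Indent the collapsible <details> block by three spaces so it stays
            # nested under the numbered attempt item in the rendered Markdown.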
            if attempt.steps:
                click.echo(
                    indent(
                        3 * " ",
                        "<details>\n<summary><strong>Steps</strong></summary>\n",
                    )
                )
                for j, step in enumerate(attempt.steps, 1):
                    click.echo()
                    click.echo(
                        indent(3 * " ", f"{j}. {indent(3*' ', step.output, False)}")
                    )
                click.echo("\n</details>")


def indent(indent: str, text: str, prefix_indent: bool = True) -> str:
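    """Prefix `text` with `indent` and indent every subsequent line by the same
    string; when `prefix_indent` is False, leave the first line unprefixed."""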
    return (indent if prefix_indent else "") + text.replace("\n", "\n" + indent)


def quantify(noun: str, count: int, plural_suffix: str = "s") -> str:
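    """Return `count` and `noun` as a phrase, e.g. "1 step" or "3 steps"."""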
    if count == 1:
        return f"{count} {noun}"
    return f"{count} {noun}{plural_suffix}"


if __name__ == "__main__":
    print_markdown_report()