feat(benchmark): Add reports/format.py script to convert report.json to markdown
parent
fb63bf4425
commit
bfd479a50b
|
@ -0,0 +1,136 @@
|
|||
import click
|
||||
|
||||
from agbenchmark.reports.processing.report_types import Report
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument("report_json_file", type=click.Path(exists=True, dir_okay=False))
|
||||
def print_markdown_report(report_json_file: str):
|
||||
"""
|
||||
Generates a Markdown report from a given report.json file.
|
||||
|
||||
:param report_json_file: Path to the report.json file.
|
||||
:return: A string containing the Markdown formatted report.
|
||||
"""
|
||||
report = Report.parse_file(report_json_file)
|
||||
|
||||
# Header and metadata
|
||||
click.echo(f"# Benchmark Report")
|
||||
click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`")
|
||||
click.echo(
|
||||
f" - **Started at:** `{report.benchmark_start_time[:16].replace('T', '` `')}`"
|
||||
)
|
||||
if report.completion_time:
|
||||
click.echo(
|
||||
f" - **Completed at:** `{report.completion_time[:16].replace('T', '` `')}`"
|
||||
)
|
||||
if report.metrics.total_cost:
|
||||
click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`")
|
||||
click.echo(
|
||||
f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`"
|
||||
)
|
||||
click.echo(f"- ⚙️ **Command:** `{report.command}`")
|
||||
|
||||
click.echo() # spacing
|
||||
|
||||
# Aggregate information
|
||||
successful, failed, unreliable = [], [], []
|
||||
for test in report.tests.values():
|
||||
test.metrics.success_percentage = (
|
||||
rsp
|
||||
if (rsp := test.metrics.success_percentage) is not None
|
||||
else sum(float(r.success or 0) for r in test.results)
|
||||
* 100
|
||||
/ len(test.results)
|
||||
)
|
||||
if test.metrics.success_percentage == 100.0:
|
||||
successful.append(test)
|
||||
elif test.metrics.success_percentage == 0.0:
|
||||
failed.append(test)
|
||||
else:
|
||||
unreliable.append(test)
|
||||
|
||||
# Summary
|
||||
click.echo("## Summary")
|
||||
click.echo(f"- **`{len(successful)}` passed** {'✅'*len(successful)}")
|
||||
click.echo(f"- **`{len(failed)}` failed** {'❌'*len(failed)}")
|
||||
click.echo(f"- **`{len(unreliable)}` unreliable** {'⚠️'*len(unreliable)}")
|
||||
|
||||
click.echo() # spacing
|
||||
|
||||
# Test results
|
||||
click.echo("## Challenges")
|
||||
for test_name, test in report.tests.items():
|
||||
click.echo() # spacing
|
||||
|
||||
result_indicator = (
|
||||
"✅"
|
||||
if test.metrics.success_percentage == 100.0
|
||||
else "⚠️"
|
||||
if test.metrics.success_percentage > 0
|
||||
else "❌"
|
||||
)
|
||||
click.echo(
|
||||
f"### {test_name} {result_indicator if test.metrics.attempted else '❔'}"
|
||||
)
|
||||
click.echo(f"{test.description}")
|
||||
|
||||
click.echo() # spacing
|
||||
|
||||
click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}")
|
||||
click.echo(
|
||||
f"- **Success rate:** {round(test.metrics.success_percentage)}% "
|
||||
f"({len([r for r in test.results if r.success])}/{len(test.results)})"
|
||||
)
|
||||
click.echo(f"- **Difficulty:** `{test.difficulty}`")
|
||||
click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`")
|
||||
click.echo(
|
||||
f"<details>\n<summary><strong>Task</strong> (click to expand)</summary>\n\n"
|
||||
f"{indent('> ', test.task)}\n\n"
|
||||
f"Reference answer:\n{indent('> ', test.answer)}\n"
|
||||
"</details>"
|
||||
)
|
||||
|
||||
click.echo() # spacing
|
||||
|
||||
click.echo("\n#### Attempts")
|
||||
for i, attempt in enumerate(test.results, 1):
|
||||
click.echo(
|
||||
f"\n{i}. **{'✅ Passed' if attempt.success else '❌ Failed'}** "
|
||||
f"in **{attempt.run_time}** "
|
||||
f"and **{quantify('step', attempt.n_steps)}**\n"
|
||||
)
|
||||
click.echo(f" - **Cost:** `${round(attempt.cost, 3)}`")
|
||||
if attempt.fail_reason:
|
||||
click.echo(
|
||||
" - **Failure reason:**\n"
|
||||
+ indent(" > ", attempt.fail_reason)
|
||||
+ "\n"
|
||||
)
|
||||
if attempt.steps:
|
||||
click.echo(
|
||||
indent(
|
||||
3 * " ",
|
||||
"<details>\n<summary><strong>Steps</strong></summary>\n",
|
||||
)
|
||||
)
|
||||
for j, step in enumerate(attempt.steps, 1):
|
||||
click.echo()
|
||||
click.echo(
|
||||
indent(3 * " ", f"{j}. {indent(3*' ', step.output, False)}")
|
||||
)
|
||||
click.echo("\n</details>")
|
||||
|
||||
|
||||
def indent(indent: str, text: str, prefix_indent: bool = True) -> str:
|
||||
return (indent if prefix_indent else "") + text.replace("\n", "\n" + indent)
|
||||
|
||||
|
||||
def quantify(noun: str, count: int, plural_suffix: str = "s") -> str:
|
||||
if count == 1:
|
||||
return f"{count} {noun}"
|
||||
return f"{count} {noun}{plural_suffix}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print_markdown_report()
|
Loading…
Reference in New Issue