AutoGPT/classic/benchmark/reports/match_records.py

330 lines
12 KiB
Python
Raw Normal View History

import glob
2023-09-22 03:06:37 +00:00
import json
import os
2023-09-12 01:20:03 +00:00
from typing import Dict, List, Optional, Union
2023-09-22 03:06:37 +00:00
import pandas as pd
from gql import Client, gql
from gql.transport.aiohttp import AIOHTTPTransport
2023-09-12 01:20:03 +00:00
from pydantic import BaseModel, Field
2023-09-22 03:06:37 +00:00
# from agbenchmark.reports.processing.report_types import Report, SuiteTest
2023-09-12 01:20:03 +00:00
class Metrics(BaseModel):
    """Per-test metrics as stored in a run's report.json."""

    difficulty: str
    success: bool
    # The JSON key is "success_%"; pydantic fills this via the alias.
    success_percent: float = Field(alias="success_%")
    run_time: Optional[str] = None
    fail_reason: Optional[str] = None
    attempted: Optional[bool] = None
class MetricsOverall(BaseModel):
    """Aggregate metrics for a whole benchmark run or a test suite."""

    run_time: str
    highest_difficulty: str
    # Success percentage; only present on some report variants.
    percentage: Optional[float] = None
class Test(BaseModel):
    """A single benchmark test entry inside a report."""

    data_path: str
    is_regression: bool
    answer: str
    description: str
    metrics: Metrics
    category: List[str]
    task: Optional[str] = None
    reached_cutoff: Optional[bool] = None
class SuiteTest(BaseModel):
    """A suite entry grouping several sub-tests.

    When ``category`` is set the suite represents one shared task;
    otherwise each sub-test in ``tests`` carries its own task.
    """

    data_path: str
    metrics: MetricsOverall
    tests: Dict[str, Test]
    category: Optional[List[str]] = None
    task: Optional[str] = None
    reached_cutoff: Optional[bool] = None
class Report(BaseModel):
    """Top-level schema of a report.json produced by agbenchmark."""

    command: str
    completion_time: str
    benchmark_start_time: str
    metrics: MetricsOverall
    tests: Dict[str, Union[Test, SuiteTest]]
    config: Dict[str, str | dict[str, str]]
def get_reports():
    """Scan the local ``reports`` tree and flatten every report.json.

    Walks ``reports/<agent>/<run>/report.json``, validates each file
    against the ``Report`` model, and emits one record per test (or per
    sub-test for separate-task suites).

    Returns:
        pd.DataFrame: rows with columns agent, benchmark_start_time,
        challenge, attempted, categories, task, success, difficulty,
        success_%, run_time, is_regression.
    """
    report_data = []

    # Allow running either from the repo root or from inside reports/.
    # BUG FIX: the original used "/" (filesystem root) when already in
    # reports/, which would scan the whole machine; "." is the intent.
    current_dir = os.getcwd()
    reports_dir = "." if current_dir.endswith("reports") else "reports"

    # Each agent owns a directory of timestamped run directories.
    for agent_name in os.listdir(reports_dir):
        if agent_name is None:
            continue
        agent_dir = os.path.join(reports_dir, agent_name)
        if not os.path.isdir(agent_dir):
            continue

        # glob returns files too; keep only run *directories*.
        run_dirs = glob.glob(os.path.join(agent_dir, "*"))
        report_files = [
            os.path.join(run_dir, "report.json")
            for run_dir in run_dirs
            if os.path.isdir(run_dir)
        ]

        for report_file in report_files:
            # A run directory may exist without a report.json.
            if not os.path.isfile(report_file):
                continue
            with open(report_file, "r") as f:
                json_data = json.load(f)
            print(f"Processing {report_file}")
            report = Report.model_validate(json_data)

            for test_name, test_data in report.tests.items():
                # Fields shared by every record from this report.
                base = {
                    "agent": agent_name.lower(),
                    "benchmark_start_time": report.benchmark_start_time,
                }
                if isinstance(test_data, SuiteTest):
                    if test_data.category:
                        # Same-task suite: one aggregate record; pull the
                        # per-test flags from the first sub-test.
                        first_test = test_data.tests[next(iter(test_data.tests))]
                        row = dict(base)
                        row["challenge"] = test_name
                        row["attempted"] = first_test.metrics.attempted
                        row["categories"] = ", ".join(test_data.category)
                        row["task"] = test_data.task
                        row["success"] = test_data.metrics.percentage
                        row["difficulty"] = test_data.metrics.highest_difficulty
                        row["success_%"] = test_data.metrics.percentage
                        row["run_time"] = test_data.metrics.run_time
                        row["is_regression"] = first_test.is_regression
                        report_data.append(row)
                    else:
                        # Separate tasks in one suite: one record per
                        # sub-test. BUG FIX: the original mutated a single
                        # dict inside this loop and appended it once, so
                        # only the last sub-test was kept.
                        for suite_test_name, suite_data in test_data.tests.items():
                            row = dict(base)
                            row["challenge"] = suite_test_name
                            row["attempted"] = suite_data.metrics.attempted
                            row["categories"] = ", ".join(suite_data.category)
                            row["task"] = suite_data.task
                            row["success"] = (
                                100.0 if suite_data.metrics.success else 0
                            )
                            row["difficulty"] = suite_data.metrics.difficulty
                            # BUG FIX: the Metrics field is success_percent
                            # (alias "success_%"); .success_percentage
                            # raised AttributeError.
                            row["success_%"] = suite_data.metrics.success_percent
                            row["run_time"] = suite_data.metrics.run_time
                            row["is_regression"] = suite_data.is_regression
                            report_data.append(row)
                else:
                    # Plain (non-suite) test entry.
                    row = dict(base)
                    row["challenge"] = test_name
                    row["attempted"] = test_data.metrics.attempted
                    row["categories"] = ", ".join(test_data.category)
                    row["task"] = test_data.task
                    row["success"] = 100.0 if test_data.metrics.success else 0
                    row["difficulty"] = test_data.metrics.difficulty
                    # BUG FIX: same success_percent/.success_percentage
                    # mismatch as above.
                    row["success_%"] = test_data.metrics.success_percent
                    row["run_time"] = test_data.metrics.run_time
                    row["is_regression"] = test_data.is_regression
                    report_data.append(row)
    return pd.DataFrame(report_data)
def get_helicone_data():
    """Fetch all request records from the Helicone GraphQL API.

    Pages through ``heliconeRequest`` in batches of 250 until an empty
    page or a request error, flattens each record's properties, and
    returns them as a DataFrame. Rows with no ``agent`` property are
    dropped and agent names are lower-cased to match report data.

    Requires the HELICONE_API_KEY environment variable.
    """
    helicone_api_key = os.getenv("HELICONE_API_KEY")
    url = "https://www.helicone.ai/api/graphql"
    transport = AIOHTTPTransport(
        url=url, headers={"authorization": f"Bearer {helicone_api_key}"}
    )
    client = Client(transport=transport, fetch_schema_from_transport=True)

    SIZE = 250
    # Hoisted out of the loop: the query text never changes between pages.
    query = gql(
        """
        query ExampleQuery($limit: Int, $offset: Int){
            heliconeRequest(
                limit: $limit
                offset: $offset
            ) {
                costUSD
                prompt
                properties{
                    name
                    value
                }
                requestBody
                response
                createdAt
            }
        }
        """
    )

    i = 0
    data = []
    print("Fetching data from Helicone")
    while True:
        print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records")
        try:
            result = client.execute(
                query, variable_values={"limit": SIZE, "offset": i * SIZE}
            )
        except Exception as e:
            # Best-effort: a failed page ends pagination below.
            print(f"Error occurred: {e}")
            result = None
        i += 1
        if result:
            for item in result["heliconeRequest"]:
                # Flatten the name/value property pairs into a dict.
                properties = {
                    prop["name"]: prop["value"] for prop in item["properties"]
                }
                data.append(
                    {
                        "createdAt": item["createdAt"],
                        "agent": properties.get("agent"),
                        "costUSD": item["costUSD"],
                        "job_id": properties.get("job_id"),
                        "challenge": properties.get("challenge"),
                        "benchmark_start_time": properties.get(
                            "benchmark_start_time"
                        ),
                        "prompt": item["prompt"],
                        "response": item["response"],
                        "model": item["requestBody"].get("model"),
                        "request": item["requestBody"].get("messages"),
                    }
                )
        if not result or (len(result["heliconeRequest"]) == 0):
            print("No more results")
            break

    df = pd.DataFrame(data)
    # Drop rows where the agent property was never set.
    df = df.dropna(subset=["agent"])
    # Normalize agent names for joining against report data.
    df["agent"] = df["agent"].str.lower()
    return df
# Cached inputs: rebuilding the Helicone dataset is slow, so both raw
# frames are pickled on first build and reloaded on subsequent runs.
if all(os.path.exists(p) for p in ("raw_reports.pkl", "raw_helicone.pkl")):
    reports_df = pd.read_pickle("raw_reports.pkl")
    helicone_df = pd.read_pickle("raw_helicone.pkl")
else:
    reports_df = get_reports()
    reports_df.to_pickle("raw_reports.pkl")
    helicone_df = get_helicone_data()
    helicone_df.to_pickle("raw_helicone.pkl")
def try_formats(date_str):
    """Parse *date_str* with the known benchmark timestamp formats.

    Tries each format in turn and returns the first successfully parsed
    ``pd.Timestamp``; returns ``None`` when no format matches.
    """
    for candidate in ("%Y-%m-%d-%H:%M", "%Y-%m-%dT%H:%M:%S%z"):
        try:
            parsed = pd.to_datetime(date_str, format=candidate)
        except ValueError:
            continue
        return parsed
    return None
# Normalize timestamps on both frames so they can be joined on equality.
_helicone_start = helicone_df["benchmark_start_time"].apply(try_formats)
helicone_df["benchmark_start_time"] = pd.to_datetime(_helicone_start, utc=True)
helicone_df = helicone_df.dropna(subset=["benchmark_start_time"])
helicone_df["createdAt"] = pd.to_datetime(
    helicone_df["createdAt"], unit="ms", origin="unix"
)

_reports_start = reports_df["benchmark_start_time"].apply(try_formats)
reports_df["benchmark_start_time"] = pd.to_datetime(_reports_start, utc=True)
reports_df = reports_df.dropna(subset=["benchmark_start_time"])

# Sanity checks: the join key must be a datetime dtype on both sides.
assert pd.api.types.is_datetime64_any_dtype(
    helicone_df["benchmark_start_time"]
), "benchmark_start_time in helicone_df is not datetime"
assert pd.api.types.is_datetime64_any_dtype(
    reports_df["benchmark_start_time"]
), "benchmark_start_time in reports_df is not datetime"

# Keep the report timestamp under its own name before merging.
reports_df["report_time"] = reports_df["benchmark_start_time"]

# Exact equality join on (start time, agent, challenge).
df = pd.merge(
    helicone_df,
    reports_df,
    on=["benchmark_start_time", "agent", "challenge"],
    how="inner",
)

df.to_pickle("df.pkl")
print(df.info())
print("Data saved to df.pkl")
print("To load the data use: df = pd.read_pickle('df.pkl')")