small data changes

2023-09-11 18:20:03 -07:00 · 2023-09-11 18:20:03 -07:00 · 39efed59af
parent a5a9142b20
commit 39efed59af
5 changed files with 189 additions and 93 deletions
--- a/benchmark/frontend/package-lock.json
+++ b/benchmark/frontend/package-lock.json
@ -4254,6 +4254,126 @@
      "funding": {
        "url": "https://github.com/sponsors/colinhacks"
      }
+    },
+    "node_modules/@next/swc-darwin-arm64": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-13.4.13.tgz",
+      "integrity": "sha512-ZptVhHjzUuivnXMNCJ6lER33HN7lC+rZ01z+PM10Ows21NHFYMvGhi5iXkGtBDk6VmtzsbqnAjnx4Oz5um0FjA==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-darwin-x64": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-13.4.13.tgz",
+      "integrity": "sha512-t9nTiWCLApw8W4G1kqJyYP7y6/7lyal3PftmRturIxAIBlZss9wrtVN8nci50StDHmIlIDxfguYIEGVr9DbFTg==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-arm64-gnu": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-13.4.13.tgz",
+      "integrity": "sha512-xEHUqC8eqR5DHe8SOmMnDU1K3ggrJ28uIKltrQAwqFSSSmzjnN/XMocZkcVhuncuxYrpbri0iMQstRyRVdQVWg==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-arm64-musl": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-13.4.13.tgz",
+      "integrity": "sha512-sNf3MnLAm8rquSSAoeD9nVcdaDeRYOeey4stOWOyWIgbBDtP+C93amSgH/LPTDoUV7gNiU6f+ghepTjTjRgIUQ==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-x64-gnu": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-13.4.13.tgz",
+      "integrity": "sha512-WhcRaJJSHyx9OWmKjjz+OWHumiPZWRqmM/09Bt7Up4UqUJFFhGExeztR4trtv3rflvULatu9IH/nTV8fUUgaMA==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-linux-x64-musl": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-13.4.13.tgz",
+      "integrity": "sha512-+Y4LLhOWWZQIDKVwr2R17lq2KSN0F1c30QVgGIWfnjjHpH8nrIWHEndhqYU+iFuW8It78CiJjQKTw4f51HD7jA==",
+      "cpu": [
+        "x64"
+      ],
+      "optional": true,
+      "os": [
+        "linux"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-win32-arm64-msvc": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-13.4.13.tgz",
+      "integrity": "sha512-rWurdOR20uxjfqd1X9vDAgv0Jb26KjyL8akF9CBeFqX8rVaBAnW/Wf6A2gYEwyYY4Bai3T7p1kro6DFrsvBAAw==",
+      "cpu": [
+        "arm64"
+      ],
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
+    },
+    "node_modules/@next/swc-win32-ia32-msvc": {
+      "version": "13.4.13",
+      "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-13.4.13.tgz",
+      "integrity": "sha512-E8bSPwRuY5ibJ3CzLQmJEt8qaWrPYuUTwnrwygPUEWoLzD5YRx9SD37oXRdU81TgGwDzCxpl7z5Nqlfk50xAog==",
+      "cpu": [
+        "ia32"
+      ],
+      "optional": true,
+      "os": [
+        "win32"
+      ],
+      "engines": {
+        "node": ">= 10"
+      }
    }
  }
 }
--- a/benchmark/frontend/prisma/schema.prisma
+++ b/benchmark/frontend/prisma/schema.prisma
@ -1,81 +0,0 @@
-// This is your Prisma schema file,
-// learn more about it in the docs: https://pris.ly/d/prisma-schema
-
-generator client {
-    provider = "prisma-client-js"
-}
-
-datasource db {
-    provider = "sqlite"
-    url      = env("DATABASE_URL")
-}
-
-model Metrics {
-    id             Int     @id @default(autoincrement())
-    difficulty     String
-    success        Boolean
-    successPercent Float
-    runTime        String?
-    failReason     String?
-    Test           Test[]
-}
-
-model MetricsOverall {
-    id                Int         @id @default(autoincrement())
-    runTime           String
-    highestDifficulty String
-    percentage        Float?
-    SuiteTest         SuiteTest[]
-    Report            Report[]
-}
-
-model Test {
-    id            Int       @id @default(autoincrement())
-    dataPath      String
-    isRegression  Boolean
-    answer        String
-    description   String
-    metricsId     Int
-    metrics       Metrics   @relation(fields: [metricsId], references: [id])
-    categoryId    Int?
-    category      Category? @relation(fields: [categoryId], references: [id])
-    task          String?
-    reachedCutoff Boolean?
-}
-
-model SuiteTest {
-    id               Int            @id @default(autoincrement())
-    dataPath         String
-    metricsOverallId Int
-    metricsOverall   MetricsOverall @relation(fields: [metricsOverallId], references: [id])
-    categoryId       Int?
-    category         Category?      @relation(fields: [categoryId], references: [id])
-    task             String?
-    reachedCutoff    Boolean?
-}
-
-model Category {
-    id         Int         @id @default(autoincrement())
-    name       String      @unique
-    tests      Test[]
-    suiteTests SuiteTest[]
-}
-
-model Report {
-    id                 Int            @id @default(autoincrement())
-    command            String
-    completionTime     String
-    benchmarkStartTime String
-    metricsOverallId   Int
-    metricsOverall     MetricsOverall @relation(fields: [metricsOverallId], references: [id])
-    configKey          String
-    configValue        String
-    agentId            Int
-    agent              Agent          @relation(fields: [agentId], references: [id])
-}
-
-model Agent {
-    id      Int      @id @default(autoincrement())
-    name    String   @unique
-    reports Report[]
-}
--- a/benchmark/frontend/src/env.mjs
+++ b/benchmark/frontend/src/env.mjs
@ -7,7 +7,7 @@ export const env = createEnv({
   * isn't built with invalid env vars.
   */
  server: {
-    DATABASE_URL: z.string().url(),
+    // DATABASE_URL: z.string().url(),
    NODE_ENV: z.enum(["development", "test", "production"]),
  },

@ -25,7 +25,7 @@ export const env = createEnv({
   * middlewares) or client-side so we need to destruct manually.
   */
  runtimeEnv: {
-    DATABASE_URL: process.env.DATABASE_URL,
+    // DATABASE_URL: process.env.DATABASE_URL,
    NODE_ENV: process.env.NODE_ENV,
    // NEXT_PUBLIC_CLIENTVAR: process.env.NEXT_PUBLIC_CLIENTVAR,
  },
--- a/benchmark/paper/combined_data.ipynb
+++ b/benchmark/paper/combined_data.ipynb
@ -51,7 +51,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "df.dropna(subset=['benchmark_start_time', 'response', 'model'], inplace=True)"
+    "df.dropna(subset=['benchmark_start_time', 'response', 'model', 'agent'], inplace=True)"
   ]
  },
  {
@ -684,7 +684,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
@ -708,8 +708,8 @@
    "    return x\n",
    "\n",
    "challenge = \"TestRememberMultipleIds\"\n",
-    "agent_array = ['beebot'] # df['agent'].unique()\n",
-    "request_type = 'request' # 'request' or 'response'\n",
+    "agent_array = df['agent'].unique()\n",
+    "request_type = 'response' # 'request' or 'response'\n",
    "\n",
    "# Loop through unique agents\n",
    "for agent in agent_array:\n",
@ -1494,7 +1494,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
@ -1533,7 +1533,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
--- a/benchmark/reports/match_records.py
+++ b/benchmark/reports/match_records.py
@ -6,7 +6,55 @@ from gql.transport.aiohttp import AIOHTTPTransport
 from gql import gql, Client
 import os

-from agbenchmark.reports.processing.report_types import Report, SuiteTest
+# from agbenchmark.reports.processing.report_types import Report, SuiteTest
+
+from typing import Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class Metrics(BaseModel):
+    """Metrics recorded for a single test entry (see ``Test.metrics``).
+
+    NOTE(review): this looks like a local copy of the model from
+    ``agbenchmark.reports.processing.report_types`` (whose import is commented
+    out above) -- confirm the two stay in sync.
+    """
+
+    difficulty: str
+    success: bool
+    # Required field; the attribute is ``success_percent`` but the value is
+    # read from the "success_%" key of the input because of the alias.
+    success_percent: float = Field(..., alias="success_%")
+    # The remaining fields are optional and default to None when absent.
+    run_time: Optional[str] = None
+    fail_reason: Optional[str] = None
+    attempted: Optional[bool] = None
+
+
+class MetricsOverall(BaseModel):
+    """Summary metrics used as the ``metrics`` field of ``SuiteTest`` and ``Report``."""
+
+    run_time: str
+    highest_difficulty: str
+    # Optional; defaults to None when the input omits it.
+    percentage: Optional[float] = None
+
+
+class Test(BaseModel):
+    """A single test entry, stored by name in ``Report.tests`` or ``SuiteTest.tests``."""
+
+    data_path: str
+    is_regression: bool
+    answer: str
+    description: str
+    # Nested per-test metrics, validated with the ``Metrics`` model.
+    metrics: Metrics
+    # Required list of category names (contrast with SuiteTest, where it is optional).
+    category: List[str]
+    # Optional fields default to None when absent from the input.
+    task: Optional[str] = None
+    reached_cutoff: Optional[bool] = None
+
+
+class SuiteTest(BaseModel):
+    """A suite entry: a named collection of ``Test`` objects with summary metrics."""
+
+    data_path: str
+    # Suite-level metrics use ``MetricsOverall`` rather than the per-test ``Metrics``.
+    metrics: MetricsOverall
+    # Member tests keyed by test name.
+    tests: Dict[str, Test]
+    # Optional fields default to None when absent from the input.
+    category: Optional[List[str]] = None
+    task: Optional[str] = None
+    reached_cutoff: Optional[bool] = None
+
+
+class Report(BaseModel):
+    command: str
+    completion_time: str
+    benchmark_start_time: str
+    metrics: MetricsOverall
+    tests: Dict[str, Union[Test, SuiteTest]]
+    config: Dict[str, str | dict[str, str]]


 def get_reports():
@ -31,13 +79,21 @@ def get_reports():
        # Check if the item is a directory (an agent directory)
        if os.path.isdir(agent_dir):
            # Construct the path to the report.json file
-            # Use glob to find all run directories in the agent_dir
+            # Get all directories and files, but note that this will also include any file, not just directories.
            run_dirs = glob.glob(os.path.join(agent_dir, "*"))

+            # Get all json files starting with 'file'
+            # old_report_files = glob.glob(os.path.join(agent_dir, "file*.json"))
+
            # For each run directory, add the report.json to the end
+            # Only include the path if it's actually a directory
            report_files = [
-                os.path.join(run_dir, "report.json") for run_dir in run_dirs
+                os.path.join(run_dir, "report.json")
+                for run_dir in run_dirs
+                if os.path.isdir(run_dir)
            ]
+            # old_report_files already contains the full paths, so no need to join again
+            # report_files = report_files + old_report_files
            for report_file in report_files:
                # Check if the report.json file exists
                if os.path.isfile(report_file):
@ -45,6 +101,7 @@ def get_reports():
                    with open(report_file, "r") as f:
                        # Load the JSON data from the file
                        json_data = json.load(f)
+                        print(f"Processing {report_file}")
                        report = Report.parse_obj(json_data)

                        for test_name, test_data in report.tests.items():
@ -265,7 +322,7 @@ df = pd.merge(
    helicone_df,
    reports_df,
    on=["benchmark_start_time", "agent", "challenge"],
-    how="left",
+    how="inner",
 )

 df.to_pickle("df.pkl")