From 87753dbeb5a2284f4cc27f09164826ddff8edeed Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 1 Aug 2023 18:46:00 -0700 Subject: [PATCH] Remove high costs (#235) --- .../Auto-GPT/folder10_08-01-02-43/report.json | 20 +++--- .../Auto-GPT/folder11_08-01-03-21/report.json | 62 +++++++++---------- .../BabyAGI/folder11_08-01-02-46/report.json | 16 ++--- .../BabyAGI/folder12_08-01-03-23/report.json | 62 +++++++++---------- .../BabyAGI/folder16_08-01-19-54/report.json | 4 +- .../beebot/folder12_08-01-03-21/report.json | 62 +++++++++---------- .../folder11_08-01-02-42/report.json | 12 ++-- .../folder10_08-01-02-42/report.json | 26 ++++---- .../folder11_08-01-03-20/report.json | 62 +++++++++---------- .../mini-agi/folder12_08-01-02-43/report.json | 4 +- .../mini-agi/folder13_08-01-03-21/report.json | 62 +++++++++---------- .../folder12_08-01-03-21/report.json | 62 +++++++++---------- 12 files changed, 227 insertions(+), 227 deletions(-) diff --git a/reports/Auto-GPT/folder10_08-01-02-43/report.json b/reports/Auto-GPT/folder10_08-01-02-43/report.json index 3374140a9..d2ee74164 100644 --- a/reports/Auto-GPT/folder10_08-01-02-43/report.json +++ b/reports/Auto-GPT/folder10_08-01-02-43/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "43.156 seconds" }, "reached_cutoff": false @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "39.967 seconds" }, "reached_cutoff": false @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "49.456 seconds" }, "reached_cutoff": false @@ -77,7 +77,7 @@ "success": false, "fail_reason": "assert 1 in [0.0, 0.0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "50.779 seconds" }, "reached_cutoff": false @@ -95,7 +95,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "46.428 seconds" }, "reached_cutoff": false @@ -114,7 +114,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -133,7 +133,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.017 seconds" }, "reached_cutoff": true @@ -152,7 +152,7 @@ "difficulty": "novice", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "75.049 seconds" }, "reached_cutoff": true @@ -180,7 +180,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.05 seconds" }, "reached_cutoff": true @@ -192,4 +192,4 @@ "workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks" } -} \ No newline at end of file +} diff --git a/reports/Auto-GPT/folder11_08-01-03-21/report.json b/reports/Auto-GPT/folder11_08-01-03-21/report.json index 27114b7a6..3dc2c418a 100644 --- a/reports/Auto-GPT/folder11_08-01-03-21/report.json +++ b/reports/Auto-GPT/folder11_08-01-03-21/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1343.3564774999995, + "cost": null, "run_time": "43.121 seconds" }, "reached_cutoff": false @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1345.506869499999, + "cost": null, "run_time": "61.122 seconds" }, "reached_cutoff": true @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1348.7567854999993, + "cost": null, "run_time": "60.227 seconds" }, "reached_cutoff": true @@ -77,7 +77,7 @@ "success": false, "fail_reason": "assert 1 in [0.0, 0.0]", "success_%": 0.0, - "cost": 1351.2083754999996, + "cost": null, "run_time": "44.372 seconds" }, "reached_cutoff": false @@ -95,7 +95,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1354.0949554999993, + "cost": null, "run_time": "56.795 seconds" }, "reached_cutoff": false @@ -114,7 +114,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1354.2449554999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -133,7 +133,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1357.2560394999998, + "cost": null, "run_time": "60.421 seconds" }, "reached_cutoff": true @@ -152,7 +152,7 @@ "difficulty": "novice", "success": true, "success_%": 100.0, - "cost": 1364.7770254999998, + "cost": null, "run_time": "75.062 seconds" }, "reached_cutoff": true @@ -171,7 +171,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1368.2915574999995, + "cost": null, "run_time": "60.026 seconds" }, "reached_cutoff": true @@ -190,7 +190,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1368.2918914999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -204,7 +204,7 @@ "metrics": { "percentage": 0, "highest_difficulty": "No successful tests", - "cost": 1368.2918914999993, + "cost": null, "run_time": "0.004 seconds" }, "tests": { @@ -268,7 +268,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1371.3122634999995, + "cost": null, "run_time": "75.083 seconds" }, "reached_cutoff": true @@ -287,7 +287,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1375.6546774999997, + "cost": null, "run_time": "75.068 seconds" }, "reached_cutoff": true @@ -307,7 +307,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1375.6546774999997, + "cost": null, "run_time": "0.003 seconds" }, "reached_cutoff": false @@ -326,7 +326,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.1842814999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -345,7 +345,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.1842814999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -365,7 +365,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.6591454999998, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -384,7 +384,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.7042934999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -403,7 +403,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]", "success_%": 0.0, - "cost": 1376.7042935, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -422,7 +422,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]", "success_%": 0.0, - "cost": 1376.7042935, + "cost": null, "run_time": "0.003 seconds" }, "reached_cutoff": false @@ -442,7 +442,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.7042935, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -461,7 +461,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.7357334999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -489,7 +489,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1360.5222974999997, + "cost": null, "run_time": "60.061 seconds" }, "reached_cutoff": true @@ -509,7 +509,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1368.3237474999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -529,7 +529,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.2331374999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -549,7 +549,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.7042934999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -579,7 +579,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1375.6834654999998, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -599,7 +599,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.0837854999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -619,7 +619,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.0871414999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -639,7 +639,7 @@ "success": false, "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1376.0871414999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -651,4 +651,4 @@ "workspace": "auto_gpt_workspace", "entry_path": "agbenchmark.benchmarks" } -} \ No newline at end of file +} diff --git a/reports/BabyAGI/folder11_08-01-02-46/report.json b/reports/BabyAGI/folder11_08-01-02-46/report.json index 2eef1f0ad..fea82f186 100644 --- a/reports/BabyAGI/folder11_08-01-02-46/report.json +++ b/reports/BabyAGI/folder11_08-01-02-46/report.json @@ -21,7 +21,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.154 seconds" }, "reached_cutoff": true @@ -40,7 +40,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestPlanCreation::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999999, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -60,7 +60,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalDivergence::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -79,7 +79,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -98,7 +98,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -117,7 +117,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -136,7 +136,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -145,4 +145,4 @@ "config": { "workspace": "babycoder/playground" } -} \ No newline at end of file +} diff --git a/reports/BabyAGI/folder12_08-01-03-23/report.json b/reports/BabyAGI/folder12_08-01-03-23/report.json index 6cc03e5f1..67565356b 100644 --- a/reports/BabyAGI/folder12_08-01-03-23/report.json +++ b/reports/BabyAGI/folder12_08-01-03-23/report.json @@ -21,7 +21,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1349.5450334999996, + "cost": null, "run_time": "60.118 seconds" }, "reached_cutoff": true @@ -40,7 +40,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestPlanCreation::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1349.7110934999992, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -60,7 +60,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalDivergence::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1349.7220814999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -79,7 +79,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1349.771653499999, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -98,7 +98,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1349.7908734999999, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -117,7 +117,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.0636414999997, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -136,7 +136,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.1109494999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -156,7 +156,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.1949954999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -175,7 +175,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.2521314999997, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -194,7 +194,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.3188514999995, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -208,7 +208,7 @@ "metrics": { "percentage": 0, "highest_difficulty": "No successful tests", - "cost": 1350.3505874999994, + "cost": null, "run_time": "0.003 seconds" }, "tests": { @@ -272,7 +272,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.3895874999992, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -291,7 +291,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.3895874999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -311,7 +311,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.3895874999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -330,7 +330,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.758855499999, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -349,7 +349,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.7791274999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -369,7 +369,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.7791274999997, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -388,7 +388,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.7791274999997, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -407,7 +407,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]", "success_%": 0.0, - "cost": 1350.8574474999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -426,7 +426,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]", "success_%": 0.0, - "cost": 1350.8574474999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -446,7 +446,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.8574474999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -465,7 +465,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.8805354999995, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -493,7 +493,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.1566454999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -513,7 +513,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.3505874999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -533,7 +533,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.7791274999988, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -553,7 +553,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.8805354999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -583,7 +583,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.4219954999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -603,7 +603,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.4706314999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -623,7 +623,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.5281074999991, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -643,7 +643,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1350.5744154999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -654,4 +654,4 @@ "config": { "workspace": "babycoder/playground" } -} \ No newline at end of file +} diff --git a/reports/BabyAGI/folder16_08-01-19-54/report.json b/reports/BabyAGI/folder16_08-01-19-54/report.json index 6d76ebcbc..68ae05e62 100644 --- a/reports/BabyAGI/folder16_08-01-19-54/report.json +++ b/reports/BabyAGI/folder16_08-01-19-54/report.json @@ -330,7 +330,7 @@ "success": false, "fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1571.2554915, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -654,4 +654,4 @@ "config": { "workspace": "babycoder/playground" } -} \ No newline at end of file +} diff --git a/reports/beebot/folder12_08-01-03-21/report.json b/reports/beebot/folder12_08-01-03-21/report.json index 0c5e64cb5..6435d48b5 100644 --- a/reports/beebot/folder12_08-01-03-21/report.json +++ b/reports/beebot/folder12_08-01-03-21/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1343.5359974999994, + "cost": null, "run_time": "60.004 seconds" }, "reached_cutoff": true @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1345.5068694999998, + "cost": null, "run_time": "61.046 seconds" }, "reached_cutoff": true @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1348.7567854999998, + "cost": null, "run_time": "60.144 seconds" }, "reached_cutoff": true @@ -76,7 +76,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1350.8574474999994, + "cost": null, "run_time": "35.62 seconds" }, "reached_cutoff": false @@ -94,7 +94,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1353.8305034999994, + "cost": null, "run_time": "60.004 seconds" }, "reached_cutoff": true @@ -113,7 +113,7 @@ "success": false, "fail_reason": "assert 1 in [0.0, 0.0]", "success_%": 0.0, - "cost": 1356.8839354999998, + "cost": null, "run_time": "60.01 seconds" }, "reached_cutoff": true @@ -132,7 +132,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1360.2214214999997, + "cost": null, "run_time": "60.011 seconds" }, "reached_cutoff": true @@ -151,7 +151,7 @@ "difficulty": "novice", "success": true, "success_%": 100.0, - "cost": 1365.1847854999999, + "cost": null, "run_time": "41.004 seconds" }, "reached_cutoff": false @@ -169,7 +169,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1368.2918914999993, + "cost": null, "run_time": "60.005 seconds" }, "reached_cutoff": true @@ -188,7 +188,7 @@ "success": false, "fail_reason": "generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1368.3542554999992, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -202,7 +202,7 @@ "metrics": { "percentage": 0, "highest_difficulty": "No successful tests", - "cost": 1368.3542554999992, + "cost": null, "run_time": "0.004 seconds" }, "tests": { @@ -265,7 +265,7 @@ "difficulty": "intermediate", "success": true, "success_%": 100.0, - "cost": 1373.0814234999998, + "cost": null, "run_time": "75.044 seconds" }, "reached_cutoff": true @@ -284,7 +284,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1378.1223054999994, + "cost": null, "run_time": "75.039 seconds" }, "reached_cutoff": true @@ -304,7 +304,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1380.5522154999996, + "cost": null, "run_time": "39.757 seconds" }, "reached_cutoff": false @@ -323,7 +323,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1392.5523254999994, + "cost": null, "run_time": "60.009 seconds" }, "reached_cutoff": true @@ -342,7 +342,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1396.2070014999993, + "cost": null, "run_time": "90.043 seconds" }, "reached_cutoff": true @@ -362,7 +362,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1403.8377014999992, + "cost": null, "run_time": "90.04 seconds" }, "reached_cutoff": true @@ -381,7 +381,7 @@ "success": false, "fail_reason": "generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1403.9080654999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -400,7 +400,7 @@ "success": false, "fail_reason": "generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]", "success_%": 0.0, - "cost": 1403.9080654999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -419,7 +419,7 @@ "success": false, "fail_reason": "generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]", "success_%": 0.0, - "cost": 1403.9080654999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -439,7 +439,7 @@ "success": false, "fail_reason": "generate_test.py::TestThreeSum::test_method[challenge_data0] depends on generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1403.9080654999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -458,7 +458,7 @@ "success": false, "fail_reason": "generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1404.0667974999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -485,7 +485,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1362.8408334999992, + "cost": null, "run_time": "40.908 seconds" }, "reached_cutoff": false @@ -504,7 +504,7 @@ "difficulty": "novice", "success": true, "success_%": 100.0, - "cost": 1369.4081974999995, + "cost": null, "run_time": "29.797 seconds" }, "reached_cutoff": false @@ -524,7 +524,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1399.2661054999996, + "cost": null, "run_time": "75.04 seconds" }, "reached_cutoff": true @@ -544,7 +544,7 @@ "success": false, "fail_reason": "generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1403.9891414999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -574,7 +574,7 @@ "success": false, "fail_reason": "assert 1 in [0.0, 0.0]", "success_%": 0.0, - "cost": 1383.1865814999994, + "cost": null, "run_time": "60.01 seconds" }, "reached_cutoff": true @@ -594,7 +594,7 @@ "success": false, "fail_reason": "assert 1 in [0.0, 0.0, 0.0]", "success_%": 0.0, - "cost": 1385.6828374999998, + "cost": null, "run_time": "60.011 seconds" }, "reached_cutoff": true @@ -614,7 +614,7 @@ "success": false, "fail_reason": "assert 1 in [0.0, 0.0, 0.0]", "success_%": 0.0, - "cost": 1388.4356334999998, + "cost": null, "run_time": "60.011 seconds" }, "reached_cutoff": true @@ -634,7 +634,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1389.1831974999996, + "cost": null, "run_time": "17.267 seconds" }, "reached_cutoff": false @@ -645,4 +645,4 @@ "config": { "workspace": "workspace" } -} \ No newline at end of file +} diff --git a/reports/beebot/folder3_07-31-03-06/folder11_08-01-02-42/report.json b/reports/beebot/folder3_07-31-03-06/folder11_08-01-02-42/report.json index db428a1a4..cf028fc17 100644 --- a/reports/beebot/folder3_07-31-03-06/folder11_08-01-02-42/report.json +++ b/reports/beebot/folder3_07-31-03-06/folder11_08-01-02-42/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.004 seconds" }, "reached_cutoff": true @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.988 seconds" }, "reached_cutoff": true @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.15 seconds" }, "reached_cutoff": true @@ -76,7 +76,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999994, + "cost": null, "run_time": "29.371 seconds" }, "reached_cutoff": false @@ -94,7 +94,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.004 seconds" }, "reached_cutoff": true @@ -103,4 +103,4 @@ "config": { "workspace": "workspace" } -} \ No newline at end of file +} diff --git a/reports/gpt-engineer/folder10_08-01-02-42/report.json b/reports/gpt-engineer/folder10_08-01-02-42/report.json index 2b7ae28c6..e4600f139 100644 --- a/reports/gpt-engineer/folder10_08-01-02-42/report.json +++ b/reports/gpt-engineer/folder10_08-01-02-42/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "60.007 seconds" }, "reached_cutoff": true @@ -39,7 +39,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "51.859 seconds" }, "reached_cutoff": false @@ -59,7 +59,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "43.621 seconds" }, "reached_cutoff": false @@ -78,7 +78,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "44.743 seconds" }, "reached_cutoff": false @@ -97,7 +97,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "48.236 seconds" }, "reached_cutoff": false @@ -116,7 +116,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -135,7 +135,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -155,7 +155,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -174,7 +174,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -193,7 +193,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -207,7 +207,7 @@ "metrics": { "percentage": 0, "highest_difficulty": "No successful tests", - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.004 seconds" }, "tests": { @@ -279,7 +279,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -291,4 +291,4 @@ "workspace": "projects/my-new-project/workspace", "entry_path": "agbenchmark.benchmarks" } -} \ No newline at end of file +} diff --git a/reports/gpt-engineer/folder11_08-01-03-20/report.json b/reports/gpt-engineer/folder11_08-01-03-20/report.json index 84b1232a5..535ed1db0 100644 --- a/reports/gpt-engineer/folder11_08-01-03-20/report.json +++ b/reports/gpt-engineer/folder11_08-01-03-20/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1342.2433194999994, + "cost": null, "run_time": "60.007 seconds" }, "reached_cutoff": true @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 50.0, - "cost": 1344.3781614999991, + "cost": null, "run_time": "54.386 seconds" }, "reached_cutoff": false @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1347.3474394999994, + "cost": null, "run_time": "60.043 seconds" }, "reached_cutoff": true @@ -77,7 +77,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1349.5450334999996, + "cost": null, "run_time": "43.389 seconds" }, "reached_cutoff": false @@ -96,7 +96,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1352.5128454999995, + "cost": null, "run_time": "54.925 seconds" }, "reached_cutoff": false @@ -115,7 +115,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.5128454999995, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -134,7 +134,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.5371894999996, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -154,7 +154,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.6214774999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -173,7 +173,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.6214774999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -192,7 +192,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.6685134999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -206,7 +206,7 @@ "metrics": { "percentage": 0, "highest_difficulty": "No successful tests", - "cost": 1352.7113854999993, + "cost": null, "run_time": "0.003 seconds" }, "tests": { @@ -270,7 +270,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.8593534999998, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -289,7 +289,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.1095534999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -309,7 +309,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.1262974999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -328,7 +328,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.2447094999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -347,7 +347,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.2447094999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -367,7 +367,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.3277894999992, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -386,7 +386,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.5095294999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -405,7 +405,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]", "success_%": 0.0, - "cost": 1353.5095294999992, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -424,7 +424,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]", "success_%": 0.0, - "cost": 1353.6475294999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -444,7 +444,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.6771614999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -463,7 +463,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.7215094999997, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -491,7 +491,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.5890694999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -511,7 +511,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1352.8417054999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -531,7 +531,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.2847255, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -551,7 +551,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.7040734999991, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -581,7 +581,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.1555254999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -601,7 +601,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.1555254999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -621,7 +621,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.1819374999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -641,7 +641,7 @@ "success": false, "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1353.2447094999993, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -653,4 +653,4 @@ "workspace": "projects/my-new-project/workspace", "entry_path": "agbenchmark.benchmarks" } -} \ No newline at end of file +} diff --git a/reports/mini-agi/folder12_08-01-02-43/report.json b/reports/mini-agi/folder12_08-01-02-43/report.json index bf9a9d459..8ea32dd05 100644 --- a/reports/mini-agi/folder12_08-01-02-43/report.json +++ b/reports/mini-agi/folder12_08-01-02-43/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.2960054999996, + "cost": null, "run_time": "18.419 seconds" }, "reached_cutoff": false @@ -29,4 +29,4 @@ "config": { "workspace": "${os.path.join(Path.home(), 'miniagi')}" } -} \ No newline at end of file +} diff --git a/reports/mini-agi/folder13_08-01-03-21/report.json b/reports/mini-agi/folder13_08-01-03-21/report.json index ec86ce188..12b266fe6 100644 --- a/reports/mini-agi/folder13_08-01-03-21/report.json +++ b/reports/mini-agi/folder13_08-01-03-21/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.6358494999993, + "cost": null, "run_time": "14.976 seconds" }, "reached_cutoff": false @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1342.7803774999993, + "cost": null, "run_time": "34.59 seconds" }, "reached_cutoff": false @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1343.5949394999996, + "cost": null, "run_time": "21.005 seconds" }, "reached_cutoff": false @@ -76,7 +76,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1344.2619814999994, + "cost": null, "run_time": "20.928 seconds" }, "reached_cutoff": false @@ -94,7 +94,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1344.9177814999994, + "cost": null, "run_time": "17.648 seconds" }, "reached_cutoff": false @@ -112,7 +112,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1346.108983499999, + "cost": null, "run_time": "23.255 seconds" }, "reached_cutoff": false @@ -130,7 +130,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1351.5025554999993, + "cost": null, "run_time": "99.538 seconds" }, "reached_cutoff": true @@ -149,7 +149,7 @@ "difficulty": "novice", "success": true, "success_%": 100.0, - "cost": 1356.7393354999992, + "cost": null, "run_time": "41.77 seconds" }, "reached_cutoff": false @@ -168,7 +168,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1360.2214214999997, + "cost": null, "run_time": "61.047 seconds" }, "reached_cutoff": true @@ -187,7 +187,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1363.9154954999997, + "cost": null, "run_time": "61.445 seconds" }, "reached_cutoff": true @@ -201,7 +201,7 @@ "metrics": { "percentage": 33.33, "highest_difficulty": "novice", - "cost": 1365.6362114999993, + "cost": null, "run_time": "29.589 seconds" }, "tests": { @@ -264,7 +264,7 @@ "difficulty": "intermediate", "success": true, "success_%": 100.0, - "cost": 1367.9136655000002, + "cost": null, "run_time": "42.673 seconds" }, "reached_cutoff": false @@ -282,7 +282,7 @@ "difficulty": "intermediate", "success": true, "success_%": 100.0, - "cost": 1371.3122634999997, + "cost": null, "run_time": "86.683 seconds" }, "reached_cutoff": true @@ -302,7 +302,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.5417034999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -321,7 +321,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.6130294999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -340,7 +340,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.6130294999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -359,7 +359,7 @@ "difficulty": "advanced", "success": true, "success_%": 100.0, - "cost": 1375.4160334999997, + "cost": null, "run_time": "59.221 seconds" }, "reached_cutoff": false @@ -378,7 +378,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1375.6546774999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -397,7 +397,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]", "success_%": 0.0, - "cost": 1375.6546774999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -416,7 +416,7 @@ "success": false, "fail_reason": "assert 1 in []", "success_%": 0.0, - "cost": 1380.0412454999998, + "cost": null, "run_time": "66.239 seconds" }, "reached_cutoff": true @@ -436,7 +436,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1380.3811174999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -455,7 +455,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1380.4471354999991, + "cost": null, "run_time": "0.003 seconds" }, "reached_cutoff": false @@ -483,7 +483,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1354.5040255, + "cost": null, "run_time": "65.444 seconds" }, "reached_cutoff": true @@ -503,7 +503,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1365.8208914999998, + "cost": null, "run_time": "0.003 seconds" }, "reached_cutoff": false @@ -523,7 +523,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.6130294999998, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -543,7 +543,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1380.4249174999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -573,7 +573,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.5417034999996, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -593,7 +593,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.5764874999998, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -613,7 +613,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.5764874999998, + "cost": null, "run_time": "0.003 seconds" }, "reached_cutoff": false @@ -633,7 +633,7 @@ "success": false, "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1371.6130294999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -644,4 +644,4 @@ "config": { "workspace": "${os.path.join(Path.home(), 'miniagi')}" } -} \ No newline at end of file +} diff --git a/reports/smol-developer/folder12_08-01-03-21/report.json b/reports/smol-developer/folder12_08-01-03-21/report.json index 32bcfc1f0..9f1a51817 100644 --- a/reports/smol-developer/folder12_08-01-03-21/report.json +++ b/reports/smol-developer/folder12_08-01-03-21/report.json @@ -20,7 +20,7 @@ "difficulty": "interface", "success": true, "success_%": 100.0, - "cost": 1341.4717674999995, + "cost": null, "run_time": "5.977 seconds" }, "reached_cutoff": false @@ -38,7 +38,7 @@ "difficulty": "basic", "success": true, "success_%": 100.0, - "cost": 1342.9386294999993, + "cost": null, "run_time": "44.426 seconds" }, "reached_cutoff": false @@ -58,7 +58,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1343.9160854999996, + "cost": null, "run_time": "23.1 seconds" }, "reached_cutoff": false @@ -77,7 +77,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1344.454445499999, + "cost": null, "run_time": "16.193 seconds" }, "reached_cutoff": false @@ -96,7 +96,7 @@ "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, - "cost": 1344.7021834999994, + "cost": null, "run_time": "6.787 seconds" }, "reached_cutoff": false @@ -115,7 +115,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1344.8143234999998, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -134,7 +134,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1344.8180494999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -154,7 +154,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1344.8498534999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -173,7 +173,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1344.9177814999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -192,7 +192,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.022661499999, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -206,7 +206,7 @@ "metrics": { "percentage": 0, "highest_difficulty": "No successful tests", - "cost": 1345.0226614999992, + "cost": null, "run_time": "0.004 seconds" }, "tests": { @@ -270,7 +270,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0226614999997, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -289,7 +289,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0226614999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -309,7 +309,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0226614999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -328,7 +328,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.2446694999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -347,7 +347,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.2446694999994, + "cost": null, "run_time": "0.001 seconds" }, "reached_cutoff": false @@ -367,7 +367,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.4681094999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -386,7 +386,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.4681094999999, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -405,7 +405,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]", "success_%": 0.0, - "cost": 1345.5068694999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -424,7 +424,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]", "success_%": 0.0, - "cost": 1345.5068694999995, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -444,7 +444,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.5405734999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -463,7 +463,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.6027334999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -491,7 +491,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1344.8180494999992, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -511,7 +511,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0226614999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -531,7 +531,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.4681094999994, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -551,7 +551,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.5405734999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -581,7 +581,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0529334999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -601,7 +601,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0529334999997, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -621,7 +621,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.0529334999997, + "cost": null, "run_time": "0.05 seconds" }, "reached_cutoff": false @@ -641,7 +641,7 @@ "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, - "cost": 1345.1856294999993, + "cost": null, "run_time": "0.002 seconds" }, "reached_cutoff": false @@ -653,4 +653,4 @@ "workspace": "generated", "entry_path": "agbenchmark.benchmarks" } -} \ No newline at end of file +}