Remove high costs (#235)

pull/5155/head
merwanehamadi 2023-08-01 18:46:00 -07:00 committed by GitHub
parent 6e5935a932
commit 87753dbeb5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 227 additions and 227 deletions

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "43.156 seconds"
},
"reached_cutoff": false
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "39.967 seconds"
},
"reached_cutoff": false
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "49.456 seconds"
},
"reached_cutoff": false
@ -77,7 +77,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "50.779 seconds"
},
"reached_cutoff": false
@ -95,7 +95,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "46.428 seconds"
},
"reached_cutoff": false
@ -114,7 +114,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -133,7 +133,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.017 seconds"
},
"reached_cutoff": true
@ -152,7 +152,7 @@
"difficulty": "novice",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "75.049 seconds"
},
"reached_cutoff": true
@ -180,7 +180,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.05 seconds"
},
"reached_cutoff": true

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1343.3564774999995,
"cost": null,
"run_time": "43.121 seconds"
},
"reached_cutoff": false
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1345.506869499999,
"cost": null,
"run_time": "61.122 seconds"
},
"reached_cutoff": true
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1348.7567854999993,
"cost": null,
"run_time": "60.227 seconds"
},
"reached_cutoff": true
@ -77,7 +77,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0]",
"success_%": 0.0,
"cost": 1351.2083754999996,
"cost": null,
"run_time": "44.372 seconds"
},
"reached_cutoff": false
@ -95,7 +95,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1354.0949554999993,
"cost": null,
"run_time": "56.795 seconds"
},
"reached_cutoff": false
@ -114,7 +114,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1354.2449554999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -133,7 +133,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1357.2560394999998,
"cost": null,
"run_time": "60.421 seconds"
},
"reached_cutoff": true
@ -152,7 +152,7 @@
"difficulty": "novice",
"success": true,
"success_%": 100.0,
"cost": 1364.7770254999998,
"cost": null,
"run_time": "75.062 seconds"
},
"reached_cutoff": true
@ -171,7 +171,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1368.2915574999995,
"cost": null,
"run_time": "60.026 seconds"
},
"reached_cutoff": true
@ -190,7 +190,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1368.2918914999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -204,7 +204,7 @@
"metrics": {
"percentage": 0,
"highest_difficulty": "No successful tests",
"cost": 1368.2918914999993,
"cost": null,
"run_time": "0.004 seconds"
},
"tests": {
@ -268,7 +268,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1371.3122634999995,
"cost": null,
"run_time": "75.083 seconds"
},
"reached_cutoff": true
@ -287,7 +287,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1375.6546774999997,
"cost": null,
"run_time": "75.068 seconds"
},
"reached_cutoff": true
@ -307,7 +307,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1375.6546774999997,
"cost": null,
"run_time": "0.003 seconds"
},
"reached_cutoff": false
@ -326,7 +326,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.1842814999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -345,7 +345,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.1842814999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -365,7 +365,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.6591454999998,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -384,7 +384,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.7042934999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -403,7 +403,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]",
"success_%": 0.0,
"cost": 1376.7042935,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -422,7 +422,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]",
"success_%": 0.0,
"cost": 1376.7042935,
"cost": null,
"run_time": "0.003 seconds"
},
"reached_cutoff": false
@ -442,7 +442,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.7042935,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -461,7 +461,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.7357334999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -489,7 +489,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1360.5222974999997,
"cost": null,
"run_time": "60.061 seconds"
},
"reached_cutoff": true
@ -509,7 +509,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1368.3237474999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -529,7 +529,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.2331374999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -549,7 +549,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.7042934999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -579,7 +579,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1375.6834654999998,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -599,7 +599,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.0837854999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -619,7 +619,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.0871414999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -639,7 +639,7 @@
"success": false,
"fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1376.0871414999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false

View File

@ -21,7 +21,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.154 seconds"
},
"reached_cutoff": true
@ -40,7 +40,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestPlanCreation::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999999,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -60,7 +60,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalDivergence::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -79,7 +79,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -98,7 +98,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -117,7 +117,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -136,7 +136,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false

View File

@ -21,7 +21,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1349.5450334999996,
"cost": null,
"run_time": "60.118 seconds"
},
"reached_cutoff": true
@ -40,7 +40,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestPlanCreation::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1349.7110934999992,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -60,7 +60,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalDivergence::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1349.7220814999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -79,7 +79,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1349.771653499999,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -98,7 +98,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1349.7908734999999,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -117,7 +117,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.0636414999997,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -136,7 +136,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.1109494999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -156,7 +156,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.1949954999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -175,7 +175,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.2521314999997,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -194,7 +194,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.3188514999995,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -208,7 +208,7 @@
"metrics": {
"percentage": 0,
"highest_difficulty": "No successful tests",
"cost": 1350.3505874999994,
"cost": null,
"run_time": "0.003 seconds"
},
"tests": {
@ -272,7 +272,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.3895874999992,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -291,7 +291,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.3895874999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -311,7 +311,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.3895874999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -330,7 +330,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.758855499999,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -349,7 +349,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.7791274999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -369,7 +369,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.7791274999997,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -388,7 +388,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.7791274999997,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -407,7 +407,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]",
"success_%": 0.0,
"cost": 1350.8574474999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -426,7 +426,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]",
"success_%": 0.0,
"cost": 1350.8574474999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -446,7 +446,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.8574474999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -465,7 +465,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.8805354999995,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -493,7 +493,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0], agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestWriteFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.1566454999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -513,7 +513,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.3505874999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -533,7 +533,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.7791274999988,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -553,7 +553,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.8805354999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -583,7 +583,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.4219954999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -603,7 +603,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.4706314999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -623,7 +623,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.5281074999991,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -643,7 +643,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1350.5744154999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false

View File

@ -330,7 +330,7 @@
"success": false,
"fail_reason": "agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/BabyAGI/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1571.2554915,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1343.5359974999994,
"cost": null,
"run_time": "60.004 seconds"
},
"reached_cutoff": true
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1345.5068694999998,
"cost": null,
"run_time": "61.046 seconds"
},
"reached_cutoff": true
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1348.7567854999998,
"cost": null,
"run_time": "60.144 seconds"
},
"reached_cutoff": true
@ -76,7 +76,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1350.8574474999994,
"cost": null,
"run_time": "35.62 seconds"
},
"reached_cutoff": false
@ -94,7 +94,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1353.8305034999994,
"cost": null,
"run_time": "60.004 seconds"
},
"reached_cutoff": true
@ -113,7 +113,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0]",
"success_%": 0.0,
"cost": 1356.8839354999998,
"cost": null,
"run_time": "60.01 seconds"
},
"reached_cutoff": true
@ -132,7 +132,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1360.2214214999997,
"cost": null,
"run_time": "60.011 seconds"
},
"reached_cutoff": true
@ -151,7 +151,7 @@
"difficulty": "novice",
"success": true,
"success_%": 100.0,
"cost": 1365.1847854999999,
"cost": null,
"run_time": "41.004 seconds"
},
"reached_cutoff": false
@ -169,7 +169,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1368.2918914999993,
"cost": null,
"run_time": "60.005 seconds"
},
"reached_cutoff": true
@ -188,7 +188,7 @@
"success": false,
"fail_reason": "generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1368.3542554999992,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -202,7 +202,7 @@
"metrics": {
"percentage": 0,
"highest_difficulty": "No successful tests",
"cost": 1368.3542554999992,
"cost": null,
"run_time": "0.004 seconds"
},
"tests": {
@ -265,7 +265,7 @@
"difficulty": "intermediate",
"success": true,
"success_%": 100.0,
"cost": 1373.0814234999998,
"cost": null,
"run_time": "75.044 seconds"
},
"reached_cutoff": true
@ -284,7 +284,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1378.1223054999994,
"cost": null,
"run_time": "75.039 seconds"
},
"reached_cutoff": true
@ -304,7 +304,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1380.5522154999996,
"cost": null,
"run_time": "39.757 seconds"
},
"reached_cutoff": false
@ -323,7 +323,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1392.5523254999994,
"cost": null,
"run_time": "60.009 seconds"
},
"reached_cutoff": true
@ -342,7 +342,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1396.2070014999993,
"cost": null,
"run_time": "90.043 seconds"
},
"reached_cutoff": true
@ -362,7 +362,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1403.8377014999992,
"cost": null,
"run_time": "90.04 seconds"
},
"reached_cutoff": true
@ -381,7 +381,7 @@
"success": false,
"fail_reason": "generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1403.9080654999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -400,7 +400,7 @@
"success": false,
"fail_reason": "generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]",
"success_%": 0.0,
"cost": 1403.9080654999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -419,7 +419,7 @@
"success": false,
"fail_reason": "generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]",
"success_%": 0.0,
"cost": 1403.9080654999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -439,7 +439,7 @@
"success": false,
"fail_reason": "generate_test.py::TestThreeSum::test_method[challenge_data0] depends on generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1403.9080654999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -458,7 +458,7 @@
"success": false,
"fail_reason": "generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1404.0667974999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -485,7 +485,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1362.8408334999992,
"cost": null,
"run_time": "40.908 seconds"
},
"reached_cutoff": false
@ -504,7 +504,7 @@
"difficulty": "novice",
"success": true,
"success_%": 100.0,
"cost": 1369.4081974999995,
"cost": null,
"run_time": "29.797 seconds"
},
"reached_cutoff": false
@ -524,7 +524,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1399.2661054999996,
"cost": null,
"run_time": "75.04 seconds"
},
"reached_cutoff": true
@ -544,7 +544,7 @@
"success": false,
"fail_reason": "generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1403.9891414999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -574,7 +574,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0]",
"success_%": 0.0,
"cost": 1383.1865814999994,
"cost": null,
"run_time": "60.01 seconds"
},
"reached_cutoff": true
@ -594,7 +594,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0, 0.0]",
"success_%": 0.0,
"cost": 1385.6828374999998,
"cost": null,
"run_time": "60.011 seconds"
},
"reached_cutoff": true
@ -614,7 +614,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0, 0.0]",
"success_%": 0.0,
"cost": 1388.4356334999998,
"cost": null,
"run_time": "60.011 seconds"
},
"reached_cutoff": true
@ -634,7 +634,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1389.1831974999996,
"cost": null,
"run_time": "17.267 seconds"
},
"reached_cutoff": false

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.004 seconds"
},
"reached_cutoff": true
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.988 seconds"
},
"reached_cutoff": true
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.15 seconds"
},
"reached_cutoff": true
@ -76,7 +76,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999994,
"cost": null,
"run_time": "29.371 seconds"
},
"reached_cutoff": false
@ -94,7 +94,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.004 seconds"
},
"reached_cutoff": true

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "60.007 seconds"
},
"reached_cutoff": true
@ -39,7 +39,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "51.859 seconds"
},
"reached_cutoff": false
@ -59,7 +59,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "43.621 seconds"
},
"reached_cutoff": false
@ -78,7 +78,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "44.743 seconds"
},
"reached_cutoff": false
@ -97,7 +97,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "48.236 seconds"
},
"reached_cutoff": false
@ -116,7 +116,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -135,7 +135,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -155,7 +155,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -174,7 +174,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -193,7 +193,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -207,7 +207,7 @@
"metrics": {
"percentage": 0,
"highest_difficulty": "No successful tests",
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.004 seconds"
},
"tests": {
@ -279,7 +279,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1342.2433194999994,
"cost": null,
"run_time": "60.007 seconds"
},
"reached_cutoff": true
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 50.0,
"cost": 1344.3781614999991,
"cost": null,
"run_time": "54.386 seconds"
},
"reached_cutoff": false
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1347.3474394999994,
"cost": null,
"run_time": "60.043 seconds"
},
"reached_cutoff": true
@ -77,7 +77,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1349.5450334999996,
"cost": null,
"run_time": "43.389 seconds"
},
"reached_cutoff": false
@ -96,7 +96,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1352.5128454999995,
"cost": null,
"run_time": "54.925 seconds"
},
"reached_cutoff": false
@ -115,7 +115,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.5128454999995,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -134,7 +134,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.5371894999996,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -154,7 +154,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.6214774999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -173,7 +173,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.6214774999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -192,7 +192,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.6685134999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -206,7 +206,7 @@
"metrics": {
"percentage": 0,
"highest_difficulty": "No successful tests",
"cost": 1352.7113854999993,
"cost": null,
"run_time": "0.003 seconds"
},
"tests": {
@ -270,7 +270,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.8593534999998,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -289,7 +289,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.1095534999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -309,7 +309,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.1262974999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -328,7 +328,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.2447094999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -347,7 +347,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.2447094999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -367,7 +367,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.3277894999992,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -386,7 +386,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.5095294999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -405,7 +405,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]",
"success_%": 0.0,
"cost": 1353.5095294999992,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -424,7 +424,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]",
"success_%": 0.0,
"cost": 1353.6475294999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -444,7 +444,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.6771614999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -463,7 +463,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.7215094999997,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -491,7 +491,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.5890694999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -511,7 +511,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1352.8417054999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -531,7 +531,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.2847255,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -551,7 +551,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.7040734999991,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -581,7 +581,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.1555254999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -601,7 +601,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.1555254999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -621,7 +621,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.1819374999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -641,7 +641,7 @@
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1353.2447094999993,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.2960054999996,
"cost": null,
"run_time": "18.419 seconds"
},
"reached_cutoff": false

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.6358494999993,
"cost": null,
"run_time": "14.976 seconds"
},
"reached_cutoff": false
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1342.7803774999993,
"cost": null,
"run_time": "34.59 seconds"
},
"reached_cutoff": false
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1343.5949394999996,
"cost": null,
"run_time": "21.005 seconds"
},
"reached_cutoff": false
@ -76,7 +76,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1344.2619814999994,
"cost": null,
"run_time": "20.928 seconds"
},
"reached_cutoff": false
@ -94,7 +94,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1344.9177814999994,
"cost": null,
"run_time": "17.648 seconds"
},
"reached_cutoff": false
@ -112,7 +112,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1346.108983499999,
"cost": null,
"run_time": "23.255 seconds"
},
"reached_cutoff": false
@ -130,7 +130,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1351.5025554999993,
"cost": null,
"run_time": "99.538 seconds"
},
"reached_cutoff": true
@ -149,7 +149,7 @@
"difficulty": "novice",
"success": true,
"success_%": 100.0,
"cost": 1356.7393354999992,
"cost": null,
"run_time": "41.77 seconds"
},
"reached_cutoff": false
@ -168,7 +168,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1360.2214214999997,
"cost": null,
"run_time": "61.047 seconds"
},
"reached_cutoff": true
@ -187,7 +187,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1363.9154954999997,
"cost": null,
"run_time": "61.445 seconds"
},
"reached_cutoff": true
@ -201,7 +201,7 @@
"metrics": {
"percentage": 33.33,
"highest_difficulty": "novice",
"cost": 1365.6362114999993,
"cost": null,
"run_time": "29.589 seconds"
},
"tests": {
@ -264,7 +264,7 @@
"difficulty": "intermediate",
"success": true,
"success_%": 100.0,
"cost": 1367.9136655000002,
"cost": null,
"run_time": "42.673 seconds"
},
"reached_cutoff": false
@ -282,7 +282,7 @@
"difficulty": "intermediate",
"success": true,
"success_%": 100.0,
"cost": 1371.3122634999997,
"cost": null,
"run_time": "86.683 seconds"
},
"reached_cutoff": true
@ -302,7 +302,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.5417034999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -321,7 +321,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.6130294999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -340,7 +340,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.6130294999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -359,7 +359,7 @@
"difficulty": "advanced",
"success": true,
"success_%": 100.0,
"cost": 1375.4160334999997,
"cost": null,
"run_time": "59.221 seconds"
},
"reached_cutoff": false
@ -378,7 +378,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1375.6546774999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -397,7 +397,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]",
"success_%": 0.0,
"cost": 1375.6546774999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -416,7 +416,7 @@
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"cost": 1380.0412454999998,
"cost": null,
"run_time": "66.239 seconds"
},
"reached_cutoff": true
@ -436,7 +436,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1380.3811174999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -455,7 +455,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1380.4471354999991,
"cost": null,
"run_time": "0.003 seconds"
},
"reached_cutoff": false
@ -483,7 +483,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1354.5040255,
"cost": null,
"run_time": "65.444 seconds"
},
"reached_cutoff": true
@ -503,7 +503,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1365.8208914999998,
"cost": null,
"run_time": "0.003 seconds"
},
"reached_cutoff": false
@ -523,7 +523,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.6130294999998,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -543,7 +543,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1380.4249174999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -573,7 +573,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.5417034999996,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -593,7 +593,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.5764874999998,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -613,7 +613,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.5764874999998,
"cost": null,
"run_time": "0.003 seconds"
},
"reached_cutoff": false
@ -633,7 +633,7 @@
"success": false,
"fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1371.6130294999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false

View File

@ -20,7 +20,7 @@
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"cost": 1341.4717674999995,
"cost": null,
"run_time": "5.977 seconds"
},
"reached_cutoff": false
@ -38,7 +38,7 @@
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"cost": 1342.9386294999993,
"cost": null,
"run_time": "44.426 seconds"
},
"reached_cutoff": false
@ -58,7 +58,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1343.9160854999996,
"cost": null,
"run_time": "23.1 seconds"
},
"reached_cutoff": false
@ -77,7 +77,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1344.454445499999,
"cost": null,
"run_time": "16.193 seconds"
},
"reached_cutoff": false
@ -96,7 +96,7 @@
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"cost": 1344.7021834999994,
"cost": null,
"run_time": "6.787 seconds"
},
"reached_cutoff": false
@ -115,7 +115,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestSearch::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1344.8143234999998,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -134,7 +134,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicContentGen::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1344.8180494999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -154,7 +154,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1344.8498534999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -173,7 +173,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1344.9177814999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -192,7 +192,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptLink::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicRetrieval::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.022661499999,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -206,7 +206,7 @@
"metrics": {
"percentage": 0,
"highest_difficulty": "No successful tests",
"cost": 1345.0226614999992,
"cost": null,
"run_time": "0.004 seconds"
},
"tests": {
@ -270,7 +270,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0226614999997,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -289,7 +289,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0226614999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -309,7 +309,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestInstructionFollowing::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0226614999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -328,7 +328,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.2446694999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -347,7 +347,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.2446694999994,
"cost": null,
"run_time": "0.001 seconds"
},
"reached_cutoff": false
@ -367,7 +367,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugMultipleTypo::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.4681094999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -386,7 +386,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.4681094999999,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -405,7 +405,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.2[None]",
"success_%": 0.0,
"cost": 1345.5068694999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -424,7 +424,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestAdaptTeslaRevenue::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRevenueRetrieval::test_TestRevenueRetrieval_1.0[None]",
"success_%": 0.0,
"cost": 1345.5068694999995,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -444,7 +444,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestThreeSum::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestFunctionCodeGeneration::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.5405734999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -463,7 +463,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.6027334999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -491,7 +491,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1344.8180494999992,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -511,7 +511,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Simple::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0226614999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -531,7 +531,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Write::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.4681094999994,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -551,7 +551,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Tests::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestReturnCode_Modify::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.5405734999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -581,7 +581,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Simple::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0529334999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -601,7 +601,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_advanced::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0529334999997,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false
@ -621,7 +621,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Hard::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.0529334999997,
"cost": null,
"run_time": "0.05 seconds"
},
"reached_cutoff": false
@ -641,7 +641,7 @@
"success": false,
"fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestGoalLoss_Medium::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/generate_test.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 0.0,
"cost": 1345.1856294999993,
"cost": null,
"run_time": "0.002 seconds"
},
"reached_cutoff": false