{ "command": "agbenchmark start", "completion_time": "2023-07-18-00:17", "metrics": { "run_time": "41.3 seconds", "highest_difficulty": "novice: 3" }, "tests": { "TestWriteFile": { "data_path": "agbenchmark/challenges/interface/write_file", "is_regression": false, "metrics": { "difficulty": "interface", "success": true, "success_%": 100.0, "run_time": "5.554 seconds" } }, "TestBasicCodeGeneration": { "data_path": "agbenchmark/challenges/code/d4", "is_regression": false, "metrics": { "difficulty": "novice", "success": true, "success_%": 100.0, "run_time": "8.223 seconds" } }, "TestSearch": { "data_path": "agbenchmark/challenges/interface/search", "is_regression": false, "metrics": { "difficulty": "interface", "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, "run_time": "16.099 seconds" } }, "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": false, "metrics": { "difficulty": "interface", "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, "run_time": "9.624 seconds" } }, "TestThreeSum": { "data_path": "agbenchmark/challenges/code/d5", "is_regression": false, "metrics": { "difficulty": "intermediate", "success": false, "fail_reason": "assert 1 in [0.0]", "success_%": 0.0, "run_time": "1.625 seconds" } }, "TestBasicRetrieval": { "data_path": "agbenchmark/challenges/retrieval/r1", "is_regression": false, "metrics": { "difficulty": "basic", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestDebugSimpleTypoWithGuidance": { "data_path": "agbenchmark/challenges/code/d1", "is_regression": false, "metrics": { "difficulty": "basic", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestBasicMemory": { "data_path": "agbenchmark/challenges/memory/m1", "is_regression": false, "metrics": { "difficulty": "basic", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestRetrieval2": { "data_path": "agbenchmark/challenges/retrieval/r2", "is_regression": false, "metrics": { "difficulty": "novice", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestCreateSimpleWebServer": { "data_path": "agbenchmark/challenges/code/d3", "is_regression": false, "metrics": { "difficulty": "advanced", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.002 seconds" } }, "TestDebugSimpleTypoWithoutGuidance": { "data_path": "agbenchmark/challenges/code/d2", "is_regression": false, "metrics": { "difficulty": "novice", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestRememberMultipleIds": { "data_path": "agbenchmark/challenges/memory/m2", "is_regression": false, "metrics": { "difficulty": "novice", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestRetrieval3": { "data_path": "agbenchmark/challenges/retrieval/r3", "is_regression": false, "metrics": { "difficulty": "intermediate", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestRememberMultipleIdsWithNoise": { "data_path": "agbenchmark/challenges/memory/m3", "is_regression": false, "metrics": { "difficulty": "intermediate", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } }, "TestRememberMultiplePhrasesWithNoise": { "data_path": "agbenchmark/challenges/memory/m4", "is_regression": false, "metrics": { "difficulty": "advanced", "success": false, "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", "success_%": 0.0, "run_time": "0.001 seconds" } } }, "config": { "workspace": "generated" } }