gpt-engineer-20230716225908

pull/5155/head
Auto-GPT-Bot 2023-07-16 22:59:08 +00:00
parent a36eadf554
commit 5c7acbc719
1 changed files with 175 additions and 0 deletions

View File

@ -0,0 +1,175 @@
{
"command": "agbenchmark start",
"completion_time": "2023-07-16-22:59",
"metrics": {
"run_time": "449.82 seconds",
"highest_difficulty": "novice: 3"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 50.0,
"run_time": "62.5 seconds"
}
},
"TestBasicCodeGeneration": {
"data_path": "agbenchmark/challenges/code/d4",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": true,
"success_%": 100.0,
"run_time": "70.822 seconds"
}
},
"TestSearch": {
"data_path": "agbenchmark/challenges/interface/search",
"is_regression": true,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"run_time": "68.908 seconds"
}
},
"TestReadFile": {
"data_path": "agbenchmark/challenges/interface/read_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'",
"success_%": 75.0,
"run_time": "60.495 seconds"
}
},
"TestThreeSum": {
"data_path": "agbenchmark/challenges/code/d5",
"is_regression": false,
"metrics": {
"difficulty": "intermediate",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "69.361 seconds"
}
},
"TestBasicRetrieval": {
"data_path": "agbenchmark/challenges/retrieval/r1",
"is_regression": true,
"metrics": {
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"run_time": "67.503 seconds"
}
},
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]",
"success_%": 0.0,
"run_time": "0.001 seconds"
}
},
"TestBasicMemory": {
"data_path": "agbenchmark/challenges/memory/m1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]",
"success_%": 75.0,
"run_time": "0.001 seconds"
}
},
"TestRetrieval2": {
"data_path": "agbenchmark/challenges/retrieval/r2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0]",
"success_%": 75.0,
"run_time": "50.064 seconds"
}
},
"TestCreateSimpleWebServer": {
"data_path": "agbenchmark/challenges/code/d3",
"is_regression": false,
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"run_time": "0.001 seconds"
}
},
"TestDebugSimpleTypoWithoutGuidance": {
"data_path": "agbenchmark/challenges/code/d2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0.0,
"run_time": "0.001 seconds"
}
},
"TestRememberMultipleIds": {
"data_path": "agbenchmark/challenges/memory/m2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]",
"success_%": 75.0,
"run_time": "0.001 seconds"
}
},
"TestRetrieval3": {
"data_path": "agbenchmark/challenges/retrieval/r3",
"is_regression": false,
"metrics": {
"difficulty": "intermediate",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]",
"success_%": 75.0,
"run_time": "0.001 seconds"
}
},
"TestRememberMultipleIdsWithNoise": {
"data_path": "agbenchmark/challenges/memory/m3",
"is_regression": false,
"metrics": {
"difficulty": "intermediate",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]",
"success_%": 75.0,
"run_time": "0.001 seconds"
}
},
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4",
"is_regression": false,
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]",
"success_%": 75.0,
"run_time": "0.001 seconds"
}
}
},
"config": {
"workspace": "projects/my-new-project/workspace",
"entry_path": "agbenchmark.benchmarks"
}
}