Fixing memory challenges, naming, testing mini-agi, smooth retrieval scaling (#166)
parent
c7a5498f0f
commit
12c5d54583
|
@ -103,22 +103,25 @@ class Challenge(ABC):
|
|||
]
|
||||
|
||||
def scoring(self, content: str, ground: Ground) -> float:
|
||||
print("Scoring content: ", content)
|
||||
if ground.should_contain:
|
||||
for should_contain_word in ground.should_contain:
|
||||
if should_contain_word not in content:
|
||||
print(f"Word that should exist - {should_contain_word}: False")
|
||||
return 0.0
|
||||
else:
|
||||
print(
|
||||
f"Word that should exist: {should_contain_word} exists in the content"
|
||||
)
|
||||
print(f"Word that should exist - {should_contain_word}: True")
|
||||
|
||||
if ground.should_not_contain:
|
||||
for should_not_contain_word in ground.should_not_contain:
|
||||
if should_not_contain_word in content:
|
||||
print(
|
||||
f"Word that should not exist - {should_not_contain_word}: False"
|
||||
)
|
||||
return 0.0
|
||||
else:
|
||||
print(
|
||||
f"Word that should not exist: {should_not_contain_word} does not exist in the content"
|
||||
f"Word that should not exist - {should_not_contain_word}: True"
|
||||
)
|
||||
|
||||
return 1.0
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
"category": ["code", "iterate"],
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"dependencies": ["TestReadFile", "TestWriteFile"],
|
||||
"cutoff": 60,
|
||||
"cutoff": 75,
|
||||
"ground": {
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
|
|
@ -3,7 +3,7 @@
|
|||
"category": ["code", "iterate"],
|
||||
"task": "Make test.py run without errors.",
|
||||
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
|
||||
"cutoff": 60,
|
||||
"cutoff": 75,
|
||||
"ground": {
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
|
|
@ -3,7 +3,7 @@
|
|||
"category": ["code"],
|
||||
"task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
|
||||
"dependencies": ["TestWriteFile"],
|
||||
"cutoff": 60,
|
||||
"cutoff": 90,
|
||||
"ground": {
|
||||
"answer": "The two_sum function coded properly.",
|
||||
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
|
||||
|
@ -12,7 +12,7 @@
|
|||
"type": "execute_python_code"
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "novice",
|
||||
"difficulty": "advanced",
|
||||
"description": "Tests ability for the agent to create the two_sum function.",
|
||||
"side_effects": []
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
"category": ["code"],
|
||||
"task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
|
||||
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
|
||||
"cutoff": 60,
|
||||
"cutoff": 90,
|
||||
"ground": {
|
||||
"answer": "GET localhost:8079/health responds with a 200 OK",
|
||||
"should_contain": [],
|
|
@ -12,7 +12,7 @@
|
|||
"type": "execute_python_code"
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "intermediate",
|
||||
"difficulty": "advanced",
|
||||
"description": "Tests ability for the agent to create the three_sum function.",
|
||||
"side_effects": []
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
Write the id previously mentioned to a .txt file.
|
|
@ -0,0 +1 @@
|
|||
Write the id previously mentioned to a file called result.txt.
|
|
@ -8,7 +8,7 @@
|
|||
"answer": "2314",
|
||||
"should_contain": ["2314"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"files": ["result.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
|
@ -1 +0,0 @@
|
|||
Write the ids previously mentioned to a .txt file.
|
|
@ -0,0 +1 @@
|
|||
Write the ids previously mentioned to a file called result.txt.
|
|
@ -8,7 +8,7 @@
|
|||
"answer": "3145\n3791\n9317\n9471",
|
||||
"should_contain": ["3145", "3791", "9317", "9471"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"files": ["result.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
|
@ -1,5 +1,5 @@
|
|||
sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232
|
||||
|
||||
Write the ids previously mentioned to a .txt file.
|
||||
Write the ids previously mentioned to a file called result.txt.
|
||||
|
||||
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0
|
|
@ -1,14 +1,14 @@
|
|||
{
|
||||
"name": "TestRememberMultipleIdsWithNoise",
|
||||
"name": "TestRememberMultipleWithNoise",
|
||||
"category": ["memory"],
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"dependencies": ["TestRememberMultipleIds"],
|
||||
"cutoff": 60,
|
||||
"cutoff": 75,
|
||||
"ground": {
|
||||
"answer": "3145\n3791\n9317\n9471",
|
||||
"should_contain": ["3145", "3791", "9317", "9471"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"files": ["result.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
|
@ -1,5 +1,5 @@
|
|||
ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx
|
||||
|
||||
The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt.
|
||||
The phrase to remember is 'My pet rock sings better than Beyonce on Tuesdays'. Read the file instructions_4.txt.
|
||||
|
||||
8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ
|
|
@ -1,5 +1,5 @@
|
|||
sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232
|
||||
|
||||
Write the phrases previously mentioned to a .txt file.
|
||||
Write the phrases previously mentioned to a file called result.txt.
|
||||
|
||||
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0
|
|
@ -1,4 +1,4 @@
|
|||
The purple elephant danced on a rainbow while eating a taco
|
||||
The sneaky toaster stole my socks and ran away to Hawaii
|
||||
My pet rock sings better than Beyoncé on Tuesdays
|
||||
My pet rock sings better than Beyonce on Tuesdays
|
||||
The giant hamster rode a unicycle through the crowded mall
|
|
@ -2,18 +2,18 @@
|
|||
"name": "TestRememberMultiplePhrasesWithNoise",
|
||||
"category": ["memory"],
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"dependencies": ["TestRememberMultipleIdsWithNoise"],
|
||||
"cutoff": 60,
|
||||
"dependencies": ["TestRememberMultipleWithNoise"],
|
||||
"cutoff": 90,
|
||||
"ground": {
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"should_contain": [
|
||||
"The purple elephant danced on a rainbow while eating a taco",
|
||||
"The sneaky toaster stole my socks and ran away to Hawaii",
|
||||
"My pet rock sings better than Beyoncé on Tuesdays",
|
||||
"My pet rock sings better than Beyonce on Tuesdays",
|
||||
"The giant hamster rode a unicycle through the crowded mall"
|
||||
],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"files": ["result.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
"info": {
|
||||
"difficulty": "basic",
|
||||
"description": "Tests ability to retrieve information from a website.",
|
||||
"description": "Specifies specific website to retrieve website from.",
|
||||
"side_effects": []
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"name": "TestRetrieval2.1",
|
||||
"category": ["retrieval"],
|
||||
"task": "Write Tesla's precise revenue in 2022 into a .txt file.",
|
||||
"dependencies": ["TestRetrieval2.0"],
|
||||
"cutoff": 60,
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"should_contain": ["81", "462"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "novice",
|
||||
"description": "This one checks the accuracy of the information over r2",
|
||||
"side_effects": []
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
81,462 Millions
|
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"name": "TestRetrieval2.2",
|
||||
"category": ["retrieval"],
|
||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
|
||||
"dependencies": ["TestRetrieval2.1"],
|
||||
"cutoff": 60,
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"should_contain": ["81,462"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "intermediate",
|
||||
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
|
||||
"side_effects": []
|
||||
}
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
{
|
||||
"name": "TestRetrieval2",
|
||||
"category": ["retrieval"],
|
||||
"task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
|
||||
"dependencies": ["TestBasicRetrieval"],
|
||||
"cutoff": 60,
|
||||
"ground": {
|
||||
"answer": "81,462",
|
||||
"should_contain": ["81,462"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "novice",
|
||||
"description": "Tests ability to retrieve information.",
|
||||
"side_effects": ["tests if there is in fact an LLM attached"]
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
81,462 Millions
|
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"name": "TestRetrieval2.0",
|
||||
"category": ["retrieval"],
|
||||
"task": "Write tesla's revenue in 2022 into a .txt file.",
|
||||
"dependencies": ["TestBasicRetrieval"],
|
||||
"cutoff": 60,
|
||||
"ground": {
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"should_contain": ["81"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "novice",
|
||||
"description": "A no guardrails search for info",
|
||||
"side_effects": []
|
||||
}
|
||||
}
|
|
@ -2,7 +2,7 @@
|
|||
"name": "TestRetrieval3",
|
||||
"category": ["retrieval"],
|
||||
"task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
|
||||
"dependencies": ["TestRetrieval2"],
|
||||
"dependencies": ["TestRetrieval2.1"],
|
||||
"cutoff": 60,
|
||||
"ground": {
|
||||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
|
|
|
@ -135,8 +135,8 @@ internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
|
|||
|
||||
|
||||
def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
||||
challenge_data = item.funcargs.get("challenge_data", None)
|
||||
if call.when == "call":
|
||||
challenge_data = item.funcargs.get("challenge_data", None)
|
||||
difficulty = (
|
||||
challenge_data["info"]["difficulty"] if challenge_data else "unknown"
|
||||
)
|
||||
|
@ -157,6 +157,9 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
|||
info_details: Any = {
|
||||
"data_path": challenge_location,
|
||||
"is_regression": False,
|
||||
"task": challenge_data["task"],
|
||||
"answer": challenge_data["ground"]["answer"],
|
||||
"description": challenge_data["info"]["description"],
|
||||
"metrics": {
|
||||
"difficulty": difficulty,
|
||||
"success": False,
|
||||
|
@ -218,6 +221,10 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
|||
"run_time"
|
||||
] = f"{str(round(run_time, 3))} seconds"
|
||||
|
||||
info_details["reached_cutoff"] = (
|
||||
float(run_time) > challenge_data["cutoff"]
|
||||
)
|
||||
|
||||
info_manager.add_test(test_name, info_details)
|
||||
|
||||
|
||||
|
|
|
@ -3,28 +3,20 @@
|
|||
"TestBasicMemory": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
true
|
||||
],
|
||||
"TestBasicRetrieval": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestCreateSimpleWebServer": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithGuidance": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithoutGuidance": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestReadFile": [
|
||||
true,
|
||||
true,
|
||||
|
@ -32,41 +24,62 @@
|
|||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIds": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIdsWithNoise": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultiplePhrasesWithNoise": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval2": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval3": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestSearch": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestWriteFile": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval2.2": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestRetrieval2.1": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestRetrieval2.0": [
|
||||
true,
|
||||
false
|
||||
],
|
||||
"TestRememberMultipleIds": [
|
||||
false,
|
||||
false,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIdsWithNoise": [
|
||||
false
|
||||
],
|
||||
"TestRememberMultipleWithNoise": [
|
||||
false,
|
||||
true
|
||||
],
|
||||
"TestRememberMultiplePhrasesWithNoise": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithGuidance": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
]
|
||||
}
|
||||
}
|
|
@ -1,36 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestWriteFile",
|
||||
"completion_time": "2023-07-17-13:34",
|
||||
"metrics": {
|
||||
"run_time": "23.83 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": true,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"non_mock_success_%": 100.0,
|
||||
"run_time": "0.009 seconds"
|
||||
}
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
},
|
||||
"command": "agbenchmark start --test TestWriteFile",
|
||||
"completion_time": "2023-07-17-09:54",
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"metrics": {
|
||||
"run_time": "22.36 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 40.0,
|
||||
"run_time": "22.169 seconds"
|
||||
}
|
||||
}
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "23.627 seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultipleWithNoise",
|
||||
"completion_time": "2023-07-17-21:24",
|
||||
"metrics": {
|
||||
"run_time": "77.71 seconds",
|
||||
"highest_difficulty": "intermediate: 4"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultipleWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m3",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "3145\n3791\n9317\n9471",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "intermediate",
|
||||
"success": true,
|
||||
"success_%": 50.0,
|
||||
"run_time": "77.397 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultipleWithNoise",
|
||||
"completion_time": "2023-07-17-21:19",
|
||||
"metrics": {
|
||||
"run_time": "74.3 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultipleWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m3",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "3145\n3791\n9317\n9471",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "intermediate",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
|
||||
"success_%": 0.0,
|
||||
"run_time": "74.059 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
|
||||
"completion_time": "2023-07-17-21:28",
|
||||
"metrics": {
|
||||
"run_time": "60.86 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m4_phrases",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "advanced",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
|
||||
"success_%": 0.0,
|
||||
"run_time": "60.631 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
|
||||
"completion_time": "2023-07-17-21:32",
|
||||
"metrics": {
|
||||
"run_time": "73.04 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m4_phrases",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "advanced",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
|
||||
"success_%": 0.0,
|
||||
"run_time": "72.736 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
|
||||
"completion_time": "2023-07-17-21:34",
|
||||
"metrics": {
|
||||
"run_time": "81.59 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m4_phrases",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "advanced",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
|
||||
"success_%": 0.0,
|
||||
"run_time": "81.374 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
|
||||
"completion_time": "2023-07-17-21:36",
|
||||
"metrics": {
|
||||
"run_time": "98.32 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m4_phrases",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "advanced",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "98.021 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
|
||||
"completion_time": "2023-07-17-21:42",
|
||||
"metrics": {
|
||||
"run_time": "303.13 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m4_phrases",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "advanced",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
|
||||
"success_%": 0.0,
|
||||
"run_time": "302.919 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
|
||||
"completion_time": "2023-07-17-21:27",
|
||||
"metrics": {
|
||||
"run_time": "77.72 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"data_path": "agbenchmark/challenges/memory/m4_phrases",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
|
||||
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
|
||||
"metrics": {
|
||||
"difficulty": "advanced",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
|
||||
"success_%": 0.0,
|
||||
"run_time": "77.491 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-17-21:46",
|
||||
"metrics": {
|
||||
"run_time": "87.21 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "86.967 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-17-21:47",
|
||||
"metrics": {
|
||||
"run_time": "48.52 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "48.208 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-17-21:55",
|
||||
"metrics": {
|
||||
"run_time": "54.95 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1_debug",
|
||||
"is_regression": false,
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "54.741 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-17-21:44",
|
||||
"metrics": {
|
||||
"run_time": "63.37 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "63.125 seconds"
|
||||
},
|
||||
"reached_cutoff": true
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -9,6 +9,7 @@
|
|||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": false,
|
||||
"reached_cutoff": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
|
@ -18,8 +19,7 @@
|
|||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
"TestReadFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/read_file",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": true,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
|
@ -21,7 +22,6 @@
|
|||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4",
|
||||
"reached_termination_time": true
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
"TestReadFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/read_file",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
|
@ -18,8 +19,7 @@
|
|||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestSearch",
|
||||
"completion_time": "2023-07-17-13:35",
|
||||
"metrics": {
|
||||
"run_time": "20.58 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestSearch": {
|
||||
"data_path": "agbenchmark/challenges/interface/search",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "20.367 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -9,6 +9,7 @@
|
|||
"TestSearch": {
|
||||
"data_path": "agbenchmark/challenges/interface/search",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
|
@ -18,8 +19,7 @@
|
|||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestBasicRetrieval",
|
||||
"completion_time": "2023-07-17-13:31",
|
||||
"metrics": {
|
||||
"run_time": "26.05 seconds",
|
||||
"highest_difficulty": "basic: 2"
|
||||
},
|
||||
"tests": {
|
||||
"TestBasicRetrieval": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r1",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": false,
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "25.818 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-15-22:16",
|
||||
"metrics": {
|
||||
"run_time": "45.92 seconds",
|
||||
"highest_difficulty": ": 0"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "45.599 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestBasicRetrieval",
|
||||
"completion_time": "2023-07-17-13:22",
|
||||
"metrics": {
|
||||
"run_time": "61.24 seconds",
|
||||
"highest_difficulty": "basic: 2"
|
||||
},
|
||||
"tests": {
|
||||
"TestBasicRetrieval": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r1",
|
||||
"is_regression": true,
|
||||
"reached_cutoff": true,
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "60.872 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-15-22:15",
|
||||
"metrics": {
|
||||
"run_time": "32.99 seconds",
|
||||
"highest_difficulty": ": 0"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "32.582 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.0",
|
||||
"completion_time": "2023-07-17-17:10",
|
||||
"metrics": {
|
||||
"run_time": "66.81 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.0": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
|
||||
"is_regression": false,
|
||||
"reached_cutoff": true,
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "A no guardrails search for info",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "66.547 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2",
|
||||
"completion_time": "2023-07-17-13:54",
|
||||
"metrics": {
|
||||
"run_time": "36 seconds",
|
||||
"highest_difficulty": "TestRetrieval2: 3"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
|
||||
"is_regression": false,
|
||||
"reached_cutoff": false,
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "A no guardrails search for info",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": true,
|
||||
"success_%": 50.0,
|
||||
"run_time": "35.59 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.1",
|
||||
"completion_time": "2023-07-17-17:27",
|
||||
"metrics": {
|
||||
"run_time": "64.44 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.1": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
|
||||
"is_regression": false,
|
||||
"reached_cutoff": true,
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "This one checks the accuracy of the information over r2",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0, 0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "64.216 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.1",
|
||||
"completion_time": "2023-07-17-17:53",
|
||||
"metrics": {
|
||||
"run_time": "30.08 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.1": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
|
||||
"is_regression": false,
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "This one checks the accuracy of the information over r2",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "29.711 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.1",
|
||||
"completion_time": "2023-07-17-17:54",
|
||||
"metrics": {
|
||||
"run_time": "27.49 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.1": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
|
||||
"is_regression": false,
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "This one checks the accuracy of the information over r2",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "27.266 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.1",
|
||||
"completion_time": "2023-07-17-17:56",
|
||||
"metrics": {
|
||||
"run_time": "23.64 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.1": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
|
||||
"is_regression": false,
|
||||
"task": "Write Tesla's precise revenue in 2022 into a .txt file.",
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "This one checks the accuracy of the information over r2",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "23.42 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.1",
|
||||
"completion_time": "2023-07-17-14:03",
|
||||
"metrics": {
|
||||
"run_time": "68.39 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.1": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
|
||||
"is_regression": false,
|
||||
"reached_cutoff": true,
|
||||
"answer": "It was $81.462 billion in 2022.",
|
||||
"description": "This one checks the accuracy of the information over r2",
|
||||
"metrics": {
|
||||
"difficulty": "novice",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in []",
|
||||
"success_%": 0.0,
|
||||
"run_time": "68.15 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.2",
|
||||
"completion_time": "2023-07-17-17:57",
|
||||
"metrics": {
|
||||
"run_time": "31.1 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.2": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
|
||||
"is_regression": false,
|
||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
|
||||
"metrics": {
|
||||
"difficulty": "intermediate",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "30.888 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestRetrieval2.2",
|
||||
"completion_time": "2023-07-17-14:04",
|
||||
"metrics": {
|
||||
"run_time": "28.08 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestRetrieval2.2": {
|
||||
"data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
|
||||
"is_regression": false,
|
||||
"reached_cutoff": false,
|
||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
||||
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
|
||||
"metrics": {
|
||||
"difficulty": "intermediate",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "27.857 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestBasicMemory",
|
||||
"completion_time": "2023-07-17-18:22",
|
||||
"metrics": {
|
||||
"run_time": "53.48 seconds",
|
||||
"highest_difficulty": "basic: 2"
|
||||
},
|
||||
"tests": {
|
||||
"TestBasicMemory": {
|
||||
"data_path": "agbenchmark/challenges/memory/m1",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "2314",
|
||||
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": true,
|
||||
"success_%": 75.0,
|
||||
"run_time": "53.252 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestBasicMemory",
|
||||
"completion_time": "2023-07-17-18:18",
|
||||
"metrics": {
|
||||
"run_time": "62.11 seconds",
|
||||
"highest_difficulty": "No successful tests"
|
||||
},
|
||||
"tests": {
|
||||
"TestBasicMemory": {
|
||||
"data_path": "agbenchmark/challenges/memory/m1",
|
||||
"is_regression": false,
|
||||
"task": "Follow the instructions in the instructions_1.txt file",
|
||||
"answer": "2314",
|
||||
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\results.txt'",
|
||||
"success_%": 83.33,
|
||||
"run_time": "61.879 seconds"
|
||||
},
|
||||
"reached_cutoff": false
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue