Fixing memory challenges, naming, testing mini-agi, smooth retrieval scaling (#166)

pull/5155/head
Silen Naihin 2023-07-17 22:41:58 -04:00 committed by GitHub
parent c7a5498f0f
commit 12c5d54583
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
104 changed files with 1022 additions and 187 deletions

View File

@ -103,22 +103,25 @@ class Challenge(ABC):
]
def scoring(self, content: str, ground: Ground) -> float:
print("Scoring content: ", content)
if ground.should_contain:
for should_contain_word in ground.should_contain:
if should_contain_word not in content:
print(f"Word that should exist - {should_contain_word}: False")
return 0.0
else:
print(
f"Word that should exist: {should_contain_word} exists in the content"
)
print(f"Word that should exist - {should_contain_word}: True")
if ground.should_not_contain:
for should_not_contain_word in ground.should_not_contain:
if should_not_contain_word in content:
print(
f"Word that should not exist - {should_not_contain_word}: False"
)
return 0.0
else:
print(
f"Word that should not exist: {should_not_contain_word} does not exist in the content"
f"Word that should not exist - {should_not_contain_word}: True"
)
return 1.0

View File

@ -3,7 +3,7 @@
"category": ["code", "iterate"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": ["TestReadFile", "TestWriteFile"],
"cutoff": 60,
"cutoff": 75,
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],

View File

@ -3,7 +3,7 @@
"category": ["code", "iterate"],
"task": "Make test.py run without errors.",
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"cutoff": 60,
"cutoff": 75,
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],

View File

@ -3,7 +3,7 @@
"category": ["code"],
"task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
"dependencies": ["TestWriteFile"],
"cutoff": 60,
"cutoff": 90,
"ground": {
"answer": "The two_sum function coded properly.",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@ -12,7 +12,7 @@
"type": "execute_python_code"
},
"info": {
"difficulty": "novice",
"difficulty": "advanced",
"description": "Tests ability for the agent to create the two_sum function.",
"side_effects": []
}

View File

@ -3,7 +3,7 @@
"category": ["code"],
"task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
"dependencies": ["TestDebugSimpleTypoWithGuidance"],
"cutoff": 60,
"cutoff": 90,
"ground": {
"answer": "GET localhost:8079/health responds with a 200 OK",
"should_contain": [],

View File

@ -12,7 +12,7 @@
"type": "execute_python_code"
},
"info": {
"difficulty": "intermediate",
"difficulty": "advanced",
"description": "Tests ability for the agent to create the three_sum function.",
"side_effects": []
}

View File

@ -1 +0,0 @@
Write the id previously mentioned to a .txt file.

View File

@ -0,0 +1 @@
Write the id previously mentioned to a file called result.txt.

View File

@ -8,7 +8,7 @@
"answer": "2314",
"should_contain": ["2314"],
"should_not_contain": [],
"files": [".txt"],
"files": ["result.txt"],
"type": "file"
},
"info": {

View File

@ -1 +0,0 @@
Write the ids previously mentioned to a .txt file.

View File

@ -0,0 +1 @@
Write the ids previously mentioned to a file called result.txt.

View File

@ -8,7 +8,7 @@
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
"should_not_contain": [],
"files": [".txt"],
"files": ["result.txt"],
"type": "file"
},
"info": {

View File

@ -1,5 +1,5 @@
sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232
Write the ids previously mentioned to a .txt file.
Write the ids previously mentioned to a file called result.txt.
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0

View File

@ -1,14 +1,14 @@
{
"name": "TestRememberMultipleIdsWithNoise",
"name": "TestRememberMultipleWithNoise",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": ["TestRememberMultipleIds"],
"cutoff": 60,
"cutoff": 75,
"ground": {
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
"should_not_contain": [],
"files": [".txt"],
"files": ["result.txt"],
"type": "file"
},
"info": {

View File

@ -1,5 +1,5 @@
ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx
The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt.
The phrase to remember is 'My pet rock sings better than Beyonce on Tuesdays'. Read the file instructions_4.txt.
8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ

View File

@ -1,5 +1,5 @@
sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232
Write the phrases previously mentioned to a .txt file.
Write the phrases previously mentioned to a file called result.txt.
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0

View File

@ -1,4 +1,4 @@
The purple elephant danced on a rainbow while eating a taco
The sneaky toaster stole my socks and ran away to Hawaii
My pet rock sings better than Beyoncé on Tuesdays
My pet rock sings better than Beyonce on Tuesdays
The giant hamster rode a unicycle through the crowded mall

View File

@ -2,18 +2,18 @@
"name": "TestRememberMultiplePhrasesWithNoise",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": ["TestRememberMultipleIdsWithNoise"],
"cutoff": 60,
"dependencies": ["TestRememberMultipleWithNoise"],
"cutoff": 90,
"ground": {
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"should_contain": [
"The purple elephant danced on a rainbow while eating a taco",
"The sneaky toaster stole my socks and ran away to Hawaii",
"My pet rock sings better than Beyoncé on Tuesdays",
"My pet rock sings better than Beyonce on Tuesdays",
"The giant hamster rode a unicycle through the crowded mall"
],
"should_not_contain": [],
"files": [".txt"],
"files": ["result.txt"],
"type": "file"
},
"info": {

View File

@ -13,7 +13,7 @@
},
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information from a website.",
"description": "Specifies specific website to retrieve website from.",
"side_effects": []
}
}

View File

@ -0,0 +1,19 @@
{
"name": "TestRetrieval2.1",
"category": ["retrieval"],
"task": "Write Tesla's precise revenue in 2022 into a .txt file.",
"dependencies": ["TestRetrieval2.0"],
"cutoff": 60,
"ground": {
"answer": "It was $81.462 billion in 2022.",
"should_contain": ["81", "462"],
"should_not_contain": [],
"files": [".txt"],
"type": "file"
},
"info": {
"difficulty": "novice",
"description": "This one checks the accuracy of the information over r2",
"side_effects": []
}
}

View File

@ -0,0 +1 @@
81,462 Millions

View File

@ -0,0 +1,19 @@
{
"name": "TestRetrieval2.2",
"category": ["retrieval"],
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": ["TestRetrieval2.1"],
"cutoff": 60,
"ground": {
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
"should_contain": ["81,462"],
"should_not_contain": [],
"files": [".txt"],
"type": "file"
},
"info": {
"difficulty": "intermediate",
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
"side_effects": []
}
}

View File

@ -1,19 +0,0 @@
{
"name": "TestRetrieval2",
"category": ["retrieval"],
"task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": ["TestBasicRetrieval"],
"cutoff": 60,
"ground": {
"answer": "81,462",
"should_contain": ["81,462"],
"should_not_contain": [],
"files": [".txt"],
"type": "file"
},
"info": {
"difficulty": "novice",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
}

View File

@ -0,0 +1 @@
81,462 Millions

View File

@ -0,0 +1,19 @@
{
"name": "TestRetrieval2.0",
"category": ["retrieval"],
"task": "Write tesla's revenue in 2022 into a .txt file.",
"dependencies": ["TestBasicRetrieval"],
"cutoff": 60,
"ground": {
"answer": "It was $81.462 billion in 2022.",
"should_contain": ["81"],
"should_not_contain": [],
"files": [".txt"],
"type": "file"
},
"info": {
"difficulty": "novice",
"description": "A no guardrails search for info",
"side_effects": []
}
}

View File

@ -2,7 +2,7 @@
"name": "TestRetrieval3",
"category": ["retrieval"],
"task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": ["TestRetrieval2"],
"dependencies": ["TestRetrieval2.1"],
"cutoff": 60,
"ground": {
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",

View File

@ -135,8 +135,8 @@ internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
def pytest_runtest_makereport(item: Any, call: Any) -> None:
challenge_data = item.funcargs.get("challenge_data", None)
if call.when == "call":
challenge_data = item.funcargs.get("challenge_data", None)
difficulty = (
challenge_data["info"]["difficulty"] if challenge_data else "unknown"
)
@ -157,6 +157,9 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
info_details: Any = {
"data_path": challenge_location,
"is_regression": False,
"task": challenge_data["task"],
"answer": challenge_data["ground"]["answer"],
"description": challenge_data["info"]["description"],
"metrics": {
"difficulty": difficulty,
"success": False,
@ -218,6 +221,10 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
"run_time"
] = f"{str(round(run_time, 3))} seconds"
info_details["reached_cutoff"] = (
float(run_time) > challenge_data["cutoff"]
)
info_manager.add_test(test_name, info_details)

View File

@ -3,28 +3,20 @@
"TestBasicMemory": [
true,
true,
true,
true,
true,
false,
false,
true
],
"TestBasicRetrieval": [
true,
true,
true,
true,
true
],
"TestCreateSimpleWebServer": [
false,
false,
false
],
"TestDebugSimpleTypoWithGuidance": [
false,
false,
false
],
"TestDebugSimpleTypoWithoutGuidance": [
false,
false,
false
],
"TestReadFile": [
true,
true,
@ -32,41 +24,62 @@
true,
true
],
"TestRememberMultipleIds": [
true,
true,
true
],
"TestRememberMultipleIdsWithNoise": [
true,
true,
true
],
"TestRememberMultiplePhrasesWithNoise": [
true,
true,
true
],
"TestRetrieval2": [
true,
true,
true
],
"TestRetrieval3": [
true,
true,
true
],
"TestSearch": [
true,
true,
true,
true,
true
],
"TestWriteFile": [
true,
true,
true,
true
],
"TestRetrieval2.2": [
false,
false,
false,
false
],
"TestRetrieval2.1": [
false,
false,
false,
false,
false,
false
],
"TestRetrieval2.0": [
true,
false
],
"TestRememberMultipleIds": [
false,
false,
true
],
"TestRememberMultipleIdsWithNoise": [
false
],
"TestRememberMultipleWithNoise": [
false,
true
],
"TestRememberMultiplePhrasesWithNoise": [
false,
false,
false,
false,
false,
false
],
"TestDebugSimpleTypoWithGuidance": [
false,
false,
false,
false
]
}
}

View File

@ -1,36 +1,27 @@
{
"command": "agbenchmark start --test TestWriteFile",
"completion_time": "2023-07-17-13:34",
"metrics": {
"run_time": "23.83 seconds",
"highest_difficulty": "interface: 1"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": true,
"metrics": {
"difficulty": "interface",
"success": true,
"non_mock_success_%": 100.0,
"run_time": "0.009 seconds"
}
},
"additional": {
"model": "gpt-3.5-turbo"
},
"command": "agbenchmark start --test TestWriteFile",
"completion_time": "2023-07-17-09:54",
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"metrics": {
"run_time": "22.36 seconds",
"highest_difficulty": "interface: 1"
},
"tests": {
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 40.0,
"run_time": "22.169 seconds"
}
}
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": true,
"reached_cutoff": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"run_time": "23.627 seconds"
}
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRememberMultipleWithNoise",
"completion_time": "2023-07-17-21:24",
"metrics": {
"run_time": "77.71 seconds",
"highest_difficulty": "intermediate: 4"
},
"tests": {
"TestRememberMultipleWithNoise": {
"data_path": "agbenchmark/challenges/memory/m3",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "3145\n3791\n9317\n9471",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "intermediate",
"success": true,
"success_%": 50.0,
"run_time": "77.397 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultipleWithNoise",
"completion_time": "2023-07-17-21:19",
"metrics": {
"run_time": "74.3 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultipleWithNoise": {
"data_path": "agbenchmark/challenges/memory/m3",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "3145\n3791\n9317\n9471",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "intermediate",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
"success_%": 0.0,
"run_time": "74.059 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
"completion_time": "2023-07-17-21:28",
"metrics": {
"run_time": "60.86 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4_phrases",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
"success_%": 0.0,
"run_time": "60.631 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
"completion_time": "2023-07-17-21:32",
"metrics": {
"run_time": "73.04 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4_phrases",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
"success_%": 0.0,
"run_time": "72.736 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
"completion_time": "2023-07-17-21:34",
"metrics": {
"run_time": "81.59 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4_phrases",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
"success_%": 0.0,
"run_time": "81.374 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
"completion_time": "2023-07-17-21:36",
"metrics": {
"run_time": "98.32 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4_phrases",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "98.021 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
"completion_time": "2023-07-17-21:42",
"metrics": {
"run_time": "303.13 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4_phrases",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
"success_%": 0.0,
"run_time": "302.919 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
"completion_time": "2023-07-17-21:27",
"metrics": {
"run_time": "77.72 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRememberMultiplePhrasesWithNoise": {
"data_path": "agbenchmark/challenges/memory/m4_phrases",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
"success_%": 0.0,
"run_time": "77.491 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,28 @@
{
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
"completion_time": "2023-07-17-21:46",
"metrics": {
"run_time": "87.21 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"answer": "[0, 1] [2, 5] [0, 3]",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "86.967 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
}

View File

@ -0,0 +1,28 @@
{
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
"completion_time": "2023-07-17-21:47",
"metrics": {
"run_time": "48.52 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"answer": "[0, 1] [2, 5] [0, 3]",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "48.208 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
}

View File

@ -0,0 +1,28 @@
{
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
"completion_time": "2023-07-17-21:55",
"metrics": {
"run_time": "54.95 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1_debug",
"is_regression": false,
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"answer": "[0, 1] [2, 5] [0, 3]",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "54.741 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
"completion_time": "2023-07-17-21:44",
"metrics": {
"run_time": "63.37 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"answer": "[0, 1] [2, 5] [0, 3]",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "63.125 seconds"
},
"reached_cutoff": true
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -9,6 +9,7 @@
"TestWriteFile": {
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"reached_cutoff": false,
"metrics": {
"difficulty": "interface",
"success": true,
@ -18,8 +19,7 @@
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"

View File

@ -9,6 +9,7 @@
"TestReadFile": {
"data_path": "agbenchmark/challenges/interface/read_file",
"is_regression": true,
"reached_cutoff": true,
"metrics": {
"difficulty": "interface",
"success": true,
@ -21,7 +22,6 @@
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4",
"reached_termination_time": true
"model": "gpt-3.5-turbo"
}
}

View File

@ -9,6 +9,7 @@
"TestReadFile": {
"data_path": "agbenchmark/challenges/interface/read_file",
"is_regression": true,
"reached_cutoff": false,
"metrics": {
"difficulty": "interface",
"success": true,
@ -18,8 +19,7 @@
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"

View File

@ -0,0 +1,27 @@
{
"command": "agbenchmark start --test TestSearch",
"completion_time": "2023-07-17-13:35",
"metrics": {
"run_time": "20.58 seconds",
"highest_difficulty": "interface: 1"
},
"tests": {
"TestSearch": {
"data_path": "agbenchmark/challenges/interface/search",
"is_regression": true,
"reached_cutoff": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 100.0,
"run_time": "20.367 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -9,6 +9,7 @@
"TestSearch": {
"data_path": "agbenchmark/challenges/interface/search",
"is_regression": true,
"reached_cutoff": false,
"metrics": {
"difficulty": "interface",
"success": true,
@ -18,8 +19,7 @@
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"

View File

@ -0,0 +1,27 @@
{
"command": "agbenchmark start --test TestBasicRetrieval",
"completion_time": "2023-07-17-13:31",
"metrics": {
"run_time": "26.05 seconds",
"highest_difficulty": "basic: 2"
},
"tests": {
"TestBasicRetrieval": {
"data_path": "agbenchmark/challenges/retrieval/r1",
"is_regression": true,
"reached_cutoff": false,
"metrics": {
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"run_time": "25.818 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -1,28 +0,0 @@
{
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
"completion_time": "2023-07-15-22:16",
"metrics": {
"run_time": "45.92 seconds",
"highest_difficulty": ": 0"
},
"tests": {
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "45.599 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,27 @@
{
"command": "agbenchmark start --test TestBasicRetrieval",
"completion_time": "2023-07-17-13:22",
"metrics": {
"run_time": "61.24 seconds",
"highest_difficulty": "basic: 2"
},
"tests": {
"TestBasicRetrieval": {
"data_path": "agbenchmark/challenges/retrieval/r1",
"is_regression": true,
"reached_cutoff": true,
"metrics": {
"difficulty": "basic",
"success": true,
"success_%": 100.0,
"run_time": "60.872 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -1,28 +0,0 @@
{
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
"completion_time": "2023-07-15-22:15",
"metrics": {
"run_time": "32.99 seconds",
"highest_difficulty": ": 0"
},
"tests": {
"TestDebugSimpleTypoWithGuidance": {
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "32.582 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark.benchmarks"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRetrieval2.0",
"completion_time": "2023-07-17-17:10",
"metrics": {
"run_time": "66.81 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.0": {
"data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
"is_regression": false,
"reached_cutoff": true,
"answer": "It was $81.462 billion in 2022.",
"description": "A no guardrails search for info",
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "66.547 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -0,0 +1,29 @@
{
"command": "agbenchmark start --test TestRetrieval2",
"completion_time": "2023-07-17-13:54",
"metrics": {
"run_time": "36 seconds",
"highest_difficulty": "TestRetrieval2: 3"
},
"tests": {
"TestRetrieval2": {
"data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
"is_regression": false,
"reached_cutoff": false,
"answer": "It was $81.462 billion in 2022.",
"description": "A no guardrails search for info",
"metrics": {
"difficulty": "novice",
"success": true,
"success_%": 50.0,
"run_time": "35.59 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRetrieval2.1",
"completion_time": "2023-07-17-17:27",
"metrics": {
"run_time": "64.44 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.1": {
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
"is_regression": false,
                "reached_cutoff": true,
"answer": "It was $81.462 billion in 2022.",
"description": "This one checks the accuracy of the information over r2",
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in [0.0, 0.0]",
"success_%": 0.0,
"run_time": "64.216 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRetrieval2.1",
"completion_time": "2023-07-17-17:53",
"metrics": {
"run_time": "30.08 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.1": {
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
"is_regression": false,
"answer": "It was $81.462 billion in 2022.",
"description": "This one checks the accuracy of the information over r2",
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "29.711 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRetrieval2.1",
"completion_time": "2023-07-17-17:54",
"metrics": {
"run_time": "27.49 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.1": {
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
"is_regression": false,
"answer": "It was $81.462 billion in 2022.",
"description": "This one checks the accuracy of the information over r2",
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "27.266 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRetrieval2.1",
"completion_time": "2023-07-17-17:56",
"metrics": {
"run_time": "23.64 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.1": {
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
"is_regression": false,
"task": "Write Tesla's precise revenue in 2022 into a .txt file.",
"answer": "It was $81.462 billion in 2022.",
"description": "This one checks the accuracy of the information over r2",
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "23.42 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRetrieval2.1",
"completion_time": "2023-07-17-14:03",
"metrics": {
"run_time": "68.39 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.1": {
"data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
"is_regression": false,
"reached_cutoff": true,
"answer": "It was $81.462 billion in 2022.",
"description": "This one checks the accuracy of the information over r2",
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "assert 1 in []",
"success_%": 0.0,
"run_time": "68.15 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestRetrieval2.2",
"completion_time": "2023-07-17-17:57",
"metrics": {
"run_time": "31.1 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.2": {
"data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
"is_regression": false,
            "task": "Write Tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
"metrics": {
"difficulty": "intermediate",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "30.888 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestRetrieval2.2",
"completion_time": "2023-07-17-14:04",
"metrics": {
"run_time": "28.08 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestRetrieval2.2": {
"data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
"is_regression": false,
"reached_cutoff": false,
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
"metrics": {
"difficulty": "intermediate",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0.0,
"run_time": "27.857 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,30 @@
{
"command": "agbenchmark start --test TestBasicMemory",
"completion_time": "2023-07-17-18:22",
"metrics": {
"run_time": "53.48 seconds",
"highest_difficulty": "basic: 2"
},
"tests": {
"TestBasicMemory": {
"data_path": "agbenchmark/challenges/memory/m1",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "2314",
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
"metrics": {
"difficulty": "basic",
"success": true,
"success_%": 75.0,
"run_time": "53.252 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-4"
}
}

View File

@ -0,0 +1,31 @@
{
"command": "agbenchmark start --test TestBasicMemory",
"completion_time": "2023-07-17-18:18",
"metrics": {
"run_time": "62.11 seconds",
"highest_difficulty": "No successful tests"
},
"tests": {
"TestBasicMemory": {
"data_path": "agbenchmark/challenges/memory/m1",
"is_regression": false,
"task": "Follow the instructions in the instructions_1.txt file",
"answer": "2314",
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\results.txt'",
"success_%": 83.33,
"run_time": "61.879 seconds"
},
"reached_cutoff": false
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
},
"additional": {
"model": "gpt-3.5-turbo"
}
}

Some files were not shown because too many files have changed in this diff Show More