diff --git a/.gitmodules b/.gitmodules index f14b5e07d..d2b71f9c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "agent/Auto-GPT"] path = agent/Auto-GPT - url = https://github.com/Significant-Gravitas/Auto-GPT.git + url = https://github.com/merwanehamadi/Auto-GPT.git branch = benchmark-integration [submodule "agent/gpt-engineer"] path = agent/gpt-engineer diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 991a7e8e0..897f4f8cf 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -16,9 +16,7 @@ MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False def run_agent( - task: str, - config: Dict[str, Any], - challenge_location: str, + task: str, config: Dict[str, Any], challenge_location: str, cutoff: int ) -> None: """Calling to get a response""" @@ -27,9 +25,7 @@ def run_agent( config["workspace"], "artifacts_out", challenge_location ) else: - print( - f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}" - ) + print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}") command = [sys.executable, "-m", config["entry_path"], str(task)] process = subprocess.Popen( command, @@ -50,11 +46,11 @@ def run_agent( if ( process.poll() is not None or output == "" - or (time.time() - start_time > config["cutoff"]) + or (time.time() - start_time > cutoff) ): break - if time.time() - start_time > config["cutoff"]: + if time.time() - start_time > cutoff: print("The Python function has exceeded the time limit and was terminated.") process.kill() else: diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index f07faf8ee..4f24bb603 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -4,16 +4,9 @@ import subprocess from abc import ABC from typing import Any, Dict, List -from dotenv import load_dotenv - from agbenchmark.challenges.define_task_types import ChallengeData, Ground from agbenchmark.start_benchmark import CURRENT_DIRECTORY -load_dotenv() - -mock_test_str = os.getenv("MOCK_TEST") -MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False - class Challenge(ABC): """The parent class to all specific challenges classes. @@ -37,14 +30,14 @@ class Challenge(ABC): def dependencies(self) -> list: return self.data.dependencies - def setup_challenge(self, config: Dict[str, Any]) -> None: + def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None: from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION ) - run_agent(self.task, config, self.CHALLENGE_LOCATION) + run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff) # hidden files are added after the agent runs. Hidden files can be python test files. # We copy them in the workspace to make it easy to import the code produced by the agent diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index bc1a15b42..d8e0280a4 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index fca86f29b..de32ef9a7 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 60, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index ae0e45581..c5d111a4d 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -3,6 +3,7 @@ "category": ["code"], "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 60, "ground": { "answer": "GET localhost:8079/health responds with a 200 OK", "should_contain": [], diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json index b2320a4e5..e8db918d2 100644 --- a/agbenchmark/challenges/code/d4/data.json +++ b/agbenchmark/challenges/code/d4/data.json @@ -3,6 +3,7 @@ "category": ["code"], "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "The two_sum function coded properly.", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json index 4b44c6943..434b1312e 100644 --- a/agbenchmark/challenges/code/d5/data.json +++ b/agbenchmark/challenges/code/d5/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "cutoff": 60, "ground": { "answer": "The three_sum function coded properly.", "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 668025dd2..dc1777d71 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -61,6 +61,7 @@ class ChallengeData(BaseModel): category: List[str] task: str dependencies: List[str] + cutoff: int ground: Ground info: Info diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index 1bf340454..55878856b 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "random string Hello World!", "should_contain": ["random string", "Hello World!"], diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index de8934d95..793e71715 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "This is a Heading\nThis is a paragraph.", "should_contain": ["Heading", "paragraph"], diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index 8db9cd620..426e6c3d4 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Print the the capital of America to a .txt file", "dependencies": [], + "cutoff": 60, "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index ab86f1c3c..33c523244 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, "ground": { "answer": "2314", "should_contain": ["2314"], diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 9205c99f1..840e8dc83 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestBasicMemory"], + "cutoff": 60, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 3b78d614b..3af2fb3a5 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], + "cutoff": 60, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 84f5c2b21..17a078e1b 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIdsWithNoise"], + "cutoff": 60, "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "should_contain": [ diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index e3e09302d..c3af4862d 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": ["TestWriteFile", "TestSearch"], + "cutoff": 60, "ground": { "answer": "£25.89", "should_contain": ["25.89"], diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 977be4bcd..f558b8584 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestBasicRetrieval"], + "cutoff": 60, "ground": { "answer": "81,462", "should_contain": ["81,462"], diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 5504908ea..eb998ffbf 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestRetrieval2"], + "cutoff": 60, "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": [ diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 98a5ab81a..255b39e57 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -54,7 +54,8 @@ def generate_tests() -> None: # Define test method within the dynamically created class def test_method(self, config: Dict[str, Any]) -> None: # type: ignore - self.setup_challenge(config) + cutoff = self.data.cutoff or 60 + self.setup_challenge(config, cutoff) scores = self.get_scores(config) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index af83029ef..820f133b1 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,4 @@ { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks", - "cutoff": 60 + "entry_path": "agbenchmark.benchmarks" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 952588105..245df485e 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -18,12 +18,10 @@ from agbenchmark.start_benchmark import ( from agbenchmark.utils import calculate_success_percentage -def resolve_workspace(config: Dict[str, Any]) -> str: - if config.get("workspace", "").startswith("${") and config.get( - "workspace", "" - ).endswith("}"): +def resolve_workspace(workspace: str) -> str: + if workspace.startswith("${") and workspace.endswith("}"): # Extract the string inside ${...} - path_expr = config["workspace"][2:-1] + path_expr = workspace[2:-1] # Check if it starts with "os.path.join" if path_expr.strip().startswith("os.path.join"): @@ -35,7 +33,7 @@ def resolve_workspace(config: Dict[str, Any]) -> str: else: raise ValueError("Invalid workspace path expression.") else: - return os.path.abspath(Path(os.getcwd()) / config["workspace"]) + return os.path.abspath(Path(os.getcwd()) / workspace) @pytest.fixture(scope="module") @@ -45,10 +43,10 @@ def config(request: Any) -> None: config = json.load(f) if isinstance(config["workspace"], str): - config["workspace"] = resolve_workspace(config) + config["workspace"] = resolve_workspace(config["workspace"]) else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace(config) - config["workspace"]["output"] = resolve_workspace(config) + config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"]) + config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"]) return config @@ -173,18 +171,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) - prev_test_results: list[bool] = [] - + prev_test_results: list[bool] = internal_info.tests.get(test_name, []) if not mock: # only add if it's an actual test - prev_test_results = internal_info.tests.get(test_name, []) prev_test_results.append(info_details["metrics"]["success"]) internal_info.add_test(test_name, prev_test_results) - # can calculate success rate regardless of mock - info_details["metrics"]["success_%"] = calculate_success_percentage( - prev_test_results - ) + # can calculate success rate regardless of mock + info_details["metrics"]["success_%"] = calculate_success_percentage( + prev_test_results + ) + else: + # can calculate success rate regardless of mock + info_details["metrics"][ + "non_mock_success_%" + ] = calculate_success_percentage(prev_test_results) if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: # if the last 3 tests were successful, add to the regression tests diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json index 5f46bd854..95a051d54 100644 --- a/agbenchmark/internal_info.json +++ b/agbenchmark/internal_info.json @@ -62,6 +62,12 @@ "TestWriteFile": [ true, true, - true + true, + false, + false, + false, + false, + true, + false ] } \ No newline at end of file diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json index ce73ce263..25591a4de 100644 --- a/agbenchmark/regression_tests.json +++ b/agbenchmark/regression_tests.json @@ -16,57 +16,52 @@ "data_path": "agbenchmark/challenges/retrieval/r1" }, "TestReadFile": { - "difficulty": "basic", + "difficulty": "interface", "dependencies": [ "TestWriteFile" ], "data_path": "agbenchmark/challenges/interface/read_file" }, "TestRememberMultipleIds": { - "difficulty": "basic", + "difficulty": "novice", "dependencies": [ "TestBasicMemory" ], "data_path": "agbenchmark/challenges/memory/m2" }, "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", + "difficulty": "intermediate", "dependencies": [ "TestRememberMultipleIds" ], "data_path": "agbenchmark/challenges/memory/m3" }, "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "difficulty": "advanced", "dependencies": [ "TestRememberMultipleIdsWithNoise" ], "data_path": "agbenchmark/challenges/memory/m4" }, "TestRetrieval2": { - "difficulty": "basic", + "difficulty": "novice", "dependencies": [ "TestBasicRetrieval" ], "data_path": "agbenchmark/challenges/retrieval/r2" }, "TestRetrieval3": { - "difficulty": "basic", + "difficulty": "intermediate", "dependencies": [ "TestRetrieval2" ], "data_path": "agbenchmark/challenges/retrieval/r3" }, "TestSearch": { - "difficulty": "basic", + "difficulty": "interface", "dependencies": [ "TestWriteFile" ], "data_path": "agbenchmark/challenges/interface/search" - }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "data_path": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/file1_07-14-18-54.json similarity index 72% rename from agbenchmark/reports/1.json rename to agbenchmark/reports/file1_07-14-18-54.json index 45945a3ee..f81d19d3d 100644 --- a/agbenchmark/reports/1.json +++ b/agbenchmark/reports/file1_07-14-18-54.json @@ -1,8 +1,8 @@ { "command": "agbenchmark start --mock", - "completion_time": "2023-07-11-21:09", + "completion_time": "2023-07-14-18:54", "metrics": { - "run_time": "0.96 seconds", + "run_time": "0.97 seconds", "highest_difficulty": "advanced: 5" }, "tests": { @@ -12,28 +12,28 @@ "metrics": { "difficulty": "interface", "success": true, - "success_%": 0, - "run_time": "0.008 seconds" + "non_mock_success_%": 75.0, + "run_time": "0.007 seconds" } }, "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "interface", "success": true, - "success_%": 0, - "run_time": "0.005 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.008 seconds" } }, "TestSearch": { "data_path": "agbenchmark/challenges/interface/search", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "interface", "success": true, - "success_%": 0, - "run_time": "0.006 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.007 seconds" } }, "TestDebugSimpleTypoWithGuidance": { @@ -43,28 +43,28 @@ "difficulty": "basic", "success": false, "fail_reason": "assert 1 in [0.0]", - "success_%": 0, - "run_time": "0.489 seconds" + "non_mock_success_%": 0.0, + "run_time": "0.448 seconds" } }, "TestBasicMemory": { "data_path": "agbenchmark/challenges/memory/m1", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "basic", "success": true, - "success_%": 0, - "run_time": "0.02 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.028 seconds" } }, "TestBasicRetrieval": { "data_path": "agbenchmark/challenges/retrieval/r1", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "basic", "success": true, - "success_%": 0, - "run_time": "0.01 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.014 seconds" } }, "TestDebugSimpleTypoWithoutGuidance": { @@ -74,7 +74,7 @@ "difficulty": "novice", "success": false, "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0, + "non_mock_success_%": 0.0, "run_time": "0.001 seconds" } }, @@ -85,64 +85,63 @@ "difficulty": "advanced", "success": false, "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0, - "run_time": "0.001 seconds" + "non_mock_success_%": 0.0, + "run_time": "0.002 seconds" } }, "TestRememberMultipleIds": { "data_path": "agbenchmark/challenges/memory/m2", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "novice", "success": true, - "success_%": 0, - "run_time": "0.018 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.023 seconds" } }, "TestRetrieval2": { "data_path": "agbenchmark/challenges/retrieval/r2", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "novice", "success": true, - "success_%": 0, - "run_time": "0.009 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.013 seconds" } }, "TestRememberMultipleIdsWithNoise": { "data_path": "agbenchmark/challenges/memory/m3", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "intermediate", "success": true, - "success_%": 0, - "run_time": "0.022 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.03 seconds" } }, "TestRetrieval3": { "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "intermediate", "success": true, - "success_%": 0, - "run_time": "0.01 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.016 seconds" } }, "TestRememberMultiplePhrasesWithNoise": { "data_path": "agbenchmark/challenges/memory/m4", - "is_regression": false, + "is_regression": true, "metrics": { "difficulty": "advanced", "success": true, - "success_%": 0, - "run_time": "0.021 seconds" + "non_mock_success_%": 100.0, + "run_time": "0.034 seconds" } } }, "config": { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks", - "cutoff": 60 + "entry_path": "agbenchmark.benchmarks" } } \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index ab2586e60..b31c9f5f9 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -62,7 +62,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) - config["entry_path"] = click.prompt( "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder", - default="benchmarks.py", + default="agbenchmark/benchmarks.py", ) config["cutoff"] = click.prompt( diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 598113d3d..1174e89bb 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,6 +1,7 @@ # radio charts, logs, helper functions for tests, anything else relevant. import glob import re +from datetime import datetime from pathlib import Path from typing import Any @@ -12,11 +13,13 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str: if not INFO_TESTS_PATH.exists(): INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) - return str(INFO_TESTS_PATH / "1.json") + return str( + INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" + ) else: json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) file_count = len(json_files) - run_name = f"{file_count + 1}.json" + run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" new_file_path = INFO_TESTS_PATH / run_name return str(new_file_path) @@ -35,8 +38,10 @@ def replace_backslash(value: Any) -> Any: def calculate_success_percentage(results: list[bool]) -> float: - success_count = results.count(True) - total_count = len(results) + # Take the last 10 results or all if less than 10 + last_results = results[-10:] if len(results) > 10 else results + success_count = last_results.count(True) + total_count = len(last_results) if total_count == 0: return 0 success_percentage = (success_count / total_count) * 100 # as a percentage @@ -45,7 +50,7 @@ def calculate_success_percentage(results: list[bool]) -> float: def get_highest_success_difficulty(data: dict) -> str: highest_difficulty = None - highest_difficulty_level = -1 + highest_difficulty_level = 0 for test_name, test_data in data.items(): if test_data["metrics"]["success"]: diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 357a918ec..62ad7aa8c 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84 +Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16 diff --git a/agent/SuperAGI b/agent/SuperAGI index bd4b3def6..f880b2464 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007 +Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b diff --git a/agent/config_example.json b/agent/config_example.json index 7ab65bc20..9e8bd3f08 100644 --- a/agent/config_example.json +++ b/agent/config_example.json @@ -1,5 +1,4 @@ { "workspace": "projects/my-new-project/workspace", - "entry_path": "agbenchmark/benchmarks.py", - "cutoff": 60 + "entry_path": "agbenchmark/benchmarks.py" } diff --git a/agent/gpt-engineer b/agent/gpt-engineer index f0c76918d..a0162df0d 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82 +Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0 diff --git a/agent/mini-agi b/agent/mini-agi index 08764876d..0f8eba95d 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0 +Subproject commit 0f8eba95d284a9a06801b40ae02c55f65f1a0ce9 diff --git a/agent/smol-developer b/agent/smol-developer index f4f439551..70b57dd04 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898 +Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7