diff --git a/.gitmodules b/.gitmodules
index f14b5e07d..d2b71f9c4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "agent/Auto-GPT"]
 	path = agent/Auto-GPT
-	url = https://github.com/Significant-Gravitas/Auto-GPT.git
+	url = https://github.com/merwanehamadi/Auto-GPT.git
 	branch = benchmark-integration
 [submodule "agent/gpt-engineer"]
 	path = agent/gpt-engineer
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 991a7e8e0..897f4f8cf 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -16,9 +16,7 @@ MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False
 
 
 def run_agent(
-    task: str,
-    config: Dict[str, Any],
-    challenge_location: str,
+    task: str, config: Dict[str, Any], challenge_location: str, cutoff: int
 ) -> None:
     """Calling to get a response"""
 
@@ -27,9 +25,7 @@ def run_agent(
             config["workspace"], "artifacts_out", challenge_location
         )
     else:
-        print(
-            f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}"
-        )
+        print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}")
         command = [sys.executable, "-m", config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
@@ -50,11 +46,11 @@ def run_agent(
             if (
                 process.poll() is not None
                 or output == ""
-                or (time.time() - start_time > config["cutoff"])
+                or (time.time() - start_time > cutoff)
             ):
                 break
 
-        if time.time() - start_time > config["cutoff"]:
+        if time.time() - start_time > cutoff:
             print("The Python function has exceeded the time limit and was terminated.")
             process.kill()
         else:
diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py
index f07faf8ee..4f24bb603 100644
--- a/agbenchmark/challenge.py
+++ b/agbenchmark/challenge.py
@@ -4,16 +4,9 @@ import subprocess
 from abc import ABC
 from typing import Any, Dict, List
 
-from dotenv import load_dotenv
-
 from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 from agbenchmark.start_benchmark import CURRENT_DIRECTORY
 
-load_dotenv()
-
-mock_test_str = os.getenv("MOCK_TEST")
-MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
-
 
 class Challenge(ABC):
     """The parent class to all specific challenges classes.
@@ -37,14 +30,14 @@ class Challenge(ABC):
     def dependencies(self) -> list:
         return self.data.dependencies
 
-    def setup_challenge(self, config: Dict[str, Any]) -> None:
+    def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
 
         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
         )
 
-        run_agent(self.task, config, self.CHALLENGE_LOCATION)
+        run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff)
 
         # hidden files are added after the agent runs. Hidden files can be python test files.
         # We copy them in the workspace to make it easy to import the code produced by the agent
diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json
index bc1a15b42..d8e0280a4 100644
--- a/agbenchmark/challenges/code/d1/data.json
+++ b/agbenchmark/challenges/code/d1/data.json
@@ -3,6 +3,7 @@
   "category": ["code", "iterate"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": ["TestReadFile", "TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json
index fca86f29b..de32ef9a7 100644
--- a/agbenchmark/challenges/code/d2/data.json
+++ b/agbenchmark/challenges/code/d2/data.json
@@ -3,6 +3,7 @@
   "category": ["code", "iterate"],
   "task": "Make test.py run without errors.",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
+  "cutoff": 60,
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json
index ae0e45581..c5d111a4d 100644
--- a/agbenchmark/challenges/code/d3/data.json
+++ b/agbenchmark/challenges/code/d3/data.json
@@ -3,6 +3,7 @@
   "category": ["code"],
   "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
+  "cutoff": 60,
   "ground": {
     "answer": "GET localhost:8079/health responds with a 200 OK",
     "should_contain": [],
diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json
index b2320a4e5..e8db918d2 100644
--- a/agbenchmark/challenges/code/d4/data.json
+++ b/agbenchmark/challenges/code/d4/data.json
@@ -3,6 +3,7 @@
   "category": ["code"],
   "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
   "dependencies": ["TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "The two_sum function coded properly.",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json
index 4b44c6943..434b1312e 100644
--- a/agbenchmark/challenges/code/d5/data.json
+++ b/agbenchmark/challenges/code/d5/data.json
@@ -3,6 +3,7 @@
   "category": ["code", "iterate"],
   "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
   "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"],
+  "cutoff": 60,
   "ground": {
     "answer": "The three_sum function coded properly.",
     "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"],
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 668025dd2..dc1777d71 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -61,6 +61,7 @@ class ChallengeData(BaseModel):
     category: List[str]
     task: str
     dependencies: List[str]
+    cutoff: int
     ground: Ground
     info: Info
 
diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json
index 1bf340454..55878856b 100644
--- a/agbenchmark/challenges/interface/read_file/data.json
+++ b/agbenchmark/challenges/interface/read_file/data.json
@@ -3,6 +3,7 @@
   "category": ["interface"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
   "dependencies": ["TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "random string Hello World!",
     "should_contain": ["random string", "Hello World!"],
diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json
index de8934d95..793e71715 100644
--- a/agbenchmark/challenges/interface/search/data.json
+++ b/agbenchmark/challenges/interface/search/data.json
@@ -3,6 +3,7 @@
   "category": ["interface"],
   "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
   "dependencies": ["TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "This is a Heading\nThis is a paragraph.",
     "should_contain": ["Heading", "paragraph"],
diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json
index 8db9cd620..426e6c3d4 100644
--- a/agbenchmark/challenges/interface/write_file/data.json
+++ b/agbenchmark/challenges/interface/write_file/data.json
@@ -3,6 +3,7 @@
   "category": ["interface"],
   "task": "Print the the capital of America to a .txt file",
   "dependencies": [],
+  "cutoff": 60,
   "ground": {
     "answer": "Washington",
     "should_contain": ["Washington"],
diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json
index ab86f1c3c..33c523244 100644
--- a/agbenchmark/challenges/memory/m1/data.json
+++ b/agbenchmark/challenges/memory/m1/data.json
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestReadFile", "TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "2314",
     "should_contain": ["2314"],
diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json
index 9205c99f1..840e8dc83 100644
--- a/agbenchmark/challenges/memory/m2/data.json
+++ b/agbenchmark/challenges/memory/m2/data.json
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestBasicMemory"],
+  "cutoff": 60,
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json
index 3b78d614b..3af2fb3a5 100644
--- a/agbenchmark/challenges/memory/m3/data.json
+++ b/agbenchmark/challenges/memory/m3/data.json
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIds"],
+  "cutoff": 60,
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json
index 84f5c2b21..17a078e1b 100644
--- a/agbenchmark/challenges/memory/m4/data.json
+++ b/agbenchmark/challenges/memory/m4/data.json
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIdsWithNoise"],
+  "cutoff": 60,
   "ground": {
     "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
     "should_contain": [
diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json
index e3e09302d..c3af4862d 100644
--- a/agbenchmark/challenges/retrieval/r1/data.json
+++ b/agbenchmark/challenges/retrieval/r1/data.json
@@ -3,6 +3,7 @@
   "category": ["retrieval"],
   "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
   "dependencies": ["TestWriteFile", "TestSearch"],
+  "cutoff": 60,
   "ground": {
     "answer": "£25.89",
     "should_contain": ["25.89"],
diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json
index 977be4bcd..f558b8584 100644
--- a/agbenchmark/challenges/retrieval/r2/data.json
+++ b/agbenchmark/challenges/retrieval/r2/data.json
@@ -3,6 +3,7 @@
   "category": ["retrieval"],
   "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
   "dependencies": ["TestBasicRetrieval"],
+  "cutoff": 60,
   "ground": {
     "answer": "81,462",
     "should_contain": ["81,462"],
diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json
index 5504908ea..eb998ffbf 100644
--- a/agbenchmark/challenges/retrieval/r3/data.json
+++ b/agbenchmark/challenges/retrieval/r3/data.json
@@ -3,6 +3,7 @@
   "category": ["retrieval"],
   "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
   "dependencies": ["TestRetrieval2"],
+  "cutoff": 60,
   "ground": {
     "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
     "should_contain": [
diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py
index 98a5ab81a..255b39e57 100644
--- a/agbenchmark/challenges/test_all.py
+++ b/agbenchmark/challenges/test_all.py
@@ -54,7 +54,8 @@ def generate_tests() -> None:
 
         # Define test method within the dynamically created class
         def test_method(self, config: Dict[str, Any]) -> None:  # type: ignore
-            self.setup_challenge(config)
+            cutoff = self.data.cutoff or 60
+            self.setup_challenge(config, cutoff)
 
             scores = self.get_scores(config)
 
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index af83029ef..820f133b1 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,5 +1,4 @@
 {
   "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-  "entry_path": "agbenchmark.benchmarks",
-  "cutoff": 60
+  "entry_path": "agbenchmark.benchmarks"
 }
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 952588105..245df485e 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -18,12 +18,10 @@ from agbenchmark.start_benchmark import (
 from agbenchmark.utils import calculate_success_percentage
 
 
-def resolve_workspace(config: Dict[str, Any]) -> str:
-    if config.get("workspace", "").startswith("${") and config.get(
-        "workspace", ""
-    ).endswith("}"):
+def resolve_workspace(workspace: str) -> str:
+    if workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}
-        path_expr = config["workspace"][2:-1]
+        path_expr = workspace[2:-1]
 
         # Check if it starts with "os.path.join"
         if path_expr.strip().startswith("os.path.join"):
@@ -35,7 +33,7 @@ def resolve_workspace(config: Dict[str, Any]) -> str:
         else:
             raise ValueError("Invalid workspace path expression.")
     else:
-        return os.path.abspath(Path(os.getcwd()) / config["workspace"])
+        return os.path.abspath(Path(os.getcwd()) / workspace)
 
 
 @pytest.fixture(scope="module")
@@ -45,10 +43,10 @@ def config(request: Any) -> None:
         config = json.load(f)
 
     if isinstance(config["workspace"], str):
-        config["workspace"] = resolve_workspace(config)
+        config["workspace"] = resolve_workspace(config["workspace"])
     else:  # it's a input output dict
-        config["workspace"]["input"] = resolve_workspace(config)
-        config["workspace"]["output"] = resolve_workspace(config)
+        config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"])
+        config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"])
 
     return config
 
@@ -173,18 +171,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
                 regression_manager.remove_test(test_name)
             info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
 
-        prev_test_results: list[bool] = []
-
+        prev_test_results: list[bool] = internal_info.tests.get(test_name, [])
         if not mock:
             # only add if it's an actual test
-            prev_test_results = internal_info.tests.get(test_name, [])
             prev_test_results.append(info_details["metrics"]["success"])
             internal_info.add_test(test_name, prev_test_results)
 
-        # can calculate success rate regardless of mock
-        info_details["metrics"]["success_%"] = calculate_success_percentage(
-            prev_test_results
-        )
+            # success rate over the recorded non-mock runs, including this one
+            info_details["metrics"]["success_%"] = calculate_success_percentage(
+                prev_test_results
+            )
+        else:
+            # mock run: report the success rate of previously recorded non-mock runs
+            info_details["metrics"][
+                "non_mock_success_%"
+            ] = calculate_success_percentage(prev_test_results)
 
         if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
             # if the last 3 tests were successful, add to the regression tests
diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json
index 5f46bd854..95a051d54 100644
--- a/agbenchmark/internal_info.json
+++ b/agbenchmark/internal_info.json
@@ -62,6 +62,12 @@
     "TestWriteFile": [
         true,
         true,
-        true
+        true,
+        false,
+        false,
+        false,
+        false,
+        true,
+        false
     ]
 }
\ No newline at end of file
diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json
index ce73ce263..25591a4de 100644
--- a/agbenchmark/regression_tests.json
+++ b/agbenchmark/regression_tests.json
@@ -16,57 +16,52 @@
         "data_path": "agbenchmark/challenges/retrieval/r1"
     },
     "TestReadFile": {
-        "difficulty": "basic",
+        "difficulty": "interface",
         "dependencies": [
             "TestWriteFile"
         ],
         "data_path": "agbenchmark/challenges/interface/read_file"
     },
     "TestRememberMultipleIds": {
-        "difficulty": "basic",
+        "difficulty": "novice",
         "dependencies": [
             "TestBasicMemory"
         ],
         "data_path": "agbenchmark/challenges/memory/m2"
     },
     "TestRememberMultipleIdsWithNoise": {
-        "difficulty": "medium",
+        "difficulty": "intermediate",
         "dependencies": [
             "TestRememberMultipleIds"
         ],
         "data_path": "agbenchmark/challenges/memory/m3"
     },
     "TestRememberMultiplePhrasesWithNoise": {
-        "difficulty": "medium",
+        "difficulty": "advanced",
         "dependencies": [
             "TestRememberMultipleIdsWithNoise"
         ],
         "data_path": "agbenchmark/challenges/memory/m4"
     },
     "TestRetrieval2": {
-        "difficulty": "basic",
+        "difficulty": "novice",
         "dependencies": [
             "TestBasicRetrieval"
         ],
         "data_path": "agbenchmark/challenges/retrieval/r2"
     },
     "TestRetrieval3": {
-        "difficulty": "basic",
+        "difficulty": "intermediate",
         "dependencies": [
             "TestRetrieval2"
         ],
         "data_path": "agbenchmark/challenges/retrieval/r3"
     },
     "TestSearch": {
-        "difficulty": "basic",
+        "difficulty": "interface",
         "dependencies": [
             "TestWriteFile"
         ],
         "data_path": "agbenchmark/challenges/interface/search"
-    },
-    "TestWriteFile": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "data_path": "agbenchmark/challenges/interface/write_file"
     }
 }
\ No newline at end of file
diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/file1_07-14-18-54.json
similarity index 72%
rename from agbenchmark/reports/1.json
rename to agbenchmark/reports/file1_07-14-18-54.json
index 45945a3ee..f81d19d3d 100644
--- a/agbenchmark/reports/1.json
+++ b/agbenchmark/reports/file1_07-14-18-54.json
@@ -1,8 +1,8 @@
 {
     "command": "agbenchmark start --mock",
-    "completion_time": "2023-07-11-21:09",
+    "completion_time": "2023-07-14-18:54",
     "metrics": {
-        "run_time": "0.96 seconds",
+        "run_time": "0.97 seconds",
         "highest_difficulty": "advanced: 5"
     },
     "tests": {
@@ -12,28 +12,28 @@
             "metrics": {
                 "difficulty": "interface",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.008 seconds"
+                "non_mock_success_%": 75.0,
+                "run_time": "0.007 seconds"
             }
         },
         "TestReadFile": {
             "data_path": "agbenchmark/challenges/interface/read_file",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "interface",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.005 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.008 seconds"
             }
         },
         "TestSearch": {
             "data_path": "agbenchmark/challenges/interface/search",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "interface",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.006 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.007 seconds"
             }
         },
         "TestDebugSimpleTypoWithGuidance": {
@@ -43,28 +43,28 @@
                 "difficulty": "basic",
                 "success": false,
                 "fail_reason": "assert 1 in [0.0]",
-                "success_%": 0,
-                "run_time": "0.489 seconds"
+                "non_mock_success_%": 0.0,
+                "run_time": "0.448 seconds"
             }
         },
         "TestBasicMemory": {
             "data_path": "agbenchmark/challenges/memory/m1",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "basic",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.02 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.028 seconds"
             }
         },
         "TestBasicRetrieval": {
             "data_path": "agbenchmark/challenges/retrieval/r1",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "basic",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.01 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.014 seconds"
             }
         },
         "TestDebugSimpleTypoWithoutGuidance": {
@@ -74,7 +74,7 @@
                 "difficulty": "novice",
                 "success": false,
                 "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
-                "success_%": 0,
+                "non_mock_success_%": 0.0,
                 "run_time": "0.001 seconds"
             }
         },
@@ -85,64 +85,63 @@
                 "difficulty": "advanced",
                 "success": false,
                 "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
-                "success_%": 0,
-                "run_time": "0.001 seconds"
+                "non_mock_success_%": 0.0,
+                "run_time": "0.002 seconds"
             }
         },
         "TestRememberMultipleIds": {
             "data_path": "agbenchmark/challenges/memory/m2",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "novice",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.018 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.023 seconds"
             }
         },
         "TestRetrieval2": {
             "data_path": "agbenchmark/challenges/retrieval/r2",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "novice",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.009 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.013 seconds"
             }
         },
         "TestRememberMultipleIdsWithNoise": {
             "data_path": "agbenchmark/challenges/memory/m3",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "intermediate",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.022 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.03 seconds"
             }
         },
         "TestRetrieval3": {
             "data_path": "agbenchmark/challenges/retrieval/r3",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "intermediate",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.01 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.016 seconds"
             }
         },
         "TestRememberMultiplePhrasesWithNoise": {
             "data_path": "agbenchmark/challenges/memory/m4",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "advanced",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.021 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.034 seconds"
             }
         }
     },
     "config": {
         "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-        "entry_path": "agbenchmark.benchmarks",
-        "cutoff": 60
+        "entry_path": "agbenchmark.benchmarks"
     }
 }
\ No newline at end of file
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index ab2586e60..b31c9f5f9 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -62,7 +62,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -
 
         config["entry_path"] = click.prompt(
             "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-            default="benchmarks.py",
+            default="agbenchmark/benchmarks.py",
         )
 
         config["cutoff"] = click.prompt(
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index 598113d3d..1174e89bb 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -1,6 +1,7 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
 import glob
 import re
+from datetime import datetime
 from pathlib import Path
 from typing import Any
 
@@ -12,11 +13,13 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
 
     if not INFO_TESTS_PATH.exists():
         INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
-        return str(INFO_TESTS_PATH / "1.json")
+        return str(
+            INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
+        )
     else:
         json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
         file_count = len(json_files)
-        run_name = f"{file_count + 1}.json"
+        run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
         new_file_path = INFO_TESTS_PATH / run_name
         return str(new_file_path)
 
@@ -35,8 +38,10 @@ def replace_backslash(value: Any) -> Any:
 
 
 def calculate_success_percentage(results: list[bool]) -> float:
-    success_count = results.count(True)
-    total_count = len(results)
+    # Take the last 10 results or all if less than 10
+    last_results = results[-10:] if len(results) > 10 else results
+    success_count = last_results.count(True)
+    total_count = len(last_results)
     if total_count == 0:
         return 0
     success_percentage = (success_count / total_count) * 100  # as a percentage
@@ -45,7 +50,7 @@ def calculate_success_percentage(results: list[bool]) -> float:
 
 def get_highest_success_difficulty(data: dict) -> str:
     highest_difficulty = None
-    highest_difficulty_level = -1
+    highest_difficulty_level = 0
 
     for test_name, test_data in data.items():
         if test_data["metrics"]["success"]:
diff --git a/agent/Auto-GPT b/agent/Auto-GPT
index 357a918ec..62ad7aa8c 160000
--- a/agent/Auto-GPT
+++ b/agent/Auto-GPT
@@ -1 +1 @@
-Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84
+Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16
diff --git a/agent/SuperAGI b/agent/SuperAGI
index bd4b3def6..f880b2464 160000
--- a/agent/SuperAGI
+++ b/agent/SuperAGI
@@ -1 +1 @@
-Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007
+Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b
diff --git a/agent/config_example.json b/agent/config_example.json
index 7ab65bc20..9e8bd3f08 100644
--- a/agent/config_example.json
+++ b/agent/config_example.json
@@ -1,5 +1,4 @@
 {
   "workspace": "projects/my-new-project/workspace",
-  "entry_path": "agbenchmark/benchmarks.py",
-  "cutoff": 60
+  "entry_path": "agbenchmark/benchmarks.py"
 }
diff --git a/agent/gpt-engineer b/agent/gpt-engineer
index f0c76918d..a0162df0d 160000
--- a/agent/gpt-engineer
+++ b/agent/gpt-engineer
@@ -1 +1 @@
-Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82
+Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0
diff --git a/agent/mini-agi b/agent/mini-agi
index 08764876d..0f8eba95d 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0
+Subproject commit 0f8eba95d284a9a06801b40ae02c55f65f1a0ce9
diff --git a/agent/smol-developer b/agent/smol-developer
index f4f439551..70b57dd04 160000
--- a/agent/smol-developer
+++ b/agent/smol-developer
@@ -1 +1 @@
-Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898
+Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7