From 12c5d545837b5256f34695820601f1797b489703 Mon Sep 17 00:00:00 2001
From: Silen Naihin <silen.naihin@gmail.com>
Date: Mon, 17 Jul 2023 22:41:58 -0400
Subject: [PATCH] Fixing memory challenges, naming, testing mini-agi, smooth
 retrieval scaling (#166)

---
 agbenchmark/challenge.py                      | 11 ++-
 .../{d1 => d1_debug}/artifacts_in/__init__.py |  0
 .../{d1 => d1_debug}/artifacts_in/code.py     |  0
 .../{d1 => d1_debug}/artifacts_in/test.py     |  0
 .../artifacts_out/__init__.py                 |  0
 .../{d1 => d1_debug}/artifacts_out/code.py    |  0
 .../{d1 => d1_debug}/artifacts_out/test.py    |  0
 .../code/{d1 => d1_debug}/data.json           |  2 +-
 .../{d2 => d2_vague}/artifacts_in/__init__.py |  0
 .../{d2 => d2_vague}/artifacts_in/code.py     |  0
 .../{d2 => d2_vague}/artifacts_in/test.py     |  0
 .../artifacts_out/__init__.py                 |  0
 .../{d2 => d2_vague}/artifacts_out/code.py    |  0
 .../{d2 => d2_vague}/artifacts_out/test.py    |  0
 .../code/{d2 => d2_vague}/data.json           |  2 +-
 .../artifacts_out/__init__.py                 |  0
 .../{d4 => d3_two_sum}/artifacts_out/code.py  |  0
 .../{d4 => d3_two_sum}/custom_python/test.py  |  0
 .../code/{d4 => d3_two_sum}/data.json         |  4 +-
 .../custom_python/api_tests.py                |  0
 .../code/{d3 => d4_web_server}/data.json      |  2 +-
 .../artifacts_out/__init__.py                 |  0
 .../artifacts_out/code.py                     |  0
 .../custom_python/test.py                     |  0
 .../code/{d5 => d5_three_sum}/data.json       |  2 +-
 .../memory/m1/artifacts_in/instructions_5.txt |  1 -
 .../artifacts_in/instructions_1.txt           |  0
 .../artifacts_in/instructions_2.txt           |  0
 .../artifacts_in/instructions_3.txt           |  0
 .../artifacts_in/instructions_4.txt           |  0
 .../m1_id/artifacts_in/instructions_5.txt     |  1 +
 .../artifacts_out/result.txt}                 |  0
 .../challenges/memory/{m1 => m1_id}/data.json |  2 +-
 .../memory/m2/artifacts_in/instructions_5.txt |  1 -
 .../artifacts_in/instructions_1.txt           |  0
 .../artifacts_in/instructions_2.txt           |  0
 .../artifacts_in/instructions_3.txt           |  0
 .../artifacts_in/instructions_4.txt           |  0
 .../artifacts_in/instructions_5.txt           |  1 +
 .../artifacts_out/result.txt}                 |  0
 .../memory/{m2 => m2_multiple}/data.json      |  2 +-
 .../artifacts_in/instructions_1.txt           |  0
 .../artifacts_in/instructions_2.txt           |  0
 .../artifacts_in/instructions_3.txt           |  0
 .../artifacts_in/instructions_4.txt           |  0
 .../artifacts_in/instructions_5.txt           |  2 +-
 .../artifacts_out/result.txt}                 |  0
 .../memory/{m3 => m3_noise}/data.json         |  6 +-
 .../artifacts_in/instructions_1.txt           |  0
 .../artifacts_in/instructions_2.txt           |  0
 .../artifacts_in/instructions_3.txt           |  2 +-
 .../artifacts_in/instructions_4.txt           |  0
 .../artifacts_in/instructions_5.txt           |  2 +-
 .../artifacts_out/result.txt}                 |  2 +-
 .../memory/{m4 => m4_phrases}/data.json       |  8 +-
 .../artifacts_out/random_file.txt             |  0
 .../retrieval/{r1 => r1_book_price}/data.json |  2 +-
 .../artifacts_out/random_file.txt             |  0
 .../retrieval/r2.1_specific/data.json         | 19 ++++
 .../artifacts_out/random_file.txt             |  1 +
 .../retrieval/r2.2_formatting/data.json       | 19 ++++
 agbenchmark/challenges/retrieval/r2/data.json | 19 ----
 .../artifacts_out/random_file.txt             |  1 +
 .../retrieval/r2_tesla_revenue/data.json      | 19 ++++
 agbenchmark/challenges/retrieval/r3/data.json |  2 +-
 agbenchmark/conftest.py                       |  9 +-
 agbenchmark/reports/internal_info.json        | 93 +++++++++++--------
 .../reports/mini-agi/1.1_TestWriteFile.json   | 57 +++++-------
 .../10.1_TestRememberMultipleWithNoise.json   | 30 ++++++
 .../10_TestRememberMultipleWithNoise.json     | 31 +++++++
 ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++
 ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++
 ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++
 ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++
 ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++
 ..._TestRememberMultiplePhrasesWithNoise.json | 31 +++++++
 .../12.1_TestDebugSimpleTypoWithGuidance.json | 28 ++++++
 .../12.2_TestDebugSimpleTypoWithGuidance.json | 28 ++++++
 .../12.3_TestDebugSimpleTypoWithGuidance.json | 28 ++++++
 .../12_TestDebugSimpleTypoWithGuidance.json   | 31 +++++++
 .../reports/mini-agi/1_TestWriteFIle.json     |  4 +-
 .../reports/mini-agi/2.1_TestReadFile.json    |  4 +-
 .../reports/mini-agi/2_TestReadFile.json      |  4 +-
 .../reports/mini-agi/3.1_TestSearch.json      | 27 ++++++
 .../reports/mini-agi/3_TestSearch.json        |  4 +-
 .../mini-agi/4.1_TestBasicRetrieval.json      | 27 ++++++
 .../4.1_TestDebugSimpleTypoWithGuidance.json  | 28 ------
 .../mini-agi/4_TestBasicRetrieval.json        | 27 ++++++
 .../4_TestDebugSimpleTypoWithGuidance.json    | 28 ------
 .../mini-agi/5.1_TestRetrieval2.0.json        | 30 ++++++
 .../reports/mini-agi/5_TestRetrieval2.0.json  | 29 ++++++
 .../mini-agi/6.1_TestRetrieval2.1.json        | 30 ++++++
 .../mini-agi/6.2_TestRetrieval2.1.json        | 30 ++++++
 .../mini-agi/6.3_TestRetrieval2.1.json        | 30 ++++++
 .../mini-agi/6.4_TestRetrieval2.1.json        | 31 +++++++
 .../reports/mini-agi/6_TestRetrieval2.1.json  | 30 ++++++
 .../mini-agi/7.1_TestRetrieval2.2.json        | 31 +++++++
 .../reports/mini-agi/7_TestRetrieval2.2.json  | 30 ++++++
 .../reports/mini-agi/8.1_TestBasicMemory.json | 30 ++++++
 .../reports/mini-agi/8_TestBasicMemory.json   | 31 +++++++
 .../mini-agi/9.1_TestRememberMultipleIds.json | 30 ++++++
 .../mini-agi/9_TestRememberMultipleIds.json   | 31 +++++++
 agbenchmark/utils.py                          |  5 +-
 agent/mini-agi                                |  2 +-
 104 files changed, 1022 insertions(+), 187 deletions(-)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_in/__init__.py (100%)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_in/code.py (100%)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_in/test.py (100%)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_out/__init__.py (100%)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_out/code.py (100%)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/artifacts_out/test.py (100%)
 rename agbenchmark/challenges/code/{d1 => d1_debug}/data.json (97%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_in/__init__.py (100%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_in/code.py (100%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_in/test.py (100%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_out/__init__.py (100%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_out/code.py (100%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/artifacts_out/test.py (100%)
 rename agbenchmark/challenges/code/{d2 => d2_vague}/data.json (97%)
 rename agbenchmark/challenges/code/{d4 => d3_two_sum}/artifacts_out/__init__.py (100%)
 rename agbenchmark/challenges/code/{d4 => d3_two_sum}/artifacts_out/code.py (100%)
 rename agbenchmark/challenges/code/{d4 => d3_two_sum}/custom_python/test.py (100%)
 rename agbenchmark/challenges/code/{d4 => d3_two_sum}/data.json (94%)
 rename agbenchmark/challenges/code/{d3 => d4_web_server}/custom_python/api_tests.py (100%)
 rename agbenchmark/challenges/code/{d3 => d4_web_server}/data.json (97%)
 rename agbenchmark/challenges/code/{d5 => d5_three_sum}/artifacts_out/__init__.py (100%)
 rename agbenchmark/challenges/code/{d5 => d5_three_sum}/artifacts_out/code.py (100%)
 rename agbenchmark/challenges/code/{d5 => d5_three_sum}/custom_python/test.py (100%)
 rename agbenchmark/challenges/code/{d5 => d5_three_sum}/data.json (96%)
 delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt
 rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_1.txt (100%)
 rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_2.txt (100%)
 rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_3.txt (100%)
 rename agbenchmark/challenges/memory/{m1 => m1_id}/artifacts_in/instructions_4.txt (100%)
 create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt
 rename agbenchmark/challenges/memory/{m1/artifacts_out/random_file.txt => m1_id/artifacts_out/result.txt} (100%)
 rename agbenchmark/challenges/memory/{m1 => m1_id}/data.json (95%)
 delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt
 rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_1.txt (100%)
 rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_2.txt (100%)
 rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_3.txt (100%)
 rename agbenchmark/challenges/memory/{m2 => m2_multiple}/artifacts_in/instructions_4.txt (100%)
 create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt
 rename agbenchmark/challenges/memory/{m2/artifacts_out/random_file.txt => m2_multiple/artifacts_out/result.txt} (100%)
 rename agbenchmark/challenges/memory/{m2 => m2_multiple}/data.json (95%)
 rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_1.txt (100%)
 rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_2.txt (100%)
 rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_3.txt (100%)
 rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_4.txt (100%)
 rename agbenchmark/challenges/memory/{m3 => m3_noise}/artifacts_in/instructions_5.txt (94%)
 rename agbenchmark/challenges/memory/{m3/artifacts_out/random_file.txt => m3_noise/artifacts_out/result.txt} (100%)
 rename agbenchmark/challenges/memory/{m3 => m3_noise}/data.json (88%)
 rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_1.txt (100%)
 rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_2.txt (100%)
 rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_3.txt (89%)
 rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_4.txt (100%)
 rename agbenchmark/challenges/memory/{m4 => m4_phrases}/artifacts_in/instructions_5.txt (93%)
 rename agbenchmark/challenges/memory/{m4/artifacts_out/random_file.txt => m4_phrases/artifacts_out/result.txt} (77%)
 rename agbenchmark/challenges/memory/{m4 => m4_phrases}/data.json (87%)
 rename agbenchmark/challenges/retrieval/{r1 => r1_book_price}/artifacts_out/random_file.txt (100%)
 rename agbenchmark/challenges/retrieval/{r1 => r1_book_price}/data.json (86%)
 rename agbenchmark/challenges/retrieval/{r2 => r2.1_specific}/artifacts_out/random_file.txt (100%)
 create mode 100644 agbenchmark/challenges/retrieval/r2.1_specific/data.json
 create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt
 create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/data.json
 delete mode 100644 agbenchmark/challenges/retrieval/r2/data.json
 create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt
 create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json
 create mode 100644 agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json
 create mode 100644 agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/3.1_TestSearch.json
 create mode 100644 agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json
 delete mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json
 delete mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
 create mode 100644 agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json
 create mode 100644 agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json
 create mode 100644 agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json
 create mode 100644 agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json
 create mode 100644 agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json
 create mode 100644 agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json
 create mode 100644 agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json
 create mode 100644 agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json
 create mode 100644 agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json
 create mode 100644 agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json
 create mode 100644 agbenchmark/reports/mini-agi/8_TestBasicMemory.json
 create mode 100644 agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json
 create mode 100644 agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json

diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py
index 4f24bb603..cdaebed4f 100644
--- a/agbenchmark/challenge.py
+++ b/agbenchmark/challenge.py
@@ -103,22 +103,25 @@ class Challenge(ABC):
         ]
 
     def scoring(self, content: str, ground: Ground) -> float:
+        print("Scoring content: ", content)
         if ground.should_contain:
             for should_contain_word in ground.should_contain:
                 if should_contain_word not in content:
+                    print(f"Word that should exist - {should_contain_word}: False")
                     return 0.0
                 else:
-                    print(
-                        f"Word that should exist: {should_contain_word} exists in the content"
-                    )
+                    print(f"Word that should exist - {should_contain_word}: True")
 
         if ground.should_not_contain:
             for should_not_contain_word in ground.should_not_contain:
                 if should_not_contain_word in content:
+                    print(
+                        f"Word that should not exist - {should_not_contain_word}: False"
+                    )
                     return 0.0
                 else:
                     print(
-                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
+                        f"Word that should not exist - {should_not_contain_word}: True"
                     )
 
         return 1.0
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py
similarity index 100%
rename from agbenchmark/challenges/code/d1/artifacts_in/__init__.py
rename to agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py
similarity index 100%
rename from agbenchmark/challenges/code/d1/artifacts_in/code.py
rename to agbenchmark/challenges/code/d1_debug/artifacts_in/code.py
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py
similarity index 100%
rename from agbenchmark/challenges/code/d1/artifacts_in/test.py
rename to agbenchmark/challenges/code/d1_debug/artifacts_in/test.py
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py
similarity index 100%
rename from agbenchmark/challenges/code/d1/artifacts_out/__init__.py
rename to agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py
similarity index 100%
rename from agbenchmark/challenges/code/d1/artifacts_out/code.py
rename to agbenchmark/challenges/code/d1_debug/artifacts_out/code.py
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py
similarity index 100%
rename from agbenchmark/challenges/code/d1/artifacts_out/test.py
rename to agbenchmark/challenges/code/d1_debug/artifacts_out/test.py
diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1_debug/data.json
similarity index 97%
rename from agbenchmark/challenges/code/d1/data.json
rename to agbenchmark/challenges/code/d1_debug/data.json
index d8e0280a4..4e2798a37 100644
--- a/agbenchmark/challenges/code/d1/data.json
+++ b/agbenchmark/challenges/code/d1_debug/data.json
@@ -3,7 +3,7 @@
   "category": ["code", "iterate"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": ["TestReadFile", "TestWriteFile"],
-  "cutoff": 60,
+  "cutoff": 75,
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py
similarity index 100%
rename from agbenchmark/challenges/code/d2/artifacts_in/__init__.py
rename to agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py
diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py
similarity index 100%
rename from agbenchmark/challenges/code/d2/artifacts_in/code.py
rename to agbenchmark/challenges/code/d2_vague/artifacts_in/code.py
diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py
similarity index 100%
rename from agbenchmark/challenges/code/d2/artifacts_in/test.py
rename to agbenchmark/challenges/code/d2_vague/artifacts_in/test.py
diff --git a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py
similarity index 100%
rename from agbenchmark/challenges/code/d2/artifacts_out/__init__.py
rename to agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py
diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py
similarity index 100%
rename from agbenchmark/challenges/code/d2/artifacts_out/code.py
rename to agbenchmark/challenges/code/d2_vague/artifacts_out/code.py
diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py
similarity index 100%
rename from agbenchmark/challenges/code/d2/artifacts_out/test.py
rename to agbenchmark/challenges/code/d2_vague/artifacts_out/test.py
diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2_vague/data.json
similarity index 97%
rename from agbenchmark/challenges/code/d2/data.json
rename to agbenchmark/challenges/code/d2_vague/data.json
index de32ef9a7..2b6c3526c 100644
--- a/agbenchmark/challenges/code/d2/data.json
+++ b/agbenchmark/challenges/code/d2_vague/data.json
@@ -3,7 +3,7 @@
   "category": ["code", "iterate"],
   "task": "Make test.py run without errors.",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
-  "cutoff": 60,
+  "cutoff": 75,
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py
similarity index 100%
rename from agbenchmark/challenges/code/d4/artifacts_out/__init__.py
rename to agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py
diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py
similarity index 100%
rename from agbenchmark/challenges/code/d4/artifacts_out/code.py
rename to agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py
diff --git a/agbenchmark/challenges/code/d4/custom_python/test.py b/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py
similarity index 100%
rename from agbenchmark/challenges/code/d4/custom_python/test.py
rename to agbenchmark/challenges/code/d3_two_sum/custom_python/test.py
diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d3_two_sum/data.json
similarity index 94%
rename from agbenchmark/challenges/code/d4/data.json
rename to agbenchmark/challenges/code/d3_two_sum/data.json
index e8db918d2..6df083d40 100644
--- a/agbenchmark/challenges/code/d4/data.json
+++ b/agbenchmark/challenges/code/d3_two_sum/data.json
@@ -3,7 +3,7 @@
   "category": ["code"],
   "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
   "dependencies": ["TestWriteFile"],
-  "cutoff": 60,
+  "cutoff": 90,
   "ground": {
     "answer": "The two_sum function coded properly.",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -12,7 +12,7 @@
     "type": "execute_python_code"
   },
   "info": {
-    "difficulty": "novice",
+    "difficulty": "advanced",
     "description": "Tests ability for the agent to create the two_sum function.",
     "side_effects": []
   }
diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py
similarity index 100%
rename from agbenchmark/challenges/code/d3/custom_python/api_tests.py
rename to agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py
diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d4_web_server/data.json
similarity index 97%
rename from agbenchmark/challenges/code/d3/data.json
rename to agbenchmark/challenges/code/d4_web_server/data.json
index c5d111a4d..5c936e882 100644
--- a/agbenchmark/challenges/code/d3/data.json
+++ b/agbenchmark/challenges/code/d4_web_server/data.json
@@ -3,7 +3,7 @@
   "category": ["code"],
   "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
-  "cutoff": 60,
+  "cutoff": 90,
   "ground": {
     "answer": "GET localhost:8079/health responds with a 200 OK",
     "should_contain": [],
diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py
similarity index 100%
rename from agbenchmark/challenges/code/d5/artifacts_out/__init__.py
rename to agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py
diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py
similarity index 100%
rename from agbenchmark/challenges/code/d5/artifacts_out/code.py
rename to agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py
diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py
similarity index 100%
rename from agbenchmark/challenges/code/d5/custom_python/test.py
rename to agbenchmark/challenges/code/d5_three_sum/custom_python/test.py
diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5_three_sum/data.json
similarity index 96%
rename from agbenchmark/challenges/code/d5/data.json
rename to agbenchmark/challenges/code/d5_three_sum/data.json
index 434b1312e..4df03b45b 100644
--- a/agbenchmark/challenges/code/d5/data.json
+++ b/agbenchmark/challenges/code/d5_three_sum/data.json
@@ -12,7 +12,7 @@
     "type": "execute_python_code"
   },
   "info": {
-    "difficulty": "intermediate",
+    "difficulty": "advanced",
     "description": "Tests ability for the agent to create the three_sum function.",
     "side_effects": []
   }
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt
deleted file mode 100644
index 8a2d35509..000000000
--- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write the id previously mentioned to a .txt file.
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt
rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt
rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt
rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt
rename to agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt
diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt
new file mode 100644
index 000000000..73246e041
--- /dev/null
+++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt
@@ -0,0 +1 @@
+Write the id previously mentioned to a file called result.txt.
diff --git a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt
rename to agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt
diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1_id/data.json
similarity index 95%
rename from agbenchmark/challenges/memory/m1/data.json
rename to agbenchmark/challenges/memory/m1_id/data.json
index 33c523244..186d25801 100644
--- a/agbenchmark/challenges/memory/m1/data.json
+++ b/agbenchmark/challenges/memory/m1_id/data.json
@@ -8,7 +8,7 @@
     "answer": "2314",
     "should_contain": ["2314"],
     "should_not_contain": [],
-    "files": [".txt"],
+    "files": ["result.txt"],
     "type": "file"
   },
   "info": {
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt
deleted file mode 100644
index 2f60afa17..000000000
--- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt
+++ /dev/null
@@ -1 +0,0 @@
-Write the ids previously mentioned to a .txt file.
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt
rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt
rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt
rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt
rename to agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt
diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt
new file mode 100644
index 000000000..30ac41026
--- /dev/null
+++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt
@@ -0,0 +1 @@
+Write the ids previously mentioned to a file called result.txt.
diff --git a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt
rename to agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt
diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2_multiple/data.json
similarity index 95%
rename from agbenchmark/challenges/memory/m2/data.json
rename to agbenchmark/challenges/memory/m2_multiple/data.json
index 840e8dc83..aac25b21e 100644
--- a/agbenchmark/challenges/memory/m2/data.json
+++ b/agbenchmark/challenges/memory/m2_multiple/data.json
@@ -8,7 +8,7 @@
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
     "should_not_contain": [],
-    "files": [".txt"],
+    "files": ["result.txt"],
     "type": "file"
   },
   "info": {
diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt
rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt
diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt
rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt
diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt
rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt
diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt
rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt
diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt
similarity index 94%
rename from agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt
rename to agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt
index 99c9efa35..aa379ea51 100644
--- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt
+++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt
@@ -1,5 +1,5 @@
 sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232
 
-Write the ids previously mentioned to a .txt file.
+Write the ids previously mentioned to a file called result.txt.
 
 gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0
diff --git a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt
rename to agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt
diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3_noise/data.json
similarity index 88%
rename from agbenchmark/challenges/memory/m3/data.json
rename to agbenchmark/challenges/memory/m3_noise/data.json
index 3af2fb3a5..8234e25c7 100644
--- a/agbenchmark/challenges/memory/m3/data.json
+++ b/agbenchmark/challenges/memory/m3_noise/data.json
@@ -1,14 +1,14 @@
 {
-  "name": "TestRememberMultipleIdsWithNoise",
+  "name": "TestRememberMultipleWithNoise",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIds"],
-  "cutoff": 60,
+  "cutoff": 75,
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
     "should_not_contain": [],
-    "files": [".txt"],
+    "files": ["result.txt"],
     "type": "file"
   },
   "info": {
diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt
rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt
diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt
rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt
diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt
similarity index 89%
rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt
rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt
index d7f6f08fc..dc444c2ab 100644
--- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt
+++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt
@@ -1,5 +1,5 @@
 ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx
 
-The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt.
+The phrase to remember is 'My pet rock sings better than Beyonce on Tuesdays'. Read the file instructions_4.txt.
 
 8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ
diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt
similarity index 100%
rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt
rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt
diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt
similarity index 93%
rename from agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt
rename to agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt
index b4ef60bad..2092b42a2 100644
--- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt
+++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt
@@ -1,5 +1,5 @@
 sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232
 
-Write the phrases previously mentioned to a .txt file.
+Write the phrases previously mentioned to a file called result.txt.
 
 gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0
diff --git a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt
similarity index 77%
rename from agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt
rename to agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt
index 9b8405bf1..bdab23d9b 100644
--- a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt
+++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt
@@ -1,4 +1,4 @@
 The purple elephant danced on a rainbow while eating a taco
 The sneaky toaster stole my socks and ran away to Hawaii
-My pet rock sings better than Beyoncé on Tuesdays
+My pet rock sings better than Beyonce on Tuesdays
 The giant hamster rode a unicycle through the crowded mall
diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4_phrases/data.json
similarity index 87%
rename from agbenchmark/challenges/memory/m4/data.json
rename to agbenchmark/challenges/memory/m4_phrases/data.json
index 17a078e1b..da17fb02a 100644
--- a/agbenchmark/challenges/memory/m4/data.json
+++ b/agbenchmark/challenges/memory/m4_phrases/data.json
@@ -2,18 +2,18 @@
   "name": "TestRememberMultiplePhrasesWithNoise",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
-  "dependencies": ["TestRememberMultipleIdsWithNoise"],
-  "cutoff": 60,
+  "dependencies": ["TestRememberMultipleWithNoise"],
+  "cutoff": 90,
   "ground": {
     "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
     "should_contain": [
       "The purple elephant danced on a rainbow while eating a taco",
       "The sneaky toaster stole my socks and ran away to Hawaii",
-      "My pet rock sings better than Beyoncé on Tuesdays",
+      "My pet rock sings better than Beyonce on Tuesdays",
       "The giant hamster rode a unicycle through the crowded mall"
     ],
     "should_not_contain": [],
-    "files": [".txt"],
+    "files": ["result.txt"],
     "type": "file"
   },
   "info": {
diff --git a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt
similarity index 100%
rename from agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt
rename to agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt
diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1_book_price/data.json
similarity index 86%
rename from agbenchmark/challenges/retrieval/r1/data.json
rename to agbenchmark/challenges/retrieval/r1_book_price/data.json
index c3af4862d..9a6924daf 100644
--- a/agbenchmark/challenges/retrieval/r1/data.json
+++ b/agbenchmark/challenges/retrieval/r1_book_price/data.json
@@ -13,7 +13,7 @@
   },
   "info": {
     "difficulty": "basic",
-    "description": "Tests ability to retrieve information from a website.",
+    "description": "Specifies the specific website to retrieve information from.",
     "side_effects": []
   }
 }
diff --git a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt
similarity index 100%
rename from agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt
rename to agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt
diff --git a/agbenchmark/challenges/retrieval/r2.1_specific/data.json b/agbenchmark/challenges/retrieval/r2.1_specific/data.json
new file mode 100644
index 000000000..3fcd50801
--- /dev/null
+++ b/agbenchmark/challenges/retrieval/r2.1_specific/data.json
@@ -0,0 +1,19 @@
+{
+  "name": "TestRetrieval2.1",
+  "category": ["retrieval"],
+  "task": "Write Tesla's precise revenue in 2022 into a .txt file.",
+  "dependencies": ["TestRetrieval2.0"],
+  "cutoff": 60,
+  "ground": {
+    "answer": "It was $81.462 billion in 2022.",
+    "should_contain": ["81", "462"],
+    "should_not_contain": [],
+    "files": [".txt"],
+    "type": "file"
+  },
+  "info": {
+    "difficulty": "novice",
+    "description": "Checks the accuracy of the retrieved information, building on the r2 challenge.",
+    "side_effects": []
+  }
+}
diff --git a/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt
new file mode 100644
index 000000000..8a0eae046
--- /dev/null
+++ b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt
@@ -0,0 +1 @@
+81,462 Millions
diff --git a/agbenchmark/challenges/retrieval/r2.2_formatting/data.json b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json
new file mode 100644
index 000000000..294effeff
--- /dev/null
+++ b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json
@@ -0,0 +1,19 @@
+{
+  "name": "TestRetrieval2.2",
+  "category": ["retrieval"],
+  "task": "Write Tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+  "dependencies": ["TestRetrieval2.1"],
+  "cutoff": 60,
+  "ground": {
+    "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+    "should_contain": ["81,462"],
+    "should_not_contain": [],
+    "files": [".txt"],
+    "type": "file"
+  },
+  "info": {
+    "difficulty": "intermediate",
+    "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+    "side_effects": []
+  }
+}
diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json
deleted file mode 100644
index f558b8584..000000000
--- a/agbenchmark/challenges/retrieval/r2/data.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
-  "name": "TestRetrieval2",
-  "category": ["retrieval"],
-  "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
-  "dependencies": ["TestBasicRetrieval"],
-  "cutoff": 60,
-  "ground": {
-    "answer": "81,462",
-    "should_contain": ["81,462"],
-    "should_not_contain": [],
-    "files": [".txt"],
-    "type": "file"
-  },
-  "info": {
-    "difficulty": "novice",
-    "description": "Tests ability to retrieve information.",
-    "side_effects": ["tests if there is in fact an LLM attached"]
-  }
-}
diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt
new file mode 100644
index 000000000..8a0eae046
--- /dev/null
+++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt
@@ -0,0 +1 @@
+81,462 Millions
diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json
new file mode 100644
index 000000000..63665ed63
--- /dev/null
+++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json
@@ -0,0 +1,19 @@
+{
+  "name": "TestRetrieval2.0",
+  "category": ["retrieval"],
+  "task": "Write tesla's revenue in 2022 into a .txt file.",
+  "dependencies": ["TestBasicRetrieval"],
+  "cutoff": 60,
+  "ground": {
+    "answer": "It was $81.462 billion in 2022.",
+    "should_contain": ["81"],
+    "should_not_contain": [],
+    "files": [".txt"],
+    "type": "file"
+  },
+  "info": {
+    "difficulty": "novice",
+    "description": "A no guardrails search for info",
+    "side_effects": []
+  }
+}
diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json
index eb998ffbf..d3e4bd0f5 100644
--- a/agbenchmark/challenges/retrieval/r3/data.json
+++ b/agbenchmark/challenges/retrieval/r3/data.json
@@ -2,7 +2,7 @@
   "name": "TestRetrieval3",
   "category": ["retrieval"],
   "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
-  "dependencies": ["TestRetrieval2"],
+  "dependencies": ["TestRetrieval2.1"],
   "cutoff": 60,
   "ground": {
     "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 4a62af0b5..b544d2c6e 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -135,8 +135,8 @@ internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
 
 
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
+    challenge_data = item.funcargs.get("challenge_data", None)
     if call.when == "call":
-        challenge_data = item.funcargs.get("challenge_data", None)
         difficulty = (
             challenge_data["info"]["difficulty"] if challenge_data else "unknown"
         )
@@ -157,6 +157,9 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         info_details: Any = {
             "data_path": challenge_location,
             "is_regression": False,
+            "task": challenge_data["task"],
+            "answer": challenge_data["ground"]["answer"],
+            "description": challenge_data["info"]["description"],
             "metrics": {
                 "difficulty": difficulty,
                 "success": False,
@@ -218,6 +221,10 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
                     "run_time"
                 ] = f"{str(round(run_time, 3))} seconds"
 
+                info_details["reached_cutoff"] = (
+                    float(run_time) > challenge_data["cutoff"]
+                )
+
             info_manager.add_test(test_name, info_details)
 
 
diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json
index 0bfad744a..d20e8c778 100644
--- a/agbenchmark/reports/internal_info.json
+++ b/agbenchmark/reports/internal_info.json
@@ -3,28 +3,20 @@
         "TestBasicMemory": [
             true,
             true,
+            true,
+            true,
+            true,
+            false,
+            false,
             true
         ],
         "TestBasicRetrieval": [
+            true,
+            true,
             true,
             true,
             true
         ],
-        "TestCreateSimpleWebServer": [
-            false,
-            false,
-            false
-        ],
-        "TestDebugSimpleTypoWithGuidance": [
-            false,
-            false,
-            false
-        ],
-        "TestDebugSimpleTypoWithoutGuidance": [
-            false,
-            false,
-            false
-        ],
         "TestReadFile": [
             true,
             true,
@@ -32,41 +24,62 @@
             true,
             true
         ],
-        "TestRememberMultipleIds": [
-            true,
-            true,
-            true
-        ],
-        "TestRememberMultipleIdsWithNoise": [
-            true,
-            true,
-            true
-        ],
-        "TestRememberMultiplePhrasesWithNoise": [
-            true,
-            true,
-            true
-        ],
-        "TestRetrieval2": [
-            true,
-            true,
-            true
-        ],
-        "TestRetrieval3": [
-            true,
-            true,
-            true
-        ],
         "TestSearch": [
             true,
             true,
             true,
+            true,
             true
         ],
         "TestWriteFile": [
+            true,
             true,
             true,
             true
+        ],
+        "TestRetrieval2.2": [
+            false,
+            false,
+            false,
+            false
+        ],
+        "TestRetrieval2.1": [
+            false,
+            false,
+            false,
+            false,
+            false,
+            false
+        ],
+        "TestRetrieval2.0": [
+            true,
+            false
+        ],
+        "TestRememberMultipleIds": [
+            false,
+            false,
+            true
+        ],
+        "TestRememberMultipleIdsWithNoise": [
+            false
+        ],
+        "TestRememberMultipleWithNoise": [
+            false,
+            true
+        ],
+        "TestRememberMultiplePhrasesWithNoise": [
+            false,
+            false,
+            false,
+            false,
+            false,
+            false
+        ],
+        "TestDebugSimpleTypoWithGuidance": [
+            false,
+            false,
+            false,
+            false
         ]
     }
 }
\ No newline at end of file
diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
index 637c2d5c5..419052311 100644
--- a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
+++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json
@@ -1,36 +1,27 @@
 {
+  "command": "agbenchmark start --test TestWriteFile",
+  "completion_time": "2023-07-17-13:34",
+  "metrics": {
+    "run_time": "23.83 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
     "TestWriteFile": {
-        "data_path": "agbenchmark/challenges/interface/write_file",
-        "is_regression": true,
-        "metrics": {
-            "difficulty": "interface",
-            "success": true,
-            "non_mock_success_%": 100.0,
-            "run_time": "0.009 seconds"
-        }
-    },
-    "additional": {
-        "model": "gpt-3.5-turbo"
-    },
-    "command": "agbenchmark start --test TestWriteFile",
-    "completion_time": "2023-07-17-09:54",
-    "config": {
-        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-    },
-    "metrics": {
-        "run_time": "22.36 seconds",
-        "highest_difficulty": "interface: 1"
-    },
-    "tests": {
-        "TestWriteFile": {
-            "data_path": "agbenchmark/challenges/interface/write_file",
-            "is_regression": false,
-            "metrics": {
-                "difficulty": "interface",
-                "success": true,
-                "success_%": 40.0,
-                "run_time": "22.169 seconds"
-            }
-        }
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": true,
+      "reached_cutoff": false,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "23.627 seconds"
+      }
     }
-}
\ No newline at end of file
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json
new file mode 100644
index 000000000..811fd3e85
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRememberMultipleWithNoise",
+  "completion_time": "2023-07-17-21:24",
+  "metrics": {
+    "run_time": "77.71 seconds",
+    "highest_difficulty": "intermediate: 4"
+  },
+  "tests": {
+    "TestRememberMultipleWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m3",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": true,
+        "success_%": 50.0,
+        "run_time": "77.397 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json
new file mode 100644
index 000000000..08c2b7075
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultipleWithNoise",
+  "completion_time": "2023-07-17-21:19",
+  "metrics": {
+    "run_time": "74.3 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultipleWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m3",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "74.059 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json
new file mode 100644
index 000000000..0de6f003c
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
+  "completion_time": "2023-07-17-21:28",
+  "metrics": {
+    "run_time": "60.86 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "60.631 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json
new file mode 100644
index 000000000..1d2abb8e7
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
+  "completion_time": "2023-07-17-21:32",
+  "metrics": {
+    "run_time": "73.04 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "72.736 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json
new file mode 100644
index 000000000..1d256b8c0
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
+  "completion_time": "2023-07-17-21:34",
+  "metrics": {
+    "run_time": "81.59 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "81.374 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json
new file mode 100644
index 000000000..e67a6ac3e
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
+  "completion_time": "2023-07-17-21:36",
+  "metrics": {
+    "run_time": "98.32 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00c3\u00a9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "98.021 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json
new file mode 100644
index 000000000..9e76704db
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
+  "completion_time": "2023-07-17-21:42",
+  "metrics": {
+    "run_time": "303.13 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "302.919 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json
new file mode 100644
index 000000000..e98ca330e
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise",
+  "completion_time": "2023-07-17-21:27",
+  "metrics": {
+    "run_time": "77.72 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "77.491 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 000000000..9c9f3dc2a
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+    "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+    "completion_time": "2023-07-17-21:46",
+    "metrics": {
+        "run_time": "87.21 seconds",
+        "highest_difficulty": "No successful tests"
+    },
+    "tests": {
+        "TestDebugSimpleTypoWithGuidance": {
+            "data_path": "agbenchmark/challenges/code/d1",
+            "is_regression": false,
+            "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+            "answer": "[0, 1] [2, 5] [0, 3]",
+            "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+            "metrics": {
+                "difficulty": "basic",
+                "success": false,
+                "fail_reason": "assert 1 in [0.0]",
+                "success_%": 0.0,
+                "run_time": "86.967 seconds"
+            },
+            "reached_cutoff": true
+        }
+    },
+    "config": {
+        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+    }
+}
\ No newline at end of file
diff --git a/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 000000000..4765201fb
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+    "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+    "completion_time": "2023-07-17-21:47",
+    "metrics": {
+        "run_time": "48.52 seconds",
+        "highest_difficulty": "No successful tests"
+    },
+    "tests": {
+        "TestDebugSimpleTypoWithGuidance": {
+            "data_path": "agbenchmark/challenges/code/d1",
+            "is_regression": false,
+            "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+            "answer": "[0, 1] [2, 5] [0, 3]",
+            "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+            "metrics": {
+                "difficulty": "basic",
+                "success": false,
+                "fail_reason": "assert 1 in [0.0]",
+                "success_%": 0.0,
+                "run_time": "48.208 seconds"
+            },
+            "reached_cutoff": false
+        }
+    },
+    "config": {
+        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+    }
+}
\ No newline at end of file
diff --git a/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 000000000..ac2592f33
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,28 @@
+{
+    "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+    "completion_time": "2023-07-17-21:55",
+    "metrics": {
+        "run_time": "54.95 seconds",
+        "highest_difficulty": "No successful tests"
+    },
+    "tests": {
+        "TestDebugSimpleTypoWithGuidance": {
+            "data_path": "agbenchmark/challenges/code/d1_debug",
+            "is_regression": false,
+            "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+            "answer": "[0, 1] [2, 5] [0, 3]",
+            "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+            "metrics": {
+                "difficulty": "basic",
+                "success": false,
+                "fail_reason": "assert 1 in [0.0]",
+                "success_%": 0.0,
+                "run_time": "54.741 seconds"
+            },
+            "reached_cutoff": false
+        }
+    },
+    "config": {
+        "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+    }
+}
\ No newline at end of file
diff --git a/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 000000000..e84c6e9a8
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+  "completion_time": "2023-07-17-21:44",
+  "metrics": {
+    "run_time": "63.37 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1",
+      "is_regression": false,
+      "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+      "answer": "[0, 1] [2, 5] [0, 3]",
+      "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "63.125 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
index e64783190..6ac7d1045 100644
--- a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
+++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json
@@ -9,6 +9,7 @@
     "TestWriteFile": {
       "data_path": "agbenchmark/challenges/interface/write_file",
       "is_regression": false,
+      "reached_cutoff": false,
       "metrics": {
         "difficulty": "interface",
         "success": true,
@@ -18,8 +19,7 @@
     }
   },
   "config": {
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-    "entry_path": "agbenchmark.benchmarks"
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
   },
   "additional": {
     "model": "gpt-4"
diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
index b5d73af99..4758addf1 100644
--- a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
+++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json
@@ -9,6 +9,7 @@
     "TestReadFile": {
       "data_path": "agbenchmark/challenges/interface/read_file",
       "is_regression": true,
+      "reached_cutoff": true,
       "metrics": {
         "difficulty": "interface",
         "success": true,
@@ -21,7 +22,6 @@
     "workspace": "${os.path.join(Path.home(), 'miniagi')}"
   },
   "additional": {
-    "model": "gpt-4",
-    "reached_termination_time": true
+    "model": "gpt-3.5-turbo"
   }
 }
diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json
index 869eaaac1..87c7956d6 100644
--- a/agbenchmark/reports/mini-agi/2_TestReadFile.json
+++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json
@@ -9,6 +9,7 @@
     "TestReadFile": {
       "data_path": "agbenchmark/challenges/interface/read_file",
       "is_regression": true,
+      "reached_cutoff": false,
       "metrics": {
         "difficulty": "interface",
         "success": true,
@@ -18,8 +19,7 @@
     }
   },
   "config": {
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-    "entry_path": "agbenchmark.benchmarks"
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
   },
   "additional": {
     "model": "gpt-4"
diff --git a/agbenchmark/reports/mini-agi/3.1_TestSearch.json b/agbenchmark/reports/mini-agi/3.1_TestSearch.json
new file mode 100644
index 000000000..6a2744e72
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/3.1_TestSearch.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestSearch",
+  "completion_time": "2023-07-17-13:35",
+  "metrics": {
+    "run_time": "20.58 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestSearch": {
+      "data_path": "agbenchmark/challenges/interface/search",
+      "is_regression": true,
+      "reached_cutoff": false,
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "20.367 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json
index d9d05db4a..c7d6c4309 100644
--- a/agbenchmark/reports/mini-agi/3_TestSearch.json
+++ b/agbenchmark/reports/mini-agi/3_TestSearch.json
@@ -9,6 +9,7 @@
     "TestSearch": {
       "data_path": "agbenchmark/challenges/interface/search",
       "is_regression": true,
+      "reached_cutoff": false,
       "metrics": {
         "difficulty": "interface",
         "success": true,
@@ -18,8 +19,7 @@
     }
   },
   "config": {
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-    "entry_path": "agbenchmark.benchmarks"
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
   },
   "additional": {
     "model": "gpt-4"
diff --git a/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json
new file mode 100644
index 000000000..6ff0fa63b
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestBasicRetrieval",
+  "completion_time": "2023-07-17-13:31",
+  "metrics": {
+    "run_time": "26.05 seconds",
+    "highest_difficulty": "basic: 2"
+  },
+  "tests": {
+    "TestBasicRetrieval": {
+      "data_path": "agbenchmark/challenges/retrieval/r1",
+      "is_regression": true,
+      "reached_cutoff": false,
+      "metrics": {
+        "difficulty": "basic",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "25.818 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
deleted file mode 100644
index d72d599d8..000000000
--- a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
-  "completion_time": "2023-07-15-22:16",
-  "metrics": {
-    "run_time": "45.92 seconds",
-    "highest_difficulty": ": 0"
-  },
-  "tests": {
-    "TestDebugSimpleTypoWithGuidance": {
-      "data_path": "agbenchmark/challenges/code/d1",
-      "is_regression": false,
-      "metrics": {
-        "difficulty": "basic",
-        "success": false,
-        "fail_reason": "assert 1 in [0.0]",
-        "success_%": 0.0,
-        "run_time": "45.599 seconds"
-      }
-    }
-  },
-  "config": {
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-    "entry_path": "agbenchmark.benchmarks"
-  },
-  "additional": {
-    "model": "gpt-4"
-  }
-}
diff --git a/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json
new file mode 100644
index 000000000..54c4fdcca
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json
@@ -0,0 +1,27 @@
+{
+  "command": "agbenchmark start --test TestBasicRetrieval",
+  "completion_time": "2023-07-17-13:22",
+  "metrics": {
+    "run_time": "61.24 seconds",
+    "highest_difficulty": "basic: 2"
+  },
+  "tests": {
+    "TestBasicRetrieval": {
+      "data_path": "agbenchmark/challenges/retrieval/r1",
+      "is_regression": true,
+      "reached_cutoff": true,
+      "metrics": {
+        "difficulty": "basic",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "60.872 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
deleted file mode 100644
index 7985a7843..000000000
--- a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json
+++ /dev/null
@@ -1,28 +0,0 @@
-{
-  "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
-  "completion_time": "2023-07-15-22:15",
-  "metrics": {
-    "run_time": "32.99 seconds",
-    "highest_difficulty": ": 0"
-  },
-  "tests": {
-    "TestDebugSimpleTypoWithGuidance": {
-      "data_path": "agbenchmark/challenges/code/d1",
-      "is_regression": false,
-      "metrics": {
-        "difficulty": "basic",
-        "success": false,
-        "fail_reason": "assert 1 in [0.0]",
-        "success_%": 0.0,
-        "run_time": "32.582 seconds"
-      }
-    }
-  },
-  "config": {
-    "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-    "entry_path": "agbenchmark.benchmarks"
-  },
-  "additional": {
-    "model": "gpt-4"
-  }
-}
diff --git a/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json
new file mode 100644
index 000000000..4149ebe70
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.0",
+  "completion_time": "2023-07-17-17:10",
+  "metrics": {
+    "run_time": "66.81 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.0": {
+      "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
+      "is_regression": false,
+      "reached_cutoff": true,
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "A no guardrails search for info",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "66.547 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json
new file mode 100644
index 000000000..28d091d28
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json
@@ -0,0 +1,29 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2",
+  "completion_time": "2023-07-17-13:54",
+  "metrics": {
+    "run_time": "36 seconds",
+    "highest_difficulty": "TestRetrieval2: 3"
+  },
+  "tests": {
+    "TestRetrieval2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
+      "is_regression": false,
+      "reached_cutoff": false,
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "A no guardrails search for info",
+      "metrics": {
+        "difficulty": "novice",
+        "success": true,
+        "success_%": 50.0,
+        "run_time": "35.59 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json
new file mode 100644
index 000000000..ed3ede1d3
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.1",
+  "completion_time": "2023-07-17-17:27",
+  "metrics": {
+    "run_time": "64.44 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "reached_cutoff": true,
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0, 0.0]",
+        "success_%": 0.0,
+        "run_time": "64.216 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json
new file mode 100644
index 000000000..04f972329
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.1",
+  "completion_time": "2023-07-17-17:53",
+  "metrics": {
+    "run_time": "30.08 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "29.711 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json
new file mode 100644
index 000000000..383774347
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.1",
+  "completion_time": "2023-07-17-17:54",
+  "metrics": {
+    "run_time": "27.49 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "27.266 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json
new file mode 100644
index 000000000..71cd9e007
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.1",
+  "completion_time": "2023-07-17-17:56",
+  "metrics": {
+    "run_time": "23.64 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "task": "Write Tesla's precise revenue in 2022 into a .txt file.",
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "23.42 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json
new file mode 100644
index 000000000..1dceec03d
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.1",
+  "completion_time": "2023-07-17-14:03",
+  "metrics": {
+    "run_time": "68.39 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "reached_cutoff": true,
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "assert 1 in []",
+        "success_%": 0.0,
+        "run_time": "68.15 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json
new file mode 100644
index 000000000..99373f7f1
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.2",
+  "completion_time": "2023-07-17-17:57",
+  "metrics": {
+    "run_time": "31.1 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
+      "is_regression": false,
+      "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "30.888 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json
new file mode 100644
index 000000000..ccdca26b3
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRetrieval2.2",
+  "completion_time": "2023-07-17-14:04",
+  "metrics": {
+    "run_time": "28.08 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRetrieval2.2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
+      "is_regression": false,
+      "reached_cutoff": false,
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "27.857 seconds"
+      }
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json
new file mode 100644
index 000000000..66cc2f9ae
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestBasicMemory",
+  "completion_time": "2023-07-17-18:22",
+  "metrics": {
+    "run_time": "53.48 seconds",
+    "highest_difficulty": "basic: 2"
+  },
+  "tests": {
+    "TestBasicMemory": {
+      "data_path": "agbenchmark/challenges/memory/m1",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "2314",
+      "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
+      "metrics": {
+        "difficulty": "basic",
+        "success": true,
+        "success_%": 75.0,
+        "run_time": "53.252 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/8_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json
new file mode 100644
index 000000000..7ce535507
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestBasicMemory",
+  "completion_time": "2023-07-17-18:18",
+  "metrics": {
+    "run_time": "62.11 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestBasicMemory": {
+      "data_path": "agbenchmark/challenges/memory/m1",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "2314",
+      "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\results.txt'",
+        "success_%": 83.33,
+        "run_time": "61.879 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json
new file mode 100644
index 000000000..462e73900
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json
@@ -0,0 +1,30 @@
+{
+  "command": "agbenchmark start --test TestRememberMultipleIds",
+  "completion_time": "2023-07-17-20:22",
+  "metrics": {
+    "run_time": "57.6 seconds",
+    "highest_difficulty": "novice: 3"
+  },
+  "tests": {
+    "TestRememberMultipleIds": {
+      "data_path": "agbenchmark/challenges/memory/m2_noise",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
+      "metrics": {
+        "difficulty": "novice",
+        "success": true,
+        "success_%": 33.33,
+        "run_time": "57.355 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-4"
+  }
+}
diff --git a/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json
new file mode 100644
index 000000000..aa726196e
--- /dev/null
+++ b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json
@@ -0,0 +1,31 @@
+{
+  "command": "agbenchmark start --test TestRememberMultipleIds",
+  "completion_time": "2023-07-17-18:33",
+  "metrics": {
+    "run_time": "61.32 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestRememberMultipleIds": {
+      "data_path": "agbenchmark/challenges/memory/m2_noise",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'",
+        "success_%": 0.0,
+        "run_time": "61.089 seconds"
+      },
+      "reached_cutoff": true
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  },
+  "additional": {
+    "model": "gpt-3.5-turbo"
+  }
+}
diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py
index f1ed43639..63f4d836c 100644
--- a/agbenchmark/utils.py
+++ b/agbenchmark/utils.py
@@ -55,7 +55,6 @@ def calculate_info_test_path(reports_path: Path) -> str:
                 all_prefix_numbers.append(math.floor(number))
 
             max_prefix = max(all_prefix_numbers, default=0)
-            print("HEY WE ARE HERE BIG DAWG", max_prefix)
             run_name = f"{max_prefix + 1}_{test_arg}.json"
         else:
             # Take the number from before the _ and add the .{number}
@@ -118,7 +117,9 @@ def get_highest_success_difficulty(data: dict) -> str:
     else:
         highest_difficulty_str = ""
 
-    return f"{highest_difficulty_str}: {highest_difficulty_level}"
+    if highest_difficulty_level:
+        return f"{highest_difficulty_str}: {highest_difficulty_level}"
+    return "No successful tests"
 
 
 def assign_paths(folder_path: Path) -> tuple[str, str, str]:
diff --git a/agent/mini-agi b/agent/mini-agi
index 0a9fcd8c3..4a346ab7c 160000
--- a/agent/mini-agi
+++ b/agent/mini-agi
@@ -1 +1 @@
-Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d
+Subproject commit 4a346ab7cb8dbcfd3bf2cee49448d26e01406ba3