From a5073ab57790a84d146877e1b6512eecbfc12b09 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: [PATCH 01/20] basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +------------- .../challenges/retrieval/r1/r1_data.json | 10 +++++-- .../challenges/retrieval/r1/r1_test.py | 6 ++-- agbenchmark/mocks/tests/basic_mocks.py | 28 ++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +---- .../read_file/r_file_data.json | 15 ++++++++++ .../read_file/read_file_test.py | 29 +++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../write_file/w_file_data.json | 16 ++++++++++ .../write_file/write_file_test.py | 27 +++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 +- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py 
index 2db22ae4d..9434d69c3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc31004..08b74d1b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b9..d37c5e795 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29bb..eb7b96541 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. 
+ Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde17..2481de060 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. - Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 000000000..55319ddfc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 000000000..610ccdab6 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 000000000..4aaa1347d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git 
a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 000000000..ccb10fe70 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a2..6f79e75ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts] From 66c9e68b0430066d23e9acd66e5259ea5d5190d7 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:15:53 -0400 Subject: [PATCH 02/20] file creation from within file before server :) --- agbenchmark/conftest.py | 2 +- agbenchmark/mocks/tests/basic_mocks.py | 2 +- .../tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++ agbenchmark/tests/regression/regression_tests.txt | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 908d39e89..434f6dbde 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -17,7 +17,7 @@ def config(): return config -@pytest.fixture +@pytest.fixture(scope="module") def workspace(config): yield config["workspace"] # teardown after test function completes diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index eb7b96541..bbff6a9c7 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. 
""" - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] From 4fa9f72083aa09bf1770f10a3254c4d0ef674a9a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: [PATCH 03/20] adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../basic_abilities/read_file/read_file_test.py | 1 + .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. 
""" - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] From f895d54e02c92e262172d9a773f7e6a4870d435d Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: [PATCH 04/20] more elegant marking & dependency solution --- README.md | 74 +++++++++++++++++-- agbenchmark/challenges/README.md | 38 +++++----- agbenchmark/challenges/define_task_types.py | 1 + .../challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../read_file/r_file_data.json | 1 + .../read_file/read_file_test.py | 12 
+-- .../write_file/w_file_data.json | 1 + .../write_file/write_file_test.py | 9 +-- .../tests/regression/regression_tests.txt | 2 + poetry.lock | 17 ++++- pyproject.toml | 1 + 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py diff --git a/README.md b/README.md index 0a8d119af..0ad0cf345 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,73 @@ Share your progress :) to create a test: -``` -@pytest.mark.parametrize( -"server_response", -["VARIABLE"], # VARIABLE = the query/goal you provide to the model -indirect=True, +```python +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from ..CategoryChallenge import CategoryChallenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") ) -@pytest.mark.(VARIABLE) # VARIABLE = category of the test -def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts -assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) + +class TestSomething(CategoryChallenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + def test_retrieval( + self, workspace + ): + # scoring logic goes here +``` + +All challenges will inherit from parent class which has the mark + +```python +@pytest.mark.basic +class BasicChallenge(Challenge): + pass +``` + +If you want to add a custom mark to a Challenge, you must specify it before the test definition + +```python +@pytest.mark.other_mark +def test_retrieval(self, workspace): +``` + +To add a dependency to a challenge use the following + +```python +# to defining what a test depends on +from pytest_dependency import depends + +def test1(self, request, workspace): + depends(request, data.dependencies) +# for defining a test as a dependency +@pytest.mark.dependency() +def test2 +``` + +Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards + +```python +@pytest.mark.run(order=1) +``` + +To create a file to test a challenge, add this to the challenge file which will create a file before running the server + +```python +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) ``` ## Api diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 50efe2c4d..d5229e937 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,28 +4,25 @@ Input: -- **category** (str): information-retrieval -- **difficulty**(str): the difficulty of this query. choices from - -## Information-retrieval challenges - -Input: - -- **category** (str): information-retrieval -- **task** (str): the question the agent needs to be solve. +- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ +- **task** (str): The task that the agent needs to solve. +- **dependencies** (str[]): The dependencies that the challenge needs to run. - **ground** (dict): The ground truth. 
- - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": 
["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] 
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] From d1c5e0a91a7a0f23b0e8de5f394204e96ec668cd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: [PATCH 05/20] finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ .../challenges/retrieval/r1/r1_data.json | 4 ++-- .../challenges/retrieval/r1/r1_test.py | 6 ++++-- .../tests/basic_abilities/BasicChallenge.py | 1 + .../read_file/r_file_data.json | 4 +++- .../read_file/read_file_test.py | 6 ++---- .../write_file/write_file_test.py | 1 - .../tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ 
b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt 
b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] From 31c11927199714516891db5aa3044eb1a4396eb4 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: [PATCH 06/20] other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 ++--- .../challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../read_file/r_file_data.json | 4 +- .../read_file/read_file_test.py | 2 +- .../write_file/write_file_test.py | 1 + .../tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++--------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. 
+- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf4..45becaf75 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86cc..563207405 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e7..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e0..494a9b071 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class 
TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < 
\"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = "*" -files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, -] - -[package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] From adc6b225a6063bc2b0981f1156f25bde9279040e Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: [PATCH 07/20] update regression tests info --- .../challenges/retrieval/r1/r1_test.py | 7 +++- agbenchmark/conftest.py | 36 +++++++++++++------ .../read_file/read_file_test.py | 5 +++ .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 5 +++ .../tests/regression/RegressionManager.py | 25 ++++++++----- .../tests/regression/regression_tests.json | 1 + .../tests/regression/regression_tests.txt | 17 +++++++-- 8 files changed, 73 
insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From 7604ae07bb6d79cfe8e5a28fdf3fa85c83603b1b Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: [PATCH 08/20] can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 ++++++++++++++++++- .../challenges/retrieval/r1/r1_test.py | 12 +++++----- .../read_file/read_file_test.py | 12 +++++----- .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 12 +++++----- .../tests/regression/regression_tests.json | 15 ++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace + matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + 
else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- 
a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From 4be22ae5abc884404370196bf71da86affe82131 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: [PATCH 09/20] mini agi attempt --- agbenchmark/conftest.py | 44 +++++++++++-------- .../tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 ++++++++++++ agent/mini-agi | 1 + 4 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 agent/agbenchmark_run.py create mode 160000 agent/mini-agi diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ 
b/agbenchmark/tests/regression/regression_tests.json @@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..d2add8f18 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b From 8c44b9eddf7c566d5e39f7e11149772b96e23a5f Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: [PATCH 10/20] basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +------------- .../challenges/retrieval/r1/r1_data.json | 10 +++++-- .../challenges/retrieval/r1/r1_test.py | 6 ++-- agbenchmark/mocks/tests/basic_mocks.py | 28 ++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +---- .../read_file/r_file_data.json | 15 ++++++++++ .../read_file/read_file_test.py | 29 +++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../write_file/w_file_data.json | 16 ++++++++++ .../write_file/write_file_test.py | 27 +++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 +- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in 
ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 2db22ae4d..9434d69c3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc31004..08b74d1b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b9..d37c5e795 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from 
agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29bb..eb7b96541 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde17..2481de060 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
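The `mock_func` field in each challenge JSON is only a name; at run time the conftest's `server_response` fixture hands that name to `MockManager.delegate`, which has to resolve it to one of the callables defined in the mocks module above. A minimal sketch of such name-based dispatch, assuming the mock callables live in `agbenchmark.mocks.tests.basic_mocks` — this is an illustrative helper, not the actual `MockManager` implementation:

```python
import importlib


def delegate_mock(mock_function_name: str, task: str, workspace: str) -> None:
    # Resolve the module that holds the mock callables (assumed location).
    mocks = importlib.import_module("agbenchmark.mocks.tests.basic_mocks")

    # Look the callable up by the name stored in the challenge JSON.
    mock_func = getattr(mocks, mock_function_name, None)
    if mock_func is None:
        raise ValueError(f"Unknown mock function: {mock_function_name!r}")

    # Every mock takes the task string and the workspace path.
    mock_func(task, workspace)
```

Dispatching by string keeps the challenge JSON declarative: swapping `basic_write_file_mock` for a real agent response is a data change, not a code change.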
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 000000000..55319ddfc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 000000000..610ccdab6 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 000000000..4aaa1347d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 000000000..ccb10fe70 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) 
+ + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a2..6f79e75ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts] From 22458a04e81f6a4e200581fe4046182b96f6e17c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:15:53 -0400 Subject: [PATCH 11/20] file creation from within file before server :) --- agbenchmark/conftest.py | 2 +- agbenchmark/mocks/tests/basic_mocks.py | 2 +- .../tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++ agbenchmark/tests/regression/regression_tests.txt | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 908d39e89..434f6dbde 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -17,7 +17,7 @@ def config(): return config -@pytest.fixture +@pytest.fixture(scope="module") def workspace(config): yield config["workspace"] # teardown after test function completes diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index eb7b96541..bbff6a9c7 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] From 60a7ac2343df15127e38da5d490edab887f81608 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: [PATCH 12/20] adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../basic_abilities/read_file/read_file_test.py | 1 + .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git 
a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] From 2f28a66591ea37715282271ccf92560e89a7924a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: [PATCH 13/20] more elegant marking & 
dependency solution --- README.md | 74 +++++++++++++++++-- agbenchmark/challenges/README.md | 38 +++++----- agbenchmark/challenges/define_task_types.py | 1 + .../challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../read_file/r_file_data.json | 1 + .../read_file/read_file_test.py | 12 +-- .../write_file/w_file_data.json | 1 + .../write_file/write_file_test.py | 9 +-- .../tests/regression/regression_tests.txt | 2 + poetry.lock | 17 ++++- pyproject.toml | 1 + 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py diff --git a/README.md b/README.md index 0a8d119af..0ad0cf345 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,73 @@ Share your progress :) to create a test: -``` -@pytest.mark.parametrize( -"server_response", -["VARIABLE"], # VARIABLE = the query/goal you provide to the model -indirect=True, +```python +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from ..CategoryChallenge import CategoryChallenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") ) -@pytest.mark.(VARIABLE) # VARIABLE = category of the test -def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts -assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) + +class TestSomething(CategoryChallenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + def test_retrieval( + self, workspace + ): + # scoring logic goes here +``` + +All challenges will inherit from parent class which has the mark + +```python +@pytest.mark.basic +class BasicChallenge(Challenge): + pass +``` + +If you want to add a custom mark to a Challenge, you must specify it before the test definition + +```python +@pytest.mark.other_mark +def test_retrieval(self, workspace): +``` + +To add a dependency to a challenge use the following + +```python +# to defining what a test depends on +from pytest_dependency import depends + +def test1(self, request, workspace): + depends(request, data.dependencies) +# for defining a test as a dependency +@pytest.mark.dependency() +def test2 +``` + +Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards + +```python +@pytest.mark.run(order=1) +``` + +To create a file to test a challenge, add this to the challenge file which will create a file before running the server + +```python +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) ``` ## Api diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 50efe2c4d..d5229e937 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,28 +4,25 @@ Input: -- **category** (str): information-retrieval -- **difficulty**(str): the difficulty of this query. choices from - -## Information-retrieval challenges - -Input: - -- **category** (str): information-retrieval -- **task** (str): the question the agent needs to be solve. +- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ +- **task** (str): The task that the agent needs to solve. 
+- **dependencies** (str[]): The dependencies that the challenge needs to run. - **ground** (dict): The ground truth. - - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + 
"dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] 
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] From 06a6f080543ddffd8baf3aaf51ec97ff1fce86b3 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: [PATCH 14/20] finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ .../challenges/retrieval/r1/r1_data.json | 4 ++-- .../challenges/retrieval/r1/r1_test.py | 6 ++++-- .../tests/basic_abilities/BasicChallenge.py | 1 + .../read_file/r_file_data.json | 4 +++- .../read_file/read_file_test.py | 6 ++---- .../write_file/write_file_test.py | 1 - .../tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py 
index d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff 
--git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] From a2f79760ce8abdddfc27c5b0b42a58df903b352c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: [PATCH 15/20] other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 ++--- .../challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../read_file/r_file_data.json | 4 +- .../read_file/read_file_test.py | 2 +- .../write_file/write_file_test.py | 1 + .../tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++--------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. 
+- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf4..45becaf75 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86cc..563207405 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e7..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e0..494a9b071 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class 
TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < 
\"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = "*" -files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, -] - -[package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] From 2411c35d0eb0af6ff0fb4a64ac2b431ea2d41adb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: [PATCH 16/20] update regression tests info --- .../challenges/retrieval/r1/r1_test.py | 7 +++- agbenchmark/conftest.py | 36 +++++++++++++------ .../read_file/read_file_test.py | 5 +++ .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 5 +++ .../tests/regression/RegressionManager.py | 25 ++++++++----- .../tests/regression/regression_tests.json | 1 + .../tests/regression/regression_tests.txt | 17 +++++++-- 8 files changed, 73 
insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From d6a6e69f2e3ed1cd4bb1715ae737ad50d6b17cb9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: [PATCH 17/20] can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 ++++++++++++++++++- .../challenges/retrieval/r1/r1_test.py | 12 +++++----- .../read_file/read_file_test.py | 12 +++++----- .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 12 +++++----- .../tests/regression/regression_tests.json | 15 ++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace + matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + 
else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- 
a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From fa0df12439b7beea91a46f08e7f6154900dc1047 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: [PATCH 18/20] mini agi attempt --- agbenchmark/conftest.py | 44 +++++++++++-------- .../tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 ++++++++++++ 3 files changed, 54 insertions(+), 32 deletions(-) create mode 100644 agent/agbenchmark_run.py diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json 
@@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) From f933717d8b6f28e268437e000a57e187076287af Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 18:17:54 -0400 Subject: [PATCH 19/20] mini-agi, simple challenge creation, --mock flag --- .env.example | 4 + README.md | 2 +- agbenchmark/Challenge.py | 53 ++++++++- agbenchmark/challenges/define_task_types.py | 12 +- .../challenges/retrieval/r1/r1_data.json | 12 +- .../challenges/retrieval/r1/r1_test.py | 24 +--- agbenchmark/config.json | 2 +- agbenchmark/conftest.py | 103 ++++++++++++------ agbenchmark/start_benchmark.py | 20 +++- .../tests/basic_abilities/BasicChallenge.py | 2 + .../read_file/r_file_data.json | 7 +- .../read_file/read_file_test.py | 43 +++----- .../write_file/w_file_data.json | 8 +- .../write_file/write_file_test.py | 26 ++--- .../tests/regression/regression_tests.json | 15 ++- .../tests/regression/regression_tests.txt | 14 --- agent/agbenchmark_run.py | 27 ----- poetry.lock | 16 ++- pyproject.toml | 3 +- 19 files changed, 235 insertions(+), 158 deletions(-) create mode 100644 .env.example delete mode 100644 agbenchmark/tests/regression/regression_tests.txt delete mode 100644 agent/agbenchmark_run.py diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..0a91118a9 --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +OPENAI_API_KEY= +AGENT_NAME=mini-agi +AGENT_TIMEOUT=60 +MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 0ad0cf345..794279478 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ class TestSomething(CategoryChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( - "server_response", + "run_agent", [(data.task, data.mock_func)], indirect=True, ) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index d159296b1..f644abc4a 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,12 +1,63 @@ import os import glob +import pytest +from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground +from agbenchmark.challenges.define_task_types import ChallengeData +from dotenv import load_dotenv, set_key + +load_dotenv() + +mock_test_str = os.getenv("MOCK_TEST") +MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str 
else False -class Challenge: +class Challenge(ABC): """The parent class to all specific challenges classes. Defines helper methods for running a challenge""" + @abstractmethod + def get_file_path(self) -> str: + """This should be implemented by any class which inherits from BasicChallenge""" + pass + + @property + def data(self) -> ChallengeData: + return ChallengeData.deserialize(self.get_file_path()) + + @property + def mock(self): + return self.data.mock.mock_func if self.data.mock else None + + @property + def task(self): + return ( + self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task + ) + + @property + def dependencies(self) -> list: + print("self.data.dependencies", self.data.dependencies) + return self.data.dependencies + + @property + def name(self) -> str: + print("self.data.name", self.data.name) + return self.data.name + + @pytest.mark.parametrize( + "run_agent", + [(task, mock)], + indirect=True, + ) + @pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + ) + def test_method(self, workspace): + raise NotImplementedError + @staticmethod def open_file(workspace: str, filename: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 694671218..7fc2361b5 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,11 @@ import json import os +class Mock(BaseModel): + mock_func: str + mock_task: Optional[str] = None + + class Info(BaseModel): difficulty: str description: str @@ -12,17 +17,18 @@ class Info(BaseModel): class Ground(BaseModel): answer: str - should_contain: Optional[List[str]] - should_not_contain: Optional[List[str]] + should_contain: Optional[List[str]] = None + should_not_contain: Optional[List[str]] = None files: List[str] class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] ground: Ground - mock_func: Optional[str] = None + mock: Optional[Mock] = None info: Info def serialize(self, path: str) -> None: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 562d1c364..80c5e51eb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,16 +1,20 @@ { + "name": "retrieval1", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
}, - "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 2a7d92a71..0bd907d8a 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -4,30 +4,18 @@ from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r1_data.json") -) - - class TestRetrieval1(RetrievalChallenge): """The first information-retrieval challenge""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - def test_retrieval(self, workspace, current_challenge_data): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r1_data.json") + + def test_method(self, workspace): + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d285627e5..9e5c1880f 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { "hostname": "localhost", "port": 8080, - "workspace": "agbenchmark/mocks/workspace" + "workspace": "C:/Users/silen/miniagi" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index b3b69f194..4edd4b5e0 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -4,18 +4,24 @@ import pytest import shutil from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests -from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.challenges.define_task_types import ChallengeData import subprocess +from agbenchmark.Challenge import Challenge +from dotenv import load_dotenv + +load_dotenv() @pytest.fixture(scope="module") -def config(): +def config(request): config_file = os.path.abspath("agbenchmark/config.json") print(f"Config file: {config_file}") with open(config_file, "r") as f: config = json.load(f) + + if request.config.getoption("--mock"): + config["workspace"] = "agbenchmark/mocks/workspace" + return config @@ -34,43 +40,49 @@ def workspace(config): print(f"Failed to delete {file_path}. 
Reason: {e}") +def pytest_addoption(parser): + parser.addoption("--mock", action="store_true", default=False) + + +AGENT_NAME = os.getenv("AGENT_NAME") +AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") + + @pytest.fixture(autouse=True) -def server_response(request, config): +def run_agent(request, config): """Calling to get a response""" if isinstance(request.param, tuple): task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] + mock_function_name = request.param[1] or None else: task = request.param mock_function_name = None - # get the current file's directory - current_dir = os.path.dirname(os.path.abspath(__file__)) + if mock_function_name != None and (request.config.getoption("--mock")): + if mock_function_name: + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_function_name) + mock_manager.delegate(mock_function_name) + else: + print("No mock provided") + else: + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - # construct the script's path - script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + try: + timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - # form the command - command = ["python", script_path, task] - - # if mock_function_name: - # mock_manager = MockManager( - # task - # ) # workspace doesn't need to be passed in, stays the same - # print("Server unavailable, using mock", mock_function_name) - # mock_manager.delegate(mock_function_name) - # else: - # print("No mock provided") - - try: - # run the command and wait for it to complete - result = subprocess.run( - command, shell=True, check=True, text=True, capture_output=True - ) - return result - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with the following error:\n{e}") - # If the subprocess returns a non-zero exit status + subprocess.run( + ["python", "miniagi.py", task], + check=True, + cwd=path, + timeout=timeout + # text=True, + # capture_output=True + ) + except subprocess.TimeoutExpired: + print("The subprocess has exceeded the time limit and was terminated.") regression_json = "agbenchmark/tests/regression/regression_tests.json" @@ -80,13 +92,13 @@ regression_manager = RegressionManager(regression_json) # this is to get the challenge_data from every test @pytest.fixture(autouse=True) -def regression_data(request): +def challenge_data(request): return request.param def pytest_runtest_makereport(item, call): if call.when == "call": - challenge_data = item.funcargs.get("regression_data", None) + challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" dependencies = challenge_data.dependencies if challenge_data else [] @@ -105,9 +117,9 @@ def pytest_runtest_makereport(item, call): def pytest_collection_modifyitems(items): """Called once all test items are collected. 
Used - to add regression marker to collected test items.""" + to add regression and depends markers to collected test items.""" for item in items: - print("pytest_collection_modifyitems", item.nodeid) + # regression add if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) @@ -116,3 +128,26 @@ def pytest_collection_modifyitems(items): def pytest_sessionfinish(): """Called at the end of the session to save regression tests""" regression_manager.save() + + +# this is so that all tests can inherit from the Challenge class +def pytest_generate_tests(metafunc): + if "challenge_data" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = test_class.data + + # Add the parameters to the test function + metafunc.parametrize("challenge_data", [params], indirect=True) + + if "run_agent" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = [(test_class.task, test_class.mock)] + + # Add the parameters to the test function + metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 6adcc09bf..ac612293a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -2,6 +2,10 @@ import click import pytest import json import os +from pathlib import Path +from dotenv import load_dotenv, set_key + +load_dotenv() @click.group() @@ -12,8 +16,8 @@ def cli(): @cli.command() @click.option("--category", default=None, help="Specific category to run") @click.option("--noreg", is_flag=True, help="Skip regression tests") -def start(category, noreg): - """Start the benchmark tests. If a category flag is is provided, run the categories with that mark.""" +@click.option("--mock", is_flag=True, help="Run with mock") +def start(category, noreg, mock): """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" config_file = "agbenchmark/config.json" @@ -28,7 +32,8 @@ def start(category, noreg): ) config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( - "Please enter a new workspace path", default="agbenchmark/mocks/workspace" + "Please enter a new workspace path", + default=os.path.join(Path.home(), "miniagi"), ) with open(config_dir, "w") as f: @@ -38,13 +43,17 @@ def start(category, noreg): with open(config_dir, "r") as f: config = json.load(f) + set_key(".env", "MOCK_TEST", "True" if mock else "False") + if mock: + config["workspace"] = "agbenchmark/mocks/workspace" + # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) regression_path = os.path.abspath( - "agbenchmark/tests/regression/regression_tests.txt" + "agbenchmark/tests/regression/regression_tests.json" ) if not os.path.exists(regression_path): with open(regression_path, "a"): @@ -74,6 +83,9 @@ def start(category, noreg): else: print("Running all categorys") # run all categorys + if mock: + pytest_args.append("--mock") + # Run pytest with the constructed arguments pytest.main(pytest_args) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..6e7f73100 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -1,5 +1,7 @@ import pytest from agbenchmark.Challenge import Challenge +from agbenchmark.challenges.define_task_types import ChallengeData +from abc import abstractmethod @pytest.mark.basic diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..b21e2724b 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,13 +1,16 @@ { + "name": "basic_read_file", "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": ["basic_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "basic_read_file_mock", + "mock": { + "mock_func": "basic_read_file_mock" + }, "info": { "description": "This reads the file quickly", "difficulty": "basic", diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 90946670c..68288a42c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -4,39 +4,30 @@ from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) - - -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: - Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" - ) - class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.mark.parametrize( - 
"server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(on=data.dependencies) - def test_read_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + @pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def setup_module(self, workspace): + Challenge.write_to_file( + workspace, self.data.ground.files[0], "this is how we're doing" + ) + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r_file_data.json") + + @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") + def test_method( + self, workspace + ): # run_test is a common name that all tests must implement + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 037c5bd88..358ebb538 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { + "name": "basic_write_file", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", @@ -8,7 +9,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": [".txt"] }, - "mock_func": "basic_write_file_mock", + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
+ }, "info": { "difficulty": "basic", "description": "Tests the writing to file", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 187378ff1..8caa6605a 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -3,31 +3,21 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "w_file_data.json") -) - class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(name="test_write_file") - def test_write_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") + + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + print("my workspace is ", workspace) + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..8a6278fea 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + } +} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt deleted file mode 100644 index 8af722f07..000000000 --- a/agbenchmark/tests/regression/regression_tests.txt +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { - "difficulty": "easy", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py deleted file mode 100644 index f509f5e66..000000000 --- a/agent/agbenchmark_run.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse -import subprocess -import os - - -def main(objective): - # get the current directory - current_dir = 
os.path.dirname(os.path.abspath(__file__)) - - # form the command - command = ( - f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" - ) - - # run the command - subprocess.run(command, shell=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") - parser.add_argument( - "objective", type=str, help="The objective to pass to miniagi.py" - ) - - args = parser.parse_args() - - main(args.objective) diff --git a/poetry.lock b/poetry.lock index d7939fbfe..7b2477bc6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -644,6 +644,20 @@ future-fstrings = "*" networkx = "*" pytest = ">=3" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "requests" version = "2.31.0" @@ -814,4 +828,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" +content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" diff --git a/pyproject.toml b/pyproject.toml index 0a4f8ba73..043fe68a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" +python-dotenv = "^1.0.0" [build-system] @@ -30,7 +31,7 @@ testpaths = [ markers = [ "retrieval", "regression", - "basic" + "basic", ] [tool.poetry.scripts] From 76ee994d2c7a205799bc7c07adfa70f0c93102e9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 19:19:14 -0400 Subject: [PATCH 20/20] read mes, remove port and host from config, etc --- .env.example | 1 - README.md | 154 ++++++------------ agbenchmark/challenges/README.md | 31 ++-- agbenchmark/config.json | 4 +- agbenchmark/mocks/basic_gpt_agent.py | 20 --- agbenchmark/mocks/tests/basic_mocks.py | 12 +- agbenchmark/start_benchmark.py | 4 - .../read_file/read_file_test.py | 5 +- .../write_file/write_file_test.py | 1 - .../tests/regression/regression_tests.json | 7 - 10 files changed, 73 insertions(+), 166 deletions(-) delete mode 100644 agbenchmark/mocks/basic_gpt_agent.py diff --git a/.env.example b/.env.example index 0a91118a9..7782d048e 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,3 @@ -OPENAI_API_KEY= AGENT_NAME=mini-agi AGENT_TIMEOUT=60 MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 794279478..2c8daa0ad 100644 --- a/README.md +++ b/README.md @@ -2,80 +2,70 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work +## As a user + +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to run and kill agent +3. `agbenchmark start` + - `--category challenge_category` to run tests in a specific category + - `--mock` to only run mock tests if they exists for each test + - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests +4. We call boilerplate code for your agent +5. 
Show pass rate of tests, logs, and any other metrics + +## Contributing + ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x -### To run the basic existing mock (June 21) +### To run the existing mocks 1. clone the repo `auto-gpt-benchmarks` 2. `pip install poetry` 3. `poetry shell` 4. `poetry install` -5. `agbenchmark start` +5. `cp .env_example .env` +6. `agbenchmark start --mock` Keep config the same and watch the logs :) +### To run with mini-agi + +1. Navigate to `auto-gpt-benchmarks/agent/mini-agi` +2. `pip install -r requirements.txt` +3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Sset `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10^ installed +4. Make sure to follow the commands above, and remove mock flag `agbenchmark start` + - To add requirements `poetry add requirement`. Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. -If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit +If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to last working commit Let people know what beautiful code you write does, document everything well Share your progress :) -## How this works - -1. `pip install auto-gpt-benchmarks` -2. Add boilerplate code to start webserver to your agent (run loop and stop condition) -3. `agbenchmark start --category challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory -4. We call the server to run the agent for each test -5. Show pass rate of tests, logs, and any other metrics - -### To run the basic existing mock (June 21) - -1. clone the repo `auto-gpt-benchmarks` -2. `pip install poetry` -3. `poetry shell` -4. `poetry install` -5. 
`agbenchmark start` - Keep config the same and watch the logs :) - -#### Bonuses - -- You can adds tests by git cloning auto-gpt-benchmarks to your repo -- Agent is abstracted from benchmark, don't need to do any extra setup other then starting the server -- Simple, easy to use -- Don't have to deal with cloud or parallelization yet - ### Pytest -to create a test: +an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic ```python import pytest -from agbenchmark.challenges.define_task_types import ChallengeData -from ..CategoryChallenge import CategoryChallenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) -class TestSomething(CategoryChallenge): - """Testing if LLM can read a file""" +class TestWriteFile(BasicChallenge): + """Testing if LLM can write to a file""" - @pytest.mark.parametrize( - "run_agent", - [(data.task, data.mock_func)], - indirect=True, - ) - def test_retrieval( - self, workspace - ): - # scoring logic goes here + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") + + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + # implement scoring logic by looking at workspace ``` -All challenges will inherit from parent class which has the mark +All challenges will inherit from parent class which has the mark and any specific methods for their category ```python @pytest.mark.basic @@ -83,50 +73,23 @@ class BasicChallenge(Challenge): pass ``` -If you want to add a custom mark to a Challenge, you must specify it before the test definition - -```python -@pytest.mark.other_mark -def test_retrieval(self, workspace): -``` - -To add a dependency to a challenge use the following - -```python -# to defining what a test depends on -from pytest_dependency import depends - -def test1(self, request, workspace): - depends(request, data.dependencies) -# for defining a test as a dependency -@pytest.mark.dependency() -def test2 -``` - -Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards - -```python -@pytest.mark.run(order=1) -``` - To create a file to test a challenge, add this to the challenge file which will create a file before running the server ```python -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: +@pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def setup_module(self, workspace): Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" + workspace, self.data.ground.files[0], "this is how we're doing" ) ``` -## Api - -FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server +#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) ## Workspace -Defined by the user on config +If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. 
Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config #### Dataset @@ -138,9 +101,9 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git |-- auto-gpt-benchmarks/ **main project directory** | |-- metrics.py **combining scores, metrics, final evaluation** | |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **shared fixtures across all tests** -| |-- Challenge.py **easy challenge creation class?** -| |-- config.json **hostname, port, workspace folder** +| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** +| |-- Challenge.py **easy challenge creation class** +| |-- config.json **workspace folder** | |-- challenges/ **challenges across different domains** | | |-- adaptability/ | | |-- basic_abilities/ @@ -149,28 +112,7 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git | | |-- retrieval/ | | |-- web_navigation/ | | |-- writing/ -| |-- tests/ **challenges across different metrics** -| | |-- basic_abilities/ -| | |-- interface/ -| |-- workspace/ **workspace related func** -| | |-- **init**.py -| | |-- workspace_manager.py **creation, deletion** +| |-- tests/ +| | |-- basic_abilities/ **every llm should pass these challenges** +| | |-- regression/ **challenges that already passed** ``` - -### Easy Challenge Creation - -tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval - -#### Written Challenges - -For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore - -#### Validators - -Designed to handle specific types of output (e.g., text, code, structured data) - -#### Logging - -Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc - -Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index e457b85c4..9e74d19ce 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,7 +4,8 @@ Input: -- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ +- **name** (str): Name of the challenge. +- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. - **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. @@ -12,7 +13,9 @@ Input: - **should_contain** (list): The exact strings that are required in the final answer. - **should_not_contain** (list): The exact strings that should not be in the final answer. - **files** (list): Files that are used for retrieval. Can specify file here or an extension. -- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **mock** (dict): Mock response for testing. + - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. + - **mock_task** (str): Task to provide for the mock function. - **info** (dict): Additional info about the challenge. - **difficulty** (str): The difficulty of this query. - **description** (str): Description of the challenge. 
@@ -22,24 +25,26 @@ Example: ```python { + "name": "basic_write_file", "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], - "files": ["file_to_check.txt"] + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" }, - "mock_func": "basic_read_file_mock", "info": { - "description": "This reads the file quickly", "difficulty": "basic", - "side_effects": [""] + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] } } - ``` Current Output: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 9e5c1880f..3de1dd643 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,3 @@ { - "hostname": "localhost", - "port": 8080, - "workspace": "C:/Users/silen/miniagi" + "hostname": "localhost" } diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py deleted file mode 100644 index 6aac3d191..000000000 --- a/agbenchmark/mocks/basic_gpt_agent.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -import openai - - -def basic_gpt_agent(query) -> str: - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}] - ) - - answer = response["choices"][0]["message"]["content"] # type: ignore - - print("QUERY : ", query) - print("AGENT ANSWER: ", answer) - - return answer - - -if __name__ == "__main__": - # server boilerplate example here - basic_gpt_agent("") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 550095b72..631b30c2c 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,5 +1,4 @@ from agbenchmark.Challenge import Challenge -from ..basic_gpt_agent import basic_gpt_agent def basic_read_file_mock(task: str, workspace: str): @@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str): """ This mock writes to a file (creates one if it doesn't exist) """ - - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "Washington DC is the capital of the United States of America", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index ac612293a..c9f3643cc 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -27,10 +27,6 @@ def start(category, noreg, mock): if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: config = {} - config["hostname"] = click.prompt( - "\nPlease enter a new hostname", default="localhost" - ) - config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( "Please enter a new workspace path", default=os.path.join(Path.home(), "miniagi"), diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 68288a42c..f99ae608c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os @@ -8,9 +7,7 @@ import os class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this + @pytest.fixture(scope="module", autouse=True) def setup_module(self, workspace): Challenge.write_to_file( workspace, self.data.ground.files[0], "this is how we're doing" diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 8caa6605a..39c73b163 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea..384f9e7c6 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,12 +3,5 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file
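A note on test ordering: these patches drop `pytest-dependency`, `pytest-order` and `pytest-ordering` in favour of `pytest-depends`, and chain challenges with `@pytest.mark.depends`. A minimal sketch of how that marker behaves, using illustrative test names rather than the benchmark's real node ids:

```python
import pytest


@pytest.mark.depends(name="basic_write_file")
def test_write_file():
    # A named test that later tests can declare as a prerequisite.
    assert True


@pytest.mark.depends(on=["basic_write_file"])
def test_read_file():
    # pytest-depends runs this after test_write_file and skips it if the
    # dependency failed, which is how read-after-write challenges are gated.
    assert True
```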
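Because patch 19 moves the `challenge_data` and `run_agent` parametrisation into `pytest_generate_tests`, a new challenge now only supplies a data-file path, a `depends` marker and scoring logic. The sketch below follows that shape; the class name, the data file `a_file_data.json` and the `basic_append_file` dependency name are hypothetical:

```python
import os

import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestAppendFile(BasicChallenge):
    """Hypothetical challenge checking that text was appended to a file."""

    def get_file_path(self) -> str:
        # Every challenge must point at its JSON data file.
        return os.path.join(os.path.dirname(__file__), "a_file_data.json")

    @pytest.mark.depends(on=["basic_write_file"], name="basic_append_file")
    def test_method(self, workspace):
        files_contents = self.open_files(workspace, self.data.ground.files)
        scores = [self.scoring(content, self.data.ground) for content in files_contents]
        assert 1 in scores
```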
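The `Challenge.open_files` helper added in patch 17 accepts either bare extensions or concrete filenames in `ground.files`. A rough usage sketch; the workspace path and filenames here are assumptions, and the files must already exist in that directory:

```python
from agbenchmark.Challenge import Challenge

workspace = "agbenchmark/mocks/workspace"

# ".txt" is treated as an extension and globbed against every *.txt file in the
# workspace, while "file_to_check.txt" is resolved as that exact file.
for content in Challenge.open_files(workspace, [".txt", "file_to_check.txt"]):
    print(content[:80])
```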
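Finally, the JSON-backed `RegressionManager` from patch 16 can be exercised directly; the entry below mirrors the structure that the `pytest_runtest_makereport` hook writes to `regression_tests.json`:

```python
from agbenchmark.tests.regression.RegressionManager import RegressionManager

manager = RegressionManager("agbenchmark/tests/regression/regression_tests.json")

# Record a passing challenge; add_test() persists the file immediately.
manager.add_test(
    "TestWriteFile",
    {
        "difficulty": "basic",
        "dependencies": [],
        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
        "::TestWriteFile::test_method[challenge_data0-run_agent0]",
    },
)

# Remove it again if the same challenge later regresses.
manager.remove_test("TestWriteFile")
```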