Add 'Debug simple typo with guidance' challenge (#65)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
pull/5155/head
merwanehamadi 2023-07-07 13:50:53 -07:00 committed by GitHub
parent bfd0d5c826
commit 9ede17891b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
56 changed files with 288 additions and 85 deletions

View File

@ -1,4 +1,5 @@
import os
import shutil
import subprocess
import sys
import time
@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST")
def run_agent(
task: Optional[str], mock_func: Optional[str], config: Dict[str, Any]
task: Optional[str],
mock_func: Optional[str],
config: Dict[str, Any],
challenge_location: str,
) -> None:
"""Calling to get a response"""
if mock_func == None and MOCK_FLAG == "True":
print("No mock provided")
elif MOCK_FLAG == "True":
if MOCK_FLAG == "True":
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
if mock_func is None:
print("No mock provided")
return
mock_manager = MockManager(
task, config
) # workspace doesn't need to be passed in, stays the same
@ -77,4 +85,19 @@ def run_agent(
process.wait()
def copy_artifacts_into_workspace(
    workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
    """Copy every regular file from ``<challenge_dir_path>/<artifact_folder_name>``
    into the workspace directory.

    Subdirectories inside the artifact folder are ignored; a missing artifact
    folder is treated as a no-op so challenges without artifacts still run.
    """
    artifacts_dir = os.path.join(challenge_dir_path, artifact_folder_name)
    if not os.path.exists(artifacts_dir):
        # Nothing to stage for this challenge.
        return
    candidates = (
        os.path.join(artifacts_dir, entry) for entry in os.listdir(artifacts_dir)
    )
    for path in candidates:
        if os.path.isfile(path):
            shutil.copy(path, workspace)
ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"

View File

@ -1,9 +1,10 @@
import glob
import inspect
import os
import shutil
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
import subprocess
import types
from abc import ABC, ABCMeta, abstractmethod
from typing import Any, Dict, List, Optional, Tuple, Type, cast
import pytest
from dotenv import load_dotenv
@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST")
MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
class Challenge(ABC):
class ChallengeMeta(ABCMeta):
    """Metaclass that stamps each challenge class with CHALLENGE_LOCATION: the
    directory of the module that defined the class (used later to locate the
    challenge's artifact folders)."""

    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
        super().__init__(name, bases, dct)
        try:
            # The frame one level up (f_back) is the module body currently
            # executing the `class` statement that triggered this metaclass.
            frame = cast(types.FrameType, inspect.currentframe())
            assert frame.f_back is not None
            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
        except Exception as e:
            # Bug fix: the old message claimed "8 frames back" while the code
            # only inspects the immediate caller frame.
            print(f"Unable to get the file of the defining module for {name}: {str(e)}")
            # Bare `raise` preserves the original traceback.
            raise
class Challenge(ABC, metaclass=ChallengeMeta):
"""The parent class to all specific challenges classes.
Defines helper methods for running a challenge"""
@ -52,11 +66,13 @@ class Challenge(ABC):
return self.data.dependencies
def setup_challenge(self, config: Dict[str, Any]) -> None:
from agbenchmark.agent_interface import run_agent
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
self.copy_artifacts_into_workspace(config["workspace"])
copy_artifacts_into_workspace(
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
)
run_agent(self.task, self.mock, config)
run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
@property
def name(self) -> str:
@ -77,8 +93,7 @@ class Challenge(ABC):
with open(workspace_dir, "r") as f:
return f.read()
@staticmethod
def open_files(workspace: str, file_patterns: list) -> List[str]:
def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
script_dir = os.path.abspath(workspace)
files_contents = []
@ -92,8 +107,17 @@ class Challenge(ABC):
matching_files = [os.path.join(script_dir, file_pattern)]
for file_path in matching_files:
with open(file_path, "r") as f:
files_contents.append(f.read())
if self.data.ground.type == "execute_python_code":
result = subprocess.run(
["python3", file_path],
cwd=os.path.abspath(workspace),
capture_output=True,
text=True,
)
files_contents.append(result.stdout)
else:
with open(file_path, "r") as f:
files_contents.append(f.read())
return files_contents
@ -135,19 +159,3 @@ class Challenge(ABC):
)
return 1.0
def copy_artifacts_into_workspace(self, workspace: str) -> None:
curr_frame = inspect.currentframe()
outer_frame = inspect.getouterframes(curr_frame)[2]
caller_file_path = outer_frame.filename
caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path))
source_dir = os.path.join(caller_dir_path, "artifacts")
# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
return
for file_name in os.listdir(source_dir):
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)

View File

@ -33,7 +33,8 @@ Example:
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",

View File

@ -0,0 +1,8 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.code
class CodeChallenge(Challenge):
    """Base class for code challenges (tagged with the `code` pytest marker)."""

View File

@ -0,0 +1,13 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return indices of the two entries of *nums* that sum to *target*, or None.

    NOTE(review): this file is the challenge's *input* artifact
    (debug_simple_typo_with_guidance) — the bare `typo` name below is the
    intentional one-line bug the agent is expected to find and remove.
    """
    seen = {}
    for i, num in enumerate(nums):
        typo
        complement = target - num
        if complement in seen:
            # Found the pair: the earlier index first, the current one second.
            return [seen[complement], i]
        seen[num] = i
    return None

View File

@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run two_sum on (nums, target), print the result, and assert that it
    equals *expected_result*; the benchmark greps the printed index pairs."""
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # test the trivial case with the first two numbers
    nums = [2, 7, 11, 15]
    target = 9
    expected_result = [0, 1]
    test_two_sum(nums, target, expected_result)

    # test for ability to use zero and the same number twice
    nums = [2, 7, 0, 15, 12, 0]
    target = 0
    expected_result = [2, 5]
    test_two_sum(nums, target, expected_result)

    # test for first and last index usage and negative numbers
    nums = [-6, 7, 11, 4]
    target = -2
    expected_result = [0, 3]
    test_two_sum(nums, target, expected_result)

View File

@ -0,0 +1,12 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of the two entries of *nums* that sum to *target*.

    Uses a single pass with a value->index map; returns the earlier index
    first, or None when no such pair exists.
    """
    index_by_value: dict = {}
    for position, value in enumerate(nums):
        needed = target - value
        if needed in index_by_value:
            return [index_by_value[needed], position]
        index_by_value[value] = position
    return None

View File

@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run two_sum on (nums, target), print the result, and assert that it
    equals *expected_result*; the benchmark greps the printed index pairs."""
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # test the trivial case with the first two numbers
    nums = [2, 7, 11, 15]
    target = 9
    expected_result = [0, 1]
    test_two_sum(nums, target, expected_result)

    # test for ability to use zero and the same number twice
    nums = [2, 7, 0, 15, 12, 0]
    target = 0
    expected_result = [2, 5]
    test_two_sum(nums, target, expected_result)

    # test for first and last index usage and negative numbers
    nums = [-6, 7, 11, 4]
    target = -2
    expected_result = [0, 3]
    test_two_sum(nums, target, expected_result)

View File

@ -0,0 +1,22 @@
{
"name": "debug_simple_typo_with_guidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"ground": {
"answer": "[0, 1]\n[2, 5]\n[0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
}

View File

@ -0,0 +1,31 @@
import os
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
class TestDebugSimpleTypoWithGuidance(CodeChallenge):
    """Code challenge: the agent must fix a one-line typo in code.py until
    test.py runs without errors and prints the expected index pairs."""

    def get_file_path(self) -> str:  # all tests must implement this method
        # Path to this challenge's JSON data file, next to this test module.
        return os.path.join(
            os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
        )

    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
    def test_method(self, config: Dict[str, Any]) -> None:
        # Stage artifacts_in into the workspace and run the agent on the task.
        self.setup_challenge(config)

        # "execute_python_code" ground type: runs test.py and captures stdout.
        files_contents = self.get_artifacts_out(
            config["workspace"], self.data.ground.files
        )

        scores = []
        for file_content in files_contents:
            score = self.scoring(file_content, self.data.ground)
            print("Your score is:", score)
            scores.append(score)

        # The challenge passes if any produced output scores a full 1.
        assert 1 in scores

View File

@ -5,7 +5,7 @@ from pydantic import BaseModel
class Mock(BaseModel):
mock_func: str
mock_func: Optional[str] = None
mock_task: Optional[str] = None
@ -20,6 +20,7 @@ class Ground(BaseModel):
should_contain: Optional[List[str]] = None
should_not_contain: Optional[List[str]] = None
files: List[str]
type: str
class ChallengeData(BaseModel):

View File

@ -7,7 +7,8 @@
"answer": "2314",
"should_contain": ["2314"],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_memory_mock",

View File

@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",

View File

@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",

View File

@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -12,7 +12,8 @@
"The giant hamster rode a unicycle through the crowded mall"
],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_phrases_with_noise_mock",

View File

@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "£25.89",
"should_contain": ["25.89"],
"should_not_contain": [],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_mock",

View File

@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "81,462",
"should_contain": ["81,462"],
"should_not_contain": [],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_2_mock",

View File

@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
"should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"],
"should_not_contain": [],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_3_mock",

View File

@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -1,18 +1,6 @@
from agbenchmark.challenge import Challenge
def basic_read_file_mock(task: str, workspace: str) -> None:
"""
This mock reads a file and returns its content.
"""
file_contents = Challenge.open_file(workspace, "file_to_check.txt")
Challenge.write_to_file(
workspace, "file_to_check.txt", f"random string: {file_contents}"
)
def basic_write_file_mock(task: str, workspace: str) -> None:
"""
This mock writes to a file (creates one if it doesn't exist)

View File

@ -0,0 +1 @@
random string Hello World!

View File

@ -4,9 +4,10 @@
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["basic_write_file"],
"ground": {
"answer": "random string: Hello World!",
"should_contain": ["random string: Hello World!"],
"files": ["file_to_check.txt"]
"answer": "random string Hello World!",
"should_contain": ["random string", "Hello World!"],
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_read_file_mock"

View File

@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge):
@pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",

View File

@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -38,6 +38,7 @@ markers = [
"retrieval",
"regression",
"basic",
"code",
"memory"
]

View File

@ -1,9 +1,34 @@
{
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py"
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m1/m1_test.py"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
@ -19,31 +44,11 @@
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"basic_write_file"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
}
}