Add "Debug code without guidance" challenge (#66)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
2023-07-07 13:55:59 -07:00 · 2023-07-07 13:55:59 -07:00 · 6ef32a9b1f
parent 9ede17891b
commit 6ef32a9b1f
12 changed files with 150 additions and 3 deletions
--- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
@ -4,7 +4,7 @@
  "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
  "dependencies": [],
  "ground": {
-    "answer": "2314",
+    "answer": "[0, 1] [2, 5] [0, 3]",
    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
    "should_not_contain": [],
    "files": ["test.py"],
--- a/agbenchmark/challenges/code/d2/artifacts_in/init.py
+++ b/agbenchmark/challenges/code/d2/artifacts_in/init.py
--- a/agbenchmark/challenges/code/d2/artifacts_in/code.py
+++ b/agbenchmark/challenges/code/d2/artifacts_in/code.py
@ -0,0 +1,13 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        typo
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
--- a/agbenchmark/challenges/code/d2/artifacts_in/test.py
+++ b/agbenchmark/challenges/code/d2/artifacts_in/test.py
@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
--- a/agbenchmark/challenges/code/d2/artifacts_out/init.py
+++ b/agbenchmark/challenges/code/d2/artifacts_out/init.py
--- a/agbenchmark/challenges/code/d2/artifacts_out/code.py
+++ b/agbenchmark/challenges/code/d2/artifacts_out/code.py
@ -0,0 +1,12 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
--- a/agbenchmark/challenges/code/d2/artifacts_out/test.py
+++ b/agbenchmark/challenges/code/d2/artifacts_out/test.py
@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
--- a/agbenchmark/challenges/code/d2/d2_data.json
+++ b/agbenchmark/challenges/code/d2/d2_data.json
@ -0,0 +1,22 @@
+{
+  "name": "debug_simple_typo_without_guidance",
+  "category": ["code"],
+  "task": "Make test.py run without errors.",
+  "dependencies": [],
+  "ground": {
+    "answer": "[0, 1] [2, 5] [0, 3]",
+    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "mock": {
+    "mock_func": null,
+    "mock_task": null
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Tests the agent's ability to debug Python code containing a simple typo, given only a very broad prompt with no guidance",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
--- a/agbenchmark/challenges/code/d2/d2_test.py
+++ b/agbenchmark/challenges/code/d2/d2_test.py
@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.code.code import CodeChallenge
+
+
+class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
+    """Code challenge: debug a simple typo without guidance"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "d2_data.json")
+
+    @pytest.mark.depends(
+        name="test_debug_simple_typo_without_guidance",
+        depends=["test_debug_simple_typo_with_guidance"],
+    )
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.get_artifacts_out(
+            config["workspace"], self.data.ground.files
+        )
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
--- a/agbenchmark/mocks/mock_manager.py
+++ b/agbenchmark/mocks/mock_manager.py
@ -1,11 +1,11 @@
-from typing import Any, Dict
+from typing import Any, Dict, Optional

 import agbenchmark.mocks.tests.basic_mocks as basic_mocks
 import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks


 class MockManager:
-    def __init__(self, task: str, config: Dict[str, Any]) -> None:
+    def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None:
        self.task = task
        self.workspace = config["workspace"]
        self.modules = [basic_mocks, retrieval_mocks]
--- a/mypy.ini
+++ b/mypy.ini
@ -1,4 +1,5 @@
 [mypy]
+namespace_packages = True
 follow_imports = skip
 check_untyped_defs = True
 disallow_untyped_defs = True
--- a/regression_tests.json
+++ b/regression_tests.json
@ -50,5 +50,10 @@
            "basic_write_file"
        ],
        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+        "difficulty": "basic",
+        "dependencies": [],
+        "test": "agbenchmark/challenges/code/d2/d2_test.py"
    }
 }