Add "Debug code without guidance" challenge (#66)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
pull/5155/head
merwanehamadi 2023-07-07 13:55:59 -07:00 committed by GitHub
parent 9ede17891b
commit 6ef32a9b1f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 150 additions and 3 deletions

View File

@ -4,7 +4,7 @@
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"ground": {
"answer": "2314",
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],

View File

@ -0,0 +1,13 @@
# mypy: ignore-errors
from typing import List, Optional
# NOTE(review): this looks like the challenge's starting copy of the module
# imported by test.py (the task is to make test.py run without errors) --
# confirm before "repairing" anything here, and keep it in step with the
# reference solution instead.
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Scan ``nums`` for two entries whose sum equals ``target``.

    Intended contract, as exercised by test.py:

    Args:
        nums: Values to scan, in order.
        target: Sum the chosen pair has to reach.

    Returns:
        ``[earlier_index, later_index]`` for the first matching pair met
        during the scan, or ``None`` when the scan ends without a match.
    """
    # Maps every value visited so far to the index it was found at.
    seen = {}
    for i, num in enumerate(nums):
        typo
        # Value that must already have been visited to pair up with ``num``.
        complement = target - num
        if complement in seen:
            return [seen[complement], i]
        seen[num] = i
    return None

View File

@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on a single case, echo the outcome and verify it.

    Args:
        nums: Input values handed to ``two_sum``.
        target: Sum the returned pair of indices must reach.
        expected_result: Index pair ``two_sum`` is required to return.

    Raises:
        AssertionError: If ``two_sum`` returns anything other than
            ``expected_result``.
    """
    actual = two_sum(nums, target)
    # Echo before checking so the value is printed even when the case fails;
    # presumably this output is what the challenge's "should_contain" check
    # inspects -- verify against the harness.
    print(actual)
    is_match = actual == expected_result
    assert is_match, f"AssertionError: Expected the output to be {expected_result}"
if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); cases run in this order.
    cases = (
        # test the trivial case with the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # test for ability to use zero and the same number twice
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # test for first and last index usage and negative numbers
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -0,0 +1,12 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Find two positions in ``nums`` whose values add up to ``target``.

    The list is scanned once; every value is remembered together with the
    position it was found at, so each new value only needs a single lookup
    for its counterpart.

    Args:
        nums: Values to scan, in order.
        target: Sum the chosen pair has to reach.

    Returns:
        ``[earlier_index, later_index]`` for the first pair completed during
        the scan, or ``None`` when no two entries add up to ``target``.
    """
    index_of = {}
    for position, value in enumerate(nums):
        # Stored positions are ints, so ``None`` can only mean "not seen yet".
        earlier = index_of.get(target - value)
        if earlier is not None:
            return [earlier, position]
        index_of[value] = position
    return None

View File

@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on a single case, echo the outcome and verify it.

    Args:
        nums: Input values handed to ``two_sum``.
        target: Sum the returned pair of indices must reach.
        expected_result: Index pair ``two_sum`` is required to return.

    Raises:
        AssertionError: If ``two_sum`` returns anything other than
            ``expected_result``.
    """
    actual = two_sum(nums, target)
    # Echo before checking so the value is printed even when the case fails;
    # presumably this output is what the challenge's "should_contain" check
    # inspects -- verify against the harness.
    print(actual)
    is_match = actual == expected_result
    assert is_match, f"AssertionError: Expected the output to be {expected_result}"
if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); cases run in this order.
    cases = (
        # test the trivial case with the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # test for ability to use zero and the same number twice
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # test for first and last index usage and negative numbers
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -0,0 +1,22 @@
{
"name": "debug_simple_typo_without_guidance",
"category": ["code"],
"task": "Make test.py run without errors.",
"dependencies": [],
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
}
}

View File

@ -0,0 +1,32 @@
import os
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
class TestDebugSimpleTypoWithoutGuidance(CodeChallenge):
    """Code challenge: debug a simple typo, given only a broad prompt.

    The challenge definition is loaded from ``d2_data.json``, which sits in
    the same directory as this module.
    """

    def get_file_path(self) -> str:  # all tests must implement this method
        """Return the path of ``d2_data.json`` next to this module."""
        module_dir = os.path.dirname(__file__)
        return os.path.join(module_dir, "d2_data.json")

    @pytest.mark.depends(
        name="test_debug_simple_typo_without_guidance",
        depends=["test_debug_simple_typo_with_guidance"],
    )
    def test_method(self, config: Dict[str, Any]) -> None:
        """Set up the challenge, score its output artifacts and check them.

        Args:
            config: Run configuration; ``config["workspace"]`` is where the
                output artifacts are read from.

        Raises:
            AssertionError: If none of the artifacts receives a score of 1.
        """
        self.setup_challenge(config)

        artifacts = self.get_artifacts_out(
            config["workspace"], self.data.ground.files
        )

        scores = []
        for artifact in artifacts:
            artifact_score = self.scoring(artifact, self.data.ground)
            print("Your score is:", artifact_score)
            scores.append(artifact_score)

        # One fully scored artifact is enough for the challenge to pass.
        assert 1 in scores

View File

@ -1,11 +1,11 @@
from typing import Any, Dict
from typing import Any, Dict, Optional
import agbenchmark.mocks.tests.basic_mocks as basic_mocks
import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks
class MockManager:
def __init__(self, task: str, config: Dict[str, Any]) -> None:
def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None:
self.task = task
self.workspace = config["workspace"]
self.modules = [basic_mocks, retrieval_mocks]

View File

@ -1,4 +1,5 @@
[mypy]
namespace_packages = True
follow_imports = skip
check_untyped_defs = True
disallow_untyped_defs = True

View File

@ -50,5 +50,10 @@
"basic_write_file"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d2/d2_test.py"
}
}