Add 'Debug simple typo with guidance' challenge (#65)

Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
pull/5155/head
merwanehamadi 2023-07-07 13:50:53 -07:00 committed by GitHub
parent bfd0d5c826
commit 9ede17891b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
56 changed files with 288 additions and 85 deletions

View File

@ -1,4 +1,5 @@
import os
import shutil
import subprocess
import sys
import time
@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST")
def run_agent(
task: Optional[str], mock_func: Optional[str], config: Dict[str, Any]
task: Optional[str],
mock_func: Optional[str],
config: Dict[str, Any],
challenge_location: str,
) -> None:
"""Calling to get a response"""
if mock_func == None and MOCK_FLAG == "True":
print("No mock provided")
elif MOCK_FLAG == "True":
if MOCK_FLAG == "True":
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
if mock_func is None:
print("No mock provided")
return
mock_manager = MockManager(
task, config
) # workspace doesn't need to be passed in, stays the same
@ -77,4 +85,19 @@ def run_agent(
process.wait()
def copy_artifacts_into_workspace(
    workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
    """Copy every regular file from ``<challenge_dir_path>/<artifact_folder_name>``
    into the workspace directory.

    Subdirectories inside the artifact folder are ignored; a missing artifact
    folder is treated as a no-op so challenges without artifacts still run.
    """
    artifacts_dir = os.path.join(challenge_dir_path, artifact_folder_name)
    if not os.path.exists(artifacts_dir):
        # Nothing to stage for this challenge.
        return
    candidates = (
        os.path.join(artifacts_dir, entry) for entry in os.listdir(artifacts_dir)
    )
    for path in candidates:
        if os.path.isfile(path):
            shutil.copy(path, workspace)
ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"

View File

@ -1,9 +1,10 @@
import glob
import inspect
import os
import shutil
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
import subprocess
import types
from abc import ABC, ABCMeta, abstractmethod
from typing import Any, Dict, List, Optional, Tuple, Type, cast
import pytest
from dotenv import load_dotenv
@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST")
MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
class Challenge(ABC):
class ChallengeMeta(ABCMeta):
    """Metaclass that stamps each challenge class with CHALLENGE_LOCATION: the
    directory of the module that defined the class (used later to locate the
    challenge's artifact folders)."""

    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
        super().__init__(name, bases, dct)
        try:
            # The frame one level up (f_back) is the module body currently
            # executing the `class` statement that triggered this metaclass.
            frame = cast(types.FrameType, inspect.currentframe())
            assert frame.f_back is not None
            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
        except Exception as e:
            # Bug fix: the old message claimed "8 frames back" while the code
            # only inspects the immediate caller frame.
            print(f"Unable to get the file of the defining module for {name}: {str(e)}")
            # Bare `raise` preserves the original traceback.
            raise
class Challenge(ABC, metaclass=ChallengeMeta):
"""The parent class to all specific challenges classes.
Defines helper methods for running a challenge"""
@ -52,11 +66,13 @@ class Challenge(ABC):
return self.data.dependencies
def setup_challenge(self, config: Dict[str, Any]) -> None:
from agbenchmark.agent_interface import run_agent
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
self.copy_artifacts_into_workspace(config["workspace"])
copy_artifacts_into_workspace(
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
)
run_agent(self.task, self.mock, config)
run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
@property
def name(self) -> str:
@ -77,8 +93,7 @@ class Challenge(ABC):
with open(workspace_dir, "r") as f:
return f.read()
@staticmethod
def open_files(workspace: str, file_patterns: list) -> List[str]:
def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
script_dir = os.path.abspath(workspace)
files_contents = []
@ -92,8 +107,17 @@ class Challenge(ABC):
matching_files = [os.path.join(script_dir, file_pattern)]
for file_path in matching_files:
with open(file_path, "r") as f:
files_contents.append(f.read())
if self.data.ground.type == "execute_python_code":
result = subprocess.run(
["python3", file_path],
cwd=os.path.abspath(workspace),
capture_output=True,
text=True,
)
files_contents.append(result.stdout)
else:
with open(file_path, "r") as f:
files_contents.append(f.read())
return files_contents
@ -135,19 +159,3 @@ class Challenge(ABC):
)
return 1.0
def copy_artifacts_into_workspace(self, workspace: str) -> None:
curr_frame = inspect.currentframe()
outer_frame = inspect.getouterframes(curr_frame)[2]
caller_file_path = outer_frame.filename
caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path))
source_dir = os.path.join(caller_dir_path, "artifacts")
# Check if source_dir exists, if not then return immediately.
if not os.path.exists(source_dir):
return
for file_name in os.listdir(source_dir):
full_file_name = os.path.join(source_dir, file_name)
if os.path.isfile(full_file_name):
shutil.copy(full_file_name, workspace)

View File

@ -33,7 +33,8 @@ Example:
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",

View File

@ -0,0 +1,8 @@
import pytest
from agbenchmark.challenge import Challenge
@pytest.mark.code
class CodeChallenge(Challenge):
    """Base class for code challenges (tagged with the `code` pytest marker)."""

View File

@ -0,0 +1,13 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return indices of the two entries of *nums* that sum to *target*, or None.

    NOTE(review): this file is the challenge's *input* artifact
    (debug_simple_typo_with_guidance) — the bare `typo` name below is the
    intentional one-line bug the agent is expected to find and remove.
    """
    seen = {}
    for i, num in enumerate(nums):
        typo
        complement = target - num
        if complement in seen:
            # Found the pair: the earlier index first, the current one second.
            return [seen[complement], i]
        seen[num] = i
    return None

View File

@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run two_sum on (nums, target), print the result, and assert that it
    equals *expected_result*; the benchmark greps the printed index pairs."""
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # test the trivial case with the first two numbers
    nums = [2, 7, 11, 15]
    target = 9
    expected_result = [0, 1]
    test_two_sum(nums, target, expected_result)

    # test for ability to use zero and the same number twice
    nums = [2, 7, 0, 15, 12, 0]
    target = 0
    expected_result = [2, 5]
    test_two_sum(nums, target, expected_result)

    # test for first and last index usage and negative numbers
    nums = [-6, 7, 11, 4]
    target = -2
    expected_result = [0, 3]
    test_two_sum(nums, target, expected_result)

View File

@ -0,0 +1,12 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of the two entries of *nums* that sum to *target*.

    Uses a single pass with a value->index map; returns the earlier index
    first, or None when no such pair exists.
    """
    index_by_value: dict = {}
    for position, value in enumerate(nums):
        needed = target - value
        if needed in index_by_value:
            return [index_by_value[needed], position]
        index_by_value[value] = position
    return None

View File

@ -0,0 +1,31 @@
# mypy: ignore-errors
from code import two_sum
from typing import List
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run two_sum on (nums, target), print the result, and assert that it
    equals *expected_result*; the benchmark greps the printed index pairs."""
    result = two_sum(nums, target)
    print(result)
    assert (
        result == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # test the trivial case with the first two numbers
    nums = [2, 7, 11, 15]
    target = 9
    expected_result = [0, 1]
    test_two_sum(nums, target, expected_result)

    # test for ability to use zero and the same number twice
    nums = [2, 7, 0, 15, 12, 0]
    target = 0
    expected_result = [2, 5]
    test_two_sum(nums, target, expected_result)

    # test for first and last index usage and negative numbers
    nums = [-6, 7, 11, 4]
    target = -2
    expected_result = [0, 3]
    test_two_sum(nums, target, expected_result)

View File

@ -0,0 +1,22 @@
{
"name": "debug_simple_typo_with_guidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": [],
"ground": {
"answer": "[0, 1]\n[2, 5]\n[0, 3]",
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
"should_not_contain": [],
"files": ["test.py"],
"type": "execute_python_code"
},
"mock": {
"mock_func": null,
"mock_task": null
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"side_effects": ["tests if there is in fact an LLM attached"]
}
}

View File

@ -0,0 +1,31 @@
import os
from typing import Any, Dict
import pytest
from agbenchmark.challenges.code.code import CodeChallenge
class TestDebugSimpleTypoWithGuidance(CodeChallenge):
    """Code challenge: the agent must fix a one-line typo in code.py until
    test.py runs without errors and prints the expected index pairs."""

    def get_file_path(self) -> str:  # all tests must implement this method
        # Path to this challenge's JSON data file, next to this test module.
        return os.path.join(
            os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
        )

    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
    def test_method(self, config: Dict[str, Any]) -> None:
        # Stage artifacts_in into the workspace and run the agent on the task.
        self.setup_challenge(config)

        # "execute_python_code" ground type: runs test.py and captures stdout.
        files_contents = self.get_artifacts_out(
            config["workspace"], self.data.ground.files
        )

        scores = []
        for file_content in files_contents:
            score = self.scoring(file_content, self.data.ground)
            print("Your score is:", score)
            scores.append(score)

        # The challenge passes if any produced output scores a full 1.
        assert 1 in scores

View File

@ -5,7 +5,7 @@ from pydantic import BaseModel
class Mock(BaseModel):
mock_func: str
mock_func: Optional[str] = None
mock_task: Optional[str] = None
@ -20,6 +20,7 @@ class Ground(BaseModel):
should_contain: Optional[List[str]] = None
should_not_contain: Optional[List[str]] = None
files: List[str]
type: str
class ChallengeData(BaseModel):

View File

@ -7,7 +7,8 @@
"answer": "2314",
"should_contain": ["2314"],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_memory_mock",

View File

@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",

View File

@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "3145\n3791\n9317\n9471",
"should_contain": ["3145", "3791", "9317", "9471"],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_ids_mock",

View File

@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -12,7 +12,8 @@
"The giant hamster rode a unicycle through the crowded mall"
],
"should_not_contain": [],
"files": ["file_to_check.txt"]
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "remember_multiple_phrases_with_noise_mock",

View File

@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "£25.89",
"should_contain": ["25.89"],
"should_not_contain": [],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_mock",

View File

@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "81,462",
"should_contain": ["81,462"],
"should_not_contain": [],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_2_mock",

View File

@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
"should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"],
"should_not_contain": [],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_retrieval_3_mock",

View File

@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -1,18 +1,6 @@
from agbenchmark.challenge import Challenge
def basic_read_file_mock(task: str, workspace: str) -> None:
"""
This mock reads a file and returns its content.
"""
file_contents = Challenge.open_file(workspace, "file_to_check.txt")
Challenge.write_to_file(
workspace, "file_to_check.txt", f"random string: {file_contents}"
)
def basic_write_file_mock(task: str, workspace: str) -> None:
"""
This mock writes to a file (creates one if it doesn't exist)

View File

@ -0,0 +1 @@
random string Hello World!

View File

@ -4,9 +4,10 @@
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["basic_write_file"],
"ground": {
"answer": "random string: Hello World!",
"should_contain": ["random string: Hello World!"],
"files": ["file_to_check.txt"]
"answer": "random string Hello World!",
"should_contain": ["random string", "Hello World!"],
"files": ["file_to_check.txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_read_file_mock"

View File

@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge):
@pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -7,7 +7,8 @@
"answer": "Washington",
"should_contain": ["Washington"],
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
"files": [".txt"]
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_write_file_mock",

View File

@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge):
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
files_contents = self.open_files(config["workspace"], self.data.ground.files)
files_contents = self.get_artifacts_out(
config["workspace"], self.data.ground.files
)
scores = []
for file_content in files_contents:

View File

@ -38,6 +38,7 @@ markers = [
"retrieval",
"regression",
"basic",
"code",
"memory"
]

View File

@ -1,9 +1,34 @@
{
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py"
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m1/m1_test.py"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
@ -19,31 +44,11 @@
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"basic_write_file"
],
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
}
}