Add 'Debug simple typo with guidance' challenge (#65)
Signed-off-by: Merwane Hamadi <merwanehamadi@gmail.com>
pull/5155/head
parent
bfd0d5c826
commit
9ede17891b
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST")
|
|||
|
||||
|
||||
def run_agent(
|
||||
task: Optional[str], mock_func: Optional[str], config: Dict[str, Any]
|
||||
task: Optional[str],
|
||||
mock_func: Optional[str],
|
||||
config: Dict[str, Any],
|
||||
challenge_location: str,
|
||||
) -> None:
|
||||
"""Calling to get a response"""
|
||||
|
||||
if mock_func == None and MOCK_FLAG == "True":
|
||||
print("No mock provided")
|
||||
elif MOCK_FLAG == "True":
|
||||
if MOCK_FLAG == "True":
|
||||
copy_artifacts_into_workspace(
|
||||
config["workspace"], "artifacts_out", challenge_location
|
||||
)
|
||||
if mock_func is None:
|
||||
print("No mock provided")
|
||||
return
|
||||
mock_manager = MockManager(
|
||||
task, config
|
||||
) # workspace doesn't need to be passed in, stays the same
|
||||
|
@ -77,4 +85,19 @@ def run_agent(
|
|||
process.wait()
|
||||
|
||||
|
||||
def copy_artifacts_into_workspace(
    workspace: str, artifact_folder_name: str, challenge_dir_path: str
) -> None:
    """Copy a challenge's artifact files into the workspace.

    Args:
        workspace: Destination path handed to ``shutil.copy``.
        artifact_folder_name: Name of the artifact folder located inside
            ``challenge_dir_path``.
        challenge_dir_path: Directory that contains the artifact folder.

    Only regular files directly inside the artifact folder are copied;
    sub-directories are skipped. If the artifact folder is missing (or the
    path is not a directory) the call does nothing.
    """
    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)

    # isdir rather than exists: a stray regular file with the artifact
    # folder's name would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(source_dir):
        return

    for file_name in os.listdir(source_dir):
        full_file_name = os.path.join(source_dir, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, workspace)
|
||||
|
||||
|
||||
ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
import glob
|
||||
import inspect
|
||||
import os
|
||||
import shutil
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
import subprocess
|
||||
import types
|
||||
from abc import ABC, ABCMeta, abstractmethod
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type, cast
|
||||
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
|
@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST")
|
|||
MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
|
||||
|
||||
|
||||
class Challenge(ABC):
|
||||
class ChallengeMeta(ABCMeta):
    """Metaclass that records the directory a challenge class was created from.

    Every class built with this metaclass gets a ``CHALLENGE_LOCATION``
    attribute: the directory of the source file belonging to the frame that
    called this initializer (normally the module executing the ``class``
    statement).
    """

    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
        """Finish class creation and store ``CHALLENGE_LOCATION`` on the class.

        Raises:
            RuntimeError: If no caller frame is available.
            Exception: Any error raised while resolving the caller's file is
                printed and re-raised unchanged.
        """
        super().__init__(name, bases, dct)
        try:
            # currentframe() may return None on interpreters without frame
            # support, so check explicitly instead of casting the value away.
            frame = inspect.currentframe()
            caller = frame.f_back if frame is not None else None
            if caller is None:
                raise RuntimeError("no caller frame is available")
            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(caller))
        except Exception as e:
            print(f"Unable to get the file of the calling frame due to: {e}")
            # Bare raise keeps the original traceback intact.
            raise
|
||||
|
||||
|
||||
class Challenge(ABC, metaclass=ChallengeMeta):
|
||||
"""The parent class to all specific challenges classes.
|
||||
Defines helper methods for running a challenge"""
|
||||
|
||||
|
@ -52,11 +66,13 @@ class Challenge(ABC):
|
|||
return self.data.dependencies
|
||||
|
||||
def setup_challenge(self, config: Dict[str, Any]) -> None:
|
||||
from agbenchmark.agent_interface import run_agent
|
||||
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
|
||||
|
||||
self.copy_artifacts_into_workspace(config["workspace"])
|
||||
copy_artifacts_into_workspace(
|
||||
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
|
||||
)
|
||||
|
||||
run_agent(self.task, self.mock, config)
|
||||
run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION)
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
|
@ -77,8 +93,7 @@ class Challenge(ABC):
|
|||
with open(workspace_dir, "r") as f:
|
||||
return f.read()
|
||||
|
||||
@staticmethod
|
||||
def open_files(workspace: str, file_patterns: list) -> List[str]:
|
||||
def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]:
|
||||
script_dir = os.path.abspath(workspace)
|
||||
files_contents = []
|
||||
|
||||
|
@ -92,8 +107,17 @@ class Challenge(ABC):
|
|||
matching_files = [os.path.join(script_dir, file_pattern)]
|
||||
|
||||
for file_path in matching_files:
|
||||
with open(file_path, "r") as f:
|
||||
files_contents.append(f.read())
|
||||
if self.data.ground.type == "execute_python_code":
|
||||
result = subprocess.run(
|
||||
["python3", file_path],
|
||||
cwd=os.path.abspath(workspace),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
files_contents.append(result.stdout)
|
||||
else:
|
||||
with open(file_path, "r") as f:
|
||||
files_contents.append(f.read())
|
||||
|
||||
return files_contents
|
||||
|
||||
|
@ -135,19 +159,3 @@ class Challenge(ABC):
|
|||
)
|
||||
|
||||
return 1.0
|
||||
|
||||
def copy_artifacts_into_workspace(self, workspace: str) -> None:
|
||||
curr_frame = inspect.currentframe()
|
||||
outer_frame = inspect.getouterframes(curr_frame)[2]
|
||||
caller_file_path = outer_frame.filename
|
||||
caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path))
|
||||
source_dir = os.path.join(caller_dir_path, "artifacts")
|
||||
|
||||
# Check if source_dir exists, if not then return immediately.
|
||||
if not os.path.exists(source_dir):
|
||||
return
|
||||
|
||||
for file_name in os.listdir(source_dir):
|
||||
full_file_name = os.path.join(source_dir, file_name)
|
||||
if os.path.isfile(full_file_name):
|
||||
shutil.copy(full_file_name, workspace)
|
||||
|
|
|
@ -33,7 +33,8 @@ Example:
|
|||
"answer": "Washington",
|
||||
"should_contain": ["Washington"],
|
||||
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
|
||||
"files": [".txt"]
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_write_file_mock",
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
import pytest
|
||||
|
||||
from agbenchmark.challenge import Challenge
|
||||
|
||||
|
||||
@pytest.mark.code
|
||||
class CodeChallenge(Challenge):
|
||||
"""Challenge for code"""
|
|
@ -0,0 +1,13 @@
|
|||
# mypy: ignore-errors
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
||||
seen = {}
|
||||
for i, num in enumerate(nums):
|
||||
typo
|
||||
complement = target - num
|
||||
if complement in seen:
|
||||
return [seen[complement], i]
|
||||
seen[num] = i
|
||||
return None
|
|
@ -0,0 +1,31 @@
|
|||
# mypy: ignore-errors
|
||||
from code import two_sum
|
||||
from typing import List
|
||||
|
||||
|
||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
||||
result = two_sum(nums, target)
|
||||
print(result)
|
||||
assert (
|
||||
result == expected_result
|
||||
), f"AssertionError: Expected the output to be {expected_result}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# test the trivial case with the first two numbers
|
||||
nums = [2, 7, 11, 15]
|
||||
target = 9
|
||||
expected_result = [0, 1]
|
||||
test_two_sum(nums, target, expected_result)
|
||||
|
||||
# test for ability to use zero and the same number twice
|
||||
nums = [2, 7, 0, 15, 12, 0]
|
||||
target = 0
|
||||
expected_result = [2, 5]
|
||||
test_two_sum(nums, target, expected_result)
|
||||
|
||||
# test for first and last index usage and negative numbers
|
||||
nums = [-6, 7, 11, 4]
|
||||
target = -2
|
||||
expected_result = [0, 3]
|
||||
test_two_sum(nums, target, expected_result)
|
|
@ -0,0 +1,12 @@
|
|||
# mypy: ignore-errors
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
||||
seen = {}
|
||||
for i, num in enumerate(nums):
|
||||
complement = target - num
|
||||
if complement in seen:
|
||||
return [seen[complement], i]
|
||||
seen[num] = i
|
||||
return None
|
|
@ -0,0 +1,31 @@
|
|||
# mypy: ignore-errors
|
||||
from code import two_sum
|
||||
from typing import List
|
||||
|
||||
|
||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
||||
result = two_sum(nums, target)
|
||||
print(result)
|
||||
assert (
|
||||
result == expected_result
|
||||
), f"AssertionError: Expected the output to be {expected_result}"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# test the trivial case with the first two numbers
|
||||
nums = [2, 7, 11, 15]
|
||||
target = 9
|
||||
expected_result = [0, 1]
|
||||
test_two_sum(nums, target, expected_result)
|
||||
|
||||
# test for ability to use zero and the same number twice
|
||||
nums = [2, 7, 0, 15, 12, 0]
|
||||
target = 0
|
||||
expected_result = [2, 5]
|
||||
test_two_sum(nums, target, expected_result)
|
||||
|
||||
# test for first and last index usage and negative numbers
|
||||
nums = [-6, 7, 11, 4]
|
||||
target = -2
|
||||
expected_result = [0, 3]
|
||||
test_two_sum(nums, target, expected_result)
|
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
"name": "debug_simple_typo_with_guidance",
|
||||
"category": ["code"],
|
||||
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
|
||||
"dependencies": [],
|
||||
"ground": {
|
||||
"answer": "2314",
|
||||
"should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
|
||||
"should_not_contain": [],
|
||||
"files": ["test.py"],
|
||||
"type": "execute_python_code"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": null,
|
||||
"mock_task": null
|
||||
},
|
||||
"info": {
|
||||
"difficulty": "basic",
|
||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
||||
"side_effects": ["tests if there is in fact an LLM attached"]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
|
||||
from agbenchmark.challenges.code.code import CodeChallenge
|
||||
|
||||
|
||||
class TestDebugSimpleTypoWithGuidance(CodeChallenge):
|
||||
"""The first code challenge: debug a simple typo with guidance"""
|
||||
|
||||
def get_file_path(self) -> str: # all tests must implement this method
|
||||
return os.path.join(
|
||||
os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
|
||||
)
|
||||
|
||||
@pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
|
||||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
score = self.scoring(file_content, self.data.ground)
|
||||
print("Your score is:", score)
|
||||
scores.append(score)
|
||||
|
||||
assert 1 in scores
|
|
@ -5,7 +5,7 @@ from pydantic import BaseModel
|
|||
|
||||
|
||||
class Mock(BaseModel):
|
||||
mock_func: str
|
||||
mock_func: Optional[str] = None
|
||||
mock_task: Optional[str] = None
|
||||
|
||||
|
||||
|
@ -20,6 +20,7 @@ class Ground(BaseModel):
|
|||
should_contain: Optional[List[str]] = None
|
||||
should_not_contain: Optional[List[str]] = None
|
||||
files: List[str]
|
||||
type: str
|
||||
|
||||
|
||||
class ChallengeData(BaseModel):
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "2314",
|
||||
"should_contain": ["2314"],
|
||||
"should_not_contain": [],
|
||||
"files": ["file_to_check.txt"]
|
||||
"files": ["file_to_check.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_memory_mock",
|
||||
|
|
|
@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "3145\n3791\n9317\n9471",
|
||||
"should_contain": ["3145", "3791", "9317", "9471"],
|
||||
"should_not_contain": [],
|
||||
"files": ["file_to_check.txt"]
|
||||
"files": ["file_to_check.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "remember_multiple_ids_mock",
|
||||
|
|
|
@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "3145\n3791\n9317\n9471",
|
||||
"should_contain": ["3145", "3791", "9317", "9471"],
|
||||
"should_not_contain": [],
|
||||
"files": ["file_to_check.txt"]
|
||||
"files": ["file_to_check.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "remember_multiple_ids_mock",
|
||||
|
|
|
@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -12,7 +12,8 @@
|
|||
"The giant hamster rode a unicycle through the crowded mall"
|
||||
],
|
||||
"should_not_contain": [],
|
||||
"files": ["file_to_check.txt"]
|
||||
"files": ["file_to_check.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "remember_multiple_phrases_with_noise_mock",
|
||||
|
|
|
@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "£25.89",
|
||||
"should_contain": ["25.89"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"]
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_retrieval_mock",
|
||||
|
|
|
@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "81,462",
|
||||
"should_contain": ["81,462"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"]
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_retrieval_2_mock",
|
||||
|
|
|
@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
|
||||
"should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"],
|
||||
"should_not_contain": [],
|
||||
"files": [".txt"]
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_retrieval_3_mock",
|
||||
|
|
|
@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -1,18 +1,6 @@
|
|||
from agbenchmark.challenge import Challenge
|
||||
|
||||
|
||||
def basic_read_file_mock(task: str, workspace: str) -> None:
|
||||
"""
|
||||
This mock reads a file and returns its content.
|
||||
"""
|
||||
|
||||
file_contents = Challenge.open_file(workspace, "file_to_check.txt")
|
||||
|
||||
Challenge.write_to_file(
|
||||
workspace, "file_to_check.txt", f"random string: {file_contents}"
|
||||
)
|
||||
|
||||
|
||||
def basic_write_file_mock(task: str, workspace: str) -> None:
|
||||
"""
|
||||
This mock writes to a file (creates one if it doesn't exist)
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
random string Hello World!
|
|
@ -4,9 +4,10 @@
|
|||
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
|
||||
"dependencies": ["basic_write_file"],
|
||||
"ground": {
|
||||
"answer": "random string: Hello World!",
|
||||
"should_contain": ["random string: Hello World!"],
|
||||
"files": ["file_to_check.txt"]
|
||||
"answer": "random string Hello World!",
|
||||
"should_contain": ["random string", "Hello World!"],
|
||||
"files": ["file_to_check.txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_read_file_mock"
|
||||
|
|
|
@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge):
|
|||
@pytest.mark.depends(on=["basic_write_file"], name="basic_read_file")
|
||||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
"answer": "Washington",
|
||||
"should_contain": ["Washington"],
|
||||
"should_not_contain": ["New York", "Los Angeles", "San Francisco"],
|
||||
"files": [".txt"]
|
||||
"files": [".txt"],
|
||||
"type": "file"
|
||||
},
|
||||
"mock": {
|
||||
"mock_func": "basic_write_file_mock",
|
||||
|
|
|
@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge):
|
|||
def test_method(self, config: Dict[str, Any]) -> None:
|
||||
self.setup_challenge(config)
|
||||
|
||||
files_contents = self.open_files(config["workspace"], self.data.ground.files)
|
||||
files_contents = self.get_artifacts_out(
|
||||
config["workspace"], self.data.ground.files
|
||||
)
|
||||
|
||||
scores = []
|
||||
for file_content in files_contents:
|
||||
|
|
|
@ -38,6 +38,7 @@ markers = [
|
|||
"retrieval",
|
||||
"regression",
|
||||
"basic",
|
||||
"code",
|
||||
"memory"
|
||||
]
|
||||
|
||||
|
|
|
@ -1,9 +1,34 @@
|
|||
{
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py"
|
||||
},
|
||||
"TestBasicMemory": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m1/m1_test.py"
|
||||
},
|
||||
"TestRememberMultipleIds": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
|
||||
},
|
||||
"TestRememberMultipleIdsWithNoise": {
|
||||
"difficulty": "medium",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
|
||||
},
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"difficulty": "medium",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
|
||||
},
|
||||
"TestRetrieval": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
|
||||
},
|
||||
"TestWriteFile": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
|
@ -19,31 +44,11 @@
|
|||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
|
||||
},
|
||||
"TestRetrieval": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
|
||||
},
|
||||
"TestReadFile": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [
|
||||
"basic_write_file"
|
||||
],
|
||||
"test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py"
|
||||
},
|
||||
"TestRememberMultipleIds": {
|
||||
"difficulty": "basic",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
|
||||
},
|
||||
"TestRememberMultipleIdsWithNoise": {
|
||||
"difficulty": "medium",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
|
||||
},
|
||||
"TestRememberMultiplePhrasesWithNoise": {
|
||||
"difficulty": "medium",
|
||||
"dependencies": [],
|
||||
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue