Just json, no test files (#77)

parent 573130549f
commit 3d43117554
@@ -11,9 +11,18 @@ class RegressionManager:
     def load(self) -> None:
         try:
             with open(self.filename, "r") as f:
-                self.tests = json.load(f)
-        except (FileNotFoundError, json.decoder.JSONDecodeError):
+                file_content = (
+                    f.read().strip()
+                )  # read the content and remove any leading/trailing whitespace
+                if file_content:  # if file is not empty, load the json
+                    self.tests = json.loads(file_content)
+                else:  # if file is empty, assign an empty dictionary
+                    self.tests = {}
+        except FileNotFoundError:
             self.tests = {}
+        except json.decoder.JSONDecodeError:  # If JSON is invalid
+            self.tests = {}
+        self.save()
 
     def save(self) -> None:
         with open(self.filename, "w") as f:
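The rewritten load() degrades gracefully: a missing file, an empty file, and invalid JSON all fall back to an empty dict, and the trailing self.save() rewrites the file so it is well-formed afterwards. A minimal stand-alone sketch of that behaviour (TolerantStore, its constructor, and the simplified save() are hypothetical stand-ins for the real RegressionManager, which is only partially visible in the hunk above):

import json

class TolerantStore:
    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.tests: dict = {}

    def load(self) -> None:
        try:
            with open(self.filename, "r") as f:
                file_content = f.read().strip()  # drop surrounding whitespace
                # empty file -> {}, otherwise parse the JSON
                self.tests = json.loads(file_content) if file_content else {}
        except FileNotFoundError:  # no file yet
            self.tests = {}
        except json.decoder.JSONDecodeError:  # corrupt or invalid JSON
            self.tests = {}
        self.save()  # rewrite so the file is always well-formed afterwards

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)

store = TolerantStore("regression_tests.json")
store.load()  # never raises for missing, empty, or corrupt files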
@@ -1,10 +1,8 @@
 import glob
-import inspect
 import os
 import subprocess
-import types
-from abc import ABC, ABCMeta
-from typing import Any, Dict, List, Tuple, Type, cast
+from abc import ABC
+from typing import Any, Dict, List
 
 from dotenv import load_dotenv
 
@@ -16,24 +14,12 @@ mock_test_str = os.getenv("MOCK_TEST")
 MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
 
 
-class ChallengeMeta(ABCMeta):
-    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
-        super().__init__(name, bases, dct)
-        try:
-            frame = cast(types.FrameType, inspect.currentframe())
-            assert frame.f_back is not None
-            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
-        except Exception as e:
-            print(f"Unable to get the file from 8 frames back due to: {str(e)}")
-            raise e
-
-
-class Challenge(ABC, metaclass=ChallengeMeta):
+class Challenge(ABC):
     """The parent class to all specific challenges classes.
     Defines helper methods for running a challenge"""
 
     _data_cache: Dict[str, ChallengeData] = {}
-    CHALLENGE_LOCATION: str
+    CHALLENGE_LOCATION: str = ""
 
     @property
     def data(self) -> ChallengeData:
@@ -54,10 +40,10 @@ class Challenge(ABC, metaclass=ChallengeMeta):
         from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
 
         copy_artifacts_into_workspace(
-            config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
+            config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
         )
 
-        run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)
+        run_agent(self.task, config, self.CHALLENGE_LOCATION)
 
     def test_method(self, config: Dict[str, Any]) -> None:
         raise NotImplementedError
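Dropping the explicit self.__class__ hop is safe because attribute lookup on an instance already falls back to its class. A quick demonstration with hypothetical class names (not part of the commit):

# `self.CHALLENGE_LOCATION` and `self.__class__.CHALLENGE_LOCATION` resolve
# to the same class attribute unless an instance attribute shadows it.
class Base:
    CHALLENGE_LOCATION: str = ""

class MemoryChallenge(Base):  # hypothetical subclass
    CHALLENGE_LOCATION = "agbenchmark/challenges/memory/m1"

instance = MemoryChallenge()
assert instance.CHALLENGE_LOCATION == instance.__class__.CHALLENGE_LOCATION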
@@ -1,4 +1,5 @@
 {
+  "name": "TestDebugSimpleTypoWithGuidance",
   "category": ["code"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": ["TestReadFile", "TestWriteFile"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestDebugSimpleTypoWithGuidance(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,14 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestDebugSimpleTypoWithoutGuidance(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestDebugSimpleTypoWithoutGuidance",
   "category": ["code"],
   "task": "Make test.py run without errors.",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
@@ -19,6 +19,7 @@ class Ground(BaseModel):
 
 
 class ChallengeData(BaseModel):
+    name: str
     category: List[str]
     task: str
     dependencies: List[str]
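Because ChallengeData extends BaseModel (presumably pydantic's, as the class shape suggests), adding name: str makes the field required: a data.json without it now fails at load time instead of producing a nameless test. A minimal sketch, trimmed to the fields visible in this hunk:

from typing import List

from pydantic import BaseModel, ValidationError

class ChallengeData(BaseModel):  # trimmed; the real model has more fields
    name: str
    category: List[str]
    task: str
    dependencies: List[str]

try:
    ChallengeData(category=["code"], task="do x", dependencies=[])
except ValidationError as err:
    print(err)  # "name" is reported as a missing required field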
@@ -1,5 +1,5 @@
 {
-  "name": "ReadFile",
+  "name": "TestReadFile",
   "category": ["interface"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
   "dependencies": ["TestWriteFile"],
@@ -1,12 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestReadFile(Challenge):
-    """Testing if LLM can read a file"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,5 +1,5 @@
 {
-  "name": "WriteFile",
+  "name": "TestWriteFile",
   "category": ["interface"],
   "task": "Print the capital of America to a .txt file",
   "dependencies": [],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestWriteFile(Challenge):
-    """Testing if LLM can write to a file"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestBasicMemory",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestReadFile", "TestWriteFile"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestBasicMemory(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRememberMultipleIds",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestBasicMemory"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRememberMultipleIds(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRememberMultipleIdsWithNoise",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIds"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRememberMultipleIdsWithNoise(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRememberMultiplePhrasesWithNoise",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIdsWithNoise"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRememberMultiplePhrasesWithNoise(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestBasicRetrieval",
   "category": ["retrieval"],
   "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
   "dependencies": ["TestWriteFile"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRetrieval(Challenge):
-    """The first information-retrieval challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,7 +1,8 @@
 {
+  "name": "TestRetrieval2",
   "category": ["retrieval"],
   "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
-  "dependencies": ["TestRetrieval"],
+  "dependencies": ["TestBasicRetrieval"],
   "ground": {
     "answer": "81,462",
     "should_contain": ["81,462"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRetrieval2(Challenge):
-    """The first information-retrieval challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRetrieval3",
   "category": ["retrieval"],
   "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
   "dependencies": ["TestRetrieval2"],
@@ -1,14 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRetrieval3(Challenge):
-    """The first information-retrieval challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-
-        assert 1 in scores
@@ -0,0 +1,78 @@
+import glob
+import importlib
+import json
+import os
+import types
+from pathlib import Path
+from typing import Any, Dict
+
+import pytest
+from dotenv import load_dotenv
+
+from agbenchmark.challenge import Challenge
+
+load_dotenv()
+
+IMPROVE = os.getenv("IMPROVE", "False")
+
+
+json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True)
+
+
+def get_test_path(json_file: str) -> str:
+    abs_location = os.path.dirname(os.path.abspath(json_file))
+
+    path = Path(abs_location)
+
+    # Find the index of "agbenchmark" in the path parts
+    try:
+        agbenchmark_index = path.parts.index("agbenchmark")
+    except ValueError:
+        raise ValueError("Invalid challenge location.")
+
+    # Create the path from "agbenchmark" onwards
+    challenge_location = Path(*path.parts[agbenchmark_index:])
+
+    return str(challenge_location)
+
+
+def generate_tests() -> None:
+    print("Generating tests...")
+    # Dynamic class creation
+    for json_file in json_files:
+        with open(json_file, "r") as f:
+            data = json.load(f)
+
+        class_name = data.get("name", "")
+
+        challenge_location = get_test_path(json_file)
+
+        # Define test class dynamically
+        challenge_class = types.new_class(class_name, (Challenge,))
+
+        setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)
+
+        # Define test method within the dynamically created class
+        def test_method(self, config: Dict[str, Any]) -> None:  # type: ignore
+            self.setup_challenge(config)
+
+            scores = self.get_scores(config)
+            assert 1 in scores
+
+        # Parametrize the method here
+        test_method = pytest.mark.parametrize(
+            "challenge_data",
+            [data],
+            indirect=True,
+        )(test_method)
+
+        setattr(challenge_class, "test_method", test_method)
+
+        # Attach the new class to a module so it can be discovered by pytest
+        module = importlib.import_module(__name__)
+        setattr(module, class_name, challenge_class)
+
+        print(f"Generated test for {class_name}.")
+
+
+generate_tests()
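get_test_path trims everything before the "agbenchmark" path component, so the generated CHALLENGE_LOCATION values are repo-relative. A standalone demo of the same trimming (the example path is hypothetical; note that on Windows str(Path(...)) uses backslashes, which is exactly where the mixed separators in regression_tests.json further down come from):

import os
from pathlib import Path

def trim_to_agbenchmark(json_file: str) -> str:
    # same logic as get_test_path above: dirname, then keep the path parts
    # from the "agbenchmark" component onwards
    path = Path(os.path.dirname(os.path.abspath(json_file)))
    agbenchmark_index = path.parts.index("agbenchmark")
    return str(Path(*path.parts[agbenchmark_index:]))

print(trim_to_agbenchmark("/home/user/repo/agbenchmark/challenges/memory/m1/data.json"))
# prints "agbenchmark/challenges/memory/m1" on POSIX, backslash-separated on Windows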
@@ -88,13 +88,16 @@ def check_regression(request: Any) -> None:
     test_name = request.node.parent.name
     data = get_regression_data()
 
+    # Get the true location of the test
+    challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
+
+    skip_string = f"Skipping {test_name} at {challenge_location}"
+
     # Check if the test name exists in the regression tests
     if request.config.getoption("--improve") and data.get(test_name, None):
-        pytest.skip("Skipping test because it's a regression test and --improve is set")
+        pytest.skip(f"{skip_string} because it's a regression test")
     elif request.config.getoption("--maintain") and not data.get(test_name, None):
-        pytest.skip(
-            "Skipping test because it's not a regression test and --maintain is set"
-        )
+        pytest.skip(f"{skip_string} because it's not a regression test")
 
 
 # this is to get the challenge_data from every test
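check_regression reads --improve and --maintain from the pytest config; their registration is not shown in this diff. A typical conftest.py hook for it would look like the following sketch, under the assumption that both are simple boolean flags (the real repo's registration and help text may differ):

from typing import Any

def pytest_addoption(parser: Any) -> None:
    # hypothetical registration of the flags consumed by check_regression
    parser.addoption("--improve", action="store_true", default=False,
                     help="only run tests that are not yet regression tests")
    parser.addoption("--maintain", action="store_true", default=False,
                     help="only run tests that already are regression tests")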
@@ -109,15 +112,19 @@ regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     if call.when == "call":
         challenge_data = item.funcargs.get("challenge_data", None)
-        difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
-        dependencies = challenge_data.dependencies if challenge_data else []
-        parts = item.nodeid.split("::")[0].split("/")
-        agbenchmark_index = parts.index("agbenchmark")
-        file_path = "/".join(parts[agbenchmark_index:])
+        difficulty = (
+            challenge_data["info"]["difficulty"] if challenge_data else "unknown"
+        )
+        dependencies = (
+            challenge_data["dependencies"] if challenge_data else []
+        )
+        # Extract the challenge_location from the class
+        challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
 
         test_details = {
             "difficulty": difficulty,
             "dependencies": dependencies,
-            "test": file_path,
+            "test": challenge_location,
         }
 
+        print("pytest_runtest_makereport", test_details)
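What the hook does with test_details after the print is cut off by the hunk; given that pytest_sessionfinish later calls regression_manager.save(), a plausible continuation records passing tests and evicts failing ones. This is a sketch only: the method names add_test and remove_test are assumptions, not confirmed by the diff.

from typing import Any, Dict

def record_outcome(regression_manager: Any, test_name: str,
                   test_details: Dict[str, Any], call: Any) -> None:
    # hypothetical continuation of pytest_runtest_makereport: keep the
    # regression file in sync with the outcome of the call phase
    if call.excinfo is None:   # test passed: remember it as a regression test
        regression_manager.add_test(test_name, test_details)
    else:                      # test failed: drop it from the regression set
        regression_manager.remove_test(test_name)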
@@ -132,19 +139,6 @@ def pytest_sessionfinish() -> None:
     regression_manager.save()
 
 
-# this is so that all tests can inherit from the Challenge class
-def pytest_generate_tests(metafunc: Any) -> None:
-    if "challenge_data" in metafunc.fixturenames:
-        # Get the instance of the test class
-        test_class = metafunc.cls()
-
-        # Generate the parameters
-        params = test_class.data
-
-        # Add the parameters to the test function
-        metafunc.parametrize("challenge_data", [params], indirect=True)
-
-
 # this is adding the dependency marker and category markers automatically from the json
 def pytest_collection_modifyitems(items: Any, config: Any) -> None:
     data = get_regression_data()
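The body of pytest_collection_modifyitems is truncated here; per its comment it turns the json's category and dependencies fields into markers. A sketch of that pattern, with hypothetical field access and assuming the pytest-depends plugin supplies the depends marker:

from typing import Any

import pytest

def apply_markers(items: Any) -> None:
    # hypothetical marker wiring for the truncated hook above: one marker
    # per category, plus a dependency marker from pytest-depends
    for item in items:
        data = getattr(item.cls, "data", None)  # the ChallengeData, if any
        if data is None:
            continue
        for category in data.category:
            item.add_marker(getattr(pytest.mark, category))
        item.add_marker(pytest.mark.depends(on=data.dependencies))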
@@ -1,59 +1,64 @@
 {
-  "TestBasicMemory": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/memory/m1/m1_test.py"
-  },
-  "TestRememberMultipleIds": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestBasicMemory"
-    ],
-    "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
-  },
-  "TestRememberMultipleIdsWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIds"
-    ],
-    "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
-  },
-  "TestRememberMultiplePhrasesWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIdsWithNoise"
-    ],
-    "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
-  },
-  "TestRetrieval": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
-  },
-  "TestRetrieval2": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestRetrieval"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
-  },
-  "TestRetrieval3": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestRetrieval2"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
-  },
   "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
-    "test": "agbenchmark/challenges/interface/write_file/write_file_test.py"
+    "test": "agbenchmark\\challenges\\interface\\write_file"
   },
   "TestReadFile": {
     "difficulty": "basic",
     "dependencies": [
       "TestWriteFile"
     ],
-    "test": "agbenchmark/challenges/interface/read_file/read_file_test.py"
+    "test": "agbenchmark\\challenges\\interface\\read_file"
   },
+  "TestBasicMemory": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestReadFile",
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m1"
+  },
+  "TestBasicRetrieval": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark\\challenges\\retrieval\\r1"
+  },
+  "TestRememberMultipleIds": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicMemory"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m2"
+  },
+  "TestRetrieval2": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicRetrieval"
+    ],
+    "test": "agbenchmark\\challenges\\retrieval\\r2"
+  },
+  "TestRememberMultipleIdsWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIds"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m3"
+  },
+  "TestRetrieval3": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestRetrieval2"
+    ],
+    "test": "agbenchmark\\challenges\\retrieval\\r3"
+  },
+  "TestRememberMultiplePhrasesWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIdsWithNoise"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m4"
+  }
 }
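The rewritten "test" entries above use backslashes because str(Path(...)) was evidently run on Windows, while the untouched entries keep forward slashes. If the file needs to be comparable across platforms, a small normalizer like this suffices (a hypothetical helper, not part of this commit):

from pathlib import PureWindowsPath

def normalize_location(location: str) -> str:
    # PureWindowsPath accepts both separator styles; as_posix() emits "/"
    return PureWindowsPath(location).as_posix()

assert normalize_location("agbenchmark\\challenges\\memory\\m1") == "agbenchmark/challenges/memory/m1"
assert normalize_location("agbenchmark/challenges/retrieval/r1") == "agbenchmark/challenges/retrieval/r1"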