Just json, no test files (#77)

pull/5155/head
Silen Naihin 2023-07-09 20:27:21 -04:00 committed by GitHub
parent 573130549f
commit 3d43117554
28 changed files with 177 additions and 239 deletions

View File

@@ -11,9 +11,18 @@ class RegressionManager:
def load(self) -> None:
try:
with open(self.filename, "r") as f:
self.tests = json.load(f)
except (FileNotFoundError, json.decoder.JSONDecodeError):
file_content = (
f.read().strip()
) # read the content and remove any leading/trailing whitespace
if file_content: # if file is not empty, load the json
self.tests = json.loads(file_content)
else: # if file is empty, assign an empty dictionary
self.tests = {}
except FileNotFoundError:
self.tests = {}
except json.decoder.JSONDecodeError: # If JSON is invalid
self.tests = {}
self.save()
def save(self) -> None:
with open(self.filename, "w") as f:
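For context, a minimal standalone sketch of the manager with the new load behaviour; the constructor and the body of save are assumed for illustration and are not part of this diff.

import json
from typing import Any, Dict


class RegressionManagerSketch:
    """Hypothetical reconstruction for illustration only; mirrors the new load()."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.tests: Dict[str, Any] = {}
        self.load()

    def load(self) -> None:
        try:
            with open(self.filename, "r") as f:
                # read the content and strip leading/trailing whitespace
                file_content = f.read().strip()
                if file_content:  # non-empty file: parse the JSON
                    self.tests = json.loads(file_content)
                else:  # empty file: fall back to an empty dict
                    self.tests = {}
        except FileNotFoundError:
            self.tests = {}
        except json.decoder.JSONDecodeError:  # invalid JSON
            self.tests = {}
        self.save()  # rewrite the file so it always holds valid JSON afterwards

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)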

View File

@@ -1,10 +1,8 @@
import glob
import inspect
import os
import subprocess
import types
from abc import ABC, ABCMeta
from typing import Any, Dict, List, Tuple, Type, cast
from abc import ABC
from typing import Any, Dict, List
from dotenv import load_dotenv
@@ -16,24 +14,12 @@ mock_test_str = os.getenv("MOCK_TEST")
MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
class ChallengeMeta(ABCMeta):
def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
super().__init__(name, bases, dct)
try:
frame = cast(types.FrameType, inspect.currentframe())
assert frame.f_back is not None
self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
except Exception as e:
print(f"Unable to get the file from 8 frames back due to: {str(e)}")
raise e
class Challenge(ABC, metaclass=ChallengeMeta):
class Challenge(ABC):
"""The parent class to all specific challenges classes.
Defines helper methods for running a challenge"""
_data_cache: Dict[str, ChallengeData] = {}
CHALLENGE_LOCATION: str
CHALLENGE_LOCATION: str = ""
@property
def data(self) -> ChallengeData:
@@ -54,10 +40,10 @@ class Challenge(ABC, metaclass=ChallengeMeta):
from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
copy_artifacts_into_workspace(
config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
)
run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)
run_agent(self.task, config, self.CHALLENGE_LOCATION)
def test_method(self, config: Dict[str, Any]) -> None:
raise NotImplementedError
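With ChallengeMeta removed, CHALLENGE_LOCATION is a plain class attribute that callers assign directly. A hand-written subclass in the style of the deleted *_test.py files would now look like the sketch below (hypothetical class and path, for illustration only):

from typing import Any, Dict

from agbenchmark.challenge import Challenge


class TestExampleChallenge(Challenge):
    # assigned explicitly instead of being derived from the caller's frame by ChallengeMeta
    CHALLENGE_LOCATION = "agbenchmark/challenges/example"  # made-up path

    def test_method(self, config: Dict[str, Any]) -> None:
        self.setup_challenge(config)
        scores = self.get_scores(config)
        assert 1 in scores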

View File

@@ -1,4 +1,5 @@
{
"name": "TestDebugSimpleTypoWithGuidance",
"category": ["code"],
"task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
"dependencies": ["TestReadFile", "TestWriteFile"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestDebugSimpleTypoWithGuidance(Challenge):
"""The first memory challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,14 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestDebugSimpleTypoWithoutGuidance(Challenge):
"""The first memory challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestDebugSimpleTypoWithoutGuidance",
"category": ["code"],
"task": "Make test.py run without errors.",
"dependencies": ["TestDebugSimpleTypoWithGuidance"],

View File

@@ -19,6 +19,7 @@ class Ground(BaseModel):
class ChallengeData(BaseModel):
name: str
category: List[str]
task: str
dependencies: List[str]
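As a rough illustration, a dict loaded from data.json now has to carry the new category field alongside the existing ones; shown here in Python form, with the ground and info sections of the real files elided:

example_challenge_data = {
    "name": "TestReadFile",
    "category": ["interface"],  # the field added by this change
    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
    "dependencies": ["TestWriteFile"],
    # "ground": {...} and "info": {...} omitted in this sketch
}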

View File

@@ -1,5 +1,5 @@
{
"name": "ReadFile",
"name": "TestReadFile",
"category": ["interface"],
"task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
"dependencies": ["TestWriteFile"],

View File

@@ -1,12 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestReadFile(Challenge):
"""Testing if LLM can read a file"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,5 +1,5 @@
{
"name": "WriteFile",
"name": "TestWriteFile",
"category": ["interface"],
"task": "Print the the capital of America to a .txt file",
"dependencies": [],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestWriteFile(Challenge):
"""Testing if LLM can write to a file"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestBasicMemory",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": ["TestReadFile", "TestWriteFile"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestBasicMemory(Challenge):
"""The first memory challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestRememberMultipleIds",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": ["TestBasicMemory"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestRememberMultipleIds(Challenge):
"""The first memory challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestRememberMultipleIdsWithNoise",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": ["TestRememberMultipleIds"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestRememberMultipleIdsWithNoise(Challenge):
"""The first memory challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestRememberMultiplePhrasesWithNoise",
"category": ["memory"],
"task": "Follow the instructions in the instructions_1.txt file",
"dependencies": ["TestRememberMultipleIdsWithNoise"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestRememberMultiplePhrasesWithNoise(Challenge):
"""The first memory challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestBasicRetrieval",
"category": ["retrieval"],
"task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
"dependencies": ["TestWriteFile"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestRetrieval(Challenge):
"""The first information-retrieval challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,7 +1,8 @@
{
"name": "TestRetrieval2",
"category": ["retrieval"],
"task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": ["TestRetrieval"],
"dependencies": ["TestBasicRetrieval"],
"ground": {
"answer": "81,462",
"should_contain": ["81,462"],

View File

@@ -1,13 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestRetrieval2(Challenge):
"""The first information-retrieval challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -1,4 +1,5 @@
{
"name": "TestRetrieval3",
"category": ["retrieval"],
"task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
"dependencies": ["TestRetrieval2"],

View File

@@ -1,14 +0,0 @@
from typing import Any, Dict
from agbenchmark.challenge import Challenge
class TestRetrieval3(Challenge):
"""The first information-retrieval challenge"""
def test_method(self, config: Dict[str, Any]) -> None:
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores

View File

@@ -0,0 +1,78 @@
import glob
import importlib
import json
import os
import types
from pathlib import Path
from typing import Any, Dict
import pytest
from dotenv import load_dotenv
from agbenchmark.challenge import Challenge
load_dotenv()
IMPROVE = os.getenv("IMPROVE", "False")
json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True)
def get_test_path(json_file: str) -> str:
abs_location = os.path.dirname(os.path.abspath(json_file))
path = Path(abs_location)
# Find the index of "agbenchmark" in the path parts
try:
agbenchmark_index = path.parts.index("agbenchmark")
except ValueError:
raise ValueError("Invalid challenge location.")
# Create the path from "agbenchmark" onwards
challenge_location = Path(*path.parts[agbenchmark_index:])
return str(challenge_location)
def generate_tests() -> None:
print("Generating tests...")
# Dynamic class creation
for json_file in json_files:
with open(json_file, "r") as f:
data = json.load(f)
class_name = data.get("name", "")
challenge_location = get_test_path(json_file)
# Define test class dynamically
challenge_class = types.new_class(class_name, (Challenge,))
setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)
# Define test method within the dynamically created class
def test_method(self, config: Dict[str, Any]) -> None: # type: ignore
self.setup_challenge(config)
scores = self.get_scores(config)
assert 1 in scores
# Parametrize the method here
test_method = pytest.mark.parametrize(
"challenge_data",
[data],
indirect=True,
)(test_method)
setattr(challenge_class, "test_method", test_method)
# Attach the new class to a module so it can be discovered by pytest
module = importlib.import_module(__name__)
setattr(module, class_name, challenge_class)
print(f"Generated test for {class_name}.")
generate_tests()
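Roughly, for each data.json the loop above builds the equivalent of the hand-written class below (an illustrative expansion only; the class name, location string and parametrized dict come from the JSON file, and the dict is truncated here):

from typing import Any, Dict

import pytest

from agbenchmark.challenge import Challenge


class TestWriteFile(Challenge):  # name taken from data["name"]
    CHALLENGE_LOCATION = "agbenchmark/challenges/interface/write_file"

    @pytest.mark.parametrize(
        "challenge_data",
        [{"name": "TestWriteFile"}],  # the real call passes the full JSON dict
        indirect=True,
    )
    def test_method(self, config: Dict[str, Any]) -> None:
        self.setup_challenge(config)
        scores = self.get_scores(config)
        assert 1 in scores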

View File

@@ -88,13 +88,16 @@ def check_regression(request: Any) -> None:
test_name = request.node.parent.name
data = get_regression_data()
# Get the true location of the test
challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
skip_string = f"Skipping {test_name} at {challenge_location}"
# Check if the test name exists in the regression tests
if request.config.getoption("--improve") and data.get(test_name, None):
pytest.skip("Skipping test because it's a regression test and --improve is set")
pytest.skip(f"{skip_string} because it's a regression test")
elif request.config.getoption("--maintain") and not data.get(test_name, None):
pytest.skip(
"Skipping test because it's not a regression test and --maintain is set"
)
pytest.skip(f"{skip_string} because it's not a regression test")
# this is to get the challenge_data from every test
@@ -109,15 +112,19 @@ regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
def pytest_runtest_makereport(item: Any, call: Any) -> None:
if call.when == "call":
challenge_data = item.funcargs.get("challenge_data", None)
difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
dependencies = challenge_data.dependencies if challenge_data else []
parts = item.nodeid.split("::")[0].split("/")
agbenchmark_index = parts.index("agbenchmark")
file_path = "/".join(parts[agbenchmark_index:])
difficulty = (
challenge_data["info"]["difficulty"] if challenge_data else "unknown"
)
dependencies = (
challenge_data["dependencies"] if challenge_data else []
)
# Extract the challenge_location from the class
challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
test_details = {
"difficulty": difficulty,
"dependencies": dependencies,
"test": file_path,
"test": challenge_location,
}
print("pytest_runtest_makereport", test_details)
@@ -132,19 +139,6 @@ def pytest_sessionfinish() -> None:
regression_manager.save()
# this is so that all tests can inherit from the Challenge class
def pytest_generate_tests(metafunc: Any) -> None:
if "challenge_data" in metafunc.fixturenames:
# Get the instance of the test class
test_class = metafunc.cls()
# Generate the parameters
params = test_class.data
# Add the parameters to the test function
metafunc.parametrize("challenge_data", [params], indirect=True)
# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
data = get_regression_data()
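The indirect parametrization used by the generator presupposes a challenge_data fixture that passes the JSON dict through; its body is outside these hunks, but a minimal sketch under that assumption would be:

from typing import Any

import pytest


@pytest.fixture
def challenge_data(request: Any) -> Any:
    # with indirect=True, the dict loaded from data.json arrives as request.param
    return request.param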

View File

@@ -1,59 +1,64 @@
{
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/memory/m1/m1_test.py"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
},
"TestRetrieval": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file/write_file_test.py"
"test": "agbenchmark\\challenges\\interface\\write_file"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file/read_file_test.py"
"test": "agbenchmark\\challenges\\interface\\read_file"
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark\\challenges\\memory\\m1"
},
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark\\challenges\\retrieval\\r1"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark\\challenges\\memory\\m2"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark\\challenges\\retrieval\\r2"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark\\challenges\\memory\\m3"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark\\challenges\\retrieval\\r3"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark\\challenges\\memory\\m4"
}
}