Just json, no test files (#77)

parent 573130549f
commit 3d43117554
@@ -11,9 +11,18 @@ class RegressionManager:
     def load(self) -> None:
         try:
             with open(self.filename, "r") as f:
-                self.tests = json.load(f)
-        except (FileNotFoundError, json.decoder.JSONDecodeError):
+                file_content = (
+                    f.read().strip()
+                )  # read the content and remove any leading/trailing whitespace
+                if file_content:  # if file is not empty, load the json
+                    self.tests = json.loads(file_content)
+                else:  # if file is empty, assign an empty dictionary
+                    self.tests = {}
+        except FileNotFoundError:
             self.tests = {}
+        except json.decoder.JSONDecodeError:  # If JSON is invalid
+            self.tests = {}
+        self.save()
 
     def save(self) -> None:
         with open(self.filename, "w") as f:
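The rewritten load() degrades gracefully: a missing file, an empty file, and invalid JSON all fall back to an empty dict, and the trailing self.save() rewrites the file so it is well-formed afterwards. A minimal stand-alone sketch of that behaviour (TolerantStore, its constructor, and the simplified save() are hypothetical stand-ins for the real RegressionManager, which is only partially visible in the hunk above):

import json

class TolerantStore:
    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.tests: dict = {}

    def load(self) -> None:
        try:
            with open(self.filename, "r") as f:
                file_content = f.read().strip()  # drop surrounding whitespace
                # empty file -> {}, otherwise parse the JSON
                self.tests = json.loads(file_content) if file_content else {}
        except FileNotFoundError:  # no file yet
            self.tests = {}
        except json.decoder.JSONDecodeError:  # corrupt or invalid JSON
            self.tests = {}
        self.save()  # rewrite so the file is always well-formed afterwards

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)

store = TolerantStore("regression_tests.json")
store.load()  # never raises for missing, empty, or corrupt files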
@@ -1,10 +1,8 @@
 import glob
-import inspect
 import os
 import subprocess
-import types
-from abc import ABC, ABCMeta
-from typing import Any, Dict, List, Tuple, Type, cast
+from abc import ABC
+from typing import Any, Dict, List
 
 from dotenv import load_dotenv
 
@@ -16,24 +14,12 @@ mock_test_str = os.getenv("MOCK_TEST")
 MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
 
 
-class ChallengeMeta(ABCMeta):
-    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
-        super().__init__(name, bases, dct)
-        try:
-            frame = cast(types.FrameType, inspect.currentframe())
-            assert frame.f_back is not None
-            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
-        except Exception as e:
-            print(f"Unable to get the file from 8 frames back due to: {str(e)}")
-            raise e
-
-
-class Challenge(ABC, metaclass=ChallengeMeta):
+class Challenge(ABC):
     """The parent class to all specific challenges classes.
     Defines helper methods for running a challenge"""
 
     _data_cache: Dict[str, ChallengeData] = {}
-    CHALLENGE_LOCATION: str
+    CHALLENGE_LOCATION: str = ""
 
     @property
     def data(self) -> ChallengeData:
@@ -54,10 +40,10 @@ class Challenge(ABC, metaclass=ChallengeMeta):
         from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent
 
         copy_artifacts_into_workspace(
-            config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION
+            config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
         )
 
-        run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION)
+        run_agent(self.task, config, self.CHALLENGE_LOCATION)
 
     def test_method(self, config: Dict[str, Any]) -> None:
         raise NotImplementedError
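Dropping the explicit self.__class__ hop is safe because attribute lookup on an instance already falls back to its class. A quick demonstration with hypothetical class names (not part of the commit):

# `self.CHALLENGE_LOCATION` and `self.__class__.CHALLENGE_LOCATION` resolve
# to the same class attribute unless an instance attribute shadows it.
class Base:
    CHALLENGE_LOCATION: str = ""

class MemoryChallenge(Base):  # hypothetical subclass
    CHALLENGE_LOCATION = "agbenchmark/challenges/memory/m1"

instance = MemoryChallenge()
assert instance.CHALLENGE_LOCATION == instance.__class__.CHALLENGE_LOCATION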
@@ -1,4 +1,5 @@
 {
+  "name": "TestDebugSimpleTypoWithGuidance",
   "category": ["code"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": ["TestReadFile", "TestWriteFile"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestDebugSimpleTypoWithGuidance(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,14 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestDebugSimpleTypoWithoutGuidance(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestDebugSimpleTypoWithoutGuidance",
   "category": ["code"],
   "task": "Make test.py run without errors.",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
@@ -19,6 +19,7 @@ class Ground(BaseModel):
 
 
 class ChallengeData(BaseModel):
+    name: str
     category: List[str]
     task: str
     dependencies: List[str]
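Because ChallengeData extends BaseModel (presumably pydantic's, as the class shape suggests), adding name: str makes the field required: a data.json without it now fails at load time instead of producing a nameless test. A minimal sketch, trimmed to the fields visible in this hunk:

from typing import List

from pydantic import BaseModel, ValidationError

class ChallengeData(BaseModel):  # trimmed; the real model has more fields
    name: str
    category: List[str]
    task: str
    dependencies: List[str]

try:
    ChallengeData(category=["code"], task="do x", dependencies=[])
except ValidationError as err:
    print(err)  # "name" is reported as a missing required field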
@@ -1,5 +1,5 @@
 {
-  "name": "ReadFile",
+  "name": "TestReadFile",
   "category": ["interface"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
   "dependencies": ["TestWriteFile"],
@@ -1,12 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestReadFile(Challenge):
-    """Testing if LLM can read a file"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,5 +1,5 @@
 {
-  "name": "WriteFile",
+  "name": "TestWriteFile",
   "category": ["interface"],
   "task": "Print the capital of America to a .txt file",
   "dependencies": [],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestWriteFile(Challenge):
-    """Testing if LLM can write to a file"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestBasicMemory",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestReadFile", "TestWriteFile"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestBasicMemory(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRememberMultipleIds",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestBasicMemory"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRememberMultipleIds(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRememberMultipleIdsWithNoise",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIds"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRememberMultipleIdsWithNoise(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRememberMultiplePhrasesWithNoise",
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIdsWithNoise"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRememberMultiplePhrasesWithNoise(Challenge):
-    """The first memory challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestBasicRetrieval",
   "category": ["retrieval"],
   "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
   "dependencies": ["TestWriteFile"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRetrieval(Challenge):
-    """The first information-retrieval challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,7 +1,8 @@
 {
+  "name": "TestRetrieval2",
   "category": ["retrieval"],
   "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
-  "dependencies": ["TestRetrieval"],
+  "dependencies": ["TestBasicRetrieval"],
   "ground": {
     "answer": "81,462",
     "should_contain": ["81,462"],
@@ -1,13 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRetrieval2(Challenge):
-    """The first information-retrieval challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-        assert 1 in scores
@@ -1,4 +1,5 @@
 {
+  "name": "TestRetrieval3",
   "category": ["retrieval"],
   "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
   "dependencies": ["TestRetrieval2"],
@@ -1,14 +0,0 @@
-from typing import Any, Dict
-
-from agbenchmark.challenge import Challenge
-
-
-class TestRetrieval3(Challenge):
-    """The first information-retrieval challenge"""
-
-    def test_method(self, config: Dict[str, Any]) -> None:
-        self.setup_challenge(config)
-
-        scores = self.get_scores(config)
-
-        assert 1 in scores
@@ -0,0 +1,78 @@
+import glob
+import importlib
+import json
+import os
+import types
+from pathlib import Path
+from typing import Any, Dict
+
+import pytest
+from dotenv import load_dotenv
+
+from agbenchmark.challenge import Challenge
+
+load_dotenv()
+
+IMPROVE = os.getenv("IMPROVE", "False")
+
+
+json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True)
+
+
+def get_test_path(json_file: str) -> str:
+    abs_location = os.path.dirname(os.path.abspath(json_file))
+
+    path = Path(abs_location)
+
+    # Find the index of "agbenchmark" in the path parts
+    try:
+        agbenchmark_index = path.parts.index("agbenchmark")
+    except ValueError:
+        raise ValueError("Invalid challenge location.")
+
+    # Create the path from "agbenchmark" onwards
+    challenge_location = Path(*path.parts[agbenchmark_index:])
+
+    return str(challenge_location)
+
+
+def generate_tests() -> None:
+    print("Generating tests...")
+    # Dynamic class creation
+    for json_file in json_files:
+        with open(json_file, "r") as f:
+            data = json.load(f)
+
+        class_name = data.get("name", "")
+
+        challenge_location = get_test_path(json_file)
+
+        # Define test class dynamically
+        challenge_class = types.new_class(class_name, (Challenge,))
+
+        setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location)
+
+        # Define test method within the dynamically created class
+        def test_method(self, config: Dict[str, Any]) -> None:  # type: ignore
+            self.setup_challenge(config)
+
+            scores = self.get_scores(config)
+            assert 1 in scores
+
+        # Parametrize the method here
+        test_method = pytest.mark.parametrize(
+            "challenge_data",
+            [data],
+            indirect=True,
+        )(test_method)
+
+        setattr(challenge_class, "test_method", test_method)
+
+        # Attach the new class to a module so it can be discovered by pytest
+        module = importlib.import_module(__name__)
+        setattr(module, class_name, challenge_class)
+
+        print(f"Generated test for {class_name}.")
+
+
+generate_tests()
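get_test_path trims everything before the "agbenchmark" path component, so the generated CHALLENGE_LOCATION values are repo-relative. A standalone demo of the same trimming (the example path is hypothetical; note that on Windows str(Path(...)) uses backslashes, which is exactly where the mixed separators in regression_tests.json further down come from):

import os
from pathlib import Path

def trim_to_agbenchmark(json_file: str) -> str:
    # same logic as get_test_path above: dirname, then keep the path parts
    # from the "agbenchmark" component onwards
    path = Path(os.path.dirname(os.path.abspath(json_file)))
    agbenchmark_index = path.parts.index("agbenchmark")
    return str(Path(*path.parts[agbenchmark_index:]))

print(trim_to_agbenchmark("/home/user/repo/agbenchmark/challenges/memory/m1/data.json"))
# prints "agbenchmark/challenges/memory/m1" on POSIX, backslash-separated on Windows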
@@ -88,13 +88,16 @@ def check_regression(request: Any) -> None:
     test_name = request.node.parent.name
     data = get_regression_data()
 
+    # Get the true location of the test
+    challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
+
+    skip_string = f"Skipping {test_name} at {challenge_location}"
+
     # Check if the test name exists in the regression tests
     if request.config.getoption("--improve") and data.get(test_name, None):
-        pytest.skip("Skipping test because it's a regression test and --improve is set")
+        pytest.skip(f"{skip_string} because it's a regression test")
     elif request.config.getoption("--maintain") and not data.get(test_name, None):
-        pytest.skip(
-            "Skipping test because it's not a regression test and --maintain is set"
-        )
+        pytest.skip(f"{skip_string} because it's not a regression test")
 
 
 # this is to get the challenge_data from every test
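check_regression reads --improve and --maintain from the pytest config; their registration is not shown in this diff. A typical conftest.py hook for it would look like the following sketch, under the assumption that both are simple boolean flags (the real repo's registration and help text may differ):

from typing import Any

def pytest_addoption(parser: Any) -> None:
    # hypothetical registration of the flags consumed by check_regression
    parser.addoption("--improve", action="store_true", default=False,
                     help="only run tests that are not yet regression tests")
    parser.addoption("--maintain", action="store_true", default=False,
                     help="only run tests that already are regression tests")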
@@ -109,15 +112,19 @@ regression_manager = RegressionManager(REGRESSION_TESTS_PATH)
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     if call.when == "call":
         challenge_data = item.funcargs.get("challenge_data", None)
-        difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
-        dependencies = challenge_data.dependencies if challenge_data else []
-        parts = item.nodeid.split("::")[0].split("/")
-        agbenchmark_index = parts.index("agbenchmark")
-        file_path = "/".join(parts[agbenchmark_index:])
+        difficulty = (
+            challenge_data["info"]["difficulty"] if challenge_data else "unknown"
+        )
+        dependencies = (
+            challenge_data["dependencies"] if challenge_data else []
+        )
+        # Extract the challenge_location from the class
+        challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
 
         test_details = {
             "difficulty": difficulty,
             "dependencies": dependencies,
-            "test": file_path,
+            "test": challenge_location,
         }
 
+        print("pytest_runtest_makereport", test_details)
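What the hook does with test_details after the print is cut off by the hunk; given that pytest_sessionfinish later calls regression_manager.save(), a plausible continuation records passing tests and evicts failing ones. This is a sketch only: the method names add_test and remove_test are assumptions, not confirmed by the diff.

from typing import Any, Dict

def record_outcome(regression_manager: Any, test_name: str,
                   test_details: Dict[str, Any], call: Any) -> None:
    # hypothetical continuation of pytest_runtest_makereport: keep the
    # regression file in sync with the outcome of the call phase
    if call.excinfo is None:   # test passed: remember it as a regression test
        regression_manager.add_test(test_name, test_details)
    else:                      # test failed: drop it from the regression set
        regression_manager.remove_test(test_name)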
@@ -132,19 +139,6 @@ def pytest_sessionfinish() -> None:
     regression_manager.save()
 
 
-# this is so that all tests can inherit from the Challenge class
-def pytest_generate_tests(metafunc: Any) -> None:
-    if "challenge_data" in metafunc.fixturenames:
-        # Get the instance of the test class
-        test_class = metafunc.cls()
-
-        # Generate the parameters
-        params = test_class.data
-
-        # Add the parameters to the test function
-        metafunc.parametrize("challenge_data", [params], indirect=True)
-
-
 # this is adding the dependency marker and category markers automatically from the json
 def pytest_collection_modifyitems(items: Any, config: Any) -> None:
     data = get_regression_data()
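The body of pytest_collection_modifyitems is truncated here; per its comment it turns the json's category and dependencies fields into markers. A sketch of that pattern, with hypothetical field access and assuming the pytest-depends plugin supplies the depends marker:

from typing import Any

import pytest

def apply_markers(items: Any) -> None:
    # hypothetical marker wiring for the truncated hook above: one marker
    # per category, plus a dependency marker from pytest-depends
    for item in items:
        data = getattr(item.cls, "data", None)  # the ChallengeData, if any
        if data is None:
            continue
        for category in data.category:
            item.add_marker(getattr(pytest.mark, category))
        item.add_marker(pytest.mark.depends(on=data.dependencies))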
@@ -1,59 +1,64 @@
 {
-  "TestBasicMemory": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/memory/m1/m1_test.py"
-  },
-  "TestRememberMultipleIds": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestBasicMemory"
-    ],
-    "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py"
-  },
-  "TestRememberMultipleIdsWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIds"
-    ],
-    "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
-  },
-  "TestRememberMultiplePhrasesWithNoise": {
-    "difficulty": "medium",
-    "dependencies": [
-      "TestRememberMultipleIdsWithNoise"
-    ],
-    "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
-  },
-  "TestRetrieval": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
-  },
-  "TestRetrieval2": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestRetrieval"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
-  },
-  "TestRetrieval3": {
-    "difficulty": "basic",
-    "dependencies": [
-      "TestRetrieval2"
-    ],
-    "test": "agbenchmark/challenges/retrieval/r3/r3_test.py"
-  },
   "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
-    "test": "agbenchmark/challenges/interface/write_file/write_file_test.py"
+    "test": "agbenchmark\\challenges\\interface\\write_file"
   },
   "TestReadFile": {
     "difficulty": "basic",
     "dependencies": [
       "TestWriteFile"
     ],
-    "test": "agbenchmark/challenges/interface/read_file/read_file_test.py"
+    "test": "agbenchmark\\challenges\\interface\\read_file"
   },
+  "TestBasicMemory": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestReadFile",
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m1"
+  },
+  "TestBasicRetrieval": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestWriteFile"
+    ],
+    "test": "agbenchmark\\challenges\\retrieval\\r1"
+  },
+  "TestRememberMultipleIds": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicMemory"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m2"
+  },
+  "TestRetrieval2": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestBasicRetrieval"
+    ],
+    "test": "agbenchmark\\challenges\\retrieval\\r2"
+  },
+  "TestRememberMultipleIdsWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIds"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m3"
+  },
+  "TestRetrieval3": {
+    "difficulty": "basic",
+    "dependencies": [
+      "TestRetrieval2"
+    ],
+    "test": "agbenchmark\\challenges\\retrieval\\r3"
+  },
+  "TestRememberMultiplePhrasesWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [
+      "TestRememberMultipleIdsWithNoise"
+    ],
+    "test": "agbenchmark\\challenges\\memory\\m4"
+  }
 }
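The rewritten "test" entries above use backslashes because str(Path(...)) was evidently run on Windows, while the untouched entries keep forward slashes. If the file needs to be comparable across platforms, a small normalizer like this suffices (a hypothetical helper, not part of this commit):

from pathlib import PureWindowsPath

def normalize_location(location: str) -> str:
    # PureWindowsPath accepts both separator styles; as_posix() emits "/"
    return PureWindowsPath(location).as_posix()

assert normalize_location("agbenchmark\\challenges\\memory\\m1") == "agbenchmark/challenges/memory/m1"
assert normalize_location("agbenchmark/challenges/retrieval/r1") == "agbenchmark/challenges/retrieval/r1"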