From a5073ab57790a84d146877e1b6512eecbfc12b09 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: [PATCH 01/20] basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +------------- .../challenges/retrieval/r1/r1_data.json | 10 +++++-- .../challenges/retrieval/r1/r1_test.py | 6 ++-- agbenchmark/mocks/tests/basic_mocks.py | 28 ++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +---- .../read_file/r_file_data.json | 15 ++++++++++ .../read_file/read_file_test.py | 29 +++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../write_file/w_file_data.json | 16 ++++++++++ .../write_file/write_file_test.py | 27 +++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 +- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py 
index 2db22ae4d..9434d69c3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc31004..08b74d1b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b9..d37c5e795 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29bb..eb7b96541 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. 
+ Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde17..2481de060 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. - Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 000000000..55319ddfc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 000000000..610ccdab6 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 000000000..4aaa1347d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git 
a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 000000000..ccb10fe70 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a2..6f79e75ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts] From 66c9e68b0430066d23e9acd66e5259ea5d5190d7 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:15:53 -0400 Subject: [PATCH 02/20] file creation from within file before server :) --- agbenchmark/conftest.py | 2 +- agbenchmark/mocks/tests/basic_mocks.py | 2 +- .../tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++ agbenchmark/tests/regression/regression_tests.txt | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 908d39e89..434f6dbde 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -17,7 +17,7 @@ def config(): return config -@pytest.fixture +@pytest.fixture(scope="module") def workspace(config): yield config["workspace"] # teardown after test function completes diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index eb7b96541..bbff6a9c7 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. 
""" - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] From 4fa9f72083aa09bf1770f10a3254c4d0ef674a9a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: [PATCH 03/20] adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../basic_abilities/read_file/read_file_test.py | 1 + .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. 
""" - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] From f895d54e02c92e262172d9a773f7e6a4870d435d Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: [PATCH 04/20] more elegant marking & dependency solution --- README.md | 74 +++++++++++++++++-- agbenchmark/challenges/README.md | 38 +++++----- agbenchmark/challenges/define_task_types.py | 1 + .../challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../read_file/r_file_data.json | 1 + .../read_file/read_file_test.py | 12 
+-- .../write_file/w_file_data.json | 1 + .../write_file/write_file_test.py | 9 +-- .../tests/regression/regression_tests.txt | 2 + poetry.lock | 17 ++++- pyproject.toml | 1 + 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py diff --git a/README.md b/README.md index 0a8d119af..0ad0cf345 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,73 @@ Share your progress :) to create a test: -``` -@pytest.mark.parametrize( -"server_response", -["VARIABLE"], # VARIABLE = the query/goal you provide to the model -indirect=True, +```python +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from ..CategoryChallenge import CategoryChallenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") ) -@pytest.mark.(VARIABLE) # VARIABLE = category of the test -def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts -assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) + +class TestSomething(CategoryChallenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + def test_retrieval( + self, workspace + ): + # scoring logic goes here +``` + +All challenges will inherit from parent class which has the mark + +```python +@pytest.mark.basic +class BasicChallenge(Challenge): + pass +``` + +If you want to add a custom mark to a Challenge, you must specify it before the test definition + +```python +@pytest.mark.other_mark +def test_retrieval(self, workspace): +``` + +To add a dependency to a challenge use the following + +```python +# to defining what a test depends on +from pytest_dependency import depends + +def test1(self, request, workspace): + depends(request, data.dependencies) +# for defining a test as a dependency +@pytest.mark.dependency() +def test2 +``` + +Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards + +```python +@pytest.mark.run(order=1) +``` + +To create a file to test a challenge, add this to the challenge file which will create a file before running the server + +```python +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) ``` ## Api diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 50efe2c4d..d5229e937 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,28 +4,25 @@ Input: -- **category** (str): information-retrieval -- **difficulty**(str): the difficulty of this query. choices from - -## Information-retrieval challenges - -Input: - -- **category** (str): information-retrieval -- **task** (str): the question the agent needs to be solve. +- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ +- **task** (str): The task that the agent needs to solve. +- **dependencies** (str[]): The dependencies that the challenge needs to run. - **ground** (dict): The ground truth. 
- - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": 
["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] 
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] From d1c5e0a91a7a0f23b0e8de5f394204e96ec668cd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: [PATCH 05/20] finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ .../challenges/retrieval/r1/r1_data.json | 4 ++-- .../challenges/retrieval/r1/r1_test.py | 6 ++++-- .../tests/basic_abilities/BasicChallenge.py | 1 + .../read_file/r_file_data.json | 4 +++- .../read_file/read_file_test.py | 6 ++---- .../write_file/write_file_test.py | 1 - .../tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ 
b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt 
b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] From 31c11927199714516891db5aa3044eb1a4396eb4 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: [PATCH 06/20] other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 ++--- .../challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../read_file/r_file_data.json | 4 +- .../read_file/read_file_test.py | 2 +- .../write_file/write_file_test.py | 1 + .../tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++--------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. 
+- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf4..45becaf75 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86cc..563207405 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e7..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e0..494a9b071 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class 
TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < 
\"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = "*" -files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, -] - -[package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] From adc6b225a6063bc2b0981f1156f25bde9279040e Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: [PATCH 07/20] update regression tests info --- .../challenges/retrieval/r1/r1_test.py | 7 +++- agbenchmark/conftest.py | 36 +++++++++++++------ .../read_file/read_file_test.py | 5 +++ .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 5 +++ .../tests/regression/RegressionManager.py | 25 ++++++++----- .../tests/regression/regression_tests.json | 1 + .../tests/regression/regression_tests.txt | 17 +++++++-- 8 files changed, 73 
insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From 7604ae07bb6d79cfe8e5a28fdf3fa85c83603b1b Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: [PATCH 08/20] can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 ++++++++++++++++++- .../challenges/retrieval/r1/r1_test.py | 12 +++++----- .../read_file/read_file_test.py | 12 +++++----- .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 12 +++++----- .../tests/regression/regression_tests.json | 15 ++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace + matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + 
else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- 
a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From 4be22ae5abc884404370196bf71da86affe82131 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: [PATCH 09/20] mini agi attempt --- agbenchmark/conftest.py | 44 +++++++++++-------- .../tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 ++++++++++++ agent/mini-agi | 1 + 4 files changed, 55 insertions(+), 32 deletions(-) create mode 100644 agent/agbenchmark_run.py create mode 160000 agent/mini-agi diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ 
b/agbenchmark/tests/regression/regression_tests.json @@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..d2add8f18 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b From 8c44b9eddf7c566d5e39f7e11149772b96e23a5f Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: [PATCH 10/20] basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +------------- .../challenges/retrieval/r1/r1_data.json | 10 +++++-- .../challenges/retrieval/r1/r1_test.py | 6 ++-- agbenchmark/mocks/tests/basic_mocks.py | 28 ++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +---- .../read_file/r_file_data.json | 15 ++++++++++ .../read_file/read_file_test.py | 29 +++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../write_file/w_file_data.json | 16 ++++++++++ .../write_file/write_file_test.py | 27 +++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 +- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in 
ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 2db22ae4d..9434d69c3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc31004..08b74d1b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b9..d37c5e795 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from 
agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29bb..eb7b96541 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde17..2481de060 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
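The `mock_func` field in each challenge JSON is only a name; at run time the conftest's `server_response` fixture hands that name to `MockManager.delegate`, which has to resolve it to one of the callables defined in the mocks module above. A minimal sketch of such name-based dispatch, assuming the mock callables live in `agbenchmark.mocks.tests.basic_mocks` — this is an illustrative helper, not the actual `MockManager` implementation:

```python
import importlib


def delegate_mock(mock_function_name: str, task: str, workspace: str) -> None:
    # Resolve the module that holds the mock callables (assumed location).
    mocks = importlib.import_module("agbenchmark.mocks.tests.basic_mocks")

    # Look the callable up by the name stored in the challenge JSON.
    mock_func = getattr(mocks, mock_function_name, None)
    if mock_func is None:
        raise ValueError(f"Unknown mock function: {mock_function_name!r}")

    # Every mock takes the task string and the workspace path.
    mock_func(task, workspace)
```

Dispatching by string keeps the challenge JSON declarative: swapping `basic_write_file_mock` for a real agent response is a data change, not a code change.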
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 000000000..55319ddfc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 000000000..610ccdab6 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 000000000..4aaa1347d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 000000000..ccb10fe70 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) 
+ + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a2..6f79e75ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts] From 22458a04e81f6a4e200581fe4046182b96f6e17c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:15:53 -0400 Subject: [PATCH 11/20] file creation from within file before server :) --- agbenchmark/conftest.py | 2 +- agbenchmark/mocks/tests/basic_mocks.py | 2 +- .../tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++ agbenchmark/tests/regression/regression_tests.txt | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 908d39e89..434f6dbde 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -17,7 +17,7 @@ def config(): return config -@pytest.fixture +@pytest.fixture(scope="module") def workspace(config): yield config["workspace"] # teardown after test function completes diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index eb7b96541..bbff6a9c7 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] From 60a7ac2343df15127e38da5d490edab887f81608 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: [PATCH 12/20] adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../basic_abilities/read_file/read_file_test.py | 1 + .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git 
a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] From 2f28a66591ea37715282271ccf92560e89a7924a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: [PATCH 13/20] more elegant marking & 
dependency solution --- README.md | 74 +++++++++++++++++-- agbenchmark/challenges/README.md | 38 +++++----- agbenchmark/challenges/define_task_types.py | 1 + .../challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../read_file/r_file_data.json | 1 + .../read_file/read_file_test.py | 12 +-- .../write_file/w_file_data.json | 1 + .../write_file/write_file_test.py | 9 +-- .../tests/regression/regression_tests.txt | 2 + poetry.lock | 17 ++++- pyproject.toml | 1 + 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py diff --git a/README.md b/README.md index 0a8d119af..0ad0cf345 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,73 @@ Share your progress :) to create a test: -``` -@pytest.mark.parametrize( -"server_response", -["VARIABLE"], # VARIABLE = the query/goal you provide to the model -indirect=True, +```python +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from ..CategoryChallenge import CategoryChallenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") ) -@pytest.mark.(VARIABLE) # VARIABLE = category of the test -def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts -assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) + +class TestSomething(CategoryChallenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + def test_retrieval( + self, workspace + ): + # scoring logic goes here +``` + +All challenges will inherit from parent class which has the mark + +```python +@pytest.mark.basic +class BasicChallenge(Challenge): + pass +``` + +If you want to add a custom mark to a Challenge, you must specify it before the test definition + +```python +@pytest.mark.other_mark +def test_retrieval(self, workspace): +``` + +To add a dependency to a challenge use the following + +```python +# to defining what a test depends on +from pytest_dependency import depends + +def test1(self, request, workspace): + depends(request, data.dependencies) +# for defining a test as a dependency +@pytest.mark.dependency() +def test2 +``` + +Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards + +```python +@pytest.mark.run(order=1) +``` + +To create a file to test a challenge, add this to the challenge file which will create a file before running the server + +```python +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) ``` ## Api diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 50efe2c4d..d5229e937 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,28 +4,25 @@ Input: -- **category** (str): information-retrieval -- **difficulty**(str): the difficulty of this query. choices from - -## Information-retrieval challenges - -Input: - -- **category** (str): information-retrieval -- **task** (str): the question the agent needs to be solve. +- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ +- **task** (str): The task that the agent needs to solve. 
+- **dependencies** (str[]): The dependencies that the challenge needs to run. - **ground** (dict): The ground truth. - - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + 
"dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] 
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] From 06a6f080543ddffd8baf3aaf51ec97ff1fce86b3 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: [PATCH 14/20] finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ .../challenges/retrieval/r1/r1_data.json | 4 ++-- .../challenges/retrieval/r1/r1_test.py | 6 ++++-- .../tests/basic_abilities/BasicChallenge.py | 1 + .../read_file/r_file_data.json | 4 +++- .../read_file/read_file_test.py | 6 ++---- .../write_file/write_file_test.py | 1 - .../tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py 
index d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff 
--git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] From a2f79760ce8abdddfc27c5b0b42a58df903b352c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: [PATCH 15/20] other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 ++--- .../challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../read_file/r_file_data.json | 4 +- .../read_file/read_file_test.py | 2 +- .../write_file/write_file_test.py | 1 + .../tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++--------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. 
+- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf4..45becaf75 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86cc..563207405 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e7..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e0..494a9b071 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class 
TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < 
\"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = "*" -files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, -] - -[package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] From 2411c35d0eb0af6ff0fb4a64ac2b431ea2d41adb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: [PATCH 16/20] update regression tests info --- .../challenges/retrieval/r1/r1_test.py | 7 +++- agbenchmark/conftest.py | 36 +++++++++++++------ .../read_file/read_file_test.py | 5 +++ .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 5 +++ .../tests/regression/RegressionManager.py | 25 ++++++++----- .../tests/regression/regression_tests.json | 1 + .../tests/regression/regression_tests.txt | 17 +++++++-- 8 files changed, 73 
insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From d6a6e69f2e3ed1cd4bb1715ae737ad50d6b17cb9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: [PATCH 17/20] can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 ++++++++++++++++++- .../challenges/retrieval/r1/r1_test.py | 12 +++++----- .../read_file/read_file_test.py | 12 +++++----- .../write_file/w_file_data.json | 2 +- .../write_file/write_file_test.py | 12 +++++----- .../tests/regression/regression_tests.json | 15 ++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace + matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + 
else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- 
a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file From fa0df12439b7beea91a46f08e7f6154900dc1047 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: [PATCH 18/20] mini agi attempt --- agbenchmark/conftest.py | 44 +++++++++++-------- .../tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 ++++++++++++ 3 files changed, 54 insertions(+), 32 deletions(-) create mode 100644 agent/agbenchmark_run.py diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json 
@@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) From f933717d8b6f28e268437e000a57e187076287af Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 18:17:54 -0400 Subject: [PATCH 19/20] mini-agi, simple challenge creation, --mock flag --- .env.example | 4 + README.md | 2 +- agbenchmark/Challenge.py | 53 ++++++++- agbenchmark/challenges/define_task_types.py | 12 +- .../challenges/retrieval/r1/r1_data.json | 12 +- .../challenges/retrieval/r1/r1_test.py | 24 +--- agbenchmark/config.json | 2 +- agbenchmark/conftest.py | 103 ++++++++++++------ agbenchmark/start_benchmark.py | 20 +++- .../tests/basic_abilities/BasicChallenge.py | 2 + .../read_file/r_file_data.json | 7 +- .../read_file/read_file_test.py | 43 +++----- .../write_file/w_file_data.json | 8 +- .../write_file/write_file_test.py | 26 ++--- .../tests/regression/regression_tests.json | 15 ++- .../tests/regression/regression_tests.txt | 14 --- agent/agbenchmark_run.py | 27 ----- poetry.lock | 16 ++- pyproject.toml | 3 +- 19 files changed, 235 insertions(+), 158 deletions(-) create mode 100644 .env.example delete mode 100644 agbenchmark/tests/regression/regression_tests.txt delete mode 100644 agent/agbenchmark_run.py diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..0a91118a9 --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +OPENAI_API_KEY= +AGENT_NAME=mini-agi +AGENT_TIMEOUT=60 +MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 0ad0cf345..794279478 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ class TestSomething(CategoryChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( - "server_response", + "run_agent", [(data.task, data.mock_func)], indirect=True, ) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index d159296b1..f644abc4a 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,12 +1,63 @@ import os import glob +import pytest +from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground +from agbenchmark.challenges.define_task_types import ChallengeData +from dotenv import load_dotenv, set_key + +load_dotenv() + +mock_test_str = os.getenv("MOCK_TEST") +MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str 
else False -class Challenge: +class Challenge(ABC): """The parent class to all specific challenges classes. Defines helper methods for running a challenge""" + @abstractmethod + def get_file_path(self) -> str: + """This should be implemented by any class which inherits from BasicChallenge""" + pass + + @property + def data(self) -> ChallengeData: + return ChallengeData.deserialize(self.get_file_path()) + + @property + def mock(self): + return self.data.mock.mock_func if self.data.mock else None + + @property + def task(self): + return ( + self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task + ) + + @property + def dependencies(self) -> list: + print("self.data.dependencies", self.data.dependencies) + return self.data.dependencies + + @property + def name(self) -> str: + print("self.data.name", self.data.name) + return self.data.name + + @pytest.mark.parametrize( + "run_agent", + [(task, mock)], + indirect=True, + ) + @pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + ) + def test_method(self, workspace): + raise NotImplementedError + @staticmethod def open_file(workspace: str, filename: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 694671218..7fc2361b5 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,11 @@ import json import os +class Mock(BaseModel): + mock_func: str + mock_task: Optional[str] = None + + class Info(BaseModel): difficulty: str description: str @@ -12,17 +17,18 @@ class Info(BaseModel): class Ground(BaseModel): answer: str - should_contain: Optional[List[str]] - should_not_contain: Optional[List[str]] + should_contain: Optional[List[str]] = None + should_not_contain: Optional[List[str]] = None files: List[str] class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] ground: Ground - mock_func: Optional[str] = None + mock: Optional[Mock] = None info: Info def serialize(self, path: str) -> None: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 562d1c364..80c5e51eb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,16 +1,20 @@ { + "name": "retrieval1", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
}, - "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 2a7d92a71..0bd907d8a 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -4,30 +4,18 @@ from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r1_data.json") -) - - class TestRetrieval1(RetrievalChallenge): """The first information-retrieval challenge""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - def test_retrieval(self, workspace, current_challenge_data): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r1_data.json") + + def test_method(self, workspace): + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d285627e5..9e5c1880f 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { "hostname": "localhost", "port": 8080, - "workspace": "agbenchmark/mocks/workspace" + "workspace": "C:/Users/silen/miniagi" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index b3b69f194..4edd4b5e0 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -4,18 +4,24 @@ import pytest import shutil from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests -from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.challenges.define_task_types import ChallengeData import subprocess +from agbenchmark.Challenge import Challenge +from dotenv import load_dotenv + +load_dotenv() @pytest.fixture(scope="module") -def config(): +def config(request): config_file = os.path.abspath("agbenchmark/config.json") print(f"Config file: {config_file}") with open(config_file, "r") as f: config = json.load(f) + + if request.config.getoption("--mock"): + config["workspace"] = "agbenchmark/mocks/workspace" + return config @@ -34,43 +40,49 @@ def workspace(config): print(f"Failed to delete {file_path}. 
Reason: {e}") +def pytest_addoption(parser): + parser.addoption("--mock", action="store_true", default=False) + + +AGENT_NAME = os.getenv("AGENT_NAME") +AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") + + @pytest.fixture(autouse=True) -def server_response(request, config): +def run_agent(request, config): """Calling to get a response""" if isinstance(request.param, tuple): task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] + mock_function_name = request.param[1] or None else: task = request.param mock_function_name = None - # get the current file's directory - current_dir = os.path.dirname(os.path.abspath(__file__)) + if mock_function_name != None and (request.config.getoption("--mock")): + if mock_function_name: + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_function_name) + mock_manager.delegate(mock_function_name) + else: + print("No mock provided") + else: + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - # construct the script's path - script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + try: + timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - # form the command - command = ["python", script_path, task] - - # if mock_function_name: - # mock_manager = MockManager( - # task - # ) # workspace doesn't need to be passed in, stays the same - # print("Server unavailable, using mock", mock_function_name) - # mock_manager.delegate(mock_function_name) - # else: - # print("No mock provided") - - try: - # run the command and wait for it to complete - result = subprocess.run( - command, shell=True, check=True, text=True, capture_output=True - ) - return result - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with the following error:\n{e}") - # If the subprocess returns a non-zero exit status + subprocess.run( + ["python", "miniagi.py", task], + check=True, + cwd=path, + timeout=timeout + # text=True, + # capture_output=True + ) + except subprocess.TimeoutExpired: + print("The subprocess has exceeded the time limit and was terminated.") regression_json = "agbenchmark/tests/regression/regression_tests.json" @@ -80,13 +92,13 @@ regression_manager = RegressionManager(regression_json) # this is to get the challenge_data from every test @pytest.fixture(autouse=True) -def regression_data(request): +def challenge_data(request): return request.param def pytest_runtest_makereport(item, call): if call.when == "call": - challenge_data = item.funcargs.get("regression_data", None) + challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" dependencies = challenge_data.dependencies if challenge_data else [] @@ -105,9 +117,9 @@ def pytest_runtest_makereport(item, call): def pytest_collection_modifyitems(items): """Called once all test items are collected. 
Used - to add regression marker to collected test items.""" + to add regression and depends markers to collected test items.""" for item in items: - print("pytest_collection_modifyitems", item.nodeid) + # regression add if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) @@ -116,3 +128,26 @@ def pytest_collection_modifyitems(items): def pytest_sessionfinish(): """Called at the end of the session to save regression tests""" regression_manager.save() + + +# this is so that all tests can inherit from the Challenge class +def pytest_generate_tests(metafunc): + if "challenge_data" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = test_class.data + + # Add the parameters to the test function + metafunc.parametrize("challenge_data", [params], indirect=True) + + if "run_agent" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = [(test_class.task, test_class.mock)] + + # Add the parameters to the test function + metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 6adcc09bf..ac612293a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -2,6 +2,10 @@ import click import pytest import json import os +from pathlib import Path +from dotenv import load_dotenv, set_key + +load_dotenv() @click.group() @@ -12,8 +16,8 @@ def cli(): @cli.command() @click.option("--category", default=None, help="Specific category to run") @click.option("--noreg", is_flag=True, help="Skip regression tests") -def start(category, noreg): - """Start the benchmark tests. If a category flag is is provided, run the categories with that mark.""" +@click.option("--mock", is_flag=True, help="Run with mock") +def start(category, noreg, mock): """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" config_file = "agbenchmark/config.json" @@ -28,7 +32,8 @@ def start(category, noreg): ) config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( - "Please enter a new workspace path", default="agbenchmark/mocks/workspace" + "Please enter a new workspace path", + default=os.path.join(Path.home(), "miniagi"), ) with open(config_dir, "w") as f: @@ -38,13 +43,17 @@ def start(category, noreg): with open(config_dir, "r") as f: config = json.load(f) + set_key(".env", "MOCK_TEST", "True" if mock else "False") + if mock: + config["workspace"] = "agbenchmark/mocks/workspace" + # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) regression_path = os.path.abspath( - "agbenchmark/tests/regression/regression_tests.txt" + "agbenchmark/tests/regression/regression_tests.json" ) if not os.path.exists(regression_path): with open(regression_path, "a"): @@ -74,6 +83,9 @@ def start(category, noreg): else: print("Running all categorys") # run all categorys + if mock: + pytest_args.append("--mock") + # Run pytest with the constructed arguments pytest.main(pytest_args) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..6e7f73100 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -1,5 +1,7 @@ import pytest from agbenchmark.Challenge import Challenge +from agbenchmark.challenges.define_task_types import ChallengeData +from abc import abstractmethod @pytest.mark.basic diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..b21e2724b 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,13 +1,16 @@ { + "name": "basic_read_file", "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": ["basic_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "basic_read_file_mock", + "mock": { + "mock_func": "basic_read_file_mock" + }, "info": { "description": "This reads the file quickly", "difficulty": "basic", diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 90946670c..68288a42c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -4,39 +4,30 @@ from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) - - -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: - Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" - ) - class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.mark.parametrize( - 
"server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(on=data.dependencies) - def test_read_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + @pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def setup_module(self, workspace): + Challenge.write_to_file( + workspace, self.data.ground.files[0], "this is how we're doing" + ) + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r_file_data.json") + + @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") + def test_method( + self, workspace + ): # run_test is a common name that all tests must implement + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 037c5bd88..358ebb538 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { + "name": "basic_write_file", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", @@ -8,7 +9,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": [".txt"] }, - "mock_func": "basic_write_file_mock", + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
+ }, "info": { "difficulty": "basic", "description": "Tests the writing to file", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 187378ff1..8caa6605a 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -3,31 +3,21 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "w_file_data.json") -) - class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(name="test_write_file") - def test_write_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") + + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + print("my workspace is ", workspace) + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..8a6278fea 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + } +} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt deleted file mode 100644 index 8af722f07..000000000 --- a/agbenchmark/tests/regression/regression_tests.txt +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { - "difficulty": "easy", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py deleted file mode 100644 index f509f5e66..000000000 --- a/agent/agbenchmark_run.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse -import subprocess -import os - - -def main(objective): - # get the current directory - current_dir = 
os.path.dirname(os.path.abspath(__file__)) - - # form the command - command = ( - f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" - ) - - # run the command - subprocess.run(command, shell=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") - parser.add_argument( - "objective", type=str, help="The objective to pass to miniagi.py" - ) - - args = parser.parse_args() - - main(args.objective) diff --git a/poetry.lock b/poetry.lock index d7939fbfe..7b2477bc6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -644,6 +644,20 @@ future-fstrings = "*" networkx = "*" pytest = ">=3" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "requests" version = "2.31.0" @@ -814,4 +828,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" +content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" diff --git a/pyproject.toml b/pyproject.toml index 0a4f8ba73..043fe68a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" +python-dotenv = "^1.0.0" [build-system] @@ -30,7 +31,7 @@ testpaths = [ markers = [ "retrieval", "regression", - "basic" + "basic", ] [tool.poetry.scripts] From 76ee994d2c7a205799bc7c07adfa70f0c93102e9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 19:19:14 -0400 Subject: [PATCH 20/20] read mes, remove port and host from config, etc --- .env.example | 1 - README.md | 154 ++++++------------ agbenchmark/challenges/README.md | 31 ++-- agbenchmark/config.json | 4 +- agbenchmark/mocks/basic_gpt_agent.py | 20 --- agbenchmark/mocks/tests/basic_mocks.py | 12 +- agbenchmark/start_benchmark.py | 4 - .../read_file/read_file_test.py | 5 +- .../write_file/write_file_test.py | 1 - .../tests/regression/regression_tests.json | 7 - 10 files changed, 73 insertions(+), 166 deletions(-) delete mode 100644 agbenchmark/mocks/basic_gpt_agent.py diff --git a/.env.example b/.env.example index 0a91118a9..7782d048e 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,3 @@ -OPENAI_API_KEY= AGENT_NAME=mini-agi AGENT_TIMEOUT=60 MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 794279478..2c8daa0ad 100644 --- a/README.md +++ b/README.md @@ -2,80 +2,70 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work +## As a user + +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to run and kill agent +3. `agbenchmark start` + - `--category challenge_category` to run tests in a specific category + - `--mock` to only run mock tests if they exists for each test + - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests +4. We call boilerplate code for your agent +5. 
Show pass rate of tests, logs, and any other metrics + +## Contributing + ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x -### To run the basic existing mock (June 21) +### To run the existing mocks 1. clone the repo `auto-gpt-benchmarks` 2. `pip install poetry` 3. `poetry shell` 4. `poetry install` -5. `agbenchmark start` +5. `cp .env_example .env` +6. `agbenchmark start --mock` Keep config the same and watch the logs :) +### To run with mini-agi + +1. Navigate to `auto-gpt-benchmarks/agent/mini-agi` +2. `pip install -r requirements.txt` +3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Sset `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10^ installed +4. Make sure to follow the commands above, and remove mock flag `agbenchmark start` + - To add requirements `poetry add requirement`. Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. -If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit +If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to last working commit Let people know what beautiful code you write does, document everything well Share your progress :) -## How this works - -1. `pip install auto-gpt-benchmarks` -2. Add boilerplate code to start webserver to your agent (run loop and stop condition) -3. `agbenchmark start --category challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory -4. We call the server to run the agent for each test -5. Show pass rate of tests, logs, and any other metrics - -### To run the basic existing mock (June 21) - -1. clone the repo `auto-gpt-benchmarks` -2. `pip install poetry` -3. `poetry shell` -4. `poetry install` -5. 
`agbenchmark start` - Keep config the same and watch the logs :) - -#### Bonuses - -- You can adds tests by git cloning auto-gpt-benchmarks to your repo -- Agent is abstracted from benchmark, don't need to do any extra setup other then starting the server -- Simple, easy to use -- Don't have to deal with cloud or parallelization yet - ### Pytest -to create a test: +an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic ```python import pytest -from agbenchmark.challenges.define_task_types import ChallengeData -from ..CategoryChallenge import CategoryChallenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) -class TestSomething(CategoryChallenge): - """Testing if LLM can read a file""" +class TestWriteFile(BasicChallenge): + """Testing if LLM can write to a file""" - @pytest.mark.parametrize( - "run_agent", - [(data.task, data.mock_func)], - indirect=True, - ) - def test_retrieval( - self, workspace - ): - # scoring logic goes here + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") + + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + # implement scoring logic by looking at workspace ``` -All challenges will inherit from parent class which has the mark +All challenges will inherit from parent class which has the mark and any specific methods for their category ```python @pytest.mark.basic @@ -83,50 +73,23 @@ class BasicChallenge(Challenge): pass ``` -If you want to add a custom mark to a Challenge, you must specify it before the test definition - -```python -@pytest.mark.other_mark -def test_retrieval(self, workspace): -``` - -To add a dependency to a challenge use the following - -```python -# to defining what a test depends on -from pytest_dependency import depends - -def test1(self, request, workspace): - depends(request, data.dependencies) -# for defining a test as a dependency -@pytest.mark.dependency() -def test2 -``` - -Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards - -```python -@pytest.mark.run(order=1) -``` - To create a file to test a challenge, add this to the challenge file which will create a file before running the server ```python -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: +@pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def setup_module(self, workspace): Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" + workspace, self.data.ground.files[0], "this is how we're doing" ) ``` -## Api - -FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server +#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) ## Workspace -Defined by the user on config +If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. 
Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config #### Dataset @@ -138,9 +101,9 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git |-- auto-gpt-benchmarks/ **main project directory** | |-- metrics.py **combining scores, metrics, final evaluation** | |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **shared fixtures across all tests** -| |-- Challenge.py **easy challenge creation class?** -| |-- config.json **hostname, port, workspace folder** +| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** +| |-- Challenge.py **easy challenge creation class** +| |-- config.json **workspace folder** | |-- challenges/ **challenges across different domains** | | |-- adaptability/ | | |-- basic_abilities/ @@ -149,28 +112,7 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git | | |-- retrieval/ | | |-- web_navigation/ | | |-- writing/ -| |-- tests/ **challenges across different metrics** -| | |-- basic_abilities/ -| | |-- interface/ -| |-- workspace/ **workspace related func** -| | |-- **init**.py -| | |-- workspace_manager.py **creation, deletion** +| |-- tests/ +| | |-- basic_abilities/ **every llm should pass these challenges** +| | |-- regression/ **challenges that already passed** ``` - -### Easy Challenge Creation - -tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval - -#### Written Challenges - -For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore - -#### Validators - -Designed to handle specific types of output (e.g., text, code, structured data) - -#### Logging - -Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc - -Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index e457b85c4..9e74d19ce 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,7 +4,8 @@ Input: -- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ +- **name** (str): Name of the challenge. +- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. - **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. @@ -12,7 +13,9 @@ Input: - **should_contain** (list): The exact strings that are required in the final answer. - **should_not_contain** (list): The exact strings that should not be in the final answer. - **files** (list): Files that are used for retrieval. Can specify file here or an extension. -- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **mock** (dict): Mock response for testing. + - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. + - **mock_task** (str): Task to provide for the mock function. - **info** (dict): Additional info about the challenge. - **difficulty** (str): The difficulty of this query. - **description** (str): Description of the challenge. 
@@ -22,24 +25,26 @@ Example: ```python { + "name": "basic_write_file", "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], - "files": ["file_to_check.txt"] + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" }, - "mock_func": "basic_read_file_mock", "info": { - "description": "This reads the file quickly", "difficulty": "basic", - "side_effects": [""] + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] } } - ``` Current Output: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 9e5c1880f..3de1dd643 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,3 @@ { - "hostname": "localhost", - "port": 8080, - "workspace": "C:/Users/silen/miniagi" + "hostname": "localhost" } diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py deleted file mode 100644 index 6aac3d191..000000000 --- a/agbenchmark/mocks/basic_gpt_agent.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -import openai - - -def basic_gpt_agent(query) -> str: - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}] - ) - - answer = response["choices"][0]["message"]["content"] # type: ignore - - print("QUERY : ", query) - print("AGENT ANSWER: ", answer) - - return answer - - -if __name__ == "__main__": - # server boilerplate example here - basic_gpt_agent("") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 550095b72..631b30c2c 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,5 +1,4 @@ from agbenchmark.Challenge import Challenge -from ..basic_gpt_agent import basic_gpt_agent def basic_read_file_mock(task: str, workspace: str): @@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str): """ This mock writes to a file (creates one if it doesn't exist) """ - - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "Washington DC is the capital of the United States of America", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index ac612293a..c9f3643cc 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -27,10 +27,6 @@ def start(category, noreg, mock): if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: config = {} - config["hostname"] = click.prompt( - "\nPlease enter a new hostname", default="localhost" - ) - config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( "Please enter a new workspace path", default=os.path.join(Path.home(), "miniagi"), diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 68288a42c..f99ae608c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os @@ -8,9 +7,7 @@ import os class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this + @pytest.fixture(scope="module", autouse=True) def setup_module(self, workspace): Challenge.write_to_file( workspace, self.data.ground.files[0], "this is how we're doing" diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 8caa6605a..39c73b163 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea..384f9e7c6 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,12 +3,5 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file
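A note on test ordering: these patches drop `pytest-dependency`, `pytest-order` and `pytest-ordering` in favour of `pytest-depends`, and chain challenges with `@pytest.mark.depends`. A minimal sketch of how that marker behaves, using illustrative test names rather than the benchmark's real node ids:

```python
import pytest


@pytest.mark.depends(name="basic_write_file")
def test_write_file():
    # A named test that later tests can declare as a prerequisite.
    assert True


@pytest.mark.depends(on=["basic_write_file"])
def test_read_file():
    # pytest-depends runs this after test_write_file and skips it if the
    # dependency failed, which is how read-after-write challenges are gated.
    assert True
```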
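Because patch 19 moves the `challenge_data` and `run_agent` parametrisation into `pytest_generate_tests`, a new challenge now only supplies a data-file path, a `depends` marker and scoring logic. The sketch below follows that shape; the class name, the data file `a_file_data.json` and the `basic_append_file` dependency name are hypothetical:

```python
import os

import pytest

from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge


class TestAppendFile(BasicChallenge):
    """Hypothetical challenge checking that text was appended to a file."""

    def get_file_path(self) -> str:
        # Every challenge must point at its JSON data file.
        return os.path.join(os.path.dirname(__file__), "a_file_data.json")

    @pytest.mark.depends(on=["basic_write_file"], name="basic_append_file")
    def test_method(self, workspace):
        files_contents = self.open_files(workspace, self.data.ground.files)
        scores = [self.scoring(content, self.data.ground) for content in files_contents]
        assert 1 in scores
```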
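The `Challenge.open_files` helper added in patch 17 accepts either bare extensions or concrete filenames in `ground.files`. A rough usage sketch; the workspace path and filenames here are assumptions, and the files must already exist in that directory:

```python
from agbenchmark.Challenge import Challenge

workspace = "agbenchmark/mocks/workspace"

# ".txt" is treated as an extension and globbed against every *.txt file in the
# workspace, while "file_to_check.txt" is resolved as that exact file.
for content in Challenge.open_files(workspace, [".txt", "file_to_check.txt"]):
    print(content[:80])
```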
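Finally, the JSON-backed `RegressionManager` from patch 16 can be exercised directly; the entry below mirrors the structure that the `pytest_runtest_makereport` hook writes to `regression_tests.json`:

```python
from agbenchmark.tests.regression.RegressionManager import RegressionManager

manager = RegressionManager("agbenchmark/tests/regression/regression_tests.json")

# Record a passing challenge; add_test() persists the file immediately.
manager.add_test(
    "TestWriteFile",
    {
        "difficulty": "basic",
        "dependencies": [],
        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
        "::TestWriteFile::test_method[challenge_data0-run_agent0]",
    },
)

# Remove it again if the same challenge later regresses.
manager.remove_test("TestWriteFile")
```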