update regression tests info

parent 31c1192719
commit adc6b225a6
agbenchmark/challenges/retrieval/r1/r1_test.py

@@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
-    def test_retrieval(self, request, workspace):
+    @pytest.mark.parametrize(
+        "regression_data",
+        [data],
+        indirect=True,
+    )
+    def test_retrieval(self, workspace, current_challenge_data):
         file = self.open_file(workspace, data.ground.files[0])

         score = self.scoring(file, data.ground)
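Note on the mechanism: with indirect=True, pytest does not hand the parametrized value straight to the test; it routes it through a fixture of the same name via request.param. A minimal, self-contained sketch of that flow (the test and variable names here are illustrative, not part of this commit):

import pytest

CHALLENGE = {"name": "demo", "difficulty": "basic"}  # stand-in for the module-level `data`


@pytest.fixture
def regression_data(request):
    # indirect=True routes the parametrize value through this fixture
    return request.param


@pytest.mark.parametrize("regression_data", [CHALLENGE], indirect=True)
def test_demo(regression_data):
    assert regression_data["difficulty"] == "basic"

In this commit the fixture is also autouse (see the conftest.py hunks below), so the challenge tests never have to accept regression_data as an argument; the reporting hook pulls the resolved value out of item.funcargs instead. This extra parametrization is also why the test ids recorded in regression_tests.json gain a regression_data0 component.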
agbenchmark/conftest.py

@@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
 from requests.exceptions import RequestException
 from agbenchmark.mocks.MockManager import MockManager
+from agbenchmark.challenges.define_task_types import ChallengeData


 @pytest.fixture(scope="module")
@@ -64,21 +65,34 @@ def server_response(request, config):
     # print(f"Request succeeded with status code {response.status_code}")


-regression_txt = "agbenchmark/tests/regression/regression_tests.txt"
+regression_json = "agbenchmark/tests/regression/regression_tests.json"

-regression_manager = RegressionManager(regression_txt)
+regression_manager = RegressionManager(regression_json)


+# this is to get the challenge_data from every test
+@pytest.fixture(autouse=True)
+def regression_data(request):
+    return request.param
+
+
 def pytest_runtest_makereport(item, call):
     """Called for each test report. Generated for each stage
     of a test run (setup, call, teardown)."""
     if call.when == "call":
-        if (
-            call.excinfo is None
-        ):  # if no error in the call stage, add it as a regression test
-            regression_manager.add_test(item.nodeid)
-        else:  # otherwise, :(
-            regression_manager.remove_test(item.nodeid)
+        challenge_data = item.funcargs.get("regression_data", None)
+        difficulty = challenge_data.info.difficulty if challenge_data else "unknown"
+        dependencies = challenge_data.dependencies if challenge_data else []
+
+        test_details = {
+            "difficulty": difficulty,
+            "dependencies": dependencies,
+            "test": item.nodeid,
+        }
+
+        print("pytest_runtest_makereport", test_details)
+        if call.excinfo is None:
+            regression_manager.add_test(item.nodeid.split("::")[1], test_details)
+        else:
+            regression_manager.remove_test(item.nodeid.split("::")[1])


 def pytest_collection_modifyitems(items):
@@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items):
     to add regression marker to collected test items."""
     for item in items:
         print("pytest_collection_modifyitems", item.nodeid)
-        if item.nodeid + "\n" in regression_manager.tests:
+        if item.nodeid.split("::")[1] in regression_manager.tests:
             print(regression_manager.tests)
             item.add_marker(pytest.mark.regression)
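Both hooks now key the regression registry with item.nodeid.split("::")[1]. A pytest nodeid has the shape path::Class::testname[params], so index 1 is the test class name. An illustrative breakdown, using one of the nodeids recorded by this commit:

nodeid = (
    "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
    "::TestWriteFile::test_write_file[regression_data0-server_response0]"
)
parts = nodeid.split("::")
assert parts[0].endswith("write_file_test.py")  # file path
assert parts[1] == "TestWriteFile"              # class name: the registry key
assert parts[2].startswith("test_write_file")   # test function plus parametrize ids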
agbenchmark/tests/basic_abilities/read_file/read_file_test.py

@@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
+    @pytest.mark.parametrize(
+        "regression_data",
+        [data],
+        indirect=True,
+    )
     @pytest.mark.depends(on=data.dependencies)
     def test_read_file(self, workspace):
         file = self.open_file(workspace, data.ground.files[0])
@@ -10,7 +10,7 @@
   },
   "mock_func": "basic_write_file_mock",
   "info": {
-    "difficulty": "easy",
+    "difficulty": "basic",
     "description": "Tests the writing to file",
     "side_effects": ["tests if there is in fact an LLM attached"]
   }
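The makereport hook above dereferences challenge_data.info.difficulty and challenge_data.dependencies, and conftest.py now imports ChallengeData from agbenchmark.challenges.define_task_types. That module is not part of this diff; a pydantic-style sketch consistent with the fields the commit touches (everything beyond those fields is an assumption) could look like:

from typing import List, Optional

from pydantic import BaseModel


class Info(BaseModel):
    # fields visible in the data.json hunk above
    difficulty: str
    description: str
    side_effects: List[str]


class Ground(BaseModel):
    # the tests read data.ground.files[0]
    files: List[str]


class ChallengeData(BaseModel):
    # task and mock_func feed the (data.task, data.mock_func) parametrization
    task: str
    mock_func: Optional[str] = None
    dependencies: List[str] = []
    info: Info
    ground: Ground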
agbenchmark/tests/basic_abilities/write_file/write_file_test.py

@@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge):
         [(data.task, data.mock_func)],
         indirect=True,
     )
+    @pytest.mark.parametrize(
+        "regression_data",
+        [data],
+        indirect=True,
+    )
     @pytest.mark.depends(name="test_write_file")
     def test_write_file(self, workspace):
         file = self.open_file(workspace, data.ground.files[0])
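Dependency ordering between the two basic-ability tests is unchanged by this commit: test_write_file registers itself under an explicit name via pytest.mark.depends(name=...), and test_read_file refers to that name through data.dependencies (recorded as ["test_write_file"] in the new regression file). A stripped-down illustration of the chain, assuming the pytest-depends plugin is installed:

import pytest


@pytest.mark.depends(name="test_write_file")
def test_write_file():
    assert True  # stands in for the real write-file challenge


@pytest.mark.depends(on=["test_write_file"])
def test_read_file():
    assert True  # pytest-depends skips this test if test_write_file did not pass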
agbenchmark/tests/regression/RegressionManager.py

@@ -1,3 +1,6 @@
+import json
+
+
 class RegressionManager:
     """Abstracts interaction with the regression tests file"""


@@ -6,17 +9,21 @@ class RegressionManager:
         self.load()

     def load(self) -> None:
-        with open(self.filename, "r") as f:
-            self.tests = f.readlines()
+        try:
+            with open(self.filename, "r") as f:
+                self.tests = json.load(f)
+        except (FileNotFoundError, json.decoder.JSONDecodeError):
+            self.tests = {}

     def save(self) -> None:
         with open(self.filename, "w") as f:
-            f.writelines(self.tests)
+            json.dump(self.tests, f, indent=4)

-    def add_test(self, test_id) -> None:
-        if f"{test_id}\n" not in self.tests:
-            self.tests.append(f"{test_id}\n")
+    def add_test(self, test_name: str, test_details: dict) -> None:
+        self.tests[test_name] = test_details
+        self.save()

-    def remove_test(self, test_id) -> None:
-        if f"{test_id}\n" in self.tests:
-            self.tests.remove(f"{test_id}\n")
+    def remove_test(self, test_name: str) -> None:
+        if test_name in self.tests:
+            del self.tests[test_name]
+            self.save()
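With both hunks applied, the class reads roughly as follows. The __init__ body is not shown in this diff, so its two lines here are an assumption consistent with the self.filename attribute and the trailing self.load() call that the hunk does show:

import json


class RegressionManager:
    """Abstracts interaction with the regression tests file"""

    def __init__(self, filename: str):
        # assumed: the diff only shows that __init__ ends with self.load()
        self.filename = filename
        self.load()

    def load(self) -> None:
        # start from an empty registry if the file is missing or not valid JSON
        try:
            with open(self.filename, "r") as f:
                self.tests = json.load(f)
        except (FileNotFoundError, json.decoder.JSONDecodeError):
            self.tests = {}

    def save(self) -> None:
        with open(self.filename, "w") as f:
            json.dump(self.tests, f, indent=4)

    def add_test(self, test_name: str, test_details: dict) -> None:
        self.tests[test_name] = test_details
        self.save()

    def remove_test(self, test_name: str) -> None:
        if test_name in self.tests:
            del self.tests[test_name]
            self.save()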
@@ -0,0 +1 @@
+{}
agbenchmark/tests/regression/regression_tests.txt → agbenchmark/tests/regression/regression_tests.json

@@ -1,3 +1,14 @@
-agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0]
-agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0]
-agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0]
+{
+    "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": {
+        "difficulty": "easy",
+        "dependencies": [],
+        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]"
+    },
+    "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": {
+        "difficulty": "basic",
+        "dependencies": [
+            "test_write_file"
+        ],
+        "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]"
+    }
+}