mock flag, workspace io fixes, mark fixes

pull/5155/head
Silen Naihin 2023-08-11 13:22:21 +01:00
parent f74a960d05
commit 1a61c66898
11 changed files with 36 additions and 26 deletions

View File

@@ -1,4 +1,3 @@
AGENT_NAME=mini-agi
REPORT_LOCATION="reports/mini-agi"
MOCK_TEST=False # this is automatically set with the --mock flag
OPENAI_API_KEY="sk-" # for LLM eval

View File

@@ -80,8 +80,13 @@ def get_list_of_file_paths(
def copy_artifacts_into_workspace(
workspace: str, artifact_folder_name: str, challenge_dir_path: str
workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
) -> None:
if isinstance(workspace, dict):
if artifact_folder_name == "artifacts_in":
workspace = workspace["input"]
else:
workspace = workspace["output"]
file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
for file_path in file_paths:
if os.path.isfile(file_path):
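
The branch added above reduces a dict-shaped workspace to a single directory before any files are copied. A standalone sketch of that resolution step, shown only for reference (the helper name is hypothetical; the "input"/"output" keys come from the diff):

from typing import Dict, Union


def resolve_workspace(
    workspace: Union[str, Dict[str, str]], artifact_folder_name: str
) -> str:
    # "artifacts_in" resolves to the "input" directory, every other artifact
    # folder to "output"; plain string workspaces pass through unchanged.
    if isinstance(workspace, dict):
        if artifact_folder_name == "artifacts_in":
            return workspace["input"]
        return workspace["output"]
    return workspace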

@@ -1 +1 @@
Subproject commit b05bd27b8b056843e03c3e9d6056470eaba6e7dd
Subproject commit 0ec140a61ff6740bb62059e5d1d61495f845f7d2

View File

@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
yield config["workspace"]
# teardown after test function completes
if not config.get("keep_workspace_files", False):
print("Emptying workspace")
for filename in os.listdir(output_path):
file_path = os.path.join(output_path, filename)
try:
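
For context, the fixture follows a yield-then-clean teardown: hand the configured workspace to the test, then empty the output directory afterwards unless the config opts out. A minimal self-contained sketch, assuming a `config` fixture like the one in the signature above (the cleanup loop and error handling here are illustrative, not copied from the file):

import os
import shutil
from typing import Any, Dict, Generator, Union

import pytest


@pytest.fixture
def workspace_sketch(config: Dict[str, Any]) -> Generator[Union[str, Dict[str, str]], None, None]:
    ws = config["workspace"]
    # A dict-shaped workspace is only ever cleaned on its "output" side.
    output_path = ws["output"] if isinstance(ws, dict) else ws
    yield ws
    # Teardown after the test function completes.
    if not config.get("keep_workspace_files", False):
        for filename in os.listdir(output_path):
            file_path = os.path.join(output_path, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except OSError as e:
                print(f"Failed to delete {file_path}: {e}")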

View File

@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:
def get_highest_category_difficulty(data: Test) -> None:
for category in data.category:
if category == "interface" or category == "iterate":
if (
category == "interface"
or category == "iterate"
or category == "product_advisor"
):
continue
categories.setdefault(category, 0)
if data.metrics.success:
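
The widened condition now skips three meta-categories when rolling up per-category difficulty. An equivalent set-membership form, shown only for readability (the project keeps the explicit chained comparison):

SKIPPED_CATEGORIES = {"interface", "iterate", "product_advisor"}


def is_skipped(category: str) -> bool:
    # These categories are not scored for highest difficulty.
    return category in SKIPPED_CATEGORIES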

View File

@@ -43,4 +43,4 @@ class Report(BaseModel):
benchmark_start_time: str
metrics: MetricsOverall
tests: Dict[str, Union[Test, SuiteTest]]
config: Dict[str, str]
config: Dict[str, str | dict[str, str]]
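
The widened `config` field lets a report carry either plain string values or a nested string mapping such as the workspace's input/output pair. A minimal sketch of a value that now validates (the field contents are illustrative; only the workspace keys come from this commit):

from typing import Dict, Union

from pydantic import BaseModel


class ConfigSketch(BaseModel):
    # Same shape as the widened field above, spelled with Union for clarity.
    config: Dict[str, Union[str, Dict[str, str]]]


ConfigSketch(
    config={
        "workspace": {"input": "workspace/in", "output": "workspace/out"},
        "agent_name": "mini-agi",  # hypothetical plain-string entry
    }
)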

View File

@@ -7,7 +7,6 @@ from typing import Any, Dict
from agbenchmark.reports.ReportManager import ReportManager
from agbenchmark.start_benchmark import (
CONFIG_PATH,
MOCK_FLAG,
REGRESSION_TESTS_PATH,
REPORTS_PATH,
SUCCESS_RATE_PATH,
@@ -144,7 +143,6 @@ def update_regression_tests(
def generate_single_call_report(
item: Any, call: Any, challenge_data: dict[str, Any]
) -> None:
try:
difficulty = challenge_data["info"]["difficulty"]
except KeyError:
@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
if info_details and test_name:
if run_time:
cost = None
if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
print("Getting cost from Helicone")
cost = get_data_from_helicone(test_name)
else:
print("Helicone not setup or mock flag set, not getting cost")
info_details["metrics"]["cost"] = cost
@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]
update_challenges_already_beaten(info_details, test_name)
if info_details.get("tests") is not None:
for nested_test_name, nested_test_info in info_details["tests"].items():
update_challenges_already_beaten(nested_test_info, nested_test_name)
if "--mock" not in sys.argv:
update_challenges_already_beaten(info_details, test_name)
if info_details.get("tests") is not None:
for nested_test_name, nested_test_info in info_details[
"tests"
].items():
update_challenges_already_beaten(
nested_test_info, nested_test_name
)
info_manager.add_test(test_name, info_details)
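
These report changes replace the imported MOCK_FLAG with a direct check of the command line, and skip both the Helicone cost lookup and the challenges-already-beaten bookkeeping on mock runs. The gating logic, pulled out as a self-contained sketch (the helper names are hypothetical):

import os
import sys


def is_mock_run() -> bool:
    # Mock mode is read straight from the process arguments; no env var involved.
    return "--mock" in sys.argv


def should_fetch_helicone_cost() -> bool:
    # Cost is only fetched on real runs when a Helicone key is configured.
    return not is_mock_run() and bool(os.environ.get("HELICONE_API_KEY"))


def should_record_beaten_challenges() -> bool:
    # Regression bookkeeping is skipped entirely for mock runs.
    return not is_mock_run()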

View File

@ -22,8 +22,6 @@ if os.environ.get("HELICONE_API_KEY"):
HeliconeLockManager.write_custom_property(
"benchmark_start_time", BENCHMARK_START_TIME
)
MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
(
HOME_DIRECTORY,
@@ -170,8 +168,6 @@ def start(
for key, value in config.items():
print(f"{key}: {value}")
os.environ["MOCK_TEST"] = "True" if mock else "False"
pytest_args = ["-vs"]
if test:
print("Running specific test:", test)

View File

@@ -10,7 +10,7 @@ import openai
import pytest
from agbenchmark.agent_api_interface import run_api_agent
from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
from agbenchmark.utils.data_types import ChallengeData, Ground
from agbenchmark.utils.prompts import (
END_PROMPT,
@@ -61,7 +61,7 @@ class Challenge(ABC):
)
print(f"\033[1;30mTask: {self.task}\033[0m")
if MOCK_FLAG:
if "--mock" in sys.argv:
print("Running mock agent")
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
@@ -88,7 +88,12 @@ class Challenge(ABC):
with open(workspace_dir, "r") as f:
return f.read()
def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
def get_artifacts_out(
self, workspace: str | dict[str, str], ground: Ground
) -> List[str]:
if isinstance(workspace, dict):
workspace = workspace["output"]
script_dir = workspace
files_contents = []
@@ -163,7 +168,7 @@ class Challenge(ABC):
def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
openai.api_key = os.getenv("OPENAI_API_KEY")
if MOCK_FLAG:
if "--mock" in sys.argv:
return 1.0
# the validation for this is done in the Eval BaseModel
@@ -190,7 +195,7 @@ class Challenge(ABC):
percentage = None
try:
if self.data.task == "" and MOCK_FLAG:
if self.data.task == "" and "--mock" in sys.argv:
scores = [1.0]
elif isinstance(self.data.ground, Ground):
files_contents = self.get_artifacts_out(
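
The Challenge-side changes mirror the rest of the commit: the mock checks read sys.argv, and artifact collection resolves a dict-shaped workspace to its output directory before scanning for files. A standalone sketch of that collection step (the suffix-matching rule is an assumption for illustration, not copied from get_artifacts_out):

import os
from typing import Dict, List, Union


def collect_output_files(
    workspace: Union[str, Dict[str, str]], patterns: List[str]
) -> List[str]:
    # A dict-shaped workspace is always read from its "output" side when scoring.
    if isinstance(workspace, dict):
        workspace = workspace["output"]
    # Hypothetical matching rule: keep files whose names end with any pattern.
    return [
        os.path.join(workspace, name)
        for name in os.listdir(workspace)
        if any(name.endswith(p) for p in patterns)
    ]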

@@ -1 +1 @@
Subproject commit b0318053b6cbe357f2e020fe0f1275a2cb3da767
Subproject commit 48b2101374264b97dbdfc2c0bb0ae45e769e157d

View File

@@ -60,7 +60,8 @@ markers = [
"iterate",
"adaptability",
"safety",
"content_gen"
"content_gen",
"product_advisor"
]
filterwarnings = [
"ignore::pytest.PytestAssertRewriteWarning",