mock flag, workspace io fixes, mark fixes

parent f74a960d05
commit 1a61c66898
@@ -1,4 +1,3 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="reports/mini-agi"
-MOCK_TEST=False # this is automatically set with the --mock flag
 OPENAI_API_KEY="sk-" # for LLM eval
@@ -80,8 +80,13 @@ def get_list_of_file_paths(


 def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
+    if isinstance(workspace, dict):
+        if artifact_folder_name == "artifacts_in":
+            workspace = workspace["input"]
+        else:
+            workspace = workspace["output"]
     file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
     for file_path in file_paths:
         if os.path.isfile(file_path):
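The hunk above lets the artifact copier take either a single workspace path or an input/output mapping, routing "artifacts_in" to the input folder and everything else to the output folder. A standalone sketch of that routing under the same assumptions; resolve_workspace is a hypothetical helper, not part of this commit:

def resolve_workspace(workspace: str | dict[str, str], artifact_folder_name: str) -> str:
    # Input artifacts go to the agent's input folder; all other artifact folders
    # are copied into the output folder the agent writes to.
    if isinstance(workspace, dict):
        key = "input" if artifact_folder_name == "artifacts_in" else "output"
        return workspace[key]
    return workspace


print(resolve_workspace({"input": "ws/in", "output": "ws/out"}, "artifacts_in"))  # ws/in
print(resolve_workspace("ws", "artifacts_out"))  # ws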
@@ -1 +1 @@
-Subproject commit b05bd27b8b056843e03c3e9d6056470eaba6e7dd
+Subproject commit 0ec140a61ff6740bb62059e5d1d61495f845f7d2
@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
     yield config["workspace"]
     # teardown after test function completes
     if not config.get("keep_workspace_files", False):
-        print("Emptying workspace")
         for filename in os.listdir(output_path):
             file_path = os.path.join(output_path, filename)
             try:
@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:

     def get_highest_category_difficulty(data: Test) -> None:
         for category in data.category:
-            if category == "interface" or category == "iterate":
+            if (
+                category == "interface"
+                or category == "iterate"
+                or category == "product_advisor"
+            ):
                 continue
             categories.setdefault(category, 0)
             if data.metrics.success:
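For clarity, the same skip rule expressed as a set membership test; purely illustrative, the commit keeps the chained comparison shown above:

SKIPPED_CATEGORIES = {"interface", "iterate", "product_advisor"}

for category in ["interface", "code", "product_advisor", "memory"]:
    if category in SKIPPED_CATEGORIES:
        continue
    print(category)  # prints: code, memory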
@@ -43,4 +43,4 @@ class Report(BaseModel):
     benchmark_start_time: str
     metrics: MetricsOverall
     tests: Dict[str, Union[Test, SuiteTest]]
-    config: Dict[str, str]
+    config: Dict[str, str | dict[str, str]]
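The wider value type is what allows the report's config to carry the new input/output workspace mapping alongside plain string values. A minimal pydantic sketch of just that field; ConfigOnly is a hypothetical stand-in for the Report model above:

from typing import Dict

from pydantic import BaseModel


class ConfigOnly(BaseModel):
    config: Dict[str, str | dict[str, str]]


ConfigOnly(config={"workspace": "workspace"})                             # flat path still validates
ConfigOnly(config={"workspace": {"input": "ws/in", "output": "ws/out"}})  # io mapping now validates too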
@@ -7,7 +7,6 @@ from typing import Any, Dict
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
-    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,
@@ -144,7 +143,6 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-
     try:
         difficulty = challenge_data["info"]["difficulty"]
     except KeyError:
@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
     if info_details and test_name:
         if run_time:
             cost = None
-            if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
+            if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
                 print("Getting cost from Helicone")
                 cost = get_data_from_helicone(test_name)
-            else:
-                print("Helicone not setup or mock flag set, not getting cost")

             info_details["metrics"]["cost"] = cost

@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

             info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

-        update_challenges_already_beaten(info_details, test_name)
-        if info_details.get("tests") is not None:
-            for nested_test_name, nested_test_info in info_details["tests"].items():
-                update_challenges_already_beaten(nested_test_info, nested_test_name)
+        if "--mock" not in sys.argv:
+            update_challenges_already_beaten(info_details, test_name)
+            if info_details.get("tests") is not None:
+                for nested_test_name, nested_test_info in info_details[
+                    "tests"
+                ].items():
+                    update_challenges_already_beaten(
+                        nested_test_info, nested_test_name
+                    )

         info_manager.add_test(test_name, info_details)

@@ -22,8 +22,6 @@ if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )
-MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
-

 (
     HOME_DIRECTORY,
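With the module-level MOCK_FLAG constant deleted here (and the MOCK_TEST environment round-trip removed in the next hunk), call sites read the flag straight from the command line. A minimal sketch of that pattern; is_mock_run is a hypothetical helper, the commit simply inlines the check wherever it is needed:

import sys


def is_mock_run() -> bool:
    # True when the benchmark was invoked with --mock on the command line,
    # replacing the old MOCK_TEST environment variable.
    return "--mock" in sys.argv


if __name__ == "__main__":
    print("mock run" if is_mock_run() else "real run")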
@@ -170,8 +168,6 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    os.environ["MOCK_TEST"] = "True" if mock else "False"
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
@@ -10,7 +10,7 @@ import openai
 import pytest

 from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
+from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,
@@ -61,7 +61,7 @@ class Challenge(ABC):
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")

-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             print("Running mock agent")
             copy_artifacts_into_workspace(
                 config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
@@ -88,7 +88,12 @@ class Challenge(ABC):
         with open(workspace_dir, "r") as f:
             return f.read()

-    def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
+    def get_artifacts_out(
+        self, workspace: str | dict[str, str], ground: Ground
+    ) -> List[str]:
+        if isinstance(workspace, dict):
+            workspace = workspace["output"]
+
         script_dir = workspace
         files_contents = []

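get_artifacts_out now resolves an input/output workspace to its output side before scanning for files. A standalone sketch of that resolution; list_output_files is a hypothetical helper, not part of this commit:

import os
from typing import List


def list_output_files(workspace: str | dict[str, str]) -> List[str]:
    # Ground-truth matching always runs against the output side of the workspace.
    if isinstance(workspace, dict):
        workspace = workspace["output"]
    return sorted(os.listdir(workspace)) if os.path.isdir(workspace) else []


print(list_output_files({"input": "ws/in", "output": "ws/out"}))  # [] unless ws/out exists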
@@ -163,7 +168,7 @@ class Challenge(ABC):

     def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
         openai.api_key = os.getenv("OPENAI_API_KEY")
-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             return 1.0

         # the validation for this is done in the Eval BaseModel
@@ -190,7 +195,7 @@ class Challenge(ABC):
         percentage = None

         try:
-            if self.data.task == "" and MOCK_FLAG:
+            if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
@@ -1 +1 @@
-Subproject commit b0318053b6cbe357f2e020fe0f1275a2cb3da767
+Subproject commit 48b2101374264b97dbdfc2c0bb0ae45e769e157d
@@ -60,7 +60,8 @@ markers = [
     "iterate",
     "adaptability",
     "safety",
-    "content_gen"
+    "content_gen",
+    "product_advisor"
 ]
 filterwarnings = [
     "ignore::pytest.PytestAssertRewriteWarning",
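The new marker only takes effect once tests opt into it. A hypothetical example of a challenge module using the freshly registered product_advisor marker; the test body is illustrative only:

import pytest


@pytest.mark.product_advisor
def test_product_advisor_challenge() -> None:
    # Selectable via `pytest -m product_advisor` now that the marker is registered.
    assert True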