mock flag, workspace io fixes, mark fixes

parent f74a960d05
commit 1a61c66898
@@ -1,4 +1,3 @@
 AGENT_NAME=mini-agi
 REPORT_LOCATION="reports/mini-agi"
-MOCK_TEST=False # this is automatically set with the --mock flag
 OPENAI_API_KEY="sk-" # for LLM eval
@@ -80,8 +80,13 @@ def get_list_of_file_paths(


 def copy_artifacts_into_workspace(
-    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+    workspace: str | dict[str, str], artifact_folder_name: str, challenge_dir_path: str
 ) -> None:
+    if isinstance(workspace, dict):
+        if artifact_folder_name == "artifacts_in":
+            workspace = workspace["input"]
+        else:
+            workspace = workspace["output"]
     file_paths = get_list_of_file_paths(challenge_dir_path, artifact_folder_name)
     for file_path in file_paths:
         if os.path.isfile(file_path):
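The hunk above lets the artifact copier take either a single workspace path or an input/output mapping, routing "artifacts_in" to the input folder and everything else to the output folder. A standalone sketch of that routing under the same assumptions; resolve_workspace is a hypothetical helper, not part of this commit:

def resolve_workspace(workspace: str | dict[str, str], artifact_folder_name: str) -> str:
    # Input artifacts go to the agent's input folder; all other artifact folders
    # are copied into the output folder the agent writes to.
    if isinstance(workspace, dict):
        key = "input" if artifact_folder_name == "artifacts_in" else "output"
        return workspace[key]
    return workspace


print(resolve_workspace({"input": "ws/in", "output": "ws/out"}, "artifacts_in"))  # ws/in
print(resolve_workspace("ws", "artifacts_out"))  # ws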
@@ -1 +1 @@
-Subproject commit b05bd27b8b056843e03c3e9d6056470eaba6e7dd
+Subproject commit 0ec140a61ff6740bb62059e5d1d61495f845f7d2
@@ -76,7 +76,6 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]:
     yield config["workspace"]
     # teardown after test function completes
     if not config.get("keep_workspace_files", False):
-        print("Emptying workspace")
         for filename in os.listdir(output_path):
             file_path = os.path.join(output_path, filename)
             try:
@@ -36,7 +36,11 @@ def get_agent_category(report: Report) -> dict[str, Any]:

     def get_highest_category_difficulty(data: Test) -> None:
         for category in data.category:
-            if category == "interface" or category == "iterate":
+            if (
+                category == "interface"
+                or category == "iterate"
+                or category == "product_advisor"
+            ):
                 continue
             categories.setdefault(category, 0)
             if data.metrics.success:
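For clarity, the same skip rule expressed as a set membership test; purely illustrative, the commit keeps the chained comparison shown above:

SKIPPED_CATEGORIES = {"interface", "iterate", "product_advisor"}

for category in ["interface", "code", "product_advisor", "memory"]:
    if category in SKIPPED_CATEGORIES:
        continue
    print(category)  # prints: code, memory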
@@ -43,4 +43,4 @@ class Report(BaseModel):
     benchmark_start_time: str
     metrics: MetricsOverall
     tests: Dict[str, Union[Test, SuiteTest]]
-    config: Dict[str, str]
+    config: Dict[str, str | dict[str, str]]
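The wider value type is what allows the report's config to carry the new input/output workspace mapping alongside plain string values. A minimal pydantic sketch of just that field; ConfigOnly is a hypothetical stand-in for the Report model above:

from typing import Dict

from pydantic import BaseModel


class ConfigOnly(BaseModel):
    config: Dict[str, str | dict[str, str]]


ConfigOnly(config={"workspace": "workspace"})                             # flat path still validates
ConfigOnly(config={"workspace": {"input": "ws/in", "output": "ws/out"}})  # io mapping now validates too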
@@ -7,7 +7,6 @@ from typing import Any, Dict
 from agbenchmark.reports.ReportManager import ReportManager
 from agbenchmark.start_benchmark import (
     CONFIG_PATH,
-    MOCK_FLAG,
     REGRESSION_TESTS_PATH,
     REPORTS_PATH,
     SUCCESS_RATE_PATH,
@@ -144,7 +143,6 @@ def update_regression_tests(
 def generate_single_call_report(
     item: Any, call: Any, challenge_data: dict[str, Any]
 ) -> None:
-
     try:
         difficulty = challenge_data["info"]["difficulty"]
     except KeyError:
@@ -205,11 +203,9 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
     if info_details and test_name:
         if run_time:
             cost = None
-            if not MOCK_FLAG and os.environ.get("HELICONE_API_KEY"):
+            if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
                 print("Getting cost from Helicone")
                 cost = get_data_from_helicone(test_name)
-            else:
-                print("Helicone not setup or mock flag set, not getting cost")

             info_details["metrics"]["cost"] = cost

@@ -226,10 +222,15 @@ def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:

             info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

-        update_challenges_already_beaten(info_details, test_name)
-        if info_details.get("tests") is not None:
-            for nested_test_name, nested_test_info in info_details["tests"].items():
-                update_challenges_already_beaten(nested_test_info, nested_test_name)
+        if "--mock" not in sys.argv:
+            update_challenges_already_beaten(info_details, test_name)
+            if info_details.get("tests") is not None:
+                for nested_test_name, nested_test_info in info_details[
+                    "tests"
+                ].items():
+                    update_challenges_already_beaten(
+                        nested_test_info, nested_test_name
+                    )

         info_manager.add_test(test_name, info_details)

@@ -22,8 +22,6 @@ if os.environ.get("HELICONE_API_KEY"):
     HeliconeLockManager.write_custom_property(
         "benchmark_start_time", BENCHMARK_START_TIME
     )
-MOCK_FLAG = os.getenv("MOCK_TEST", "").lower() == "true"
-

 (
     HOME_DIRECTORY,
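With the module-level MOCK_FLAG constant deleted here (and the MOCK_TEST environment round-trip removed in the next hunk), call sites read the flag straight from the command line. A minimal sketch of that pattern; is_mock_run is a hypothetical helper, the commit simply inlines the check wherever it is needed:

import sys


def is_mock_run() -> bool:
    # True when the benchmark was invoked with --mock on the command line,
    # replacing the old MOCK_TEST environment variable.
    return "--mock" in sys.argv


if __name__ == "__main__":
    print("mock run" if is_mock_run() else "real run")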
@@ -170,8 +168,6 @@ def start(
     for key, value in config.items():
         print(f"{key}: {value}")

-    os.environ["MOCK_TEST"] = "True" if mock else "False"
-
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
@@ -10,7 +10,7 @@ import openai
 import pytest

 from agbenchmark.agent_api_interface import run_api_agent
-from agbenchmark.start_benchmark import MOCK_FLAG, OPTIONAL_CATEGORIES
+from agbenchmark.start_benchmark import OPTIONAL_CATEGORIES
 from agbenchmark.utils.data_types import ChallengeData, Ground
 from agbenchmark.utils.prompts import (
     END_PROMPT,
@@ -61,7 +61,7 @@ class Challenge(ABC):
         )
         print(f"\033[1;30mTask: {self.task}\033[0m")

-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             print("Running mock agent")
             copy_artifacts_into_workspace(
                 config["workspace"], "artifacts_out", self.ARTIFACTS_LOCATION
@@ -88,7 +88,12 @@ class Challenge(ABC):
         with open(workspace_dir, "r") as f:
             return f.read()

-    def get_artifacts_out(self, workspace: str, ground: Ground) -> List[str]:
+    def get_artifacts_out(
+        self, workspace: str | dict[str, str], ground: Ground
+    ) -> List[str]:
+        if isinstance(workspace, dict):
+            workspace = workspace["output"]
+
         script_dir = workspace
         files_contents = []

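get_artifacts_out now resolves an input/output workspace to its output side before scanning for files. A standalone sketch of that resolution; list_output_files is a hypothetical helper, not part of this commit:

import os
from typing import List


def list_output_files(workspace: str | dict[str, str]) -> List[str]:
    # Ground-truth matching always runs against the output side of the workspace.
    if isinstance(workspace, dict):
        workspace = workspace["output"]
    return sorted(os.listdir(workspace)) if os.path.isdir(workspace) else []


print(list_output_files({"input": "ws/in", "output": "ws/out"}))  # [] unless ws/out exists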
@@ -163,7 +168,7 @@ class Challenge(ABC):

     def llm_eval(self, config: Dict[str, Any], content: str, ground: Ground) -> float:
         openai.api_key = os.getenv("OPENAI_API_KEY")
-        if MOCK_FLAG:
+        if "--mock" in sys.argv:
             return 1.0

         # the validation for this is done in the Eval BaseModel
@@ -190,7 +195,7 @@ class Challenge(ABC):
         percentage = None

         try:
-            if self.data.task == "" and MOCK_FLAG:
+            if self.data.task == "" and "--mock" in sys.argv:
                 scores = [1.0]
             elif isinstance(self.data.ground, Ground):
                 files_contents = self.get_artifacts_out(
@@ -1 +1 @@
-Subproject commit b0318053b6cbe357f2e020fe0f1275a2cb3da767
+Subproject commit 48b2101374264b97dbdfc2c0bb0ae45e769e157d
@@ -60,7 +60,8 @@ markers = [
     "iterate",
     "adaptability",
     "safety",
-    "content_gen"
+    "content_gen",
+    "product_advisor"
 ]
 filterwarnings = [
     "ignore::pytest.PytestAssertRewriteWarning",
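The new marker only takes effect once tests opt into it. A hypothetical example of a challenge module using the freshly registered product_advisor marker; the test body is illustrative only:

import pytest


@pytest.mark.product_advisor
def test_product_advisor_challenge() -> None:
    # Selectable via `pytest -m product_advisor` now that the marker is registered.
    assert True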