Dynamic cutoff and other quality of life (#101)

parent 757baba3ff
commit 9f3a2d4f05
.gitmodules
@@ -1,6 +1,6 @@
 [submodule "agent/Auto-GPT"]
 	path = agent/Auto-GPT
-	url = https://github.com/Significant-Gravitas/Auto-GPT.git
+	url = https://github.com/merwanehamadi/Auto-GPT.git
+	branch = benchmark-integration
 [submodule "agent/gpt-engineer"]
 	path = agent/gpt-engineer
agbenchmark/agent_interface.py
@@ -16,9 +16,7 @@ MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False


 def run_agent(
-    task: str,
-    config: Dict[str, Any],
-    challenge_location: str,
+    task: str, config: Dict[str, Any], challenge_location: str, cutoff: int
 ) -> None:
     """Calling to get a response"""

@@ -27,9 +25,7 @@ def run_agent(
             config["workspace"], "artifacts_out", challenge_location
         )
     else:
-        print(
-            f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}"
-        )
+        print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}")
         command = [sys.executable, "-m", config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
@@ -50,11 +46,11 @@ def run_agent(
         if (
             process.poll() is not None
             or output == ""
-            or (time.time() - start_time > config["cutoff"])
+            or (time.time() - start_time > cutoff)
         ):
             break

-    if time.time() - start_time > config["cutoff"]:
+    if time.time() - start_time > cutoff:
         print("The Python function has exceeded the time limit and was terminated.")
         process.kill()
     else:
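The cutoff now arrives as a function parameter instead of being read from config["cutoff"], so each challenge can impose its own timeout. A minimal runnable sketch of the polling loop above, with a hypothetical run_with_cutoff wrapper standing in for the full run_agent:

```python
import subprocess
import sys
import time


def run_with_cutoff(command: list[str], cutoff: int) -> None:
    """Sketch of run_agent's timeout loop: poll a subprocess and kill it
    once `cutoff` seconds have elapsed."""
    start_time = time.time()
    process = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    )

    while True:
        output = process.stdout.readline() if process.stdout else ""
        if output:
            print(output, end="")
        # Stop when the process exits, output dries up, or the cutoff passes.
        if (
            process.poll() is not None
            or output == ""
            or (time.time() - start_time > cutoff)
        ):
            break

    if time.time() - start_time > cutoff:
        print("The Python function has exceeded the time limit and was terminated.")
        process.kill()


# Hypothetical usage mirroring the diff: the cutoff comes from challenge data.
run_with_cutoff([sys.executable, "-c", "print('hello')"], cutoff=60)
```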
agbenchmark/challenge.py
@@ -4,16 +4,9 @@ import subprocess
 from abc import ABC
 from typing import Any, Dict, List

-from dotenv import load_dotenv
-
 from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 from agbenchmark.start_benchmark import CURRENT_DIRECTORY

-load_dotenv()
-
-mock_test_str = os.getenv("MOCK_TEST")
-MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
-

 class Challenge(ABC):
     """The parent class to all specific challenges classes.
@@ -37,14 +30,14 @@ class Challenge(ABC):
     def dependencies(self) -> list:
         return self.data.dependencies

-    def setup_challenge(self, config: Dict[str, Any]) -> None:
+    def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None:
         from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent

         copy_artifacts_into_workspace(
             config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION
         )

-        run_agent(self.task, config, self.CHALLENGE_LOCATION)
+        run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff)

         # hidden files are added after the agent runs. Hidden files can be python test files.
         # We copy them in the workspace to make it easy to import the code produced by the agent
@@ -3,6 +3,7 @@
   "category": ["code", "iterate"],
   "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
   "dependencies": ["TestReadFile", "TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -3,6 +3,7 @@
   "category": ["code", "iterate"],
   "task": "Make test.py run without errors.",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
+  "cutoff": 60,
   "ground": {
     "answer": "[0, 1] [2, 5] [0, 3]",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -3,6 +3,7 @@
   "category": ["code"],
   "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
   "dependencies": ["TestDebugSimpleTypoWithGuidance"],
+  "cutoff": 60,
   "ground": {
     "answer": "GET localhost:8079/health responds with a 200 OK",
     "should_contain": [],
@@ -3,6 +3,7 @@
   "category": ["code"],
   "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
   "dependencies": ["TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "The two_sum function coded properly.",
     "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
@@ -3,6 +3,7 @@
   "category": ["code", "iterate"],
   "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
   "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"],
+  "cutoff": 60,
   "ground": {
     "answer": "The three_sum function coded properly.",
     "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"],
agbenchmark/challenges/define_task_types.py
@@ -61,6 +61,7 @@ class ChallengeData(BaseModel):
     category: List[str]
     task: str
     dependencies: List[str]
+    cutoff: int
     ground: Ground
     info: Info

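Because `cutoff` is declared without a default, pydantic now rejects any challenge data file that omits it. A trimmed sketch of the model, with dict stand-ins replacing the real `Ground` and `Info` types:

```python
from typing import Any, Dict, List

from pydantic import BaseModel


class ChallengeDataSketch(BaseModel):
    """Reduced stand-in for ChallengeData showing the new required field."""

    category: List[str]
    task: str
    dependencies: List[str]
    cutoff: int  # per-challenge timeout in seconds
    ground: Dict[str, Any]  # stand-in for the real Ground model
    info: Dict[str, Any]  # stand-in for the real Info model


# Parses cleanly; dropping the "cutoff" key would raise a ValidationError.
ChallengeDataSketch.parse_obj(
    {
        "category": ["interface"],
        "task": "Follow the instructions in the instructions_1.txt file",
        "dependencies": [],
        "cutoff": 60,
        "ground": {},
        "info": {},
    }
)
```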
@@ -3,6 +3,7 @@
   "category": ["interface"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
   "dependencies": ["TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "random string Hello World!",
     "should_contain": ["random string", "Hello World!"],
@@ -3,6 +3,7 @@
   "category": ["interface"],
   "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
   "dependencies": ["TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "This is a Heading\nThis is a paragraph.",
     "should_contain": ["Heading", "paragraph"],
@@ -3,6 +3,7 @@
   "category": ["interface"],
   "task": "Print the the capital of America to a .txt file",
   "dependencies": [],
+  "cutoff": 60,
   "ground": {
     "answer": "Washington",
     "should_contain": ["Washington"],
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestReadFile", "TestWriteFile"],
+  "cutoff": 60,
   "ground": {
     "answer": "2314",
     "should_contain": ["2314"],
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestBasicMemory"],
+  "cutoff": 60,
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIds"],
+  "cutoff": 60,
   "ground": {
     "answer": "3145\n3791\n9317\n9471",
     "should_contain": ["3145", "3791", "9317", "9471"],
@@ -3,6 +3,7 @@
   "category": ["memory"],
   "task": "Follow the instructions in the instructions_1.txt file",
   "dependencies": ["TestRememberMultipleIdsWithNoise"],
+  "cutoff": 60,
   "ground": {
     "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
     "should_contain": [
@@ -3,6 +3,7 @@
   "category": ["retrieval"],
   "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
   "dependencies": ["TestWriteFile", "TestSearch"],
+  "cutoff": 60,
   "ground": {
     "answer": "£25.89",
     "should_contain": ["25.89"],
@@ -3,6 +3,7 @@
   "category": ["retrieval"],
   "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
   "dependencies": ["TestBasicRetrieval"],
+  "cutoff": 60,
   "ground": {
     "answer": "81,462",
     "should_contain": ["81,462"],
@@ -3,6 +3,7 @@
   "category": ["retrieval"],
   "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
   "dependencies": ["TestRetrieval2"],
+  "cutoff": 60,
   "ground": {
     "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
     "should_contain": [
agbenchmark/challenges/test_all.py
@@ -54,7 +54,8 @@ def generate_tests() -> None:

         # Define test method within the dynamically created class
         def test_method(self, config: Dict[str, Any]) -> None:  # type: ignore
-            self.setup_challenge(config)
+            cutoff = self.data.cutoff or 60
+            self.setup_challenge(config, cutoff)

             scores = self.get_scores(config)

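One subtlety of the fallback: `or` replaces any falsy value, so a challenge that declared `"cutoff": 0` would also get the 60-second default. (The field is required and set to 60 everywhere in this change, so the case is only theoretical.)

```python
# `or` substitutes the default for any falsy left operand.
assert (120 or 60) == 120  # a real cutoff passes through
assert (None or 60) == 60  # a missing value becomes 60
assert (0 or 60) == 60     # an explicit zero also becomes 60
```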
@@ -1,5 +1,4 @@
 {
   "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-  "entry_path": "agbenchmark.benchmarks",
-  "cutoff": 60
+  "entry_path": "agbenchmark.benchmarks"
 }
agbenchmark/conftest.py
@@ -18,12 +18,10 @@ from agbenchmark.start_benchmark import (
 from agbenchmark.utils import calculate_success_percentage


-def resolve_workspace(config: Dict[str, Any]) -> str:
-    if config.get("workspace", "").startswith("${") and config.get(
-        "workspace", ""
-    ).endswith("}"):
+def resolve_workspace(workspace: str) -> str:
+    if workspace.startswith("${") and workspace.endswith("}"):
         # Extract the string inside ${...}
-        path_expr = config["workspace"][2:-1]
+        path_expr = workspace[2:-1]

         # Check if it starts with "os.path.join"
         if path_expr.strip().startswith("os.path.join"):
@@ -35,7 +33,7 @@ def resolve_workspace(config: Dict[str, Any]) -> str:
         else:
             raise ValueError("Invalid workspace path expression.")
     else:
-        return os.path.abspath(Path(os.getcwd()) / config["workspace"])
+        return os.path.abspath(Path(os.getcwd()) / workspace)


 @pytest.fixture(scope="module")
@@ -45,10 +43,10 @@ def config(request: Any) -> None:
         config = json.load(f)

     if isinstance(config["workspace"], str):
-        config["workspace"] = resolve_workspace(config)
+        config["workspace"] = resolve_workspace(config["workspace"])
     else:  # it's a input output dict
-        config["workspace"]["input"] = resolve_workspace(config)
-        config["workspace"]["output"] = resolve_workspace(config)
+        config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"])
+        config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"])

     return config

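Passing the bare string into resolve_workspace is what lets the input/output form reuse the same resolver for each value. A self-contained sketch matching the refactored function (the `${...}` branch assumes the eval-based expansion performed by the elided lines):

```python
import os
from pathlib import Path


def resolve_workspace(workspace: str) -> str:
    if workspace.startswith("${") and workspace.endswith("}"):
        # Extract the string inside ${...}
        path_expr = workspace[2:-1]
        # Only "os.path.join(...)" expressions are accepted before evaluation.
        if path_expr.strip().startswith("os.path.join"):
            return str(eval(path_expr))  # assumed: the hidden lines eval the expression
        raise ValueError("Invalid workspace path expression.")
    return os.path.abspath(Path(os.getcwd()) / workspace)


print(resolve_workspace("${os.path.join(Path.home(), 'miniagi')}"))
# e.g. /home/user/miniagi on Linux
```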
@@ -173,18 +171,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         regression_manager.remove_test(test_name)
         info_details["metrics"]["fail_reason"] = str(call.excinfo.value)

-    prev_test_results: list[bool] = []
-
+    prev_test_results: list[bool] = internal_info.tests.get(test_name, [])
     if not mock:
         # only add if it's an actual test
-        prev_test_results = internal_info.tests.get(test_name, [])
         prev_test_results.append(info_details["metrics"]["success"])
         internal_info.add_test(test_name, prev_test_results)

-    # can calculate success rate regardless of mock
-    info_details["metrics"]["success_%"] = calculate_success_percentage(
-        prev_test_results
-    )
+        # can calculate success rate regardless of mock
+        info_details["metrics"]["success_%"] = calculate_success_percentage(
+            prev_test_results
+        )
+    else:
+        # can calculate success rate regardless of mock
+        info_details["metrics"][
+            "non_mock_success_%"
+        ] = calculate_success_percentage(prev_test_results)

     if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
         # if the last 3 tests were successful, add to the regression tests
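The reshaped block keeps a single per-test history and switches the reported key on the run type: real runs append the new result and update success_%, while mock runs only read the stored history into non_mock_success_%. A reduced sketch with a plain dict standing in for internal_info:

```python
from typing import Any, Dict, List

from agbenchmark.utils import calculate_success_percentage


def record_result(
    history: Dict[str, List[bool]],
    metrics: Dict[str, Any],
    test_name: str,
    success: bool,
    mock: bool,
) -> None:
    """Sketch of the branching above; `history` stands in for internal_info."""
    prev_test_results = history.get(test_name, [])
    if not mock:
        # only add if it's an actual test
        prev_test_results.append(success)
        history[test_name] = prev_test_results
        metrics["success_%"] = calculate_success_percentage(prev_test_results)
    else:
        # mock runs report the stored (non-mock) history instead
        metrics["non_mock_success_%"] = calculate_success_percentage(prev_test_results)
```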
@@ -62,6 +62,12 @@
     "TestWriteFile": [
         true,
         true,
-        true
+        true,
+        false,
+        false,
+        false,
+        false,
+        true,
+        false
     ]
 }
@@ -16,57 +16,52 @@
         "data_path": "agbenchmark/challenges/retrieval/r1"
     },
     "TestReadFile": {
-        "difficulty": "basic",
+        "difficulty": "interface",
         "dependencies": [
             "TestWriteFile"
         ],
         "data_path": "agbenchmark/challenges/interface/read_file"
     },
     "TestRememberMultipleIds": {
-        "difficulty": "basic",
+        "difficulty": "novice",
         "dependencies": [
             "TestBasicMemory"
         ],
         "data_path": "agbenchmark/challenges/memory/m2"
     },
     "TestRememberMultipleIdsWithNoise": {
-        "difficulty": "medium",
+        "difficulty": "intermediate",
         "dependencies": [
             "TestRememberMultipleIds"
         ],
         "data_path": "agbenchmark/challenges/memory/m3"
     },
     "TestRememberMultiplePhrasesWithNoise": {
-        "difficulty": "medium",
+        "difficulty": "advanced",
         "dependencies": [
             "TestRememberMultipleIdsWithNoise"
         ],
         "data_path": "agbenchmark/challenges/memory/m4"
     },
     "TestRetrieval2": {
-        "difficulty": "basic",
+        "difficulty": "novice",
         "dependencies": [
             "TestBasicRetrieval"
         ],
         "data_path": "agbenchmark/challenges/retrieval/r2"
     },
     "TestRetrieval3": {
-        "difficulty": "basic",
+        "difficulty": "intermediate",
         "dependencies": [
             "TestRetrieval2"
         ],
         "data_path": "agbenchmark/challenges/retrieval/r3"
     },
     "TestSearch": {
-        "difficulty": "basic",
+        "difficulty": "interface",
         "dependencies": [
             "TestWriteFile"
         ],
         "data_path": "agbenchmark/challenges/interface/search"
     },
     "TestWriteFile": {
         "difficulty": "basic",
         "dependencies": [],
         "data_path": "agbenchmark/challenges/interface/write_file"
     }
 }
@@ -1,8 +1,8 @@
 {
     "command": "agbenchmark start --mock",
-    "completion_time": "2023-07-11-21:09",
+    "completion_time": "2023-07-14-18:54",
     "metrics": {
-        "run_time": "0.96 seconds",
+        "run_time": "0.97 seconds",
         "highest_difficulty": "advanced: 5"
     },
     "tests": {
@@ -12,28 +12,28 @@
             "metrics": {
                 "difficulty": "interface",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.008 seconds"
+                "non_mock_success_%": 75.0,
+                "run_time": "0.007 seconds"
             }
         },
         "TestReadFile": {
             "data_path": "agbenchmark/challenges/interface/read_file",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "interface",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.005 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.008 seconds"
             }
         },
         "TestSearch": {
            "data_path": "agbenchmark/challenges/interface/search",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "interface",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.006 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.007 seconds"
             }
         },
         "TestDebugSimpleTypoWithGuidance": {
@@ -43,28 +43,28 @@
                 "difficulty": "basic",
                 "success": false,
                 "fail_reason": "assert 1 in [0.0]",
-                "success_%": 0,
-                "run_time": "0.489 seconds"
+                "non_mock_success_%": 0.0,
+                "run_time": "0.448 seconds"
             }
         },
         "TestBasicMemory": {
             "data_path": "agbenchmark/challenges/memory/m1",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "basic",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.02 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.028 seconds"
             }
         },
        "TestBasicRetrieval": {
             "data_path": "agbenchmark/challenges/retrieval/r1",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "basic",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.01 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.014 seconds"
             }
         },
         "TestDebugSimpleTypoWithoutGuidance": {
@@ -74,7 +74,7 @@
                 "difficulty": "novice",
                 "success": false,
                 "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
-                "success_%": 0,
+                "non_mock_success_%": 0.0,
                 "run_time": "0.001 seconds"
             }
         },
@@ -85,64 +85,63 @@
                 "difficulty": "advanced",
                 "success": false,
                 "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
-                "success_%": 0,
-                "run_time": "0.001 seconds"
+                "non_mock_success_%": 0.0,
+                "run_time": "0.002 seconds"
             }
         },
         "TestRememberMultipleIds": {
             "data_path": "agbenchmark/challenges/memory/m2",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "novice",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.018 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.023 seconds"
             }
         },
         "TestRetrieval2": {
             "data_path": "agbenchmark/challenges/retrieval/r2",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "novice",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.009 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.013 seconds"
             }
         },
         "TestRememberMultipleIdsWithNoise": {
             "data_path": "agbenchmark/challenges/memory/m3",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "intermediate",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.022 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.03 seconds"
             }
         },
         "TestRetrieval3": {
             "data_path": "agbenchmark/challenges/retrieval/r3",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "intermediate",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.01 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.016 seconds"
             }
         },
         "TestRememberMultiplePhrasesWithNoise": {
             "data_path": "agbenchmark/challenges/memory/m4",
-            "is_regression": false,
+            "is_regression": true,
             "metrics": {
                 "difficulty": "advanced",
                 "success": true,
-                "success_%": 0,
-                "run_time": "0.021 seconds"
+                "non_mock_success_%": 100.0,
+                "run_time": "0.034 seconds"
            }
         }
     },
     "config": {
         "workspace": "${os.path.join(Path.home(), 'miniagi')}",
-        "entry_path": "agbenchmark.benchmarks",
-        "cutoff": 60
+        "entry_path": "agbenchmark.benchmarks"
     }
 }
agbenchmark/start_benchmark.py
@@ -62,7 +62,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -

     config["entry_path"] = click.prompt(
         "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-        default="benchmarks.py",
+        default="agbenchmark/benchmarks.py",
     )

     config["cutoff"] = click.prompt(
agbenchmark/utils.py
@@ -1,6 +1,7 @@
 # radio charts, logs, helper functions for tests, anything else relevant.
 import glob
 import re
+from datetime import datetime
 from pathlib import Path
 from typing import Any

@@ -12,11 +13,13 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str:

     if not INFO_TESTS_PATH.exists():
         INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True)
-        return str(INFO_TESTS_PATH / "1.json")
+        return str(
+            INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json"
+        )
     else:
         json_files = glob.glob(str(INFO_TESTS_PATH / "*.json"))
         file_count = len(json_files)
-        run_name = f"{file_count + 1}.json"
+        run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
         new_file_path = INFO_TESTS_PATH / run_name
         return str(new_file_path)

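With the timestamp baked into the name, report files carry their creation time and no longer collide when the numeric counter restarts. For instance, a fourth run on July 14 at 18:54 comes out as:

```python
from datetime import datetime

file_count = 3  # three reports already present
run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json"
print(run_name)  # e.g. file4_07-14-18-54.json
```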
@@ -35,8 +38,10 @@ def replace_backslash(value: Any) -> Any:


 def calculate_success_percentage(results: list[bool]) -> float:
-    success_count = results.count(True)
-    total_count = len(results)
+    # Take the last 10 results or all if less than 10
+    last_results = results[-10:] if len(results) > 10 else results
+    success_count = last_results.count(True)
+    total_count = len(last_results)
     if total_count == 0:
         return 0
     success_percentage = (success_count / total_count) * 100  # as a percentage
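Windowing the calculation to the last ten results makes the percentage track recent behaviour instead of averaging over the whole history. A quick check (same body as the diff, with the elided return filled in as an assumption):

```python
def calculate_success_percentage(results: list[bool]) -> float:
    # Take the last 10 results or all if less than 10
    last_results = results[-10:] if len(results) > 10 else results
    success_count = last_results.count(True)
    total_count = len(last_results)
    if total_count == 0:
        return 0
    success_percentage = (success_count / total_count) * 100  # as a percentage
    return success_percentage  # assumed: the elided lines return this value


# Fifteen old failures then ten recent successes: the whole-history average
# would be 40%, but the windowed version reports 100%.
print(calculate_success_percentage([False] * 15 + [True] * 10))  # 100.0
```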
@@ -45,7 +50,7 @@ def calculate_success_percentage(results: list[bool]) -> float:

 def get_highest_success_difficulty(data: dict) -> str:
     highest_difficulty = None
-    highest_difficulty_level = -1
+    highest_difficulty_level = 0

     for test_name, test_data in data.items():
         if test_data["metrics"]["success"]:
@@ -1 +1 @@
-Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84
+Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16
@@ -1 +1 @@
-Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007
+Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b
@@ -1,5 +1,4 @@
 {
   "workspace": "projects/my-new-project/workspace",
-  "entry_path": "agbenchmark/benchmarks.py",
-  "cutoff": 60
+  "entry_path": "agbenchmark/benchmarks.py"
 }
@@ -1 +1 @@
-Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82
+Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0
@@ -1 +1 @@
-Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0
+Subproject commit 0f8eba95d284a9a06801b40ae02c55f65f1a0ce9
@@ -1 +1 @@
-Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898
+Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7