internal_info.json dynamic changes (#163)
parent
ce4cefe7e7
commit
dffc1dfd51
|
@ -3,7 +3,7 @@ import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
from agbenchmark.utils import get_highest_success_difficulty
|
from agbenchmark.utils import get_highest_success_difficulty
|
||||||
|
|
||||||
|
@ -37,8 +37,18 @@ class ReportManager:
|
||||||
with open(self.filename, "w") as f:
|
with open(self.filename, "w") as f:
|
||||||
json.dump(self.tests, f, indent=4)
|
json.dump(self.tests, f, indent=4)
|
||||||
|
|
||||||
def add_test(self, test_name: str, test_details: dict | list) -> None:
|
def add_test(
|
||||||
self.tests[test_name] = test_details
|
self,
|
||||||
|
test_name: str,
|
||||||
|
test_details: dict | list,
|
||||||
|
agent_name: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
if agent_name:
|
||||||
|
if agent_name not in self.tests:
|
||||||
|
self.tests[agent_name] = {}
|
||||||
|
self.tests[agent_name][test_name] = test_details
|
||||||
|
else:
|
||||||
|
self.tests[test_name] = test_details
|
||||||
|
|
||||||
self.save()
|
self.save()
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ from agbenchmark.start_benchmark import (
|
||||||
REGRESSION_TESTS_PATH,
|
REGRESSION_TESTS_PATH,
|
||||||
get_regression_data,
|
get_regression_data,
|
||||||
)
|
)
|
||||||
from agbenchmark.utils import calculate_success_percentage
|
from agbenchmark.utils import AGENT_NAME, calculate_success_percentage
|
||||||
|
|
||||||
|
|
||||||
def resolve_workspace(workspace: str) -> str:
|
def resolve_workspace(workspace: str) -> str:
|
||||||
|
@ -128,9 +128,10 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH)
|
||||||
# user facing reporting information
|
# user facing reporting information
|
||||||
info_manager = ReportManager(INFO_TESTS_PATH)
|
info_manager = ReportManager(INFO_TESTS_PATH)
|
||||||
|
|
||||||
INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py
|
INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports"
|
||||||
|
|
||||||
# internal db step in replacement track pass/fail rate
|
# internal db step in replacement track pass/fail rate
|
||||||
internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json"))
|
internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
|
||||||
|
|
||||||
|
|
||||||
def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
||||||
|
@ -171,11 +172,22 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
||||||
regression_manager.remove_test(test_name)
|
regression_manager.remove_test(test_name)
|
||||||
info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
|
info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
|
||||||
|
|
||||||
prev_test_results: list[bool] = internal_info.tests.get(test_name, [])
|
prev_test_results: list[bool]
|
||||||
|
agent_tests: dict[str, list[bool]] = {}
|
||||||
|
|
||||||
|
# if the structure is nested inside of the agent name
|
||||||
|
if AGENT_NAME:
|
||||||
|
agent_tests = internal_info.tests.get(AGENT_NAME, {})
|
||||||
|
|
||||||
|
if agent_tests:
|
||||||
|
prev_test_results = agent_tests.get(test_name, [])
|
||||||
|
else:
|
||||||
|
prev_test_results = internal_info.tests.get(test_name, [])
|
||||||
|
|
||||||
if not mock:
|
if not mock:
|
||||||
# only add if it's an actual test
|
# only add if it's an actual test
|
||||||
prev_test_results.append(info_details["metrics"]["success"])
|
prev_test_results.append(info_details["metrics"]["success"])
|
||||||
internal_info.add_test(test_name, prev_test_results)
|
internal_info.add_test(test_name, prev_test_results, AGENT_NAME)
|
||||||
|
|
||||||
# can calculate success rate regardless of mock
|
# can calculate success rate regardless of mock
|
||||||
info_details["metrics"]["success_%"] = calculate_success_percentage(
|
info_details["metrics"]["success_%"] = calculate_success_percentage(
|
||||||
|
|
|
@ -1,83 +0,0 @@
|
||||||
{
|
|
||||||
"TestBasicMemory": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestBasicRetrieval": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestCreateSimpleWebServer": [
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false
|
|
||||||
],
|
|
||||||
"TestDebugSimpleTypoWithGuidance": [
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false
|
|
||||||
],
|
|
||||||
"TestDebugSimpleTypoWithoutGuidance": [
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false
|
|
||||||
],
|
|
||||||
"TestReadFile": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestRememberMultipleIds": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestRememberMultipleIdsWithNoise": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestRememberMultiplePhrasesWithNoise": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestRetrieval2": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestRetrieval3": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestSearch": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true
|
|
||||||
],
|
|
||||||
"TestWriteFile": [
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
true,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
true,
|
|
||||||
false,
|
|
||||||
true,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
false,
|
|
||||||
true
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
{
|
||||||
|
"mini-agi": {
|
||||||
|
"TestBasicMemory": [true, true, true],
|
||||||
|
"TestBasicRetrieval": [true, true, true],
|
||||||
|
"TestCreateSimpleWebServer": [false, false, false],
|
||||||
|
"TestDebugSimpleTypoWithGuidance": [
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false
|
||||||
|
],
|
||||||
|
"TestDebugSimpleTypoWithoutGuidance": [false, false, false],
|
||||||
|
"TestReadFile": [true, true, true, true],
|
||||||
|
"TestRememberMultipleIds": [true, true, true],
|
||||||
|
"TestRememberMultipleIdsWithNoise": [true, true, true],
|
||||||
|
"TestRememberMultiplePhrasesWithNoise": [true, true, true],
|
||||||
|
"TestRetrieval2": [true, true, true],
|
||||||
|
"TestRetrieval3": [true, true, true],
|
||||||
|
"TestSearch": [true, true, true, true],
|
||||||
|
"TestWriteFile": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
true
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,7 +17,6 @@ HOME_ENV = os.getenv("HOME_ENV")
|
||||||
|
|
||||||
|
|
||||||
def calculate_info_test_path(reports_path: Path) -> str:
|
def calculate_info_test_path(reports_path: Path) -> str:
|
||||||
print("reports_pathreports_pathreports_pathreports_path", reports_path)
|
|
||||||
if not reports_path.exists():
|
if not reports_path.exists():
|
||||||
reports_path.mkdir(parents=True, exist_ok=True)
|
reports_path.mkdir(parents=True, exist_ok=True)
|
||||||
return str(
|
return str(
|
||||||
|
@ -129,6 +128,7 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
|
||||||
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
|
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
|
||||||
benchmarks_folder_path
|
benchmarks_folder_path
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
|
# otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
|
||||||
# used when its just a pip install
|
# used when its just a pip install
|
||||||
|
@ -139,4 +139,9 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
|
||||||
if not benchmarks_folder_path.exists():
|
if not benchmarks_folder_path.exists():
|
||||||
benchmarks_folder_path.mkdir(exist_ok=True)
|
benchmarks_folder_path.mkdir(exist_ok=True)
|
||||||
|
|
||||||
return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH
|
return (
|
||||||
|
HOME_DIRECTORY,
|
||||||
|
CONFIG_PATH,
|
||||||
|
REGRESSION_TESTS_PATH,
|
||||||
|
INFO_TESTS_PATH,
|
||||||
|
)
|
||||||
|
|
Loading…
Reference in New Issue