internal_info.json dynamic changes (#163)
parent
ce4cefe7e7
commit
dffc1dfd51
|
@ -3,7 +3,7 @@ import os
|
|||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from agbenchmark.utils import get_highest_success_difficulty
|
||||
|
||||
|
@ -37,8 +37,18 @@ class ReportManager:
|
|||
with open(self.filename, "w") as f:
|
||||
json.dump(self.tests, f, indent=4)
|
||||
|
||||
def add_test(self, test_name: str, test_details: dict | list) -> None:
|
||||
self.tests[test_name] = test_details
|
||||
def add_test(
|
||||
self,
|
||||
test_name: str,
|
||||
test_details: dict | list,
|
||||
agent_name: Optional[str] = None,
|
||||
) -> None:
|
||||
if agent_name:
|
||||
if agent_name not in self.tests:
|
||||
self.tests[agent_name] = {}
|
||||
self.tests[agent_name][test_name] = test_details
|
||||
else:
|
||||
self.tests[test_name] = test_details
|
||||
|
||||
self.save()
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ from agbenchmark.start_benchmark import (
|
|||
REGRESSION_TESTS_PATH,
|
||||
get_regression_data,
|
||||
)
|
||||
from agbenchmark.utils import calculate_success_percentage
|
||||
from agbenchmark.utils import AGENT_NAME, calculate_success_percentage
|
||||
|
||||
|
||||
def resolve_workspace(workspace: str) -> str:
|
||||
|
@ -128,9 +128,10 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH)
|
|||
# user facing reporting information
|
||||
info_manager = ReportManager(INFO_TESTS_PATH)
|
||||
|
||||
INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py
|
||||
INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports"
|
||||
|
||||
# internal stand-in for a database: tracks each test's pass/fail history
|
||||
internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json"))
|
||||
internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json"))
|
||||
|
||||
|
||||
def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
||||
|
@ -171,11 +172,22 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
|
|||
regression_manager.remove_test(test_name)
|
||||
info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
|
||||
|
||||
prev_test_results: list[bool] = internal_info.tests.get(test_name, [])
|
||||
prev_test_results: list[bool]
|
||||
agent_tests: dict[str, list[bool]] = {}
|
||||
|
||||
# if the structure is nested inside of the agent name
|
||||
if AGENT_NAME:
|
||||
agent_tests = internal_info.tests.get(AGENT_NAME, {})
|
||||
|
||||
if agent_tests:
|
||||
prev_test_results = agent_tests.get(test_name, [])
|
||||
else:
|
||||
prev_test_results = internal_info.tests.get(test_name, [])
|
||||
|
||||
if not mock:
|
||||
# only add if it's an actual test
|
||||
prev_test_results.append(info_details["metrics"]["success"])
|
||||
internal_info.add_test(test_name, prev_test_results)
|
||||
internal_info.add_test(test_name, prev_test_results, AGENT_NAME)
|
||||
|
||||
# can calculate success rate regardless of mock
|
||||
info_details["metrics"]["success_%"] = calculate_success_percentage(
|
||||
|
|
|
@ -1,83 +0,0 @@
|
|||
{
|
||||
"TestBasicMemory": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestBasicRetrieval": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestCreateSimpleWebServer": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithGuidance": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithoutGuidance": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestReadFile": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIds": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIdsWithNoise": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultiplePhrasesWithNoise": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval2": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval3": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestSearch": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestWriteFile": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true
|
||||
]
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
{
|
||||
"mini-agi": {
|
||||
"TestBasicMemory": [true, true, true],
|
||||
"TestBasicRetrieval": [true, true, true],
|
||||
"TestCreateSimpleWebServer": [false, false, false],
|
||||
"TestDebugSimpleTypoWithGuidance": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithoutGuidance": [false, false, false],
|
||||
"TestReadFile": [true, true, true, true],
|
||||
"TestRememberMultipleIds": [true, true, true],
|
||||
"TestRememberMultipleIdsWithNoise": [true, true, true],
|
||||
"TestRememberMultiplePhrasesWithNoise": [true, true, true],
|
||||
"TestRetrieval2": [true, true, true],
|
||||
"TestRetrieval3": [true, true, true],
|
||||
"TestSearch": [true, true, true, true],
|
||||
"TestWriteFile": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true
|
||||
]
|
||||
}
|
||||
}
|
|
@ -17,7 +17,6 @@ HOME_ENV = os.getenv("HOME_ENV")
|
|||
|
||||
|
||||
def calculate_info_test_path(reports_path: Path) -> str:
|
||||
print("reports_pathreports_pathreports_pathreports_path", reports_path)
|
||||
if not reports_path.exists():
|
||||
reports_path.mkdir(parents=True, exist_ok=True)
|
||||
return str(
|
||||
|
@ -129,6 +128,7 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
|
|||
CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths(
|
||||
benchmarks_folder_path
|
||||
)
|
||||
|
||||
else:
|
||||
# otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo)
|
||||
# used when it's just a pip install
|
||||
|
@ -139,4 +139,9 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]:
|
|||
if not benchmarks_folder_path.exists():
|
||||
benchmarks_folder_path.mkdir(exist_ok=True)
|
||||
|
||||
return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH
|
||||
return (
|
||||
HOME_DIRECTORY,
|
||||
CONFIG_PATH,
|
||||
REGRESSION_TESTS_PATH,
|
||||
INFO_TESTS_PATH,
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue