file naming when --test (#164)
parent
dffc1dfd51
commit
8aa6452cc4
|
@ -1,40 +1,72 @@
|
||||||
{
|
{
|
||||||
"mini-agi": {
|
"mini-agi": {
|
||||||
"TestBasicMemory": [true, true, true],
|
"TestBasicMemory": [
|
||||||
"TestBasicRetrieval": [true, true, true],
|
true,
|
||||||
"TestCreateSimpleWebServer": [false, false, false],
|
true,
|
||||||
"TestDebugSimpleTypoWithGuidance": [
|
true
|
||||||
false,
|
],
|
||||||
false,
|
"TestBasicRetrieval": [
|
||||||
false,
|
true,
|
||||||
false,
|
true,
|
||||||
false,
|
true
|
||||||
false
|
],
|
||||||
],
|
"TestCreateSimpleWebServer": [
|
||||||
"TestDebugSimpleTypoWithoutGuidance": [false, false, false],
|
false,
|
||||||
"TestReadFile": [true, true, true, true],
|
false,
|
||||||
"TestRememberMultipleIds": [true, true, true],
|
false
|
||||||
"TestRememberMultipleIdsWithNoise": [true, true, true],
|
],
|
||||||
"TestRememberMultiplePhrasesWithNoise": [true, true, true],
|
"TestDebugSimpleTypoWithGuidance": [
|
||||||
"TestRetrieval2": [true, true, true],
|
false,
|
||||||
"TestRetrieval3": [true, true, true],
|
false,
|
||||||
"TestSearch": [true, true, true, true],
|
false
|
||||||
"TestWriteFile": [
|
],
|
||||||
true,
|
"TestDebugSimpleTypoWithoutGuidance": [
|
||||||
true,
|
false,
|
||||||
true,
|
false,
|
||||||
false,
|
false
|
||||||
false,
|
],
|
||||||
false,
|
"TestReadFile": [
|
||||||
false,
|
true,
|
||||||
true,
|
true,
|
||||||
false,
|
true,
|
||||||
true,
|
true,
|
||||||
false,
|
true
|
||||||
false,
|
],
|
||||||
false,
|
"TestRememberMultipleIds": [
|
||||||
false,
|
true,
|
||||||
true
|
true,
|
||||||
]
|
true
|
||||||
}
|
],
|
||||||
}
|
"TestRememberMultipleIdsWithNoise": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true
|
||||||
|
],
|
||||||
|
"TestRememberMultiplePhrasesWithNoise": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true
|
||||||
|
],
|
||||||
|
"TestRetrieval2": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true
|
||||||
|
],
|
||||||
|
"TestRetrieval3": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true
|
||||||
|
],
|
||||||
|
"TestSearch": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true
|
||||||
|
],
|
||||||
|
"TestWriteFile": [
|
||||||
|
true,
|
||||||
|
true,
|
||||||
|
true
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
{
|
||||||
|
"TestWriteFile": {
|
||||||
|
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||||
|
"is_regression": true,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "interface",
|
||||||
|
"success": true,
|
||||||
|
"non_mock_success_%": 100.0,
|
||||||
|
"run_time": "0.009 seconds"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-3.5-turbo"
|
||||||
|
},
|
||||||
|
"command": "agbenchmark start --test TestWriteFile",
|
||||||
|
"completion_time": "2023-07-17-09:54",
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||||
|
},
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "22.36 seconds",
|
||||||
|
"highest_difficulty": "interface: 1"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestWriteFile": {
|
||||||
|
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||||
|
"is_regression": false,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "interface",
|
||||||
|
"success": true,
|
||||||
|
"success_%": 40.0,
|
||||||
|
"run_time": "22.169 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
"command": "agbenchmark start --test TestWriteFile",
|
||||||
|
"completion_time": "2023-07-15-22:13",
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "12.4 seconds",
|
||||||
|
"highest_difficulty": "interface: 1"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestWriteFile": {
|
||||||
|
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||||
|
"is_regression": false,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "interface",
|
||||||
|
"success": true,
|
||||||
|
"success_%": 50.0,
|
||||||
|
"run_time": "12.127 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||||
|
"entry_path": "agbenchmark.benchmarks"
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-4"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
"command": "agbenchmark start --test TestReadFile",
|
||||||
|
"completion_time": "2023-07-17-10:12",
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "65.27 seconds",
|
||||||
|
"highest_difficulty": "interface: 1"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestReadFile": {
|
||||||
|
"data_path": "agbenchmark/challenges/interface/read_file",
|
||||||
|
"is_regression": true,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "interface",
|
||||||
|
"success": true,
|
||||||
|
"success_%": 100.0,
|
||||||
|
"run_time": "65.074 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-4",
|
||||||
|
"reached_termination_time": true
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
"command": "agbenchmark start --test TestReadFile",
|
||||||
|
"completion_time": "2023-07-15-22:13",
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "31.2 seconds",
|
||||||
|
"highest_difficulty": "interface: 1"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestReadFile": {
|
||||||
|
"data_path": "agbenchmark/challenges/interface/read_file",
|
||||||
|
"is_regression": true,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "interface",
|
||||||
|
"success": true,
|
||||||
|
"success_%": 100.0,
|
||||||
|
"run_time": "30.903 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||||
|
"entry_path": "agbenchmark.benchmarks"
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-4"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
"command": "agbenchmark start --test TestSearch",
|
||||||
|
"completion_time": "2023-07-15-22:14",
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "16.88 seconds",
|
||||||
|
"highest_difficulty": "interface: 1"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestSearch": {
|
||||||
|
"data_path": "agbenchmark/challenges/interface/search",
|
||||||
|
"is_regression": true,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "interface",
|
||||||
|
"success": true,
|
||||||
|
"success_%": 100.0,
|
||||||
|
"run_time": "16.572 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||||
|
"entry_path": "agbenchmark.benchmarks"
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-4"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
{
|
||||||
|
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||||
|
"completion_time": "2023-07-15-22:16",
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "45.92 seconds",
|
||||||
|
"highest_difficulty": ": 0"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestDebugSimpleTypoWithGuidance": {
|
||||||
|
"data_path": "agbenchmark/challenges/code/d1",
|
||||||
|
"is_regression": false,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "basic",
|
||||||
|
"success": false,
|
||||||
|
"fail_reason": "assert 1 in [0.0]",
|
||||||
|
"success_%": 0.0,
|
||||||
|
"run_time": "45.599 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||||
|
"entry_path": "agbenchmark.benchmarks"
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-4"
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
{
|
||||||
|
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||||
|
"completion_time": "2023-07-15-22:15",
|
||||||
|
"metrics": {
|
||||||
|
"run_time": "32.99 seconds",
|
||||||
|
"highest_difficulty": ": 0"
|
||||||
|
},
|
||||||
|
"tests": {
|
||||||
|
"TestDebugSimpleTypoWithGuidance": {
|
||||||
|
"data_path": "agbenchmark/challenges/code/d1",
|
||||||
|
"is_regression": false,
|
||||||
|
"metrics": {
|
||||||
|
"difficulty": "basic",
|
||||||
|
"success": false,
|
||||||
|
"fail_reason": "assert 1 in [0.0]",
|
||||||
|
"success_%": 0.0,
|
||||||
|
"run_time": "32.582 seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"config": {
|
||||||
|
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||||
|
"entry_path": "agbenchmark.benchmarks"
|
||||||
|
},
|
||||||
|
"additional": {
|
||||||
|
"model": "gpt-4"
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,23 +0,0 @@
|
||||||
{
|
|
||||||
"command": "agbenchmark start --test TestWriteFile",
|
|
||||||
"completion_time": "2023-07-16-13:07",
|
|
||||||
"metrics": {
|
|
||||||
"run_time": "13.91 seconds",
|
|
||||||
"highest_difficulty": "interface: 1"
|
|
||||||
},
|
|
||||||
"tests": {
|
|
||||||
"TestWriteFile": {
|
|
||||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
|
||||||
"is_regression": false,
|
|
||||||
"metrics": {
|
|
||||||
"difficulty": "interface",
|
|
||||||
"success": true,
|
|
||||||
"success_%": 30.0,
|
|
||||||
"run_time": "13.684 seconds"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"config": {
|
|
||||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,7 +1,9 @@
|
||||||
# radio charts, logs, helper functions for tests, anything else relevant.
|
# radio charts, logs, helper functions for tests, anything else relevant.
|
||||||
import glob
|
import glob
|
||||||
|
import math
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV")
|
||||||
|
|
||||||
|
|
||||||
def calculate_info_test_path(reports_path: Path) -> str:
    """Return the path of the next JSON report file inside ``reports_path``.

    The directory is created when it does not exist yet. The file name
    depends on whether ``--test`` is present in ``sys.argv``:

    * Without ``--test`` the name is ``file<N>_<MM-DD-HH-MM>.json`` where
      ``N`` is the number of ``*.json`` files already present plus one.
    * With ``--test <name>`` and no earlier report for ``<name>`` the name is
      ``<P>_<name>.json`` where ``P`` is one more than the highest numeric
      prefix among the existing numbered reports (``1`` if there are none).
    * With ``--test <name>`` and ``K`` earlier reports for ``<name>`` the
      name is ``<P>.<K>_<name>.json``, reusing the prefix ``P`` of those
      earlier reports.

    Args:
        reports_path: Directory holding the report files.

    Returns:
        The new report path as a string. The file itself is not created.

    Raises:
        ValueError: If ``--test`` is the last command-line argument.
    """
    argv = sys.argv

    if not reports_path.exists():
        reports_path.mkdir(parents=True, exist_ok=True)

    # Sorted so that the result does not depend on glob's arbitrary ordering.
    json_files = sorted(glob.glob(str(reports_path / "*.json")))

    if "--test" not in argv:
        # Default naming scheme.
        timestamp = datetime.now().strftime("%m-%d-%H-%M")
        return str(reports_path / f"file{len(json_files) + 1}_{timestamp}.json")

    try:
        test_arg = argv[argv.index("--test") + 1]  # Argument after --test
    except IndexError as err:
        raise ValueError("Expected an argument after --test") from err

    # Stems of numbered reports look like "3_TestName" or "3.2_TestName".
    # Files using the default "file<N>_<timestamp>" scheme do not match and
    # are ignored here instead of crashing the numeric conversion.
    numbered_stem = re.compile(r"(\d+)(?:\.\d+)?_(.+)")

    all_prefixes = []
    related_prefixes = []
    for file_name in json_files:
        match = numbered_stem.fullmatch(Path(file_name).stem)
        if match is None:
            continue
        prefix = int(match.group(1))
        all_prefixes.append(prefix)
        # Compare the whole test name: a substring check would also pick up
        # reports of tests whose name merely starts with ``test_arg`` or
        # directories that happen to contain it.
        if match.group(2) == test_arg:
            related_prefixes.append(prefix)

    if related_prefixes:
        # Re-run of a known test: keep its prefix, append the run counter.
        run_name = f"{related_prefixes[0]}.{len(related_prefixes)}_{test_arg}.json"
    else:
        # First report for this test: take the next free integer prefix.
        run_name = f"{max(all_prefixes, default=0) + 1}_{test_arg}.json"

    return str(reports_path / run_name)
|
||||||
|
|
||||||
|
|
||||||
def replace_backslash(value: Any) -> Any:
|
def replace_backslash(value: Any) -> Any:
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011
|
Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d
|
Loading…
Reference in New Issue