file naming when --test (#164)
parent
dffc1dfd51
commit
8aa6452cc4
|
@ -1,40 +1,72 @@
|
|||
{
|
||||
"mini-agi": {
|
||||
"TestBasicMemory": [true, true, true],
|
||||
"TestBasicRetrieval": [true, true, true],
|
||||
"TestCreateSimpleWebServer": [false, false, false],
|
||||
"TestDebugSimpleTypoWithGuidance": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithoutGuidance": [false, false, false],
|
||||
"TestReadFile": [true, true, true, true],
|
||||
"TestRememberMultipleIds": [true, true, true],
|
||||
"TestRememberMultipleIdsWithNoise": [true, true, true],
|
||||
"TestRememberMultiplePhrasesWithNoise": [true, true, true],
|
||||
"TestRetrieval2": [true, true, true],
|
||||
"TestRetrieval3": [true, true, true],
|
||||
"TestSearch": [true, true, true, true],
|
||||
"TestWriteFile": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
true
|
||||
]
|
||||
}
|
||||
}
|
||||
"mini-agi": {
|
||||
"TestBasicMemory": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestBasicRetrieval": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestCreateSimpleWebServer": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithGuidance": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestDebugSimpleTypoWithoutGuidance": [
|
||||
false,
|
||||
false,
|
||||
false
|
||||
],
|
||||
"TestReadFile": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIds": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultipleIdsWithNoise": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRememberMultiplePhrasesWithNoise": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval2": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestRetrieval3": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestSearch": [
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true
|
||||
],
|
||||
"TestWriteFile": [
|
||||
true,
|
||||
true,
|
||||
true
|
||||
]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
{
|
||||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": true,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"non_mock_success_%": 100.0,
|
||||
"run_time": "0.009 seconds"
|
||||
}
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-3.5-turbo"
|
||||
},
|
||||
"command": "agbenchmark start --test TestWriteFile",
|
||||
"completion_time": "2023-07-17-09:54",
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"metrics": {
|
||||
"run_time": "22.36 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 40.0,
|
||||
"run_time": "22.169 seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestWriteFile",
|
||||
"completion_time": "2023-07-15-22:13",
|
||||
"metrics": {
|
||||
"run_time": "12.4 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 50.0,
|
||||
"run_time": "12.127 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestReadFile",
|
||||
"completion_time": "2023-07-17-10:12",
|
||||
"metrics": {
|
||||
"run_time": "65.27 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestReadFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/read_file",
|
||||
"is_regression": true,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "65.074 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4",
|
||||
"reached_termination_time": true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestReadFile",
|
||||
"completion_time": "2023-07-15-22:13",
|
||||
"metrics": {
|
||||
"run_time": "31.2 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestReadFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/read_file",
|
||||
"is_regression": true,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "30.903 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestSearch",
|
||||
"completion_time": "2023-07-15-22:14",
|
||||
"metrics": {
|
||||
"run_time": "16.88 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestSearch": {
|
||||
"data_path": "agbenchmark/challenges/interface/search",
|
||||
"is_regression": true,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 100.0,
|
||||
"run_time": "16.572 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-15-22:16",
|
||||
"metrics": {
|
||||
"run_time": "45.92 seconds",
|
||||
"highest_difficulty": ": 0"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "45.599 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
|
||||
"completion_time": "2023-07-15-22:15",
|
||||
"metrics": {
|
||||
"run_time": "32.99 seconds",
|
||||
"highest_difficulty": ": 0"
|
||||
},
|
||||
"tests": {
|
||||
"TestDebugSimpleTypoWithGuidance": {
|
||||
"data_path": "agbenchmark/challenges/code/d1",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "basic",
|
||||
"success": false,
|
||||
"fail_reason": "assert 1 in [0.0]",
|
||||
"success_%": 0.0,
|
||||
"run_time": "32.582 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
|
||||
"entry_path": "agbenchmark.benchmarks"
|
||||
},
|
||||
"additional": {
|
||||
"model": "gpt-4"
|
||||
}
|
||||
}
|
|
@ -1,23 +0,0 @@
|
|||
{
|
||||
"command": "agbenchmark start --test TestWriteFile",
|
||||
"completion_time": "2023-07-16-13:07",
|
||||
"metrics": {
|
||||
"run_time": "13.91 seconds",
|
||||
"highest_difficulty": "interface: 1"
|
||||
},
|
||||
"tests": {
|
||||
"TestWriteFile": {
|
||||
"data_path": "agbenchmark/challenges/interface/write_file",
|
||||
"is_regression": false,
|
||||
"metrics": {
|
||||
"difficulty": "interface",
|
||||
"success": true,
|
||||
"success_%": 30.0,
|
||||
"run_time": "13.684 seconds"
|
||||
}
|
||||
}
|
||||
},
|
||||
"config": {
|
||||
"workspace": "${os.path.join(Path.home(), 'miniagi')}"
|
||||
}
|
||||
}
|
|
@ -1,7 +1,9 @@
|
|||
# radio charts, logs, helper functions for tests, anything else relevant.
|
||||
import glob
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV")
|
|||
|
||||
|
||||
def calculate_info_test_path(reports_path: Path) -> str:
    """Return the path (as a string) of the next report file in *reports_path*.

    The directory is created if it does not exist yet. The file name depends
    on the command line (``sys.argv``):

    * Without ``--test``: ``file<N>_<mm-dd-HH-MM>.json`` where ``N`` is the
      number of ``*.json`` files already present plus one.
    * With ``--test <name>`` and no earlier report for ``<name>``:
      ``<P>_<name>.json`` where ``P`` is one more than the highest numeric
      prefix among the existing reports (1 when there is none).
    * With ``--test <name>`` and earlier reports for ``<name>``:
      ``<P>.<k>_<name>.json`` where ``P`` is the prefix those reports already
      use and ``k`` is the number of earlier reports, advanced further if
      that name is already taken.

    Raises:
        ValueError: if ``--test`` is the last command-line argument.
    """
    command = sys.argv

    # exist_ok makes a separate exists() check unnecessary.
    reports_path.mkdir(parents=True, exist_ok=True)

    json_files = glob.glob(str(reports_path / "*.json"))

    if "--test" not in command:
        # Default naming scheme: running counter plus a timestamp.
        timestamp = datetime.now().strftime("%m-%d-%H-%M")
        return str(reports_path / f"file{len(json_files) + 1}_{timestamp}.json")

    test_index = command.index("--test")
    try:
        test_arg = command[test_index + 1]  # Argument after --test
    except IndexError:
        raise ValueError("Expected an argument after --test") from None

    highest_prefix = 0
    related_prefixes = []
    for file_name in json_files:
        # Report stems look like "<prefix>_<name>", e.g. "1.2_TestWriteFile".
        prefix_str, _, name = Path(file_name).stem.partition("_")
        try:
            prefix = math.floor(float(prefix_str))
        except (ValueError, OverflowError):
            # Default-named reports ("file3_<timestamp>") carry no numeric
            # prefix; they must not abort the naming of a --test report.
            continue
        highest_prefix = max(highest_prefix, prefix)
        # Compare the whole name: a substring test would make "TestRetrieval"
        # pick up "TestRetrieval2" reports (or a matching directory name).
        if name == test_arg:
            related_prefixes.append(prefix)

    if not related_prefixes:
        # First report for this test: take the next free whole-number prefix.
        run_name = f"{highest_prefix + 1}_{test_arg}.json"
    else:
        # Re-run: keep the test's prefix and append ".<k>". min() keeps the
        # choice independent of the order glob happens to return files in.
        prefix = min(related_prefixes)
        sub_index = len(related_prefixes)
        run_name = f"{prefix}.{sub_index}_{test_arg}.json"
        # Never hand out the name of a report that is already on disk.
        while (reports_path / run_name).exists():
            sub_index += 1
            run_name = f"{prefix}.{sub_index}_{test_arg}.json"

    return str(reports_path / run_name)
|
||||
|
||||
|
||||
def replace_backslash(value: Any) -> Any:
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011
|
||||
Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d
|
Loading…
Reference in New Issue