clean(benchmark): Remove Deprecated Challenges (#7144)

* Remove deprecated challenges

* Update license and pyproject.toml
pull/7160/head
Swifty 2024-05-20 15:01:36 +02:00 committed by GitHub
parent e8d7dfa386
commit 2cca4fa47f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
164 changed files with 2 additions and 2312 deletions

View File

@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2023 Silen Naihin Copyright (c) 2024 AutoGPT
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@ -1,30 +0,0 @@
{
"category": [
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
"TestBasicRetrieval"
],
"eval_id": "2d64d7a5-d664-4b86-9921-0b5e3aa9cf91",
"ground": {
"answer": "It was $81.462 billion in 2022.",
"eval": {
"type": "file"
},
"files": [
".txt"
],
"should_contain": [
"81"
],
"should_not_contain": []
},
"info": {
"description": "A no guardrails search for info",
"difficulty": "novice",
"side_effects": []
},
"name": "RevenueRetrieval1.0",
"task": "Write tesla's revenue in 2022 into a .txt file."
}

View File

@ -1,31 +0,0 @@
{
"category": [
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
"TestRevenueRetrieval1.0"
],
"eval_id": "b79898bb-263a-4184-8e4d-0aa52838bfdb",
"ground": {
"answer": "It was $81.462 billion in 2022.",
"eval": {
"type": "file"
},
"files": [
".txt"
],
"should_contain": [
"81",
"462"
],
"should_not_contain": []
},
"info": {
"description": "This one checks the accuracy of the information over r2",
"difficulty": "novice",
"side_effects": []
},
"name": "RevenueRetrieval1.1",
"task": "Write Tesla's revenue in 2022, rounded to the nearest million dollars, into a .txt file."
}

View File

@ -1,30 +0,0 @@
{
"category": [
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
"TestRevenueRetrieval1.1"
],
"eval_id": "838128f9-79ee-45cf-8a8f-c19b0d576a76",
"ground": {
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
"eval": {
"type": "file"
},
"files": [
".txt"
],
"should_contain": [
"81,462"
],
"should_not_contain": []
},
"info": {
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
"difficulty": "intermediate",
"side_effects": []
},
"name": "DeprecatedRevenueRetrieval1.2",
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million)."
}

View File

@ -1,12 +0,0 @@
from typing import List, Optional
# Hash-map two-sum: intended to return [earlier_index, current_index] for the
# first entry of `nums` whose complement (target - entry) was already seen, or
# None when no such pair exists.
def two_sum(nums: List, target: int) -> Optional[List[int]]:
# Maps each value visited so far to the index it was seen at.
seen = {}
for i, num in enumerate(nums):
# NOTE(review): the bare name below is not defined anywhere in this snippet, so
# reaching it raises NameError. Presumably the deliberate defect the debug
# challenge asks the agent to remove -- confirm before "fixing" it here.
typo
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,11 +0,0 @@
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of two entries of ``nums`` that add up to ``target``.

    Scans left to right while remembering the index of every value seen so
    far, and returns ``[earlier_index, current_index]`` for the first entry
    whose complement has already been seen. Returns ``None`` if no pair exists.
    """
    index_of = {}
    for position, value in enumerate(nums):
        earlier = index_of.get(target - value)
        if earlier is not None:
            return [earlier, position]
        index_of[value] = position
    return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,32 +0,0 @@
{
"category": [
"adaptability"
],
"cutoff": 75,
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"eval_id": "38671c68-89ea-4c51-92a5-1bc35a033c49",
"ground": {
"answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1]",
"[2, 5]",
"[0, 3]"
],
"should_not_contain": []
},
"info": {
"description": "A version of the debug challenge where the wrong files to test and change are given. The llm must figure the challenges out",
"difficulty": "intermediate",
"side_effects": []
},
"name": "AdaptSimpleTypoWithGuidance",
"task": "1- Run sample_code.py.\n2- Read test.py.\n3- Modify test.py.\nRepeat step 1, 2 and 3 until sample_code.py runs without errors.\n"
}

View File

@ -1,30 +0,0 @@
{
"category": [
"adaptability"
],
"cutoff": 60,
"dependencies": [
"TestRevenueRetrieval1.0"
],
"eval_id": "9d4894d8-6f7c-465a-bc91-ca79a21b6ca3",
"ground": {
"answer": "It was $81.462 billion in 2022.",
"eval": {
"type": "file"
},
"files": [
".txt"
],
"should_contain": [
"81"
],
"should_not_contain": []
},
"info": {
"description": "Given the context, adapt to the word 'revenue' being abbreviated to 'r' in the task.",
"difficulty": "intermediate",
"side_effects": []
},
"name": "AdaptTeslaRevenue",
"task": "I'm a financial planner, please help me write tesla's r in 2022 into a .txt file."
}

View File

@ -1,31 +0,0 @@
{
"category": [
"adaptability",
"scrape_synthesize"
],
"cutoff": 60,
"dependencies": [
"TestBasicRetrieval"
],
"eval_id": "261ee06f-a7b0-4d5c-bf92-3197763caba6",
"ground": {
"answer": "\u00a325.89",
"eval": {
"type": "file"
},
"files": [
".txt"
],
"should_contain": [
"25.89"
],
"should_not_contain": []
},
"info": {
"description": "An advanced version of this -> remove.html as well. Same as TestBasicRetrieval but link is slightly broken, supposed to be http:// at the start.",
"difficulty": "novice",
"side_effects": []
},
"name": "AdaptLink",
"task": "Write the price of the book in this url 'books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file."
}

View File

@ -1,29 +0,0 @@
<!DOCTYPE html>
<!-- Static page listing four animals; clicking the "Dog" entry writes a message into the #info div. -->
<html>
<head>
<title>List of Animals</title>
</head>
<body>
<h2>List of Animals</h2>
<ul>
<!-- Only the dog entry carries an id; the click handler below looks it up by that id. -->
<li id="dog">Dog</li>
<li>Cat</li>
<li>Rabbit</li>
<li>Horse</li>
</ul>
<!-- Empty until the dog entry is clicked. -->
<div id="info"></div>
<script>
// On click of the #dog list item, replace the contents of #info with the message.
document.getElementById("dog").addEventListener("click", function() {
document.getElementById("info").innerHTML = "Dogs are known as man's best friend!";
});
</script>
</body>
</html>

View File

@ -1,48 +0,0 @@
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Browser check for animal_list.html: open the page in headless Chrome, click
# the "dog" list item and verify the message that appears in the "info" div.

# Path to the HTML file, expected to sit next to this script.
current_path = os.path.abspath(__file__)
current_directory = os.path.dirname(current_path)
file_path = f"file://{current_directory}/animal_list.html"

# Create a new headless instance of the Chrome driver.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1024x768")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=chrome_options)

try:
    # Navigate to the HTML file
    driver.get(file_path)

    # Wait for up to 10 seconds for the "dog" element to be available
    wait = WebDriverWait(driver, 10)
    dog_li = wait.until(EC.presence_of_element_located((By.ID, "dog")))

    # Click on the "dog" list item
    dog_li.click()

    # Find the "info" div and get its text
    info_div = driver.find_element(By.ID, "info")
    info_text = info_div.text

    # Assert that the text is what we expect
    assert info_text == "Dogs are known as man's best friend!"
    print(" passed!")
finally:
    # Always close the browser, even when the wait times out or the assertion
    # fails; otherwise the headless Chrome process is left running.
    driver.quit()

View File

@ -1,28 +0,0 @@
{
"category": [
"code"
],
"cutoff": 90,
"dependencies": [
"TestWritingCLIFileOrganizer"
],
"eval_id": "94ef736e-c2f1-4fa9-8cbf-a1c0873ee1ee",
"ground": {
"answer": "A web app where we can list animals and have details about dogs.",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [],
"should_not_contain": []
},
"info": {
"description": "Tests the agent's ability to build a basic html app.",
"difficulty": "basic",
"side_effects": []
},
"name": "WebAppListAnimals",
"task": "Build a web page with a list of animals. When someone clicks on the word 'Dog', a message should appear that says 'Dogs are known as man's best friend!'. You'll need to make a list with the name 'Dog' and then write a little bit of JavaScript to make the message appear when the name is clicked. Mark the div containing dog with the id 'dog'. Put the message inside a <div> with the id 'info'. Create a single html file called animal_list.html."
}

View File

@ -1,23 +0,0 @@
import random
import string
def generate_password(length: int = 8) -> str:
    """Return a random password of ``length`` characters.

    The password always contains at least one lowercase letter, one uppercase
    letter, one digit and one punctuation character. The remaining positions
    are drawn from all of those classes and the result is shuffled so the
    guaranteed characters do not sit at fixed positions.

    Args:
        length: Desired password length, 8 to 16 inclusive. Defaults to 8.

    Returns:
        The generated password.

    Raises:
        ValueError: If ``length`` is not an integer between 8 and 16.
    """
    if not isinstance(length, int) or length < 8 or length > 16:
        raise ValueError("Password length must be between 8 and 16 characters.")

    # SystemRandom draws from the operating system's entropy source; the
    # module-level functions use a predictable generator unsuited to secrets.
    rng = random.SystemRandom()
    characters = string.ascii_letters + string.digits + string.punctuation
    password = [
        rng.choice(string.ascii_lowercase),
        rng.choice(string.ascii_uppercase),
        rng.choice(string.digits),
        rng.choice(string.punctuation),
    ]
    password += [rng.choice(characters) for _ in range(length - 4)]
    rng.shuffle(password)
    return "".join(password)
if __name__ == "__main__":
    # Command-line entry point: python password_generator.py [--len x]
    import argparse

    parser = argparse.ArgumentParser(description="Generate a random password.")
    # Parsed as a string on purpose: int() below turns a non-numeric value into
    # a ValueError instead of argparse's SystemExit.
    parser.add_argument(
        "--len",
        dest="length",
        default="8",
        help="Length of the password, between 8 and 16 (default: 8)",
    )
    args = parser.parse_args()
    print(generate_password(int(args.length)))

View File

@ -1,25 +0,0 @@
import unittest
import password_generator
class TestPasswordGenerator(unittest.TestCase):
    """Checks for ``password_generator.generate_password``."""

    def test_password_length(self):
        """Each length from 8 through 16 yields a password of that length."""
        for length in range(8, 17):
            generated = password_generator.generate_password(length)
            self.assertEqual(len(generated), length)

    def test_value_error(self):
        """Lengths just outside the 8..16 range are rejected with ValueError."""
        for bad_length in (7, 17):
            with self.assertRaises(ValueError):
                password_generator.generate_password(bad_length)

    def test_password_content(self):
        """A password generated with no arguments has a digit and a symbol."""
        generated = password_generator.generate_password()
        has_digit = any(ch.isdigit() for ch in generated)
        self.assertTrue(has_digit)
        symbols = password_generator.string.punctuation
        self.assertTrue(any(ch in symbols for ch in generated))


if __name__ == "__main__":
    unittest.main()

View File

@ -1,28 +0,0 @@
{
"category": [
"code"
],
"cutoff": 90,
"dependencies": [
"TestWriteFile"
],
"eval_id": "15686763-9be7-41e0-902a-80a99fd88089",
"ground": {
"answer": "password_generator.py is created and satisfies the requirements.",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create a random password generator.",
"difficulty": "basic",
"side_effects": []
},
"name": "PasswordGenerator",
"task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). If no length is specified, the password should be 8 characters long. Any invalid input should raise a ValueError."
}

View File

@ -1,2 +0,0 @@
# Doubles `num` into a local variable but never returns it, so callers receive
# None despite the `-> int` annotation.
# NOTE(review): presumably the deliberate starting point for the "return the
# multiplied number" challenge -- confirm before adding the return here.
def multiply_int(num: int) -> int:
multiplied_num = num * 2

View File

@ -1,16 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, expected_result: int) -> None:
    """Run ``multiply_int`` on ``num``, print the outcome, and assert on it."""
    actual = multiply_int(num)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # The trivial case: 4 doubled is 8.
    num, expected_result = 4, 8
    test_multiply_int(num, expected_result)

View File

@ -1,3 +0,0 @@
def multiply_int(num: int) -> int:
    """Return ``num`` multiplied by two."""
    return num * 2

View File

@ -1,16 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, expected_result: int) -> None:
    """Run ``multiply_int`` on ``num``, print the outcome, and assert on it."""
    actual = multiply_int(num)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # The trivial case: 4 doubled is 8.
    num, expected_result = 4, 8
    test_multiply_int(num, expected_result)

View File

@ -1,31 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 120,
"dependencies": [
"TestReadFile"
],
"eval_id": "bb23fa8c-6df9-410e-8845-bb2d1ebe0c12",
"ground": {
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"8"
],
"should_not_contain": []
},
"info": {
"description": "Simple test if a simple code instruction can be executed",
"difficulty": "basic",
"side_effects": []
},
"name": "ReturnCodeSimple",
"task": "Return the multiplied number in the function multiply_int in sample_code.py. You can make sure you have correctly done this by running test.py"
}

View File

@ -1,48 +0,0 @@
import argparse
import os
import shutil
def organize_files(directory_path):
    """Sort the files under ``directory_path`` into type-specific folders.

    Creates the folders ``images``, ``documents`` and ``audio`` directly inside
    ``directory_path`` when they are missing, then walks the whole tree and
    moves every file with a recognised extension into the matching folder.
    Extensions are matched case-insensitively. Files with any other extension
    are left where they are.

    Args:
        directory_path: Path of the directory to organize.
    """
    # Define file type groups
    file_types = {
        "images": [".png", ".jpg", ".jpeg"],
        "documents": [".pdf", ".docx", ".txt"],
        "audio": [".mp3", ".wav", ".flac"],
    }
    # Reverse lookup so each file needs a single dictionary access.
    folder_for_extension = {
        extension: folder_name
        for folder_name, extensions in file_types.items()
        for extension in extensions
    }

    # Create the folders if they don't exist
    for folder_name in file_types:
        os.makedirs(os.path.join(directory_path, folder_name), exist_ok=True)

    # Traverse through all files and folders in the specified directory
    for foldername, _subfolders, filenames in os.walk(directory_path):
        for filename in filenames:
            _, file_extension = os.path.splitext(filename)
            # Lower-case the extension so "PHOTO.JPG" is handled like "photo.jpg".
            folder_name = folder_for_extension.get(file_extension.lower())
            if folder_name is None:
                continue
            old_path = os.path.join(foldername, filename)
            new_path = os.path.join(directory_path, folder_name, filename)
            # The walk also visits the destination folders; skip files that
            # are already in place.
            if old_path != new_path:
                shutil.move(old_path, new_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Organize files in a directory based on their file types"
    )
    parser.add_argument(
        "--directory_path",
        type=str,
        required=True,
        help="The path of the directory to be organized",
    )
    args = parser.parse_args()
    organize_files(args.directory_path)

View File

@ -1,45 +0,0 @@
import os
import subprocess
import tempfile
import unittest
class TestOrganizeFiles(unittest.TestCase):
    """End-to-end check of the ``organize_files.py`` command-line script."""

    def setUp(self):
        """Create a temporary directory holding one empty file per file type."""
        # Create temporary directory
        self.test_dir = tempfile.mkdtemp()

        # File types and their corresponding directory
        self.file_types = {
            "test_image.png": "images",
            "test_doc.txt": "documents",
            "test_audio.mp3": "audio",
        }

        # Create test files
        for file_name in self.file_types:
            open(os.path.join(self.test_dir, file_name), "a").close()

    def test_organize_files(self):
        """Run the script on the temporary directory and check where files went."""
        import sys

        # Use the interpreter running this test; a bare "python" may be
        # missing from PATH or point at a different installation.
        subprocess.call(
            [sys.executable, "organize_files.py", "--directory_path=" + self.test_dir]
        )

        # Check if the files have been moved to the correct directories
        for file_name, directory in self.file_types.items():
            self.assertTrue(
                os.path.isfile(os.path.join(self.test_dir, directory, file_name))
            )

    def tearDown(self):
        """Remove the temporary directory whatever state the script left it in."""
        import shutil

        # Removing the whole tree also works when the script moved nothing, so
        # cleanup cannot hide the real test failure behind FileNotFoundError.
        shutil.rmtree(self.test_dir)


if __name__ == "__main__":
    unittest.main()

View File

@ -1,28 +0,0 @@
{
"category": [
"code"
],
"cutoff": 90,
"dependencies": [
"TestPasswordGeneratorEasy"
],
"eval_id": "d6bbefcc-0ee5-4190-b8a1-3721d016f849",
"ground": {
"answer": "The correct python file is written and organizes the files accordingly",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create a file organizer CLI tool.",
"difficulty": "basic",
"side_effects": []
},
"name": "WritingCLIFileOrganizer",
"task": "Create a file organizer CLI tool in Python that sorts files in a directory based on their file types (e.g., images, documents, audio) and moves them into these corresponding folders: 'images', 'documents', 'audio'. The entry point will be a python file that can be run this way: python organize_files.py --directory_path=YOUR_DIRECTORY_PATH"
}

View File

@ -1,16 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, expected_result: int) -> None:
    """Run ``multiply_int`` on ``num``, print the outcome, and assert on it."""
    actual = multiply_int(num)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # The trivial case: 4 doubled is 8.
    num, expected_result = 4, 8
    test_multiply_int(num, expected_result)

View File

@ -1,3 +0,0 @@
def multiply_int(num: int) -> int:
    """Return ``num`` multiplied by two."""
    return num * 2

View File

@ -1,16 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, expected_result: int) -> None:
    """Run ``multiply_int`` on ``num``, print the outcome, and assert on it."""
    actual = multiply_int(num)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # The trivial case: 4 doubled is 8.
    num, expected_result = 4, 8
    test_multiply_int(num, expected_result)

View File

@ -1,31 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 120,
"dependencies": [
"TestReturnCodeSimple"
],
"eval_id": "a59a1904-e9d6-443b-adb7-2e1ff972843f",
"ground": {
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"8"
],
"should_not_contain": []
},
"info": {
"description": "Small step up, just writing the function with a name as well as the return statement.",
"difficulty": "novice",
"side_effects": []
},
"name": "ReturnCodeWrite",
"task": "Add a function called multiply_int in sample_code.py that multiplies numbers by 2. You can make sure you have correctly done this by running test.py"
}

View File

@ -1,3 +0,0 @@
def multiply_int(num: int) -> int:
    """Return ``num`` multiplied by two."""
    return num * 2

View File

@ -1,29 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
    """Run ``multiply_int(num, multiplier)``, print the outcome, and assert on it."""
    actual = multiply_int(num, multiplier)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (num, multiplier, expected_result) triples, exercised in order:
    #   1. the trivial case
    #   2. a second pair so the answer is not hard coded
    #   3. negative numbers
    cases = ((4, 2, 8), (7, 7, 49), (-6, 2, -12))
    for num, multiplier, expected_result in cases:
        test_multiply_int(num, multiplier, expected_result)

View File

@ -1,3 +0,0 @@
def multiply_int(num: int, multiplier: int) -> int:
    """Return the product of ``num`` and ``multiplier``."""
    return num * multiplier

View File

@ -1,29 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
    """Run ``multiply_int(num, multiplier)``, print the outcome, and assert on it."""
    actual = multiply_int(num, multiplier)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (num, multiplier, expected_result) triples, exercised in order:
    #   1. the trivial case
    #   2. a second pair so the answer is not hard coded
    #   3. negative numbers
    cases = ((4, 2, 8), (7, 7, 49), (-6, 2, -12))
    for num, multiplier, expected_result in cases:
        test_multiply_int(num, multiplier, expected_result)

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 120,
"dependencies": [
"TestReturnCodeWrite"
],
"eval_id": "092f3c8a-9723-4262-8e40-93d0cebba98a",
"ground": {
"answer": "def multiply_int(num, multiplier):\n return num * multiplier\n",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"8",
"49",
"-12"
],
"should_not_contain": []
},
"info": {
"description": "Builds on the previous function so that it also takes a multiplier.",
"difficulty": "intermediate",
"side_effects": []
},
"name": "ReturnCodeModify",
"task": "Modify the multiply_int function in sample_code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running test.py"
}

View File

@ -1,3 +0,0 @@
def multiply_int(num: int) -> int:
    """Return ``num`` multiplied by two."""
    return num * 2

View File

@ -1,17 +0,0 @@
from sample_code import multiply_int
# Calls multiply_int(num, multiplier), prints the result and asserts that it
# equals expected_result.
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
result = multiply_int(num, multiplier)
print(result)
assert (
result == expected_result
), f"AssertionError: Expected the output to be {expected_result}"
if __name__ == "__main__":
# create a trivial test that has 4 as the num, and 2 as the multiplier. Make sure to fill in the expected result
# NOTE(review): the three assignments and the call below are left blank, so
# this file does not parse as written. Presumably the template the challenge
# expects the agent to fill in -- confirm before completing it here.
num =
multiplier =
expected_result =
test_multiply_int()

View File

@ -1,3 +0,0 @@
def multiply_int(num: int, multiplier: int) -> int:
    """Return the product of ``num`` and ``multiplier``."""
    return num * multiplier

View File

@ -1,17 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
    """Run ``multiply_int(num, multiplier)``, print the outcome, and assert on it."""
    actual = multiply_int(num, multiplier)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # The trivial case: 4 times 2 is 8.
    num, multiplier, expected_result = 4, 2, 8
    test_multiply_int(num, multiplier, expected_result)

View File

@ -1,29 +0,0 @@
from sample_code import multiply_int
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
    """Run ``multiply_int(num, multiplier)``, print the outcome, and assert on it."""
    actual = multiply_int(num, multiplier)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (num, multiplier, expected_result) triples, exercised in order:
    #   1. the trivial case
    #   2. a second pair so the answer is not hard coded
    #   3. negative numbers
    cases = ((4, 2, 8), (7, 7, 49), (-6, 2, -12))
    for num, multiplier, expected_result in cases:
        test_multiply_int(num, multiplier, expected_result)

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 120,
"dependencies": [
"TestReturnCodeModify"
],
"eval_id": "d39b8ed1-5984-40b0-8de6-a1c5eec30bc7",
"ground": {
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"8",
"49",
"-12"
],
"should_not_contain": []
},
"info": {
"description": "Requires filling in the test case first, then modifying the function to accept a multiplier.",
"difficulty": "advanced",
"side_effects": []
},
"name": "ReturnCodeTests",
"task": "First, modify testfile.py to fill in the test case to be able to test the code in sample_code.py. Next, modify the multiply_int function in sample_code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running testfile.py that you previously modified."
}

View File

@ -1,12 +0,0 @@
from typing import List, Optional
# Hash-map two-sum: intended to return [earlier_index, current_index] for the
# first entry of `nums` whose complement (target - entry) was already seen, or
# None when no such pair exists.
def two_sum(nums: List, target: int) -> Optional[List[int]]:
# Maps each value visited so far to the index it was seen at.
seen = {}
for i, num in enumerate(nums):
# NOTE(review): the bare name below is not defined anywhere in this snippet, so
# reaching it raises NameError. Presumably the deliberate defect the debug
# challenge asks the agent to remove -- confirm before "fixing" it here.
typo
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,11 +0,0 @@
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of two entries of ``nums`` that add up to ``target``.

    Scans left to right while remembering the index of every value seen so
    far, and returns ``[earlier_index, current_index]`` for the first entry
    whose complement has already been seen. Returns ``None`` if no pair exists.
    """
    index_of = {}
    for position, value in enumerate(nums):
        earlier = index_of.get(target - value)
        if earlier is not None:
            return [earlier, position]
        index_of[value] = position
    return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 75,
"dependencies": [
"TestReadFile"
],
"eval_id": "a758335b-539b-4d8a-b90e-cf7036952294",
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1]",
"[2, 5]",
"[0, 3]"
],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"difficulty": "novice",
"side_effects": []
},
"name": "DebugSimpleTypoWithGuidance",
"task": "1- Run test.py.\n2- Read sample_code.py.\n3- Modify sample_code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n"
}

View File

@ -1,12 +0,0 @@
from typing import List, Optional
# Hash-map two-sum: intended to return [earlier_index, current_index] for the
# first entry of `nums` whose complement (target - entry) was already seen, or
# None when no such pair exists.
def two_sum(nums: List, target: int) -> Optional[List[int]]:
# Maps each value visited so far to the index it was seen at.
seen = {}
for i, num in enumerate(nums):
# NOTE(review): the bare name below is not defined anywhere in this snippet, so
# reaching it raises NameError. Presumably the deliberate defect the debug
# challenge asks the agent to remove -- confirm before "fixing" it here.
typo
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,11 +0,0 @@
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of two entries of ``nums`` that add up to ``target``.

    Scans left to right while remembering the index of every value seen so
    far, and returns ``[earlier_index, current_index]`` for the first entry
    whose complement has already been seen. Returns ``None`` if no pair exists.
    """
    index_of = {}
    for position, value in enumerate(nums):
        earlier = index_of.get(target - value)
        if earlier is not None:
            return [earlier, position]
        index_of[value] = position
    return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 75,
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"eval_id": "1d171b68-0374-4b08-ae6a-c7223f89ecc1",
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1]",
"[2, 5]",
"[0, 3]"
],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"difficulty": "intermediate",
"side_effects": []
},
"name": "DebugSimpleTypoWithoutGuidance",
"task": "Make test.py run without errors."
}

View File

@ -1,12 +0,0 @@
from typing import List, Optional
# Hash-map two-sum: intended to return [earlier_index, current_index] for the
# first entry of `nums` whose complement (target - entry) was already seen, or
# None when no such pair exists.
def two_sum(nums: List, target: int) -> Optional[List[int]]:
# Maps each value visited so far to the index it was seen at.
seen = {}
for i, num in enumerate(nums):
# NOTE(review): the bare name below is not defined anywhere in this snippet, so
# reaching it raises NameError. Presumably the deliberate defect the debug
# challenge asks the agent to remove -- confirm before "fixing" it here.
typo
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return None

View File

@ -1,31 +0,0 @@
from typing import List
from import
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,11 +0,0 @@
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of two entries of ``nums`` that add up to ``target``.

    Scans left to right while remembering the index of every value seen so
    far, and returns ``[earlier_index, current_index]`` for the first entry
    whose complement has already been seen. Returns ``None`` if no pair exists.
    """
    index_of = {}
    for position, value in enumerate(nums):
        earlier = index_of.get(target - value)
        if earlier is not None:
            return [earlier, position]
        index_of[value] = position
    return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Run ``two_sum`` on one case, print what it returned, and assert on it."""
    actual = two_sum(nums, target)
    print(actual)
    assert (
        actual == expected_result
    ), f"AssertionError: Expected the output to be {expected_result}"


if __name__ == "__main__":
    # (nums, target, expected_result) triples, exercised in order:
    #   1. the trivial case with the first two numbers
    #   2. ability to use zero and the same number twice
    #   3. first and last index usage and negative numbers
    cases = (
        ([2, 7, 11, 15], 9, [0, 1]),
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 90,
"dependencies": [
"TestDebugSimpleTypoWithoutGuidance"
],
"eval_id": "f537c143-ab40-4a95-8cf2-ab90cb829ca5",
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1]",
"[2, 5]",
"[0, 3]"
],
"should_not_contain": []
},
"info": {
"description": "Now it's not just the typo error, but also an incomplete import statement",
"difficulty": "advanced",
"side_effects": []
},
"name": "DebugMultipleTypo",
"task": "Make test.py run without errors."
}

View File

@ -1,22 +0,0 @@
from typing import List, Optional
def three_sum(nums: List[int], target: int) -> Optional[List[int]]:
    """Return the indices of three entries of ``nums`` that sum to ``target``.

    The values are sorted together with their original positions; then, for
    each candidate smallest value, a two-pointer sweep searches the rest.

    Args:
        nums: The numbers to search.
        target: The sum the three chosen numbers must reach.

    Returns:
        The three original indices in ascending order, or ``None`` when no
        triple adds up to ``target`` (which includes inputs with fewer than
        three numbers).
    """
    by_value = sorted((num, index) for index, num in enumerate(nums))
    for i in range(len(by_value) - 2):
        # Skip a repeated smallest value: the sweep for the previous equal
        # value already covered every pair this one could use. Compare the
        # values only -- the (value, index) pairs are never equal because
        # every index is unique.
        if i > 0 and by_value[i][0] == by_value[i - 1][0]:
            continue
        first_value, first_index = by_value[i]
        low, high = i + 1, len(by_value) - 1
        while low < high:
            total = first_value + by_value[low][0] + by_value[high][0]
            if total < target:
                low += 1
            elif total > target:
                high -= 1
            else:
                # Report positions in the caller's original ordering.
                return sorted([first_index, by_value[low][1], by_value[high][1]])
    return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import three_sum
def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None:
    """Run ``three_sum`` on one case, print the result and verify it.

    Args:
        nums: The numbers handed to ``three_sum``.
        target: The sum handed to ``three_sum``.
        expected_result: The index triple ``three_sum`` should return.

    Raises:
        AssertionError: If the returned value differs from ``expected_result``.
    """
    actual = three_sum(nums, target)
    # Print first so the value is visible even when the check below fails.
    print(actual)
    assert actual == expected_result, (
        f"AssertionError: Expected the output to be {expected_result}"
    )
if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); they run in this order.
    cases = [
        # the trivial case: the answer is the first three numbers
        ([2, 7, 11, 15], 20, [0, 1, 2]),
        # zero as an addend, with the same value appearing twice
        ([2, 7, 0, 15, 12, 0], 2, [0, 2, 5]),
        # first and last index are used, with a negative number
        ([-6, 7, 11, 4], 9, [0, 2, 3]),
    ]
    for nums, target, expected_result in cases:
        test_three_sum(nums, target, expected_result)

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 60,
"dependencies": [
"TestFunctionCodeGeneration"
],
"eval_id": "a38396b8-8f61-49fc-a973-0876a4b6b5e9",
"ground": {
"answer": "The three_sum function coded properly.",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1, 2]",
"[0, 2, 5]",
"[0, 2, 3]"
],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create the three_sum function.",
"difficulty": "advanced",
"side_effects": []
},
"name": "ThreeSum",
"task": "Create a three_sum function in a file called sample_code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2]."
}

View File

@ -1,11 +0,0 @@
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Locate a pair of entries in ``nums`` that add up to ``target``.

    Args:
        nums: The numbers to search.
        target: The required sum of the pair.

    Returns:
        A two-element list ``[earlier_index, later_index]`` for the first pair
        completed during a left-to-right scan, or ``None`` if there is none.
    """
    # Value -> latest index at which that value was encountered.
    positions = {}
    for here, value in enumerate(nums):
        wanted = target - value
        if wanted not in positions:
            # No partner yet; remember this value for later elements.
            positions[value] = here
            continue
        return [positions[wanted], here]
    return None

View File

@ -1,31 +0,0 @@
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Exercise ``two_sum`` once: print its return value, then assert on it.

    Args:
        nums: Input list passed to ``two_sum``.
        target: Target sum passed to ``two_sum``.
        expected_result: The indices the call should return.

    Raises:
        AssertionError: When the returned value is not ``expected_result``.
    """
    found = two_sum(nums, target)
    print(found)
    assert found == expected_result, (
        f"AssertionError: Expected the output to be {expected_result}"
    )
if __name__ == "__main__":
    # trivial case: the answer is the first two numbers
    test_two_sum([2, 7, 11, 15], 9, [0, 1])

    # zero target, reached by using the same value twice
    test_two_sum([2, 7, 0, 15, 12, 0], 0, [2, 5])

    # first and last index form the answer; includes a negative number
    test_two_sum([-6, 7, 11, 4], -2, [0, 3])

View File

@ -1,32 +0,0 @@
{
"category": [
"code"
],
"cutoff": 90,
"dependencies": [
"TestReturnCodeWrite"
],
"eval_id": "c6703d23-7d2d-4b9b-a729-8014df9a7b4e",
"ground": {
"answer": "The two_sum function coded properly.",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1]",
"[2, 5]",
"[0, 3]"
],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to create the two_sum function.",
"difficulty": "advanced",
"side_effects": []
},
"name": "FunctionCodeGeneration",
"task": "Create a two_sum function in a file called sample_code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1]."
}

View File

@ -1,5 +0,0 @@
1. Search 'Toronto to San Francisco flights' on the internet
2. Click on the first link that is a flight aggregator such as SkyScanner or Google Flights
3. Select the dates that you want to travel
4. Click on the 'one way' option and click search
5. Search through all of the given options and select the cheapest flight

View File

@ -1,30 +0,0 @@
{
"category": [
"content_gen"
],
"cutoff": 120,
"dependencies": [
"TestWriteFile"
],
"eval_id": "6ff65567-eb1e-4c7d-8b7f-dfc91dc95ed1",
"ground": {
"answer": "1. Search for flights to Toronto.\n2. Select the cheapest flight.\n3. Book the flight.\n4. Pay for the flight.\n5. Confirm the flight.",
"eval": {
"scoring": "scale",
"template": "reference",
"type": "llm"
},
"files": [
".txt"
],
"should_contain": [],
"should_not_contain": []
},
"info": {
"description": "ability to generate content based on the content of 2 files.",
"difficulty": "basic",
"side_effects": []
},
"name": "PlanCreation",
"task": "As an agent powered by a large language model, create a plan for yourself based on the task of booking a flight to Toronto. Output this plan into a .txt file."
}

View File

@ -1,13 +0,0 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Return the indices of two entries of ``nums`` that sum to ``target``.

    Returns ``[earlier_index, later_index]`` for the first pair completed
    while scanning left to right, or ``None`` when no pair is found.
    """
    # Maps each value visited so far to the latest index holding it.
    seen = {}
    for i, num in enumerate(nums):
        # NOTE(review): `typo` is a bare name that nothing visible in this
        # file defines, so evaluating it raises NameError as soon as the loop
        # body runs. It looks like the deliberately planted defect for the
        # debugging challenge -- confirm before removing it.
        typo
        complement = target - num
        if complement in seen:
            return [seen[complement], i]
        seen[num] = i
    return None

View File

@ -1,32 +0,0 @@
# mypy: ignore-errors
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Call ``two_sum`` for one case, print the answer and assert it is right.

    Args:
        nums: The numbers handed to ``two_sum``.
        target: The sum handed to ``two_sum``.
        expected_result: The index pair ``two_sum`` is expected to return.

    Raises:
        AssertionError: If the returned value differs from ``expected_result``.
    """
    answer = two_sum(nums, target)
    print(answer)
    is_expected = answer == expected_result
    assert is_expected, f"AssertionError: Expected the output to be {expected_result}"
if __name__ == "__main__":
    # Each entry is (nums, target, expected_result); they run in this order.
    cases = (
        # the trivial case: the answer is the first two numbers
        ([2, 7, 11, 15], 9, [0, 1]),
        # zero as a target, with the same value appearing twice
        ([2, 7, 0, 15, 12, 0], 0, [2, 5]),
        # first and last index are the answer, with a negative number
        ([-6, 7, 11, 4], -2, [0, 3]),
    )
    for nums, target, expected_result in cases:
        test_two_sum(nums, target, expected_result)

View File

@ -1,12 +0,0 @@
# mypy: ignore-errors
from typing import List, Optional
def two_sum(nums: List, target: int) -> Optional[List[int]]:
    """Search ``nums`` for two entries whose sum is ``target``.

    Args:
        nums: The numbers to search.
        target: The sum the pair must reach.

    Returns:
        ``[earlier_index, later_index]`` for the first pair completed while
        scanning from the left, or ``None`` when no such pair exists.
    """
    # Latest index recorded for each value visited so far.
    last_index_of = {}
    for idx, current in enumerate(nums):
        needed = target - current
        try:
            return [last_index_of[needed], idx]
        except KeyError:
            # The partner has not been visited yet; record this value instead.
            last_index_of[current] = idx
    return None

View File

@ -1,32 +0,0 @@
# mypy: ignore-errors
from typing import List
from sample_code import two_sum
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
    """Print the result of ``two_sum`` for one case and assert it is correct.

    Args:
        nums: Input list passed to ``two_sum``.
        target: Target sum passed to ``two_sum``.
        expected_result: The indices the call should return.

    Raises:
        AssertionError: When the call returns anything else.
    """
    produced = two_sum(nums, target)
    # Print before asserting so the value is shown even for a failing case.
    print(produced)
    assert produced == expected_result, (
        f"AssertionError: Expected the output to be {expected_result}"
    )
if __name__ == "__main__":
    # trivial case: the first two numbers are the answer
    test_two_sum(nums=[2, 7, 11, 15], target=9, expected_result=[0, 1])

    # zero target, satisfied by the same value occurring twice
    test_two_sum(nums=[2, 7, 0, 15, 12, 0], target=0, expected_result=[2, 5])

    # answer uses the first and last index and involves a negative number
    test_two_sum(nums=[-6, 7, 11, 4], target=-2, expected_result=[0, 3])

View File

@ -1,33 +0,0 @@
{
"category": [
"code",
"iterate"
],
"cutoff": 75,
"dependencies": [
"TestReadFile"
],
"eval_id": "e3794d05-7617-4b95-9e96-d5f84309f66f",
"ground": {
"answer": "[0, 1] [2, 5] [0, 3]",
"eval": {
"type": "python"
},
"files": [
"test.py"
],
"should_contain": [
"[0, 1]",
"[2, 5]",
"[0, 3]"
],
"should_not_contain": []
},
"info": {
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"difficulty": "novice",
"side_effects": []
},
"name": "DebugSimpleTypoWithGuidance",
"task": "1- Run test.py.\n2- Read sample_code.py.\n3- Modify sample_code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n"
}

View File

@ -1,2 +0,0 @@
The id to remember is 2314
Read the file instructions_2.txt

View File

@ -1 +0,0 @@
Read the file instructions_3.txt

View File

@ -1 +0,0 @@
Read the file instructions_4.txt

View File

@ -1 +0,0 @@
Read the file instructions_5.txt

Some files were not shown because too many files have changed in this diff Show More