clean(benchmark): Remove Depreciated Challenges (#7144)
* Remove depreciated challanges * Update license and pyproject.toml
pull/7160/head
parent
e8d7dfa386
commit
2cca4fa47f
|
@ -1,6 +1,6 @@
|
||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2023 Silen Naihin
|
Copyright (c) 2024 AutoGPT
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
81,462 Millions
|
|
|
@ -1,30 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"scrape_synthesize"
|
|
||||||
],
|
|
||||||
"cutoff": 60,
|
|
||||||
"dependencies": [
|
|
||||||
"TestBasicRetrieval"
|
|
||||||
],
|
|
||||||
"eval_id": "2d64d7a5-d664-4b86-9921-0b5e3aa9cf91",
|
|
||||||
"ground": {
|
|
||||||
"answer": "It was $81.462 billion in 2022.",
|
|
||||||
"eval": {
|
|
||||||
"type": "file"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
".txt"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"81"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "A no guardrails search for info",
|
|
||||||
"difficulty": "novice",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "RevenueRetrieval1.0",
|
|
||||||
"task": "Write tesla's revenue in 2022 into a .txt file."
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
81,462 Millions
|
|
|
@ -1,31 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"scrape_synthesize"
|
|
||||||
],
|
|
||||||
"cutoff": 60,
|
|
||||||
"dependencies": [
|
|
||||||
"TestRevenueRetrieval_1.0"
|
|
||||||
],
|
|
||||||
"eval_id": "b79898bb-263a-4184-8e4d-0aa52838bfdb",
|
|
||||||
"ground": {
|
|
||||||
"answer": "It was $81.462 billion in 2022.",
|
|
||||||
"eval": {
|
|
||||||
"type": "file"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
".txt"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"81",
|
|
||||||
"462"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "This one checks the accuracy of the information over r2",
|
|
||||||
"difficulty": "novice",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "RevenueRetrieval1.1",
|
|
||||||
"task": "Write Tesla's revenue in 2022, rounded to the nearest million dollars, into a .txt file."
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
81,462 Millions
|
|
|
@ -1,30 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"scrape_synthesize"
|
|
||||||
],
|
|
||||||
"cutoff": 60,
|
|
||||||
"dependencies": [
|
|
||||||
"TestRevenueRetrieval1.1"
|
|
||||||
],
|
|
||||||
"eval_id": "838128f9-79ee-45cf-8a8f-c19b0d576a76",
|
|
||||||
"ground": {
|
|
||||||
"answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
|
|
||||||
"eval": {
|
|
||||||
"type": "file"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
".txt"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"81,462"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
|
|
||||||
"difficulty": "intermediate",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "DeprecatedRevenueRetrieval1.2",
|
|
||||||
"task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)."
|
|
||||||
}
|
|
|
@ -1,12 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
typo
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,32 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"adaptability"
|
|
||||||
],
|
|
||||||
"cutoff": 75,
|
|
||||||
"dependencies": [
|
|
||||||
"TestDebugSimpleTypoWithGuidance"
|
|
||||||
],
|
|
||||||
"eval_id": "38671c68-89ea-4c51-92a5-1bc35a033c49",
|
|
||||||
"ground": {
|
|
||||||
"answer": "Same as the original DebugWithGuidance test - [0, 1] [2, 5] [0, 3]",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1]",
|
|
||||||
"[2, 5]",
|
|
||||||
"[0, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "A version of the debug challenge where the wrong files to test and change are given. The llm must figure the challenges out",
|
|
||||||
"difficulty": "intermediate",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "AdaptSimpleTypoWithGuidance",
|
|
||||||
"task": "1- Run sample_code.py.\n2- Read test.py.\n3- Modify test.py.\nRepeat step 1, 2 and 3 until sample_code.py runs without errors.\n"
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
81,462 Millions
|
|
|
@ -1,30 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"adaptability"
|
|
||||||
],
|
|
||||||
"cutoff": 60,
|
|
||||||
"dependencies": [
|
|
||||||
"TestRevenueRetrieval1.0"
|
|
||||||
],
|
|
||||||
"eval_id": "9d4894d8-6f7c-465a-bc91-ca79a21b6ca3",
|
|
||||||
"ground": {
|
|
||||||
"answer": "It was $81.462 billion in 2022.",
|
|
||||||
"eval": {
|
|
||||||
"type": "file"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
".txt"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"81"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Given context adapt to the missing of the word revenue",
|
|
||||||
"difficulty": "intermediate",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "AdaptTeslaRevenue",
|
|
||||||
"task": "I'm a financial planner, please help me write tesla's r in 2022 into a .txt file."
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
25.89
|
|
|
@ -1,31 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"adaptability",
|
|
||||||
"scrape_synthesize"
|
|
||||||
],
|
|
||||||
"cutoff": 60,
|
|
||||||
"dependencies": [
|
|
||||||
"TestBasicRetrieval"
|
|
||||||
],
|
|
||||||
"eval_id": "261ee06f-a7b0-4d5c-bf92-3197763caba6",
|
|
||||||
"ground": {
|
|
||||||
"answer": "\u00a325.89",
|
|
||||||
"eval": {
|
|
||||||
"type": "file"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
".txt"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"25.89"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "An advanced version of this -> remove.html as well. Same as TestBasicRetrieval but link is slightly broken, supposed to be http:// at the start.",
|
|
||||||
"difficulty": "novice",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "AdaptLink",
|
|
||||||
"task": "Write the price of the book in this url 'books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file."
|
|
||||||
}
|
|
|
@ -1,29 +0,0 @@
|
||||||
<!DOCTYPE html>
|
|
||||||
<html>
|
|
||||||
|
|
||||||
<head>
|
|
||||||
<title>List of Animals</title>
|
|
||||||
</head>
|
|
||||||
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<h2>List of Animals</h2>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
<li id="dog">Dog</li>
|
|
||||||
<li>Cat</li>
|
|
||||||
<li>Rabbit</li>
|
|
||||||
<li>Horse</li>
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<div id="info"></div>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
document.getElementById("dog").addEventListener("click", function() {
|
|
||||||
document.getElementById("info").innerHTML = "Dogs are known as man's best friend!";
|
|
||||||
});
|
|
||||||
</script>
|
|
||||||
|
|
||||||
</body>
|
|
||||||
|
|
||||||
</html>
|
|
|
@ -1,48 +0,0 @@
|
||||||
import os
|
|
||||||
import time
|
|
||||||
|
|
||||||
from selenium import webdriver
|
|
||||||
from selenium.webdriver.chrome.options import Options
|
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
|
||||||
|
|
||||||
# Path to the HTML file
|
|
||||||
current_path = os.path.abspath(__file__)
|
|
||||||
current_directory = os.path.dirname(current_path)
|
|
||||||
file_path = f"file://{current_directory}/animal_list.html"
|
|
||||||
|
|
||||||
# Create a new instance of the Chrome driver
|
|
||||||
|
|
||||||
chrome_options = Options()
|
|
||||||
chrome_options.add_argument("--headless")
|
|
||||||
chrome_options.add_argument("--disable-gpu")
|
|
||||||
chrome_options.add_argument("--window-size=1024x768")
|
|
||||||
chrome_options.add_argument("--no-sandbox")
|
|
||||||
chrome_options.add_argument("--disable-dev-shm-usage")
|
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
|
||||||
|
|
||||||
# Navigate to the HTML file
|
|
||||||
driver.get(file_path)
|
|
||||||
|
|
||||||
# Wait for up to 10 seconds for the "dog" element to be available
|
|
||||||
wait = WebDriverWait(driver, 10)
|
|
||||||
dog_li = wait.until(EC.presence_of_element_located((By.ID, "dog")))
|
|
||||||
|
|
||||||
# Click on the "dog" list item
|
|
||||||
dog_li.click()
|
|
||||||
|
|
||||||
# Find the "info" div and get its text
|
|
||||||
info_div = driver.find_element(By.ID, "info")
|
|
||||||
info_text = info_div.text
|
|
||||||
|
|
||||||
# Assert that the text is what we expect
|
|
||||||
assert info_text == "Dogs are known as man's best friend!"
|
|
||||||
|
|
||||||
print(" passed!")
|
|
||||||
|
|
||||||
# Wait for 5 seconds
|
|
||||||
time.sleep(5)
|
|
||||||
|
|
||||||
# Close the browser window
|
|
||||||
driver.quit()
|
|
|
@ -1,28 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code"
|
|
||||||
],
|
|
||||||
"cutoff": 90,
|
|
||||||
"dependencies": [
|
|
||||||
"TestWritingCLIFileOrganizer"
|
|
||||||
],
|
|
||||||
"eval_id": "94ef736e-c2f1-4fa9-8cbf-a1c0873ee1ee",
|
|
||||||
"ground": {
|
|
||||||
"answer": "A web app where we can list animals and have details about dogs.",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests the agent's ability to build a basic html app.",
|
|
||||||
"difficulty": "basic",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "WebAppListAnimals",
|
|
||||||
"task": "Build a web page with a list of animals. When someone clicks on the word 'Dog', a message should appear that says 'Dogs are known as man's best friend!'. You'll need to make a list with the name 'Dog' and then write a little bit of JavaScript to make the message appear when the name is clicked. Mark the div containing dog with the id 'dog'. Put the message inside a <div> with the id 'info'. Create a single html file called animal_list.html."
|
|
||||||
}
|
|
|
@ -1,23 +0,0 @@
|
||||||
import random
|
|
||||||
import string
|
|
||||||
|
|
||||||
|
|
||||||
def generate_password(length: int) -> str:
|
|
||||||
if length < 8 or length > 16:
|
|
||||||
raise ValueError("Password length must be between 8 and 16 characters.")
|
|
||||||
|
|
||||||
characters = string.ascii_letters + string.digits + string.punctuation
|
|
||||||
password = [
|
|
||||||
random.choice(string.ascii_lowercase),
|
|
||||||
random.choice(string.ascii_uppercase),
|
|
||||||
random.choice(string.digits),
|
|
||||||
random.choice(string.punctuation),
|
|
||||||
]
|
|
||||||
password += [random.choice(characters) for _ in range(length - 4)]
|
|
||||||
random.shuffle(password)
|
|
||||||
return "".join(password)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
password_length = random.randint(8, 16)
|
|
||||||
print(generate_password(password_length))
|
|
|
@ -1,25 +0,0 @@
|
||||||
import unittest
|
|
||||||
|
|
||||||
import password_generator
|
|
||||||
|
|
||||||
|
|
||||||
class TestPasswordGenerator(unittest.TestCase):
|
|
||||||
def test_password_length(self):
|
|
||||||
for i in range(8, 17):
|
|
||||||
password = password_generator.generate_password(i)
|
|
||||||
self.assertEqual(len(password), i)
|
|
||||||
|
|
||||||
def test_value_error(self):
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
password_generator.generate_password(7)
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
password_generator.generate_password(17)
|
|
||||||
|
|
||||||
def test_password_content(self):
|
|
||||||
password = password_generator.generate_password()
|
|
||||||
self.assertTrue(any(c.isdigit() for c in password))
|
|
||||||
self.assertTrue(any(c in password_generator.string.punctuation for c in password))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
|
@ -1,28 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code"
|
|
||||||
],
|
|
||||||
"cutoff": 90,
|
|
||||||
"dependencies": [
|
|
||||||
"TestWriteFile"
|
|
||||||
],
|
|
||||||
"eval_id": "15686763-9be7-41e0-902a-80a99fd88089",
|
|
||||||
"ground": {
|
|
||||||
"answer": "password_generator.py is created and satisfies the requirements.",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to create a random password generator.",
|
|
||||||
"difficulty": "basic",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "PasswordGenerator",
|
|
||||||
"task": "Create a random password generator. The password should have between 8 and 16 characters and should contain letters, numbers and symbols. The password should be printed to the console. The entry point will be a python file that can be run this way: python password_generator.py [--len x] where x is the length of the password. The password_generator can also be imported as a module and called as password = password_generator.generate_password(len=x). If no length is specified, the password should be 8 characters long. Any invalid input should raise a ValueError."
|
|
||||||
}
|
|
|
@ -1,2 +0,0 @@
|
||||||
def multiply_int(num: int) -> int:
|
|
||||||
multiplied_num = num * 2
|
|
|
@ -1,16 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, expected_result)
|
|
|
@ -1,3 +0,0 @@
|
||||||
def multiply_int(num: int) -> int:
|
|
||||||
multiplied_num = num * 2
|
|
||||||
return multiplied_num
|
|
|
@ -1,16 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, expected_result)
|
|
|
@ -1,31 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 120,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReadFile"
|
|
||||||
],
|
|
||||||
"eval_id": "bb23fa8c-6df9-410e-8845-bb2d1ebe0c12",
|
|
||||||
"ground": {
|
|
||||||
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"8"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Simple test if a simple code instruction can be executed",
|
|
||||||
"difficulty": "basic",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "ReturnCodeSimple",
|
|
||||||
"task": "Return the multiplied number in the function multiply_int in sample_code.py. You can make sure you have correctly done this by running test.py"
|
|
||||||
}
|
|
|
@ -1,48 +0,0 @@
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
|
|
||||||
def organize_files(directory_path):
|
|
||||||
# Define file type groups
|
|
||||||
file_types = {
|
|
||||||
"images": [".png", ".jpg", ".jpeg"],
|
|
||||||
"documents": [".pdf", ".docx", ".txt"],
|
|
||||||
"audio": [".mp3", ".wav", ".flac"],
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create the folders if they don't exist
|
|
||||||
for folder_name in file_types.keys():
|
|
||||||
folder_path = os.path.join(directory_path, folder_name)
|
|
||||||
if not os.path.exists(folder_path):
|
|
||||||
os.makedirs(folder_path)
|
|
||||||
|
|
||||||
# Traverse through all files and folders in the specified directory
|
|
||||||
for foldername, subfolders, filenames in os.walk(directory_path):
|
|
||||||
for filename in filenames:
|
|
||||||
# Get file extension
|
|
||||||
_, file_extension = os.path.splitext(filename)
|
|
||||||
|
|
||||||
# Move files to corresponding folders
|
|
||||||
for folder_name, extensions in file_types.items():
|
|
||||||
if file_extension in extensions:
|
|
||||||
old_path = os.path.join(foldername, filename)
|
|
||||||
new_path = os.path.join(directory_path, folder_name, filename)
|
|
||||||
if old_path != new_path:
|
|
||||||
shutil.move(old_path, new_path)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Organize files in a directory based on their file types"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--directory_path",
|
|
||||||
type=str,
|
|
||||||
required=True,
|
|
||||||
help="The path of the directory to be organized",
|
|
||||||
)
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
organize_files(args.directory_path)
|
|
|
@ -1,45 +0,0 @@
|
||||||
import os
|
|
||||||
import subprocess
|
|
||||||
import tempfile
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
|
|
||||||
class TestOrganizeFiles(unittest.TestCase):
|
|
||||||
def setUp(self):
|
|
||||||
# Create temporary directory
|
|
||||||
self.test_dir = tempfile.mkdtemp()
|
|
||||||
|
|
||||||
# File types and their corresponding directory
|
|
||||||
self.file_types = {
|
|
||||||
"test_image.png": "images",
|
|
||||||
"test_doc.txt": "documents",
|
|
||||||
"test_audio.mp3": "audio",
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create test files
|
|
||||||
for file_name in self.file_types.keys():
|
|
||||||
open(os.path.join(self.test_dir, file_name), "a").close()
|
|
||||||
|
|
||||||
def test_organize_files(self):
|
|
||||||
# Call the organize_files.py script using subprocess
|
|
||||||
subprocess.call(
|
|
||||||
["python", "organize_files.py", "--directory_path=" + self.test_dir]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if the files have been moved to the correct directories
|
|
||||||
for file_name, directory in self.file_types.items():
|
|
||||||
self.assertTrue(
|
|
||||||
os.path.isfile(os.path.join(self.test_dir, directory, file_name))
|
|
||||||
)
|
|
||||||
|
|
||||||
def tearDown(self):
|
|
||||||
# Delete test directory and its contents
|
|
||||||
for file_name, directory in self.file_types.items():
|
|
||||||
os.remove(os.path.join(self.test_dir, directory, file_name))
|
|
||||||
for directory in set(self.file_types.values()):
|
|
||||||
os.rmdir(os.path.join(self.test_dir, directory))
|
|
||||||
os.rmdir(self.test_dir)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
|
@ -1,28 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code"
|
|
||||||
],
|
|
||||||
"cutoff": 90,
|
|
||||||
"dependencies": [
|
|
||||||
"TestPasswordGeneratorEasy"
|
|
||||||
],
|
|
||||||
"eval_id": "d6bbefcc-0ee5-4190-b8a1-3721d016f849",
|
|
||||||
"ground": {
|
|
||||||
"answer": "The correct python file is written and organizes the files accordingly",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to create a random password generator.",
|
|
||||||
"difficulty": "basic",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "WritingCLIFileOrganizer",
|
|
||||||
"task": "Create a file organizer CLI tool in Python that sorts files in a directory based on their file types (e.g., images, documents, audio) and moves them into these corresponding folders: 'images', 'documents', 'audio'. The entry point will be a python file that can be run this way: python organize_files.py --directory_path=YOUR_DIRECTORY_PATH"
|
|
||||||
}
|
|
|
@ -1,16 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, expected_result)
|
|
|
@ -1,3 +0,0 @@
|
||||||
def multiply_int(num: int) -> int:
|
|
||||||
multiplied_num = num * 2
|
|
||||||
return multiplied_num
|
|
|
@ -1,16 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, expected_result)
|
|
|
@ -1,31 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 120,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReturnCodeSimple"
|
|
||||||
],
|
|
||||||
"eval_id": "a59a1904-e9d6-443b-adb7-2e1ff972843f",
|
|
||||||
"ground": {
|
|
||||||
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"8"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Small step up, just writing the function with a name as well as the return statement.",
|
|
||||||
"difficulty": "novice",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "ReturnCodeWrite",
|
|
||||||
"task": "Add a function called multiply_int in sample_code.py that multiplies numbers by 2. You can make sure you have correctly done this by running test.py"
|
|
||||||
}
|
|
|
@ -1,3 +0,0 @@
|
||||||
def multiply_int(num: int) -> int:
|
|
||||||
multiplied_num = num * 2
|
|
||||||
return multiplied_num
|
|
|
@ -1,29 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num, multiplier)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
||||||
|
|
||||||
# so its not hard coded
|
|
||||||
num = 7
|
|
||||||
multiplier = 7
|
|
||||||
expected_result = 49
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
||||||
|
|
||||||
# negative numbers
|
|
||||||
num = -6
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = -12
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
|
@ -1,3 +0,0 @@
|
||||||
def multiply_int(num: int, multiplier: int) -> int:
|
|
||||||
multiplied_num = num * multiplier
|
|
||||||
return multiplied_num
|
|
|
@ -1,29 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num, multiplier)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
||||||
|
|
||||||
# so its not hard coded
|
|
||||||
num = 7
|
|
||||||
multiplier = 7
|
|
||||||
expected_result = 49
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
||||||
|
|
||||||
# negative numbers
|
|
||||||
num = -6
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = -12
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 120,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReturnCodeWrite"
|
|
||||||
],
|
|
||||||
"eval_id": "092f3c8a-9723-4262-8e40-93d0cebba98a",
|
|
||||||
"ground": {
|
|
||||||
"answer": "def multiply_int(num, multiplier):\n return num * multiplier\n",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"8",
|
|
||||||
"49",
|
|
||||||
"-12"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Builds on the previous function also take a multiplier .",
|
|
||||||
"difficulty": "intermediate",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "ReturnCodeModify",
|
|
||||||
"task": "Modify the multiply_int function in sample_code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running test.py"
|
|
||||||
}
|
|
|
@ -1,3 +0,0 @@
|
||||||
def multiply_int(num: int) -> int:
|
|
||||||
multiplied_num = num * 2
|
|
||||||
return multiplied_num
|
|
|
@ -1,17 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num, multiplier)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# create a trivial test that has 4 as the num, and 2 as the multiplier. Make sure to fill in the expected result
|
|
||||||
num =
|
|
||||||
multiplier =
|
|
||||||
expected_result =
|
|
||||||
test_multiply_int()
|
|
|
@ -1,3 +0,0 @@
|
||||||
def multiply_int(num: int, multiplier: int) -> int:
|
|
||||||
multiplied_num = num * multiplier
|
|
||||||
return multiplied_num
|
|
|
@ -1,17 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num, multiplier)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
|
@ -1,29 +0,0 @@
|
||||||
from sample_code import multiply_int
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiply_int(num: int, multiplier, expected_result: int) -> None:
|
|
||||||
result = multiply_int(num, multiplier)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case
|
|
||||||
num = 4
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = 8
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
||||||
|
|
||||||
# so its not hard coded
|
|
||||||
num = 7
|
|
||||||
multiplier = 7
|
|
||||||
expected_result = 49
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
||||||
|
|
||||||
# negative numbers
|
|
||||||
num = -6
|
|
||||||
multiplier = 2
|
|
||||||
expected_result = -12
|
|
||||||
test_multiply_int(num, multiplier, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 120,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReturnCodeModify"
|
|
||||||
],
|
|
||||||
"eval_id": "d39b8ed1-5984-40b0-8de6-a1c5eec30bc7",
|
|
||||||
"ground": {
|
|
||||||
"answer": "Just a simple multiple by 2 function. Num is 4 so answer is 8",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"8",
|
|
||||||
"49",
|
|
||||||
"-12"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Small step up, just writing the function with a name as well as the return statement.",
|
|
||||||
"difficulty": "advanced",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "ReturnCodeTests",
|
|
||||||
"task": "First, modify testfile.py to fill in the test case to be able to test the code in sample_code.py. Next, modify the multiply_int function in sample_code.py to be able to pass in a 'multiplier' argument to multiply the 'num' by 'multiplier'. Both arguments are integers. You can make sure you have correctly done this by running testfile.py that you previously modified."
|
|
||||||
}
|
|
|
@ -1,12 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
typo
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 75,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReadFile"
|
|
||||||
],
|
|
||||||
"eval_id": "a758335b-539b-4d8a-b90e-cf7036952294",
|
|
||||||
"ground": {
|
|
||||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1]",
|
|
||||||
"[2, 5]",
|
|
||||||
"[0, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
|
||||||
"difficulty": "novice",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "DebugSimpleTypoWithGuidance",
|
|
||||||
"task": "1- Run test.py.\n2- Read sample_code.py.\n3- Modify sample_code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n"
|
|
||||||
}
|
|
|
@ -1,12 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
typo
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 75,
|
|
||||||
"dependencies": [
|
|
||||||
"TestDebugSimpleTypoWithGuidance"
|
|
||||||
],
|
|
||||||
"eval_id": "1d171b68-0374-4b08-ae6a-c7223f89ecc1",
|
|
||||||
"ground": {
|
|
||||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1]",
|
|
||||||
"[2, 5]",
|
|
||||||
"[0, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
|
|
||||||
"difficulty": "intermediate",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "DebugSimpleTypoWithoutGuidance",
|
|
||||||
"task": "Make test.py run without errors."
|
|
||||||
}
|
|
|
@ -1,12 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
typo
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from import
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,11 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 90,
|
|
||||||
"dependencies": [
|
|
||||||
"TestDebugSimpleTypoWithoutGuidance"
|
|
||||||
],
|
|
||||||
"eval_id": "f537c143-ab40-4a95-8cf2-ab90cb829ca5",
|
|
||||||
"ground": {
|
|
||||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1]",
|
|
||||||
"[2, 5]",
|
|
||||||
"[0, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Now it's not just the typo error, but also an incomplete import statement",
|
|
||||||
"difficulty": "advanced",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "DebugMultipleTypo",
|
|
||||||
"task": "Make test.py run without errors."
|
|
||||||
}
|
|
|
@ -1,22 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def three_sum(nums: List[int], target: int) -> Optional[List[int]]:
|
|
||||||
nums_indices = [(num, index) for index, num in enumerate(nums)]
|
|
||||||
nums_indices.sort()
|
|
||||||
for i in range(len(nums_indices) - 2):
|
|
||||||
if i > 0 and nums_indices[i] == nums_indices[i - 1]:
|
|
||||||
continue
|
|
||||||
l, r = i + 1, len(nums_indices) - 1
|
|
||||||
while l < r:
|
|
||||||
three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0]
|
|
||||||
if three_sum < target:
|
|
||||||
l += 1
|
|
||||||
elif three_sum > target:
|
|
||||||
r -= 1
|
|
||||||
else:
|
|
||||||
indices = sorted(
|
|
||||||
[nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]]
|
|
||||||
)
|
|
||||||
return indices
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import three_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None:
|
|
||||||
result = three_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first three numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 20
|
|
||||||
expected_result = [0, 1, 2]
|
|
||||||
test_three_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 2
|
|
||||||
expected_result = [0, 2, 5]
|
|
||||||
test_three_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 2, 3]
|
|
||||||
test_three_sum(nums, target, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 60,
|
|
||||||
"dependencies": [
|
|
||||||
"TestFunctionCodeGeneration"
|
|
||||||
],
|
|
||||||
"eval_id": "a38396b8-8f61-49fc-a973-0876a4b6b5e9",
|
|
||||||
"ground": {
|
|
||||||
"answer": "The three_sum function coded properly.",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1, 2]",
|
|
||||||
"[0, 2, 5]",
|
|
||||||
"[0, 2, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to create the three_sum function.",
|
|
||||||
"difficulty": "advanced",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "ThreeSum",
|
|
||||||
"task": "Create a three_sum function in a file called sample_code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2]."
|
|
||||||
}
|
|
|
@ -1,11 +0,0 @@
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,31 +0,0 @@
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,32 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code"
|
|
||||||
],
|
|
||||||
"cutoff": 90,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReturnCodeWrite"
|
|
||||||
],
|
|
||||||
"eval_id": "c6703d23-7d2d-4b9b-a729-8014df9a7b4e",
|
|
||||||
"ground": {
|
|
||||||
"answer": "The two_sum function coded properly.",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1]",
|
|
||||||
"[2, 5]",
|
|
||||||
"[0, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to create the two_sum function.",
|
|
||||||
"difficulty": "advanced",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "FunctionCodeGeneration",
|
|
||||||
"task": "Create a two_sum function in a file called sample_code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1]."
|
|
||||||
}
|
|
|
@ -1,5 +0,0 @@
|
||||||
1. Search 'Toronto to San Francisco flights' on the internet
|
|
||||||
2. Click on the first link that is a flight aggregator such as SkyScanner or Google Flights
|
|
||||||
3. Select the dates that you want to travel
|
|
||||||
3. Click on the 'one way' option and click search
|
|
||||||
4. Search through all of the given options and select the cheapest flight
|
|
|
@ -1,30 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"content_gen"
|
|
||||||
],
|
|
||||||
"cutoff": 120,
|
|
||||||
"dependencies": [
|
|
||||||
"TestWriteFile"
|
|
||||||
],
|
|
||||||
"eval_id": "6ff65567-eb1e-4c7d-8b7f-dfc91dc95ed1",
|
|
||||||
"ground": {
|
|
||||||
"answer": "1. Search for flights to Toronto.\n2. Select the cheapest flight.\n3. Book the flight.\n4. Pay for the flight.\n5. Confirm the flight.",
|
|
||||||
"eval": {
|
|
||||||
"scoring": "scale",
|
|
||||||
"template": "reference",
|
|
||||||
"type": "llm"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
".txt"
|
|
||||||
],
|
|
||||||
"should_contain": [],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "ability to generate content based on the content of 2 files.",
|
|
||||||
"difficulty": "basic",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "PlanCreation",
|
|
||||||
"task": "As an agent powered by a large language model, create a plan for yourself based on the task of booking a flight to Toronto. Output this plan into a .txt file."
|
|
||||||
}
|
|
|
@ -1,13 +0,0 @@
|
||||||
# mypy: ignore-errors
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
typo
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,32 +0,0 @@
|
||||||
# mypy: ignore-errors
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,12 +0,0 @@
|
||||||
# mypy: ignore-errors
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
|
|
||||||
def two_sum(nums: List, target: int) -> Optional[List[int]]:
|
|
||||||
seen = {}
|
|
||||||
for i, num in enumerate(nums):
|
|
||||||
complement = target - num
|
|
||||||
if complement in seen:
|
|
||||||
return [seen[complement], i]
|
|
||||||
seen[num] = i
|
|
||||||
return None
|
|
|
@ -1,32 +0,0 @@
|
||||||
# mypy: ignore-errors
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from sample_code import two_sum
|
|
||||||
|
|
||||||
|
|
||||||
def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
|
|
||||||
result = two_sum(nums, target)
|
|
||||||
print(result)
|
|
||||||
assert (
|
|
||||||
result == expected_result
|
|
||||||
), f"AssertionError: Expected the output to be {expected_result}"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# test the trivial case with the first two numbers
|
|
||||||
nums = [2, 7, 11, 15]
|
|
||||||
target = 9
|
|
||||||
expected_result = [0, 1]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for ability to use zero and the same number twice
|
|
||||||
nums = [2, 7, 0, 15, 12, 0]
|
|
||||||
target = 0
|
|
||||||
expected_result = [2, 5]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
||||||
|
|
||||||
# test for first and last index usage and negative numbers
|
|
||||||
nums = [-6, 7, 11, 4]
|
|
||||||
target = -2
|
|
||||||
expected_result = [0, 3]
|
|
||||||
test_two_sum(nums, target, expected_result)
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"category": [
|
|
||||||
"code",
|
|
||||||
"iterate"
|
|
||||||
],
|
|
||||||
"cutoff": 75,
|
|
||||||
"dependencies": [
|
|
||||||
"TestReadFile"
|
|
||||||
],
|
|
||||||
"eval_id": "e3794d05-7617-4b95-9e96-d5f84309f66f",
|
|
||||||
"ground": {
|
|
||||||
"answer": "[0, 1] [2, 5] [0, 3]",
|
|
||||||
"eval": {
|
|
||||||
"type": "python"
|
|
||||||
},
|
|
||||||
"files": [
|
|
||||||
"test.py"
|
|
||||||
],
|
|
||||||
"should_contain": [
|
|
||||||
"[0, 1]",
|
|
||||||
"[2, 5]",
|
|
||||||
"[0, 3]"
|
|
||||||
],
|
|
||||||
"should_not_contain": []
|
|
||||||
},
|
|
||||||
"info": {
|
|
||||||
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
|
|
||||||
"difficulty": "novice",
|
|
||||||
"side_effects": []
|
|
||||||
},
|
|
||||||
"name": "DebugSimpleTypoWithGuidance",
|
|
||||||
"task": "1- Run test.py.\n2- Read sample_code.py.\n3- Modify sample_code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n"
|
|
||||||
}
|
|
|
@ -1,2 +0,0 @@
|
||||||
The id to remember is 2314
|
|
||||||
Read the file instructions_2.txt
|
|
|
@ -1 +0,0 @@
|
||||||
Read the file instructions_3.txt
|
|
|
@ -1 +0,0 @@
|
||||||
Read the file instructions_4.txt
|
|
|
@ -1 +0,0 @@
|
||||||
Read the file instructions_5.txt
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue