First commit for AutoGPT Benchmarks

pull/5155/head
douglas 2023-04-17 17:22:31 -04:00
parent 0b899eb4cf
commit 89081d942c
11 changed files with 206 additions and 0 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "Auto-GPT"]
path = auto_gpt_benchmarking/Auto-GPT
url = https://github.com/Significant-Gravitas/Auto-GPT.git

View File

@ -1,2 +1,40 @@
# Auto-GPT-Benchmarks
A set of standardised benchmarks to assess the performance of Auto-GPTs.
# What is next?
- [ ] Build longer-form tasks (e.g. a code fix backed by testing)
- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project
- [ ] Switch to an Ubuntu container so it can do more things (git, bash, etc.)
- [ ] Lower priority, but put this in a webserver backend so we have a good API
- [ ] Get token counting data from the model. Add scores to result files based on pricing associated with tokens and models used
- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework.
## Understanding OpenAI Evals
The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs
The basic idea is this though:
1. Use a completion function to point to the model you want to test: a language model or, in our case, AutoGPT.
2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir.
3. Run the evals against the completion function.
You can then define more evals in yaml and run them against the completion function as needed.
### Completions Functions
See our yaml file in `completion_fns` dir for the registration of the completion function.
See our completion function itself in CompletionFn.py
That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py
# RANDOM SHIT
You must add the directory that contains the `auto_gpt_benchmarking` package to the python path.
Do this with a path file in your venv. OpenAI evals needs to import it.
I added a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents:
`/home/douglas/AGI/Auto-GPT-Benchmarks-fork`

@ -0,0 +1 @@
Subproject commit 97d62cc16bf45fcd406efeb33d042ebd58c24670

View File

@ -0,0 +1,88 @@
"""
This instantiates an AutoGPT agent who is capable of handling any task.
It is designed to pass benchmarks as effectively as possible.
Loads in the ai_settings.yaml file to get the AI's name, role, and goals.
Sets the ai to continuous mode. The intent is to kill it if it takes more than 50,000 tokens on any particular evaluation, but that limit is not enforced yet.
The model is instantiated with a prompt from the AutoGPT completion function.
Eventually we will also save and log all of the associated output and thinking for the model as well.
"""
from pathlib import Path
import os
class AutoGPTAgent:
    """
    Runs one evaluation prompt through Auto-GPT inside a docker container.

    Constructing the agent prepares the Auto-GPT workspace:
      * stale prompt.txt / output.txt files are removed,
      * AutoGPTData/ai_settings.yaml is copied into the workspace,
      * the evaluation prompt is written to prompt.txt in the workspace.

    Calling start() launches the container, waits for output.txt to appear in
    the workspace, returns its contents and removes the prompt/output files.

    NOTE: no token budget is enforced by this class; token usage is not polled.
    """

    def __init__(self, prompt):
        """
        Prepare the workspace for a single evaluation.

        :param prompt: the evaluation prompt text written to prompt.txt
        """
        package_dir = Path(__file__).parent
        self.auto_gpt_path = package_dir / "Auto-GPT"
        self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace"
        self.prompt_file = self.auto_workspace / "prompt.txt"
        self.output_file = self.auto_workspace / "output.txt"
        self.ai_settings_file = package_dir / "AutoGPTData" / "ai_settings.yaml"
        self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml"
        self.prompt = prompt
        # The workspace directory may not exist yet, and every write below
        # targets it. parents is deliberately left False so that a missing
        # Auto-GPT checkout still fails loudly here instead of being papered over.
        self.auto_workspace.mkdir(exist_ok=True)
        self._clean_up_workspace()
        self._copy_ai_settings()
        self._copy_prompt()

    def _clean_up_workspace(self):
        """
        Cleans up the workspace by deleting the prompt.txt and output.txt files.
        Missing files are ignored, so this is safe to call repeatedly.
        :return:
        """
        # missing_ok avoids the race between an exists() check and unlink()
        self.prompt_file.unlink(missing_ok=True)
        self.output_file.unlink(missing_ok=True)

    def _copy_ai_settings(self):
        """
        Copies the ai_settings.yaml shipped in AutoGPTData into the workspace.
        :return:
        """
        self.ai_settings_dest.write_text(self.ai_settings_file.read_text())

    def _copy_prompt(self):
        """
        Writes the evaluation prompt to prompt.txt in the workspace.
        :return:
        """
        self.prompt_file.write_text(self.prompt)

    def _start_agent(self):
        """
        This starts the agent in the docker container and blocks until the
        container exits.
        This assumes you have the docker image built with:
        docker build -t autogpt .
        In the dockerfile in the Auto-GPT repo.
        You also must set up the .env file in the Auto-GPT repo.

        The command is passed as an argument list (no shell), so paths that
        contain spaces or shell metacharacters are handled correctly.
        Raises FileNotFoundError if the docker executable cannot be found.
        :return:
        """
        import subprocess

        env_file = self.auto_gpt_path / ".env"
        # run it in continuous mode and skip re-prompts
        command = [
            "docker",
            "run",
            "-it",
            f"--env-file={env_file}",
            "-v",
            f"{self.auto_workspace}:/home/appuser/auto_gpt_workspace",
            "-v",
            f"{self.auto_gpt_path}/autogpt:/home/appuser/autogpt",
            "autogpt",
            "--continuous",
            "-C",
            "/home/appuser/auto_gpt_workspace/ai_settings.yaml",
        ]
        # check=False: a non-zero exit status from the container is ignored,
        # callers rely on output.txt instead.
        subprocess.run(command, check=False)

    def _poll_for_output(self, poll_interval=0.5, timeout=None):
        """
        This polls the output file to see if the model has finished.

        :param poll_interval: seconds to sleep between checks, so the loop
            does not spin a CPU core while waiting
        :param timeout: maximum number of seconds to wait; None (the default)
            waits indefinitely
        :return: the text content of output.txt
        :raises TimeoutError: if timeout is set and output.txt did not appear
            in time
        """
        import time

        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            if self.output_file.exists():
                return self.output_file.read_text()
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"{self.output_file} was not produced within {timeout} seconds"
                )
            time.sleep(poll_interval)

    def start(self):
        """
        Runs the agent, waits for its answer and cleans up the workspace.
        :return: the text the agent wrote to output.txt
        """
        self._start_agent()
        answer = self._poll_for_output()
        print('about to do clean up')
        print(answer)
        self._clean_up_workspace()
        print('did clean up')
        return answer

View File

@ -0,0 +1,8 @@
ai_goals:
- Evaluate the prompt in `prompt.txt`
- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided.
- Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer.
- Save your work in the `output.txt` file, the second you do this, exit the program.
- Exit the program when you are done.
ai_name: EvaluationAgent
ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible

View File

@ -0,0 +1,27 @@
import importlib
from typing import Optional
from evals.api import CompletionFn, CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling
from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent
class AutoGPTCompletionResult(CompletionResult):
    """
    Holds the raw text response produced for a single prompt.
    """

    def __init__(self, response) -> None:
        # Stored untouched; whitespace is only trimmed when completions are read.
        self.response = response

    def get_completions(self) -> list[str]:
        """
        Return the stored response, stripped of surrounding whitespace, as a
        one-element list.
        """
        trimmed = self.response.strip()
        return [trimmed]
class AutoGPTCompletionFn(CompletionFn):
    """
    Completion function that answers each prompt by running an AutoGPTAgent.
    """

    def __init__(self, **kwargs) -> None:
        # Keyword arguments are accepted but not used.
        return None

    def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult:
        """
        Format the prompt, run a fresh agent on it, record the sample and
        return the agent's answer wrapped in an AutoGPTCompletionResult.
        """
        formatted_prompt = CompletionPrompt(prompt).to_formatted_prompt()
        answer = AutoGPTAgent(formatted_prompt).start()
        record_sampling(prompt=formatted_prompt, sampled=answer)
        return AutoGPTCompletionResult(answer)

View File

@ -0,0 +1,34 @@
import importlib
from typing import Optional
from evals.api import CompletionFn, CompletionResult
from langchain.llms import BaseLLM
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling
class LangChainLLMCompletionResult(CompletionResult):
    """
    Holds the raw text response returned by the wrapped LLM for one prompt.
    """

    def __init__(self, response) -> None:
        # Stored untouched; whitespace is only trimmed when completions are read.
        self.response = response

    def get_completions(self) -> list[str]:
        """
        Return the stored response, stripped of surrounding whitespace, as a
        one-element list.
        """
        trimmed = self.response.strip()
        return [trimmed]
class LangChainLLMCompletionFn(CompletionFn):
    """
    Completion function backed by an LLM class looked up in ``langchain.llms``.
    """

    def __init__(self, llm: str, llm_kwargs: Optional[dict] = None, **kwargs) -> None:
        """
        Resolve ``llm`` to a class in ``langchain.llms`` and instantiate it.

        :param llm: name of the attribute in ``langchain.llms`` to instantiate
        :param llm_kwargs: keyword arguments for the LLM constructor; None or
            omitted means no arguments
        :raises ValueError: if the resolved attribute is not a subclass of BaseLLM
        """
        module = importlib.import_module("langchain.llms")
        LLMClass = getattr(module, llm)
        # issubclass() raises TypeError for non-class objects, so check that
        # first and report every mismatch with the same ValueError.
        if not (isinstance(LLMClass, type) and issubclass(LLMClass, BaseLLM)):
            raise ValueError(f"{llm} is not a subclass of BaseLLM")
        # None default instead of a shared mutable {}; an explicit None from
        # the caller is also accepted instead of failing on **None.
        self.llm = LLMClass(**(llm_kwargs or {}))

    def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult:
        """
        Format the prompt, send it to the wrapped LLM, record the sample and
        return the response wrapped in a LangChainLLMCompletionResult.
        """
        prompt = CompletionPrompt(prompt).to_formatted_prompt()
        response = self.llm(prompt)
        record_sampling(prompt=prompt, sampled=response)
        return LangChainLLMCompletionResult(response)

View File

View File

@ -0,0 +1,2 @@
auto_gpt_completion_fn:
class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn

View File

@ -0,0 +1,4 @@
"""
To run auto-gpt we need to run the following command:
"""

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
evals