feat(agent/web): Improve `read_webpage` information extraction abilities

* Implement `extract_information` function in `autogpt.processing.text` module. This function extracts pieces of information from a body of text based on a list of topics of interest.
* Add `topics_of_interest` and `get_raw_content` parameters to `read_webpage` command
   * Limit maximum content length if `get_raw_content=true` is specified
pull/6774/head
Reinier van der Leer 2024-01-31 15:08:08 +01:00
parent 956cdc77fa
commit 55433f468a
No known key found for this signature in database
GPG Key ID: CDC1180FDAE06193
2 changed files with 214 additions and 111 deletions

View File

@@ -30,11 +30,11 @@ from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
from autogpt.agents.utils.exceptions import CommandExecutionError
from autogpt.agents.utils.exceptions import CommandExecutionError, TooMuchOutputError
from autogpt.command_decorator import command
from autogpt.core.utils.json_schema import JSONSchema
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from autogpt.processing.text import summarize_text
from autogpt.processing.text import extract_information, summarize_text
from autogpt.url_utils.validators import validate_url
COMMAND_CATEGORY = "web_browse"
@@ -49,7 +49,7 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
FILE_DIR = Path(__file__).parent.parent
TOKENS_TO_TRIGGER_SUMMARY = 50
MAX_RAW_CONTENT_LENGTH = 500
LINKS_TO_RETURN = 20
@@ -60,10 +60,8 @@ class BrowsingError(CommandExecutionError):
@command(
"read_webpage",
(
"Read a webpage, and extract specific information from it"
" if a question is specified."
" If you are looking to extract specific information from the webpage,"
" you should specify a question."
"Read a webpage, and extract specific information from it."
" You must specify either topics_of_interest, a question, or get_raw_content."
),
{
"url": JSONSchema(
@@ -71,6 +69,15 @@ class BrowsingError(CommandExecutionError):
description="The URL to visit",
required=True,
),
"topics_of_interest": JSONSchema(
type=JSONSchema.Type.ARRAY,
items=JSONSchema(type=JSONSchema.Type.STRING),
description=(
"A list of topics about which you want to extract information "
"from the page."
),
required=False,
),
"question": JSONSchema(
type=JSONSchema.Type.STRING,
description=(
@@ -78,10 +85,25 @@ class BrowsingError(CommandExecutionError):
),
required=False,
),
"get_raw_content": JSONSchema(
type=JSONSchema.Type.BOOLEAN,
description=(
"If true, the unprocessed content of the webpage will be returned. "
"This consumes a lot of tokens, so use it with caution."
),
required=False,
),
},
)
@validate_url
async def read_webpage(url: str, agent: Agent, question: str = "") -> str:
async def read_webpage(
url: str,
agent: Agent,
*,
topics_of_interest: list[str] = [],
get_raw_content: bool = False,
question: str = "",
) -> str:
"""Browse a website and return the answer and links to the user
Args:
@@ -103,12 +125,19 @@ async def read_webpage(url: str, agent: Agent, question: str = "") -> str:
summarized = False
if not text:
return f"Website did not contain any text.\n\nLinks: {links}"
elif (
agent.llm_provider.count_tokens(text, agent.llm.name)
> TOKENS_TO_TRIGGER_SUMMARY
):
elif get_raw_content:
if (
output_tokens := agent.llm_provider.count_tokens(text, agent.llm.name)
) > MAX_RAW_CONTENT_LENGTH:
oversize_factor = round(output_tokens / MAX_RAW_CONTENT_LENGTH, 1)
raise TooMuchOutputError(
f"Page content is {oversize_factor}x the allowed length "
"for `get_raw_content=true`"
)
return text + (f"\n\nLinks: {links}" if links else "")
else:
text = await summarize_memorize_webpage(
url, text, question or None, agent, driver
url, text, question or None, topics_of_interest, agent, driver
)
return_literal_content = bool(question)
summarized = True
@@ -265,6 +294,7 @@ async def summarize_memorize_webpage(
url: str,
text: str,
question: str | None,
topics_of_interest: list[str],
agent: Agent,
driver: Optional[WebDriver] = None,
) -> str:
@@ -295,10 +325,21 @@ async def summarize_memorize_webpage(
# )
# memory.add(new_memory)
summary, _ = await summarize_text(
result = None
information = None
if topics_of_interest:
information = await extract_information(
text,
topics_of_interest=topics_of_interest,
llm_provider=agent.llm_provider,
config=agent.legacy_config,
)
return "\n".join(f"* {i}" for i in information)
else:
result, _ = await summarize_text(
text,
question=question,
llm_provider=agent.llm_provider,
config=agent.legacy_config, # FIXME
config=agent.legacy_config,
)
return summary
return result

View File

@@ -1,4 +1,5 @@
"""Text processing functions"""
import json
import logging
import math
from typing import Iterator, Optional, TypeVar
@@ -10,6 +11,7 @@ from autogpt.core.prompting import ChatPrompt
from autogpt.core.resource.model_providers import (
ChatMessage,
ChatModelProvider,
ChatModelResponse,
ModelTokenizer,
)
@@ -57,74 +59,127 @@ async def summarize_text(
text: str,
llm_provider: ChatModelProvider,
config: Config,
instruction: Optional[str] = None,
question: Optional[str] = None,
) -> tuple[str, None | list[tuple[str, str]]]:
"""Summarize text using the OpenAI API
instruction: Optional[str] = None,
) -> tuple[str, list[tuple[str, str]]]:
if question:
if instruction:
raise ValueError(
"Parameters 'question' and 'instructions' cannot both be set"
)
Args:
text (str): The text to summarize.
llm_provider: LLM provider to use for summarization.
config (Config): The global application config, containing the FAST_LLM setting.
instruction (str): Additional instruction for summarization, e.g.
"focus on information related to polar bears", or
"omit personal information contained in the text".
question (str): Question to be answered by the summary.
instruction = (
f'From the text, answer the question: "{question}". '
"If the answer is not in the text, indicate this clearly "
"and concisely state why the text is not suitable to answer the question."
)
elif not instruction:
instruction = (
"Summarize or describe the text clearly and concisely, "
"whichever seems more appropriate."
)
return await _process_text( # type: ignore
text=text,
instruction=instruction,
llm_provider=llm_provider,
config=config,
)
async def extract_information(
    source_text: str,
    topics_of_interest: list[str],
    llm_provider: ChatModelProvider,
    config: Config,
) -> list[str]:
    """Extract pieces of information from a body of text, based on topics of interest.

    Args:
        source_text: The text to extract information from.
        topics_of_interest: Topics about which to extract relevant information.
        llm_provider: The LLM provider used to perform the extraction.
        config: The global application config (selects the model to use).

    Returns:
        list[str]: Self-explanatory pieces of information extracted from the text;
            empty if the text contains nothing relevant to the given topics.
    """
    # Render the topics as a bullet list for inclusion in the prompt.
    fmt_topics_list = "\n".join(f"* {topic}." for topic in topics_of_interest)
    # Instruct the model to respond with a JSON string array and nothing else,
    # so the response can be parsed with json.loads downstream.
    instruction = (
        "Extract relevant pieces of information about the following topics:\n"
        f"{fmt_topics_list}\n"
        "Reword pieces of information if needed to make them self-explanatory. "
        "Be concise.\n\n"
        "Respond with an `Array<string>` in JSON format AND NOTHING ELSE. "
        'If the text contains no relevant information, return "[]".'
    )
    return await _process_text(  # type: ignore
        text=source_text,
        instruction=instruction,
        output_type=list[str],
        llm_provider=llm_provider,
        config=config,
    )
async def _process_text(
text: str,
instruction: str,
llm_provider: ChatModelProvider,
config: Config,
output_type: type[str | list[str]] = str,
) -> tuple[str, list[tuple[str, str]]] | list[str]:
"""Process text using the OpenAI API for summarization or information extraction
Params:
text (str): The text to process.
instruction (str): Additional instruction for processing.
llm_provider: LLM provider to use.
config (Config): The global application config.
output_type: `str` for summaries or `list[str]` for piece-wise info extraction.
Returns:
str: The summary of the text
list[(summary, chunk)]: Text chunks and their summary, if the text was chunked.
None otherwise.
For summarization: tuple[str, None | list[(summary, chunk)]]
For piece-wise information extraction: list[str]
"""
if not text:
raise ValueError("No text to summarize")
if instruction and question:
raise ValueError("Parameters 'question' and 'instructions' cannot both be set")
if not text.strip():
raise ValueError("No content")
model = config.fast_llm
if question:
instruction = (
'Include any information that can be used to answer the question: "%s". '
"Do not directly answer the question itself."
) % question
summarization_prompt = ChatPrompt(messages=[])
text_tlength = llm_provider.count_tokens(text, model)
logger.info(f"Text length: {text_tlength} tokens")
logger.debug(f"Text length: {text_tlength} tokens")
# reserve 50 tokens for summary prompt, 500 for the response
max_chunk_length = llm_provider.get_token_limit(model) - 550
logger.info(f"Max chunk length: {max_chunk_length} tokens")
max_result_tokens = 500
max_chunk_length = llm_provider.get_token_limit(model) - max_result_tokens - 50
logger.debug(f"Max chunk length: {max_chunk_length} tokens")
if text_tlength < max_chunk_length:
# summarization_prompt.add("user", text)
summarization_prompt.messages.append(
ChatMessage.user(
"Write a concise summary of the following text."
f"{f' {instruction}' if instruction is not None else ''}:"
"\n\n\n"
f'LITERAL TEXT: """{text}"""'
"\n\n\n"
"CONCISE SUMMARY: The text is best summarized as"
)
prompt = ChatPrompt(
messages=[
ChatMessage.system(
"The user is going to give you a text enclosed in triple quotes. "
f"{instruction}"
),
ChatMessage.user(f'"""{text}"""'),
]
)
summary = (
await llm_provider.create_chat_completion(
model_prompt=summarization_prompt.messages,
logger.debug(f"PROCESSING:\n{prompt}")
response = await llm_provider.create_chat_completion(
model_prompt=prompt.messages,
model_name=model,
temperature=0,
max_tokens=500,
temperature=0.5,
max_tokens=max_result_tokens,
completion_parser=lambda s: (
json.loads(s.content) if output_type is not str else None
),
)
).response.content
if output_type == list[str]:
logger.debug(f"Raw LLM response: {repr(response.response.content)}")
fmt_result_bullet_list = "\n".join(f"* {r}" for r in response.parsed_result)
logger.debug(
f"\n{'-'*11} EXTRACTION RESULT {'-'*12}\n"
f"{fmt_result_bullet_list}\n"
f"{'-'*42}\n"
)
return response.parsed_result
else:
summary = response.response.content
logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n")
return summary.strip(), None
summaries: list[str] = []
return summary.strip(), [(summary, text)]
else:
chunks = list(
split_text(
text,
@@ -134,27 +189,34 @@
)
)
for i, (chunk, chunk_length) in enumerate(chunks):
logger.info(
f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens"
)
summary, _ = await summarize_text(
processed_results = []
for i, (chunk, _) in enumerate(chunks):
logger.info(f"Processing chunk {i + 1} / {len(chunks)}")
chunk_result = await _process_text(
text=chunk,
instruction=instruction,
output_type=output_type,
llm_provider=llm_provider,
config=config,
)
summaries.append(summary)
processed_results.extend(
chunk_result if output_type == list[str] else [chunk_result]
)
logger.info(f"Summarized {len(chunks)} chunks")
summary, _ = await summarize_text(
"\n\n".join(summaries),
if output_type == list[str]:
return processed_results
else:
summary, _ = await _process_text(
"\n\n".join([result[0] for result in processed_results]),
instruction=(
"The text consists of multiple partial summaries. "
"Combine these partial summaries into one."
),
llm_provider=llm_provider,
config=config,
)
return summary.strip(), [
(summaries[i], chunks[i][0]) for i in range(0, len(chunks))
(processed_results[i], chunks[i][0]) for i in range(0, len(chunks))
]