feat(agent/web): Improve `read_webpage` information extraction abilities
* Implement `extract_information` function in the `autogpt.processing.text` module. This function extracts pieces of information from a body of text based on a list of topics of interest.
* Add `topics_of_interest` and `get_raw_content` parameters to the `read_webpage` command
* Limit maximum content length if `get_raw_content=true` is specified

pull/6774/head
parent 956cdc77fa
commit 55433f468a
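
As a quick illustration of the new interface, here is a hypothetical usage sketch (not part of the diff; the import path, the `agent` instance, and the URLs are assumptions):

```python
# Hypothetical usage of the updated command; the import path and the way an
# Agent is obtained are assumptions, not part of this diff.
from autogpt.commands.web_selenium import read_webpage  # assumed module path


async def demo(agent) -> None:
    # Extract only the listed topics instead of summarizing the whole page
    bullets = await read_webpage(
        url="https://example.com/pricing",
        agent=agent,
        topics_of_interest=["plan names", "monthly price"],
    )
    print(bullets)

    # Or fetch the raw page text; raises TooMuchOutputError if the page is
    # longer than MAX_RAW_CONTENT_LENGTH tokens.
    raw = await read_webpage(
        url="https://example.com/pricing",
        agent=agent,
        get_raw_content=True,
    )
    print(raw)
# run with: asyncio.run(demo(agent)) for an initialized autogpt.agents.Agent
```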

@@ -30,11 +30,11 @@ from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager

from autogpt.agents.utils.exceptions import CommandExecutionError
from autogpt.agents.utils.exceptions import CommandExecutionError, TooMuchOutputError
from autogpt.command_decorator import command
from autogpt.core.utils.json_schema import JSONSchema
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from autogpt.processing.text import summarize_text
from autogpt.processing.text import extract_information, summarize_text
from autogpt.url_utils.validators import validate_url

COMMAND_CATEGORY = "web_browse"

@@ -49,7 +49,7 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)

FILE_DIR = Path(__file__).parent.parent
TOKENS_TO_TRIGGER_SUMMARY = 50
MAX_RAW_CONTENT_LENGTH = 500
LINKS_TO_RETURN = 20

@@ -60,10 +60,8 @@ class BrowsingError(CommandExecutionError):
@command(
    "read_webpage",
    (
        "Read a webpage, and extract specific information from it"
        " if a question is specified."
        " If you are looking to extract specific information from the webpage,"
        " you should specify a question."
        "Read a webpage, and extract specific information from it."
        " You must specify either topics_of_interest, a question, or get_raw_content."
    ),
    {
        "url": JSONSchema(

@@ -71,6 +69,15 @@ class BrowsingError(CommandExecutionError):
            description="The URL to visit",
            required=True,
        ),
        "topics_of_interest": JSONSchema(
            type=JSONSchema.Type.ARRAY,
            items=JSONSchema(type=JSONSchema.Type.STRING),
            description=(
                "A list of topics about which you want to extract information "
                "from the page."
            ),
            required=False,
        ),
        "question": JSONSchema(
            type=JSONSchema.Type.STRING,
            description=(

@@ -78,10 +85,25 @@ class BrowsingError(CommandExecutionError):
            ),
            required=False,
        ),
        "get_raw_content": JSONSchema(
            type=JSONSchema.Type.BOOLEAN,
            description=(
                "If true, the unprocessed content of the webpage will be returned. "
                "This consumes a lot of tokens, so use it with caution."
            ),
            required=False,
        ),
    },
)
@validate_url
async def read_webpage(url: str, agent: Agent, question: str = "") -> str:
async def read_webpage(
    url: str,
    agent: Agent,
    *,
    topics_of_interest: list[str] = [],
    get_raw_content: bool = False,
    question: str = "",
) -> str:
    """Browse a website and return the answer and links to the user

    Args:
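
To make the schema above concrete, a tool call against this command could carry arguments shaped like the following (values are invented; only the parameter names come from the schema):

```python
# Invented argument payloads matching the `read_webpage` schema above;
# exactly one of the three extraction modes would normally be used per call.
args_by_mode = {
    "topics": {
        "url": "https://example.com/changelog",
        "topics_of_interest": ["breaking changes", "new commands"],
    },
    "question": {
        "url": "https://example.com/changelog",
        "question": "Which release added the web_browse category?",
    },
    "raw": {
        "url": "https://example.com/changelog",
        "get_raw_content": True,
    },
}
print(args_by_mode["topics"])
```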

@@ -103,12 +125,19 @@ async def read_webpage(url: str, agent: Agent, question: str = "") -> str:
    summarized = False
    if not text:
        return f"Website did not contain any text.\n\nLinks: {links}"
    elif (
        agent.llm_provider.count_tokens(text, agent.llm.name)
        > TOKENS_TO_TRIGGER_SUMMARY
    ):
    elif get_raw_content:
        if (
            output_tokens := agent.llm_provider.count_tokens(text, agent.llm.name)
        ) > MAX_RAW_CONTENT_LENGTH:
            oversize_factor = round(output_tokens / MAX_RAW_CONTENT_LENGTH, 1)
            raise TooMuchOutputError(
                f"Page content is {oversize_factor}x the allowed length "
                "for `get_raw_content=true`"
            )
        return text + (f"\n\nLinks: {links}" if links else "")
    else:
        text = await summarize_memorize_webpage(
            url, text, question or None, agent, driver
            url, text, question or None, topics_of_interest, agent, driver
        )
        return_literal_content = bool(question)
        summarized = True
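
The guard above caps raw output by token count rather than truncating it. A standalone sketch of the same check, with a fake token counter so it runs without a model:

```python
# Standalone sketch of the guard above; the token counter is fake so this runs
# without a model, and ValueError stands in for TooMuchOutputError.
MAX_RAW_CONTENT_LENGTH = 500  # same budget as the constant added in this diff


def check_raw_content_budget(text: str, count_tokens=lambda s: len(s) // 4) -> None:
    output_tokens = count_tokens(text)
    if output_tokens > MAX_RAW_CONTENT_LENGTH:
        oversize_factor = round(output_tokens / MAX_RAW_CONTENT_LENGTH, 1)
        raise ValueError(
            f"Page content is {oversize_factor}x the allowed length "
            "for `get_raw_content=true`"
        )


check_raw_content_budget("a short page")       # fits the budget, returns silently
# check_raw_content_budget("x" * 10_000)       # would raise: ~5.0x over budget
```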

@@ -265,6 +294,7 @@ async def summarize_memorize_webpage(
    url: str,
    text: str,
    question: str | None,
    topics_of_interest: list[str],
    agent: Agent,
    driver: Optional[WebDriver] = None,
) -> str:

@@ -295,10 +325,21 @@ async def summarize_memorize_webpage(
    # )
    # memory.add(new_memory)

    summary, _ = await summarize_text(
        text,
        question=question,
        llm_provider=agent.llm_provider,
        config=agent.legacy_config,  # FIXME
    )
    return summary
    result = None
    information = None
    if topics_of_interest:
        information = await extract_information(
            text,
            topics_of_interest=topics_of_interest,
            llm_provider=agent.llm_provider,
            config=agent.legacy_config,
        )
        return "\n".join(f"* {i}" for i in information)
    else:
        result, _ = await summarize_text(
            text,
            question=question,
            llm_provider=agent.llm_provider,
            config=agent.legacy_config,
        )
        return result
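
With this change the helper returns either a bullet list (topics mode) or a plain summary (question/default mode). A rough, invented illustration of the two output shapes:

```python
# Invented values illustrating the two return shapes of the rewritten helper;
# only the "* " bullet formatting mirrors the code above.
information = ["The Pro plan costs $20/month.", "Prices were last updated in 2024."]
topics_output = "\n".join(f"* {i}" for i in information)   # topics_of_interest mode
summary_output = "The page lists subscription tiers and their monthly prices."
print(topics_output)
print(summary_output)
```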

@@ -1,4 +1,5 @@
"""Text processing functions"""
import json
import logging
import math
from typing import Iterator, Optional, TypeVar

@@ -10,6 +11,7 @@ from autogpt.core.prompting import ChatPrompt
from autogpt.core.resource.model_providers import (
    ChatMessage,
    ChatModelProvider,
    ChatModelResponse,
    ModelTokenizer,
)

@@ -57,105 +59,165 @@
    text: str,
    llm_provider: ChatModelProvider,
    config: Config,
    instruction: Optional[str] = None,
    question: Optional[str] = None,
) -> tuple[str, None | list[tuple[str, str]]]:
    """Summarize text using the OpenAI API

    Args:
        text (str): The text to summarize.
        llm_provider: LLM provider to use for summarization.
        config (Config): The global application config, containing the FAST_LLM setting.
        instruction (str): Additional instruction for summarization, e.g.
            "focus on information related to polar bears", or
            "omit personal information contained in the text".
        question (str): Question to be answered by the summary.

    Returns:
        str: The summary of the text
        list[(summary, chunk)]: Text chunks and their summary, if the text was chunked.
            None otherwise.
    """
    if not text:
        raise ValueError("No text to summarize")

    if instruction and question:
        raise ValueError("Parameters 'question' and 'instructions' cannot both be set")

    model = config.fast_llm

    instruction: Optional[str] = None,
) -> tuple[str, list[tuple[str, str]]]:
    if question:
        if instruction:
            raise ValueError(
                "Parameters 'question' and 'instructions' cannot both be set"
            )

        instruction = (
            'Include any information that can be used to answer the question: "%s". '
            "Do not directly answer the question itself."
        ) % question

    summarization_prompt = ChatPrompt(messages=[])

    text_tlength = llm_provider.count_tokens(text, model)
    logger.info(f"Text length: {text_tlength} tokens")

    # reserve 50 tokens for summary prompt, 500 for the response
    max_chunk_length = llm_provider.get_token_limit(model) - 550
    logger.info(f"Max chunk length: {max_chunk_length} tokens")

    if text_tlength < max_chunk_length:
        # summarization_prompt.add("user", text)
        summarization_prompt.messages.append(
            ChatMessage.user(
                "Write a concise summary of the following text."
                f"{f' {instruction}' if instruction is not None else ''}:"
                "\n\n\n"
                f'LITERAL TEXT: """{text}"""'
                "\n\n\n"
                "CONCISE SUMMARY: The text is best summarized as"
            )
            f'From the text, answer the question: "{question}". '
            "If the answer is not in the text, indicate this clearly "
            "and concisely state why the text is not suitable to answer the question."
        )
    elif not instruction:
        instruction = (
            "Summarize or describe the text clearly and concisely, "
            "whichever seems more appropriate."
        )

        summary = (
            await llm_provider.create_chat_completion(
                model_prompt=summarization_prompt.messages,
                model_name=model,
                temperature=0,
                max_tokens=500,
            )
        ).response.content

        logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n")
        return summary.strip(), None

    summaries: list[str] = []
    chunks = list(
        split_text(
            text,
            config=config,
            max_chunk_length=max_chunk_length,
            tokenizer=llm_provider.get_tokenizer(model),
        )
    )

    for i, (chunk, chunk_length) in enumerate(chunks):
        logger.info(
            f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens"
        )
        summary, _ = await summarize_text(
            text=chunk,
            instruction=instruction,
            llm_provider=llm_provider,
            config=config,
        )
        summaries.append(summary)

    logger.info(f"Summarized {len(chunks)} chunks")

    summary, _ = await summarize_text(
        "\n\n".join(summaries),
    return await _process_text(  # type: ignore
        text=text,
        instruction=instruction,
        llm_provider=llm_provider,
        config=config,
    )
    return summary.strip(), [
        (summaries[i], chunks[i][0]) for i in range(0, len(chunks))
    ]
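
After this rewrite, `summarize_text` turns a `question` into an extraction-style instruction and delegates the rest to `_process_text`. Hypothetical call sites (the provider, config, and `page_text` are placeholders):

```python
# Hypothetical call sites for the reworked summarize_text; page_text and the
# provider/config objects are placeholders, not part of this diff.
from autogpt.processing.text import summarize_text


async def summarize_examples(page_text, llm_provider, config):
    # A question is converted into an extraction-style instruction internally
    answer_summary, _ = await summarize_text(
        text=page_text,
        question="What does the refund policy say?",
        llm_provider=llm_provider,
        config=config,
    )

    # A plain instruction is forwarded to _process_text as-is
    focused_summary, _ = await summarize_text(
        text=page_text,
        instruction="Focus on information related to pricing.",
        llm_provider=llm_provider,
        config=config,
    )
    # Passing both `question` and `instruction` raises ValueError.
    return answer_summary, focused_summary
```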

async def extract_information(
    source_text: str,
    topics_of_interest: list[str],
    llm_provider: ChatModelProvider,
    config: Config,
) -> list[str]:
    fmt_topics_list = "\n".join(f"* {topic}." for topic in topics_of_interest)
    instruction = (
        "Extract relevant pieces of information about the following topics:\n"
        f"{fmt_topics_list}\n"
        "Reword pieces of information if needed to make them self-explanatory. "
        "Be concise.\n\n"
        "Respond with an `Array<string>` in JSON format AND NOTHING ELSE. "
        'If the text contains no relevant information, return "[]".'
    )
    return await _process_text(  # type: ignore
        text=source_text,
        instruction=instruction,
        output_type=list[str],
        llm_provider=llm_provider,
        config=config,
    )
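
The new `extract_information` helper can also be called directly. A hypothetical example (placeholder provider, config, and source text):

```python
# Hypothetical direct use of the new helper; the provider, config, and source
# text are placeholders.
from autogpt.processing.text import extract_information


async def extract_example(source_text, llm_provider, config):
    facts = await extract_information(
        source_text,
        topics_of_interest=["release date", "licensing"],
        llm_provider=llm_provider,
        config=config,
    )
    # facts is a list[str]; summarize_memorize_webpage joins it into "* ..." bullets
    return facts
```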

async def _process_text(
    text: str,
    instruction: str,
    llm_provider: ChatModelProvider,
    config: Config,
    output_type: type[str | list[str]] = str,
) -> tuple[str, list[tuple[str, str]]] | list[str]:
    """Process text using the OpenAI API for summarization or information extraction

    Params:
        text (str): The text to process.
        instruction (str): Additional instruction for processing.
        llm_provider: LLM provider to use.
        config (Config): The global application config.
        output_type: `str` for summaries or `list[str]` for piece-wise info extraction.

    Returns:
        For summarization: tuple[str, None | list[(summary, chunk)]]
        For piece-wise information extraction: list[str]
    """
    if not text.strip():
        raise ValueError("No content")

    model = config.fast_llm

    text_tlength = llm_provider.count_tokens(text, model)
    logger.debug(f"Text length: {text_tlength} tokens")

    max_result_tokens = 500
    max_chunk_length = llm_provider.get_token_limit(model) - max_result_tokens - 50
    logger.debug(f"Max chunk length: {max_chunk_length} tokens")

    if text_tlength < max_chunk_length:
        prompt = ChatPrompt(
            messages=[
                ChatMessage.system(
                    "The user is going to give you a text enclosed in triple quotes. "
                    f"{instruction}"
                ),
                ChatMessage.user(f'"""{text}"""'),
            ]
        )

        logger.debug(f"PROCESSING:\n{prompt}")

        response = await llm_provider.create_chat_completion(
            model_prompt=prompt.messages,
            model_name=model,
            temperature=0.5,
            max_tokens=max_result_tokens,
            completion_parser=lambda s: (
                json.loads(s.content) if output_type is not str else None
            ),
        )

        if output_type == list[str]:
            logger.debug(f"Raw LLM response: {repr(response.response.content)}")
            fmt_result_bullet_list = "\n".join(f"* {r}" for r in response.parsed_result)
            logger.debug(
                f"\n{'-'*11} EXTRACTION RESULT {'-'*12}\n"
                f"{fmt_result_bullet_list}\n"
                f"{'-'*42}\n"
            )
            return response.parsed_result
        else:
            summary = response.response.content
            logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n")
            return summary.strip(), [(summary, text)]
    else:
        chunks = list(
            split_text(
                text,
                config=config,
                max_chunk_length=max_chunk_length,
                tokenizer=llm_provider.get_tokenizer(model),
            )
        )

        processed_results = []
        for i, (chunk, _) in enumerate(chunks):
            logger.info(f"Processing chunk {i + 1} / {len(chunks)}")
            chunk_result = await _process_text(
                text=chunk,
                instruction=instruction,
                output_type=output_type,
                llm_provider=llm_provider,
                config=config,
            )
            processed_results.extend(
                chunk_result if output_type == list[str] else [chunk_result]
            )

        if output_type == list[str]:
            return processed_results
        else:
            summary, _ = await _process_text(
                "\n\n".join([result[0] for result in processed_results]),
                instruction=(
                    "The text consists of multiple partial summaries. "
                    "Combine these partial summaries into one."
                ),
                llm_provider=llm_provider,
                config=config,
            )
            return summary.strip(), [
                (processed_results[i], chunks[i][0]) for i in range(0, len(chunks))
            ]


def split_text(
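
For inputs that exceed the chunk budget, `_process_text` splits the text, recurses per chunk, and then either concatenates the extracted items or merges the partial summaries in a final pass. A toy, LLM-free sketch of that chunk-then-merge pattern:

```python
# Toy, LLM-free sketch of the chunk-then-merge pattern; the per-chunk
# "processing" is faked so the example runs without a model.
def process_chunks(text: str, topics: list[str] | None, max_len: int = 80):
    chunks = [text[i : i + max_len] for i in range(0, len(text), max_len)]
    if topics is not None:
        # extraction mode: per-chunk item lists are concatenated (list[str] output)
        items = []
        for chunk in chunks:
            items += [s.strip() for s in chunk.split(".") if any(t in s for t in topics)]
        return items
    # summary mode: partial summaries get combined into one final result
    partial_summaries = [chunk.split(".")[0].strip() for chunk in chunks]
    return " ".join(partial_summaries)


doc = "AutoGPT reads webpages. It can extract topics of interest. " * 3
print(process_chunks(doc, topics=["topics"]))
print(process_chunks(doc, topics=None))
```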