feat(agent/web): Improve `read_webpage` information extraction abilities

* Implement `extract_information` function in `autogpt.processing.text` module. This function extracts pieces of information from a body of text based on a list of topics of interest.
* Add `topics_of_interest` and `get_raw_content` parameters to `read_webpage` command
   * Limit maximum content length if `get_raw_content=true` is specified
pull/6774/head
Reinier van der Leer 2024-01-31 15:08:08 +01:00
parent 956cdc77fa
commit 55433f468a
No known key found for this signature in database
GPG Key ID: CDC1180FDAE06193
2 changed files with 214 additions and 111 deletions

View File

@@ -30,11 +30,11 @@ from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager as EdgeDriverManager
from autogpt.agents.utils.exceptions import CommandExecutionError
from autogpt.agents.utils.exceptions import CommandExecutionError, TooMuchOutputError
from autogpt.command_decorator import command
from autogpt.core.utils.json_schema import JSONSchema
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from autogpt.processing.text import summarize_text
from autogpt.processing.text import extract_information, summarize_text
from autogpt.url_utils.validators import validate_url
COMMAND_CATEGORY = "web_browse"
@@ -49,7 +49,7 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
FILE_DIR = Path(__file__).parent.parent
TOKENS_TO_TRIGGER_SUMMARY = 50
MAX_RAW_CONTENT_LENGTH = 500
LINKS_TO_RETURN = 20
@@ -60,10 +60,8 @@ class BrowsingError(CommandExecutionError):
@command(
"read_webpage",
(
"Read a webpage, and extract specific information from it"
" if a question is specified."
" If you are looking to extract specific information from the webpage,"
" you should specify a question."
"Read a webpage, and extract specific information from it."
" You must specify either topics_of_interest, a question, or get_raw_content."
),
{
"url": JSONSchema(
@@ -71,6 +69,15 @@ class BrowsingError(CommandExecutionError):
description="The URL to visit",
required=True,
),
"topics_of_interest": JSONSchema(
type=JSONSchema.Type.ARRAY,
items=JSONSchema(type=JSONSchema.Type.STRING),
description=(
"A list of topics about which you want to extract information "
"from the page."
),
required=False,
),
"question": JSONSchema(
type=JSONSchema.Type.STRING,
description=(
@@ -78,10 +85,25 @@ class BrowsingError(CommandExecutionError):
),
required=False,
),
"get_raw_content": JSONSchema(
type=JSONSchema.Type.BOOLEAN,
description=(
"If true, the unprocessed content of the webpage will be returned. "
"This consumes a lot of tokens, so use it with caution."
),
required=False,
),
},
)
@validate_url
async def read_webpage(url: str, agent: Agent, question: str = "") -> str:
async def read_webpage(
url: str,
agent: Agent,
*,
topics_of_interest: list[str] = [],
get_raw_content: bool = False,
question: str = "",
) -> str:
"""Browse a website and return the answer and links to the user
Args:
@@ -103,12 +125,19 @@ async def read_webpage(url: str, agent: Agent, question: str = "") -> str:
summarized = False
if not text:
return f"Website did not contain any text.\n\nLinks: {links}"
elif (
agent.llm_provider.count_tokens(text, agent.llm.name)
> TOKENS_TO_TRIGGER_SUMMARY
):
elif get_raw_content:
if (
output_tokens := agent.llm_provider.count_tokens(text, agent.llm.name)
) > MAX_RAW_CONTENT_LENGTH:
oversize_factor = round(output_tokens / MAX_RAW_CONTENT_LENGTH, 1)
raise TooMuchOutputError(
f"Page content is {oversize_factor}x the allowed length "
"for `get_raw_content=true`"
)
return text + (f"\n\nLinks: {links}" if links else "")
else:
text = await summarize_memorize_webpage(
url, text, question or None, agent, driver
url, text, question or None, topics_of_interest, agent, driver
)
return_literal_content = bool(question)
summarized = True
@@ -265,6 +294,7 @@ async def summarize_memorize_webpage(
url: str,
text: str,
question: str | None,
topics_of_interest: list[str],
agent: Agent,
driver: Optional[WebDriver] = None,
) -> str:
@@ -295,10 +325,21 @@ async def summarize_memorize_webpage(
# )
# memory.add(new_memory)
summary, _ = await summarize_text(
result = None
information = None
if topics_of_interest:
information = await extract_information(
text,
topics_of_interest=topics_of_interest,
llm_provider=agent.llm_provider,
config=agent.legacy_config,
)
return "\n".join(f"* {i}" for i in information)
else:
result, _ = await summarize_text(
text,
question=question,
llm_provider=agent.llm_provider,
config=agent.legacy_config, # FIXME
config=agent.legacy_config,
)
return summary
return result

View File

@@ -1,4 +1,5 @@
"""Text processing functions"""
import json
import logging
import math
from typing import Iterator, Optional, TypeVar
@@ -10,6 +11,7 @@ from autogpt.core.prompting import ChatPrompt
from autogpt.core.resource.model_providers import (
ChatMessage,
ChatModelProvider,
ChatModelResponse,
ModelTokenizer,
)
@@ -57,74 +59,127 @@ async def summarize_text(
text: str,
llm_provider: ChatModelProvider,
config: Config,
instruction: Optional[str] = None,
question: Optional[str] = None,
) -> tuple[str, None | list[tuple[str, str]]]:
"""Summarize text using the OpenAI API
instruction: Optional[str] = None,
) -> tuple[str, list[tuple[str, str]]]:
if question:
if instruction:
raise ValueError(
"Parameters 'question' and 'instructions' cannot both be set"
)
Args:
text (str): The text to summarize.
llm_provider: LLM provider to use for summarization.
config (Config): The global application config, containing the FAST_LLM setting.
instruction (str): Additional instruction for summarization, e.g.
"focus on information related to polar bears", or
"omit personal information contained in the text".
question (str): Question to be answered by the summary.
instruction = (
f'From the text, answer the question: "{question}". '
"If the answer is not in the text, indicate this clearly "
"and concisely state why the text is not suitable to answer the question."
)
elif not instruction:
instruction = (
"Summarize or describe the text clearly and concisely, "
"whichever seems more appropriate."
)
return await _process_text( # type: ignore
text=text,
instruction=instruction,
llm_provider=llm_provider,
config=config,
)
async def extract_information(
    source_text: str,
    topics_of_interest: list[str],
    llm_provider: ChatModelProvider,
    config: Config,
) -> list[str]:
    """Extract pieces of information from a body of text, based on topics of interest.

    Args:
        source_text: The text to extract information from.
        topics_of_interest: Topics about which to extract relevant information.
        llm_provider: The LLM provider used to perform the extraction.
        config: The global application config (selects the model to use).

    Returns:
        list[str]: Self-explanatory pieces of information extracted from the text;
            empty if the text contains nothing relevant to the given topics.
    """
    # Render the topics as a bullet list for inclusion in the prompt.
    fmt_topics_list = "\n".join(f"* {topic}." for topic in topics_of_interest)
    # Instruct the model to respond with a JSON string array and nothing else,
    # so the response can be parsed with json.loads downstream.
    instruction = (
        "Extract relevant pieces of information about the following topics:\n"
        f"{fmt_topics_list}\n"
        "Reword pieces of information if needed to make them self-explanatory. "
        "Be concise.\n\n"
        "Respond with an `Array<string>` in JSON format AND NOTHING ELSE. "
        'If the text contains no relevant information, return "[]".'
    )
    return await _process_text(  # type: ignore
        text=source_text,
        instruction=instruction,
        output_type=list[str],
        llm_provider=llm_provider,
        config=config,
    )
async def _process_text(
text: str,
instruction: str,
llm_provider: ChatModelProvider,
config: Config,
output_type: type[str | list[str]] = str,
) -> tuple[str, list[tuple[str, str]]] | list[str]:
"""Process text using the OpenAI API for summarization or information extraction
Params:
text (str): The text to process.
instruction (str): Additional instruction for processing.
llm_provider: LLM provider to use.
config (Config): The global application config.
output_type: `str` for summaries or `list[str]` for piece-wise info extraction.
Returns:
str: The summary of the text
list[(summary, chunk)]: Text chunks and their summary, if the text was chunked.
None otherwise.
For summarization: tuple[str, None | list[(summary, chunk)]]
For piece-wise information extraction: list[str]
"""
if not text:
raise ValueError("No text to summarize")
if instruction and question:
raise ValueError("Parameters 'question' and 'instructions' cannot both be set")
if not text.strip():
raise ValueError("No content")
model = config.fast_llm
if question:
instruction = (
'Include any information that can be used to answer the question: "%s". '
"Do not directly answer the question itself."
) % question
summarization_prompt = ChatPrompt(messages=[])
text_tlength = llm_provider.count_tokens(text, model)
logger.info(f"Text length: {text_tlength} tokens")
logger.debug(f"Text length: {text_tlength} tokens")
# reserve 50 tokens for summary prompt, 500 for the response
max_chunk_length = llm_provider.get_token_limit(model) - 550
logger.info(f"Max chunk length: {max_chunk_length} tokens")
max_result_tokens = 500
max_chunk_length = llm_provider.get_token_limit(model) - max_result_tokens - 50
logger.debug(f"Max chunk length: {max_chunk_length} tokens")
if text_tlength < max_chunk_length:
# summarization_prompt.add("user", text)
summarization_prompt.messages.append(
ChatMessage.user(
"Write a concise summary of the following text."
f"{f' {instruction}' if instruction is not None else ''}:"
"\n\n\n"
f'LITERAL TEXT: """{text}"""'
"\n\n\n"
"CONCISE SUMMARY: The text is best summarized as"
)
prompt = ChatPrompt(
messages=[
ChatMessage.system(
"The user is going to give you a text enclosed in triple quotes. "
f"{instruction}"
),
ChatMessage.user(f'"""{text}"""'),
]
)
summary = (
await llm_provider.create_chat_completion(
model_prompt=summarization_prompt.messages,
logger.debug(f"PROCESSING:\n{prompt}")
response = await llm_provider.create_chat_completion(
model_prompt=prompt.messages,
model_name=model,
temperature=0,
max_tokens=500,
temperature=0.5,
max_tokens=max_result_tokens,
completion_parser=lambda s: (
json.loads(s.content) if output_type is not str else None
),
)
).response.content
if output_type == list[str]:
logger.debug(f"Raw LLM response: {repr(response.response.content)}")
fmt_result_bullet_list = "\n".join(f"* {r}" for r in response.parsed_result)
logger.debug(
f"\n{'-'*11} EXTRACTION RESULT {'-'*12}\n"
f"{fmt_result_bullet_list}\n"
f"{'-'*42}\n"
)
return response.parsed_result
else:
summary = response.response.content
logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n")
return summary.strip(), None
summaries: list[str] = []
return summary.strip(), [(summary, text)]
else:
chunks = list(
split_text(
text,
@@ -134,27 +189,34 @@
)
)
for i, (chunk, chunk_length) in enumerate(chunks):
logger.info(
f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens"
)
summary, _ = await summarize_text(
processed_results = []
for i, (chunk, _) in enumerate(chunks):
logger.info(f"Processing chunk {i + 1} / {len(chunks)}")
chunk_result = await _process_text(
text=chunk,
instruction=instruction,
output_type=output_type,
llm_provider=llm_provider,
config=config,
)
summaries.append(summary)
processed_results.extend(
chunk_result if output_type == list[str] else [chunk_result]
)
logger.info(f"Summarized {len(chunks)} chunks")
summary, _ = await summarize_text(
"\n\n".join(summaries),
if output_type == list[str]:
return processed_results
else:
summary, _ = await _process_text(
"\n\n".join([result[0] for result in processed_results]),
instruction=(
"The text consists of multiple partial summaries. "
"Combine these partial summaries into one."
),
llm_provider=llm_provider,
config=config,
)
return summary.strip(), [
(summaries[i], chunks[i][0]) for i in range(0, len(chunks))
(processed_results[i], chunks[i][0]) for i in range(0, len(chunks))
]