From ac7fefe96ea740e14188754b5457efc8ff7c1507 Mon Sep 17 00:00:00 2001
From: ryanmac
Date: Mon, 3 Apr 2023 14:05:32 -0500
Subject: [PATCH 1/4] Use playwright instead of requests for browse

---
 .gitignore                   |   1 +
 requirements-new.txt         |  13 +++
 requirements.txt             |   4 +-
 scripts/browse_playwright.py | 150 +++++++++++++++++++++++++++++++++++
 scripts/commands.py          |   2 +-
 scripts/json_parser.py       |  10 +--
 6 files changed, 172 insertions(+), 8 deletions(-)
 create mode 100644 requirements-new.txt
 create mode 100644 scripts/browse_playwright.py

diff --git a/.gitignore b/.gitignore
index a4e3cc2d8..b361b4bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ scripts/__pycache__/keys.cpython-310.pyc
 package-lock.json
 *.pyc
 scripts/auto_gpt_workspace/*
+auto_gpt_workspace/*
 *.mpeg
 .env
 last_run_ai_settings.yaml
\ No newline at end of file
diff --git a/requirements-new.txt b/requirements-new.txt
new file mode 100644
index 000000000..7253c19b5
--- /dev/null
+++ b/requirements-new.txt
@@ -0,0 +1,13 @@
+beautifulsoup4==4.12.0
+colorama==0.4.6
+docker_py==1.10.6
+googlesearch_python==1.1.0
+numpy==1.24.2
+openai==0.27.2
+playsound==1.3.0
+playwright==1.32.1
+python-dotenv==1.0.0
+PyYAML==6.0
+requests==2.28.2
+scipy==1.10.1
+tiktoken==0.3.3
diff --git a/requirements.txt b/requirements.txt
index e731354b4..4b5de5ba2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 beautifulsoup4
 colorama==0.4.6
-dirtyjson==1.0.
+# dirtyjson==1.0.
 openai==0.27.2
 playsound==1.3.0
 python-dotenv==1.0.0
@@ -9,5 +9,5 @@ readability-lxml==0.8.1
 requests
 tiktoken==0.3.3
 docker
-# googlesearch-python
+googlesearch_python==1.1.0
 # Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
\ No newline at end of file
diff --git a/scripts/browse_playwright.py b/scripts/browse_playwright.py
new file mode 100644
index 000000000..513724515
--- /dev/null
+++ b/scripts/browse_playwright.py
@@ -0,0 +1,150 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+from config import Config
+from llm_utils import create_chat_completion
+
+cfg = Config()
+
+def scrape_text(url):
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+
+        try:
+            page.goto(url)
+            html_content = page.content()
+            soup = BeautifulSoup(html_content, "html.parser")
+
+            for script in soup(["script", "style"]):
+                script.extract()
+
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        except Exception as e:
+            text = "Error: " + str(e)
+
+        finally:
+            browser.close()
+
+    return text
+
+
+def extract_hyperlinks(soup):
+    hyperlinks = []
+    for link in soup.find_all('a', href=True):
+        hyperlinks.append((link.text, link['href']))
+    return hyperlinks
+
+
+def format_hyperlinks(hyperlinks):
+    formatted_links = []
+    for link_text, link_url in hyperlinks:
+        formatted_links.append(f"{link_text} ({link_url})")
+    return formatted_links
+
+
+def scrape_links(url):
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+
+        try:
+            page.goto(url)
+            html_content = page.content()
+            soup = BeautifulSoup(html_content, "html.parser")
+
+            for script in soup(["script", "style"]):
+                script.extract()
+
+            hyperlinks = extract_hyperlinks(soup)
+            formatted_links = format_hyperlinks(hyperlinks)
+
+        except Exception as e:
+            formatted_links = "Error: " + str(e)
+
+        finally:
+            browser.close()
+
+    return formatted_links
+
+# The rest of the code remains unchanged.
+
+def split_text(text, max_length=8192):
+    paragraphs = text.split("\n")
+    current_length = 0
+    current_chunk = []
+
+    for paragraph in paragraphs:
+        if current_length + len(paragraph) + 1 <= max_length:
+            current_chunk.append(paragraph)
+            current_length += len(paragraph) + 1
+        else:
+            yield "\n".join(current_chunk)
+            current_chunk = [paragraph]
+            current_length = len(paragraph) + 1
+
+    if current_chunk:
+        yield "\n".join(current_chunk)
+
+
+def summarize_text(text, is_website=True):
+    if text == "":
+        return "Error: No text to summarize"
+
+    print("Text length: " + str(len(text)) + " characters")
+    summaries = []
+    chunks = list(split_text(text))
+
+    for i, chunk in enumerate(chunks):
+        print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks)))
+        if is_website:
+            messages = [
+                {
+                    "role": "user",
+                    "content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " +
+                    chunk},
+            ]
+        else:
+            messages = [
+                {
+                    "role": "user",
+                    "content": "Please summarize the following text, focusing on extracting concise and specific information: " +
+                    chunk},
+            ]
+
+        summary = create_chat_completion(
+            model=cfg.fast_llm_model,
+            messages=messages,
+            max_tokens=300,
+        )
+        summaries.append(summary)
+    print("Summarized " + str(len(chunks)) + " chunks.")
+
+    combined_summary = "\n".join(summaries)
+
+    # Summarize the combined summary
+    if is_website:
+        messages = [
+            {
+                "role": "user",
+                "content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " +
+                combined_summary},
+        ]
+    else:
+        messages = [
+            {
+                "role": "user",
+                "content": "Please summarize the following text, focusing on extracting concise and specific information: " +
+                combined_summary},
+        ]
+
+    final_summary = create_chat_completion(
+        model=cfg.fast_llm_model,
+        messages=messages,
+        max_tokens=300,
+    )
+
+    return final_summary
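
Note: the new module keeps the same scrape_text/scrape_links interface that commands.py already expects, so it can be exercised on its own. A minimal usage sketch, assuming it is run from the scripts/ directory with a valid config and that Playwright's browser binaries have already been fetched (the URL is a placeholder, not something this patch references):

    # Hypothetical standalone check of the new module; not part of the patch.
    from browse_playwright import scrape_text, scrape_links

    url = "https://example.com"  # placeholder URL
    print(scrape_text(url))      # visible page text, scripts/styles stripped
    print(scrape_links(url))     # a list like ["link text (https://...)", ...]
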
diff --git a/scripts/commands.py b/scripts/commands.py
index 2e332711b..3c8cba8eb 100644
--- a/scripts/commands.py
+++ b/scripts/commands.py
@@ -1,4 +1,4 @@
-import browse
+import browse_playwright as browse
 import json
 import memory as mem
 import datetime
diff --git a/scripts/json_parser.py b/scripts/json_parser.py
index 8154b584a..2cf2aecce 100644
--- a/scripts/json_parser.py
+++ b/scripts/json_parser.py
@@ -1,4 +1,4 @@
-import dirtyjson
+import json
 from call_ai_function import call_ai_function
 from config import Config
 cfg = Config()
@@ -24,7 +24,7 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
     """
     try:
-        return dirtyjson.loads(json_str)
+        return json.loads(json_str)
     except Exception as e:
         # Let's do something manually - sometimes GPT responds with something BEFORE the braces:
         # "I'm sorry, I don't understand. Please try again."{"text": "I'm sorry, I don't understand. Please try again.", "confidence": 0.0}
@@ -34,14 +34,14 @@
             json_str = json_str[brace_index:]
             last_brace_index = json_str.rindex("}")
             json_str = json_str[:last_brace_index+1]
-            return dirtyjson.loads(json_str)
+            return json.loads(json_str)
         except Exception as e:
             if try_to_fix_with_gpt:
                 print(f"Warning: Failed to parse AI output, attempting to fix.\n If you see this warning frequently, it's likely that your prompt is confusing the AI. Try changing it up slightly.")
                 # Now try to fix this up using the ai_functions
                 ai_fixed_json = fix_json(json_str, json_schema, False)
                 if ai_fixed_json != "failed":
-                    return dirtyjson.loads(ai_fixed_json)
+                    return json.loads(ai_fixed_json)
                 else:
                     print(f"Failed to fix ai output, telling the AI.")  # This allows the AI to react to the error message, which usually results in it correcting its ways.
                     return json_str
@@ -68,7 +68,7 @@ def fix_json(json_str: str, schema: str, debug=False) -> str:
         print(f"Fixed JSON: {result_string}")
         print("----------- END OF FIX ATTEMPT ----------------")
     try:
-        return dirtyjson.loads(result_string)
+        return json.loads(result_string)
     except:
         # Get the call stack:
         # import traceback
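
The brace-trimming fallback in fix_and_parse_json is the piece that usually rescues replies where the model chats before or after the JSON, and it is easy to sanity-check in isolation. A minimal sketch of the same logic with a fabricated model reply:

    # Illustration only; mirrors the fallback above with a made-up reply.
    import json

    reply = 'Sure! {"text": "hello", "confidence": 0.9} Let me know if that helps.'
    trimmed = reply[reply.index("{"):reply.rindex("}") + 1]  # keep first "{" through last "}"
    print(json.loads(trimmed))  # {'text': 'hello', 'confidence': 0.9}
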

From 6ea2a97e83e3e80525c97a0657ca2af9f7eb8d72 Mon Sep 17 00:00:00 2001
From: ryanmac
Date: Mon, 3 Apr 2023 14:15:21 -0500
Subject: [PATCH 2/4] Rename requirements-new.txt to requirements-mac-Python-3.11.txt

---
 requirements-new.txt => requirements-mac-Python-3.11.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename requirements-new.txt => requirements-mac-Python-3.11.txt (100%)

diff --git a/requirements-new.txt b/requirements-mac-Python-3.11.txt
similarity index 100%
rename from requirements-new.txt
rename to requirements-mac-Python-3.11.txt

From 29c0b544a40dea5e8bc802ec77c1c572c90064fc Mon Sep 17 00:00:00 2001
From: ryanmac
Date: Wed, 5 Apr 2023 20:03:46 -0500
Subject: [PATCH 3/4] Delete requirements-mac-Python-3.11.txt

Removing unnecessary files
---
 requirements-mac-Python-3.11.txt | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 requirements-mac-Python-3.11.txt

diff --git a/requirements-mac-Python-3.11.txt b/requirements-mac-Python-3.11.txt
deleted file mode 100644
index 7253c19b5..000000000
--- a/requirements-mac-Python-3.11.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-beautifulsoup4==4.12.0
-colorama==0.4.6
-docker_py==1.10.6
-googlesearch_python==1.1.0
-numpy==1.24.2
-openai==0.27.2
-playsound==1.3.0
-playwright==1.32.1
-python-dotenv==1.0.0
-PyYAML==6.0
-requests==2.28.2
-scipy==1.10.1
-tiktoken==0.3.3
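
One caveat behind the requirements churn above: pinning playwright==1.32.1 installs only the Python package. The browser binaries come from a separate step using Playwright's standard CLI, e.g.:

    pip install playwright==1.32.1
    playwright install chromium   # fetches the browser that p.chromium.launch() starts

Without that second step, chromium.launch() fails at runtime even though the import succeeds.
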
From ef4e4eb5d4d9fc6f8ba5cd22e058b0d2d09b149d Mon Sep 17 00:00:00 2001
From: BillSchumacher <34168009+BillSchumacher@users.noreply.github.com>
Date: Sat, 15 Apr 2023 17:30:28 -0500
Subject: [PATCH 4/4] Blacked

---
 autogpt/commands/web_playwright.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/autogpt/commands/web_playwright.py b/autogpt/commands/web_playwright.py
index 2b0118d24..93a46ac9c 100644
--- a/autogpt/commands/web_playwright.py
+++ b/autogpt/commands/web_playwright.py
@@ -2,7 +2,9 @@
 try:
     from playwright.sync_api import sync_playwright
 except ImportError:
-    print("Playwright not installed. Please install it with 'pip install playwright' to use.")
+    print(
+        "Playwright not installed. Please install it with 'pip install playwright' to use."
+    )
 from bs4 import BeautifulSoup
 from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
 from typing import List, Union
@@ -10,10 +12,10 @@ from typing import List, Union
 def scrape_text(url: str) -> str:
     """Scrape text from a webpage
-    
+
     Args:
         url (str): The URL to scrape text from
-    
+
     Returns:
         str: The scraped text
     """
@@ -32,7 +34,7 @@ def scrape_text(url: str) -> str:
             text = soup.get_text()
             lines = (line.strip() for line in text.splitlines())
             chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            text = '\n'.join(chunk for chunk in chunks if chunk)
+            text = "\n".join(chunk for chunk in chunks if chunk)
 
         except Exception as e:
             text = f"Error: {str(e)}"
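
A side note on the guard that Black reformatted above: it only prints when Playwright is missing, so the module still imports and the first scrape_text() call then fails with a NameError on sync_playwright. A stricter variant, shown here as a sketch of an alternative rather than what this patch does, would fail fast at import time:

    # Hypothetical fail-fast variant of the import guard; not part of the patch.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise ImportError(
            "Playwright not installed. Install it with 'pip install playwright' "
            "and fetch browsers with 'playwright install'."
        ) from exc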