From ac7fefe96ea740e14188754b5457efc8ff7c1507 Mon Sep 17 00:00:00 2001
From: ryanmac
Date: Mon, 3 Apr 2023 14:05:32 -0500
Subject: [PATCH 1/4] Use playwright instead of requests for browse

---
 .gitignore                   |   1 +
 requirements-new.txt         |  13 +++
 requirements.txt             |   4 +-
 scripts/browse_playwright.py | 150 +++++++++++++++++++++++++++++++++++
 scripts/commands.py          |   2 +-
 scripts/json_parser.py       |  10 +--
 6 files changed, 172 insertions(+), 8 deletions(-)
 create mode 100644 requirements-new.txt
 create mode 100644 scripts/browse_playwright.py

diff --git a/.gitignore b/.gitignore
index a4e3cc2d8..b361b4bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ scripts/__pycache__/keys.cpython-310.pyc
 package-lock.json
 *.pyc
 scripts/auto_gpt_workspace/*
+auto_gpt_workspace/*
 *.mpeg
 .env
 last_run_ai_settings.yaml
\ No newline at end of file
diff --git a/requirements-new.txt b/requirements-new.txt
new file mode 100644
index 000000000..7253c19b5
--- /dev/null
+++ b/requirements-new.txt
@@ -0,0 +1,13 @@
+beautifulsoup4==4.12.0
+colorama==0.4.6
+docker_py==1.10.6
+googlesearch_python==1.1.0
+numpy==1.24.2
+openai==0.27.2
+playsound==1.3.0
+playwright==1.32.1
+python-dotenv==1.0.0
+PyYAML==6.0
+requests==2.28.2
+scipy==1.10.1
+tiktoken==0.3.3
diff --git a/requirements.txt b/requirements.txt
index e731354b4..4b5de5ba2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 beautifulsoup4
 colorama==0.4.6
-dirtyjson==1.0.
+# dirtyjson==1.0.
 openai==0.27.2
 playsound==1.3.0
 python-dotenv==1.0.0
@@ -9,5 +9,5 @@ readability-lxml==0.8.1
 requests
 tiktoken==0.3.3
 docker
-# googlesearch-python
+googlesearch_python==1.1.0
 # Googlesearch python seems to be a bit cursed, anyone good at fixing things like this?
\ No newline at end of file
diff --git a/scripts/browse_playwright.py b/scripts/browse_playwright.py
new file mode 100644
index 000000000..513724515
--- /dev/null
+++ b/scripts/browse_playwright.py
@@ -0,0 +1,150 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+from config import Config
+from llm_utils import create_chat_completion
+
+cfg = Config()
+
+def scrape_text(url):
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+
+        try:
+            page.goto(url)
+            html_content = page.content()
+            soup = BeautifulSoup(html_content, "html.parser")
+
+            for script in soup(["script", "style"]):
+                script.extract()
+
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        except Exception as e:
+            text = "Error: " + str(e)
+
+        finally:
+            browser.close()
+
+    return text
+
+
+def extract_hyperlinks(soup):
+    hyperlinks = []
+    for link in soup.find_all('a', href=True):
+        hyperlinks.append((link.text, link['href']))
+    return hyperlinks
+
+
+def format_hyperlinks(hyperlinks):
+    formatted_links = []
+    for link_text, link_url in hyperlinks:
+        formatted_links.append(f"{link_text} ({link_url})")
+    return formatted_links
+
+
+def scrape_links(url):
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        page = browser.new_page()
+
+        try:
+            page.goto(url)
+            html_content = page.content()
+            soup = BeautifulSoup(html_content, "html.parser")
+
+            for script in soup(["script", "style"]):
+                script.extract()
+
+            hyperlinks = extract_hyperlinks(soup)
+            formatted_links = format_hyperlinks(hyperlinks)
+
+        except Exception as e:
+            formatted_links = "Error: " + str(e)
+
+        finally:
+            browser.close()
+
+    return formatted_links
+
+# The rest of the code remains unchanged.
+
+def split_text(text, max_length=8192):
+    paragraphs = text.split("\n")
+    current_length = 0
+    current_chunk = []
+
+    for paragraph in paragraphs:
+        if current_length + len(paragraph) + 1 <= max_length:
+            current_chunk.append(paragraph)
+            current_length += len(paragraph) + 1
+        else:
+            yield "\n".join(current_chunk)
+            current_chunk = [paragraph]
+            current_length = len(paragraph) + 1
+
+    if current_chunk:
+        yield "\n".join(current_chunk)
+
+
+def summarize_text(text, is_website=True):
+    if text == "":
+        return "Error: No text to summarize"
+
+    print("Text length: " + str(len(text)) + " characters")
+    summaries = []
+    chunks = list(split_text(text))
+
+    for i, chunk in enumerate(chunks):
+        print("Summarizing chunk " + str(i + 1) + " / " + str(len(chunks)))
+        if is_website:
+            messages = [
+                {
+                    "role": "user",
+                    "content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " +
+                    chunk},
+            ]
+        else:
+            messages = [
+                {
+                    "role": "user",
+                    "content": "Please summarize the following text, focusing on extracting concise and specific information: " +
+                    chunk},
+            ]
+
+        summary = create_chat_completion(
+            model=cfg.fast_llm_model,
+            messages=messages,
+            max_tokens=300,
+        )
+        summaries.append(summary)
+    print("Summarized " + str(len(chunks)) + " chunks.")
+
+    combined_summary = "\n".join(summaries)
+
+    # Summarize the combined summary
+    if is_website:
+        messages = [
+            {
+                "role": "user",
+                "content": "Please summarize the following website text, do not describe the general website, but instead concisely extract the specific information this subpage contains.: " +
+                combined_summary},
+        ]
+    else:
+        messages = [
+            {
+                "role": "user",
+                "content": "Please summarize the following text, focusing on extracting concise and specific information: " +
+                combined_summary},
+        ]
+
+    final_summary = create_chat_completion(
+        model=cfg.fast_llm_model,
+        messages=messages,
+        max_tokens=300,
+    )
+
+    return final_summary
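
Note: the new module keeps the same scrape_text/scrape_links interface that commands.py already expects, so it can be exercised on its own. A minimal usage sketch, assuming it is run from the scripts/ directory with a valid config and that Playwright's browser binaries have already been fetched (the URL is a placeholder, not something this patch references):

    # Hypothetical standalone check of the new module; not part of the patch.
    from browse_playwright import scrape_text, scrape_links

    url = "https://example.com"  # placeholder URL
    print(scrape_text(url))      # visible page text, scripts/styles stripped
    print(scrape_links(url))     # a list like ["link text (https://...)", ...]
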
diff --git a/scripts/commands.py b/scripts/commands.py
index 2e332711b..3c8cba8eb 100644
--- a/scripts/commands.py
+++ b/scripts/commands.py
@@ -1,4 +1,4 @@
-import browse
+import browse_playwright as browse
 import json
 import memory as mem
 import datetime
diff --git a/scripts/json_parser.py b/scripts/json_parser.py
index 8154b584a..2cf2aecce 100644
--- a/scripts/json_parser.py
+++ b/scripts/json_parser.py
@@ -1,4 +1,4 @@
-import dirtyjson
+import json
 from call_ai_function import call_ai_function
 from config import Config
 cfg = Config()
@@ -24,7 +24,7 @@ def fix_and_parse_json(json_str: str, try_to_fix_with_gpt: bool = True):
     """
     try:
-        return dirtyjson.loads(json_str)
+        return json.loads(json_str)
     except Exception as e:
         # Let's do something manually - sometimes GPT responds with something BEFORE the braces:
         # "I'm sorry, I don't understand. Please try again."{"text": "I'm sorry, I don't understand. Please try again.", "confidence": 0.0}
@@ -34,14 +34,14 @@
             json_str = json_str[brace_index:]
             last_brace_index = json_str.rindex("}")
             json_str = json_str[:last_brace_index+1]
-            return dirtyjson.loads(json_str)
+            return json.loads(json_str)
         except Exception as e:
             if try_to_fix_with_gpt:
                 print(f"Warning: Failed to parse AI output, attempting to fix.\n If you see this warning frequently, it's likely that your prompt is confusing the AI. Try changing it up slightly.")
                 # Now try to fix this up using the ai_functions
                 ai_fixed_json = fix_json(json_str, json_schema, False)
                 if ai_fixed_json != "failed":
-                    return dirtyjson.loads(ai_fixed_json)
+                    return json.loads(ai_fixed_json)
                 else:
                     print(f"Failed to fix ai output, telling the AI.")  # This allows the AI to react to the error message, which usually results in it correcting its ways.
                     return json_str
@@ -68,7 +68,7 @@ def fix_json(json_str: str, schema: str, debug=False) -> str:
         print(f"Fixed JSON: {result_string}")
         print("----------- END OF FIX ATTEMPT ----------------")
     try:
-        return dirtyjson.loads(result_string)
+        return json.loads(result_string)
     except:
         # Get the call stack:
         # import traceback
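
The brace-trimming fallback in fix_and_parse_json is the piece that usually rescues replies where the model chats before or after the JSON, and it is easy to sanity-check in isolation. A minimal sketch of the same logic with a fabricated model reply:

    # Illustration only; mirrors the fallback above with a made-up reply.
    import json

    reply = 'Sure! {"text": "hello", "confidence": 0.9} Let me know if that helps.'
    trimmed = reply[reply.index("{"):reply.rindex("}") + 1]  # keep first "{" through last "}"
    print(json.loads(trimmed))  # {'text': 'hello', 'confidence': 0.9}
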

From 6ea2a97e83e3e80525c97a0657ca2af9f7eb8d72 Mon Sep 17 00:00:00 2001
From: ryanmac
Date: Mon, 3 Apr 2023 14:15:21 -0500
Subject: [PATCH 2/4] Rename requirements-new.txt to requirements-mac-Python-3.11.txt

---
 requirements-new.txt => requirements-mac-Python-3.11.txt | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename requirements-new.txt => requirements-mac-Python-3.11.txt (100%)

diff --git a/requirements-new.txt b/requirements-mac-Python-3.11.txt
similarity index 100%
rename from requirements-new.txt
rename to requirements-mac-Python-3.11.txt

From 29c0b544a40dea5e8bc802ec77c1c572c90064fc Mon Sep 17 00:00:00 2001
From: ryanmac
Date: Wed, 5 Apr 2023 20:03:46 -0500
Subject: [PATCH 3/4] Delete requirements-mac-Python-3.11.txt

Removing unnecessary files
---
 requirements-mac-Python-3.11.txt | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 requirements-mac-Python-3.11.txt

diff --git a/requirements-mac-Python-3.11.txt b/requirements-mac-Python-3.11.txt
deleted file mode 100644
index 7253c19b5..000000000
--- a/requirements-mac-Python-3.11.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-beautifulsoup4==4.12.0
-colorama==0.4.6
-docker_py==1.10.6
-googlesearch_python==1.1.0
-numpy==1.24.2
-openai==0.27.2
-playsound==1.3.0
-playwright==1.32.1
-python-dotenv==1.0.0
-PyYAML==6.0
-requests==2.28.2
-scipy==1.10.1
-tiktoken==0.3.3
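
One caveat behind the requirements churn above: pinning playwright==1.32.1 installs only the Python package. The browser binaries come from a separate step using Playwright's standard CLI, e.g.:

    pip install playwright==1.32.1
    playwright install chromium   # fetches the browser that p.chromium.launch() starts

Without that second step, chromium.launch() fails at runtime even though the import succeeds.
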
From ef4e4eb5d4d9fc6f8ba5cd22e058b0d2d09b149d Mon Sep 17 00:00:00 2001
From: BillSchumacher <34168009+BillSchumacher@users.noreply.github.com>
Date: Sat, 15 Apr 2023 17:30:28 -0500
Subject: [PATCH 4/4] Blacked

---
 autogpt/commands/web_playwright.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/autogpt/commands/web_playwright.py b/autogpt/commands/web_playwright.py
index 2b0118d24..93a46ac9c 100644
--- a/autogpt/commands/web_playwright.py
+++ b/autogpt/commands/web_playwright.py
@@ -2,7 +2,9 @@
 try:
     from playwright.sync_api import sync_playwright
 except ImportError:
-    print("Playwright not installed. Please install it with 'pip install playwright' to use.")
+    print(
+        "Playwright not installed. Please install it with 'pip install playwright' to use."
+    )
 from bs4 import BeautifulSoup
 from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
 from typing import List, Union
@@ -10,10 +12,10 @@ from typing import List, Union
 def scrape_text(url: str) -> str:
     """Scrape text from a webpage
-    
+
     Args:
         url (str): The URL to scrape text from
-    
+
     Returns:
         str: The scraped text
     """
@@ -32,7 +34,7 @@ def scrape_text(url: str) -> str:
             text = soup.get_text()
             lines = (line.strip() for line in text.splitlines())
             chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            text = '\n'.join(chunk for chunk in chunks if chunk)
+            text = "\n".join(chunk for chunk in chunks if chunk)
 
         except Exception as e:
             text = f"Error: {str(e)}"
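
A side note on the guard that Black reformatted above: it only prints when Playwright is missing, so the module still imports and the first scrape_text() call then fails with a NameError on sync_playwright. A stricter variant, shown here as a sketch of an alternative rather than what this patch does, would fail fast at import time:

    # Hypothetical fail-fast variant of the import guard; not part of the patch.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise ImportError(
            "Playwright not installed. Install it with 'pip install playwright' "
            "and fetch browsers with 'playwright install'."
        ) from exc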