Merge branch 'master' into fix-#323-error-communicating-to-openai

2023-04-10 12:28:06 +05:30 · 2023-04-10 12:28:06 +05:30 · 15363cc38b
parent 13467259b4 f5bc36716b
commit 15363cc38b
4 changed files with 122 additions and 10 deletions
--- a/scripts/browse.py
+++ b/scripts/browse.py
@ -5,17 +5,21 @@ from llm_utils import create_chat_completion
 cfg = Config()
-def get_website_content(url):
+def scrape_text(url):
    # Most basic check if the URL is valid:
    if not url.startswith('http'):
        return "Error: Invalid URL"
    try:
        response = requests.get(url, headers=cfg.user_agent_header)
    except requests.exceptions.RequestException as e:
        return "Error: " + str(e)
    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "Error: HTTP " + str(response.status_code) + " error"
    return response
-
+    soup = BeautifulSoup(response.text, "html.parser")
 def scrape_text(website_content):
    soup = BeautifulSoup(website_content.text, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()
@ -42,8 +46,14 @@ def format_hyperlinks(hyperlinks):
    return formatted_links
-def scrape_links(website_content):
+def scrape_links(url):
-    soup = BeautifulSoup(website_content.text, "html.parser")
+    response = requests.get(url, headers=cfg.user_agent_header)
    # Check if the response contains an HTTP error
    if response.status_code >= 400:
        return "error"
    soup = BeautifulSoup(response.text, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()
--- a/scripts/commands.py
+++ b/scripts/commands.py
@ -106,6 +106,8 @@ def execute_command(command_name, arguments):
            return execute_python_file(arguments["file"])
        elif command_name == "generate_image":
            return generate_image(arguments["prompt"])
        elif command_name == "do_nothing":
            return "No action performed."
        elif command_name == "task_complete":
            shutdown()
        else:
--- a/scripts/data/prompt.txt
+++ b/scripts/data/prompt.txt
@ -24,6 +24,7 @@ COMMANDS:
 18. Execute Python File: "execute_python_file", args: "file": "<file>"
 19. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"
 20. Generate Image: "generate_image", args: "prompt": "<prompt>"
 21. Do Nothing; command name: "do_nothing", args: ""
 RESOURCES:
--- a/tests/test_browse_scrape_text.py
+++ b/tests/test_browse_scrape_text.py
@ -0,0 +1,99 @@
 # Generated by CodiumAI
 import requests
 import pytest
 from scripts.browse import scrape_text
 """
 Code Analysis
 Objective:
 The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
 Inputs:
 - url: a string representing the URL of the webpage to be scraped.
 Flow:
 1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
 2. Check if the response contains an HTTP error. If it does, return an error message.
 3. Use BeautifulSoup to parse the HTML content of the response and extract all script and style tags.
 4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup.
 5. Split the text into lines and then into chunks, removing any extra whitespace.
 6. Join the chunks into a single string with newline characters between them.
 7. Return the cleaned text.
 Outputs:
 - A string representing the cleaned text content of the webpage.
 Additional aspects:
 - The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively.
 - The function removes script and style tags from the HTML to avoid including unwanted content in the text output.
 - The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
 """
 class TestScrapeText:
    # Tests that scrape_text() returns the expected text when given a valid URL. 
    def test_scrape_text_with_valid_url(self, mocker):
        # Mock the requests.get() method to return a response with expected text
        expected_text = "This is some sample text"
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = f"<html><body><div><p style='color: blue;'>{expected_text}</p></div></body></html>"
        mocker.patch("requests.get", return_value=mock_response)
        # Call the function with a valid URL and assert that it returns the expected text
        url = "http://www.example.com"
        assert scrape_text(url) == expected_text
    # Tests that the function returns an error message when an invalid or unreachable url is provided. 
    def test_invalid_url(self, mocker):
        # Mock the requests.get() method to raise an exception
        mocker.patch("requests.get", side_effect=requests.exceptions.RequestException)
        # Call the function with an invalid URL and assert that it returns an error message
        url = "http://www.invalidurl.com"
        error_message = scrape_text(url)
        assert "Error:" in error_message
    # Tests that the function returns an empty string when the html page contains no text to be scraped. 
    def test_no_text(self, mocker):
        # Mock the requests.get() method to return a response with no text
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = "<html><body></body></html>"
        mocker.patch("requests.get", return_value=mock_response)
        # Call the function with a valid URL and assert that it returns an empty string
        url = "http://www.example.com"
        assert scrape_text(url) == ""
    # Tests that the function returns an error message when the response status code is an http error (>=400).  
    def test_http_error(self, mocker):
        # Mock the requests.get() method to return a response with a 404 status code
        mocker.patch('requests.get', return_value=mocker.Mock(status_code=404))
        # Call the function with a URL
        result = scrape_text("https://www.example.com")
        # Check that the function returns an error message
        assert result == "Error: HTTP 404 error"
    # Tests that scrape_text() properly handles HTML tags. 
    def test_scrape_text_with_html_tags(self, mocker):
        # Create a mock response object with HTML containing tags
        html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
        mock_response = mocker.Mock()
        mock_response.status_code = 200
        mock_response.text = html
        mocker.patch("requests.get", return_value=mock_response)
        # Call the function with a URL
        result = scrape_text("https://www.example.com")
        # Check that the function properly handles HTML tags
        assert result == "This is bold text."