Merge branch 'master' into fix-#323-error-communicating-to-openai
commit
15363cc38b
|
@ -5,17 +5,21 @@ from llm_utils import create_chat_completion
|
||||||
|
|
||||||
cfg = Config()
|
cfg = Config()
|
||||||
|
|
||||||
def get_website_content(url):
|
def scrape_text(url):
|
||||||
|
# Most basic check if the URL is valid:
|
||||||
|
if not url.startswith('http'):
|
||||||
|
return "Error: Invalid URL"
|
||||||
|
|
||||||
|
try:
|
||||||
response = requests.get(url, headers=cfg.user_agent_header)
|
response = requests.get(url, headers=cfg.user_agent_header)
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
return "Error: " + str(e)
|
||||||
|
|
||||||
# Check if the response contains an HTTP error
|
# Check if the response contains an HTTP error
|
||||||
if response.status_code >= 400:
|
if response.status_code >= 400:
|
||||||
return "Error: HTTP " + str(response.status_code) + " error"
|
return "Error: HTTP " + str(response.status_code) + " error"
|
||||||
return response
|
|
||||||
|
|
||||||
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
def scrape_text(website_content):
|
|
||||||
soup = BeautifulSoup(website_content.text, "html.parser")
|
|
||||||
|
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
@ -42,8 +46,14 @@ def format_hyperlinks(hyperlinks):
|
||||||
return formatted_links
|
return formatted_links
|
||||||
|
|
||||||
|
|
||||||
def scrape_links(website_content):
|
def scrape_links(url):
|
||||||
soup = BeautifulSoup(website_content.text, "html.parser")
|
response = requests.get(url, headers=cfg.user_agent_header)
|
||||||
|
|
||||||
|
# Check if the response contains an HTTP error
|
||||||
|
if response.status_code >= 400:
|
||||||
|
return "error"
|
||||||
|
|
||||||
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
for script in soup(["script", "style"]):
|
for script in soup(["script", "style"]):
|
||||||
script.extract()
|
script.extract()
|
||||||
|
|
|
@ -106,6 +106,8 @@ def execute_command(command_name, arguments):
|
||||||
return execute_python_file(arguments["file"])
|
return execute_python_file(arguments["file"])
|
||||||
elif command_name == "generate_image":
|
elif command_name == "generate_image":
|
||||||
return generate_image(arguments["prompt"])
|
return generate_image(arguments["prompt"])
|
||||||
|
elif command_name == "do_nothing":
|
||||||
|
return "No action performed."
|
||||||
elif command_name == "task_complete":
|
elif command_name == "task_complete":
|
||||||
shutdown()
|
shutdown()
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -24,6 +24,7 @@ COMMANDS:
|
||||||
18. Execute Python File: "execute_python_file", args: "file": "<file>"
|
18. Execute Python File: "execute_python_file", args: "file": "<file>"
|
||||||
19. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"
|
19. Task Complete (Shutdown): "task_complete", args: "reason": "<reason>"
|
||||||
20. Generate Image: "generate_image", args: "prompt": "<prompt>"
|
20. Generate Image: "generate_image", args: "prompt": "<prompt>"
|
||||||
|
21. Do Nothing: "do_nothing", args: ""
|
||||||
|
|
||||||
RESOURCES:
|
RESOURCES:
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,99 @@
|
||||||
|
|
||||||
|
# Generated by CodiumAI
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from scripts.browse import scrape_text
|
||||||
|
|
||||||
|
"""
|
||||||
|
Code Analysis
|
||||||
|
|
||||||
|
Objective:
|
||||||
|
The objective of the "scrape_text" function is to scrape the text content from a given URL and return it as a string, after removing any unwanted HTML tags and scripts.
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
- url: a string representing the URL of the webpage to be scraped.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
|
||||||
|
2. Check if the response contains an HTTP error. If it does, return an error message.
|
||||||
|
3. Use BeautifulSoup to parse the HTML content of the response and remove all script and style tags.
|
||||||
|
4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup.
|
||||||
|
5. Split the text into lines and then into chunks, removing any extra whitespace.
|
||||||
|
6. Join the chunks into a single string with newline characters between them.
|
||||||
|
7. Return the cleaned text.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
- A string representing the cleaned text content of the webpage.
|
||||||
|
|
||||||
|
Additional aspects:
|
||||||
|
- The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively.
|
||||||
|
- The function removes script and style tags from the HTML to avoid including unwanted content in the text output.
|
||||||
|
- The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TestScrapeText:
|
||||||
|
|
||||||
|
# Tests that scrape_text() returns the expected text when given a valid URL.
|
||||||
|
def test_scrape_text_with_valid_url(self, mocker):
|
||||||
|
# Mock the requests.get() method to return a response with expected text
|
||||||
|
expected_text = "This is some sample text"
|
||||||
|
mock_response = mocker.Mock()
|
||||||
|
mock_response.status_code = 200
|
||||||
|
mock_response.text = f"<html><body><div><p style='color: blue;'>{expected_text}</p></div></body></html>"
|
||||||
|
mocker.patch("requests.get", return_value=mock_response)
|
||||||
|
|
||||||
|
# Call the function with a valid URL and assert that it returns the expected text
|
||||||
|
url = "http://www.example.com"
|
||||||
|
assert scrape_text(url) == expected_text
|
||||||
|
|
||||||
|
# Tests that the function returns an error message when an invalid or unreachable URL is provided.
|
||||||
|
def test_invalid_url(self, mocker):
|
||||||
|
# Mock the requests.get() method to raise an exception
|
||||||
|
mocker.patch("requests.get", side_effect=requests.exceptions.RequestException)
|
||||||
|
|
||||||
|
# Call the function with an invalid URL and assert that it returns an error message
|
||||||
|
url = "http://www.invalidurl.com"
|
||||||
|
error_message = scrape_text(url)
|
||||||
|
assert "Error:" in error_message
|
||||||
|
|
||||||
|
# Tests that the function returns an empty string when the HTML page contains no text to be scraped.
|
||||||
|
def test_no_text(self, mocker):
|
||||||
|
# Mock the requests.get() method to return a response with no text
|
||||||
|
mock_response = mocker.Mock()
|
||||||
|
mock_response.status_code = 200
|
||||||
|
mock_response.text = "<html><body></body></html>"
|
||||||
|
mocker.patch("requests.get", return_value=mock_response)
|
||||||
|
|
||||||
|
# Call the function with a valid URL and assert that it returns an empty string
|
||||||
|
url = "http://www.example.com"
|
||||||
|
assert scrape_text(url) == ""
|
||||||
|
|
||||||
|
# Tests that the function returns an error message when the response status code indicates an HTTP error (>= 400).
|
||||||
|
def test_http_error(self, mocker):
|
||||||
|
# Mock the requests.get() method to return a response with a 404 status code
|
||||||
|
mocker.patch('requests.get', return_value=mocker.Mock(status_code=404))
|
||||||
|
|
||||||
|
# Call the function with a URL
|
||||||
|
result = scrape_text("https://www.example.com")
|
||||||
|
|
||||||
|
# Check that the function returns an error message
|
||||||
|
assert result == "Error: HTTP 404 error"
|
||||||
|
|
||||||
|
# Tests that scrape_text() properly handles HTML tags.
|
||||||
|
def test_scrape_text_with_html_tags(self, mocker):
|
||||||
|
# Create a mock response object with HTML containing tags
|
||||||
|
html = "<html><body><p>This is <b>bold</b> text.</p></body></html>"
|
||||||
|
mock_response = mocker.Mock()
|
||||||
|
mock_response.status_code = 200
|
||||||
|
mock_response.text = html
|
||||||
|
mocker.patch("requests.get", return_value=mock_response)
|
||||||
|
|
||||||
|
# Call the function with a URL
|
||||||
|
result = scrape_text("https://www.example.com")
|
||||||
|
|
||||||
|
# Check that the function properly handles HTML tags
|
||||||
|
assert result == "This is bold text."
|
Loading…
Reference in New Issue