diff --git a/scripts/browse.py b/scripts/browse.py
index 09f376a70..810606a33 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -5,25 +5,38 @@ from llm_utils import create_chat_completion
 
 cfg = Config()
 
-# Define and check for local file address prefixes
 def check_local_file_access(url):
+    # Define and check for local file address prefixes
    local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
    return any(url.startswith(prefix) for prefix in local_prefixes)
 
+def get_validated_response(url, headers=cfg.user_agent_header):
+    try:
+        # Restrict access to local files
+        if check_local_file_access(url):
+            raise ValueError('Access to local files is restricted')
+
+        # Most basic check if the URL is valid:
+        if not url.startswith('http://') and not url.startswith('https://'):
+            raise ValueError('Invalid URL format')
+
+        # Make the HTTP request and return the response
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an exception if the response contains an HTTP error status code
+        return response, None
+    except ValueError as ve:
+        # Handle invalid URL format
+        return None, "Error: " + str(ve)
+
+    except requests.exceptions.RequestException as re:
+        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
+        return None, "Error: " + str(re)
+
 def scrape_text(url):
     """Scrape text from a webpage"""
-    # Most basic check if the URL is valid:
-    if not url.startswith('http'):
-        return "Error: Invalid URL"
-
-    # Restrict access to local files
-    if check_local_file_access(url):
-        return "Error: Access to local files is restricted"
-
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header)
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
@@ -60,7 +73,9 @@ def format_hyperlinks(hyperlinks):
 
 def scrape_links(url):
     """Scrape links from a webpage"""
-    response = requests.get(url, headers=cfg.user_agent_header)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
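The refactored `browse.py` now funnels every fetch through `get_validated_response`, which returns a `(response, error_message)` pair in which exactly one element is `None`. A minimal caller-side sketch of that contract, assuming the module is imported from within this repo (`fetch_or_report` is a hypothetical name, not part of the patch):

```python
# Hypothetical caller illustrating the (response, error_message) contract
# introduced by this patch; fetch_or_report itself is not part of the change.
from scripts.browse import get_validated_response

def fetch_or_report(url):
    response, error_message = get_validated_response(url)
    if error_message:
        # Covers blocked local-file URLs, malformed URLs, connection errors,
        # and HTTP 4xx/5xx statuses surfaced by raise_for_status().
        return error_message
    return response.text

print(fetch_or_report("file:///etc/passwd"))  # Error: Access to local files is restricted
print(fetch_or_report("notaurl"))             # Error: Invalid URL format
```

One consequence worth noting: because `raise_for_status()` turns real 4xx/5xx responses into a `RequestException`, those now come back from `get_validated_response` as an error string, so the `status_code >= 400` checks that remain in `scrape_text` and `scrape_links` are reachable only with mocked responses whose `raise_for_status` is a no-op.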
diff --git a/tests/test_browse_scrape_links.py b/tests/test_browse_scrape_links.py
new file mode 100644
index 000000000..908e43b9f
--- /dev/null
+++ b/tests/test_browse_scrape_links.py
@@ -0,0 +1,117 @@
+
+# Generated by CodiumAI
+from scripts.browse import scrape_links
+
+
+# Dependencies:
+# pip install pytest-mock
+import pytest
+
+"""
+Code Analysis
+
+Objective:
+The objective of the 'scrape_links' function is to scrape hyperlinks from a given URL and return them in a formatted way.
+
+Inputs:
+- url: a string representing the URL to be scraped.
+
+Flow:
+1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
+2. Check if the response contains an HTTP error. If it does, return "error".
+3. Parse the HTML content of the response using the BeautifulSoup library.
+4. Remove any script and style tags from the parsed HTML.
+5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
+6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
+7. Return the formatted hyperlinks.
+
+Outputs:
+- A list of formatted hyperlinks.
+
+Additional aspects:
+- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP requests and parse HTML content, respectively.
+- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
+- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
+- The function checks for HTTP errors and returns "error" if any are found.
+"""
+
+
+class TestScrapeLinks:
+
+    # Tests that the function returns a list of formatted hyperlinks when provided with a valid url that returns a webpage with hyperlinks.
+    def test_valid_url_with_hyperlinks(self):
+        url = "https://www.google.com"
+        result = scrape_links(url)
+        assert len(result) > 0
+        assert isinstance(result, list)
+        assert isinstance(result[0], str)
+
+    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
+    def test_valid_url(self, mocker):
+        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function with a valid URL
+        result = scrape_links("https://www.example.com")
+
+        # Assert that the function returns correctly formatted hyperlinks
+        assert result == ["Google (https://www.google.com)"]
+
+    # Tests that the function returns "error" when given an invalid url.
+    def test_invalid_url(self, mocker):
+        # Mock the requests.get() function to return an HTTP error response
+        mock_response = mocker.Mock()
+        mock_response.status_code = 404
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function with an invalid URL
+        result = scrape_links("https://www.invalidurl.com")
+
+        # Assert that the function returns "error"
+        assert result == "error"
+
+    # Tests that the function returns an empty list when the html contains no hyperlinks.
+    def test_no_hyperlinks(self, mocker):
+        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function with a URL containing no hyperlinks
+        result = scrape_links("https://www.example.com")
+
+        # Assert that the function returns an empty list
+        assert result == []
+
+    # Tests that scrape_links() correctly extracts and formats hyperlinks from a sample HTML containing a few hyperlinks.
+    def test_scrape_links_with_few_hyperlinks(self, mocker):
+        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = """
+        <html>
+        <body>
+        <a href="https://www.google.com">Google</a>
+        <a href="https://github.com">GitHub</a>
+        <a href="https://www.codium.ai">CodiumAI</a>
+        </body>
+        </html>
+        """
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function being tested
+        result = scrape_links("https://www.example.com")
+
+        # Assert that the function returns a list of formatted hyperlinks
+        assert isinstance(result, list)
+        assert len(result) == 3
+        assert result[0] == "Google (https://www.google.com)"
+        assert result[1] == "GitHub (https://github.com)"
+        assert result[2] == "CodiumAI (https://www.codium.ai)"
+
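The generated suite leaves the new local-file guard untested. A hedged sketch of such a test, assuming the error string in `get_validated_response` stays as written (the class and test names below are hypothetical):

```python
# Hypothetical test, not part of this PR: exercises the local-file guard
# added in get_validated_response through the public scrape_links entry point.
import pytest

from scripts.browse import scrape_links


class TestScrapeLinksLocalFileGuard:

    @pytest.mark.parametrize("url", [
        "file:///etc/passwd",
        "http://localhost/admin",
    ])
    def test_local_urls_are_blocked(self, url):
        # No mocking needed: the guard rejects the URL before requests.get() runs
        result = scrape_links(url)
        assert result == "Error: Access to local files is restricted"
```

`pytest-mock` is only required for the mocked tests above; this sketch touches no network because the URL is rejected before any request is made.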