browse: (1) apply validation also to scrape_links(), (2) add tests for scrape_links()
parent 5a6e565c52
commit 2d5d0131bb
@@ -5,25 +5,38 @@ from llm_utils import create_chat_completion
 cfg = Config()
 
+# Define and check for local file address prefixes
 def check_local_file_access(url):
-    # Define and check for local file address prefixes
    local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
    return any(url.startswith(prefix) for prefix in local_prefixes)
 
-def scrape_text(url):
-    """Scrape text from a webpage"""
-    # Most basic check if the URL is valid:
-    if not url.startswith('http'):
-        return "Error: Invalid URL"
-
-    # Restrict access to local files
-    if check_local_file_access(url):
-        return "Error: Access to local files is restricted"
-
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header)
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
+def get_validated_response(url, headers=cfg.user_agent_header):
+    try:
+        # Restrict access to local files
+        if check_local_file_access(url):
+            raise ValueError('Access to local files is restricted')
+
+        # Most basic check if the URL is valid:
+        if not url.startswith('http://') and not url.startswith('https://'):
+            raise ValueError('Invalid URL format')
+
+        # Make the HTTP request and return the response
+        response = requests.get(url, headers=headers)
+        response.raise_for_status()  # Raise an exception if the response contains an HTTP error status code
+        return response, None
+    except ValueError as ve:
+        # Handle invalid URL format
+        return None, "Error: " + str(ve)
+
+    except requests.exceptions.RequestException as re:
+        # Handle exceptions related to the HTTP request (e.g., connection errors, timeouts, etc.)
+        return None, "Error: " + str(re)
+
+def scrape_text(url):
+    """Scrape text from a webpage"""
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
    # Check if the response contains an HTTP error
    if response.status_code >= 400:
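For readers skimming the first hunk: get_validated_response now funnels every fetch through the same checks and reports failures as a (response, error_message) tuple instead of scattering error strings through each caller. A minimal usage sketch (not part of the commit), assuming scripts/browse.py is importable; the URLs are illustrative only:

    # Sketch: how callers see the tuple returned by the new helper.
    from scripts.browse import get_validated_response

    response, error = get_validated_response("file:///etc/passwd")
    # -> (None, "Error: Access to local files is restricted")

    response, error = get_validated_response("ftp://example.com")
    # -> (None, "Error: Invalid URL format")

    response, error = get_validated_response("https://example.com")
    # -> (<Response [200]>, None) when the request succeeds

Returning the tuple keeps scrape_text and scrape_links free to decide how to surface the error string, which is exactly what the second hunk exploits.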
@@ -60,7 +73,9 @@ def format_hyperlinks(hyperlinks):
 def scrape_links(url):
     """Scrape links from a webpage"""
-    response = requests.get(url, headers=cfg.user_agent_header)
+    response, error_message = get_validated_response(url)
+    if error_message:
+        return error_message
 
     # Check if the response contains an HTTP error
     if response.status_code >= 400:
         return "error"
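The hunk context names extract_hyperlinks and format_hyperlinks, but their bodies sit outside this diff. A plausible sketch of the two helpers, inferred from the "link text (url)" strings the new tests assert on; treat the exact signatures as assumptions:

    # Sketch of the helpers scrape_links relies on (bodies not shown in the diff).
    from bs4 import BeautifulSoup

    def extract_hyperlinks(soup: BeautifulSoup):
        # Collect (text, href) pairs for every anchor tag that has an href attribute
        return [(link.text, link['href']) for link in soup.find_all('a', href=True)]

    def format_hyperlinks(hyperlinks):
        # Render each pair as "link text (url)", the format the tests expect
        return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]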
@@ -0,0 +1,117 @@
+# Generated by CodiumAI
+from scripts.browse import scrape_links
+
+
+# Dependencies:
+# pip install pytest-mock
+import pytest
+
+"""
+Code Analysis
+
+Objective:
+The objective of the 'scrape_links' function is to scrape hyperlinks from a given URL and return them in a formatted way.
+
+Inputs:
+- url: a string representing the URL to be scraped.
+
+Flow:
+1. Send a GET request to the given URL using the requests library and the user agent header from the config file.
+2. Check if the response contains an HTTP error. If it does, return "error".
+3. Parse the HTML content of the response using the BeautifulSoup library.
+4. Remove any script and style tags from the parsed HTML.
+5. Extract all hyperlinks from the parsed HTML using the 'extract_hyperlinks' function.
+6. Format the extracted hyperlinks using the 'format_hyperlinks' function.
+7. Return the formatted hyperlinks.
+
+Outputs:
+- A list of formatted hyperlinks.
+
+Additional aspects:
+- The function uses the 'requests' and 'BeautifulSoup' libraries to send HTTP requests and parse HTML content, respectively.
+- The 'extract_hyperlinks' function is called to extract hyperlinks from the parsed HTML.
+- The 'format_hyperlinks' function is called to format the extracted hyperlinks.
+- The function checks for HTTP errors and returns "error" if any are found.
+"""
+
+
+class TestScrapeLinks:
+
+    # Tests that the function returns a list of formatted hyperlinks when provided with a valid url that returns a webpage with hyperlinks.
+    def test_valid_url_with_hyperlinks(self):
+        url = "https://www.google.com"
+        result = scrape_links(url)
+        assert len(result) > 0
+        assert isinstance(result, list)
+        assert isinstance(result[0], str)
+
+    # Tests that the function returns correctly formatted hyperlinks when given a valid url.
+    def test_valid_url(self, mocker):
+        # Mock the requests.get() function to return a response with sample HTML containing hyperlinks
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = "<html><body><a href='https://www.google.com'>Google</a></body></html>"
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function with a valid URL
+        result = scrape_links("https://www.example.com")
+
+        # Assert that the function returns correctly formatted hyperlinks
+        assert result == ["Google (https://www.google.com)"]
+
+    # Tests that the function returns "error" when given an invalid url.
+    def test_invalid_url(self, mocker):
+        # Mock the requests.get() function to return an HTTP error response
+        mock_response = mocker.Mock()
+        mock_response.status_code = 404
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function with an invalid URL
+        result = scrape_links("https://www.invalidurl.com")
+
+        # Assert that the function returns "error"
+        assert result == "error"
+
+    # Tests that the function returns an empty list when the html contains no hyperlinks.
+    def test_no_hyperlinks(self, mocker):
+        # Mock the requests.get() function to return a response with sample HTML containing no hyperlinks
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = "<html><body><p>No hyperlinks here</p></body></html>"
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function with a URL containing no hyperlinks
+        result = scrape_links("https://www.example.com")
+
+        # Assert that the function returns an empty list
+        assert result == []
+
+    # Tests that scrape_links() correctly extracts and formats hyperlinks from a sample HTML containing a few hyperlinks.
+    def test_scrape_links_with_few_hyperlinks(self, mocker):
+        # Mock the requests.get() function to return a response with a sample HTML containing hyperlinks
+        mock_response = mocker.Mock()
+        mock_response.status_code = 200
+        mock_response.text = """
+        <html>
+        <body>
+        <div id="google-link"><a href="https://www.google.com">Google</a></div>
+        <div id="github"><a href="https://github.com">GitHub</a></div>
+        <div id="CodiumAI"><a href="https://www.codium.ai">CodiumAI</a></div>
+        </body>
+        </html>
+        """
+        mocker.patch('requests.get', return_value=mock_response)
+
+        # Call the function being tested
+        result = scrape_links("https://www.example.com")
+
+        # Assert that the function returns a list of formatted hyperlinks
+        assert isinstance(result, list)
+        assert len(result) == 3
+        assert result[0] == "Google (https://www.google.com)"
+        assert result[1] == "GitHub (https://github.com)"
+        assert result[2] == "CodiumAI (https://www.codium.ai)"
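Assembling the second hunk with the Code Analysis flow above gives a plausible post-commit scrape_links. The BeautifulSoup parsing steps are assumptions taken from that flow description, not lines shown in the diff; get_validated_response, extract_hyperlinks, and format_hyperlinks are assumed to live alongside it in scripts/browse.py:

    # Sketch: scrape_links reconstructed from hunk 2 plus the Code Analysis flow.
    from bs4 import BeautifulSoup

    def scrape_links(url):
        """Scrape links from a webpage"""
        response, error_message = get_validated_response(url)
        if error_message:
            return error_message

        # Step 2 of the flow: surface HTTP errors as the literal string "error"
        if response.status_code >= 400:
            return "error"

        # Steps 3-4: parse the HTML and drop script/style tags
        soup = BeautifulSoup(response.text, "html.parser")
        for script in soup(["script", "style"]):
            script.extract()

        # Steps 5-7: extract, format, and return the hyperlinks
        hyperlinks = extract_hyperlinks(soup)
        return format_hyperlinks(hyperlinks)

Because the tests patch 'requests.get' at the requests module level, this body runs unmodified under the mocks; note that only test_valid_url_with_hyperlinks touches the live network.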