From c63645cbbafa3c84ee8b46d0e8806ab6d0fc4009 Mon Sep 17 00:00:00 2001
From: Itamar Friedman
Date: Wed, 12 Apr 2023 22:41:23 +0300
Subject: [PATCH] redo suggested changes. move unit test files to the fitting directory

---
 scripts/browse.py                            | 56 ++++++--------
 tests/{ => unit}/test_browse_scrape_links.py |  4 +-
 tests/{ => unit}/test_browse_scrape_text.py  |  0
 3 files changed, 17 insertions(+), 43 deletions(-)
 rename tests/{ => unit}/test_browse_scrape_links.py (98%)
 rename tests/{ => unit}/test_browse_scrape_text.py (100%)

diff --git a/scripts/browse.py b/scripts/browse.py
index 046ff6ae9..912d5635e 100644
--- a/scripts/browse.py
+++ b/scripts/browse.py
@@ -19,22 +19,13 @@ def is_valid_url(url):
 def sanitize_url(url):
     return urljoin(url, urlparse(url).path)
 
-# Function to make a request with a specified timeout and handle exceptions
-def make_request(url, timeout=10):
-    try:
-        response = requests.get(url, headers=cfg.user_agent_header, timeout=timeout)
-        response.raise_for_status()
-        return response
-    except requests.exceptions.RequestException as e:
-        return "Error: " + str(e)
-
 # Define and check for local file address prefixes
 def check_local_file_access(url):
     # Define and check for local file address prefixes
     local_prefixes = ['file:///', 'file://localhost', 'http://localhost', 'https://localhost']
     return any(url.startswith(prefix) for prefix in local_prefixes)
 
-def get_validated_response(url, headers=cfg.user_agent_header):
+def get_response(url, headers=cfg.user_agent_header, timeout=10):
     try:
         # Restrict access to local files
         if check_local_file_access(url):
@@ -44,9 +35,14 @@ def get_validated_response(url, headers=cfg.user_agent_header):
         if not url.startswith('http://') and not url.startswith('https://'):
             raise ValueError('Invalid URL format')
 
-        # Make the HTTP request and return the response
-        response = requests.get(url, headers=headers)
-        response.raise_for_status()  # Raise an exception if the response contains an HTTP error status code
+        sanitized_url = sanitize_url(url)
+
+        response = requests.get(sanitized_url, headers=headers, timeout=timeout)
+
+        # Check if the response contains an HTTP error
+        if response.status_code >= 400:
+            return None, "Error: HTTP " + str(response.status_code) + " error"
+
         return response, None
     except ValueError as ve:
         # Handle invalid URL format
@@ -58,29 +54,9 @@ def get_validated_response(url, headers=cfg.user_agent_header):
 
 def scrape_text(url):
     """Scrape text from a webpage"""
-    # Basic check if the URL is valid
-    if not url.startswith('http'):
-        return "Error: Invalid URL"
-
-    # Restrict access to local files
-    if check_local_file_access(url):
-        return "Error: Access to local files is restricted"
-
-    # Validate the input URL
-    if not is_valid_url(url):
-        # Sanitize the input URL
-        sanitized_url = sanitize_url(url)
-
-        # Make the request with a timeout and handle exceptions
-        response = make_request(sanitized_url)
-
-        if isinstance(response, str):
-            return response
-    else:
-        # Sanitize the input URL
-        sanitized_url = sanitize_url(url)
-
-        response = requests.get(sanitized_url, headers=cfg.user_agent_header)
+    response, error_message = get_response(url)
+    if error_message:
+        return error_message
 
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -113,11 +89,9 @@ def format_hyperlinks(hyperlinks):
 
 def scrape_links(url):
     """Scrape links from a webpage"""
-    response = requests.get(url, headers=cfg.user_agent_header)
-
-    # Check if the response contains an HTTP error
-    if response.status_code >= 400:
-        return "Error: HTTP " + str(response.status_code) + " error"
+    response, error_message = get_response(url)
+    if error_message:
+        return error_message
 
     soup = BeautifulSoup(response.text, "html.parser")
 
diff --git a/tests/test_browse_scrape_links.py b/tests/unit/test_browse_scrape_links.py
similarity index 98%
rename from tests/test_browse_scrape_links.py
rename to tests/unit/test_browse_scrape_links.py
index b3acf394d..fdacf4c0f 100644
--- a/tests/test_browse_scrape_links.py
+++ b/tests/unit/test_browse_scrape_links.py
@@ -1,12 +1,12 @@
 # Generated by CodiumAI
 
-from scripts.browse import scrape_links
-
 # Dependencies:
 # pip install pytest-mock
 import pytest
 
+from scripts.browse import scrape_links
+
 """
 Code Analysis
 
 
diff --git a/tests/test_browse_scrape_text.py b/tests/unit/test_browse_scrape_text.py
similarity index 100%
rename from tests/test_browse_scrape_text.py
rename to tests/unit/test_browse_scrape_text.py
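
Note for reviewers: both scrapers now route through the new get_response()
helper, which returns a (response, error_message) tuple, so the timeout, the
local-file guard, and the URL validation only need to be covered once. A
pytest-mock sketch of that contract, in the spirit of the relocated tests/unit
files, could look like the snippet below. This is a hypothetical example, not
part of the commit: the test names and stubbed status codes are invented here;
only get_response(), its return shape, and the scripts.browse module come from
the diff above. It also assumes scripts.browse imports cleanly in the test
environment, since cfg.user_agent_header is evaluated at import time as a
default argument.

    # Hypothetical test sketch -- not part of this commit.
    # Assumes pytest and pytest-mock are installed (pip install pytest pytest-mock)
    # and that scripts.browse imports without side-effect failures.
    from scripts.browse import get_response


    def test_get_response_reports_http_errors(mocker):
        # Stub the network call inside scripts.browse so no real request is made
        fake = mocker.Mock(status_code=404)
        mocker.patch("scripts.browse.requests.get", return_value=fake)

        response, error = get_response("http://example.com")

        assert response is None
        assert error == "Error: HTTP 404 error"


    def test_get_response_returns_response_on_success(mocker):
        fake = mocker.Mock(status_code=200, text="<html></html>")
        mocker.patch("scripts.browse.requests.get", return_value=fake)

        response, error = get_response("http://example.com")

        assert response is fake
        assert error is None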