2023-04-09 13:39:11 +00:00
import requests
2023-03-28 22:25:42 +00:00
from bs4 import BeautifulSoup
2023-04-03 02:38:21 +00:00
from config import Config
2023-04-03 02:51:07 +00:00
from llm_utils import create_chat_completion
2023-03-28 22:25:42 +00:00
2023-04-03 02:38:21 +00:00
cfg = Config ( )
2023-03-29 08:43:10 +00:00
2023-03-28 22:25:42 +00:00
def scrape_text ( url ) :
2023-04-02 17:03:37 +00:00
""" Scrape text from a webpage """
2023-04-09 13:39:11 +00:00
response = requests . get ( url )
2023-03-29 08:43:10 +00:00
# Check if the response contains an HTTP error
if response . status_code > = 400 :
return " Error: HTTP " + str ( response . status_code ) + " error "
2023-03-28 22:25:42 +00:00
soup = BeautifulSoup ( response . text , " html.parser " )
for script in soup ( [ " script " , " style " ] ) :
script . extract ( )
text = soup . get_text ( )
lines = ( line . strip ( ) for line in text . splitlines ( ) )
chunks = ( phrase . strip ( ) for line in lines for phrase in line . split ( " " ) )
text = ' \n ' . join ( chunk for chunk in chunks if chunk )
return text
2023-04-02 08:13:15 +00:00
2023-03-30 09:10:52 +00:00
def extract_hyperlinks ( soup ) :
2023-04-02 17:03:37 +00:00
""" Extract hyperlinks from a BeautifulSoup object """
2023-03-30 09:10:52 +00:00
hyperlinks = [ ]
for link in soup . find_all ( ' a ' , href = True ) :
hyperlinks . append ( ( link . text , link [ ' href ' ] ) )
return hyperlinks
2023-04-02 08:13:15 +00:00
2023-03-30 09:10:52 +00:00
def format_hyperlinks ( hyperlinks ) :
2023-04-02 17:03:37 +00:00
""" Format hyperlinks into a list of strings """
2023-03-30 09:10:52 +00:00
formatted_links = [ ]
for link_text , link_url in hyperlinks :
formatted_links . append ( f " { link_text } ( { link_url } ) " )
2023-03-30 11:45:15 +00:00
return formatted_links
2023-03-30 09:10:52 +00:00
2023-04-02 08:13:15 +00:00
2023-03-30 09:10:52 +00:00
def scrape_links ( url ) :
2023-04-02 17:03:37 +00:00
""" Scrape hyperlinks from a webpage """
2023-04-09 13:39:11 +00:00
response = requests . get ( url )
2023-03-30 09:10:52 +00:00
# Check if the response contains an HTTP error
if response . status_code > = 400 :
return " error "
soup = BeautifulSoup ( response . text , " html.parser " )
for script in soup ( [ " script " , " style " ] ) :
script . extract ( )
hyperlinks = extract_hyperlinks ( soup )
2023-04-02 08:13:15 +00:00
2023-03-30 11:45:15 +00:00
return format_hyperlinks ( hyperlinks )
2023-03-30 09:10:52 +00:00
2023-04-02 08:13:15 +00:00
2023-03-28 22:25:42 +00:00
def split_text ( text , max_length = 8192 ) :
2023-04-02 17:03:37 +00:00
""" Split text into chunks of a maximum length """
2023-03-28 22:25:42 +00:00
paragraphs = text . split ( " \n " )
current_length = 0
current_chunk = [ ]
for paragraph in paragraphs :
if current_length + len ( paragraph ) + 1 < = max_length :
current_chunk . append ( paragraph )
current_length + = len ( paragraph ) + 1
else :
yield " \n " . join ( current_chunk )
current_chunk = [ paragraph ]
current_length = len ( paragraph ) + 1
if current_chunk :
yield " \n " . join ( current_chunk )
2023-04-02 08:13:15 +00:00
2023-04-04 01:20:42 +00:00
def create_message ( chunk , question ) :
2023-04-04 09:47:37 +00:00
""" Create a message for the user to summarize a chunk of text """
2023-04-04 01:20:42 +00:00
return {
" role " : " user " ,
" content " : f " \" \" \" { chunk } \" \" \" Using the above text, please answer the following question: \" { question } \" -- if the question cannot be answered using the text, please summarize the text. "
}
def summarize_text ( text , question ) :
2023-04-04 09:47:37 +00:00
""" Summarize text using the LLM model """
2023-04-04 01:20:42 +00:00
if not text :
2023-03-28 22:25:42 +00:00
return " Error: No text to summarize "
2023-04-02 08:13:15 +00:00
2023-04-04 01:20:42 +00:00
text_length = len ( text )
print ( f " Text length: { text_length } characters " )
2023-03-28 22:25:42 +00:00
summaries = [ ]
chunks = list ( split_text ( text ) )
for i , chunk in enumerate ( chunks ) :
2023-04-04 01:20:42 +00:00
print ( f " Summarizing chunk { i + 1 } / { len ( chunks ) } " )
messages = [ create_message ( chunk , question ) ]
2023-04-02 08:13:15 +00:00
2023-04-03 02:51:07 +00:00
summary = create_chat_completion (
2023-04-03 02:38:21 +00:00
model = cfg . fast_llm_model ,
2023-03-28 22:25:42 +00:00
messages = messages ,
max_tokens = 300 ,
)
summaries . append ( summary )
2023-04-04 01:20:42 +00:00
print ( f " Summarized { len ( chunks ) } chunks. " )
2023-03-28 22:25:42 +00:00
2023-04-04 01:20:42 +00:00
combined_summary = " \n " . join ( summaries )
messages = [ create_message ( combined_summary , question ) ]
2023-03-28 22:25:42 +00:00
2023-04-03 02:51:07 +00:00
final_summary = create_chat_completion (
2023-04-03 02:38:21 +00:00
model = cfg . fast_llm_model ,
2023-03-28 22:25:42 +00:00
messages = messages ,
max_tokens = 300 ,
)
2023-04-04 01:20:42 +00:00
return final_summary