Merge pull request #968 from maiko/add_website_memory
Add visited websites to memory so their content can be recalled later without being limited by the website summary.

commit a17a850b25
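In rough terms, every browsed page now leaves both its raw text chunks and their per-chunk summaries in the agent's memory backend, keyed by source URL, so later queries can recall the full content instead of only the one-shot summary. A minimal sketch of that behaviour follows (the URL, the stand-in chunk/summary values, and the get_relevant() recall call are illustrative assumptions, not part of this diff):

from config import Config
from memory import get_memory

cfg = Config()
memory = get_memory(cfg)

url = "https://example.com/article"                       # illustrative URL
chunks = ["first part of the page text", "second part"]   # stand-in for split_text() output

for i, chunk in enumerate(chunks):
    summary = f"summary of part {i + 1}"                   # stand-in for the LLM summary
    # What summarize_text() now stores for every chunk it processes:
    memory.add(f"Source: {url}\nRaw content part#{i + 1}: {chunk}")
    memory.add(f"Source: {url}\nContent summary part#{i + 1}: {summary}")

# Later the agent can pull the stored content back out of memory
# (get_relevant() is assumed here as the memory backend's query method):
print(memory.get_relevant(f"Source: {url}", 5))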
.env.template
@@ -3,6 +3,12 @@
 ################################################################################
 # EXECUTE_LOCAL_COMMANDS - Allow local command execution (Example: False)
 EXECUTE_LOCAL_COMMANDS=False
+# BROWSE_CHUNK_MAX_LENGTH - When browsing website, define the length of chunk stored in memory
+BROWSE_CHUNK_MAX_LENGTH=8192
+# BROWSE_SUMMARY_MAX_TOKEN - Define the maximum length of the summary generated by GPT agent when browsing website
+BROWSE_SUMMARY_MAX_TOKEN=300
+# USER_AGENT - Define the user-agent used by the requests library to browse website (string)
+# USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
 # AI_SETTINGS_FILE - Specifies which AI Settings file to use (defaults to ai_settings.yaml)
 AI_SETTINGS_FILE=ai_settings.yaml
 
scripts/browse.py
@@ -1,10 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
+from memory import get_memory
 from config import Config
 from llm_utils import create_chat_completion
 from urllib.parse import urlparse, urljoin
 
 cfg = Config()
+memory = get_memory(cfg)
+
+session = requests.Session()
+session.headers.update({'User-Agent': cfg.user_agent})
 
 
 # Function to check if the URL is valid
@@ -27,7 +32,7 @@ def check_local_file_access(url):
     return any(url.startswith(prefix) for prefix in local_prefixes)
 
 
-def get_response(url, headers=cfg.user_agent_header, timeout=10):
+def get_response(url, timeout=10):
     try:
         # Restrict access to local files
         if check_local_file_access(url):
@@ -39,7 +44,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
 
         sanitized_url = sanitize_url(url)
 
-        response = requests.get(sanitized_url, headers=headers, timeout=timeout)
+        response = session.get(sanitized_url, timeout=timeout)
 
         # Check if the response contains an HTTP error
         if response.status_code >= 400:
@@ -106,7 +111,7 @@ def scrape_links(url):
     return format_hyperlinks(hyperlinks)
 
 
-def split_text(text, max_length=8192):
+def split_text(text, max_length=cfg.browse_chunk_max_length):
     """Split text into chunks of a maximum length"""
     paragraphs = text.split("\n")
     current_length = 0
@@ -133,7 +138,7 @@ def create_message(chunk, question):
     }
 
 
-def summarize_text(text, question):
+def summarize_text(url, text, question):
     """Summarize text using the LLM model"""
     if not text:
         return "Error: No text to summarize"
@@ -145,15 +150,28 @@ def summarize_text(text, question):
     chunks = list(split_text(text))
 
     for i, chunk in enumerate(chunks):
+        print(f"Adding chunk {i + 1} / {len(chunks)} to memory")
+
+        memory_to_add = f"Source: {url}\n" \
+                        f"Raw content part#{i + 1}: {chunk}"
+
+        memory.add(memory_to_add)
+
         print(f"Summarizing chunk {i + 1} / {len(chunks)}")
         messages = [create_message(chunk, question)]
 
         summary = create_chat_completion(
             model=cfg.fast_llm_model,
             messages=messages,
-            max_tokens=300,
+            max_tokens=cfg.browse_summary_max_token,
         )
         summaries.append(summary)
+        print(f"Added chunk {i + 1} summary to memory")
+
+        memory_to_add = f"Source: {url}\n" \
+                        f"Content summary part#{i + 1}: {summary}"
+
+        memory.add(memory_to_add)
 
     print(f"Summarized {len(chunks)} chunks.")
 
@@ -163,7 +181,7 @@ def summarize_text(text, question):
     final_summary = create_chat_completion(
         model=cfg.fast_llm_model,
         messages=messages,
-        max_tokens=300,
+        max_tokens=cfg.browse_summary_max_token,
     )
 
     return final_summary
scripts/commands.py
@@ -191,7 +191,7 @@ def browse_website(url, question):
 def get_text_summary(url, question):
     """Return the results of a google search"""
     text = browse.scrape_text(url)
-    summary = browse.summarize_text(text, question)
+    summary = browse.summarize_text(url, text, question)
     return """ "Result" : """ + summary
 
 
scripts/config.py
@@ -45,6 +45,8 @@ class Config(metaclass=Singleton):
         self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
         self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
         self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
+        self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 8192))
+        self.browse_summary_max_token = int(os.getenv("BROWSE_SUMMARY_MAX_TOKEN", 300))
 
         self.openai_api_key = os.getenv("OPENAI_API_KEY")
         self.temperature = float(os.getenv("TEMPERATURE", "1"))
@@ -78,7 +80,7 @@ class Config(metaclass=Singleton):
 
         # User agent headers to use when browsing web
         # Some websites might just completely deny request with an error code if no user agent was found.
-        self.user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
+        self.user_agent = os.getenv("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36")
         self.redis_host = os.getenv("REDIS_HOST", "localhost")
         self.redis_port = os.getenv("REDIS_PORT", "6379")
         self.redis_password = os.getenv("REDIS_PASSWORD", "")
@@ -159,6 +161,14 @@ class Config(metaclass=Singleton):
         """Set the smart token limit value."""
         self.smart_token_limit = value
 
+    def set_browse_chunk_max_length(self, value: int):
+        """Set the browse_website command chunk max length value."""
+        self.browse_chunk_max_length = value
+
+    def set_browse_summary_max_token(self, value: int):
+        """Set the browse_website command summary max token value."""
+        self.browse_summary_max_token = value
+
     def set_openai_api_key(self, value: str):
         """Set the OpenAI API key value."""
         self.openai_api_key = value
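For reference, a minimal usage sketch (not part of the diff) of how the new settings added above could be exercised; the override values are illustrative:

from config import Config

cfg = Config()

# Defaults come from BROWSE_CHUNK_MAX_LENGTH / BROWSE_SUMMARY_MAX_TOKEN in .env,
# falling back to 8192 and 300; the new setters allow runtime overrides.
cfg.set_browse_chunk_max_length(4096)     # size of raw chunks stored in memory
cfg.set_browse_summary_max_token(500)     # max tokens per chunk summary

# browse.split_text() and browse.summarize_text() now read these values from cfg
# instead of the previously hard-coded 8192 / 300.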