# RAG-Project / rag_core.py
# (originally published on Hugging Face Spaces by Rahaf2001; commit 3d12ae8)
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import os
# --- Global variables for RAG components ---
# Module-level state shared by the functions below:
#   llm             — rebound by initialize_rag_components()
#   vector_store    — rebound by scrape_and_process_url() (FAISS index)
#   retrieval_chain — rebound by scrape_and_process_url(); answer_question()
#                     refuses to run while it is still None
vector_store = None
llm = None
retrieval_chain = None
def initialize_rag_components():
    """Create the shared chat model used by the RAG chain.

    Rebinds the module-level global ``llm``. Temperature is kept low (0.3)
    so answers stay close to the retrieved documentation.
    """
    global llm
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.3)
def scrape_and_process_url(url: str) -> str:
    """Scrape ``url``, chunk and embed its text, and build the RAG chain.

    Side effects: rebinds the module globals ``vector_store`` and
    ``retrieval_chain`` (and ``llm`` if it was never initialized).
    Returns a user-facing status string — success or error — instead of
    raising, since the caller is a UI layer.
    """
    global vector_store, retrieval_chain
    try:
        # Defensive re-init: if the import-time initialization was skipped
        # or failed, llm would be None and the chain build below would fail
        # with a confusing error deep inside LangChain.
        if llm is None:
            initialize_rag_components()

        # Scrape content using WebBaseLoader for simplicity and robustness.
        # It handles fetching and extracting main content from web pages.
        loader = WebBaseLoader(url)
        docs = loader.load()
        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into overlapping chunks so retrieval can return
        # focused passages with enough surrounding context.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        chunks = text_splitter.split_documents(docs)
        # Guard: a page with no extractable text yields zero chunks, and
        # FAISS.from_documents would fail unhelpfully on an empty list.
        if not chunks:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Create embeddings and vector store.
        # Ensure OPENAI_API_KEY is set as an environment variable in Hugging Face Spaces.
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        vector_store = FAISS.from_documents(chunks, embeddings)

        # RAG prompt: answer only from the retrieved context, and say so
        # explicitly when the information is missing.
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a knowledgeable and friendly assistant helping users understand documentation. Answer questions naturally and conversationally, as if you're explaining to a colleague.
Your task:
- Read the context carefully and provide clear, helpful answers based on what's there
- Explain concepts in a simple, approachable way that anyone can understand
- If you find the answer in the context, explain it thoroughly with examples when available
- Be direct and confident in your responses - act like an expert who knows this documentation well
- If the information isn't in the context, simply say "I don't see that information in this documentation"
- Use a warm, professional tone - like a helpful coworker, not a robot
Context from documentation:
{context}"""),
            ("user", "{input}")
        ])
        document_chain = create_stuff_documents_chain(llm, prompt)
        # k=4: retrieve the four most similar chunks per question.
        retrieval_chain = create_retrieval_chain(
            vector_store.as_retriever(search_kwargs={"k": 4}),
            document_chain
        )
        return f"✅ Successfully scraped and processed content from {url}.\n\nDocument chunks created: {len(chunks)}\n\nYou can now ask questions about the documentation!"
    except Exception as e:
        # Broad catch is deliberate: any network/parse/API failure becomes a
        # readable message for the UI rather than a crash.
        return f"❌ An error occurred during scraping or processing: {str(e)}"
def answer_question(question: str) -> str:
    """Answer ``question`` against the most recently indexed documentation.

    Reads the module-level ``retrieval_chain``. Returns the chain's answer,
    or a user-facing notice/error string if no URL has been processed yet
    or the chain invocation fails.
    """
    global retrieval_chain
    # Guard clause: nothing has been scraped and indexed yet.
    if retrieval_chain is None:
        return "⚠️ Please scrape and process a URL first before asking questions."
    try:
        result = retrieval_chain.invoke({"input": question})
    except Exception as exc:
        return f"❌ An error occurred while answering the question: {str(exc)}"
    return result["answer"]
# Initialize LLM when the module is imported.
# NOTE(review): this is an import-time side effect — ChatOpenAI construction
# presumably requires OPENAI_API_KEY in the environment; confirm a missing
# key here cannot crash the whole app on import.
initialize_rag_components()