Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from langchain_community.document_loaders import WebBaseLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.chains import create_retrieval_chain | |
| from langchain.chains.combine_documents import create_stuff_documents_chain | |
| from langchain_openai import ChatOpenAI | |
| from langchain_core.prompts import ChatPromptTemplate | |
| import os | |
# --- Global variables for RAG components ---
# Module-level state shared between scrape_and_process_url() and
# answer_question(); populated lazily, not at import time.
vector_store = None  # FAISS index over the scraped page (set by scrape_and_process_url)
llm = None  # shared ChatOpenAI instance (set by initialize_rag_components)
retrieval_chain = None  # retriever + LLM QA chain (set by scrape_and_process_url)
def initialize_rag_components(model_name: str = "gpt-3.5-turbo", temperature: float = 0.3) -> None:
    """Create the shared ChatOpenAI LLM used by the RAG chain.

    The instance is stored in the module-level ``llm`` global so that
    scrape_and_process_url() can build a retrieval chain around it.

    Args:
        model_name: OpenAI chat model identifier. The default preserves the
            previously hard-coded choice, so existing callers are unaffected.
        temperature: Sampling temperature; the low default favors grounded,
            less creative answers for documentation Q&A.

    NOTE(review): presumably requires OPENAI_API_KEY in the environment —
    confirm against the ChatOpenAI client's credential handling.
    """
    global llm
    llm = ChatOpenAI(model=model_name, temperature=temperature)
def scrape_and_process_url(url: str) -> str:
    """Scrape ``url``, chunk and embed its text, and build the RAG chain.

    Side effects: rebuilds the module-level ``vector_store`` and
    ``retrieval_chain`` globals, replacing any previously indexed page.

    Args:
        url: Web page (typically documentation) to ingest.

    Returns:
        A human-readable status string: a success summary including the
        number of chunks created, or an error message for the UI.

    Requires OPENAI_API_KEY in the environment (used by both the embedding
    model and the chat model).
    """
    global vector_store, retrieval_chain
    try:
        # Robustness: if the module-level initialization was skipped or
        # failed, llm is still None and chain construction below would fail
        # with a confusing error. Initialize lazily inside the try so any
        # failure surfaces as the user-facing error string.
        if llm is None:
            initialize_rag_components()

        # Scrape content using WebBaseLoader for simplicity and robustness.
        # This handles parsing and extracting main content from various web pages.
        loader = WebBaseLoader(url)
        docs = loader.load()
        if not docs:
            return "Failed to load content from the URL. Please check the URL or try another one."

        # Split documents into overlapping chunks so retrieval returns
        # focused passages while preserving context across boundaries.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        chunks = text_splitter.split_documents(docs)

        # Create embeddings and an in-memory FAISS vector store.
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        vector_store = FAISS.from_documents(chunks, embeddings)

        # Build the RAG chain: retriever -> stuff-documents prompt -> LLM.
        prompt = ChatPromptTemplate.from_messages([
            ("system", """You are a knowledgeable and friendly assistant helping users understand documentation. Answer questions naturally and conversationally, as if you're explaining to a colleague.
Your task:
- Read the context carefully and provide clear, helpful answers based on what's there
- Explain concepts in a simple, approachable way that anyone can understand
- If you find the answer in the context, explain it thoroughly with examples when available
- Be direct and confident in your responses - act like an expert who knows this documentation well
- If the information isn't in the context, simply say "I don't see that information in this documentation"
- Use a warm, professional tone - like a helpful coworker, not a robot
Context from documentation:
{context}"""),
            ("user", "{input}")
        ])
        document_chain = create_stuff_documents_chain(llm, prompt)
        retrieval_chain = create_retrieval_chain(
            # k=4: feed the top-4 most similar chunks into the prompt.
            vector_store.as_retriever(search_kwargs={"k": 4}),
            document_chain
        )
        return f"✅ Successfully scraped and processed content from {url}.\n\nDocument chunks created: {len(chunks)}\n\nYou can now ask questions about the documentation!"
    except Exception as e:
        # Broad catch is deliberate: this function backs a UI and must
        # always return a displayable string rather than raise.
        return f"❌ An error occurred during scraping or processing: {str(e)}"
def answer_question(question: str) -> str:
    """Answer a user question against the currently indexed documentation.

    Returns the chain's answer text, a warning string when no URL has been
    processed yet, or an error string if the chain invocation fails.
    """
    # Read-only access to the module-level chain; no `global` needed.
    chain = retrieval_chain
    if chain is None:
        return "⚠️ Please scrape and process a URL first before asking questions."
    try:
        result = chain.invoke({"input": question})
    except Exception as exc:
        return f"❌ An error occurred while answering the question: {str(exc)}"
    return result["answer"]
# Initialize LLM when the module is imported so the app can serve requests
# immediately. NOTE(review): if ChatOpenAI construction raises here (e.g.
# credential problems), the whole module fails to import — confirm this is
# the desired startup behavior.
initialize_rag_components()