"""Gradio app for retrieval over a single documentation page.

Workflow: fetch a URL, strip it down to visible text, split the text into
chunks, embed the chunks with a sentence-transformers model, and index them
with FAISS. Questions are embedded with the same model and answered by
returning the nearest chunks.
"""

import re
from typing import List, Tuple

import faiss
import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Embedding model shared by indexing and querying; loaded once at import time.
model = SentenceTransformer('all-MiniLM-L6-v2')

# State of the currently processed document. All four values are replaced
# together by process_documentation() once a new document is fully indexed.
doc_chunks = []
doc_embeddings = None
index = None
source_url = ""

# Browser-like request headers. 'Accept-Encoding' is deliberately NOT set:
# requests advertises only the encodings it can actually decode, whereas a
# hard-coded 'br' can produce undecoded Brotli bytes when no Brotli library
# is installed.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# A chunking unit ends at sentence punctuation followed by whitespace, or at a
# line break. Requiring trailing whitespace keeps tokens such as "3.12" or
# "example.com" intact.
_UNIT_BOUNDARY = re.compile(r"(?<=[.!?])\s+|[\r\n]+")


class DocumentationFetchError(Exception):
    """Raised when a documentation page cannot be downloaded."""


def fetch_documentation(url: str) -> str:
    """Download *url* and return its visible text, one phrase per line.

    Script, style, nav, footer and header elements are removed before the
    text is extracted; blank lines are dropped.

    Raises:
        DocumentationFetchError: on HTTP errors (with dedicated messages for
            403 and 404), timeouts, and any other request failure. It derives
            from ``Exception``, so ``except Exception`` handlers keep working.
    """
    try:
        response = requests.get(
            url, headers=_REQUEST_HEADERS, timeout=15, allow_redirects=True
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Classify by the real status code rather than by searching the
        # message text, which also contains the URL.
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 403:
            raise DocumentationFetchError(
                "Access denied (403 Forbidden). This website blocks automated requests. Try: 1) Using the site's API if available, 2) A different documentation page, 3) GitHub raw content URLs work well (e.g., https://raw.githubusercontent.com/...)"
            ) from e
        if status_code == 404:
            raise DocumentationFetchError(
                "Page not found (404). Please check the URL is correct."
            ) from e
        raise DocumentationFetchError(f"Error fetching URL: {e}") from e
    except requests.exceptions.Timeout as e:
        raise DocumentationFetchError(
            "Request timeout. The website took too long to respond."
        ) from e
    except requests.exceptions.RequestException as e:
        raise DocumentationFetchError(f"Error fetching URL: {e}") from e

    soup = BeautifulSoup(response.content, 'html.parser')
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    lines = (line.strip() for line in soup.get_text().splitlines())
    # Split on runs of two spaces so that phrases laid out side by side end up
    # on separate lines; splitting on a single space would put every word on
    # its own line.
    phrases = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(phrase for phrase in phrases if phrase)


def _joined_length(units: List[str]) -> int:
    """Return the length of *units* when joined with single spaces."""
    if not units:
        return 0
    return sum(len(unit) for unit in units) + len(units) - 1


def _split_oversized(unit: str, limit: int) -> List[str]:
    """Break *unit* into pieces of at most *limit* characters.

    Pieces are cut at whitespace where possible; a single token longer than
    *limit* is sliced hard.
    """
    pieces = []
    current = ""
    for word in unit.split():
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def _overlap_tail(units: List[str], budget: int) -> List[str]:
    """Return the trailing units whose joined length fits within *budget*."""
    tail = []
    used = 0
    for unit in reversed(units):
        cost = len(unit) + (1 if tail else 0)
        if used + cost > budget:
            break
        tail.append(unit)
        used += cost
    tail.reverse()
    return tail


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into chunks of at most *chunk_size* characters.

    The text is first cut into units (sentences and lines, original
    punctuation preserved). Units are packed greedily, joined by single
    spaces. Each new chunk starts with the trailing whole units of the
    previous chunk that fit within *overlap* characters, provided the next
    unit still fits after them; units longer than *chunk_size* are broken at
    word boundaries.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length in characters (values below 1 are
            treated as 1).
        overlap: Maximum number of characters repeated from the end of the
            previous chunk (clamped to ``0 <= overlap < chunk_size``).

    Returns:
        The chunks in document order; an empty list for blank input.
    """
    chunk_size = max(1, chunk_size)
    overlap = max(0, min(overlap, chunk_size - 1))

    units = []
    for raw_unit in _UNIT_BOUNDARY.split(text):
        unit = raw_unit.strip()
        if not unit:
            continue
        if len(unit) > chunk_size:
            units.extend(_split_oversized(unit, chunk_size))
        else:
            units.append(unit)

    chunks = []
    current = []
    current_length = 0
    for unit in units:
        if current and current_length + 1 + len(unit) > chunk_size:
            chunks.append(" ".join(current))
            current = _overlap_tail(current, overlap)
            # Give up carried-over units, oldest first, until the new unit
            # fits; this keeps every chunk within chunk_size.
            while current and _joined_length(current) + 1 + len(unit) > chunk_size:
                current.pop(0)
            current_length = _joined_length(current)
        current_length += len(unit) + (1 if current else 0)
        current.append(unit)

    if current:
        chunks.append(" ".join(current))
    return chunks


def process_documentation(url: str) -> str:
    """Fetch, chunk, embed and index the page at *url*.

    On success the module-level document state (``doc_chunks``,
    ``doc_embeddings``, ``index``, ``source_url``) is replaced in one step.
    On any failure the previous state is left untouched, so chunks and index
    never get out of sync.

    Returns:
        A human-readable status message (success summary or error text).
    """
    global doc_chunks, doc_embeddings, index, source_url

    url = (url or "").strip()
    if not url:
        return "Please provide a URL"

    try:
        print("Fetching documentation...")
        text = fetch_documentation(url)
        if len(text) < 100:
            return "Retrieved content is too short. Please check the URL."

        print("Chunking text...")
        new_chunks = chunk_text(text)
        if not new_chunks:
            return "No content chunks created. The documentation might be empty."

        print(f"Creating embeddings for {len(new_chunks)} chunks...")
        new_embeddings = np.ascontiguousarray(
            model.encode(new_chunks, show_progress_bar=False), dtype=np.float32
        )
        new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
        new_index.add(new_embeddings)
    except Exception as e:
        # UI boundary: report the problem in the status box instead of raising.
        return f"Error: {str(e)}"

    # Commit only after every step succeeded.
    doc_chunks = new_chunks
    doc_embeddings = new_embeddings
    index = new_index
    source_url = url

    return f"Documentation processed successfully!\n\nStatistics:\n- Chunks created: {len(new_chunks)}\n- Text length: {len(text)} characters\n- Ready to answer questions!"


def answer_question(question: str, top_k: int = 3) -> Tuple[str, str]:
    """Retrieve the chunks closest to *question* from the indexed document.

    Args:
        question: The user's question.
        top_k: Number of chunks to retrieve; capped at the number of indexed
            chunks.

    Returns:
        ``(answer, sources)``: the answer text built from the best one or two
        chunks, and a listing of every retrieved chunk with a similarity
        score. On invalid input or error the first element carries the
        message and the second is empty.
    """
    if not question or not question.strip():
        return "Please enter a question", ""

    if index is None or not doc_chunks:
        return "Please process documentation first by entering a URL above", ""

    try:
        # The UI slider may deliver a non-int value, and asking FAISS for more
        # neighbours than there are vectors pads the result with label -1,
        # which would silently index the last chunk.
        k = max(1, min(int(top_k), len(doc_chunks)))

        query = np.ascontiguousarray(model.encode([question]), dtype=np.float32)
        distances, indices = index.search(query, k)

        hits = [
            (doc_chunks[i], float(dist))
            for i, dist in zip(indices[0], distances[0])
            if 0 <= i < len(doc_chunks)
        ]
        if not hits:
            return "No relevant content found in the processed documentation.", ""

        answer = f"Based on the documentation at {source_url}:\n\n"
        answer += f"Relevant Information:\n\n{hits[0][0]}"
        if len(hits) > 1:
            answer += f"\n\nAdditional Context:\n\n{hits[1][0]}"

        # 1 / (1 + distance) maps the non-negative L2 distance onto (0, 1],
        # with higher values meaning closer matches.
        sources = "Retrieved Chunks:\n\n" + "".join(
            f"Chunk {rank} (similarity: {1/(1+dist):.3f}):\n{chunk}\n\n---\n\n"
            for rank, (chunk, dist) in enumerate(hits, start=1)
        )

        return answer, sources
    except Exception as e:
        # UI boundary: show the error in the answer box instead of raising.
        return f"Error: {str(e)}", ""


# ---------------------------------------------------------------------------
# User interface
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="Documentation RAG System") as demo:
    gr.Markdown("# Documentation RAG System\n\nEnter a documentation URL, process it, then ask questions about the content using AI-powered retrieval.")

    # Step 1: choose and index a documentation page.
    with gr.Row():
        with gr.Column():
            url_input = gr.Textbox(
                label="Documentation URL",
                placeholder="https://docs.python.org/3/tutorial/index.html",
                lines=1
            )
            process_btn = gr.Button("Process Documentation", variant="primary")
            status_output = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False
            )

    gr.Markdown("---")

    # Step 2: ask a question about the indexed page.
    with gr.Row():
        with gr.Column():
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What is this documentation about?",
                lines=3
            )
            top_k_slider = gr.Slider(
                minimum=1,
                maximum=5,
                value=3,
                step=1,
                label="Number of chunks to retrieve"
            )
            ask_btn = gr.Button("Ask Question", variant="primary")

    # Results: answer on the left, retrieved chunks on the right.
    with gr.Row():
        with gr.Column():
            answer_output = gr.Textbox(
                label="Answer",
                lines=10,
                interactive=False
            )
        with gr.Column():
            sources_output = gr.Textbox(
                label="Source Chunks",
                lines=10,
                interactive=False
            )

    gr.Markdown("### Example URLs to try:")
    gr.Examples(
        examples=[
            ["https://raw.githubusercontent.com/python/cpython/main/README.rst"],
            ["https://docs.python.org/3/tutorial/introduction.html"],
            ["https://raw.githubusercontent.com/huggingface/transformers/main/README.md"],
            ["https://pytorch.org/docs/stable/torch.html"],
        ],
        inputs=url_input
    )

    process_btn.click(
        fn=process_documentation,
        inputs=[url_input],
        outputs=[status_output]
    )

    # The button and pressing Enter in the question box trigger the same query.
    ask_btn.click(
        fn=answer_question,
        inputs=[question_input, top_k_slider],
        outputs=[answer_output, sources_output]
    )
    question_input.submit(
        fn=answer_question,
        inputs=[question_input, top_k_slider],
        outputs=[answer_output, sources_output]
    )


if __name__ == "__main__":
    demo.launch()