Spaces:
Sleeping
Sleeping
Commit ·
a864c4e
1
Parent(s): 765b8d8
OpenRouter Added, Rate Limiting Fixed
Browse files- .env.example +6 -0
- .gitignore +2 -2
- README.md +61 -37
- app/document_processor.py +45 -8
- app/main.py +27 -15
- app/rag_pipeline.py +132 -31
- requirements.txt +21 -37
- tests/experiments.py +93 -0
- tests/test_rag_pipeline.py +23 -9
.env.example
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment Variables
|
| 2 |
+
|
| 3 |
+
# OpenRouter API Key (Required)
|
| 4 |
+
# Get your FREE key at: https://openrouter.ai/keys
|
| 5 |
+
# Using free tier with google/gemma-3-4b-it:free model
|
| 6 |
+
OPENROUTER_API_KEY=your_openrouter_api_key_here
|
.gitignore
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
.DS_Store
|
| 2 |
__pycache__
|
| 3 |
-
data
|
| 4 |
.gradio
|
| 5 |
-
data/
|
|
|
|
|
|
| 1 |
.DS_Store
|
| 2 |
__pycache__
|
|
|
|
| 3 |
.gradio
|
| 4 |
+
data/
|
| 5 |
+
.env
|
README.md
CHANGED
|
@@ -10,44 +10,68 @@ pinned: false
|
|
| 10 |
---
|
| 11 |
|
| 12 |
# AI Document Intelligence System
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
-
|
| 16 |
-
-
|
| 17 |
-
-
|
|
|
|
|
|
|
| 18 |
|
| 19 |
## Features
|
| 20 |
-
- Interactive document processing
|
| 21 |
-
- Context-aware question answering
|
| 22 |
-
-
|
| 23 |
-
-
|
| 24 |
-
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
1.
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
2.
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
1. Upload a PDF/DOCX/TXT file
|
| 51 |
2. Click "Process Document"
|
| 52 |
-
3.
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
# AI Document Intelligence System
|
| 13 |
+
|
| 14 |
+
Upload documents and ask questions using advanced RAG (Retrieval-Augmented Generation) technology. Built with:
|
| 15 |
+
- **LangChain** for RAG orchestration
|
| 16 |
+
- **ChromaDB** for vector storage
|
| 17 |
+
- **BAAI/bge-small-en-v1.5** embeddings for superior retrieval quality
|
| 18 |
+
- **Google Gemma 3 4B** (`google/gemma-3-4b-it:free`) via the OpenRouter API
|
| 19 |
+
- **Gradio** for interactive UI
|
| 20 |
|
| 21 |
## Features
|
| 22 |
+
- Interactive document processing (PDF, DOCX, TXT)
|
| 23 |
+
- Context-aware question answering with improved embeddings
|
| 24 |
+
- ⚡ Real-time processing and analysis
|
| 25 |
+
- Source citation for transparency
|
| 26 |
+
- Cloud-ready deployment on HuggingFace Spaces
|
| 27 |
+
|
| 28 |
+
## Setup
|
| 29 |
+
|
| 30 |
+
### 1. Get OpenRouter API Key
|
| 31 |
+
1. Create a free account at [OpenRouter](https://openrouter.ai/)
|
| 32 |
+
2. Go to [Keys](https://openrouter.ai/keys)
|
| 33 |
+
3. Create a new API key
|
| 34 |
+
4. Copy the key
|
| 35 |
+
|
| 36 |
+
### 2. Local Installation
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
# Clone the repository
|
| 40 |
+
git clone https://github.com/pkgprateek/ai-rag-document.git
|
| 41 |
+
cd ai-rag-document
|
| 42 |
+
|
| 43 |
+
# Create virtual environment
|
| 44 |
+
python -m venv venv
|
| 45 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 46 |
+
|
| 47 |
+
# Install dependencies
|
| 48 |
+
pip install -r requirements.txt
|
| 49 |
+
|
| 50 |
+
# Set up environment variables
|
| 51 |
+
cp .env.example .env
|
| 52 |
+
# Edit .env and add your OPENROUTER_API_KEY
|
| 53 |
+
|
| 54 |
+
# Run the application
|
| 55 |
+
python app/main.py
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### 3. Deploy to HuggingFace Spaces
|
| 59 |
+
|
| 60 |
+
1. **Fork or upload this repo to HuggingFace Spaces**
|
| 61 |
+
2. **Add your OPENROUTER_API_KEY as a Space Secret:**
|
| 62 |
+
- Go to your Space Settings → Repository secrets
|
| 63 |
+
- Add a new secret: `OPENROUTER_API_KEY` = your key
|
| 64 |
+
3. **Your app will automatically deploy!**
|
| 65 |
+
|
| 66 |
+
## Usage
|
| 67 |
+
|
| 68 |
1. Upload a PDF/DOCX/TXT file
|
| 69 |
2. Click "Process Document"
|
| 70 |
+
3. Get accurate answers with markdown formatting
|
| 71 |
+
|
| 72 |
+
## Technical Details
|
| 73 |
+
|
| 74 |
+
- **Embeddings**: BAAI/bge-small-en-v1.5 (significantly better than all-MiniLM-L6-v2)
|
| 75 |
+
- **LLM**: Google Gemma 3 4B (`google/gemma-3-4b-it:free`) via the OpenRouter API
|
| 76 |
+
- **Vector Store**: ChromaDB with persistent storage
|
| 77 |
+
- **Chunking**: Smart text splitting with overlap for context preservation
|
app/document_processor.py
CHANGED
|
@@ -7,15 +7,31 @@ from docx import Document as DocxDocument
|
|
| 7 |
|
| 8 |
class DocumentProcessor:
|
| 9 |
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 11 |
chunk_size=chunk_size,
|
| 12 |
chunk_overlap=chunk_overlap,
|
| 13 |
length_function=len,
|
| 14 |
)
|
| 15 |
|
| 16 |
-
|
| 17 |
def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
|
| 18 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# Create documents with metadata
|
| 20 |
return self.text_splitter.create_documents(
|
| 21 |
[text],
|
|
@@ -23,7 +39,15 @@ class DocumentProcessor:
|
|
| 23 |
)
|
| 24 |
|
| 25 |
def process_pdf(self, file_path: str) -> List[Document]:
|
| 26 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
reader = PyPDF2.PdfReader(file_path)
|
| 28 |
text = ""
|
| 29 |
for page_num, page in enumerate(reader.pages):
|
|
@@ -32,17 +56,30 @@ class DocumentProcessor:
|
|
| 32 |
text += f"\n---- Page {page_num + 1} ----\n{page_text}"
|
| 33 |
return self._chunk_text(file_path, text, "pdf")
|
| 34 |
|
| 35 |
-
|
| 36 |
def process_docx(self, file_path: str) -> List[Document]:
|
| 37 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
doc = DocxDocument(file_path)
|
| 39 |
text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
| 40 |
return self._chunk_text(file_path, text, "docx")
|
| 41 |
-
|
| 42 |
|
| 43 |
def process_txt(self, file_path: str) -> List[Document]:
|
| 44 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
with open(file_path, "r", encoding="utf-8") as file:
|
| 46 |
text = file.read()
|
| 47 |
return self._chunk_text(file_path, text, "txt")
|
| 48 |
-
|
|
|
|
| 7 |
|
| 8 |
class DocumentProcessor:
|
| 9 |
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
    """
    Build the text splitter stored on ``self.text_splitter``.

    Args:
        chunk_size: Maximum characters per chunk (default: 1000)
        chunk_overlap: Characters shared between neighbouring chunks (default: 200)
    """
    # Chunk length is measured with the builtin ``len``, i.e. in characters.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
|
| 22 |
|
|
|
|
| 23 |
def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
|
| 24 |
+
"""
|
| 25 |
+
Split text into overlapping chunks with metadata for better retrieval.
|
| 26 |
+
|
| 27 |
+
Args:
|
| 28 |
+
file_path: Original file path for metadata
|
| 29 |
+
text: Text content to split
|
| 30 |
+
doc_type: Document type (pdf/docx/txt)
|
| 31 |
+
|
| 32 |
+
Returns:
|
| 33 |
+
List[Document]: Chunked documents with metadata
|
| 34 |
+
"""
|
| 35 |
# Create documents with metadata
|
| 36 |
return self.text_splitter.create_documents(
|
| 37 |
[text],
|
|
|
|
| 39 |
)
|
| 40 |
|
| 41 |
def process_pdf(self, file_path: str) -> List[Document]:
|
| 42 |
+
"""
|
| 43 |
+
Extract text from PDF file and convert to chunked documents.
|
| 44 |
+
|
| 45 |
+
Args:
|
| 46 |
+
file_path: Path to PDF file
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
List[Document]: Processed document chunks
|
| 50 |
+
"""
|
| 51 |
reader = PyPDF2.PdfReader(file_path)
|
| 52 |
text = ""
|
| 53 |
for page_num, page in enumerate(reader.pages):
|
|
|
|
| 56 |
text += f"\n---- Page {page_num + 1} ----\n{page_text}"
|
| 57 |
return self._chunk_text(file_path, text, "pdf")
|
| 58 |
|
|
|
|
| 59 |
def process_docx(self, file_path: str) -> List[Document]:
    """
    Read a DOCX file and turn its paragraph text into chunked documents.

    Args:
        file_path: Path to DOCX file

    Returns:
        List[Document]: Processed document chunks
    """
    # One line of text per paragraph, in document order.
    paragraph_texts = []
    for para in DocxDocument(file_path).paragraphs:
        paragraph_texts.append(para.text)
    joined = "\n".join(paragraph_texts)
    return self._chunk_text(file_path, joined, "docx")
|
|
|
|
| 72 |
|
| 73 |
def process_txt(self, file_path: str) -> List[Document]:
    """
    Load a UTF-8 text file and turn its content into chunked documents.

    Args:
        file_path: Path to TXT file

    Returns:
        List[Document]: Processed document chunks
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    chunks = self._chunk_text(file_path, contents, "txt")
    return chunks
|
|
|
app/main.py
CHANGED
|
@@ -2,16 +2,32 @@ import gradio as gr
|
|
| 2 |
from rag_pipeline import RAGPipeline
|
| 3 |
from document_processor import DocumentProcessor
|
| 4 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
class DocumentRagApp:
|
| 8 |
def __init__(self):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
self.processor = DocumentProcessor()
|
| 10 |
self.rag_pipeline = RAGPipeline()
|
| 11 |
self.loaded_documents = []
|
| 12 |
|
| 13 |
def process_document(self, file):
|
| 14 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
if file is None:
|
| 16 |
return "Please upload a file."
|
| 17 |
try:
|
|
@@ -36,6 +52,15 @@ class DocumentRagApp:
|
|
| 36 |
return f"Error processing file: {str(e)}"
|
| 37 |
|
| 38 |
def ask_question(self, question):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if not self.loaded_documents:
|
| 40 |
return "Please upload and process a document before asking questions."
|
| 41 |
|
|
@@ -46,14 +71,6 @@ class DocumentRagApp:
|
|
| 46 |
result = self.rag_pipeline.query(question)
|
| 47 |
answer = result["answer"]
|
| 48 |
return answer
|
| 49 |
-
# sources = result["sources"]
|
| 50 |
-
# source_response = ""
|
| 51 |
-
# for i, doc in enumerate(sources[:3], start=1):
|
| 52 |
-
# src_name = doc.metadata.get("source", "Unknown Source")
|
| 53 |
-
# content_preview = doc.page_content[:100] + "..."
|
| 54 |
-
# source_response += f"\n{i}. {src_name}\n '{content_preview}'\n"
|
| 55 |
-
# source_response += f"\n{i}. {content_preview}\n"
|
| 56 |
-
# return answer, source_response
|
| 57 |
except Exception as e:
|
| 58 |
return f"Error answering question: {str(e)}"
|
| 59 |
|
|
@@ -87,11 +104,7 @@ with gr.Blocks(title="AI Document QA System") as demo:
|
|
| 87 |
|
| 88 |
with gr.Column(scale=2):
|
| 89 |
gr.Markdown("### 3. Answer")
|
| 90 |
-
|
| 91 |
answer_output = gr.Markdown(container=True, min_height="480px")
|
| 92 |
-
# sources_output = gr.Markdown(
|
| 93 |
-
# label="Sources", container=True, min_height="120px"
|
| 94 |
-
# )
|
| 95 |
|
| 96 |
# Connect all functions
|
| 97 |
process_btn.click(
|
|
@@ -102,8 +115,7 @@ with gr.Blocks(title="AI Document QA System") as demo:
|
|
| 102 |
fn=app.ask_question,
|
| 103 |
inputs=[question_input],
|
| 104 |
outputs=[answer_output],
|
| 105 |
-
# outputs=[answer_output, sources_output],
|
| 106 |
)
|
| 107 |
|
| 108 |
if __name__ == "__main__":
|
| 109 |
-
demo.launch(share=
|
|
|
|
| 2 |
from rag_pipeline import RAGPipeline
|
| 3 |
from document_processor import DocumentProcessor
|
| 4 |
import os
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load environment variables from .env file
|
| 8 |
+
load_dotenv()
|
| 9 |
|
| 10 |
|
| 11 |
class DocumentRagApp:
|
| 12 |
def __init__(self):
    """
    Create the document processor and RAG pipeline used by the UI callbacks.

    ``loaded_documents`` starts empty; ``ask_question`` checks it before
    running a query.
    """
    self.processor = DocumentProcessor()
    self.rag_pipeline = RAGPipeline()
    self.loaded_documents = list()
|
| 20 |
|
| 21 |
def process_document(self, file):
|
| 22 |
+
"""
|
| 23 |
+
Process uploaded document (PDF/DOCX/TXT) and add to RAG system.
|
| 24 |
+
|
| 25 |
+
Args:
|
| 26 |
+
file: Gradio file upload object
|
| 27 |
+
|
| 28 |
+
Returns:
|
| 29 |
+
str: Status message with processing results or error
|
| 30 |
+
"""
|
| 31 |
if file is None:
|
| 32 |
return "Please upload a file."
|
| 33 |
try:
|
|
|
|
| 52 |
return f"Error processing file: {str(e)}"
|
| 53 |
|
| 54 |
def ask_question(self, question):
|
| 55 |
+
"""
|
| 56 |
+
Answer user question using RAG pipeline with rate limiting.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
question: User's question string
|
| 60 |
+
|
| 61 |
+
Returns:
|
| 62 |
+
str: Generated answer or error message
|
| 63 |
+
"""
|
| 64 |
if not self.loaded_documents:
|
| 65 |
return "Please upload and process a document before asking questions."
|
| 66 |
|
|
|
|
| 71 |
result = self.rag_pipeline.query(question)
|
| 72 |
answer = result["answer"]
|
| 73 |
return answer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
except Exception as e:
|
| 75 |
return f"Error answering question: {str(e)}"
|
| 76 |
|
|
|
|
| 104 |
|
| 105 |
with gr.Column(scale=2):
|
| 106 |
gr.Markdown("### 3. Answer")
|
|
|
|
| 107 |
answer_output = gr.Markdown(container=True, min_height="480px")
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
# Connect all functions
|
| 110 |
process_btn.click(
|
|
|
|
| 115 |
fn=app.ask_question,
|
| 116 |
inputs=[question_input],
|
| 117 |
outputs=[answer_output],
|
|
|
|
| 118 |
)
|
| 119 |
|
| 120 |
if __name__ == "__main__":
|
| 121 |
+
demo.launch(share=False)
|
app/rag_pipeline.py
CHANGED
|
@@ -1,76 +1,177 @@
|
|
| 1 |
from langchain_chroma import Chroma
|
| 2 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 3 |
-
from
|
| 4 |
from langchain_core.prompts import PromptTemplate
|
| 5 |
from langchain_core.documents import Document
|
| 6 |
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
|
| 7 |
from typing import List
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class RAGPipeline:
|
| 10 |
def __init__(self, persist_directory: str = "./data/chroma_db"):
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
self.embeddings = HuggingFaceEmbeddings(
|
| 13 |
-
model_name="
|
|
|
|
|
|
|
| 14 |
)
|
| 15 |
-
|
|
|
|
| 16 |
self.vector_store = Chroma(
|
| 17 |
persist_directory=persist_directory,
|
| 18 |
embedding_function=self.embeddings,
|
| 19 |
)
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# Create RAG chain
|
| 24 |
self.rag_chain = self.create_rag_chain()
|
| 25 |
|
| 26 |
def create_rag_chain(self):
|
| 27 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
prompt = PromptTemplate(
|
| 29 |
input_variables=["context", "question"],
|
| 30 |
-
template="""
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
Use factual information to answer the question. Verify the information you provide.
|
| 35 |
-
Prettify your answer with markdown formatting.".
|
| 36 |
-
|
| 37 |
Context: {context}
|
| 38 |
|
| 39 |
Question: {question}
|
| 40 |
|
| 41 |
-
|
| 42 |
-
"""
|
| 43 |
)
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
rag_chain = RunnableParallel(
|
| 46 |
{
|
| 47 |
"result": (
|
| 48 |
{"context": retriever, "question": RunnablePassthrough()}
|
| 49 |
| prompt
|
| 50 |
| self.llm
|
| 51 |
-
),
|
| 52 |
"source_documents": retriever,
|
| 53 |
}
|
| 54 |
)
|
| 55 |
return rag_chain
|
| 56 |
-
|
| 57 |
|
| 58 |
def add_documents(self, documents: List[Document]) -> None:
|
| 59 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
self.vector_store.add_documents(documents)
|
| 61 |
# In newer versions of langchain-chroma, persist() is no longer needed
|
| 62 |
# as documents are automatically persisted when added
|
| 63 |
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def query(self, question: str):
|
| 66 |
-
"""
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
answer = self.rag_chain.invoke(question)
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from langchain_chroma import Chroma
|
| 2 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
from langchain_core.prompts import PromptTemplate
|
| 5 |
from langchain_core.documents import Document
|
| 6 |
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
|
| 7 |
from typing import List
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
import json
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Fix tokenizer warning
|
| 14 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 15 |
+
|
| 16 |
|
| 17 |
class RAGPipeline:
|
| 18 |
def __init__(self, persist_directory: str = "./data/chroma_db"):
    """
    Initialize RAG pipeline with embeddings, vector store, and LLM.

    The OpenRouter API key is validated first so a misconfigured environment
    fails immediately, before the embedding model is loaded or any directory
    is created. Sets up the rate-limit state file (10 queries/hour is
    enforced in ``_check_rate_limit``) and uses the OpenRouter API with the
    free Gemma model.

    Args:
        persist_directory: Path to store ChromaDB vector database (default: ./data/chroma_db)

    Raises:
        ValueError: If the OPENROUTER_API_KEY environment variable is not set
    """
    # Fail fast: nothing below is useful without an API key.
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if not openrouter_key:
        raise ValueError(
            "OPENROUTER_API_KEY environment variable not set. "
            "Get one free at https://openrouter.ai/keys"
        )

    # Initialize better embeddings (BAAI/bge-small-en-v1.5)
    self.embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},  # Important for bge models
    )

    # Initialize vector store
    self.vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=self.embeddings,
    )

    # Rate limiting setup (10 queries per hour); query timestamps are kept in this file
    self.rate_limit_file = Path("./data/rate_limit.json")
    self.rate_limit_file.parent.mkdir(parents=True, exist_ok=True)

    # Using google/gemma-3-4b-it:free - free tier on OpenRouter,
    # reached through its OpenAI-compatible endpoint
    self.llm = ChatOpenAI(
        model="google/gemma-3-4b-it:free",
        openai_api_key=openrouter_key,
        openai_api_base="https://openrouter.ai/api/v1",
        temperature=0.1,
        max_tokens=512,
    )

    # Create RAG chain
    self.rag_chain = self.create_rag_chain()
|
| 62 |
|
| 63 |
def create_rag_chain(self):
    """
    Creates the RAG chain by combining retriever, prompt template, and LLM.

    The retrieved chunks are joined into plain text before being placed in
    the prompt, so the model sees only the chunk contents rather than the
    repr of a list of Document objects.

    Returns:
        RunnableParallel: Chain producing ``result`` (the LLM output) and
        ``source_documents`` (the retrieved chunks) for a question string
    """
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""Answer the question based on the context below. If you cannot answer based on the context, say "I don't know".
Do not hallucinate. Do not make up information.
Format your answer using markdown for better readability.

Context: {context}

Question: {question}

Provide a clear and concise answer:""",
    )

    retriever = self.vector_store.as_retriever(
        search_kwargs={"k": 4}  # Retrieve top 4 most relevant chunks
    )

    def _format_docs(docs):
        # Keep only the text of each chunk, separated by blank lines.
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = RunnableParallel(
        {
            "result": (
                # Piping a plain function after a runnable wraps it in a RunnableLambda.
                {"context": retriever | _format_docs, "question": RunnablePassthrough()}
                | prompt
                | self.llm
            ),
            "source_documents": retriever,
        }
    )
    return rag_chain
|
|
|
|
| 98 |
|
| 99 |
def add_documents(self, documents: List[Document]) -> None:
    """
    Store processed document chunks in the vector store so they can be retrieved.

    Args:
        documents: List of Document objects with text and metadata
    """
    # No explicit persist() call here: in newer versions of langchain-chroma
    # documents are automatically persisted when added.
    store = self.vector_store
    store.add_documents(documents)
|
| 109 |
|
| 110 |
+
def _check_rate_limit(self) -> bool:
    """
    Enforces rate limit of 10 queries per hour by tracking query timestamps.

    Timestamps are stored as ISO strings under the ``"queries"`` key of the
    JSON file at ``self.rate_limit_file``. A missing file means no queries
    have been recorded yet. An unreadable file (empty, truncated, wrong
    shape, bad timestamps) is treated as empty history instead of raising,
    so one bad write cannot block every later query.

    Returns:
        bool: True if within limit (the current query is recorded),
        False if exceeded (nothing is written)
    """
    max_queries = 10
    now = datetime.now()
    one_hour_ago = now - timedelta(hours=1)

    # Load existing queries, keeping only those from the last hour
    recent_queries = []
    try:
        with open(self.rate_limit_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        recent_queries = [
            stamp
            for stamp in (datetime.fromisoformat(q) for q in data.get("queries", []))
            if stamp > one_hour_ago
        ]
    except FileNotFoundError:
        # First query: no history yet.
        pass
    except (ValueError, TypeError, AttributeError):
        # Corrupt state (json.JSONDecodeError is a ValueError): start over.
        recent_queries = []

    # Check limit
    if len(recent_queries) >= max_queries:
        return False

    # Add current query
    recent_queries.append(now)

    # Save updated queries
    with open(self.rate_limit_file, "w", encoding="utf-8") as f:
        json.dump({"queries": [q.isoformat() for q in recent_queries]}, f)

    return True
|
| 143 |
+
|
| 144 |
def query(self, question: str):
    """
    Query the RAG system with a question, retrieves relevant context and generates answer.

    Args:
        question: User's question string

    Returns:
        dict: {"answer": str, "sources": list} where ``answer`` is the
        generated response (or a fallback apology when the model returned
        nothing) and ``sources`` holds the chain's ``source_documents``

    Raises:
        ValueError: If rate limit (10 queries/hour) is exceeded
    """
    # Check rate limit
    if not self._check_rate_limit():
        raise ValueError(
            "Rate limit exceeded. You can only ask 10 questions per hour. "
            "Please try again later."
        )

    answer = self.rag_chain.invoke(question)
    result = answer["result"]

    if hasattr(result, "content"):
        answer_text = result.content
    elif hasattr(result, "text"):
        answer_text = result.text
    else:
        answer_text = str(result)

    # Message content may be a list of content blocks instead of a string;
    # flatten it so the emptiness check below cannot fail on a non-string.
    if isinstance(answer_text, list):
        pieces = []
        for part in answer_text:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict):
                pieces.append(str(part.get("text", "")))
            else:
                pieces.append(str(part))
        answer_text = "".join(pieces)
    elif answer_text is not None and not isinstance(answer_text, str):
        answer_text = str(answer_text)

    # Check if answer is empty
    if not answer_text or answer_text.strip() == "":
        answer_text = "I apologize, but I couldn't generate a response. Please try rephrasing your question."

    # Extra key is additive: callers reading only "answer" are unaffected.
    return {"answer": answer_text, "sources": answer.get("source_documents", [])}
|
requirements.txt
CHANGED
|
@@ -1,43 +1,27 @@
|
|
| 1 |
-
|
| 2 |
-
fastapi==0.121.2
|
| 3 |
gradio==5.49.1
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
jinja2==3.1.6
|
| 7 |
-
joblib==1.5.2
|
| 8 |
langchain==1.0.7
|
| 9 |
-
langchain-chroma==1.0.0
|
| 10 |
-
langchain-classic==1.0.0
|
| 11 |
-
langchain-community==0.4.1
|
| 12 |
langchain-core==1.0.5
|
| 13 |
-
langchain-huggingface==1.0.1
|
| 14 |
-
langchain-ollama==1.0.0
|
| 15 |
langchain-text-splitters==1.0.0
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
pypdf2==3.0.1
|
| 30 |
python-docx==1.2.0
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
requests-oauthlib==2.0.0
|
| 35 |
-
requests-toolbelt==1.0.0
|
| 36 |
-
ruff==0.14.5
|
| 37 |
-
scikit-learn==1.7.2
|
| 38 |
-
scipy>=1.0
|
| 39 |
-
sqlalchemy>=2.0
|
| 40 |
-
tqdm>=4.0
|
| 41 |
-
transformers>=4.0
|
| 42 |
-
urllib3>=2.0
|
| 43 |
-
sentence-transformers>=5.0
|
|
|
|
| 1 |
+
# Core App Framework
|
|
|
|
| 2 |
gradio==5.49.1
|
| 3 |
+
|
| 4 |
+
# LangChain Core
|
|
|
|
|
|
|
| 5 |
langchain==1.0.7
|
|
|
|
|
|
|
|
|
|
| 6 |
langchain-core==1.0.5
|
|
|
|
|
|
|
| 7 |
langchain-text-splitters==1.0.0
|
| 8 |
+
|
| 9 |
+
# LangChain Integrations
|
| 10 |
+
langchain-chroma==1.0.0
|
| 11 |
+
langchain-huggingface==1.0.1
|
| 12 |
+
langchain-openai
|
| 13 |
+
|
| 14 |
+
# Vector Database
|
| 15 |
+
chromadb==1.3.4
|
| 16 |
+
|
| 17 |
+
# HuggingFace & Embeddings
|
| 18 |
+
huggingface-hub==0.36.0
|
| 19 |
+
sentence-transformers>=3.0.0
|
| 20 |
+
transformers>=4.0
|
| 21 |
+
|
| 22 |
+
# Document Processing
|
| 23 |
pypdf2==3.0.1
|
| 24 |
python-docx==1.2.0
|
| 25 |
+
|
| 26 |
+
# Environment & Config
|
| 27 |
+
python-dotenv==1.2.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/experiments.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Experimental code for testing RAG pipeline
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
+
|
| 7 |
+
from app.rag_pipeline import RAGPipeline
|
| 8 |
+
from app.document_processor import DocumentProcessor
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Example 1: Simple text test
|
| 15 |
+
def test_simple_query():
    """Index a three-sentence text about Python and print the answer to one question."""
    processor = DocumentProcessor()

    test_doc = """Python is a high-level programming language.
It was created by Guido van Rossum in 1991.
Python is known for its simple syntax."""

    # Initialize RAG and index the chunked text
    rag_pipeline = RAGPipeline.__call__() if False else None
    chunks = processor._chunk_text("test_doc.txt", test_doc, doc_type="txt")
    rag_pipeline = RAGPipeline()
    rag_pipeline.add_documents(chunks)

    # Query
    question = "What is python known for?"
    answer = rag_pipeline.query(question)["answer"]
    for line in (f"Question: {question}", f"Answer: {answer}", "\n" + "=" * 50 + "\n"):
        print(line)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Example 2: Testing with actual document
|
| 37 |
+
def test_with_pdf():
    """Index a PDF from a placeholder path and print the answer to one question."""
    processor = DocumentProcessor()
    rag_pipeline = RAGPipeline()

    # Process a PDF file
    pdf_path = "path/to/your/test.pdf"  # Replace with actual path
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return

    rag_pipeline.add_documents(processor.process_pdf(pdf_path))

    question = "What is the main topic of this document?"
    result = rag_pipeline.query(question)
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Example 3: Interactive testing
|
| 56 |
+
def interactive_test():
    """Index a short AI primer, then answer questions typed on stdin until 'quit'."""
    processor = DocumentProcessor()
    rag_pipeline = RAGPipeline()

    # Add some test content
    test_doc = """Artificial Intelligence (AI) is transforming the world.
Machine learning is a subset of AI that focuses on learning from data.
Deep learning uses neural networks with multiple layers.
Natural Language Processing (NLP) helps computers understand human language."""

    rag_pipeline.add_documents(
        processor._chunk_text("ai_basics.txt", test_doc, doc_type="txt")
    )

    print("Interactive RAG Testing")
    print("Type 'quit' to exit\n")

    # Loop ends on "quit" (any casing) or on the first ValueError from query().
    while (question := input("Your question: ")).lower() != "quit":
        try:
            result = rag_pipeline.query(question)
            print(f"Answer: {result['answer']}\n")
        except ValueError as e:
            print(f"Error: {e}\n")
            break
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == "__main__":
    # Script entry point: only the simple text experiment runs by default.
    print("Running RAG Pipeline Experiments\n")

    # Run simple test
    test_simple_query()

    # Uncomment to run other tests
    # test_with_pdf()
    # interactive_test()
|
tests/test_rag_pipeline.py
CHANGED
|
@@ -1,29 +1,43 @@
|
|
| 1 |
# Test rag pipeline
|
| 2 |
import sys
|
| 3 |
import os
|
|
|
|
| 4 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 5 |
|
| 6 |
from app.rag_pipeline import RAGPipeline
|
| 7 |
from app.document_processor import DocumentProcessor
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
processor = DocumentProcessor()
|
| 11 |
# chunks = processor.process_pdf("./data/test.pdf")
|
| 12 |
-
test_doc =
|
| 13 |
-
"""
|
| 14 |
-
Python is a high-level programming language.
|
| 15 |
It was created by Guido van Rossum in 1991.
|
| 16 |
Python is known for its simple syntax.,
|
| 17 |
-
test_python.txt
|
| 18 |
-
|
| 19 |
-
)
|
| 20 |
|
| 21 |
-
# Initialize Rag
|
| 22 |
rag_pipeline = RAGPipeline()
|
| 23 |
-
rag_pipeline.add_documents(
|
| 24 |
|
| 25 |
# Query
|
| 26 |
question = "What is python known for?"
|
| 27 |
result = rag_pipeline.query(question)
|
| 28 |
print(f"Answer: {result['answer']}")
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Test rag pipeline
|
| 2 |
import sys
|
| 3 |
import os
|
| 4 |
+
|
| 5 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 6 |
|
| 7 |
from app.rag_pipeline import RAGPipeline
|
| 8 |
from app.document_processor import DocumentProcessor
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
load_dotenv()
|
| 12 |
|
| 13 |
|
| 14 |
processor = DocumentProcessor()
# chunks = processor.process_pdf("./data/test.pdf")

# Sample text only: the stray ".," and "test_python.txt" left over from the
# earlier tuple form were removed so they are not indexed as document content.
test_doc = """Python is a high-level programming language.
It was created by Guido van Rossum in 1991.
Python is known for its simple syntax."""
chunks = processor._chunk_text("user", test_doc, doc_type="txt")

# Initialize the RAG pipeline and index the chunks from the document processor
rag_pipeline = RAGPipeline()
rag_pipeline.add_documents(chunks)

# Query
question = "What is python known for?"
result = rag_pipeline.query(question)
print(f"Answer: {result['answer']}")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Format sources with page numbers
|
| 33 |
+
# sources = result["sources_formatted"]
|
| 34 |
+
# source_info = []
|
| 35 |
+
# for i, doc in enumerate(sources, 1):
|
| 36 |
+
# source_file = doc.metadata.get("source", "Unknown")
|
| 37 |
+
# # Extract just filename
|
| 38 |
+
# source_name = source_file.split("/")[-1] if "/" in source_file else source_file
|
| 39 |
+
# page_preview = doc.page_content[:100].replace("\n", " ")
|
| 40 |
+
# source_info.append(f"**[{i}]** {source_name}\n> {page_preview}...")
|
| 41 |
+
|
| 42 |
+
# sources_text = "\n\n".join(source_info) if source_info else "No sources found"
|
| 43 |
+
# print(f"Sources: {sources_text}")
|