Spaces:

pkgprateek
/

ai-rag-document

Sleeping

App Files Files Community

pkgprateek committed on Nov 19, 2025

Commit

a864c4e

1 Parent(s): 765b8d8

OpenRouter Added, Rate Limiting Fixed

Browse files

Files changed (9) hide show

.env.example +6 -0
.gitignore +2 -2
README.md +61 -37
app/document_processor.py +45 -8
app/main.py +27 -15
app/rag_pipeline.py +132 -31
requirements.txt +21 -37
tests/experiments.py +93 -0
tests/test_rag_pipeline.py +23 -9

.env.example ADDED Viewed

	@@ -0,0 +1,6 @@

+# Environment Variables
+# OpenRouter API Key (Required)
+# Get your FREE key at: https://openrouter.ai/keys
+# Using free tier with google/gemma-3-4b-it:free model
+OPENROUTER_API_KEY=your_openrouter_api_key_here

.gitignore CHANGED Viewed

@@ -1,5 +1,5 @@
 .DS_Store
 __pycache__
-data
 .gradio
-data/chroma_db/

 .DS_Store
 __pycache__
 .gradio
+data/
+.env

README.md CHANGED Viewed

@@ -10,44 +10,68 @@ pinned: false
 ---
 # AI Document Intelligence System
-Upload documents and ask questions. Built with:
-- LangChain for RAG orchestration
-- ChromaDB for vector storage
-- Sentence Transformers for embeddings
-- Gradio for UI
 ## Features
-- Interactive document processing
-- Context-aware question answering
-- Support for multiple file formats
-- Real-time processing and analysis
-- Multi-language support
-- Customizable knowledge base
-## Installation
-To get started with the AI Document Intelligence System, follow these steps:
-1. Clone the repository:
-   ```bash
-   git clone https://github.com/yourusername/ai-document-intelligence.git
-   cd ai-document-intelligence
-   ```
-2. Create a virtual environment and activate it:
-   ```bash
-   python -m venv venv
-   source venv/bin/activate  # On Windows, use `venv\Scripts\activate`
-   ```
-3. Install the required dependencies:
-   ```bash
-   pip install -r requirements.txt
-   ```
-4. Run the application:
-   ```bash
-   python app/main.py
-   ```
-5. Open your web browser and navigate to the provided local URL to access the Gradio interface.
-## Usage:
 1. Upload a PDF/DOCX/TXT file
 2. Click "Process Document"
-3. Ask questions about the content
-4. Get answers with source citations

 ---
 # AI Document Intelligence System
+Upload documents and ask questions using advanced RAG (Retrieval-Augmented Generation) technology. Built with:
+- **LangChain** for RAG orchestration
+- **ChromaDB** for vector storage
+- **BAAI/bge-small-en-v1.5** embeddings for superior retrieval quality
+- **Google Gemma 3 (4B, instruction-tuned)** via the OpenRouter API
+- **Gradio** for interactive UI
 ## Features
+- Interactive document processing (PDF, DOCX, TXT)
+- Context-aware question answering with improved embeddings
+- ⚡ Real-time processing and analysis
+- Source citation for transparency
+- Cloud-ready deployment on HuggingFace Spaces
+## Setup
+### 1. Get an OpenRouter API Key
+1. Create a free account at [OpenRouter](https://openrouter.ai)
+2. Go to [Keys](https://openrouter.ai/keys)
+3. Create a new API key
+4. Copy the key
+### 2. Local Installation
+```bash
+# Clone the repository
+git clone https://github.com/pkgprateek/ai-rag-document.git
+cd ai-rag-document
+# Create virtual environment
+python -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+# Install dependencies
+pip install -r requirements.txt
+# Set up environment variables
+cp .env.example .env
+# Edit .env and add your OPENROUTER_API_KEY
+# Run the application
+python app/main.py
+```
+### 3. Deploy to HuggingFace Spaces
+1. **Fork or upload this repo to HuggingFace Spaces**
+2. **Add your OPENROUTER_API_KEY as a Space Secret:**
+   - Go to your Space Settings → Repository secrets
+   - Add a new secret: `OPENROUTER_API_KEY` = your API key
+3. **Your app will automatically deploy!**
+## Usage
 1. Upload a PDF/DOCX/TXT file
 2. Click "Process Document"
+3. Ask a question and get an accurate answer with markdown formatting
+## Technical Details
+- **Embeddings**: BAAI/bge-small-en-v1.5 (significantly better than all-MiniLM-L6-v2)
+- **LLM**: Google Gemma 3 4B Instruct (`google/gemma-3-4b-it:free`) via the OpenRouter API
+- **Vector Store**: ChromaDB with persistent storage
+- **Chunking**: Smart text splitting with overlap for context preservation

app/document_processor.py CHANGED Viewed

@@ -7,15 +7,31 @@ from docx import Document as DocxDocument
 class DocumentProcessor:
     def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             length_function=len,
         )
     def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
-        """Split text into chunks"""
         # Create documents with metadata
         return self.text_splitter.create_documents(
             [text],
@@ -23,7 +39,15 @@ class DocumentProcessor:
         )
     def process_pdf(self, file_path: str) -> List[Document]:
-        """Extract text from a PDF file and split it into chunks"""
         reader = PyPDF2.PdfReader(file_path)
         text = ""
         for page_num, page in enumerate(reader.pages):
@@ -32,17 +56,30 @@ class DocumentProcessor:
                 text += f"\n---- Page {page_num + 1} ----\n{page_text}"
         return self._chunk_text(file_path, text, "pdf")
     def process_docx(self, file_path: str) -> List[Document]:
-        """Extract text from a DOCX file and split it into chunks"""
         doc = DocxDocument(file_path)
         text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
         return self._chunk_text(file_path, text, "docx")
     def process_txt(self, file_path: str) -> List[Document]:
-        """Process raw text into chunks"""
         with open(file_path, "r", encoding="utf-8") as file:
             text = file.read()
         return self._chunk_text(file_path, text, "txt")

 class DocumentProcessor:
     def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
+        """
+        Initialize document processor with text splitting configuration.
+        Args:
+            chunk_size: Maximum characters per chunk (default: 1000)
+            chunk_overlap: Characters to overlap between chunks (default: 200)
+        """
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             length_function=len,
         )
     def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
+        """
+        Split text into overlapping chunks with metadata for better retrieval.
+        Args:
+            file_path: Original file path for metadata
+            text: Text content to split
+            doc_type: Document type (pdf/docx/txt)
+        Returns:
+            List[Document]: Chunked documents with metadata
+        """
         # Create documents with metadata
         return self.text_splitter.create_documents(
             [text],
         )
     def process_pdf(self, file_path: str) -> List[Document]:
+        """
+        Extract text from PDF file and convert to chunked documents.
+        Args:
+            file_path: Path to PDF file
+        Returns:
+            List[Document]: Processed document chunks
+        """
         reader = PyPDF2.PdfReader(file_path)
         text = ""
         for page_num, page in enumerate(reader.pages):
                 text += f"\n---- Page {page_num + 1} ----\n{page_text}"
         return self._chunk_text(file_path, text, "pdf")
     def process_docx(self, file_path: str) -> List[Document]:
+        """
+        Extract text from DOCX file and convert to chunked documents.
+        Args:
+            file_path: Path to DOCX file
+        Returns:
+            List[Document]: Processed document chunks
+        """
         doc = DocxDocument(file_path)
         text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
         return self._chunk_text(file_path, text, "docx")
     def process_txt(self, file_path: str) -> List[Document]:
+        """
+        Read text file and convert to chunked documents.
+        Args:
+            file_path: Path to TXT file
+        Returns:
+            List[Document]: Processed document chunks
+        """
         with open(file_path, "r", encoding="utf-8") as file:
             text = file.read()
         return self._chunk_text(file_path, text, "txt")

app/main.py CHANGED Viewed

@@ -2,16 +2,32 @@ import gradio as gr
 from rag_pipeline import RAGPipeline
 from document_processor import DocumentProcessor
 import os
 class DocumentRagApp:
     def __init__(self):
         self.processor = DocumentProcessor()
         self.rag_pipeline = RAGPipeline()
         self.loaded_documents = []
     def process_document(self, file):
-        """Process uplaoded document and add to RAG"""
         if file is None:
             return "Please upload a file."
         try:
@@ -36,6 +52,15 @@ class DocumentRagApp:
             return f"Error processing file: {str(e)}"
     def ask_question(self, question):
         if not self.loaded_documents:
             return "Please upload and process a document before asking questions."
@@ -46,14 +71,6 @@ class DocumentRagApp:
             result = self.rag_pipeline.query(question)
             answer = result["answer"]
             return answer
-            # sources = result["sources"]
-            # source_response = ""
-            # for i, doc in enumerate(sources[:3], start=1):
-            # src_name = doc.metadata.get("source", "Unknown Source")
-            # content_preview = doc.page_content[:100] + "..."
-            # source_response += f"\n{i}. {src_name}\n '{content_preview}'\n"
-            # source_response += f"\n{i}. {content_preview}\n"
-            # return answer, source_response
         except Exception as e:
             return f"Error answering question: {str(e)}"
@@ -87,11 +104,7 @@ with gr.Blocks(title="AI Document QA System") as demo:
         with gr.Column(scale=2):
             gr.Markdown("### 3. Answer")
             answer_output = gr.Markdown(container=True, min_height="480px")
-            # sources_output = gr.Markdown(
-            #     label="Sources", container=True, min_height="120px"
-            # )
         # Connect all functions
         process_btn.click(
@@ -102,8 +115,7 @@ with gr.Blocks(title="AI Document QA System") as demo:
             fn=app.ask_question,
             inputs=[question_input],
             outputs=[answer_output],
-            # outputs=[answer_output, sources_output],
         )
 if __name__ == "__main__":
-    demo.launch(share=True)

 from rag_pipeline import RAGPipeline
 from document_processor import DocumentProcessor
 import os
+from dotenv import load_dotenv
+# Load environment variables from .env file
+load_dotenv()
 class DocumentRagApp:
     def __init__(self):
+        """
+        Initialize Document RAG application with processor and pipeline.
+        Loads environment variables and sets up components.
+        """
         self.processor = DocumentProcessor()
         self.rag_pipeline = RAGPipeline()
         self.loaded_documents = []
     def process_document(self, file):
+        """
+        Process uploaded document (PDF/DOCX/TXT) and add to RAG system.
+        Args:
+            file: Gradio file upload object
+        Returns:
+            str: Status message with processing results or error
+        """
         if file is None:
             return "Please upload a file."
         try:
             return f"Error processing file: {str(e)}"
     def ask_question(self, question):
+        """
+        Answer user question using RAG pipeline with rate limiting.
+        Args:
+            question: User's question string
+        Returns:
+            str: Generated answer or error message
+        """
         if not self.loaded_documents:
             return "Please upload and process a document before asking questions."
             result = self.rag_pipeline.query(question)
             answer = result["answer"]
             return answer
         except Exception as e:
             return f"Error answering question: {str(e)}"
         with gr.Column(scale=2):
             gr.Markdown("### 3. Answer")
             answer_output = gr.Markdown(container=True, min_height="480px")
         # Connect all functions
         process_btn.click(
             fn=app.ask_question,
             inputs=[question_input],
             outputs=[answer_output],
         )
 if __name__ == "__main__":
+    demo.launch(share=False)

app/rag_pipeline.py CHANGED Viewed

@@ -1,76 +1,177 @@
 from langchain_chroma import Chroma
 from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_ollama import OllamaLLM
 from langchain_core.prompts import PromptTemplate
 from langchain_core.documents import Document
 from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from typing import List
 class RAGPipeline:
     def __init__(self, persist_directory: str = "./data/chroma_db"):
-        #Initialize embeddings
         self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2",
         )
-        #Initialize vector store
         self.vector_store = Chroma(
             persist_directory=persist_directory,
             embedding_function=self.embeddings,
         )
-        #Initialize LLM
-        self.llm = OllamaLLM(model="gemma3:latest")
         # Create RAG chain
         self.rag_chain = self.create_rag_chain()
     def create_rag_chain(self):
-        """Create RAG chain"""
         prompt = PromptTemplate(
             input_variables=["context", "question"],
-            template="""
-            Use the following pieces of retrieved context to answer the question at the end.
-            You are an helpful assistant, so if you don't know the answer, just say that you don't know.
-            Do not hallucinate. Do not make up information. Do not guess. Do not lie.
-            Use factual information to answer the question. Verify the information you provide.
-            Prettify your answer with markdown formatting.".
             Context: {context}
             Question: {question}
-            Answer:
-            """
         )
-        retriever = self.vector_store.as_retriever(search_kwargs={"k": 4})
         rag_chain = RunnableParallel(
             {
                 "result": (
                     {"context": retriever, "question": RunnablePassthrough()}
                     | prompt
                     | self.llm
-                ),
                 "source_documents": retriever,
             }
         )
         return rag_chain
     def add_documents(self, documents: List[Document]) -> None:
-        """Add documents to the vector store"""
         self.vector_store.add_documents(documents)
         # In newer versions of langchain-chroma, persist() is no longer needed
         # as documents are automatically persisted when added
     def query(self, question: str):
-        """Query the RAG pipeline with a question"""
-        # Get answer from chain
-        # try:
-        #     answer = self.rag_chain.invoke({"question": question})
-        # except TypeError:
         answer = self.rag_chain.invoke(question)
-        return {
-            "answer": answer["result"],
-            "sources": answer["source_documents"]
-        }

 from langchain_chroma import Chroma
 from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import ChatOpenAI
 from langchain_core.prompts import PromptTemplate
 from langchain_core.documents import Document
 from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 from typing import List
+import os
+from datetime import datetime, timedelta
+import json
+from pathlib import Path
+# Fix tokenizer warning
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 class RAGPipeline:
     def __init__(self, persist_directory: str = "./data/chroma_db"):
+        """
+        Initialize RAG pipeline with embeddings, vector store, and LLM.
+        Sets up rate limiting (10 queries/hour) and uses OpenRouter API with free Gemma model.
+        Args:
+            persist_directory: Path to store ChromaDB vector database (default: ./data/chroma_db)
+        """
+        # Initialize better embeddings (BAAI/bge-small-en-v1.5)
         self.embeddings = HuggingFaceEmbeddings(
+            model_name="BAAI/bge-small-en-v1.5",
+            model_kwargs={"device": "cpu"},
+            encode_kwargs={"normalize_embeddings": True},  # Important for bge models
         )
+        # Initialize vector store
         self.vector_store = Chroma(
             persist_directory=persist_directory,
             embedding_function=self.embeddings,
         )
+        # Rate limiting setup (10 queries per hour)
+        self.rate_limit_file = Path("./data/rate_limit.json")
+        self.rate_limit_file.parent.mkdir(parents=True, exist_ok=True)
+        # Initialize LLM using OpenRouter (cheapest free option)
+        openrouter_key = os.getenv("OPENROUTER_API_KEY")
+        if not openrouter_key:
+            raise ValueError(
+                "OPENROUTER_API_KEY environment variable not set. "
+                "Get one free at https://openrouter.ai/keys"
+            )
+        # Using google/gemma-3-4b-it:free - free tier on OpenRouter
+        self.llm = ChatOpenAI(
+            model="google/gemma-3-4b-it:free",
+            openai_api_key=openrouter_key,
+            openai_api_base="https://openrouter.ai/api/v1",
+            temperature=0.1,
+            max_tokens=512,
+        )
         # Create RAG chain
         self.rag_chain = self.create_rag_chain()
     def create_rag_chain(self):
+        """
+        Creates the RAG chain by combining retriever, prompt template, and LLM.
+        Returns:
+            RunnableParallel: Chain that retrieves context and generates answers
+        """
         prompt = PromptTemplate(
             input_variables=["context", "question"],
+            template="""Answer the question based on the context below. If you cannot answer based on the context, say "I don't know".
+            Do not hallucinate. Do not make up information.
+            Format your answer using markdown for better readability.
             Context: {context}
             Question: {question}
+            Provide a clear and concise answer:""",
         )
+        retriever = self.vector_store.as_retriever(
+            search_kwargs={"k": 4}  # Retrieve top 4 most relevant chunks
+        )
         rag_chain = RunnableParallel(
             {
                 "result": (
                     {"context": retriever, "question": RunnablePassthrough()}
                     | prompt
                     | self.llm
+                ),
                 "source_documents": retriever,
             }
         )
         return rag_chain
     def add_documents(self, documents: List[Document]) -> None:
+        """
+        Add processed document chunks to the vector store for retrieval.
+        Args:
+            documents: List of Document objects with text and metadata
+        """
         self.vector_store.add_documents(documents)
         # In newer versions of langchain-chroma, persist() is no longer needed
         # as documents are automatically persisted when added
+    def _check_rate_limit(self) -> bool:
+        """
+        Enforces rate limit of 10 queries per hour by tracking query timestamps.
+        Returns:
+            bool: True if within limit, False if exceeded
+        """
+        now = datetime.now()
+        # Load existing queries
+        if self.rate_limit_file.exists():
+            with open(self.rate_limit_file, "r") as f:
+                data = json.load(f)
+                queries = [datetime.fromisoformat(q) for q in data.get("queries", [])]
+        else:
+            queries = []
+        # Remove queries older than 1 hour
+        one_hour_ago = now - timedelta(hours=1)
+        recent_queries = [q for q in queries if q > one_hour_ago]
+        # Check limit
+        if len(recent_queries) >= 10:
+            return False
+        # Add current query
+        recent_queries.append(now)
+        # Save updated queries
+        with open(self.rate_limit_file, "w") as f:
+            json.dump({"queries": [q.isoformat() for q in recent_queries]}, f)
+        return True
     def query(self, question: str):
+        """
+        Query the RAG system with a question: retrieve relevant context and generate an answer.
+        Args:
+            question: User's question string
+        Returns:
+            dict: {"answer": str} containing the generated response
+        Raises:
+            ValueError: If rate limit (10 queries/hour) is exceeded
+        """
+        # Check rate limit
+        if not self._check_rate_limit():
+            raise ValueError(
+                "Rate limit exceeded. You can only ask 10 questions per hour. "
+                "Please try again later."
+            )
         answer = self.rag_chain.invoke(question)
+        result = answer["result"]
+        if hasattr(result, "content"):
+            answer_text = result.content
+        elif hasattr(result, "text"):
+            answer_text = result.text
+        else:
+            answer_text = str(result)
+        # Check if answer is empty
+        if not answer_text or answer_text.strip() == "":
+            answer_text = "I apologize, but I couldn't generate a response. Please try rephrasing your question."
+        return {"answer": answer_text}

requirements.txt CHANGED Viewed

@@ -1,43 +1,27 @@
-chromadb==1.3.4
-fastapi==0.121.2
 gradio==5.49.1
-gradio-client==1.13.3
-huggingface-hub==0.36.0
-jinja2==3.1.6
-joblib==1.5.2
 langchain==1.0.7
-langchain-chroma==1.0.0
-langchain-classic==1.0.0
-langchain-community==0.4.1
 langchain-core==1.0.5
-langchain-huggingface==1.0.1
-langchain-ollama==1.0.0
 langchain-text-splitters==1.0.0
-langgraph==1.0.3
-langgraph-checkpoint==3.0.1
-langgraph-prebuilt==1.0.4
-langgraph-sdk==0.2.9
-langsmith==0.4.43
-markdown-it-py==4.0.0
-numpy>=2.0
-oauthlib==3.3.1
-ollama==0.6.1
-pandas==2.3.3
-pillow==11.3.0
-pip==25.3
-pygments==2.19.2
 pypdf2==3.0.1
 python-docx==1.2.0
-python-dotenv==1.2.1
-regex==2025.11.3
-requests==2.32.5
-requests-oauthlib==2.0.0
-requests-toolbelt==1.0.0
-ruff==0.14.5
-scikit-learn==1.7.2
-scipy>=1.0
-sqlalchemy>=2.0
-tqdm>=4.0
-transformers>=4.0
-urllib3>=2.0
-sentence-transformers>=5.0

+# Core App Framework
 gradio==5.49.1
+# LangChain Core
 langchain==1.0.7
 langchain-core==1.0.5
 langchain-text-splitters==1.0.0
+# LangChain Integrations
+langchain-chroma==1.0.0
+langchain-huggingface==1.0.1
+langchain-openai
+# Vector Database
+chromadb==1.3.4
+# HuggingFace & Embeddings
+huggingface-hub==0.36.0
+sentence-transformers>=3.0.0
+transformers>=4.0
+# Document Processing
 pypdf2==3.0.1
 python-docx==1.2.0
+# Environment & Config
+python-dotenv==1.2.1

tests/experiments.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# Experimental code for testing RAG pipeline
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app.rag_pipeline import RAGPipeline
+from app.document_processor import DocumentProcessor
+from dotenv import load_dotenv
+load_dotenv()
+# Example 1: Simple text test
+def test_simple_query():
+    processor = DocumentProcessor()
+    test_doc = """Python is a high-level programming language.
+    It was created by Guido van Rossum in 1991.
+    Python is known for its simple syntax."""
+    chunks = processor._chunk_text("test_doc.txt", test_doc, doc_type="txt")
+    # Initialize RAG
+    rag_pipeline = RAGPipeline()
+    rag_pipeline.add_documents(chunks)
+    # Query
+    question = "What is python known for?"
+    result = rag_pipeline.query(question)
+    print(f"Question: {question}")
+    print(f"Answer: {result['answer']}")
+    print("\n" + "=" * 50 + "\n")
+# Example 2: Testing with actual document
+def test_with_pdf():
+    processor = DocumentProcessor()
+    rag_pipeline = RAGPipeline()
+    # Process a PDF file
+    pdf_path = "path/to/your/test.pdf"  # Replace with actual path
+    if os.path.exists(pdf_path):
+        chunks = processor.process_pdf(pdf_path)
+        rag_pipeline.add_documents(chunks)
+        question = "What is the main topic of this document?"
+        result = rag_pipeline.query(question)
+        print(f"Question: {question}")
+        print(f"Answer: {result['answer']}")
+    else:
+        print(f"PDF not found: {pdf_path}")
+# Example 3: Interactive testing
+def interactive_test():
+    processor = DocumentProcessor()
+    rag_pipeline = RAGPipeline()
+    # Add some test content
+    test_doc = """Artificial Intelligence (AI) is transforming the world.
+    Machine learning is a subset of AI that focuses on learning from data.
+    Deep learning uses neural networks with multiple layers.
+    Natural Language Processing (NLP) helps computers understand human language."""
+    chunks = processor._chunk_text("ai_basics.txt", test_doc, doc_type="txt")
+    rag_pipeline.add_documents(chunks)
+    print("Interactive RAG Testing")
+    print("Type 'quit' to exit\n")
+    while True:
+        question = input("Your question: ")
+        if question.lower() == "quit":
+            break
+        try:
+            result = rag_pipeline.query(question)
+            print(f"Answer: {result['answer']}\n")
+        except ValueError as e:
+            print(f"Error: {e}\n")
+            break
+if __name__ == "__main__":
+    print("Running RAG Pipeline Experiments\n")
+    # Run simple test
+    test_simple_query()
+    # Uncomment to run other tests
+    # test_with_pdf()
+    # interactive_test()

tests/test_rag_pipeline.py CHANGED Viewed

@@ -1,29 +1,43 @@
 # Test rag pipeline
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from app.rag_pipeline import RAGPipeline
 from app.document_processor import DocumentProcessor
 processor = DocumentProcessor()
 # chunks = processor.process_pdf("./data/test.pdf")
-test_doc = processor.process_txt(
-    """
-    Python is a high-level programming language.
     It was created by Guido van Rossum in 1991.
     Python is known for its simple syntax.,
-    test_python.txt
-    """
-)
-# Initialize Rag
 rag_pipeline = RAGPipeline()
-rag_pipeline.add_documents(test_doc)
 # Query
 question = "What is python known for?"
 result = rag_pipeline.query(question)
 print(f"Answer: {result['answer']}")
-print(f"Sources: {len(result['sources'])} chunks retrieved.")

 # Test rag pipeline
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from app.rag_pipeline import RAGPipeline
 from app.document_processor import DocumentProcessor
+from dotenv import load_dotenv
+load_dotenv()
 processor = DocumentProcessor()
 # chunks = processor.process_pdf("./data/test.pdf")
+test_doc = """Python is a high-level programming language.
     It was created by Guido van Rossum in 1991.
     Python is known for its simple syntax.,
+    test_python.txt"""
+chunks = processor._chunk_text("user", test_doc, doc_type="txt")
+# Initialize the RAG pipeline and add the chunks produced by the document processor
 rag_pipeline = RAGPipeline()
+rag_pipeline.add_documents(chunks)
 # Query
 question = "What is python known for?"
 result = rag_pipeline.query(question)
 print(f"Answer: {result['answer']}")
+# Format sources with page numbers
+# sources = result["sources_formatted"]
+# source_info = []
+# for i, doc in enumerate(sources, 1):
+#     source_file = doc.metadata.get("source", "Unknown")
+#     # Extract just filename
+#     source_name = source_file.split("/")[-1] if "/" in source_file else source_file
+#     page_preview = doc.page_content[:100].replace("\n", " ")
+#     source_info.append(f"**[{i}]** {source_name}\n> {page_preview}...")
+# sources_text = "\n\n".join(source_info) if source_info else "No sources found"
+# print(f"Sources: {sources_text}")