Spaces:

pkgprateek
/

ai-rag-document

Sleeping

App Files Files Community

pkgprateek committed on Nov 18, 2025

Commit

bb58af7

0 Parent(s):

Clean snapshot for Hugging Face Space (no large files)

Browse files

Files changed (8) hide show

.gitignore +5 -0
README.md +53 -0
app/document_processor.py +48 -0
app/main.py +109 -0
app/rag_pipeline.py +76 -0
requirements.txt +42 -0
tests/test_document_prrocessor.py +10 -0
tests/test_rag_pipeline.py +29 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+.DS_Store
+__pycache__
+data
+.gradio
+data/chroma_db/

README.md ADDED Viewed

	@@ -0,0 +1,53 @@

+---
+title: AI Document Intelligence System (with RAG)
+emoji: 📚
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.10.0
+app_file: app/main.py
+pinned: false
+---
+# AI Document Intelligence System
+Upload documents and ask questions. Built with:
+- LangChain for RAG orchestration
+- ChromaDB for vector storage
+- Sentence Transformers for embeddings
+- Gradio for UI
+## Features
+- Interactive document processing
+- Context-aware question answering
+- Support for multiple file formats
+- Real-time processing and analysis
+- Multi-language support
+- Customizable knowledge base
+## Installation
+To get started with the AI Document Intelligence System, follow these steps:
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/yourusername/ai-document-intelligence.git
+   cd ai-document-intelligence
+   ```
+2. Create a virtual environment and activate it:
+   ```bash
+   python -m venv venv
+   source venv/bin/activate  # On Windows, use `venv\Scripts\activate`
+   ```
+3. Install the required dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+4. Run the application:
+   ```bash
+   python app/main.py
+   ```
+5. Open your web browser and navigate to the provided local URL to access the Gradio interface.
+## Usage:
+1. Upload a PDF/DOCX/TXT file
+2. Click "Process Document"
+3. Ask questions about the content
+4. Get answers with source citations

app/document_processor.py ADDED Viewed

	@@ -0,0 +1,48 @@

+from typing import List
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_core.documents import Document
+import PyPDF2
+from docx import Document as DocxDocument
+class DocumentProcessor:
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+        )
+    def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
+        """Split text into chunks"""
+        # Create documents with metadata
+        return self.text_splitter.create_documents(
+            [text],
+            metadatas=[{"source": file_path, "type": doc_type}],
+        )
+    def process_pdf(self, file_path: str) -> List[Document]:
+        """Extract text from a PDF file and split it into chunks"""
+        reader = PyPDF2.PdfReader(file_path)
+        text = ""
+        for page_num, page in enumerate(reader.pages):
+            page_text = page.extract_text()
+            if page_text:
+                text += f"\n---- Page {page_num + 1} ----\n{page_text}"
+        return self._chunk_text(file_path, text, "pdf")
+    def process_docx(self, file_path: str) -> List[Document]:
+        """Extract text from a DOCX file and split it into chunks"""
+        doc = DocxDocument(file_path)
+        text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+        return self._chunk_text(file_path, text, "docx")
+    def process_txt(self, file_path: str) -> List[Document]:
+        """Process raw text into chunks"""
+        with open(file_path, "r", encoding="utf-8") as file:
+            text = file.read()
+        return self._chunk_text(file_path, text, "txt")

app/main.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import gradio as gr
+from rag_pipeline import RAGPipeline
+from document_processor import DocumentProcessor
+import os
+class DocumentRagApp:
+    def __init__(self):
+        self.processor = DocumentProcessor()
+        self.rag_pipeline = RAGPipeline()
+        self.loaded_documents = []
+    def process_document(self, file):
+        """Process uplaoded document and add to RAG"""
+        if file is None:
+            return "Please upload a file."
+        try:
+            file_path = file.name
+            file_name = os.path.basename(file_path)
+            file_ext = os.path.splitext(file_path)[1].lower()
+            # Check file type and process the file based on its extension:
+            if file_ext == ".pdf":
+                chunks = self.processor.process_pdf(file_path)
+            elif file_ext == ".txt":
+                chunks = self.processor.process_txt(file_path)
+            elif file_ext == ".docx":
+                chunks = self.processor.process_docx(file_path)
+            else:
+                return "Unsupported file type. Please upload a PDF, TXT, or DOCX file."
+            self.rag_pipeline.add_documents(chunks)
+            self.loaded_documents.append(file_name)
+            return f"Processed {len(chunks)} chunks from '{file_name}'"
+        except Exception as e:
+            return f"Error processing file: {str(e)}"
+    def ask_question(self, question):
+        if not self.loaded_documents:
+            return "Please upload and process a document before asking questions."
+        if not question.strip():
+            return "Please enter a question."
+        try:
+            result = self.rag_pipeline.query(question)
+            answer = result["answer"]
+            return answer
+            # sources = result["sources"]
+            # source_response = ""
+            # for i, doc in enumerate(sources[:3], start=1):
+            # src_name = doc.metadata.get("source", "Unknown Source")
+            # content_preview = doc.page_content[:100] + "..."
+            # source_response += f"\n{i}. {src_name}\n '{content_preview}'\n"
+            # source_response += f"\n{i}. {content_preview}\n"
+            # return answer, source_response
+        except Exception as e:
+            return f"Error answering question: {str(e)}"
+# Initialize gradio App
+app = DocumentRagApp()
+# Create Gradio Interface
+with gr.Blocks(title="AI Document QA System") as demo:
+    gr.Markdown("AI Document QA System")
+    gr.Markdown(
+        "Uploade documents (PDF, DOCX, TXT) and talk to it with simple questions. Powered by RAG + LangChain."
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### 1. Upload a Document")
+            file_upload = gr.File(
+                label="Upload Document", file_types=[".pdf", ".docx", ".txt"]
+            )
+            process_btn = gr.Button("Process Document", variant="primary")
+            process_response = gr.Textbox(label="Processing Status", lines=2)
+            gr.Markdown("### 2. Ask Questions")
+            question_input = gr.Textbox(
+                label="Your Question",
+                placeholder="Ask a question about the document...",
+                lines=2,
+            )
+            ask_btn = gr.Button("Ask", variant="primary")
+        with gr.Column(scale=2):
+            gr.Markdown("### 3. Answer")
+            answer_output = gr.Markdown(container=True, min_height="480px")
+            # sources_output = gr.Markdown(
+            #     label="Sources", container=True, min_height="120px"
+            # )
+        # Connect all functions
+        process_btn.click(
+            fn=app.process_document, inputs=[file_upload], outputs=[process_response]
+        )
+        ask_btn.click(
+            fn=app.ask_question,
+            inputs=[question_input],
+            outputs=[answer_output],
+            # outputs=[answer_output, sources_output],
+        )
+if __name__ == "__main__":
+    demo.launch(share=True)

app/rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from langchain_chroma import Chroma
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_ollama import OllamaLLM
+from langchain_core.prompts import PromptTemplate
+from langchain_core.documents import Document
+from langchain_core.runnables import RunnableParallel, RunnablePassthrough
+from typing import List
+class RAGPipeline:
+    def __init__(self, persist_directory: str = "./data/chroma_db"):
+        #Initialize embeddings
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+        )
+        #Initialize vector store
+        self.vector_store = Chroma(
+            persist_directory=persist_directory,
+            embedding_function=self.embeddings,
+        )
+        #Initialize LLM
+        self.llm = OllamaLLM(model="gemma3:latest")
+        # Create RAG chain
+        self.rag_chain = self.create_rag_chain()
+    def create_rag_chain(self):
+        """Create RAG chain"""
+        prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template="""
+            Use the following pieces of retrieved context to answer the question at the end.
+            You are an helpful assistant, so if you don't know the answer, just say that you don't know.
+            Do not hallucinate. Do not make up information. Do not guess. Do not lie.
+            Use factual information to answer the question. Verify the information you provide.
+            Prettify your answer with markdown formatting.".
+            Context: {context}
+            Question: {question}
+            Answer:
+            """
+        )
+        retriever = self.vector_store.as_retriever(search_kwargs={"k": 4})
+        rag_chain = RunnableParallel(
+            {
+                "result": (
+                    {"context": retriever, "question": RunnablePassthrough()}
+                    | prompt
+                    | self.llm
+                ),
+                "source_documents": retriever,
+            }
+        )
+        return rag_chain
+    def add_documents(self, documents: List[Document]) -> None:
+        """Add documents to the vector store"""
+        self.vector_store.add_documents(documents)
+        # In newer versions of langchain-chroma, persist() is no longer needed
+        # as documents are automatically persisted when added
+    def query(self, question: str):
+        """Query the RAG pipeline with a question"""
+        # Get answer from chain
+        # try:
+        #     answer = self.rag_chain.invoke({"question": question})
+        # except TypeError:
+        answer = self.rag_chain.invoke(question)
+        return {
+            "answer": answer["result"],
+            "sources": answer["source_documents"]
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,42 @@

+chromadb==1.3.4
+fastapi==0.121.2
+gradio==5.49.1
+gradio-client==1.13.3
+huggingface-hub==0.36.0
+jinja2==3.1.6
+joblib==1.5.2
+langchain==1.0.7
+langchain-chroma==1.0.0
+langchain-classic==1.0.0
+langchain-community==0.4.1
+langchain-core==1.0.5
+langchain-huggingface==1.0.1
+langchain-ollama==1.0.0
+langchain-text-splitters==1.0.0
+langgraph==1.0.3
+langgraph-checkpoint==3.0.1
+langgraph-prebuilt==1.0.4
+langgraph-sdk==0.2.9
+langsmith==0.4.43
+markdown-it-py==4.0.0
+numpy==2.3.5
+oauthlib==3.3.1
+ollama==0.6.1
+pandas==2.3.3
+pillow==11.3.0
+pip==25.3
+pygments==2.19.2
+pypdf2==3.0.1
+python-docx==1.2.0
+python-dotenv==1.2.1
+regex==2025.11.3
+requests==2.32.5
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+ruff==0.14.5
+scikit-learn==1.7.2
+scipy==1.16.3
+sqlalchemy==2.0.44
+tqdm==4.67.1
+transformers==4.57.1
+urllib3==2.3.0

tests/test_document_prrocessor.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from app.document_processor import DocumentProcessor
+processor = DocumentProcessor()
+pdf_path = "data/test.pdf"
+chunks = processor.process_pdf(pdf_path)
+print(f"Created {len(chunks)} chunks")
+print(f"First chunk: {chunks[0].page_content[:100]}...")
+print(f"Metadata: {chunks[0].metadata}")

tests/test_rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Test rag pipeline
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from app.rag_pipeline import RAGPipeline
+from app.document_processor import DocumentProcessor
+processor = DocumentProcessor()
+# chunks = processor.process_pdf("./data/test.pdf")
+test_doc = processor.process_txt(
+    """
+    Python is a high-level programming language.
+    It was created by Guido van Rossum in 1991.
+    Python is known for its simple syntax.,
+    test_python.txt
+    """
+)
+# Initialize Rag
+rag_pipeline = RAGPipeline()
+rag_pipeline.add_documents(test_doc)
+# Query
+question = "What is python known for?"
+result = rag_pipeline.query(question)
+print(f"Answer: {result['answer']}")
+print(f"Sources: {len(result['sources'])} chunks retrieved.")