pkgprateek committed on
Commit
bb58af7
·
0 Parent(s):

Clean snapshot for Hugging Face Space (no large files)

Browse files
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .DS_Store
2
+ __pycache__
3
+ data
4
+ .gradio
5
+ data/chroma_db/
README.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: AI Document Intelligence System (with RAG)
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.49.1
8
+ app_file: app/main.py
9
+ pinned: false
10
+ ---
11
+
12
+ # AI Document Intelligence System
13
+ Upload documents and ask questions. Built with:
14
+ - LangChain for RAG orchestration
15
+ - ChromaDB for vector storage
16
+ - Sentence Transformers for embeddings
17
+ - Gradio for UI
18
+
19
+ ## Features
20
+ - Interactive document processing
21
+ - Context-aware question answering
22
+ - Support for multiple file formats
23
+ - Real-time processing and analysis
24
+ - Multi-language support
25
+ - Customizable knowledge base
26
+
27
+ ## Installation
28
+ To get started with the AI Document Intelligence System, follow these steps:
29
+ 1. Clone the repository:
30
+ ```bash
31
+ git clone https://github.com/yourusername/ai-document-intelligence.git
32
+ cd ai-document-intelligence
33
+ ```
34
+ 2. Create a virtual environment and activate it:
35
+ ```bash
36
+ python -m venv venv
37
+ source venv/bin/activate # On Windows, use `venv\Scripts\activate`
38
+ ```
39
+ 3. Install the required dependencies:
40
+ ```bash
41
+ pip install -r requirements.txt
42
+ ```
43
+ 4. Run the application:
44
+ ```bash
45
+ python app/main.py
46
+ ```
47
+ 5. Open your web browser and navigate to the provided local URL to access the Gradio interface.
48
+
49
+ ## Usage:
50
+ 1. Upload a PDF/DOCX/TXT file
51
+ 2. Click "Process Document"
52
+ 3. Ask questions about the content
53
+ 4. Get answers with source citations
app/document_processor.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ from langchain_core.documents import Document
4
+ import PyPDF2
5
+ from docx import Document as DocxDocument
6
+
7
+
8
class DocumentProcessor:
    """Extract text from PDF, DOCX and TXT files and split it into chunks.

    Each ``process_*`` method reads one file, turns it into a single string
    and hands that string to the shared text splitter. The resulting chunks
    carry ``source`` (the path that was passed in) and ``type`` (``"pdf"``,
    ``"docx"`` or ``"txt"``) in their metadata.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the text splitter used by every ``process_*`` method.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``; lengths
                are measured with ``len``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
        """Split ``text`` into chunks tagged with their source path and type.

        Args:
            file_path: Path stored under the ``source`` metadata key.
            text: Full text of the document.
            doc_type: Value stored under the ``type`` metadata key.

        Returns:
            The documents produced by the splitter for ``text``.
        """
        return self.text_splitter.create_documents(
            [text],
            metadatas=[{"source": file_path, "type": doc_type}],
        )

    def process_pdf(self, file_path: str) -> List[Document]:
        """Extract text from a PDF file and split it into chunks.

        Pages for which no text is extracted are skipped; every remaining
        page is prefixed with a ``---- Page N ----`` marker (1-based).
        """
        reader = PyPDF2.PdfReader(file_path)
        # Collect the per-page sections and join once instead of growing a
        # string with "+=" on every page.
        page_sections = []
        for page_number, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if page_text:
                page_sections.append(f"\n---- Page {page_number} ----\n{page_text}")
        return self._chunk_text(file_path, "".join(page_sections), "pdf")

    def process_docx(self, file_path: str) -> List[Document]:
        """Extract paragraph text from a DOCX file and split it into chunks."""
        doc = DocxDocument(file_path)
        text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
        return self._chunk_text(file_path, text, "docx")

    def process_txt(self, file_path: str) -> List[Document]:
        """Read a UTF-8 text file and split its content into chunks.

        Note that ``file_path`` is a path on disk, not the text itself.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        return self._chunk_text(file_path, text, "txt")
48
+
app/main.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from rag_pipeline import RAGPipeline
3
+ from document_processor import DocumentProcessor
4
+ import os
5
+
6
+
7
class DocumentRagApp:
    """Callbacks that connect the Gradio UI to document processing and RAG.

    Both callbacks return a plain string that the UI displays, including
    for validation failures and errors.
    """

    # Maps a lower-cased file extension to the name of the processor method
    # that handles it.
    _HANDLERS = {
        ".pdf": "process_pdf",
        ".txt": "process_txt",
        ".docx": "process_docx",
    }

    def __init__(self):
        self.processor = DocumentProcessor()
        self.rag_pipeline = RAGPipeline()
        # Base names of the files that were processed successfully.
        self.loaded_documents = []

    def process_document(self, file):
        """Process an uploaded document and add its chunks to the RAG pipeline.

        Args:
            file: The uploaded file, either an object exposing the path as
                ``.name`` or the path itself; ``None`` when nothing was
                uploaded.

        Returns:
            A status message describing the outcome.
        """
        if file is None:
            return "Please upload a file."
        try:
            # Accept both a file-like object with ``.name`` and a bare path.
            file_path = getattr(file, "name", file)
            file_name = os.path.basename(file_path)
            file_ext = os.path.splitext(file_path)[1].lower()

            handler_name = self._HANDLERS.get(file_ext)
            if handler_name is None:
                return "Unsupported file type. Please upload a PDF, TXT, or DOCX file."

            chunks = getattr(self.processor, handler_name)(file_path)
            if not chunks:
                # Nothing to index: do not register the file as loaded,
                # otherwise questions would run against an empty document.
                return f"No text could be extracted from '{file_name}'."

            self.rag_pipeline.add_documents(chunks)
            self.loaded_documents.append(file_name)
            return f"Processed {len(chunks)} chunks from '{file_name}'"
        except Exception as e:
            # UI boundary: report the failure to the user instead of raising.
            return f"Error processing file: {str(e)}"

    def ask_question(self, question):
        """Answer ``question`` using the documents processed so far.

        Args:
            question: The user's question; ``None`` and blank strings are
                rejected with a prompt to enter a question.

        Returns:
            The answer text, or a message explaining why no answer was
            produced.
        """
        if not self.loaded_documents:
            return "Please upload and process a document before asking questions."

        if not question or not question.strip():
            return "Please enter a question."

        try:
            result = self.rag_pipeline.query(question)
            return result["answer"]
        except Exception as e:
            # UI boundary: report the failure to the user instead of raising.
            return f"Error answering question: {str(e)}"
59
+
60
+
61
# Application state shared by every Gradio callback below.
app = DocumentRagApp()

# Build the Gradio interface: upload/process and question input on the left,
# the rendered answer on the right.
with gr.Blocks(title="AI Document QA System") as demo:
    gr.Markdown("AI Document QA System")
    gr.Markdown(
        "Upload documents (PDF, DOCX, TXT) and talk to them with simple questions. Powered by RAG + LangChain."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Upload a Document")
            file_upload = gr.File(
                label="Upload Document", file_types=[".pdf", ".docx", ".txt"]
            )
            process_btn = gr.Button("Process Document", variant="primary")
            process_response = gr.Textbox(label="Processing Status", lines=2)

            gr.Markdown("### 2. Ask Questions")
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="Ask a question about the document...",
                lines=2,
            )
            ask_btn = gr.Button("Ask", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("### 3. Answer")

            answer_output = gr.Markdown(container=True, min_height="480px")

    # Wire the buttons to the application callbacks.
    process_btn.click(
        fn=app.process_document, inputs=[file_upload], outputs=[process_response]
    )

    ask_btn.click(
        fn=app.ask_question,
        inputs=[question_input],
        outputs=[answer_output],
    )

if __name__ == "__main__":
    # share=True asks Gradio for a public share link in addition to the
    # local server.
    demo.launch(share=True)
app/rag_pipeline.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_chroma import Chroma
2
+ from langchain_huggingface import HuggingFaceEmbeddings
3
+ from langchain_ollama import OllamaLLM
4
+ from langchain_core.prompts import PromptTemplate
5
+ from langchain_core.documents import Document
6
+ from langchain_core.runnables import RunnableParallel, RunnablePassthrough
7
+ from typing import List
8
+
9
class RAGPipeline:
    """Question answering over documents stored in a Chroma vector store.

    Documents are embedded with a sentence-transformers model, retrieved by
    similarity to the question, and passed as context to an Ollama-served
    LLM.
    """

    def __init__(self, persist_directory: str = "./data/chroma_db"):
        """Set up embeddings, vector store, LLM and the RAG chain.

        Args:
            persist_directory: Directory handed to Chroma as its
                ``persist_directory``.
        """
        # Initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
        )
        # Initialize vector store
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings,
        )
        # Initialize LLM
        self.llm = OllamaLLM(model="gemma3:latest")

        # Create RAG chain
        self.rag_chain = self.create_rag_chain()

    @staticmethod
    def _format_docs(documents) -> str:
        """Join the text of the retrieved documents into one context string."""
        return "\n\n".join(doc.page_content for doc in documents)

    def create_rag_chain(self):
        """Build the chain that maps a question to an answer plus its sources.

        The chain is invoked with the question string and yields a dict with
        the keys ``"source_documents"``, ``"question"`` and ``"result"``.
        """
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=(
                "\n"
                "Use the following pieces of retrieved context to answer the question at the end.\n"
                "You are a helpful assistant, so if you don't know the answer, just say that you don't know.\n"
                "Do not hallucinate. Do not make up information. Do not guess. Do not lie.\n"
                "Use factual information to answer the question. Verify the information you provide.\n"
                "Prettify your answer with markdown formatting.\n"
                "\n"
                "Context: {context}\n"
                "\n"
                "Question: {question}\n"
                "\n"
                "Answer:\n"
            ),
        )
        retriever = self.vector_store.as_retriever(search_kwargs={"k": 4})

        # Consumes the dict produced by the retrieval step below, so the
        # documents shown as sources are exactly the ones the LLM saw.
        answer_chain = (
            {
                "context": lambda inputs: self._format_docs(inputs["source_documents"]),
                "question": lambda inputs: inputs["question"],
            }
            | prompt
            | self.llm
        )

        # Retrieve once, then add the generated answer under "result".
        rag_chain = RunnableParallel(
            {"source_documents": retriever, "question": RunnablePassthrough()}
        ).assign(result=answer_chain)
        return rag_chain

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store."""
        # No explicit persist() call here; see the langchain-chroma docs for
        # how a store created with persist_directory is saved.
        self.vector_store.add_documents(documents)

    def query(self, question: str):
        """Query the RAG pipeline with a question.

        Args:
            question: The question text, passed to the chain as-is.

        Returns:
            A dict with ``"answer"`` (the LLM output) and ``"sources"`` (the
            retrieved documents).
        """
        answer = self.rag_chain.invoke(question)

        return {
            "answer": answer["result"],
            "sources": answer["source_documents"],
        }
requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chromadb==1.3.4
2
+ fastapi==0.121.2
3
+ gradio==5.49.1
4
+ gradio-client==1.13.3
5
+ huggingface-hub==0.36.0
6
+ jinja2==3.1.6
7
+ joblib==1.5.2
8
+ langchain==1.0.7
9
+ langchain-chroma==1.0.0
10
+ langchain-classic==1.0.0
11
+ langchain-community==0.4.1
12
+ langchain-core==1.0.5
13
+ langchain-huggingface==1.0.1
14
+ langchain-ollama==1.0.0
15
+ langchain-text-splitters==1.0.0
16
+ langgraph==1.0.3
17
+ langgraph-checkpoint==3.0.1
18
+ langgraph-prebuilt==1.0.4
19
+ langgraph-sdk==0.2.9
20
+ langsmith==0.4.43
21
+ markdown-it-py==4.0.0
22
+ numpy==2.3.5
23
+ oauthlib==3.3.1
24
+ ollama==0.6.1
25
+ pandas==2.3.3
26
+ pillow==11.3.0
27
+ pip==25.3
28
+ pygments==2.19.2
29
+ pypdf2==3.0.1
30
+ python-docx==1.2.0
31
+ python-dotenv==1.2.1
32
+ regex==2025.11.3
33
+ requests==2.32.5
34
+ requests-oauthlib==2.0.0
35
+ requests-toolbelt==1.0.0
36
+ ruff==0.14.5
37
+ scikit-learn==1.7.2
38
+ scipy==1.16.3
39
+ sqlalchemy==2.0.44
40
+ tqdm==4.67.1
41
+ transformers==4.57.1
42
+ urllib3==2.3.0
tests/test_document_prrocessor.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Manual smoke test: chunk a sample PDF and print a short summary.
import os
import sys

# Make the repository root importable when this file is run directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.document_processor import DocumentProcessor

processor = DocumentProcessor()

pdf_path = "data/test.pdf"
chunks = processor.process_pdf(pdf_path)

print(f"Created {len(chunks)} chunks")
if chunks:
    print(f"First chunk: {chunks[0].page_content[:100]}...")
    print(f"Metadata: {chunks[0].metadata}")
else:
    # Avoid an IndexError when the PDF yields no extractable text.
    print(f"No text could be extracted from {pdf_path}")
tests/test_rag_pipeline.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Manual smoke test for the RAG pipeline: index a small text and ask about it.
import os
import sys
import tempfile

# Make the repository root importable when this file is run directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.rag_pipeline import RAGPipeline
from app.document_processor import DocumentProcessor


SAMPLE_TEXT = """
Python is a high-level programming language.
It was created by Guido van Rossum in 1991.
Python is known for its simple syntax.
"""

processor = DocumentProcessor()

# process_txt() takes a file path, not the text itself, so write the sample
# text to a temporary file first.
with tempfile.TemporaryDirectory() as tmp_dir:
    sample_path = os.path.join(tmp_dir, "test_python.txt")
    with open(sample_path, "w", encoding="utf-8") as sample_file:
        sample_file.write(SAMPLE_TEXT)
    test_doc = processor.process_txt(sample_path)

# Initialize RAG
rag_pipeline = RAGPipeline()
rag_pipeline.add_documents(test_doc)

# Query
question = "What is python known for?"
result = rag_pipeline.query(question)
print(f"Answer: {result['answer']}")
print(f"Sources: {len(result['sources'])} chunks retrieved.")