pkgprateek committed on
Commit
a864c4e
·
1 Parent(s): 765b8d8

OpenRouter Added, Rate Limiting Fixed

Browse files
.env.example ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Environment Variables
2
+
3
+ # OpenRouter API Key (Required)
4
+ # Get your FREE key at: https://openrouter.ai/keys
5
+ # Using free tier with google/gemma-3-4b-it:free model
6
+ OPENROUTER_API_KEY=your_openrouter_api_key_here
.gitignore CHANGED
@@ -1,5 +1,5 @@
1
  .DS_Store
2
  __pycache__
3
- data
4
  .gradio
5
- data/chroma_db/
 
 
1
  .DS_Store
2
  __pycache__
 
3
  .gradio
4
+ data/
5
+ .env
README.md CHANGED
@@ -10,44 +10,68 @@ pinned: false
10
  ---
11
 
12
  # AI Document Intelligence System
13
- Upload documents and ask questions. Built with:
14
- - LangChain for RAG orchestration
15
- - ChromaDB for vector storage
16
- - Sentence Transformers for embeddings
17
- - Gradio for UI
 
 
18
 
19
  ## Features
20
- - Interactive document processing
21
- - Context-aware question answering
22
- - Support for multiple file formats
23
- - Real-time processing and analysis
24
- - Multi-language support
25
- - Customizable knowledge base
26
-
27
- ## Installation
28
- To get started with the AI Document Intelligence System, follow these steps:
29
- 1. Clone the repository:
30
- ```bash
31
- git clone https://github.com/yourusername/ai-document-intelligence.git
32
- cd ai-document-intelligence
33
- ```
34
- 2. Create a virtual environment and activate it:
35
- ```bash
36
- python -m venv venv
37
- source venv/bin/activate # On Windows, use `venv\Scripts\activate`
38
- ```
39
- 3. Install the required dependencies:
40
- ```bash
41
- pip install -r requirements.txt
42
- ```
43
- 4. Run the application:
44
- ```bash
45
- python app/main.py
46
- ```
47
- 5. Open your web browser and navigate to the provided local URL to access the Gradio interface.
48
-
49
- ## Usage:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  1. Upload a PDF/DOCX/TXT file
51
  2. Click "Process Document"
52
- 3. Ask questions about the content
53
- 4. Get answers with source citations
 
 
 
 
 
 
 
10
  ---
11
 
12
  # AI Document Intelligence System
13
+
14
+ Upload documents and ask questions using advanced RAG (Retrieval-Augmented Generation) technology. Built with:
15
+ - **LangChain** for RAG orchestration
16
+ - **ChromaDB** for vector storage
17
+ - **BAAI/bge-small-en-v1.5** embeddings for superior retrieval quality
18
+ - **Google Gemma 3 (4B, free tier)** via the OpenRouter API
19
+ - **Gradio** for interactive UI
20
 
21
  ## Features
22
+ - Interactive document processing (PDF, DOCX, TXT)
23
+ - Context-aware question answering with improved embeddings
24
+ - Real-time processing and analysis
25
+ - Source citation for transparency
26
+ - Cloud-ready deployment on HuggingFace Spaces
27
+
28
+ ## Setup
29
+
30
+ ### 1. Get an OpenRouter API Key
31
+ 1. Create a free account at [OpenRouter](https://openrouter.ai/)
32
+ 2. Go to [Keys](https://openrouter.ai/keys)
33
+ 3. Create a new API key
34
+ 4. Copy the key
35
+
36
+ ### 2. Local Installation
37
+
38
+ ```bash
39
+ # Clone the repository
40
+ git clone https://github.com/pkgprateek/ai-rag-document.git
41
+ cd ai-rag-document
42
+
43
+ # Create virtual environment
44
+ python -m venv venv
45
+ source venv/bin/activate # On Windows: venv\Scripts\activate
46
+
47
+ # Install dependencies
48
+ pip install -r requirements.txt
49
+
50
+ # Set up environment variables
51
+ cp .env.example .env
52
+ # Edit .env and set OPENROUTER_API_KEY
53
+
54
+ # Run the application
55
+ python app/main.py
56
+ ```
57
+
58
+ ### 3. Deploy to HuggingFace Spaces
59
+
60
+ 1. **Fork or upload this repo to HuggingFace Spaces**
61
+ 2. **Add your OpenRouter API key as a Space Secret:**
62
+ - Go to your Space Settings → Repository secrets
63
+ - Add a new secret: `OPENROUTER_API_KEY` = your key
64
+ 3. **Your app will automatically deploy!**
65
+
66
+ ## Usage
67
+
68
  1. Upload a PDF/DOCX/TXT file
69
  2. Click "Process Document"
70
+ 3. Ask a question about the content to get an answer formatted in markdown
71
+
72
+ ## Technical Details
73
+
74
+ - **Embeddings**: BAAI/bge-small-en-v1.5 (significantly better than all-MiniLM-L6-v2)
75
+ - **LLM**: google/gemma-3-4b-it:free via the OpenRouter API (OpenAI-compatible endpoint)
76
+ - **Vector Store**: ChromaDB with persistent storage
77
+ - **Chunking**: Smart text splitting with overlap for context preservation
app/document_processor.py CHANGED
@@ -7,15 +7,31 @@ from docx import Document as DocxDocument
7
 
8
  class DocumentProcessor:
9
  def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
 
 
 
 
 
 
 
10
  self.text_splitter = RecursiveCharacterTextSplitter(
11
  chunk_size=chunk_size,
12
  chunk_overlap=chunk_overlap,
13
  length_function=len,
14
  )
15
 
16
-
17
  def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
18
- """Split text into chunks"""
 
 
 
 
 
 
 
 
 
 
19
  # Create documents with metadata
20
  return self.text_splitter.create_documents(
21
  [text],
@@ -23,7 +39,15 @@ class DocumentProcessor:
23
  )
24
 
25
  def process_pdf(self, file_path: str) -> List[Document]:
26
- """Extract text from a PDF file and split it into chunks"""
 
 
 
 
 
 
 
 
27
  reader = PyPDF2.PdfReader(file_path)
28
  text = ""
29
  for page_num, page in enumerate(reader.pages):
@@ -32,17 +56,30 @@ class DocumentProcessor:
32
  text += f"\n---- Page {page_num + 1} ----\n{page_text}"
33
  return self._chunk_text(file_path, text, "pdf")
34
 
35
-
36
  def process_docx(self, file_path: str) -> List[Document]:
37
- """Extract text from a DOCX file and split it into chunks"""
 
 
 
 
 
 
 
 
38
  doc = DocxDocument(file_path)
39
  text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
40
  return self._chunk_text(file_path, text, "docx")
41
-
42
 
43
  def process_txt(self, file_path: str) -> List[Document]:
44
- """Process raw text into chunks"""
 
 
 
 
 
 
 
 
45
  with open(file_path, "r", encoding="utf-8") as file:
46
  text = file.read()
47
  return self._chunk_text(file_path, text, "txt")
48
-
 
7
 
8
  class DocumentProcessor:
9
  def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
10
+ """
11
+ Initialize document processor with text splitting configuration.
12
+
13
+ Args:
14
+ chunk_size: Maximum characters per chunk (default: 1000)
15
+ chunk_overlap: Characters to overlap between chunks (default: 200)
16
+ """
17
  self.text_splitter = RecursiveCharacterTextSplitter(
18
  chunk_size=chunk_size,
19
  chunk_overlap=chunk_overlap,
20
  length_function=len,
21
  )
22
 
 
23
  def _chunk_text(self, file_path: str, text: str, doc_type: str) -> List[Document]:
24
+ """
25
+ Split text into overlapping chunks with metadata for better retrieval.
26
+
27
+ Args:
28
+ file_path: Original file path for metadata
29
+ text: Text content to split
30
+ doc_type: Document type (pdf/docx/txt)
31
+
32
+ Returns:
33
+ List[Document]: Chunked documents with metadata
34
+ """
35
  # Create documents with metadata
36
  return self.text_splitter.create_documents(
37
  [text],
 
39
  )
40
 
41
  def process_pdf(self, file_path: str) -> List[Document]:
42
+ """
43
+ Extract text from PDF file and convert to chunked documents.
44
+
45
+ Args:
46
+ file_path: Path to PDF file
47
+
48
+ Returns:
49
+ List[Document]: Processed document chunks
50
+ """
51
  reader = PyPDF2.PdfReader(file_path)
52
  text = ""
53
  for page_num, page in enumerate(reader.pages):
 
56
  text += f"\n---- Page {page_num + 1} ----\n{page_text}"
57
  return self._chunk_text(file_path, text, "pdf")
58
 
 
59
  def process_docx(self, file_path: str) -> List[Document]:
60
+ """
61
+ Extract text from DOCX file and convert to chunked documents.
62
+
63
+ Args:
64
+ file_path: Path to DOCX file
65
+
66
+ Returns:
67
+ List[Document]: Processed document chunks
68
+ """
69
  doc = DocxDocument(file_path)
70
  text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
71
  return self._chunk_text(file_path, text, "docx")
 
72
 
73
  def process_txt(self, file_path: str) -> List[Document]:
74
+ """
75
+ Read text file and convert to chunked documents.
76
+
77
+ Args:
78
+ file_path: Path to TXT file
79
+
80
+ Returns:
81
+ List[Document]: Processed document chunks
82
+ """
83
  with open(file_path, "r", encoding="utf-8") as file:
84
  text = file.read()
85
  return self._chunk_text(file_path, text, "txt")
 
app/main.py CHANGED
@@ -2,16 +2,32 @@ import gradio as gr
2
  from rag_pipeline import RAGPipeline
3
  from document_processor import DocumentProcessor
4
  import os
 
 
 
 
5
 
6
 
7
  class DocumentRagApp:
8
  def __init__(self):
 
 
 
 
9
  self.processor = DocumentProcessor()
10
  self.rag_pipeline = RAGPipeline()
11
  self.loaded_documents = []
12
 
13
  def process_document(self, file):
14
- """Process uplaoded document and add to RAG"""
 
 
 
 
 
 
 
 
15
  if file is None:
16
  return "Please upload a file."
17
  try:
@@ -36,6 +52,15 @@ class DocumentRagApp:
36
  return f"Error processing file: {str(e)}"
37
 
38
  def ask_question(self, question):
 
 
 
 
 
 
 
 
 
39
  if not self.loaded_documents:
40
  return "Please upload and process a document before asking questions."
41
 
@@ -46,14 +71,6 @@ class DocumentRagApp:
46
  result = self.rag_pipeline.query(question)
47
  answer = result["answer"]
48
  return answer
49
- # sources = result["sources"]
50
- # source_response = ""
51
- # for i, doc in enumerate(sources[:3], start=1):
52
- # src_name = doc.metadata.get("source", "Unknown Source")
53
- # content_preview = doc.page_content[:100] + "..."
54
- # source_response += f"\n{i}. {src_name}\n '{content_preview}'\n"
55
- # source_response += f"\n{i}. {content_preview}\n"
56
- # return answer, source_response
57
  except Exception as e:
58
  return f"Error answering question: {str(e)}"
59
 
@@ -87,11 +104,7 @@ with gr.Blocks(title="AI Document QA System") as demo:
87
 
88
  with gr.Column(scale=2):
89
  gr.Markdown("### 3. Answer")
90
-
91
  answer_output = gr.Markdown(container=True, min_height="480px")
92
- # sources_output = gr.Markdown(
93
- # label="Sources", container=True, min_height="120px"
94
- # )
95
 
96
  # Connect all functions
97
  process_btn.click(
@@ -102,8 +115,7 @@ with gr.Blocks(title="AI Document QA System") as demo:
102
  fn=app.ask_question,
103
  inputs=[question_input],
104
  outputs=[answer_output],
105
- # outputs=[answer_output, sources_output],
106
  )
107
 
108
  if __name__ == "__main__":
109
- demo.launch(share=True)
 
2
  from rag_pipeline import RAGPipeline
3
  from document_processor import DocumentProcessor
4
  import os
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
 
10
 
11
  class DocumentRagApp:
12
  def __init__(self):
13
+ """
14
+ Initialize Document RAG application with processor and pipeline.
15
+ Loads environment variables and sets up components.
16
+ """
17
  self.processor = DocumentProcessor()
18
  self.rag_pipeline = RAGPipeline()
19
  self.loaded_documents = []
20
 
21
  def process_document(self, file):
22
+ """
23
+ Process uploaded document (PDF/DOCX/TXT) and add to RAG system.
24
+
25
+ Args:
26
+ file: Gradio file upload object
27
+
28
+ Returns:
29
+ str: Status message with processing results or error
30
+ """
31
  if file is None:
32
  return "Please upload a file."
33
  try:
 
52
  return f"Error processing file: {str(e)}"
53
 
54
  def ask_question(self, question):
55
+ """
56
+ Answer user question using RAG pipeline with rate limiting.
57
+
58
+ Args:
59
+ question: User's question string
60
+
61
+ Returns:
62
+ str: Generated answer or error message
63
+ """
64
  if not self.loaded_documents:
65
  return "Please upload and process a document before asking questions."
66
 
 
71
  result = self.rag_pipeline.query(question)
72
  answer = result["answer"]
73
  return answer
 
 
 
 
 
 
 
 
74
  except Exception as e:
75
  return f"Error answering question: {str(e)}"
76
 
 
104
 
105
  with gr.Column(scale=2):
106
  gr.Markdown("### 3. Answer")
 
107
  answer_output = gr.Markdown(container=True, min_height="480px")
 
 
 
108
 
109
  # Connect all functions
110
  process_btn.click(
 
115
  fn=app.ask_question,
116
  inputs=[question_input],
117
  outputs=[answer_output],
 
118
  )
119
 
120
  if __name__ == "__main__":
121
+ demo.launch(share=False)
app/rag_pipeline.py CHANGED
@@ -1,76 +1,177 @@
1
  from langchain_chroma import Chroma
2
  from langchain_huggingface import HuggingFaceEmbeddings
3
- from langchain_ollama import OllamaLLM
4
  from langchain_core.prompts import PromptTemplate
5
  from langchain_core.documents import Document
6
  from langchain_core.runnables import RunnableParallel, RunnablePassthrough
7
  from typing import List
 
 
 
 
 
 
 
 
8
 
9
  class RAGPipeline:
10
  def __init__(self, persist_directory: str = "./data/chroma_db"):
11
- #Initialize embeddings
 
 
 
 
 
 
 
12
  self.embeddings = HuggingFaceEmbeddings(
13
- model_name="sentence-transformers/all-MiniLM-L6-v2",
 
 
14
  )
15
- #Initialize vector store
 
16
  self.vector_store = Chroma(
17
  persist_directory=persist_directory,
18
  embedding_function=self.embeddings,
19
  )
20
- #Initialize LLM
21
- self.llm = OllamaLLM(model="gemma3:latest")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Create RAG chain
24
  self.rag_chain = self.create_rag_chain()
25
 
26
  def create_rag_chain(self):
27
- """Create RAG chain"""
 
 
 
 
 
28
  prompt = PromptTemplate(
29
  input_variables=["context", "question"],
30
- template="""
31
- Use the following pieces of retrieved context to answer the question at the end.
32
- You are an helpful assistant, so if you don't know the answer, just say that you don't know.
33
- Do not hallucinate. Do not make up information. Do not guess. Do not lie.
34
- Use factual information to answer the question. Verify the information you provide.
35
- Prettify your answer with markdown formatting.".
36
-
37
  Context: {context}
38
 
39
  Question: {question}
40
 
41
- Answer:
42
- """
43
  )
44
- retriever = self.vector_store.as_retriever(search_kwargs={"k": 4})
 
 
 
 
45
  rag_chain = RunnableParallel(
46
  {
47
  "result": (
48
  {"context": retriever, "question": RunnablePassthrough()}
49
  | prompt
50
  | self.llm
51
- ),
52
  "source_documents": retriever,
53
  }
54
  )
55
  return rag_chain
56
-
57
 
58
  def add_documents(self, documents: List[Document]) -> None:
59
- """Add documents to the vector store"""
 
 
 
 
 
60
  self.vector_store.add_documents(documents)
61
  # In newer versions of langchain-chroma, persist() is no longer needed
62
  # as documents are automatically persisted when added
63
 
64
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def query(self, question: str):
66
- """Query the RAG pipeline with a question"""
67
- # Get answer from chain
68
- # try:
69
- # answer = self.rag_chain.invoke({"question": question})
70
- # except TypeError:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  answer = self.rag_chain.invoke(question)
72
-
73
- return {
74
- "answer": answer["result"],
75
- "sources": answer["source_documents"]
76
- }
 
 
 
 
 
 
 
 
 
1
  from langchain_chroma import Chroma
2
  from langchain_huggingface import HuggingFaceEmbeddings
3
+ from langchain_openai import ChatOpenAI
4
  from langchain_core.prompts import PromptTemplate
5
  from langchain_core.documents import Document
6
  from langchain_core.runnables import RunnableParallel, RunnablePassthrough
7
  from typing import List
8
+ import os
9
+ from datetime import datetime, timedelta
10
+ import json
11
+ from pathlib import Path
12
+
13
+ # Fix tokenizer warning
14
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
15
+
16
 
class RAGPipeline:
    """
    Retrieval-augmented question answering over a persistent Chroma store.

    Text is embedded with BAAI/bge-small-en-v1.5, the top matching chunks are
    retrieved for each question, and the prompt is sent to the
    google/gemma-3-4b-it:free model through OpenRouter's OpenAI-compatible
    endpoint. Queries are throttled using a JSON log of recent timestamps.
    """

    # Maximum number of queries accepted within any rolling one-hour window.
    MAX_QUERIES_PER_HOUR = 10

    def __init__(self, persist_directory: str = "./data/chroma_db"):
        """
        Initialize RAG pipeline with embeddings, vector store, and LLM.

        Args:
            persist_directory: Path to store ChromaDB vector database
                (default: ./data/chroma_db)

        Raises:
            ValueError: If the OPENROUTER_API_KEY environment variable is not set
        """
        # Validate configuration first so a missing key fails immediately,
        # before the embedding model is loaded or any directory is created.
        openrouter_key = os.getenv("OPENROUTER_API_KEY")
        if not openrouter_key:
            raise ValueError(
                "OPENROUTER_API_KEY environment variable not set. "
                "Get one free at https://openrouter.ai/keys"
            )

        # Initialize embeddings (BAAI/bge-small-en-v1.5)
        self.embeddings = HuggingFaceEmbeddings(
            model_name="BAAI/bge-small-en-v1.5",
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},  # Important for bge models
        )

        # Initialize vector store
        self.vector_store = Chroma(
            persist_directory=persist_directory,
            embedding_function=self.embeddings,
        )

        # Rate limiting setup: timestamps of recent queries live in this file
        self.rate_limit_file = Path("./data/rate_limit.json")
        self.rate_limit_file.parent.mkdir(parents=True, exist_ok=True)

        # Using google/gemma-3-4b-it:free - free tier on OpenRouter
        self.llm = ChatOpenAI(
            model="google/gemma-3-4b-it:free",
            openai_api_key=openrouter_key,
            openai_api_base="https://openrouter.ai/api/v1",
            temperature=0.1,
            max_tokens=512,
        )

        # Create RAG chain
        self.rag_chain = self.create_rag_chain()

    def create_rag_chain(self):
        """
        Creates the RAG chain by combining retriever, prompt template, and LLM.

        Returns:
            RunnableParallel: Chain producing "result" (the LLM output) and
            "source_documents" (the retrieved chunks)
        """
        # Built from explicit pieces so the prompt text does not depend on how
        # this source file happens to be indented.
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=(
                "Answer the question based on the context below. "
                'If you cannot answer based on the context, say "I don\'t know".\n'
                "Do not hallucinate. Do not make up information.\n"
                "Format your answer using markdown for better readability.\n"
                "\n"
                "Context: {context}\n"
                "\n"
                "Question: {question}\n"
                "\n"
                "Provide a clear and concise answer:"
            ),
        )

        retriever = self.vector_store.as_retriever(
            search_kwargs={"k": 4}  # Retrieve top 4 most relevant chunks
        )

        rag_chain = RunnableParallel(
            {
                "result": (
                    {"context": retriever, "question": RunnablePassthrough()}
                    | prompt
                    | self.llm
                ),
                "source_documents": retriever,
            }
        )
        return rag_chain

    def add_documents(self, documents: List[Document]) -> None:
        """
        Add processed document chunks to the vector store for retrieval.

        Args:
            documents: List of Document objects with text and metadata
        """
        self.vector_store.add_documents(documents)
        # In newer versions of langchain-chroma, persist() is no longer needed
        # as documents are automatically persisted when added

    def _load_query_log(self) -> List[datetime]:
        """
        Read the stored query timestamps from the rate-limit file.

        Returns:
            List[datetime]: Parsed timestamps; an empty list when the file is
            missing, unreadable, or does not contain the expected structure
        """
        try:
            with open(self.rate_limit_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            raw = data.get("queries", []) if isinstance(data, dict) else []
            return [datetime.fromisoformat(stamp) for stamp in raw]
        except (OSError, ValueError, TypeError):
            # A missing or corrupt log (e.g. truncated by an interrupted write)
            # must not block every future query, so start from an empty history.
            return []

    def _save_query_log(self, queries: List[datetime]) -> None:
        """
        Persist query timestamps to the rate-limit file.

        Args:
            queries: Timestamps to store, written as ISO-8601 strings
        """
        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a half-written log behind.
        tmp_path = self.rate_limit_file.with_name(self.rate_limit_file.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({"queries": [q.isoformat() for q in queries]}, f)
        os.replace(tmp_path, self.rate_limit_file)

    def _check_rate_limit(self) -> bool:
        """
        Enforces the hourly query limit by tracking query timestamps.

        When the query is allowed, its timestamp is recorded before returning.

        Returns:
            bool: True if within limit, False if exceeded
        """
        now = datetime.now()
        one_hour_ago = now - timedelta(hours=1)

        # Keep only the queries that still count against the rolling window
        recent_queries = [q for q in self._load_query_log() if q > one_hour_ago]

        if len(recent_queries) >= self.MAX_QUERIES_PER_HOUR:
            return False

        recent_queries.append(now)
        self._save_query_log(recent_queries)
        return True

    def query(self, question: str):
        """
        Query the RAG system with a question: retrieve context and generate an answer.

        The rate-limit slot is taken before the chain runs, so a query that
        fails inside the chain still counts against the limit.

        Args:
            question: User's question string

        Returns:
            dict: {"answer": str} containing the generated response

        Raises:
            ValueError: If the hourly rate limit is exceeded
        """
        if not self._check_rate_limit():
            raise ValueError(
                f"Rate limit exceeded. You can only ask {self.MAX_QUERIES_PER_HOUR} "
                "questions per hour. Please try again later."
            )

        answer = self.rag_chain.invoke(question)
        result = answer["result"]

        # Chat models return a message object; fall back to str() for anything else
        if hasattr(result, "content"):
            answer_text = result.content
        elif hasattr(result, "text"):
            answer_text = result.text
        else:
            answer_text = str(result)

        # Guard the .strip() below against non-string payloads
        if answer_text is not None and not isinstance(answer_text, str):
            answer_text = str(answer_text)

        # Check if answer is empty
        if not answer_text or answer_text.strip() == "":
            answer_text = "I apologize, but I couldn't generate a response. Please try rephrasing your question."
        return {"answer": answer_text}
requirements.txt CHANGED
@@ -1,43 +1,27 @@
1
- chromadb==1.3.4
2
- fastapi==0.121.2
3
  gradio==5.49.1
4
- gradio-client==1.13.3
5
- huggingface-hub==0.36.0
6
- jinja2==3.1.6
7
- joblib==1.5.2
8
  langchain==1.0.7
9
- langchain-chroma==1.0.0
10
- langchain-classic==1.0.0
11
- langchain-community==0.4.1
12
  langchain-core==1.0.5
13
- langchain-huggingface==1.0.1
14
- langchain-ollama==1.0.0
15
  langchain-text-splitters==1.0.0
16
- langgraph==1.0.3
17
- langgraph-checkpoint==3.0.1
18
- langgraph-prebuilt==1.0.4
19
- langgraph-sdk==0.2.9
20
- langsmith==0.4.43
21
- markdown-it-py==4.0.0
22
- numpy>=2.0
23
- oauthlib==3.3.1
24
- ollama==0.6.1
25
- pandas==2.3.3
26
- pillow==11.3.0
27
- pip==25.3
28
- pygments==2.19.2
 
 
29
  pypdf2==3.0.1
30
  python-docx==1.2.0
31
- python-dotenv==1.2.1
32
- regex==2025.11.3
33
- requests==2.32.5
34
- requests-oauthlib==2.0.0
35
- requests-toolbelt==1.0.0
36
- ruff==0.14.5
37
- scikit-learn==1.7.2
38
- scipy>=1.0
39
- sqlalchemy>=2.0
40
- tqdm>=4.0
41
- transformers>=4.0
42
- urllib3>=2.0
43
- sentence-transformers>=5.0
 
1
+ # Core App Framework
 
2
  gradio==5.49.1
3
+
4
+ # LangChain Core
 
 
5
  langchain==1.0.7
 
 
 
6
  langchain-core==1.0.5
 
 
7
  langchain-text-splitters==1.0.0
8
+
9
+ # LangChain Integrations
10
+ langchain-chroma==1.0.0
11
+ langchain-huggingface==1.0.1
12
+ langchain-openai
13
+
14
+ # Vector Database
15
+ chromadb==1.3.4
16
+
17
+ # HuggingFace & Embeddings
18
+ huggingface-hub==0.36.0
19
+ sentence-transformers>=3.0.0
20
+ transformers>=4.0
21
+
22
+ # Document Processing
23
  pypdf2==3.0.1
24
  python-docx==1.2.0
25
+
26
+ # Environment & Config
27
+ python-dotenv==1.2.1
 
 
 
 
 
 
 
 
 
 
tests/experiments.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Experimental code for testing RAG pipeline
2
+ import sys
3
+ import os
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
+
7
+ from app.rag_pipeline import RAGPipeline
8
+ from app.document_processor import DocumentProcessor
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+
# Example 1: Simple text test
def test_simple_query():
    """
    Index a three-sentence sample text and ask one question about it.

    The pipeline is pointed at a throwaway vector-store directory so the
    sample chunks are not added to the application's persistent store on
    every run. Prints the question and the generated answer.
    """
    import tempfile

    processor = DocumentProcessor()

    test_doc = "\n".join(
        [
            "Python is a high-level programming language.",
            "It was created by Guido van Rossum in 1991.",
            "Python is known for its simple syntax.",
        ]
    )

    chunks = processor._chunk_text("test_doc.txt", test_doc, doc_type="txt")

    # ignore_cleanup_errors: the store may still hold files open at exit
    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as store_dir:
        # Initialize RAG against the temporary store
        rag_pipeline = RAGPipeline(persist_directory=store_dir)
        rag_pipeline.add_documents(chunks)

        # Query
        question = "What is python known for?"
        result = rag_pipeline.query(question)
        print(f"Question: {question}")
        print(f"Answer: {result['answer']}")
        print("\n" + "=" * 50 + "\n")
34
+
35
+
# Example 2: Testing with actual document
def test_with_pdf(pdf_path: str = "path/to/your/test.pdf"):
    """
    Index a PDF and ask a generic question about it.

    Args:
        pdf_path: Path to the PDF to index; the default is a placeholder
            that must be replaced with a real file
    """
    import tempfile

    # Check the input first: building the pipeline loads the embedding model
    # and requires an API key, which is wasted work when there is no file.
    if not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return

    processor = DocumentProcessor()
    chunks = processor.process_pdf(pdf_path)

    # Use a throwaway store so experiment data does not accumulate in the
    # application's persistent vector database.
    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as store_dir:
        rag_pipeline = RAGPipeline(persist_directory=store_dir)
        rag_pipeline.add_documents(chunks)

        question = "What is the main topic of this document?"
        result = rag_pipeline.query(question)
        print(f"Question: {question}")
        print(f"Answer: {result['answer']}")
53
+
54
+
# Example 3: Interactive testing
def interactive_test():
    """
    Run a question/answer loop on the console over a small sample text.

    The loop ends when the user types 'quit', when input is closed
    (EOF / Ctrl-C), or when the pipeline raises ValueError, which is how it
    reports an exhausted rate limit. Blank input is ignored so it does not
    use up a rate-limit slot.
    """
    import tempfile

    processor = DocumentProcessor()

    # Add some test content
    test_doc = "\n".join(
        [
            "Artificial Intelligence (AI) is transforming the world.",
            "Machine learning is a subset of AI that focuses on learning from data.",
            "Deep learning uses neural networks with multiple layers.",
            "Natural Language Processing (NLP) helps computers understand human language.",
        ]
    )
    chunks = processor._chunk_text("ai_basics.txt", test_doc, doc_type="txt")

    # Throwaway store: keeps sample chunks out of the persistent database
    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as store_dir:
        rag_pipeline = RAGPipeline(persist_directory=store_dir)
        rag_pipeline.add_documents(chunks)

        print("Interactive RAG Testing")
        print("Type 'quit' to exit\n")

        while True:
            try:
                question = input("Your question: ").strip()
            except (EOFError, KeyboardInterrupt):
                print()
                break

            if not question:
                continue
            if question.lower() == "quit":
                break

            try:
                result = rag_pipeline.query(question)
            except ValueError as e:
                print(f"Error: {e}\n")
                break
            print(f"Answer: {result['answer']}\n")
83
+
84
+
85
+ if __name__ == "__main__":
86
+ print("Running RAG Pipeline Experiments\n")
87
+
88
+ # Run simple test
89
+ test_simple_query()
90
+
91
+ # Uncomment to run other tests
92
+ # test_with_pdf()
93
+ # interactive_test()
tests/test_rag_pipeline.py CHANGED
@@ -1,29 +1,43 @@
1
  # Test rag pipeline
2
  import sys
3
  import os
 
4
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
5
 
6
  from app.rag_pipeline import RAGPipeline
7
  from app.document_processor import DocumentProcessor
 
 
 
8
 
9
 
10
  processor = DocumentProcessor()
11
  # chunks = processor.process_pdf("./data/test.pdf")
12
- test_doc = processor.process_txt(
13
- """
14
- Python is a high-level programming language.
15
  It was created by Guido van Rossum in 1991.
16
  Python is known for its simple syntax.,
17
- test_python.txt
18
- """
19
- )
20
 
21
- # Initialize Rag
22
  rag_pipeline = RAGPipeline()
23
- rag_pipeline.add_documents(test_doc)
24
 
25
  # Query
26
  question = "What is python known for?"
27
  result = rag_pipeline.query(question)
28
  print(f"Answer: {result['answer']}")
29
- print(f"Sources: {len(result['sources'])} chunks retrieved.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Test rag pipeline
2
  import sys
3
  import os
4
+
5
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
 
7
  from app.rag_pipeline import RAGPipeline
8
  from app.document_processor import DocumentProcessor
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
 
13
 
# Smoke test: chunk a short sample text, index it, and ask one question.
processor = DocumentProcessor()
# chunks = processor.process_pdf("./data/test.pdf")

# The sample text is passed as text (not a file path); the file name is
# supplied separately as the chunk's source label.
test_doc = "\n".join(
    [
        "Python is a high-level programming language.",
        "It was created by Guido van Rossum in 1991.",
        "Python is known for its simple syntax.",
    ]
)
chunks = processor._chunk_text("test_python.txt", test_doc, doc_type="txt")

# Initialize the RAG pipeline and index the chunks
rag_pipeline = RAGPipeline()
rag_pipeline.add_documents(chunks)

# Query
question = "What is python known for?"
result = rag_pipeline.query(question)
print(f"Answer: {result['answer']}")
30
+
31
+
32
+ # Format sources with page numbers
33
+ # sources = result["sources_formatted"]
34
+ # source_info = []
35
+ # for i, doc in enumerate(sources, 1):
36
+ # source_file = doc.metadata.get("source", "Unknown")
37
+ # # Extract just filename
38
+ # source_name = source_file.split("/")[-1] if "/" in source_file else source_file
39
+ # page_preview = doc.page_content[:100].replace("\n", " ")
40
+ # source_info.append(f"**[{i}]** {source_name}\n> {page_preview}...")
41
+
42
+ # sources_text = "\n\n".join(source_info) if source_info else "No sources found"
43
+ # print(f"Sources: {sources_text}")