Muzenda-K committed on
Commit
40ab55e
·
0 Parent(s):

Fresh initial commit

Browse files
Files changed (13) hide show
  1. .gitattributes +2 -0
  2. .gitignore +1 -0
  3. LICENSE.md +19 -0
  4. README.md +77 -0
  5. app (1).py +162 -0
  6. chatpdf_app.py +47 -0
  7. demo.gif +3 -0
  8. pdf_utils.py +18 -0
  9. project enhancements.md +10 -0
  10. requirements.txt +48 -0
  11. sample.pdf +3 -0
  12. tiny_llama.py +38 -0
  13. vector_store.py +19 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.gif filter=lfs diff=lfs merge=lfs -text
2
+ *.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.ipynb
LICENSE.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2025 Muzenda-K
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF Chatbot with LLaMA
2
+
3
+ ![Python 3.12](https://img.shields.io/badge/python-3.12-blue?logo=python&logoColor=white)
4
+ ![License](https://img.shields.io/badge/license-MIT-green)
5
+ ![Streamlit](https://img.shields.io/badge/Streamlit-1.45.1-FF4B4B)
6
+ ![PyTorch](https://img.shields.io/badge/PyTorch-2.7.0-EE4C2C)
7
+ ![Transformers](https://img.shields.io/badge/Transformers-4.52.4-yellow)
8
+
9
+ ## Overview
10
+
11
+ A powerful PDF chatbot application that allows users to upload PDF documents and ask questions about their content. Built with Streamlit for the frontend and leveraging LLaMA-based models for natural language processing, this application provides an intuitive interface for document-based question answering.
12
+
13
+ ## Features
14
+
15
+ 📄 Upload and process PDF documents
16
+
17
+ 💬 Chat interface for asking questions about document content
18
+
19
+ ⚡ Fast response generation using LLaMA-based models
20
+
21
+ 🧠 Context-aware answers based on document content
22
+
23
+ 🎨 Clean, user-friendly interface
24
+
25
+ 🔍 Sample PDF with demo questions included
26
+
27
+ ## Installation
28
+
29
+ 1. Clone the repository:
30
+
31
+ ```bash
32
+ git clone https://github.com/Muzenda-K/PDF-Chatbot.git
33
+ cd pdf-chatbot
34
+ ```
35
+
36
+ 2. Create and activate a virtual environment (recommended):
37
+
38
+ ```bash
39
+ python -m venv venv
40
+ source venv/bin/activate # On Windows use `venv\Scripts\activate`
41
+ ```
42
+
43
+ 3. Install the required dependencies:
44
+
45
+ ```bash
46
+ pip install -r requirements.txt
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ 1. Run the Streamlit application:
52
+
53
+ ```bash
54
+ streamlit run "app (1).py"
55
+ ```
56
+
57
+ 2. The application will open in your default browser at `http://localhost:8501`
58
+ 3. Either use the provided sample PDF or upload your own document
59
+ 4. Start asking questions about the document content
60
+
61
+ ## Project demo
62
+
63
+ ![Demo](demo.gif)
64
+
65
+ ## Contributing
66
+
67
+ Contributions are welcome! Please follow these steps:
68
+
69
+ 1. Fork the repository
70
+ 2. Create your feature branch (git checkout -b feature/AmazingFeature)
71
+ 3. Commit your changes (git commit -m 'Add some AmazingFeature')
72
+ 4. Push to the branch (git push origin feature/AmazingFeature)
73
+ 5. Open a Pull Request
74
+
75
+ ## License
76
+
77
+ Distributed under the MIT License.
app (1).py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit PDF chatbot with a bundled sample-document demo.

On first load the app indexes ``sample.pdf`` and answers one canned question
so visitors see the workflow immediately; uploading a PDF replaces the sample.
"""

import html

import streamlit as st
from pdf_utils import extract_text, chunk_text
from vector_store import build_index
from tiny_llama import answer_query

SAMPLE_PDF_PATH = "sample.pdf"
SAMPLE_QUESTION = "What is this document about?"

st.set_page_config(page_title="PDF Chatbot", page_icon="📄")
st.title("📄 Chat with your PDF (LLaMA-based)")

# ---------------------- Initialize Session State ----------------------
# One defaults table instead of seven near-identical membership guards.
_SESSION_DEFAULTS = {
    "messages": [],             # chat history: {"role", "content", "is_sample"}
    "index": None,              # vector index for the active PDF
    "chunks": None,             # raw text chunks backing the index
    "pending_question": None,   # question awaiting an answer after rerun
    "pdf_name": None,           # display name of the active PDF
    "using_sample": False,      # True while the bundled sample PDF is active
    "sample_processed": False,  # guards the one-shot sample demo
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# ---------------------- Sample PDF Load ----------------------
# Run the demo exactly once, only while no PDF (sample or uploaded) is active.
# NOTE: `is None` rather than truthiness — a valid-but-falsy index object
# would otherwise retrigger this branch.
if (st.session_state.index is None
        and st.session_state.pdf_name is None
        and not st.session_state.sample_processed):
    with st.spinner("Loading sample PDF and preparing demo..."):
        try:
            with open(SAMPLE_PDF_PATH, "rb") as f:
                text = extract_text(f)
            if text:
                chunks = chunk_text(text)
                index, _ = build_index(chunks)

                st.session_state.index = index
                st.session_state.chunks = chunks
                st.session_state.pdf_name = "Sample PDF"
                st.session_state.using_sample = True
                st.session_state.sample_processed = True
                st.session_state.messages = []

                # Seed the chat with the canned demo question.
                st.session_state.messages.append({
                    "role": "user",
                    "content": SAMPLE_QUESTION,
                    "is_sample": True,
                })

                # Generate an actual answer from the sample PDF.
                with st.spinner("Generating sample answer..."):
                    answer = answer_query(SAMPLE_QUESTION, index, chunks)
                    if not answer:
                        answer = "I couldn't generate an answer from this document."

                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer,
                    "is_sample": True,
                })
            else:
                # Mark processed so an unextractable sample is not re-parsed
                # (and the warning not re-raised) on every rerun.
                st.session_state.sample_processed = True
                st.warning("Could not extract text from sample PDF.")
        except FileNotFoundError:
            st.warning("Sample PDF not found. Please upload your own.")
        except Exception as e:
            st.error(f"Error loading sample PDF: {str(e)}")

# ---------------------- PDF Upload ----------------------
uploaded = st.file_uploader("Upload your PDF", type=["pdf"])
if uploaded is not None:
    # Reprocess only when a different file arrives; Streamlit reruns this
    # script on every interaction and re-indexing is expensive.
    if st.session_state.pdf_name != uploaded.name:
        with st.spinner("Processing uploaded PDF..."):
            try:
                text = extract_text(uploaded)
                if text:
                    chunks = chunk_text(text)
                    index, _ = build_index(chunks)

                    st.session_state.index = index
                    st.session_state.chunks = chunks
                    st.session_state.messages = []
                    st.session_state.pdf_name = uploaded.name
                    st.session_state.using_sample = False
                    st.success(f"Uploaded: {uploaded.name}. You can now chat!")
                else:
                    st.warning("Could not extract text from uploaded PDF. It might be scanned or encrypted.")
            except Exception as e:
                st.error(f"Error processing uploaded PDF: {str(e)}")

# ---------------------- Display Messages ----------------------
if st.session_state.pdf_name:
    st.subheader(f"Chatting with: {st.session_state.pdf_name}")

for msg in st.session_state.messages:
    role = "🧑 You" if msg["role"] == "user" else "🤖 Assistant"

    # Sample messages get a styled card; escape the content because it is
    # interpolated into raw HTML (unsafe_allow_html).
    if msg.get("is_sample", False):
        safe_content = html.escape(str(msg["content"]))
        st.markdown(f"""
        <div style="
            background-color: #f0f2f6;
            padding: 10px;
            border-radius: 10px;
            margin-bottom: 10px;
        ">
            <strong>{role}:</strong> {safe_content}
        </div>
        """, unsafe_allow_html=True)
    else:
        st.markdown(f"**{role}:** {msg['content']}")

# ---------------------- User Input ----------------------
if st.session_state.index is not None and st.session_state.pdf_name:
    user_input = st.chat_input("Ask a question about this PDF")
    if user_input:
        st.session_state.messages.append({
            "role": "user",
            "content": user_input,
            "is_sample": False,
        })
        # Defer generation to the next run so the question renders first.
        st.session_state.pending_question = user_input
        st.rerun()

# ---------------------- Answer Generation ----------------------
if st.session_state.pending_question and st.session_state.index is not None:
    with st.spinner("Thinking..."):
        try:
            answer = answer_query(
                st.session_state.pending_question,
                st.session_state.index,
                st.session_state.chunks,
            )
            if not answer:
                answer = "Sorry, I couldn't generate an answer for that question."
        except Exception as e:
            answer = f"An error occurred while generating the answer: {str(e)}"

    st.session_state.messages.append({
        "role": "assistant",
        "content": answer,
        "is_sample": False,
    })
    st.session_state.pending_question = None
    st.rerun()

# ---------------------- Help Text ----------------------
if st.session_state.using_sample:
    st.markdown("""
    <div style="
        background-color: #e6f7ff;
        padding: 15px;
        border-radius: 10px;
        margin-top: 20px;
    ">
        ℹ️ <strong>How this works:</strong> This is a sample PDF demonstrating the chatbot.
        The question above was automatically generated from the sample document.
        Upload your own PDF to ask questions about your specific documents.
    </div>
    """, unsafe_allow_html=True)
chatpdf_app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Minimal Streamlit PDF chatbot: upload a PDF, then chat about its content."""

import streamlit as st
from pdf_utils import extract_text, chunk_text
from vector_store import build_index
from tiny_llama import answer_query

st.set_page_config(page_title="PDF Chatbot", page_icon="📄")
st.title("📄 Chat with your PDF (LLaMA-based)")

# Initialize session state
if "messages" not in st.session_state:
    st.session_state.messages = []          # chat history dicts: {"role", "content"}
if "index" not in st.session_state:
    st.session_state.index = None           # vector index for the current PDF
if "chunks" not in st.session_state:
    st.session_state.chunks = None          # text chunks backing the index
if "pdf_name" not in st.session_state:
    st.session_state.pdf_name = None        # name of the PDF already indexed

uploaded = st.file_uploader("Upload a PDF", type=["pdf"])

# Index the PDF only when a *new* file arrives. Streamlit re-executes this
# script on every interaction, so the original unconditional branch
# re-extracted and re-embedded the whole document on every chat message.
if uploaded is not None and uploaded.name != st.session_state.pdf_name:
    with st.spinner("Processing PDF..."):
        try:
            text = extract_text(uploaded)
            if text:
                chunks = chunk_text(text)
                index, _ = build_index(chunks)
                st.session_state.index = index
                st.session_state.chunks = chunks
                st.session_state.pdf_name = uploaded.name
                st.session_state.messages = []  # fresh chat for a fresh document
                st.success("PDF processed. Ask me anything!")
            else:
                st.warning("Could not extract text from this PDF. It might be scanned or encrypted.")
        except Exception as e:
            st.error(f"Error processing PDF: {str(e)}")

# Display chat messages
for msg in st.session_state.messages:
    role = "🧑 You" if msg["role"] == "user" else "🤖 Assistant"
    st.markdown(f"**{role}:** {msg['content']}")

# Chat input
if st.session_state.index is not None:
    question = st.chat_input("Ask a question about your PDF")
    if question:
        # Append user's question to chat history
        st.session_state.messages.append({"role": "user", "content": question})

        # Get answer from the LLM
        with st.spinner("Thinking..."):
            response = answer_query(question, st.session_state.index, st.session_state.chunks)
        st.session_state.messages.append({"role": "assistant", "content": response})

        # Rerun to show updated chat
        st.rerun()
demo.gif ADDED

Git LFS Details

  • SHA256: 717cf701b2953a867f8506af2f1b9398e5252d442ed1b9285c97f7f47c453e8a
  • Pointer size: 131 Bytes
  • Size of remote file: 713 kB
pdf_utils.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python
# coding: utf-8
"""PDF helpers: text extraction and chunking for the chatbot pipeline."""

import PyPDF2


def extract_text(file_stream):
    """Return the text of every page of a PDF joined with newlines.

    Parameters
    ----------
    file_stream : file-like object
        A binary stream (or anything ``PyPDF2.PdfReader`` accepts).

    Returns
    -------
    str
        Per-page extracted text joined with ``"\\n"``. Image-only/scanned
        pages contribute an empty string instead of breaking the join.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # Defensive: extract_text() yields empty text for image-only pages (and
    # None in some PyPDF2 versions); ``or ""`` keeps str.join from raising.
    return "\n".join((page.extract_text() or "") for page in reader.pages)
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into overlapping fixed-size chunks.

    Consecutive chunks share ``overlap`` characters of context so that
    sentences spanning a chunk boundary are still retrievable.

    Parameters
    ----------
    text : str
        The text to split. An empty string yields an empty list.
    chunk_size : int
        Maximum length of each chunk (the last chunk may be shorter).
    overlap : int
        Characters shared between consecutive chunks; must be < chunk_size.

    Returns
    -------
    list[str]

    Raises
    ------
    ValueError
        If ``overlap >= chunk_size``. The original code silently returned
        ``[]`` (negative step) or raised a cryptic ``range()`` error
        (zero step) in that case.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    return [text[i:i + chunk_size] for i in range(0, len(text), step)]
project enhancements.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ 🔧 Upgrades & Enhancements
2
+ Use LLaMA‑3 (e.g., 8B) for better performance.
3
+
4
+ Switch to LangChain RetrievalQA and Chroma or Qdrant for vector storage.
5
+
6
+ Add chat history display, caching, UI enhancements (highlighting source text).
7
+
8
+ Add file previews, multi-PDF support.
9
+
10
+ Containerize with Docker.
requirements.txt ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ accelerate==1.7.0
3
+ aiofiles==24.1.0
4
+ aiohttp==3.12.11
5
+ bitsandbytes==0.46.0
6
+ dataclasses-json==0.6.7
7
+ datasets==3.6.0
8
+ faiss-cpu==1.11.0
9
+ fastapi==0.115.12
10
+ filelock==3.13.1
11
+ frozenlist==1.6.2
12
+ fsspec==2025.3.0
13
+ gradio==5.33.0
14
+ gradio_client==1.10.2
15
+ huggingface-hub==0.32.4
16
+ langchain==0.3.25
17
+ langchain-community==0.3.24
18
+ langchain-core==0.3.64
19
+ langchain-text-splitters==0.3.8
20
+ markdown-it-py==3.0.0
21
+ multidict==6.4.4
22
+ numpy==2.2.6
23
+ openai==1.30.1
24
+ orjson==3.10.18
25
+ packaging==24.2
26
+ peft==0.15.2
27
+ PyPDF2==3.0.1
28
+ python-dotenv==1.1.0
29
+ python-multipart==0.0.20
30
+ regex==2024.11.6
31
+ requests==2.31.0
32
+ safetensors==0.5.3
33
+ sentence-transformers==4.1.0
34
+ sentencepiece==0.2.0
35
+ streamlit==1.45.1
36
+ tenacity==9.1.2
37
+ tokenizers==0.21.1
38
+ torch==2.7.0
39
+ torchaudio==2.7.0
40
+ torchvision==0.22.0
41
+ transformers==4.52.4
42
+ triton==3.3.0
43
+ typer==0.16.0
44
+ typing_extensions==4.11.0
45
+ uvicorn==0.34.3
46
+ websockets==15.0.1
47
+ xxhash==3.5.0
48
+ yarl==1.20.0
sample.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef0298c98084ec572463c1c9bc838471205afcb947c34ae31e92eb59d27bdebd
3
+ size 416028
tiny_llama.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# TinyLlama chat model and tokenizer, loaded once at import time so every
# answer_query() call reuses the same weights.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Half precision when a GPU is available; full float32 on CPU.
_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=_dtype,
    device_map="auto",
)
def answer_query(question, index, chunks, top_k=3):
    """Answer *question* using chunks retrieved from the vector *index*.

    Parameters
    ----------
    question : str
        The user's question.
    index : LangChain FAISS vector store
        Must support ``similarity_search(query, k=...)``.
    chunks : list[str]
        Unused here (retrieval goes through *index*); kept so the call
        signature matches existing callers.
    top_k : int
        Number of chunks retrieved as context.

    Returns
    -------
    str
        The model's generated answer, stripped of surrounding whitespace.
    """
    # Retrieve top-k most relevant chunks
    docs = index.similarity_search(question, k=top_k)
    context = "\n".join(doc.page_content for doc in docs)

    # TinyLlama chat-format prompt
    prompt = f"<|system|>\nYou are a helpful assistant.\n<|user|>\n{context}\n\nQuestion: {question}\n<|assistant|>\n"

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
        )

    # BUG FIX: the original decoded the *entire* sequence with
    # skip_special_tokens=True and then sliced off len(prompt) characters.
    # Skipping special tokens (e.g. <s>, the <|...|> markers) shifts the
    # character offsets, so the slice cut the answer at the wrong place.
    # Slice the token ids instead: everything after the prompt tokens is
    # exactly the generated answer.
    prompt_len = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
vector_store.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# vector_store.py
"""FAISS vector-store construction for the PDF chatbot."""

# FIX: the bare ``langchain.vectorstores`` / ``langchain.embeddings`` /
# ``langchain.docstore`` paths were removed in langchain 0.2+, so with the
# pinned langchain==0.3.x these imports raise ImportError. The classes now
# live in langchain-community / langchain-core (both pinned in
# requirements.txt).
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
import faiss  # noqa: F401  -- not referenced here; FAISS above uses it internally

# Cached embedding model so repeated build_index() calls don't reload the
# sentence-transformer weights from disk each time.
_embedding_model = None


def _get_embedding_model():
    """Lazily load and cache the shared sentence-transformer embedder."""
    global _embedding_model
    if _embedding_model is None:
        # You can replace this with any sentence transformer you prefer
        _embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
    return _embedding_model


def build_index(chunks):
    """Build a FAISS index over text *chunks*.

    Parameters
    ----------
    chunks : iterable[str]
        Text chunks to embed and index.

    Returns
    -------
    tuple
        ``(vector_index, embedding_model)`` — the LangChain-wrapped FAISS
        store and the embedding model used to build it.
    """
    # Convert string chunks to Document objects
    documents = [Document(page_content=chunk) for chunk in chunks]
    embedding_model = _get_embedding_model()
    # Create FAISS index wrapped with LangChain
    vector_index = FAISS.from_documents(documents, embedding_model)
    return vector_index, embedding_model