Spaces:
Running
Running
Upload 23 files
Browse files- .gitattributes +36 -35
- .gitignore +1 -0
- Dockerfile +30 -0
- Dockerfile.backend +19 -0
- Dockerfile.frontend +18 -0
- LICENSE +21 -0
- MODEL_CARD.md +20 -0
- Project.md +114 -0
- README.md +62 -7
- app.py +96 -0
- chroma.sqlite3 +3 -0
- company_policy.txt +23 -0
- data_level0.bin +3 -0
- docker-compose.yml +25 -0
- error.txt +0 -0
- header.bin +3 -0
- ingest.py +69 -0
- length.bin +3 -0
- link_lists.bin +3 -0
- main.py +130 -0
- query.py +81 -0
- requirements.txt +15 -0
- run.sh +13 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,36 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"vectorstore/chroma.sqlite3"
|
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
# Install system dependencies
|
| 4 |
+
# Build toolchain needed to compile Python wheels that ship no prebuilt binary.
# --no-install-recommends keeps the package set minimal, and the apt lists are
# removed in the same layer so they never persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        libc-dev \
    && rm -rf /var/lib/apt/lists/*
|
| 5 |
+
|
| 6 |
+
# Hugging Face Spaces require applications to run as a non-root user
|
| 7 |
+
RUN useradd -m -u 1000 user
|
| 8 |
+
USER user
|
| 9 |
+
|
| 10 |
+
# Set up environment variables
|
| 11 |
+
ENV HOME=/home/user \
|
| 12 |
+
PATH=/home/user/.local/bin:$PATH
|
| 13 |
+
|
| 14 |
+
WORKDIR $HOME/app
|
| 15 |
+
|
| 16 |
+
# Copy requirements and install
|
| 17 |
+
COPY --chown=user requirements.txt .
|
| 18 |
+
RUN pip install --no-cache-dir --default-timeout=1000 -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Copy application files
|
| 21 |
+
COPY --chown=user . .
|
| 22 |
+
|
| 23 |
+
# Make start script executable
|
| 24 |
+
RUN chmod +x run.sh
|
| 25 |
+
|
| 26 |
+
# Expose Streamlit port
|
| 27 |
+
EXPOSE 8501
|
| 28 |
+
|
| 29 |
+
# Boot both API and UI using the shell script
|
| 30 |
+
CMD ["./run.sh"]
|
Dockerfile.backend
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
# Build toolchain needed to compile Python wheels that ship no prebuilt binary.
# --no-install-recommends keeps the package set minimal, and the apt lists are
# removed in the same layer so they never persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        libc-dev \
    && rm -rf /var/lib/apt/lists/*
|
| 8 |
+
|
| 9 |
+
# Ensure required directories
|
| 10 |
+
RUN mkdir -p /app/vectorstore /app/raw_documents
|
| 11 |
+
|
| 12 |
+
COPY requirements.txt .
|
| 13 |
+
RUN pip install --no-cache-dir --default-timeout=1000 -r requirements.txt
|
| 14 |
+
|
| 15 |
+
# Copy everything
|
| 16 |
+
COPY . .
|
| 17 |
+
|
| 18 |
+
# Run the FastAPI server
|
| 19 |
+
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
Dockerfile.frontend
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
# Build toolchain needed to compile Python wheels that ship no prebuilt binary.
# --no-install-recommends keeps the package set minimal, and the apt lists are
# removed in the same layer so they never persist in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        g++ \
        gcc \
        libc-dev \
    && rm -rf /var/lib/apt/lists/*
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt .
|
| 10 |
+
RUN pip install --no-cache-dir --default-timeout=1000 -r requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY . .
|
| 13 |
+
|
| 14 |
+
# Expose Streamlit port
|
| 15 |
+
EXPOSE 8501
|
| 16 |
+
|
| 17 |
+
# Run the Streamlit app
|
| 18 |
+
CMD ["streamlit", "run", "frontend/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Tejesh Naidu
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
MODEL_CARD.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model Card: DocuMind Enterprise RAG System
|
| 2 |
+
|
| 3 |
+
## Model Details
|
| 4 |
+
- **Architecture**: Retrieval-Augmented Generation (RAG)
|
| 5 |
+
- **Embedding Model**: `sentence-transformers/all-MiniLM-L6-v2` (Local HuggingFace model)
|
| 6 |
+
- **Reranker Model**: `cross-encoder/ms-marco-MiniLM-L-6-v2` (Local HuggingFace model)
|
| 7 |
+
- **Generation Model**: `llama-3.1-8b-instant` (Provided remotely via Groq)
|
| 8 |
+
- **Vector Database**: ChromaDB (SQLite-backed local instance)
|
| 9 |
+
|
| 10 |
+
## Intended Use
|
| 11 |
+
This system is intended as an internal Enterprise assistant. Its primary function is to answer employee, legal, and operational inquiries by surfacing facts *strictly* from the documents provided.
|
| 12 |
+
|
| 13 |
+
## Document Parsing Capabilities
|
| 14 |
+
- **Supported Formats**: `.pdf`, `.docx`, `.txt`
|
| 15 |
+
- **Chunking Profile**: 512 characters with a 64 character overlap, prioritizing paragraph retention to prevent loss of semantic context.
|
| 16 |
+
|
| 17 |
+
## Ethical Considerations & Limitations
|
| 18 |
+
- **Hallucination Mitigation**: The generation model is strictly prompted to answer "I don't know" if the provided context does not hold the answer. All responses are emitted alongside their explicit sources.
|
| 19 |
+
- **Data Privacy**: Documents ingested remain on-device/in-network within the ChromaDB instance. However, generated requests and contexts are passed to the Groq API. For strictly confidential environments, replacing Groq with a locally hosted Llama/Mistral node is required.
|
| 20 |
+
- **Top-K Limit**: The system pulls the 5 most statistically similar chunks and uses a CrossEncoder to rerank, passing the top 3 items to the LLM. Extremely dispersed information (e.g. "summarize all 50 documents") will result in partial or missing answers.
|
Project.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 AI/ML Projects Portfolio — 2026 & Beyond
|
| 2 |
+
> A skill reference file for Claude Code. Each project is production-grade, resume-worthy, and aligned with the hottest AI/ML job market trends of 2026+.
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## How to Use This File with Claude Code
|
| 7 |
+
Drop this file into your project directory and reference it in Claude Code:
|
| 8 |
+
```
|
| 9 |
+
Claude Code, read Project.md and help me implement Project [N]: [Title]
|
| 10 |
+
```
|
| 11 |
+
Claude Code will use the step-by-step implementation guide, tech stack, and constraints defined here to scaffold, build, and deploy each project.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## 📋 Projects Index
|
| 16 |
+
|
| 17 |
+
| # | Project Title | Domain | Difficulty | Resume Weight |
|
| 18 |
+
|---|---|---|---|---|
|
| 19 |
+
| 1 | DocuMind — Enterprise RAG Chatbot | RAG + LLMs + Vector DB | ⚡ Medium | ★★★★★ |
|
| 20 |
+
|
| 21 |
+
---
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## Project 1 — DocuMind: Enterprise RAG Chatbot
|
| 26 |
+
|
| 27 |
+
### 📌 Description
|
| 28 |
+
DocuMind is a production-ready Retrieval-Augmented Generation (RAG) chatbot that answers natural language questions grounded in private enterprise documents (PDFs, DOCX, TXT files). Unlike generic chatbots, it never hallucinates — every answer is backed by retrieved source chunks with citations. Deployed as a FastAPI backend + Streamlit frontend on a cloud VM or Hugging Face Spaces.
|
| 29 |
+
|
| 30 |
+
### 🛠️ Step-by-Step Implementation
|
| 31 |
+
|
| 32 |
+
**Phase 1 — Setup & Ingestion Pipeline**
|
| 33 |
+
1. Set up project structure: `backend/`, `frontend/`, `vectorstore/`, `scripts/`
|
| 34 |
+
2. Create a document ingestion pipeline using `LangChain DocumentLoaders` to parse PDFs, DOCX, and TXT files
|
| 35 |
+
3. Implement chunking strategy — use `RecursiveCharacterTextSplitter` (chunk_size=512, overlap=64) for context preservation
|
| 36 |
+
4. Generate embeddings using `sentence-transformers/all-MiniLM-L6-v2` (free, fast) or OpenAI `text-embedding-3-small`
|
| 37 |
+
5. Store embeddings in ChromaDB (local dev) or Pinecone (production) with document metadata (filename, page, chunk_id)
|
| 38 |
+
|
| 39 |
+
**Phase 2 — Retrieval & Generation**
|
| 40 |
+
6. Build a retrieval chain: user query → embed query → cosine similarity search → top-k chunks (k=5) → pass to LLM
|
| 41 |
+
7. Implement `ReRanker` using `cross-encoder/ms-marco-MiniLM-L-6-v2` to improve chunk relevance ordering
|
| 42 |
+
8. Craft a strict RAG prompt template:
|
| 43 |
+
```
|
| 44 |
+
You are a factual assistant. Answer ONLY using the context below.
|
| 45 |
+
If the answer isn't in the context, say "I don't know."
|
| 46 |
+
Context: {context}
|
| 47 |
+
Question: {question}
|
| 48 |
+
```
|
| 49 |
+
9. Use `llama-3-8b-instruct` via Groq API (free tier) or `claude-haiku` as the LLM for generation
|
| 50 |
+
|
| 51 |
+
**Phase 3 — API & Frontend**
|
| 52 |
+
10. Build FastAPI endpoints: `POST /ingest`, `POST /query`, `GET /sources`
|
| 53 |
+
11. Add conversation memory using `ConversationBufferWindowMemory` (last 5 turns)
|
| 54 |
+
12. Build Streamlit frontend with file uploader, chat interface, and source citation panel
|
| 55 |
+
13. Add streaming response support using `StreamingResponse` in FastAPI
|
| 56 |
+
|
| 57 |
+
**Phase 4 — Deployment & Production**
|
| 58 |
+
14. Containerize with Docker (`Dockerfile` + `docker-compose.yml` for API + VectorDB)
|
| 59 |
+
15. Add logging, error handling, and rate limiting (slowapi)
|
| 60 |
+
16. Deploy to Hugging Face Spaces (Streamlit) or Railway/Render (FastAPI)
|
| 61 |
+
17. Write a Model Card documenting supported file types, known limitations, and ethical considerations
|
| 62 |
+
|
| 63 |
+
### 🌍 Real-World Coverage
|
| 64 |
+
**Why?** 80% of enterprise knowledge lives in unstructured documents. Every company with internal wikis, legal contracts, HR handbooks, or research reports needs this.
|
| 65 |
+
**How?** Legal firms (contract Q&A), HR departments (policy chatbots), hospitals (clinical guideline assistants), and SaaS companies (internal knowledge bases) all deploy RAG systems at scale.
|
| 66 |
+
|
| 67 |
+
### 🧰 Tech Stack
|
| 68 |
+
```
|
| 69 |
+
Backend: Python 3.11, FastAPI, LangChain, LangGraph
|
| 70 |
+
LLM: LLaMA 3 via Groq / Claude Haiku via Anthropic API
|
| 71 |
+
Embeddings: sentence-transformers, OpenAI Embeddings
|
| 72 |
+
Vector DB: ChromaDB (dev), Pinecone (prod)
|
| 73 |
+
ReRanking: cross-encoder (HuggingFace)
|
| 74 |
+
Frontend: Streamlit or Gradio
|
| 75 |
+
Deployment: Docker, Hugging Face Spaces, Render
|
| 76 |
+
Monitoring: LangSmith (tracing), Python logging
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### 🎯 Skills Covered
|
| 80 |
+
- RAG pipeline design (chunking, embedding, retrieval, reranking)
|
| 81 |
+
- Vector database operations (CRUD, similarity search, metadata filtering)
|
| 82 |
+
- LLM prompt engineering for factual, grounded responses
|
| 83 |
+
- FastAPI REST API development
|
| 84 |
+
- Streamlit UI development
|
| 85 |
+
- Docker containerization
|
| 86 |
+
- Production deployment with monitoring
|
| 87 |
+
|
| 88 |
+
### 📊 Resume Weight ★★★★★
|
| 89 |
+
This single project covers 4 of the top 10 hottest keywords: RAG, Vector Databases, Prompt Engineering, and LLM integration. RAG demand rose 340% since 2023. This project alone can anchor an entire interview.
|
| 90 |
+
|
| 91 |
+
### 🎚️ Difficulty ⚡ Medium
|
| 92 |
+
The building blocks (LangChain, Chroma, FastAPI) are well-documented. Challenge lies in chunk quality, retrieval tuning, and production hardening.
|
| 93 |
+
|
| 94 |
+
### 🏷️ ATS Keywords
|
| 95 |
+
`RAG`, `Retrieval-Augmented Generation`, `LangChain`, `Vector Database`, `ChromaDB`, `Pinecone`, `Semantic Search`, `Embeddings`, `FastAPI`, `LLM Integration`, `Prompt Engineering`, `Document Chunking`, `Sentence Transformers`, `Hugging Face`, `Python`, `Docker`, `Streamlit`, `Knowledge Base`, `Enterprise AI`, `NLP`
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## 📎 Using This File in Claude Code
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
# To start a project, say in Claude Code:
|
| 103 |
+
"Read Project.md and help me implement Project 1: DocuMind.
|
| 104 |
+
Start with Phase 1 and scaffold the full project structure."
|
| 105 |
+
|
| 106 |
+
# To continue:
|
| 107 |
+
"Continue with Phase 2 of Project 1 from Project.md"
|
| 108 |
+
|
| 109 |
+
# To adapt:
|
| 110 |
+
"Based on Project 3 in Project.md, modify the approach for a
|
| 111 |
+
legal document domain instead of medical, using my local GPU."
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
---
|
README.md
CHANGED
|
@@ -1,10 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
| 1 |
+
# 🧠 DocuMind: Enterprise RAG Chatbot
|
| 2 |
+
|
| 3 |
+
DocuMind is a production-ready, highly accurate Enterprise Retrieval-Augmented Generation (RAG) system. It allows organizations to ingest `.pdf`, `.docx`, and `.txt` documents and interact with them securely using a ChatGPT-style interface, reducing LLM hallucinations by enforcing strict source-grounding.
|
| 4 |
+
|
| 5 |
+
## 🌟 Key Features
|
| 6 |
+
|
| 7 |
+
- **Multi-Format Document Ingestion**: Seamlessly upload PDFs, DOCX, and TXT files locally or through the UI drag-and-drop.
|
| 8 |
+
- **High-Accuracy RAG Pipeline**: Combines dense retrieval via ChromaDB (`all-MiniLM-L6-v2`) with precision reranking via a CrossEncoder (`ms-marco-MiniLM-L-6-v2`) to pull only the 3 most strictly relevant context chunks.
|
| 9 |
+
- **Citation-Backed UI**: The Streamlit interface displays the exact ReRanker context chunks drawn from the documents, allowing users to verify LLM claims instantly.
|
| 10 |
+
- **Conversation Memory**: Maintains multi-turn context awareness seamlessly using LangChain's conversational buffer memory.
|
| 11 |
+
- **Enterprise-Ready Middleware**: API endpoints are secured with `slowapi` rate limiting.
|
| 12 |
+
- **Instant Deployment**: Fully dockerized with multi-container `docker-compose` routing, alongside a unified `Dockerfile` for HuggingFace Space hosting.
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## 🛠️ Tech Stack
|
| 17 |
+
|
| 18 |
+
- **Backend Framework**: FastAPI, Uvicorn
|
| 19 |
+
- **Frontend UI**: Streamlit
|
| 20 |
+
- **RAG Orchestrator**: LangChain
|
| 21 |
+
- **Embeddings & ReRanker**: HuggingFace `sentence-transformers`
|
| 22 |
+
- **Vector Database**: ChromaDB (Local SQLite Persistence)
|
| 23 |
+
- **Generation LLM**: LLaMA-3.1-8B (via Groq API)
|
| 24 |
+
|
| 25 |
---
|
| 26 |
+
|
| 27 |
+
## 🚀 Getting Started Locally
|
| 28 |
+
|
| 29 |
+
### 1. Requirements
|
| 30 |
+
Ensure you have Docker installed. Clone this repository and execute:
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
git clone https://your-repo-link/DocuMind.git
|
| 34 |
+
cd DocuMind
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
### 2. Configure Environment Variables
|
| 38 |
+
Create a `.env` file in the root directory and add your Groq inference key:
|
| 39 |
+
```env
|
| 40 |
+
GROQ_API_KEY=gsk_YOUR_GROQ_API_KEY_HERE
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
### 3. Spin up the Containers
|
| 44 |
+
Boot the unified backend, vector store, and frontend seamlessly using Docker Compose:
|
| 45 |
+
```bash
|
| 46 |
+
docker-compose up --build -d
|
| 47 |
+
```
|
| 48 |
+
Access the application by navigating to **`http://localhost:8501`** in your browser.
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## 🌐 Deploying to Hugging Face Spaces
|
| 53 |
+
|
| 54 |
+
This project contains a unified `Dockerfile` and `run.sh` script designed for Hugging Face Spaces: the image runs as a non-root user, and `run.sh` boots both the API and the UI inside a single container.
|
| 55 |
+
|
| 56 |
+
1. Create a New Space on Hugging Face using the **Docker** SDK (Blank Template).
|
| 57 |
+
2. Clone your Space, copy the contents of this repository into it, and `git push`.
|
| 58 |
+
3. Go to the **Settings > Variables and secrets** tab of the Space.
|
| 59 |
+
4. Create a new Secret named `GROQ_API_KEY` and provide your token.
|
| 60 |
+
5. Hugging Face will automatically execute the build and render the Streamlit app!
|
| 61 |
+
|
| 62 |
---
|
| 63 |
|
| 64 |
+
## 🔒 Security & Model Boundaries
|
| 65 |
+
Please review [MODEL_CARD.md](MODEL_CARD.md) for detailed descriptions of the document chunking logic, the LLM fallback behavior that limits false generation, and privacy constraints.
|
app.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
st.set_page_config(page_title="DocuMind - Enterprise RAG", page_icon="🧠", layout="wide")
|
| 7 |
+
|
| 8 |
+
API_URL = os.environ.get("API_URL", "http://127.0.0.1:8000")
|
| 9 |
+
|
| 10 |
+
st.title("🧠 DocuMind")
|
| 11 |
+
st.markdown("Enterprise Document Intelligence Chatbot")
|
| 12 |
+
|
| 13 |
+
# --- Sidebar ---
|
| 14 |
+
with st.sidebar:
|
| 15 |
+
st.header("🏢 Document Knowledge Base")
|
| 16 |
+
st.markdown("Upload PDFs, DOCX, or TXT documents to add them to the system.")
|
| 17 |
+
|
| 18 |
+
uploaded_file = st.file_uploader("Upload a new document", type=["txt", "pdf", "docx"])
|
| 19 |
+
if uploaded_file and st.button("Ingest Document"):
|
| 20 |
+
with st.spinner("Ingesting document (creating chunks & embeddings)..."):
|
| 21 |
+
files = {"file": (uploaded_file.name, uploaded_file.getvalue())}
|
| 22 |
+
try:
|
| 23 |
+
res = requests.post(f"{API_URL}/ingest", files=files)
|
| 24 |
+
if res.status_code == 200:
|
| 25 |
+
st.success(f"{uploaded_file.name} ingested successfully!")
|
| 26 |
+
else:
|
| 27 |
+
st.error(f"Failed to ingest: {res.text}")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
st.error(f"Backend is not running: {e}")
|
| 30 |
+
|
| 31 |
+
st.divider()
|
| 32 |
+
st.subheader("Indexed Documents")
|
| 33 |
+
try:
|
| 34 |
+
# requests has no default timeout; this call runs on every rerun of the page, so
# bound it to keep an unresponsive backend from hanging the whole UI.
res = requests.get(f"{API_URL}/sources", timeout=10)
|
| 35 |
+
if res.status_code == 200:
|
| 36 |
+
for doc in res.json().get("documents", []):
|
| 37 |
+
st.markdown(f"- 📄 `{doc}`")
|
| 38 |
+
except:
|
| 39 |
+
st.warning("Could not connect to FastAPI server.")
|
| 40 |
+
|
| 41 |
+
# --- Chat Interface ---
|
| 42 |
+
if "messages" not in st.session_state:
|
| 43 |
+
st.session_state.messages = []
|
| 44 |
+
|
| 45 |
+
# Display history
|
| 46 |
+
for msg in st.session_state.messages:
|
| 47 |
+
with st.chat_message(msg["role"]):
|
| 48 |
+
st.markdown(msg["content"])
|
| 49 |
+
if "sources" in msg and msg["sources"]:
|
| 50 |
+
with st.expander("Show Sources"):
|
| 51 |
+
for idx, src in enumerate(msg["sources"]):
|
| 52 |
+
st.caption(f"**Source {idx+1} [Relevance: {src['score']:.2f}]**: {src['source']}")
|
| 53 |
+
st.markdown(f"> {src['content']}")
|
| 54 |
+
|
| 55 |
+
if user_input := st.chat_input("Ask a question about your documents..."):
|
| 56 |
+
# Add user message
|
| 57 |
+
st.session_state.messages.append({"role": "user", "content": user_input})
|
| 58 |
+
with st.chat_message("user"):
|
| 59 |
+
st.markdown(user_input)
|
| 60 |
+
|
| 61 |
+
# Get assistant response
|
| 62 |
+
with st.chat_message("assistant"):
|
| 63 |
+
placeholder = st.empty()
|
| 64 |
+
full_response = ""
|
| 65 |
+
sources = []
|
| 66 |
+
|
| 67 |
+
try:
|
| 68 |
+
with requests.post(f"{API_URL}/query", json={"question": user_input}, stream=True) as r:
|
| 69 |
+
r.raise_for_status()
|
| 70 |
+
for line in r.iter_lines():
|
| 71 |
+
if line:
|
| 72 |
+
decoded_line = line.decode('utf-8')
|
| 73 |
+
data = json.loads(decoded_line)
|
| 74 |
+
if data["type"] == "sources":
|
| 75 |
+
sources = data["data"]
|
| 76 |
+
elif data["type"] == "token":
|
| 77 |
+
full_response += data["content"]
|
| 78 |
+
placeholder.markdown(full_response + "▌")
|
| 79 |
+
|
| 80 |
+
placeholder.markdown(full_response)
|
| 81 |
+
if sources:
|
| 82 |
+
with st.expander("Show Sources"):
|
| 83 |
+
for idx, src in enumerate(sources):
|
| 84 |
+
st.caption(f"**Source {idx+1} [Relevance: {src['score']:.2f}]**: {src['source']}")
|
| 85 |
+
st.markdown(f"> {src['content']}")
|
| 86 |
+
|
| 87 |
+
except Exception as e:
|
| 88 |
+
st.error(f"Error querying backend: {e}")
|
| 89 |
+
full_response = "Sorry, the backend encountered an error."
|
| 90 |
+
|
| 91 |
+
# Save assistant message
|
| 92 |
+
st.session_state.messages.append({
|
| 93 |
+
"role": "assistant",
|
| 94 |
+
"content": full_response,
|
| 95 |
+
"sources": sources
|
| 96 |
+
})
|
chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b105d0a6aacedc4130e619b636acdaec8ac11e9c5951dfbc236986bbed6ba729
|
| 3 |
+
size 221184
|
company_policy.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
DocuMind Enterprise AI Handbook
|
| 2 |
+
Version: 1.0
|
| 3 |
+
|
| 4 |
+
1. Introduction
|
| 5 |
+
Welcome to DocuMind! Our company is dedicated to providing enterprise-grade Retrieval-Augmented Generation (RAG) solutions. This document outlines company policies and guidelines.
|
| 6 |
+
|
| 7 |
+
2. Remote Work Policy
|
| 8 |
+
Employees are allowed to work remotely 3 days a week. Core hours are 10:00 AM to 3:00 PM EST, during which everyone is expected to be online and available for meetings. The remaining 2 days must be spent in the office.
|
| 9 |
+
|
| 10 |
+
3. Expense Policy
|
| 11 |
+
Hardware expenses up to $1000 per year are auto-approved. For software licenses, please submit a request through the internal IT portal. Any travel expenses must be pre-approved by your manager.
|
| 12 |
+
|
| 13 |
+
4. Security Guidelines
|
| 14 |
+
Security is our top priority.
|
| 15 |
+
- Ensure all passwords are at least 14 characters long and use a password manager.
|
| 16 |
+
- Two-factor authentication (2FA) is mandatory for all internal services.
|
| 17 |
+
- Customer data must never be stored on local drives. All sensitive data should remain in the encrypted corporate cloud.
|
| 18 |
+
|
| 19 |
+
5. Time Off / Leave
|
| 20 |
+
We offer 20 days of paid time off (PTO) annually. Sick leave is unlimited. Please notify your manager at least two weeks in advance for PTO exceeding 3 days.
|
| 21 |
+
|
| 22 |
+
6. About RAG Chatbots
|
| 23 |
+
RAG (Retrieval-Augmented Generation) combines large language models with a retriever to ground the model's responses in factual data. By retrieving relevant documents, the system significantly reduces hallucination and provides verifiable citations. DocuMind uses sentence-transformers for generating embeddings and Pinecone or ChromaDB for vector storage.
|
data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4211b195240e2d4413b7990cbbbc38a87b03b9fb7515c82b3e2be384c2eee81b
|
| 3 |
+
size 167600
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
backend:
|
| 5 |
+
build:
|
| 6 |
+
context: .
|
| 7 |
+
dockerfile: Dockerfile.backend
|
| 8 |
+
ports:
|
| 9 |
+
- "8000:8000"
|
| 10 |
+
volumes:
|
| 11 |
+
- ./vectorstore:/app/vectorstore
|
| 12 |
+
- ./raw_documents:/app/raw_documents
|
| 13 |
+
environment:
|
| 14 |
+
- GROQ_API_KEY=${GROQ_API_KEY:-}
|
| 15 |
+
|
| 16 |
+
frontend:
|
| 17 |
+
build:
|
| 18 |
+
context: .
|
| 19 |
+
dockerfile: Dockerfile.frontend
|
| 20 |
+
ports:
|
| 21 |
+
- "8501:8501"
|
| 22 |
+
environment:
|
| 23 |
+
- API_URL=http://backend:8000
|
| 24 |
+
depends_on:
|
| 25 |
+
- backend
|
error.txt
ADDED
|
Binary file (1.17 kB). View file
|
|
|
header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
|
| 3 |
+
size 100
|
ingest.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import glob
|
| 3 |
+
from langchain_community.document_loaders import (
|
| 4 |
+
DirectoryLoader,
|
| 5 |
+
PyPDFLoader,
|
| 6 |
+
Docx2txtLoader,
|
| 7 |
+
TextLoader
|
| 8 |
+
)
|
| 9 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
from langchain_chroma import Chroma
|
| 12 |
+
|
| 13 |
+
# Configuration
RAW_DOCS_DIR = "raw_documents"
CHROMA_DB_DIR = "vectorstore"
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


def _chunk_id(chunk):
    """Return a deterministic ID for a chunk.

    The ID is the SHA-256 hex digest of the chunk's ``source``, ``page`` and
    ``start_index`` metadata (empty string when a key is absent) together
    with its text, so the same chunk of the same file always maps to the
    same ID across runs.
    """
    import hashlib

    meta = chunk.metadata
    fingerprint = "\x1f".join(
        (
            str(meta.get("source", "")),
            str(meta.get("page", "")),
            str(meta.get("start_index", "")),
            chunk.page_content,
        )
    )
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


def main():
    """Load every supported file under RAW_DOCS_DIR, chunk it and store it in Chroma.

    Steps: load .txt/.pdf/.docx files, split them into overlapping chunks,
    embed the chunks with EMBEDDING_MODEL and persist them in CHROMA_DB_DIR.
    Chunks are written with deterministic IDs so running the script again
    over the same files does not append duplicate entries to the store.
    Progress and errors are reported on stdout; nothing is returned.
    """
    print(f"Loading documents from {RAW_DOCS_DIR}...")

    text_loader_kwargs = {'autodetect_encoding': True}
    loaders = [
        DirectoryLoader(RAW_DOCS_DIR, glob="**/*.txt", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs),
        DirectoryLoader(RAW_DOCS_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader),
        DirectoryLoader(RAW_DOCS_DIR, glob="**/*.docx", loader_cls=Docx2txtLoader)
    ]

    docs = []
    for loader in loaders:
        # Best effort: a failure in one file type must not stop the others.
        try:
            loaded_docs = loader.load()
            if loaded_docs:
                print(f"Loaded {len(loaded_docs)} documents using {loader.loader_cls.__name__}")
                docs.extend(loaded_docs)
        except Exception as e:
            print(f"Error loading with {loader.loader_cls.__name__}: {e}")

    if not docs:
        print("No documents found. Please add some .txt, .pdf, or .docx files to the raw_documents directory.")
        return

    print(f"Total documents loaded: {len(docs)}")

    print(f"Splitting documents with chunk size {CHUNK_SIZE} and overlap {CHUNK_OVERLAP}...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,
    )
    splits = text_splitter.split_documents(docs)
    print(f"Generated {len(splits)} chunks.")

    if not splits:
        # Nothing to embed (e.g. every loaded document was empty).
        print("No text chunks were produced; nothing to store.")
        return

    print(f"Initializing embedding model '{EMBEDDING_MODEL}'...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print(f"Storing embeddings in ChromaDB at {CHROMA_DB_DIR}...")
    # Key every chunk by a deterministic ID. Without explicit IDs each run
    # appends a fresh copy of every chunk to the persisted collection.
    # setdefault keeps the first chunk for an ID so a batch never carries
    # the same ID twice.
    unique_chunks = {}
    for chunk in splits:
        unique_chunks.setdefault(_chunk_id(chunk), chunk)

    # Initialize Chroma, which will embed and store the chunks
    Chroma.from_documents(
        documents=list(unique_chunks.values()),
        embedding=embeddings,
        ids=list(unique_chunks.keys()),
        persist_directory=CHROMA_DB_DIR,
    )

    print("Ingestion complete. Vector store persisted locally.")


if __name__ == "__main__":
    main()
|
length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:69b231d3d2b148f5ba9416cf30fdbe7f4ceb4fd8d9467bdb0895cbb32d4cf2af
|
| 3 |
+
size 400
|
link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
| 3 |
+
size 0
|
main.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import json
|
| 4 |
+
from fastapi import FastAPI, HTTPException, File, UploadFile
|
| 5 |
+
from fastapi.responses import StreamingResponse
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
from langchain_chroma import Chroma
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
from langchain_core.prompts import PromptTemplate
|
| 12 |
+
from langchain_groq import ChatGroq
|
| 13 |
+
from sentence_transformers import CrossEncoder
|
| 14 |
+
|
| 15 |
+
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 16 |
+
from slowapi.util import get_remote_address
|
| 17 |
+
from slowapi.errors import RateLimitExceeded
|
| 18 |
+
from slowapi.middleware import SlowAPIMiddleware
|
| 19 |
+
from fastapi import Request
|
| 20 |
+
|
| 21 |
+
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Vector store location and model identifiers.
CHROMA_DB_DIR = "vectorstore"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
LLM_MODEL = "llama-3.1-8b-instant"

# Shared components; they stay None until startup_event() assigns them.
embeddings = vectorstore = base_retriever = cross_encoder = llm = None

app = FastAPI(title="DocuMind Enterprise RAG API")

# Rate limiting, keyed on the caller's remote address.
limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(SlowAPIMiddleware)
|
| 41 |
+
|
| 42 |
+
@app.on_event("startup")
def startup_event():
    """Initialise (or refresh) the module-level RAG components.

    Runs on application startup and is also called directly after an
    ingestion to pick up the updated vector store. The embedding model and
    the cross-encoder are therefore created only on the first call; later
    calls reuse them and only re-open the vector store and retriever.

    Side effects: assigns the globals ``embeddings``, ``vectorstore``,
    ``base_retriever``, ``cross_encoder`` and ``llm``. ``vectorstore`` and
    ``base_retriever`` are left untouched when CHROMA_DB_DIR does not
    exist; ``llm`` is left untouched when GROQ_API_KEY is not set.
    """
    global embeddings, vectorstore, base_retriever, cross_encoder, llm

    print("Loading vector store & embedding model...")
    if embeddings is None:
        # Loading model weights is expensive; do it once per process.
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    if os.path.exists(CHROMA_DB_DIR):
        vectorstore = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embeddings)
        base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    else:
        print(f"WARNING: vector store directory '{CHROMA_DB_DIR}' not found; queries are unavailable until documents are ingested.")

    if cross_encoder is None:
        print("Initializing CrossEncoder ReRanker...")
        cross_encoder = CrossEncoder(RERANKER_MODEL)

    print("Initializing LLM via Groq...")
    if not os.environ.get("GROQ_API_KEY"):
        print("WARNING: GROQ_API_KEY not found in environment!")
    else:
        llm = ChatGroq(model_name=LLM_MODEL, temperature=0, streaming=True)
|
| 60 |
+
|
| 61 |
+
class QueryRequest(BaseModel):
    """Request body accepted by POST /query."""

    # The user's natural-language question.
    question: str


# Strict RAG prompt: the model is told to answer from the supplied context only.
prompt_template = PromptTemplate.from_template("""You are a factual assistant for DocuMind. Answer ONLY using the context below.
If the answer isn't in the context, say "I don't know."
Context: {context}
Question: {question}""")


@app.post("/query")
@limiter.limit("5/minute")
async def query_documents(request: Request, req: QueryRequest):
    """Answer a question from the indexed documents, streamed as NDJSON.

    The response body is newline-delimited JSON: first one
    ``{"type": "sources", "data": [...]}`` record, then one
    ``{"type": "token", "content": ...}`` record per generated piece of text.
    When retrieval finds nothing, the sources list is empty and a single
    "I don't know." token is sent.

    Args:
        request: The incoming request. Not read here; it is part of the
            signature for the rate-limit decorator.
        req: Parsed request body carrying the question.

    Raises:
        HTTPException: 503 when the retriever, re-ranker or LLM has not been
            initialised (missing vector store or GROQ_API_KEY).
    """
    import asyncio

    if base_retriever is None or cross_encoder is None or llm is None:
        # A missing component is a service-availability problem, not a crash.
        raise HTTPException(status_code=503, detail="Backend not fully initialized (Vectorstore or LLM missing).")

    # Retrieval and re-ranking are synchronous, CPU/IO-heavy calls; run them
    # in a worker thread so other requests are not stalled on the event loop.
    initial_docs = await asyncio.to_thread(base_retriever.invoke, req.question)

    if not initial_docs:
        # Stream "I don't know." with empty sources
        async def empty_response():
            yield json.dumps({"type": "sources", "data": []}) + "\n"
            yield json.dumps({"type": "token", "content": "I don't know."}) + "\n"
        return StreamingResponse(empty_response(), media_type="application/x-ndjson")

    pairs = [[req.question, doc.page_content] for doc in initial_docs]
    scores = await asyncio.to_thread(cross_encoder.predict, pairs)
    for doc, score in zip(initial_docs, scores):
        # float() makes the score JSON-serialisable.
        doc.metadata['relevance_score'] = float(score)

    # Keep the three highest-scoring chunks as context.
    initial_docs.sort(key=lambda d: d.metadata['relevance_score'], reverse=True)
    top_docs = initial_docs[:3]

    context_text = "\n\n".join([doc.page_content for doc in top_docs])
    chain = prompt_template | llm

    async def generate_response():
        sources = [
            {
                "source": d.metadata.get("source", "Unknown"),
                "score": d.metadata.get("relevance_score"),
                "content": d.page_content,
            }
            for d in top_docs
        ]
        # Emit sources first
        yield json.dumps({"type": "sources", "data": sources}) + "\n"

        # Emit tokens
        async for chunk in chain.astream({"context": context_text, "question": req.question}):
            if chunk.content:
                yield json.dumps({"type": "token", "content": chunk.content}) + "\n"

    return StreamingResponse(generate_response(), media_type="application/x-ndjson")
|
| 106 |
+
|
| 107 |
+
@app.post("/ingest")
async def ingest_document(file: UploadFile = File(...)):
    """Save an uploaded file under raw_documents/ and re-run ingestion.

    Args:
        file: The uploaded document.

    Returns:
        A dict with a success ``message`` and the ingestion script's stdout
        under ``logs``.

    Raises:
        HTTPException: 400 when the upload has no usable file name; 500 when
            the ingestion script exits with a non-zero status.
    """
    import asyncio
    import sys

    # The file name comes from the client. Keep only its last path component
    # so a name such as "../../app/main.py" cannot write outside
    # raw_documents/. Backslashes are normalised first so Windows-style
    # separators are stripped as well.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Upload must include a valid file name.")

    os.makedirs("raw_documents", exist_ok=True)
    file_path = os.path.join("raw_documents", safe_name)
    with open(file_path, "wb") as f:
        f.write(await file.read())

    # Run the ingestion script with the interpreter running this app, in a
    # worker thread so the event loop keeps serving other requests.
    # NOTE(review): the script path is relative to the working directory;
    # confirm scripts/ingest.py exists in the deployed layout.
    process = await asyncio.to_thread(
        subprocess.run,
        [sys.executable, "scripts/ingest.py"],
        capture_output=True,
        text=True,
    )

    # Check the result before reloading: a failed run has nothing new to load.
    if process.returncode != 0:
        raise HTTPException(status_code=500, detail=f"Ingestion failed: {process.stderr}")

    # Reload vectorstore inline
    await asyncio.to_thread(startup_event)

    return {"message": f"Successfully ingested {safe_name}", "logs": process.stdout}
|
| 124 |
+
|
| 125 |
+
@app.get("/sources")
async def get_sources():
    """Return the names of the entries in raw_documents/ (empty when it is absent)."""
    upload_dir = "raw_documents"
    listing = os.listdir(upload_dir) if os.path.exists(upload_dir) else []
    return {"documents": listing}
|
query.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
from langchain_chroma import Chroma
|
| 5 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 6 |
+
from langchain_core.prompts import PromptTemplate
|
| 7 |
+
from langchain_groq import ChatGroq
|
| 8 |
+
from sentence_transformers import CrossEncoder
|
| 9 |
+
|
| 10 |
+
# Setup Configuration
CHROMA_DB_DIR = "vectorstore"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
LLM_MODEL = "llama-3.1-8b-instant"  # Use a currently active Groq model


def main(query="What is the company policy for remote work?"):
    """Run one retrieve / re-rank / generate round trip and print the result.

    Args:
        query: The question to ask. Defaults to the sample remote-work
            question, so calling ``main()`` with no argument behaves as before.

    Prints the answer followed by the top re-ranked source chunks. Returns
    early (after printing a message) when GROQ_API_KEY is missing or when
    retrieval finds no documents.
    """
    load_dotenv()

    # 1. Initialize embeddings and reload the vector store
    print("Loading vector store & embedding model...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    vectorstore = Chroma(persist_directory=CHROMA_DB_DIR, embedding_function=embeddings)

    # 2. Setup the base retriever to get top k=5 chunks
    base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    # 3. Setup ReRanker for relevance ordering
    print("Initializing CrossEncoder ReRanker...")
    cross_encoder = CrossEncoder(RERANKER_MODEL)

    # 4. Craft strict RAG prompt
    template = """You are a factual assistant. Answer ONLY using the context below.
If the answer isn't in the context, say "I don't know."
Context: {context}
Question: {question}"""
    prompt = PromptTemplate.from_template(template)

    # 5. Initialize the Groq LLM
    print("Initializing LLM via Groq...")
    if not os.environ.get("GROQ_API_KEY"):
        print("ERROR: GROQ_API_KEY not found in environment!")
        return

    llm = ChatGroq(model_name=LLM_MODEL, temperature=0)

    # The query workflow
    print(f"\nQUERY: {query}\n")

    print("Retrieving and re-ranking documents...")
    initial_docs = base_retriever.invoke(query)

    if not initial_docs:
        # Nothing was retrieved (e.g. an empty vector store): there is
        # nothing to re-rank and no context to ground an answer on.
        print("\n--- FINAL ANSWER ---")
        print("I don't know.")
        print("\n--- SOURCES ---")
        return

    # Apply CrossEncoder manually
    pairs = [[query, doc.page_content] for doc in initial_docs]
    scores = cross_encoder.predict(pairs)

    # Attach scores (as plain floats, matching the API server) and sort
    for doc, score in zip(initial_docs, scores):
        doc.metadata['relevance_score'] = float(score)

    # Sort docs by score descending and take top 3
    initial_docs.sort(key=lambda d: d.metadata['relevance_score'], reverse=True)
    top_docs = initial_docs[:3]

    # Format the context text from the retrieved docs
    context_text = "\n\n".join([doc.page_content for doc in top_docs])

    print("Generating response...")
    # Format prompt and call LLM
    chain = prompt | llm
    response = chain.invoke({"context": context_text, "question": query})

    print("\n--- FINAL ANSWER ---")
    print(response.content)
    print("\n--- SOURCES ---")
    for idx, doc in enumerate(top_docs):
        print(f"\n[Source {idx+1}] Score: {doc.metadata.get('relevance_score'):.4f}")
        print(doc.page_content[:150] + "...")


if __name__ == "__main__":
    import sys

    # Optional: take the question from the command line; otherwise use the default.
    cli_question = " ".join(sys.argv[1:]).strip()
    if cli_question:
        main(cli_question)
    else:
        main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
langchain
langchain-community
# Imported directly (langchain_core, langchain_text_splitters), so listed
# explicitly instead of relying on them arriving as transitive dependencies.
langchain-core
langchain-text-splitters
langchain-huggingface
langchain-chroma
chromadb
sentence-transformers
pypdf
docx2txt
python-dotenv
langchain-groq
fastapi
uvicorn
streamlit
python-multipart
slowapi
|
run.sh
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Launch the FastAPI backend in the background and the Streamlit UI in the
# foreground. The backend is stopped whenever this script exits.
set -euo pipefail

# Start the FastAPI backend in the background
echo "Starting FastAPI Backend..."
uvicorn backend.main:app --host 0.0.0.0 --port 8000 &
BACKEND_PID=$!

# Never leave the background backend running after this script ends.
trap 'kill "$BACKEND_PID" 2>/dev/null || true' EXIT

# Give the backend a few seconds to boot up before launching the UI
sleep 5

# If the backend already died (bad import, port in use, ...), fail loudly
# instead of serving a UI that cannot reach its API.
if ! kill -0 "$BACKEND_PID" 2>/dev/null; then
    echo "FastAPI backend exited during startup; aborting." >&2
    exit 1
fi

# Start the Streamlit frontend in the foreground
echo "Starting Streamlit Frontend..."
export API_URL="http://127.0.0.1:8000"
streamlit run frontend/app.py --server.port=8501 --server.address=0.0.0.0
|