# LifeGuide / ingest.py
# Contributor: Shouvik599 — "added feature improvements" (commit 7ae27cd)
"""
ingest.py β€” Step 1: Build the vector knowledge base from religious PDFs.
Run this ONCE before starting the app:
python ingest.py
It will:
1. Load all PDFs from the ./books/ directory
2. Split them into overlapping semantic chunks
3. Embed each chunk using NVIDIA's llama-nemotron embedding model
4. Persist everything into a local ChromaDB vector store
"""
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_chroma import Chroma
import re
load_dotenv()
# ─── Configuration ────────────────────────────────────────────────────────────
BOOKS_DIR = Path("./books")  # scanned (non-recursively, *.pdf only) by ingest()
CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")  # ChromaDB persistence directory
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "sacred_texts")  # Chroma collection name
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")  # required — ingest() exits(1) when unset
# Mapping of filename keywords → friendly book name stored in metadata.
# Keys are matched case-insensitively as substrings of the PDF filename;
# the first matching key (dict insertion order) wins — see detect_book_name().
BOOK_NAME_MAP = {
    "gita": "Bhagavad Gita",
    "bhagavad": "Bhagavad Gita",
    "quran": "Quran",
    "koran": "Quran",
    "bible": "Bible",
    "testament": "Bible",
    "granth": "Guru Granth Sahib",
    "guru": "Guru Granth Sahib",
}
# Chunk settings — tuned for religious texts (verses are short).
CHUNK_SIZE = 800  # max characters per chunk
CHUNK_OVERLAP = 150  # overlap to preserve verse context across chunk boundaries
# Regex patterns for different scriptures.
# Used by extract_verse() to pull a display citation out of a chunk.
VERSE_PATTERNS = {
    "Bhagavad Gita": r"(?:Verse\s+)?(\d+\.\d+)",  # Matches 2.47 or Verse 2.47
    "Quran": r"(\d+:\d+)",  # Matches 2:286
    "Bible": r"(\d+\s+)?[A-Z][a-z]+\s+\d+:\d+",  # Matches John 3:16 or 1 Cor 13:4
    "Guru Granth Sahib": r"(?:Ang\s+)?(\d+)",  # Matches Ang 1 or bare 1
}
# Patterns to identify structure in the text.
# Used by parse_structure() to extract numeric chapter/verse metadata.
STRUCTURE_PATTERNS = {
    "Bhagavad Gita": r"(\d+)\.(\d+)",  # Matches 2.47 (Chapter.Verse)
    "Quran": r"(\d+):(\d+)",  # Matches 2:286 (Surah:Verse)
    "Bible": r"(\d+):(\d+)",  # Matches 3:16 (Chapter:Verse)
    "Guru Granth Sahib": r"Ang\s+(\d+)",  # Matches Ang 1
}
# ─── Helpers ──────────────────────────────────────────────────────────────────
def parse_structure(text: str, book_name: str) -> dict:
    """Extract numeric structure (chapter/verse, or ang) from a text chunk.

    Returns:
        {"ang": int} for the Guru Granth Sahib,
        {"chapter": int, "verse": int} for the other mapped scriptures,
        {} when the book is unmapped or no reference is found in *text*.

    NOTE(review): the Gita pattern matches any "N.M" decimal anywhere in the
    text, so an unrelated number can be picked up as a reference — acceptable
    for best-effort metadata, but worth confirming against real chunks.
    """
    pattern = STRUCTURE_PATTERNS.get(book_name)
    if not pattern:
        return {}
    match = re.search(pattern, text)
    if not match:
        return {}
    if book_name == "Guru Granth Sahib":
        return {"ang": int(match.group(1))}
    return {"chapter": int(match.group(1)), "verse": int(match.group(2))}

def extract_verse(text: str, book_name: str) -> str:
    """Extract a human-readable verse reference from a text chunk.

    Returns the matched reference string (e.g. "Verse 2.47", "2:286",
    "John 3:16"), "General Context" when the book is known but no reference
    is present, or "Unknown" for books with no configured pattern.
    """
    pattern = VERSE_PATTERNS.get(book_name)
    if not pattern:
        return "Unknown"
    match = re.search(pattern, text)
    return match.group(0) if match else "General Context"
def detect_book_name(filename: str) -> str:
    """Infer the book's display name from its filename.

    Scans BOOK_NAME_MAP for the first keyword contained in the lowercased
    filename; when nothing matches, falls back to a title-cased version of
    the filename stem (underscores become spaces).
    """
    lowered = filename.lower()
    matched = next(
        (title for keyword, title in BOOK_NAME_MAP.items() if keyword in lowered),
        None,
    )
    if matched is not None:
        return matched
    # Fallback: derive a readable title from the file name itself.
    return Path(filename).stem.replace("_", " ").title()
def load_pdf(pdf_path: Path) -> list:
    """
    Load a PDF using PyMuPDF (preferred) with PyPDF as a fallback.

    Bug fix: the original wrapped only loader *construction* in try/except,
    but parse failures are raised by loader.load() — which ran outside the
    guard, so the PyPDF fallback could never trigger. The load call is now
    inside the try block so any PyMuPDF failure falls back to PyPDF.

    Returns a list of LangChain Document objects (one per page).
    """
    try:
        loader = PyMuPDFLoader(str(pdf_path))
        print(f" πŸ“– Loading with PyMuPDF: {pdf_path.name}")
        docs = loader.load()
    except Exception:
        # PyMuPDF unavailable or the file is unparsable by it — retry with
        # the pure-Python PyPDF loader.
        loader = PyPDFLoader(str(pdf_path))
        print(f" πŸ“– Loading with PyPDF: {pdf_path.name}")
        docs = loader.load()
    print(f" β†’ {len(docs)} pages loaded")
    return docs
def tag_documents(docs: list, book_name: str, source_file: str) -> list:
    """
    Enrich each document's metadata in place and return the same list.

    Fields written:
    - book: display name (e.g. "Bhagavad Gita")
    - verse_citation: best-effort reference parsed from the page text
    - source_file: original filename
    - page: set to 0 only when the loader supplied no page number
    """
    for document in docs:
        meta = document.metadata
        meta["book"] = book_name
        meta["verse_citation"] = extract_verse(document.page_content, book_name)
        meta["source_file"] = source_file
        # Keep any loader-provided page number; default missing ones to 0.
        meta.setdefault("page", 0)
    return docs
# ─── Main Ingestion ───────────────────────────────────────────────────────────
def ingest():
    """Build the ChromaDB vector store from every PDF in BOOKS_DIR.

    Pipeline: validate environment → load & tag PDFs → split into chunks →
    parse chapter/verse structure into metadata → embed in batches via the
    NVIDIA endpoint → persist to ChromaDB at CHROMA_DB_PATH.

    Exits the process with status 1 on any precondition failure (missing API
    key, missing books directory, no PDFs found).
    """
    # Fail fast on missing credentials / inputs rather than mid-pipeline.
    if not NVIDIA_API_KEY:
        print("❌ NVIDIA_API_KEY not set. Add it to your .env file.")
        sys.exit(1)
    if not BOOKS_DIR.exists():
        print(f"❌ Books directory not found: {BOOKS_DIR.resolve()}")
        print(" Create a ./books/ folder and add your PDFs there.")
        sys.exit(1)
    pdf_files = list(BOOKS_DIR.glob("*.pdf"))
    if not pdf_files:
        print(f"❌ No PDF files found in {BOOKS_DIR.resolve()}")
        sys.exit(1)
    # Banner with the resolved configuration so runs are reproducible.
    print(f"\nπŸ•ŠοΈ Sacred Texts RAG β€” Ingestion Pipeline")
    print(f"{'─' * 50}")
    print(f"πŸ“‚ Books directory : {BOOKS_DIR.resolve()}")
    print(f"πŸ’Ύ ChromaDB path : {Path(CHROMA_DB_PATH).resolve()}")
    print(f"πŸ“š PDFs found : {len(pdf_files)}")
    print(f"{'─' * 50}\n")
    # ── Step 1: Load all PDFs, tagging each page with book metadata ──────────
    all_docs = []
    for pdf_path in pdf_files:
        book_name = detect_book_name(pdf_path.name)
        print(f"πŸ“• {book_name}")
        raw_docs = load_pdf(pdf_path)
        tagged_docs = tag_documents(raw_docs, book_name, pdf_path.name)
        all_docs.extend(tagged_docs)
        print(f" βœ… Tagged as '{book_name}'\n")
    print(f"πŸ“„ Total pages loaded: {len(all_docs)}")
    # ── Step 2: Split pages into overlapping chunks ──────────────────────────
    print(f"\nβœ‚οΈ Splitting into chunks (size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        # Prefer paragraph, then line, then sentence boundaries before
        # falling back to word/character splits — preserves verse structure.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_documents(all_docs)
    print(f" β†’ {len(chunks)} chunks created")
    # Attach chapter/verse (or ang) numbers so ChromaDB stores them and
    # retrieval results can cite their source precisely.
    print(f"🏷️ Parsing structure (chapters/verses) for {len(chunks)} chunks...")
    for chunk in chunks:
        structure = parse_structure(chunk.page_content, chunk.metadata["book"])
        # Merged into metadata so it is persisted alongside the embedding.
        chunk.metadata.update(structure)
    print(f" β†’ {len(chunks)} chunks created and tagged")
    # ── Step 3: Embed & store ────────────────────────────────────────────────
    print(f"\nπŸ”’ Initialising NVIDIA embedding model (llama-nemotron-embed-vl-1b-v2)...")
    embeddings = NVIDIAEmbeddings(
        model="nvidia/llama-nemotron-embed-vl-1b-v2",
        api_key=NVIDIA_API_KEY,
        truncate="NONE",  # NOTE(review): over-long chunks will error rather than truncate — confirm intended
    )
    print(f"πŸ’Ύ Building ChromaDB vector store β€” this may take a few minutes...")
    print(f" (Embedding {len(chunks)} chunks...)\n")
    # Process in batches to avoid embedding-endpoint rate limits.
    BATCH_SIZE = 100
    vector_store = None
    for i in range(0, len(chunks), BATCH_SIZE):
        batch = chunks[i : i + BATCH_SIZE]
        batch_num = i // BATCH_SIZE + 1
        total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
        print(f" Batch {batch_num}/{total_batches}: embedding {len(batch)} chunks...")
        if vector_store is None:
            # First batch creates (or opens) the persistent collection...
            vector_store = Chroma.from_documents(
                documents=batch,
                embedding=embeddings,
                persist_directory=CHROMA_DB_PATH,
                collection_name=COLLECTION_NAME,
            )
        else:
            # ...subsequent batches append to it.
            vector_store.add_documents(batch)
    print(f"\n{'─' * 50}")
    print(f"βœ… Ingestion complete!")
    print(f" πŸ“¦ {len(chunks)} chunks stored in ChromaDB")
    print(f" πŸ“‚ Location: {Path(CHROMA_DB_PATH).resolve()}")
    print(f"\nπŸ‘‰ Now run: python app.py")
    print(f"{'─' * 50}\n")
if __name__ == "__main__":
    ingest()