Shouvik599 commited on
Commit
7ae27cd
·
1 Parent(s): f1f031f

added feature improvements

Browse files
Files changed (8) hide show
  1. Dockerfile +16 -7
  2. app.py +9 -9
  3. features_to_add.txt +20 -0
  4. frontend/index.html +130 -5
  5. ingest.py +49 -0
  6. rag_chain.py +184 -37
  7. requirements.txt +3 -2
  8. start.sh +8 -5
Dockerfile CHANGED
@@ -1,19 +1,28 @@
1
- # Use an official Python runtime as a parent image
2
  FROM python:3.11-slim
3
 
4
- # Set the working directory in the container
5
  WORKDIR /app
6
 
7
- # Copy the requirements file and install dependencies
8
  COPY requirements.txt .
9
  RUN pip install --no-cache-dir -r requirements.txt
10
 
11
- # Copy the application code
12
- COPY . .
 
 
 
13
 
14
- # Make the start script executable
 
 
 
 
 
 
15
  RUN chmod +x start.sh
16
 
17
  # HF Spaces requires port 7860
18
- # We use the shell script as the entry point
 
19
  CMD ["./start.sh"]
 
 
1
  FROM python:3.11-slim
2
 
3
+ # Set working directory
4
  WORKDIR /app
5
 
6
+ # Install dependencies
7
  COPY requirements.txt .
8
  RUN pip install --no-cache-dir -r requirements.txt
9
 
10
+ # Create a non-root user for HF compliance
11
+ RUN useradd -m -u 1000 user
12
+ USER user
13
+ ENV HOME=/home/user \
14
+ PATH=/home/user/.local/bin:$PATH
15
 
16
+ # Set working directory to user's home
17
+ WORKDIR $HOME/app
18
+
19
+ # Copy application code and set ownership to our user
20
+ COPY --chown=user . $HOME/app
21
+
22
+ # Ensure the start script is executable
23
  RUN chmod +x start.sh
24
 
25
  # HF Spaces requires port 7860
26
+ EXPOSE 7860
27
+
28
  CMD ["./start.sh"]
app.py CHANGED
@@ -15,8 +15,9 @@ from fastapi import FastAPI, HTTPException
15
  from fastapi.middleware.cors import CORSMiddleware
16
  from pydantic import BaseModel, Field
17
  from dotenv import load_dotenv
18
- from fastapi.responses import FileResponse
19
  from rag_chain import query_sacred_texts, get_embeddings, get_vector_store # ← FIXED
 
20
 
21
  load_dotenv()
22
 
@@ -91,8 +92,8 @@ def list_books():
91
  raise HTTPException(status_code=500, detail=f"Could not read knowledge base: {e}")
92
 
93
 
94
- @app.post("/ask", response_model=AskResponse, tags=["Query"])
95
- def ask(request: AskRequest):
96
  """
97
  Ask a spiritual or philosophical question.
98
  The answer is grounded strictly in the sacred texts.
@@ -101,11 +102,10 @@ def ask(request: AskRequest):
101
  raise HTTPException(status_code=400, detail="Question cannot be empty.")
102
 
103
  try:
104
- result = query_sacred_texts(request.question)
105
- return AskResponse(
106
- question=request.question,
107
- answer=result["answer"],
108
- sources=[Source(**s) for s in result["sources"]],
109
  )
110
  except FileNotFoundError:
111
  raise HTTPException(
@@ -137,4 +137,4 @@ if __name__ == "__main__":
137
  print(f"🌐 Running at : http://{host}:{port}")
138
  print(f"{'─' * 40}\n")
139
 
140
- uvicorn.run("app:app", host=host, port=port, reload=False) # reload=False for production
 
15
  from fastapi.middleware.cors import CORSMiddleware
16
  from pydantic import BaseModel, Field
17
  from dotenv import load_dotenv
18
+ from fastapi.responses import StreamingResponse, FileResponse
19
  from rag_chain import query_sacred_texts, get_embeddings, get_vector_store # ← FIXED
20
+ from starlette.concurrency import run_in_threadpool
21
 
22
  load_dotenv()
23
 
 
92
  raise HTTPException(status_code=500, detail=f"Could not read knowledge base: {e}")
93
 
94
 
95
+ @app.post("/ask", tags=["Query"])
96
+ async def ask(request: AskRequest):
97
  """
98
  Ask a spiritual or philosophical question.
99
  The answer is grounded strictly in the sacred texts.
 
102
  raise HTTPException(status_code=400, detail="Question cannot be empty.")
103
 
104
  try:
105
+
106
+ return StreamingResponse(
107
+ query_sacred_texts(request.question),
108
+ media_type="application/json"
 
109
  )
110
  except FileNotFoundError:
111
  raise HTTPException(
 
137
  print(f"🌐 Running at : http://{host}:{port}")
138
  print(f"{'─' * 40}\n")
139
 
140
+ uvicorn.run("app:app", host=host, port=port, reload=False) # reload=False for production
features_to_add.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Contextual chunk expansion — when a chunk is retrieved, also fetch the surrounding chunks (±1) to avoid cut-off verses losing their meaning
2
+ Hypothetical Document Embedding (HyDE) β€” generate a hypothetical ideal answer first, embed that, then search β€” dramatically improves recall for abstract questions
3
+
4
+ Multi-turn conversation β€” add chat history using LangChain ConversationBufferMemory so users can ask follow-up questions like "Elaborate on the second point"
5
+ Answer faithfulness scoring β€” use an LLM-as-judge step to self-check whether the answer is actually grounded in the retrieved chunks before returning it
6
+ Query rewriting β€” if the user query is vague, have the LLM rephrase it into a better search query before retrieval (improves semantic matching)
7
+
8
+ Multi-language support β€” ingest Arabic Quran + Sanskrit Gita alongside English translations; embed both and let users query in their preferred language
9
+ Incremental ingestion β€” track which PDFs have been ingested (via a manifest file) so re-running ingest.py only processes new books, not the whole library
10
+ Book versioning β€” support multiple translations of the same book (e.g. KJV vs NIV Bible) and let users choose
11
+
12
+ Snippet preview on hover β€” show the actual retrieved passage when hovering over a source badge in the UI
13
+ Query suggestions β€” after each answer, suggest 2-3 related follow-up questions
14
+ Topic explorer β€” a sidebar with pre-grouped themes (Death & Afterlife, Compassion, Duty, Prayer) that users can browse
15
+ Compare mode β€” a dedicated side-by-side view for "How does Book A vs Book B address X"
16
+
17
+ Hallucination guardrail β€” run a separate verification pass checking every claim in the answer maps back to a retrieved chunk; flag or remove unsupported claims
18
+ Out-of-scope detection β€” classify queries before retrieval; politely decline non-spiritual questions (e.g. "Write me code") with a prompt-level or classifier-level guard
19
+ Rate limiting β€” add per-IP request throttling in FastAPI to prevent API key exhaustion
20
+ API key security β€” move to server-side key storage properly; never expose NVIDIA_API_KEY or GEMINI_API_KEY in frontend calls
frontend/index.html CHANGED
@@ -41,6 +41,57 @@
41
  /* violet β€” Sikh royal purple */
42
  }
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  html,
45
  body {
46
  height: 100%;
@@ -723,13 +774,69 @@
723
  throw new Error(err.detail || "Server error");
724
  }
725
 
726
- const data = await res.json();
727
- replaceLoadingWithAnswer(loadingEl, data);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
  } catch (err) {
729
  let msg = err.message;
730
- if (msg.includes("fetch") || msg.includes("NetworkError") || msg.includes("Failed")) {
731
- msg = "Connecting to the divine knowledge base... Please wait a moment or refresh the page.";
732
- }
733
  replaceLoadingWithError(loadingEl, msg);
734
  } finally {
735
  isLoading = false;
@@ -738,6 +845,24 @@
738
  }
739
  }
740
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
741
  function askSuggested(btn) {
742
  const input = document.getElementById("questionInput");
743
  input.value = btn.textContent;
 
41
  /* violet β€” Sikh royal purple */
42
  }
43
 
44
+ /* Animated Thinking state for streaming */
45
+ .thinking-dots {
46
+ display: inline-flex;
47
+ gap: 4px;
48
+ margin-left: 4px;
49
+ }
50
+
51
+ .thinking-dots span {
52
+ width: 4px;
53
+ height: 4px;
54
+ background: var(--gold);
55
+ border-radius: 50%;
56
+ animation: bounce 1.4s infinite ease-in-out;
57
+ }
58
+
59
+ @keyframes bounce {
60
+
61
+ 0%,
62
+ 80%,
63
+ 100% {
64
+ transform: scale(0);
65
+ }
66
+
67
+ 40% {
68
+ transform: scale(1);
69
+ }
70
+ }
71
+
72
+ /* Make streaming text fade in slightly for smoothness */
73
+ #currentStreamingMsg p {
74
+ animation: fadeIn 0.3s ease-in;
75
+ }
76
+
77
+ @keyframes fadeIn {
78
+ from {
79
+ opacity: 0.7;
80
+ }
81
+
82
+ to {
83
+ opacity: 1;
84
+ }
85
+ }
86
+
87
+ /* Ensure the bubble has a minimum height so it doesn't look like a "small block" */
88
+ .msg-bubble:empty::before {
89
+ content: "Writing wisdom...";
90
+ color: var(--muted);
91
+ font-style: italic;
92
+ font-size: 0.9rem;
93
+ }
94
+
95
  html,
96
  body {
97
  height: 100%;
 
774
  throw new Error(err.detail || "Server error");
775
  }
776
 
777
+ // Initialize variables to build the UI
778
+ const reader = res.body.getReader();
779
+ const decoder = new TextDecoder();
780
+ let fullAnswer = "";
781
+ let sourcesData = [];
782
+
783
+ // Prepare the assistant UI bubble immediately
784
+ loadingEl.innerHTML = `
785
+ <span class="msg-label">Sacred Texts</span>
786
+ <div class="msg-bubble" id="currentStreamingMsg">
787
+ <div class="loading-text">The scriptures are being revealed<span class="thinking-dots"><span></span><span></span><span></span></span></div>
788
+ </div>
789
+ <div id="currentStreamingSources"></div>
790
+ `;
791
+ const bubble = document.getElementById("currentStreamingMsg");
792
+ const sourcesContainer = document.getElementById("currentStreamingSources");
793
+ let firstTokenReceived = false;
794
+
795
+ while (true) {
796
+ const { done, value } = await reader.read();
797
+ if (done) break;
798
+
799
+ const chunk = decoder.decode(value, { stream: true });
800
+ const lines = chunk.split("\n");
801
+
802
+ for (const line of lines) {
803
+ if (!line.trim()) continue;
804
+ try {
805
+ const parsed = JSON.parse(line);
806
+
807
+ if (parsed.type === "token") {
808
+ //Remove the loading text as soon as the first word arrives
809
+ if (!firstTokenReceived) {
810
+ bubble.innerHTML = "";
811
+ firstTokenReceived = true;
812
+ }
813
+
814
+ fullAnswer += parsed.data;
815
+ // Dynamically update the bubble with formatted markdown/paragraphs
816
+ bubble.innerHTML = formatAnswer(fullAnswer);
817
+ scrollToBottom();
818
+ }
819
+ else if (parsed.type === "sources") {
820
+ sourcesData = parsed.data;
821
+ renderSourcesInPlace(sourcesContainer, sourcesData);
822
+ }
823
+ else if (parsed.type === "cache") {
824
+ bubble.innerHTML = formatAnswer(parsed.data.answer);
825
+ renderSourcesInPlace(sourcesContainer, parsed.data.sources);
826
+ scrollToBottom();
827
+ }
828
+ } catch (e) {
829
+ console.error("Error parsing NDJSON line:", e);
830
+ }
831
+ }
832
+ }
833
+
834
+ // Clean up IDs once done so next messages don't conflict
835
+ bubble.removeAttribute("id");
836
+ sourcesContainer.removeAttribute("id");
837
+
838
  } catch (err) {
839
  let msg = err.message;
 
 
 
840
  replaceLoadingWithError(loadingEl, msg);
841
  } finally {
842
  isLoading = false;
 
845
  }
846
  }
847
 
848
+ // Helper to render sources inside the streaming flow
849
+ function renderSourcesInPlace(container, sources) {
850
+ const sourceTags = (sources || []).map(s => {
851
+ const cls = getSourceClass(s.book);
852
+ // Use verse citations as the primary text
853
+ return `<span class="source-tag ${cls}" title="${s.snippet}">πŸ“– ${s.book}</span>`;
854
+ }).join("");
855
+
856
+ if (sourceTags) {
857
+ container.innerHTML = `
858
+ <div class="sources">
859
+ <div class="sources-label">Citations</div>
860
+ <div class="source-tags">${sourceTags}</div>
861
+ </div>
862
+ `;
863
+ }
864
+ }
865
+
866
  function askSuggested(btn) {
867
  const input = document.getElementById("questionInput");
868
  input.value = btn.textContent;
ingest.py CHANGED
@@ -20,6 +20,7 @@ from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
20
  from langchain_text_splitters import RecursiveCharacterTextSplitter
21
  from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
22
  from langchain_chroma import Chroma
 
23
 
24
  load_dotenv()
25
 
@@ -46,8 +47,45 @@ CHUNK_SIZE = 800 # characters per chunk
46
  CHUNK_OVERLAP = 150 # overlap to preserve verse context across boundaries
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # ─── Helpers ──────────────────────────────────────────────────────────────────
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def detect_book_name(filename: str) -> str:
52
  """Infer the book's display name from its filename."""
53
  name_lower = filename.lower()
@@ -83,6 +121,7 @@ def tag_documents(docs: list, book_name: str, source_file: str) -> list:
83
  """
84
  for doc in docs:
85
  doc.metadata["book"] = book_name
 
86
  doc.metadata["source_file"] = source_file
87
  # Keep the page number if already present from the loader
88
  if "page" not in doc.metadata:
@@ -135,6 +174,16 @@ def ingest():
135
  )
136
  chunks = splitter.split_documents(all_docs)
137
  print(f" β†’ {len(chunks)} chunks created")
 
 
 
 
 
 
 
 
 
 
138
 
139
  # ── Step 3: Embed & store ────────────────────────────────────────────────
140
  print(f"\nπŸ”’ Initialising NVIDIA embedding model (llama-nemotron-embed-vl-1b-v2)...")
 
20
  from langchain_text_splitters import RecursiveCharacterTextSplitter
21
  from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
22
  from langchain_chroma import Chroma
23
+ import re
24
 
25
  load_dotenv()
26
 
 
47
  CHUNK_OVERLAP = 150 # overlap to preserve verse context across boundaries
48
 
49
 
50
# Regex patterns (kept as plain strings so other modules can reuse them)
# for locating a human-readable verse citation inside a chunk of text.
VERSE_PATTERNS = {
    "Bhagavad Gita": r"(?:Verse\s+)?(\d+\.\d+)",      # matches 2.47 or Verse 2.47
    "Quran": r"(\d+:\d+)",                            # matches 2:286
    "Bible": r"(\d+\s+)?[A-Z][a-z]+\s+\d+:\d+",       # matches John 3:16 or 1 Cor 13:4
    "Guru Granth Sahib": r"(?:Ang\s+)?(\d+)",         # matches Ang 1 (NOTE: a bare (\d+) also matches any number — verify against real text)
}

# Patterns that capture the numeric structure (chapter/verse or Ang) so it
# can be stored as typed metadata on each chunk.
STRUCTURE_PATTERNS = {
    "Bhagavad Gita": r"(\d+)\.(\d+)",     # 2.47  (Chapter.Verse)
    "Quran": r"(\d+):(\d+)",              # 2:186 (Surah:Verse)
    "Bible": r"(\d+):(\d+)",              # 3:16  (Chapter:Verse)
    "Guru Granth Sahib": r"Ang\s+(\d+)",  # Ang 1
}

# ─── Helpers ──────────────────────────────────────────────────────────────────

def parse_structure(text: str, book_name: str) -> dict:
    """Extract chapter/verse (or Ang) metadata from a text chunk.

    Returns ``{"chapter": c, "verse": v}`` for chapter:verse books,
    ``{"ang": n}`` for the Guru Granth Sahib, and ``{}`` when the book has
    no registered pattern or no reference is found in *text*.
    """
    pattern = STRUCTURE_PATTERNS.get(book_name)
    if not pattern:
        return {}

    match = re.search(pattern, text)
    if not match:
        return {}

    if book_name == "Guru Granth Sahib":
        return {"ang": int(match.group(1))}
    return {"chapter": int(match.group(1)), "verse": int(match.group(2))}


def extract_verse(text: str, book_name: str) -> str:
    """Extract a display-ready verse reference from a text chunk.

    Returns the matched citation string (e.g. ``"2:286"``),
    ``"General Context"`` when the book is known but no citation appears
    in *text*, or ``"Unknown"`` for books without a registered pattern.
    """
    pattern = VERSE_PATTERNS.get(book_name)
    if not pattern:
        return "Unknown"

    match = re.search(pattern, text)
    return match.group(0) if match else "General Context"
88
+
89
  def detect_book_name(filename: str) -> str:
90
  """Infer the book's display name from its filename."""
91
  name_lower = filename.lower()
 
121
  """
122
  for doc in docs:
123
  doc.metadata["book"] = book_name
124
+ doc.metadata["verse_citation"] = extract_verse(doc.page_content, book_name)
125
  doc.metadata["source_file"] = source_file
126
  # Keep the page number if already present from the loader
127
  if "page" not in doc.metadata:
 
174
  )
175
  chunks = splitter.split_documents(all_docs)
176
  print(f" β†’ {len(chunks)} chunks created")
177
+
178
+ # Add verse citations to chunk metadata for better source attribution
179
+ print(f"🏷️ Parsing structure (chapters/verses) for {len(chunks)} chunks...")
180
+ for chunk in chunks:
181
+ # Use the parse_structure function you defined
182
+ structure = parse_structure(chunk.page_content, chunk.metadata["book"])
183
+ # Update the chunk metadata so it is saved in ChromaDB
184
+ chunk.metadata.update(structure)
185
+
186
+ print(f" β†’ {len(chunks)} chunks created and tagged")
187
 
188
  # ── Step 3: Embed & store ────────────────────────────────────────────────
189
  print(f"\nπŸ”’ Initialising NVIDIA embedding model (llama-nemotron-embed-vl-1b-v2)...")
rag_chain.py CHANGED
@@ -19,12 +19,16 @@ Returns a dict with:
19
  """
20
 
21
  import os
 
22
  from dotenv import load_dotenv
23
- from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA
24
  from langchain_chroma import Chroma
25
  from langchain_core.prompts import ChatPromptTemplate
26
  from langchain_core.output_parsers import StrOutputParser
 
 
27
  load_dotenv()
 
28
 
29
  NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
30
  CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
@@ -41,6 +45,8 @@ KNOWN_BOOKS = [
41
  "Guru Granth Sahib",
42
  ]
43
 
 
 
44
 
45
  # ─── System Prompt ────────────────────────────────────────────────────────────
46
 
@@ -51,7 +57,7 @@ STRICT RULES you must ALWAYS follow:
51
  1. Answer ONLY using the provided context passages. Do NOT use any external knowledge.
52
  2. If a specific book's passages are provided but not relevant to the question, skip that book.
53
  3. If NONE of the context is relevant, say: "The provided texts do not directly address this question."
54
- 4. Always cite which book(s) your answer draws from.
55
  5. When the question asks to COMPARE books (e.g. "what do Quran and Gita say"), you MUST
56
  address EACH of those books separately, then synthesise the common thread.
57
  6. Be respectful and neutral toward all faiths β€” treat each text with equal reverence.
@@ -90,6 +96,26 @@ def get_vector_store(embeddings):
90
 
91
  # ─── Per-Book Retrieval ───────────────────────────────────────────────────────
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def retrieve_per_book(question: str, vector_store: Chroma) -> list:
94
  """
95
  Retrieve CHUNKS_PER_BOOK chunks from EACH known book independently,
@@ -97,23 +123,83 @@ def retrieve_per_book(question: str, vector_store: Chroma) -> list:
97
  in the context β€” no book can be crowded out by higher-scoring chunks
98
  from another book.
99
  """
100
- all_docs = []
101
- for book in KNOWN_BOOKS:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  try:
103
- results = vector_store.similarity_search(
104
- query=question,
105
- k=CHUNKS_PER_BOOK,
106
- filter={"book": book}, # ← metadata filter: only this book
107
- )
108
- if results:
109
- print(f" πŸ“– {book}: {len(results)} chunk(s) retrieved")
110
- else:
111
- print(f" ⚠️ {book}: 0 chunks found (not ingested?)")
112
- all_docs.extend(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  except Exception as e:
114
  print(f" ❌ {book}: retrieval error β€” {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- return all_docs
117
 
118
 
119
  # ─── Format Retrieved Docs ────────────────────────────────────────────────────
@@ -135,7 +221,18 @@ def format_docs(docs: list) -> str:
135
  chunks = []
136
  for i, doc in enumerate(book_docs, 1):
137
  page = doc.metadata.get("page", "?")
138
- chunks.append(f" [{i}] (Page {page}): {doc.page_content.strip()}")
 
 
 
 
 
 
 
 
 
 
 
139
  sections.append(header + "\n" + "\n\n".join(chunks))
140
 
141
  return "\n\n".join(sections)
@@ -174,7 +271,7 @@ _llm_chain = None
174
  _vector_store = None
175
 
176
 
177
- def query_sacred_texts(question: str) -> dict:
178
  """
179
  Query the sacred texts knowledge base with guaranteed per-book retrieval.
180
 
@@ -192,38 +289,88 @@ def query_sacred_texts(question: str) -> dict:
192
  if _llm_chain is None:
193
  print("πŸ”§ Initialising RAG chain (first call)...")
194
  _llm_chain, _vector_store = build_chain()
 
 
 
 
 
 
 
195
 
 
 
 
 
 
 
 
 
 
196
  # Step 1: Retrieve per-book (guaranteed slots for every scripture)
197
  print(f"\nπŸ” Retrieving {CHUNKS_PER_BOOK} chunks per book for: '{question}'")
198
  source_docs = retrieve_per_book(question, _vector_store)
199
 
200
  if not source_docs:
201
- return {
202
- "answer": "No content found in the knowledge base. Please run ingest.py first.",
203
- "sources": [],
204
- }
205
-
206
- # Step 2: Format context grouped by book
207
- context = format_docs(source_docs)
208
 
209
- # Step 3: Generate answer
210
- answer = _llm_chain.invoke({"context": context, "question": question})
211
-
212
- # Step 4: Build deduplicated source list for the UI
213
- seen_books = set()
214
  sources = []
215
  for doc in source_docs:
216
  book = doc.metadata.get("book", "Unknown")
217
- page = doc.metadata.get("page", "?")
 
 
 
 
 
 
 
 
 
 
 
218
  snippet = doc.page_content[:200].strip() + "..."
219
- if book not in seen_books:
220
- seen_books.add(book)
221
- sources.append({"book": book, "page": page, "snippet": snippet})
222
-
223
- return {
224
- "answer": answer,
225
- "sources": sources,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  }
 
 
 
 
 
 
 
 
 
 
227
 
228
 
229
  # ─── Quick CLI Test ───────────────────────────────────────────────────────────
 
19
  """
20
 
21
  import os
22
+ from pydoc import doc
23
  from dotenv import load_dotenv
24
+ from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings, ChatNVIDIA, NVIDIARerank
25
  from langchain_chroma import Chroma
26
  from langchain_core.prompts import ChatPromptTemplate
27
  from langchain_core.output_parsers import StrOutputParser
28
+ from langchain_community.retrievers import BM25Retriever
29
+ from langchain_classic.retrievers import EnsembleRetriever, ContextualCompressionRetriever
30
  load_dotenv()
31
+ import json
32
 
33
  NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
34
  CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
 
45
  "Guru Granth Sahib",
46
  ]
47
 
48
+ # Create a separate collection for semantic cache
49
+ CACHE_COLLECTION = "semantic_cache"
50
 
51
  # ─── System Prompt ────────────────────────────────────────────────────────────
52
 
 
57
  1. Answer ONLY using the provided context passages. Do NOT use any external knowledge.
58
  2. If a specific book's passages are provided but not relevant to the question, skip that book.
59
  3. If NONE of the context is relevant, say: "The provided texts do not directly address this question."
60
+ 4. Always explicitly name and cite which book(s) your answer draws from in the text of your answer.
61
  5. When the question asks to COMPARE books (e.g. "what do Quran and Gita say"), you MUST
62
  address EACH of those books separately, then synthesise the common thread.
63
  6. Be respectful and neutral toward all faiths β€” treat each text with equal reverence.
 
96
 
97
  # ─── Per-Book Retrieval ───────────────────────────────────────────────────────
98
 
99
def get_reranked_retriever(base_retriever):
    """
    Wrap a base retriever with an NVIDIA reranking compression layer.

    The reranker scores every candidate the base retriever returns and
    keeps only the 5 most relevant chunks, so the LLM context stays small
    and on-topic.
    """
    # NVIDIA reranker (NIM / API Catalog) — nvidia/llama-3.2-nv-rerankqa-1b-v2
    nvidia_reranker = NVIDIARerank(
        model="nvidia/llama-3.2-nv-rerankqa-1b-v2",
        api_key=NVIDIA_API_KEY,
        top_n=5,  # only the top 5 chunks reach the LLM
    )

    # Compose: base retrieval first, then rerank-compress the candidates.
    return ContextualCompressionRetriever(
        base_compressor=nvidia_reranker,
        base_retriever=base_retriever,
    )
118
+
119
def retrieve_per_book(question: str, vector_store: Chroma) -> list:
    """
    Hybrid (BM25 + vector) retrieval with query routing and reranking.

    Routes the query to the scripture(s) it mentions by keyword (all books
    when none is named), gathers candidate chunks per book via an ensemble
    of keyword (BM25) and semantic retrieval, then reranks the pooled
    candidates with NVIDIA's reranker so only the most relevant chunks
    reach the LLM.

    Returns a list of reranked Documents (possibly empty).
    """
    # Hoisted out of the loop: importing per-iteration is wasted work.
    from langchain_core.documents import Document

    # Candidates pulled per retriever before the reranking pass.
    CANDIDATE_COUNT = 10

    # ── Query routing: detect whether the user names a specific scripture ──
    question_lower = question.lower()
    book_keywords = {
        "Bhagavad Gita": ("gita", "bhagavad", "hindu", "hinduism"),
        "Quran": ("quran", "koran", "islam", "muslim", "muhammad"),
        "Bible": ("bible", "testament", "christian", "jesus", "christ"),
        "Guru Granth Sahib": ("granth", "guru", "sikh", "sikhism", "nanak"),
    }
    target_books = [
        book for book, keywords in book_keywords.items()
        if any(kw in question_lower for kw in keywords)
    ]

    # If no specific book is detected, search all books.
    books_to_search = target_books if target_books else KNOWN_BOOKS
    print(f"🎯 Routing query to: {books_to_search}")

    all_candidates = []
    for book in books_to_search:
        try:
            # Pull this book's full document set to build an in-memory BM25
            # index. Fine for a small demo corpus; for larger libraries a
            # persistent keyword index would be more efficient.
            book_data = vector_store.get(where={"book": book})
            book_docs = [
                Document(page_content=content, metadata=meta)
                for content, meta in zip(book_data["documents"], book_data["metadatas"])
            ]
            if not book_docs:
                continue

            # Keyword retriever
            bm25_retriever = BM25Retriever.from_documents(book_docs)
            bm25_retriever.k = CANDIDATE_COUNT

            # Semantic retriever, filtered to this book only
            vector_retriever = vector_store.as_retriever(
                search_kwargs={"k": CANDIDATE_COUNT, "filter": {"book": book}}
            )

            # Blend keyword and semantic hits 50/50.
            ensemble_retriever = EnsembleRetriever(
                retrievers=[bm25_retriever, vector_retriever],
                weights=[0.5, 0.5],
            )

            # Collect candidates; reranking happens once over the full pool.
            book_candidates = ensemble_retriever.invoke(question)
            all_candidates.extend(book_candidates)
            print(f" 📦 {book}: Found {len(book_candidates)} candidates")

        except Exception as e:
            print(f" ❌ {book}: retrieval error — {e}")

    if not all_candidates:
        return []

    # ── Rerank the entire pool at once ──
    print(f"🚀 Reranking {len(all_candidates)} total candidates...")
    reranker = NVIDIARerank(
        model="nvidia/llama-3.2-nv-rerankqa-1b-v2",
        api_key=NVIDIA_API_KEY,
        top_n=5,  # final chunk count for the LLM context
    )
    final_docs = reranker.compress_documents(all_candidates, question)

    for rank, doc in enumerate(final_docs, 1):
        score = doc.metadata.get("relevance_score", "N/A")
        # .get avoids a KeyError if a candidate somehow lacks 'book' metadata
        print(f"Rank {rank} [{doc.metadata.get('book', '?')}]: Score {score}")

    return final_docs
203
 
204
 
205
  # ─── Format Retrieved Docs ────────────────────────────────────────────────────
 
221
  chunks = []
222
  for i, doc in enumerate(book_docs, 1):
223
  page = doc.metadata.get("page", "?")
224
+ ch = doc.metadata.get("chapter")
225
+ vs = doc.metadata.get("verse")
226
+ ang = doc.metadata.get("ang")
227
+
228
+ # Create a clean citation string
229
+ if ang:
230
+ citation = f"Ang {ang}"
231
+ elif ch and vs:
232
+ citation = f"{ch}:{vs}"
233
+ else:
234
+ citation = f"Page {doc.metadata.get('page', '?')}"
235
+ chunks.append(f" [{i}] ({citation}): {doc.page_content.strip()}")
236
  sections.append(header + "\n" + "\n\n".join(chunks))
237
 
238
  return "\n\n".join(sections)
 
271
  _vector_store = None
272
 
273
 
274
+ def query_sacred_texts(question: str):
275
  """
276
  Query the sacred texts knowledge base with guaranteed per-book retrieval.
277
 
 
289
  if _llm_chain is None:
290
  print("πŸ”§ Initialising RAG chain (first call)...")
291
  _llm_chain, _vector_store = build_chain()
292
+
293
+ # --- Semantic cache check ---
294
+ cache_coll = _vector_store._client.get_or_create_collection(CACHE_COLLECTION)
295
+ cache_results = cache_coll.query(
296
+ query_texts=[question],
297
+ n_results=1
298
+ )
299
 
300
+ THRESHOLD = 0.35
301
+ # FIXED: Added check for cache_results['ids'] and ensuring distances is not empty
302
+ if cache_results['ids'] and cache_results['ids'][0]:
303
+ distance = cache_results['distances'][0][0]
304
+ if distance < THRESHOLD: # Similarity threshold
305
+ print(f"⚑️ Semantic Cache Hit! (Distance: {distance:.4f})")
306
+ yield json.dumps({"type": "cache","data": json.loads(cache_results['metadatas'][0][0]['response_json'])}) + "\n"
307
+ return
308
+
309
  # Step 1: Retrieve per-book (guaranteed slots for every scripture)
310
  print(f"\nπŸ” Retrieving {CHUNKS_PER_BOOK} chunks per book for: '{question}'")
311
  source_docs = retrieve_per_book(question, _vector_store)
312
 
313
  if not source_docs:
314
+ yield json.dumps({"type": "token", "data": "No content found in the knowledge base."}) + "\n"
315
+ return
 
 
 
 
 
316
 
317
+ # 3. Step 2: Format sources for the UI immediately
318
+ seen_sources = set()
 
 
 
319
  sources = []
320
  for doc in source_docs:
321
  book = doc.metadata.get("book", "Unknown")
322
+ ch = doc.metadata.get("chapter")
323
+ vs = doc.metadata.get("verse")
324
+ ang = doc.metadata.get("ang")
325
+
326
+ if ang:
327
+ cite_val = f"Ang {ang}"
328
+ elif ch and vs:
329
+ cite_val = f"{ch}:{vs}"
330
+ else:
331
+ cite_val = f"p. {doc.metadata.get('page', '?')}"
332
+
333
+ display_name = f"{book} {cite_val}"
334
  snippet = doc.page_content[:200].strip() + "..."
335
+ if display_name not in seen_sources:
336
+ seen_sources.add(display_name)
337
+ sources.append({"book": display_name, "page": cite_val, "snippet": snippet})
338
+ # Step 2: Format context grouped by book
339
+ context = format_docs(source_docs)
340
+ full_answer =""
341
+
342
+ # Step 3: Stream from the chain:
343
+ for chunk in _llm_chain.invoke({"context": context, "question": question}):
344
+ full_answer += chunk
345
+ yield json.dumps({"type": "token", "data": chunk}) + "\n" # Stream the answer as it's generated
346
+
347
+
348
+ # Filter sources to only those the LLM actually referenced
349
+ final_sources = []
350
+ ansnwer_lower = full_answer.lower()
351
+
352
+ for s in sources:
353
+ if s["book"].lower() in ansnwer_lower:
354
+ final_sources.append(s)
355
+
356
+ # If the LLM didn't explicitly reference any sources, we can optionally include all retrieved ones or none
357
+ display_sources = final_sources if final_sources else []
358
+
359
+ # Step 4: After streaming is done, save to semantic cache for future similar queries
360
+ result = {
361
+ "answer": full_answer,
362
+ "sources": display_sources,
363
  }
364
+
365
+ cache_coll.add(
366
+ documents=[question],
367
+ metadatas=[{"response_json": json.dumps(result)}],
368
+ ids=[question]
369
+ )
370
+
371
+ # Send sources as a final message after the answer is fully streamed
372
+ yield json.dumps({"type": "sources", "data": sources}) + "\n"
373
+
374
 
375
 
376
  # ─── Quick CLI Test ───────────────────────────────────────────────────────────
requirements.txt CHANGED
@@ -3,8 +3,9 @@ langchain
3
  langchain-community
4
  langchain-chroma
5
  langchain-nvidia-ai-endpoints
6
- langchain-text-splitters
7
-
 
8
  # Vector Store
9
  chromadb
10
 
 
3
  langchain-community
4
  langchain-chroma
5
  langchain-nvidia-ai-endpoints
6
+ langchain-text-splitters
7
+ langchain-core
8
+ rank_bm25
9
  # Vector Store
10
  chromadb
11
 
start.sh CHANGED
@@ -1,13 +1,16 @@
1
  #!/bin/bash
2
 
3
- # Check if the ChromaDB directory already exists
4
- if [ ! -d "/code/chroma_db" ]; then
 
 
5
  echo "πŸ“¦ ChromaDB not found. Starting ingestion..."
6
  python ingest.py
7
  else
8
  echo "βœ… ChromaDB found. Skipping ingestion."
9
  fi
10
 
11
- # Start the FastAPI application
12
- echo "πŸš€ Starting FastAPI server..."
13
- uvicorn app:app --host 0.0.0.0 --port 7860
 
 
1
  #!/bin/bash
2
 
3
+ # Use the absolute path relative to the app directory
4
+ CHROMA_PATH="./chroma_db"
5
+
6
+ if [ ! -d "$CHROMA_PATH" ]; then
7
  echo "πŸ“¦ ChromaDB not found. Starting ingestion..."
8
  python ingest.py
9
  else
10
  echo "βœ… ChromaDB found. Skipping ingestion."
11
  fi
12
 
13
+ echo "πŸš€ Starting FastAPI server with concurrency..."
14
+ # --workers 2 allows two simultaneous processes
15
+ # --timeout-keep-alive is increased for slow LLM responses
16
+ exec uvicorn app:app --host 0.0.0.0 --port 7860 --workers 2 --timeout-keep-alive 60