Spaces:

nothex
/

morpheus-rag

Running

App Files Community

nothex committed 17 days ago

Commit

ff7e045

1 Parent(s): 9f2ae84

fix: harden supabase setup and clean stale retrieval/ingestion code

Browse files

Files changed (10) hide show

.env.example +2 -1
.github/workflows/smoke.yml +3 -3
README.md +11 -4
backend/api/query.py +55 -22
backend/core/classifier.py +3 -11
backend/core/pipeline.py +133 -102
frontend/js/chat.js +11 -1
supabase/migrations/0004_hierarchical_nodes.sql +4 -4
supabase/migrations/0006_multi_tenant_file_uniqueness.sql +23 -0
supabase/schema_backup.sql +34 -38

.env.example CHANGED Viewed

@@ -10,6 +10,7 @@ GEMINI_API_KEY=****
 SUPABASE_URL=https://example.supabase.co
 SUPABASE_SERVICE_KEY=****
 SUPABASE_ANON_KEY=****
 # Cohere
 COHERE_API_KEY=****
@@ -23,7 +24,7 @@ REDIS_URL=redis://localhost:6379/0
 MASTER_ADMIN_KEY=****
 # Local dev:   ALLOWED_ORIGINS=*
-# Production:  ALLOWED_ORIGINS=https://your-nexus.vercel.app
 ALLOWED_ORIGINS=*
 # Docs enabled in dev, disabled in prod

 SUPABASE_URL=https://example.supabase.co
 SUPABASE_SERVICE_KEY=****
 SUPABASE_ANON_KEY=****
+SUPABASE_JWT_SECRET=****
 # Cohere
 COHERE_API_KEY=****
 MASTER_ADMIN_KEY=****
 # Local dev:   ALLOWED_ORIGINS=*
+# Production:  ALLOWED_ORIGINS=https://your-morpheus.vercel.app
 ALLOWED_ORIGINS=*
 # Docs enabled in dev, disabled in prod

.github/workflows/smoke.yml CHANGED Viewed

@@ -25,13 +25,13 @@ jobs:
       - name: Build assets (light mode)
         env:
-          NEXUS_DISABLE_INTENT_BOOTSTRAP: "true"
-          NEXUS_BUILD_ASSETS_MODE: "light"
         run: |
           python -m backend.core.build_ml_assets
       - name: Intent classifier smoke predict
         env:
-          NEXUS_DISABLE_INTENT_BOOTSTRAP: "true"
         run: |
           python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"

       - name: Build assets (light mode)
         env:
+          MORPHEUS_DISABLE_INTENT_BOOTSTRAP: "true"
+          MORPHEUS_BUILD_ASSETS_MODE: "light"
         run: |
           python -m backend.core.build_ml_assets
       - name: Intent classifier smoke predict
         env:
+          MORPHEUS_DISABLE_INTENT_BOOTSTRAP: "true"
         run: |
           python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"

README.md CHANGED Viewed

@@ -99,9 +99,16 @@ pip install -r requirements.txt
 ### 3. Set up Supabase
 1. Create a project at [supabase.com](https://supabase.com)
-2. Go to the SQL editor and run `supabase/schema_backup.sql` in full
-3. Run `supabase/rls/multi_tenancy_rls.sql` to enable row-level security
-4. Enable the `pgvector` extension (Database → Extensions)
 ### 4. Configure environment
@@ -235,4 +242,4 @@ The architecture is designed for independent layer scaling:
 - **Query concurrency**: Increase Gunicorn workers in `render.yaml` from `-w 1` to `-w 2` or more when moving off the free tier.
 - **Embedding cost**: Swap `EMBEDDING_MODEL` in `config.py` — the `FallbackEmbeddings` wrapper handles the rest.
 - **LLM cost**: All model lists live in `config.py`. Replace free OpenRouter models with paid ones for higher rate limits.
-- **New document types**: The document classifier learns them automatically. Run warmup after your first example of each type.

 ### 3. Set up Supabase
 1. Create a project at [supabase.com](https://supabase.com)
+2. Enable the `vector`, `uuid-ossp`, and `pgcrypto` extensions first (Database → Extensions)
+3. Go to the SQL editor and run `supabase/schema_backup.sql` in full
+4. Create two Storage buckets:
+   - `rag-images` as a public bucket
+   - `rag-models` as a private bucket
+5. Optional but recommended: run `supabase/rls/rpc_security_rls_audit.sql`
+For an existing older database, apply the files in `supabase/migrations/` and
+the scripts in `supabase/rls/` as incremental upgrade helpers. For a brand-new
+project, `schema_backup.sql` is already the full source of truth.
 ### 4. Configure environment
 - **Query concurrency**: Increase Gunicorn workers in `render.yaml` from `-w 1` to `-w 2` or more when moving off the free tier.
 - **Embedding cost**: Swap `EMBEDDING_MODEL` in `config.py` — the `FallbackEmbeddings` wrapper handles the rest.
 - **LLM cost**: All model lists live in `config.py`. Replace free OpenRouter models with paid ones for higher rate limits.
+- **New document types**: The document classifier learns them automatically. Run warmup after your first example of each type.

backend/api/query.py CHANGED Viewed

@@ -5,7 +5,11 @@ import asyncio
 from fastapi import APIRouter, Header, Depends, Request
 from fastapi.responses import StreamingResponse
 from shared.types import QueryRequest, SourceChunk
-from backend.core.pipeline import retrieve_chunks, generate_answer_stream, analyse_intent
 from backend.core.auth_utils import require_auth_token
 from backend.main import limiter
@@ -13,6 +17,49 @@ log = logging.getLogger("morpheus.api.query")
 router = APIRouter()
 @router.post("")
 @limiter.limit("60/hour")
 async def query(
@@ -60,7 +107,7 @@ async def query(
             loop   = asyncio.get_event_loop()
             chunks = await loop.run_in_executor(
                 None,
-                lambda: retrieve_chunks(
                     effective_query,
                     k=req.k,
                     category=category,
@@ -75,6 +122,7 @@ async def query(
             # ── Step 3: Stream answer tokens ──────────────────────────────────
             images = []
             # 🚀 Define the boolean once for readability
             is_eval = x_eval_mode == "true"
             async for event in generate_answer_stream(
@@ -90,27 +138,12 @@ async def query(
                     yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
                 elif event["type"] == "done":
                     images = event.get("images", [])
             # ── Step 4: Emit sources + images ─────────────────────────────────
-            sources = []
-            for chunk in chunks:
-                meta = chunk.metadata
-                orig = meta.get("original_content", "{}")
-                if isinstance(orig, str):
-                    try:    orig = json.loads(orig)  # noqa: E701
-                    except: orig = {}  # noqa: E701, E722
-                full_text = orig.get("raw_text") or chunk.page_content
-                snippet_text = full_text if x_eval_mode == "true" else full_text[:200]
-                sources.append(
-                    SourceChunk(
-                        source=meta.get("source", "Unknown"),
-                        score=meta.get("relevance_score"),
-                        chunk=meta.get("chunk_index"),
-                        snippet=snippet_text,
-                        doc_type=meta.get("document_type"),
-                        pages=meta.get("page_numbers"),
-                    ).dict()
-                )
             yield "data: " + json.dumps({
                 "type":    "done",
@@ -144,4 +177,4 @@ async def query(
             "X-Accel-Buffering":           "no",
             "Access-Control-Allow-Origin": "*",
         }
-    )

 from fastapi import APIRouter, Header, Depends, Request
 from fastapi.responses import StreamingResponse
 from shared.types import QueryRequest, SourceChunk
+from backend.core.pipeline import (
+    retrieve_chunks_routed,
+    generate_answer_stream,
+    analyse_intent,
+)
 from backend.core.auth_utils import require_auth_token
 from backend.main import limiter
 router = APIRouter()
+def _normalise_original_content(raw):
+    """Best-effort decode for metadata that may already be dict or JSON string."""
+    if isinstance(raw, dict):
+        return raw
+    if isinstance(raw, str):
+        candidate = raw
+        for _ in range(2):
+            try:
+                candidate = json.loads(candidate)
+            except Exception:
+                return {}
+            if isinstance(candidate, dict):
+                return candidate
+            if not isinstance(candidate, str):
+                break
+    return {}
+def _build_sources_from_chunks(chunks, include_full_text: bool = False):
+    sources = []
+    for chunk in chunks:
+        try:
+            if getattr(chunk, "page_content", None) == "__CACHE_HIT__":
+                continue
+            meta = getattr(chunk, "metadata", {}) or {}
+            orig = _normalise_original_content(meta.get("original_content", {}))
+            full_text = orig.get("raw_text") or chunk.page_content
+            snippet_text = full_text if include_full_text else full_text[:200]
+            sources.append(
+                SourceChunk(
+                    source=meta.get("source", "Unknown"),
+                    score=meta.get("relevance_score"),
+                    chunk=meta.get("chunk_index"),
+                    snippet=snippet_text,
+                    doc_type=meta.get("document_type"),
+                    pages=meta.get("page_numbers"),
+                ).dict()
+            )
+        except Exception as exc:
+            log.warning("Skipping source serialization for chunk: %s", exc)
+    return sources
 @router.post("")
 @limiter.limit("60/hour")
 async def query(
             loop   = asyncio.get_event_loop()
             chunks = await loop.run_in_executor(
                 None,
+                lambda: retrieve_chunks_routed(
                     effective_query,
                     k=req.k,
                     category=category,
             # ── Step 3: Stream answer tokens ──────────────────────────────────
             images = []
+            done_sources = []
             # 🚀 Define the boolean once for readability
             is_eval = x_eval_mode == "true"
             async for event in generate_answer_stream(
                     yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
                 elif event["type"] == "done":
                     images = event.get("images", [])
+                    done_sources = event.get("sources", []) or []
             # ── Step 4: Emit sources + images ─────────────────────────────────
+            sources = done_sources or _build_sources_from_chunks(
+                chunks, include_full_text=is_eval
+            )
             yield "data: " + json.dumps({
                 "type":    "done",
             "X-Accel-Buffering":           "no",
             "Access-Control-Allow-Origin": "*",
         }
+    )

backend/core/classifier.py CHANGED Viewed

@@ -1,8 +1,5 @@
-# PASTE classifier.py HERE
-# Fix: 'import config' -> 'from backend.core import config'
 """
-classifier.py — Hierarchical Ensemble Document Classifier
-==========================================================
 3-stage cascade:
   Stage 1: Embedding nearest-centroid  (cosine similarity, no API calls after warmup)
@@ -15,14 +12,9 @@ before the normal pipeline (catches periodic tables, reference charts etc.)
 Each stage only activates if the previous stage's confidence is below its threshold.
 Centroid embeddings are persisted to Supabase so they survive restarts.
-Usage (from cl.py):
-    from classifier import DocumentClassifier
     clf = DocumentClassifier()
     result = clf.classify(sample_text, elements)
-    # result.document_type  → "machine_learning_paper"
-    # result.confidence     → 0.91
-    # result.stage_used     → "centroid"
-    # result.is_new_type    → False
 """
 import re
@@ -672,4 +664,4 @@ JSON:"""
             if score > best_score:
                 best_score = score
                 best_type  = cat
-        return (best_type, best_score) if best_type else (None, None)

 """
+Hierarchical ensemble document classifier.
 3-stage cascade:
   Stage 1: Embedding nearest-centroid  (cosine similarity, no API calls after warmup)
 Each stage only activates if the previous stage's confidence is below its threshold.
 Centroid embeddings are persisted to Supabase so they survive restarts.
+Typical usage:
     clf = DocumentClassifier()
     result = clf.classify(sample_text, elements)
 """
 import re
             if score > best_score:
                 best_score = score
                 best_type  = cat
+        return (best_type, best_score) if best_type else (None, None)

backend/core/pipeline.py CHANGED Viewed

@@ -1,24 +1,7 @@
-# PASTE cl.py HERE
-# Fix: 'import config' -> 'from backend.core import config'
-# Fix: 'from classifier import DocumentClassifier' -> 'from backend.core.classifier import DocumentClassifier'
 """
-RAG Ingestion & Retrieval Pipeline  (cl.py)
-============================================
-Run modes:
-  python cl.py --ingest --pdf path/to/file.pdf
-  python cl.py --ingest --pdf file.pdf --force
-  python cl.py --query "your question here"
-  python cl.py --ingest --pdf file.pdf --export
-Improvements in this version:
-  - FIX: process_chunks now uses graph_data.document_type (not .categories[0])
-  - FIX: is_file_already_ingested now hits ingested_files registry table (O(1))
-  - NEW: In-memory embedding cache for repeated queries (thread-safe LRU via functools)
-  - NEW: ingested_files registry insert after successful upload
-  - NEW: relevance_score surfaced in metadata for UI badge display
-  - NEW: Source deduplication by chunk content hash (prevents near-duplicate passages)
-  - NEW: Graceful empty-query guard in generate_sub_queries
-  - NEW: MMR-style post-rerank diversity filter to stop one source dominating
 """
 import os
@@ -631,6 +614,7 @@ def process_chunks(
     file_path: str,
     file_hash: str,
     graph_data: DocumentGraphMetadata,
     pdf_images=None,
 ) -> tuple[List[Document], List[str]]:
     """Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
@@ -718,7 +702,7 @@ def process_chunks(
         )
         docs.append(doc)
-        unique_string = f"{file_hash}_chunk_{i}"
         chunk_id = str(uuid.uuid5(NAMESPACE, unique_string))
         ids.append(chunk_id)
@@ -727,7 +711,7 @@ def process_chunks(
 def build_raptor_tree(
-    leaf_docs: List[Document], leaf_ids: List[str]
 ) -> tuple[List[Document], List[str]]:
     """
     RAPTOR implementation: recursively clusters documents and generates
@@ -801,15 +785,23 @@ def build_raptor_tree(
             # Generate deterministic ID for the parent
             import hashlib
             parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
             parent_id = str(
-                uuid.uuid5(uuid.NAMESPACE_DNS, f"raptor_{current_level}_{parent_hash}")
             )
             # Create the parent document
             # Inherit metadata from the first child (source array, file hash, document type)
-            base_meta = cluster[0].metadata
             # Gather all unique page numbers from children
             all_pages = set()
             for c in cluster:
@@ -925,12 +917,6 @@ def _register_ingested_file(
         log.warning("Could not register in ingested_files: %s", exc)
-# =========================================================================== #
-#  ADD THESE TWO FUNCTIONS TO cl.py                                           #
-#  Place them right after _register_ingested_file()                           #
-# =========================================================================== #
 def _apply_category_override(
     file_hash: str, new_category: str, access_token: str = None
 ) -> None:
@@ -1265,7 +1251,29 @@ def run_ingestion(
     if already_exists and not force:
         log.info("SKIPPING — already ingested.")
         return "already_ingested"
     # 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
     # RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
     if already_exists or force:
@@ -1276,30 +1284,13 @@ def run_ingestion(
             "user_id", user_id
         ).contains("metadata", {"file_hash": file_hash}).execute()
         # 2. Clear the registry
-        supabase.table("ingested_files").delete().eq("file_hash", file_hash).execute()
         # 3. Clear the tree if it exists
-        supabase.table("document_trees").delete().eq("file_hash", file_hash).execute()
-    # NEW: Check if user has previously overridden the category for this file.
-    # If so, skip the classifier and use their choice directly.
-    forced_category = None
-    try:
-        _sb = _build_supabase_client(access_token)
-        _existing = (
-            _sb.table("ingested_files")
-            .select("document_type, user_overridden")
-            .eq("file_hash", file_hash)
-            .limit(1)
-            .execute()
-        )
-        if _existing.data and _existing.data[0].get("user_overridden"):
-            forced_category = _existing.data[0]["document_type"]
-            log.info(
-                "User override active — forcing category '%s', skipping classifier.",
-                forced_category,
-            )
-    except Exception as _exc:
-        log.warning("Could not check user override: %s", _exc)
     _progress(2, "Partitioning PDF (OCR + layout detection)…")
     elements = partition_document(pdf_path)
@@ -1332,8 +1323,9 @@ def run_ingestion(
         doc_tree = _build_document_tree(elements)
         sb = _build_service_supabase_client()
-        sb.table("document_trees").insert(
-            {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree}
         ).execute()
         log.info("✅ PageIndex tree saved to Supabase.")
     except Exception as e:
@@ -1347,12 +1339,12 @@ def run_ingestion(
     else:
         pdf_path_for_naming = pdf_path
     docs, ids = process_chunks(
-        chunks, elements, pdf_path_for_naming, file_hash, graph_data, pdf_images
     )
     # --- NATIVE RAPTOR INDEXING ---
     _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
-    docs, ids = build_raptor_tree(docs, ids)
     smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
     if export_json:
@@ -1374,7 +1366,7 @@ def run_ingestion(
                 "document_type": graph_data.document_type,
                 "chunk_count": len(docs),
             },
-            on_conflict="file_hash",
         ).execute()
     except Exception as e:
         log.error("Failed to register file: %s", e)
@@ -1936,10 +1928,12 @@ def retrieve_chunks(
                 max_per_source=max_per_source,
             )
-        docs = [
-            Document(page_content=c["content"], metadata=c.get("metadata", {}))
-            for c in diverse
-        ]
         log.info(
             "Dropped %d low-relevance/duplicate chunks.",
             len(all_candidates) - len(docs),
@@ -2269,12 +2263,6 @@ def generate_answer(
         log.error("Answer generation failed: %s", exc)
         return f"Failed to generate answer: {exc}", []
-# ── ADD THIS FUNCTION to pipeline.py right after generate_answer() ──────────
-# Also add these imports at the top of pipeline.py if not already there:
-#   from typing import AsyncGenerator
 async def generate_answer_stream(
     chunks: List[Document],
     query: str,
@@ -2568,13 +2556,12 @@ def _save_to_memory(
             {
                 "query_embedding": query_vector,
                 "match_session_id": session_id,
-                "match_threshold": 0.98,  # 98% similarity = practically identical
                 "match_count": 1,
             },
         ).execute()
         # If we found a nearly identical query in this session, skip saving!
-        if dup_check.data:
             log.info(
                 "🧠 Memory Bouncer: Duplicate query detected in session %s. Skipping save.",
                 session_id[:8],
@@ -2713,7 +2700,9 @@ def _should_use_tree_path(query: str) -> bool:
     return False
-def tree_search(query: str, access_token: str = None) -> List[Document]:
     """
     Navigates the structural JSON trees in Supabase to answer highly specific
     'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
@@ -2761,6 +2750,21 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
         if not res.data:
             return []
         matched_chunks = []
         # 3. Recursive Tree Traversal
@@ -2789,6 +2793,8 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
         # 4. Execute traversal across all trees
         for tree_row in res.data:
             _traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
         log.info(
@@ -2803,6 +2809,50 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
         return []
 def run_query(
     query: str,
     k: int = 3,
@@ -2815,35 +2865,16 @@ def run_query(
 ) -> Tuple[str, List[str]]:
     # 1. Document Retrieval (Routed)
-    use_tree_path = _should_use_tree_path(query)
-    if use_tree_path:
-        log.info("🎯 PageIndex triggered: Query routed to structural tree path.")
-        chunks = tree_search(query, access_token=access_token)
-        # Failsafe: If the tree search somehow comes up empty, fall back to vector search
-        if not chunks:
-            log.info("Tree search yielded 0 results. Falling back to vector search.")
-            chunks = retrieve_chunks(
-                query,
-                k=k,
-                source_file=source_file,
-                category=category,
-                alpha=alpha,
-                session_id=session_id,
-                access_token=access_token,
-            )
-    else:
-        log.info("🌊 Semantic path triggered: Query routed to vector search.")
-        chunks = retrieve_chunks(
-            query,
-            k=k,
-            source_file=source_file,
-            category=category,
-            alpha=alpha,
-            session_id=session_id,
-            access_token=access_token,
-        )
     # 2. Retrieve Episodic Memory (Semantic Search)
     past_memories = []

 """
+Core ingestion, retrieval, generation, memory, and tree-search pipeline.
+This module backs the FastAPI routes and Celery ingestion worker for Morpheus.
 """
 import os
     file_path: str,
     file_hash: str,
     graph_data: DocumentGraphMetadata,
+    user_id: str,
     pdf_images=None,
 ) -> tuple[List[Document], List[str]]:
     """Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
         )
         docs.append(doc)
+        unique_string = f"{user_id}:{file_hash}:chunk:{i}"
         chunk_id = str(uuid.uuid5(NAMESPACE, unique_string))
         ids.append(chunk_id)
 def build_raptor_tree(
+    leaf_docs: List[Document], leaf_ids: List[str], user_id: str
 ) -> tuple[List[Document], List[str]]:
     """
     RAPTOR implementation: recursively clusters documents and generates
             # Generate deterministic ID for the parent
             import hashlib
+            child_node_ids = [
+                str(c.metadata.get("node_id", "")) for c in cluster if c.metadata.get("node_id")
+            ]
             parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
+            base_meta = cluster[0].metadata
             parent_id = str(
+                uuid.uuid5(
+                    uuid.NAMESPACE_DNS,
+                    (
+                        f"{user_id}:raptor:{base_meta.get('file_hash', '')}:"
+                        f"{current_level}:{'|'.join(child_node_ids)}:{parent_hash}"
+                    ),
+                )
             )
             # Create the parent document
             # Inherit metadata from the first child (source array, file hash, document type)
             # Gather all unique page numbers from children
             all_pages = set()
             for c in cluster:
         log.warning("Could not register in ingested_files: %s", exc)
 def _apply_category_override(
     file_hash: str, new_category: str, access_token: str = None
 ) -> None:
     if already_exists and not force:
         log.info("SKIPPING — already ingested.")
         return "already_ingested"
+    # NEW: Check if user has previously overridden the category for this file.
+    # If so, skip the classifier and use their choice directly.
+    forced_category = None
+    if already_exists or force:
+        try:
+            _sb = _build_supabase_client(access_token)
+            _existing = (
+                _sb.table("ingested_files")
+                .select("document_type, user_overridden")
+                .eq("file_hash", file_hash)
+                .limit(1)
+                .execute()
+            )
+            if _existing.data and _existing.data[0].get("user_overridden"):
+                forced_category = _existing.data[0]["document_type"]
+                log.info(
+                    "User override active — forcing category '%s', skipping classifier.",
+                    forced_category,
+                )
+        except Exception as _exc:
+            log.warning("Could not check user override: %s", _exc)
     # 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
     # RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
     if already_exists or force:
             "user_id", user_id
         ).contains("metadata", {"file_hash": file_hash}).execute()
         # 2. Clear the registry
+        supabase.table("ingested_files").delete().eq("user_id", user_id).eq(
+            "file_hash", file_hash
+        ).execute()
         # 3. Clear the tree if it exists
+        supabase.table("document_trees").delete().eq("user_id", user_id).eq(
+            "file_hash", file_hash
+        ).execute()
     _progress(2, "Partitioning PDF (OCR + layout detection)…")
     elements = partition_document(pdf_path)
         doc_tree = _build_document_tree(elements)
         sb = _build_service_supabase_client()
+        sb.table("document_trees").upsert(
+            {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
+            on_conflict="user_id,file_hash",
         ).execute()
         log.info("✅ PageIndex tree saved to Supabase.")
     except Exception as e:
     else:
         pdf_path_for_naming = pdf_path
     docs, ids = process_chunks(
+        chunks, elements, pdf_path_for_naming, file_hash, graph_data, user_id, pdf_images
     )
     # --- NATIVE RAPTOR INDEXING ---
     _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
+    docs, ids = build_raptor_tree(docs, ids, user_id)
     smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
     if export_json:
                 "document_type": graph_data.document_type,
                 "chunk_count": len(docs),
             },
+            on_conflict="user_id,file_hash",
         ).execute()
     except Exception as e:
         log.error("Failed to register file: %s", e)
                 max_per_source=max_per_source,
             )
+        docs = []
+        for c in diverse:
+            meta = dict(c.get("metadata", {}) or {})
+            if c.get("id") is not None:
+                meta["id"] = str(c["id"])
+            docs.append(Document(page_content=c["content"], metadata=meta))
         log.info(
             "Dropped %d low-relevance/duplicate chunks.",
             len(all_candidates) - len(docs),
         log.error("Answer generation failed: %s", exc)
         return f"Failed to generate answer: {exc}", []
 async def generate_answer_stream(
     chunks: List[Document],
     query: str,
             {
                 "query_embedding": query_vector,
                 "match_session_id": session_id,
                 "match_count": 1,
             },
         ).execute()
         # If we found a nearly identical query in this session, skip saving!
+        if dup_check.data and float(dup_check.data[0].get("similarity", 0.0)) >= 0.98:
             log.info(
                 "🧠 Memory Bouncer: Duplicate query detected in session %s. Skipping save.",
                 session_id[:8],
     return False
+def tree_search(
+    query: str, access_token: str = None, category: str = None
+) -> List[Document]:
     """
     Navigates the structural JSON trees in Supabase to answer highly specific
     'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
         if not res.data:
             return []
+        allowed_hashes = None
+        if category and category != "All":
+            try:
+                allowed_res = (
+                    sb.table("ingested_files")
+                    .select("file_hash")
+                    .eq("document_type", category)
+                    .execute()
+                )
+                allowed_hashes = {
+                    row.get("file_hash") for row in (allowed_res.data or []) if row.get("file_hash")
+                }
+            except Exception as exc:
+                log.warning("Could not apply tree-search category filter: %s", exc)
         matched_chunks = []
         # 3. Recursive Tree Traversal
         # 4. Execute traversal across all trees
         for tree_row in res.data:
+            if allowed_hashes is not None and tree_row.get("file_hash") not in allowed_hashes:
+                continue
             _traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
         log.info(
         return []
+def retrieve_chunks_routed(
+    query: str,
+    k: int = 3,
+    source_file: str = None,
+    category: str = None,
+    alpha: float = 0.5,
+    session_id: str = "default_session",
+    access_token: str = None,
+    user_id: str = None,
+    original_query: str = None,
+    eval_mode: bool = False,
+) -> List[Document]:
+    """
+    Live request-path retrieval entrypoint.
+    Routes structural queries to the tree index first, then falls back to vector retrieval.
+    """
+    routing_query = (original_query or query or "").strip()
+    if routing_query and _should_use_tree_path(routing_query):
+        log.info("🎯 PageIndex triggered: query routed to structural tree path.")
+        tree_chunks = tree_search(
+            routing_query, access_token=access_token, category=category
+        )
+        if tree_chunks:
+            if session_id:
+                session_key = _session_cache_key(session_id, user_id=user_id)
+                with _last_chunks_lock:
+                    _last_chunks[session_key] = tree_chunks
+            return tree_chunks
+        log.info("Tree search yielded 0 results. Falling back to vector search.")
+    return retrieve_chunks(
+        query,
+        k=k,
+        source_file=source_file,
+        category=category,
+        alpha=alpha,
+        session_id=session_id,
+        access_token=access_token,
+        user_id=user_id,
+        original_query=original_query,
+        eval_mode=eval_mode,
+    )
 def run_query(
     query: str,
     k: int = 3,
 ) -> Tuple[str, List[str]]:
     # 1. Document Retrieval (Routed)
+    chunks = retrieve_chunks_routed(
+        query,
+        k=k,
+        source_file=source_file,
+        category=category,
+        alpha=alpha,
+        session_id=session_id,
+        access_token=access_token,
+        original_query=query,
+    )
     # 2. Retrieve Episodic Memory (Semantic Search)
     past_memories = []

frontend/js/chat.js CHANGED Viewed

@@ -203,6 +203,16 @@ async function sendChat() {
         el.scrollTop = el.scrollHeight;
       },
       onError(errMsg) {
         bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
       },
     });
@@ -306,4 +316,4 @@ function handleChatKey(e) {
 function autoResize(el) {
   el.style.height = 'auto';
   el.style.height = Math.min(el.scrollHeight, 120) + 'px';
-}

         el.scrollTop = el.scrollHeight;
       },
       onError(errMsg) {
+        if (fullText.trim()) {
+          if (!assistantDiv.querySelector('.stream-error-note')) {
+            const note = document.createElement('div');
+            note.className = 'stream-error-note';
+            note.innerHTML = `<p class="msg-p" style="color:var(--red);margin-top:10px">${esc(errMsg)}</p>`;
+            assistantDiv.appendChild(note);
+          }
+          toast(errMsg, 'error');
+          return;
+        }
         bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
       },
     });
 function autoResize(el) {
   el.style.height = 'auto';
   el.style.height = Math.min(el.scrollHeight, 120) + 'px';
+}

supabase/migrations/0004_hierarchical_nodes.sql CHANGED Viewed

@@ -22,7 +22,7 @@ CREATE OR REPLACE FUNCTION public.insert_document_chunk(
     p_id uuid,
     p_content text,
     p_metadata jsonb,
-    p_embedding public.vector,
     p_user_id uuid,
     p_node_type text DEFAULT 'leaf',
     p_parent_node_id uuid DEFAULT NULL,
@@ -58,7 +58,7 @@ $$;
 CREATE OR REPLACE FUNCTION public.hybrid_search(
     query_text text,
-    query_embedding public.vector,
     match_count integer DEFAULT 10,
     filter jsonb DEFAULT '{}'::jsonb,
     semantic_weight double precision DEFAULT 0.7,
@@ -72,10 +72,10 @@ begin
     semantic as (
         select
             d.id, d.content, d.metadata,
-            (1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as score
         from documents d
         where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
-        order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
         limit match_count * 3
     ),
     keyword as (

     p_id uuid,
     p_content text,
     p_metadata jsonb,
+    p_embedding extensions.vector,
     p_user_id uuid,
     p_node_type text DEFAULT 'leaf',
     p_parent_node_id uuid DEFAULT NULL,
 CREATE OR REPLACE FUNCTION public.hybrid_search(
     query_text text,
+    query_embedding extensions.vector,
     match_count integer DEFAULT 10,
     filter jsonb DEFAULT '{}'::jsonb,
     semantic_weight double precision DEFAULT 0.7,
     semantic as (
         select
             d.id, d.content, d.metadata,
+            (1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as score
         from documents d
         where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
+        order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
         limit match_count * 3
     ),
     keyword as (

supabase/migrations/0006_multi_tenant_file_uniqueness.sql ADDED Viewed

	@@ -0,0 +1,23 @@

+-- Migration 0006: Tenant-safe duplicate file support
+--
+-- The application now derives document node IDs from (user_id + file_hash + chunk),
+-- so duplicate PDFs across tenants no longer collide in public.documents.
+-- This migration fixes the remaining per-file tables that still used global
+-- file_hash uniqueness.
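+-- Allow the same file_hash to exist for multiple tenants in ingested_files.
+-- NOTE(review): a plain unique index treats NULL user_id values as distinct,
+-- so rows without a user_id could still repeat a file_hash. Confirm that
+-- ingested_files.user_id is NOT NULL (or always populated) before relying on this.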
+ALTER TABLE public.ingested_files
+  DROP CONSTRAINT IF EXISTS ingested_files_file_hash_key;
+CREATE UNIQUE INDEX IF NOT EXISTS ingested_files_user_file_hash_uidx
+  ON public.ingested_files (user_id, file_hash);
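+-- Allow the same file_hash to exist for multiple tenants in document_trees.
+-- NOTE(review): document_trees_pkey was PRIMARY KEY (file_hash). After the drop
+-- below, this migration does not add a replacement primary key; uniqueness is
+-- enforced only by the (user_id, file_hash) unique index. Confirm that is
+-- intended, and that the index name matches schema_backup.sql, which records a
+-- UNIQUE constraint named document_trees_user_file_hash_key instead.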
+ALTER TABLE public.document_trees
+  DROP CONSTRAINT IF EXISTS document_trees_pkey;
+CREATE UNIQUE INDEX IF NOT EXISTS document_trees_user_file_hash_uidx
+  ON public.document_trees (user_id, file_hash);
+CREATE INDEX IF NOT EXISTS document_trees_user_id_idx
+  ON public.document_trees (user_id);

supabase/schema_backup.sql CHANGED Viewed

@@ -1,8 +1,6 @@
---
--- PostgreSQL database dump
---
-\restrict D5DLkgreJkSzmm3K8XWpdjuj7WrATcgxPzakxedCgOsd9eMGt0ScgsbXbeWVrAx
 -- Dumped from database version 17.6
 -- Dumped by pg_dump version 18.3
@@ -23,7 +21,7 @@ SET row_security = off;
 -- Name: public; Type: SCHEMA; Schema: -; Owner: -
 --
-CREATE SCHEMA public;
 --
@@ -77,10 +75,10 @@ $$;
 --
--- Name: hybrid_search(text, public.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
 --
-CREATE FUNCTION public.hybrid_search(query_text text, query_embedding public.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
     LANGUAGE plpgsql
     AS $$
 begin
@@ -89,10 +87,10 @@ begin
     semantic as (
         select
             d.id, d.content, d.metadata,
-            (1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as score
         from documents d
         where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
-        order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
         limit match_count * 3
     ),
     keyword as (
@@ -137,10 +135,10 @@ $$;
 --
--- Name: insert_document_chunk(uuid, text, jsonb, public.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
 --
-CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding public.vector, p_user_id uuid) RETURNS void
     LANGUAGE plpgsql SECURITY DEFINER
     AS $$
 BEGIN
@@ -155,10 +153,10 @@ $$;
 --
--- Name: insert_document_chunk(uuid, text, jsonb, public.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
 --
-CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding public.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
     LANGUAGE plpgsql SECURITY DEFINER
     AS $$
 BEGIN
@@ -182,10 +180,10 @@ $$;
 --
--- Name: match_documents(public.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
 --
-CREATE FUNCTION public.match_documents(query_embedding public.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
     LANGUAGE plpgsql
     AS $$
 begin
@@ -194,20 +192,20 @@ begin
         d.id,
         d.content,
         d.metadata,
-        (1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as similarity
     from documents d
     where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
-    order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
     limit match_count;
 end;
 $$;
 --
--- Name: match_memory(public.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
 --
-CREATE FUNCTION public.match_memory(query_embedding public.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
     LANGUAGE plpgsql
     AS $$
 BEGIN
@@ -266,7 +264,7 @@ CREATE TABLE public.chat_memory (
     session_id text NOT NULL,
     role text NOT NULL,
     content text NOT NULL,
-    embedding public.vector(2048),
     created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
     user_id uuid DEFAULT auth.uid()
 );
@@ -292,7 +290,7 @@ CREATE TABLE public.documents (
     id uuid DEFAULT gen_random_uuid() NOT NULL,
     content text,
     metadata jsonb,
-    embedding public.vector(2048),
     user_id uuid DEFAULT auth.uid(),
     node_type text DEFAULT 'leaf'::text,
     parent_node_id uuid,
@@ -512,11 +510,11 @@ ALTER TABLE ONLY public.chat_memory
 --
--- Name: document_trees document_trees_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-ALTER TABLE ONLY public.document_trees
-    ADD CONSTRAINT document_trees_pkey PRIMARY KEY (file_hash);
 --
@@ -536,11 +534,11 @@ ALTER TABLE ONLY public.evaluation_logs
 --
--- Name: ingested_files ingested_files_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
---
-ALTER TABLE ONLY public.ingested_files
-    ADD CONSTRAINT ingested_files_file_hash_key UNIQUE (file_hash);
 --
@@ -621,7 +619,7 @@ CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvecto
 -- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
 --
-CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::public.halfvec(2048)) public.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
 --
@@ -939,9 +937,7 @@ ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
 CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
---
--- PostgreSQL database dump complete
---
-\unrestrict D5DLkgreJkSzmm3K8XWpdjuj7WrATcgxPzakxedCgOsd9eMGt0ScgsbXbeWVrAx

+--
+-- PostgreSQL database dump
+--
 -- Dumped from database version 17.6
 -- Dumped by pg_dump version 18.3
 -- Name: public; Type: SCHEMA; Schema: -; Owner: -
 --
+CREATE SCHEMA IF NOT EXISTS public;
 --
 --
+-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
 --
+CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
     LANGUAGE plpgsql
     AS $$
 begin
     semantic as (
         select
             d.id, d.content, d.metadata,
+            (1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as score
         from documents d
         where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
+        order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
         limit match_count * 3
     ),
     keyword as (
 --
+-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
 --
+CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
     LANGUAGE plpgsql SECURITY DEFINER
     AS $$
 BEGIN
 --
+-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
 --
+CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
     LANGUAGE plpgsql SECURITY DEFINER
     AS $$
 BEGIN
 --
+-- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
 --
+CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
     LANGUAGE plpgsql
     AS $$
 begin
         d.id,
         d.content,
         d.metadata,
+          (1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as similarity
     from documents d
     where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
+      order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
     limit match_count;
 end;
 $$;
 --
+-- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
 --
+CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
     LANGUAGE plpgsql
     AS $$
 BEGIN
     session_id text NOT NULL,
     role text NOT NULL,
     content text NOT NULL,
+    embedding extensions.vector(2048),
     created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
     user_id uuid DEFAULT auth.uid()
 );
     id uuid DEFAULT gen_random_uuid() NOT NULL,
     content text,
     metadata jsonb,
+    embedding extensions.vector(2048),
     user_id uuid DEFAULT auth.uid(),
     node_type text DEFAULT 'leaf'::text,
     parent_node_id uuid,
 --
+-- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+ALTER TABLE ONLY public.document_trees
+    ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
 --
 --
+-- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+ALTER TABLE ONLY public.ingested_files
+    ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
 --
 -- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
 --
+CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
 --
 CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
+--
+-- PostgreSQL database dump complete
+--