Spaces:
Running
Running
nothex commited on
Commit ·
ff7e045
1
Parent(s): 9f2ae84
fix: harden supabase setup and clean stale retrieval/ingestion code
Browse files- .env.example +2 -1
- .github/workflows/smoke.yml +3 -3
- README.md +11 -4
- backend/api/query.py +55 -22
- backend/core/classifier.py +3 -11
- backend/core/pipeline.py +133 -102
- frontend/js/chat.js +11 -1
- supabase/migrations/0004_hierarchical_nodes.sql +4 -4
- supabase/migrations/0006_multi_tenant_file_uniqueness.sql +23 -0
- supabase/schema_backup.sql +34 -38
.env.example
CHANGED
|
@@ -10,6 +10,7 @@ GEMINI_API_KEY=****
|
|
| 10 |
SUPABASE_URL=https://example.supabase.co
|
| 11 |
SUPABASE_SERVICE_KEY=****
|
| 12 |
SUPABASE_ANON_KEY=****
|
|
|
|
| 13 |
|
| 14 |
# Cohere
|
| 15 |
COHERE_API_KEY=****
|
|
@@ -23,7 +24,7 @@ REDIS_URL=redis://localhost:6379/0
|
|
| 23 |
MASTER_ADMIN_KEY=****
|
| 24 |
|
| 25 |
# Local dev: ALLOWED_ORIGINS=*
|
| 26 |
-
# Production: ALLOWED_ORIGINS=https://your-
|
| 27 |
ALLOWED_ORIGINS=*
|
| 28 |
|
| 29 |
# Docs enabled in dev, disabled in prod
|
|
|
|
| 10 |
SUPABASE_URL=https://example.supabase.co
|
| 11 |
SUPABASE_SERVICE_KEY=****
|
| 12 |
SUPABASE_ANON_KEY=****
|
| 13 |
+
SUPABASE_JWT_SECRET=****
|
| 14 |
|
| 15 |
# Cohere
|
| 16 |
COHERE_API_KEY=****
|
|
|
|
| 24 |
MASTER_ADMIN_KEY=****
|
| 25 |
|
| 26 |
# Local dev: ALLOWED_ORIGINS=*
|
| 27 |
+
# Production: ALLOWED_ORIGINS=https://your-morpheus.vercel.app
|
| 28 |
ALLOWED_ORIGINS=*
|
| 29 |
|
| 30 |
# Docs enabled in dev, disabled in prod
|
.github/workflows/smoke.yml
CHANGED
|
@@ -25,13 +25,13 @@ jobs:
|
|
| 25 |
|
| 26 |
- name: Build assets (light mode)
|
| 27 |
env:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
run: |
|
| 31 |
python -m backend.core.build_ml_assets
|
| 32 |
|
| 33 |
- name: Intent classifier smoke predict
|
| 34 |
env:
|
| 35 |
-
|
| 36 |
run: |
|
| 37 |
python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"
|
|
|
|
| 25 |
|
| 26 |
- name: Build assets (light mode)
|
| 27 |
env:
|
| 28 |
+
MORPHEUS_DISABLE_INTENT_BOOTSTRAP: "true"
|
| 29 |
+
MORPHEUS_BUILD_ASSETS_MODE: "light"
|
| 30 |
run: |
|
| 31 |
python -m backend.core.build_ml_assets
|
| 32 |
|
| 33 |
- name: Intent classifier smoke predict
|
| 34 |
env:
|
| 35 |
+
MORPHEUS_DISABLE_INTENT_BOOTSTRAP: "true"
|
| 36 |
run: |
|
| 37 |
python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"
|
README.md
CHANGED
|
@@ -99,9 +99,16 @@ pip install -r requirements.txt
|
|
| 99 |
### 3. Set up Supabase
|
| 100 |
|
| 101 |
1. Create a project at [supabase.com](https://supabase.com)
|
| 102 |
-
2.
|
| 103 |
-
3.
|
| 104 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
### 4. Configure environment
|
| 107 |
|
|
@@ -235,4 +242,4 @@ The architecture is designed for independent layer scaling:
|
|
| 235 |
- **Query concurrency**: Increase Gunicorn workers in `render.yaml` from `-w 1` to `-w 2` or more when moving off the free tier.
|
| 236 |
- **Embedding cost**: Swap `EMBEDDING_MODEL` in `config.py` — the `FallbackEmbeddings` wrapper handles the rest.
|
| 237 |
- **LLM cost**: All model lists live in `config.py`. Replace free OpenRouter models with paid ones for higher rate limits.
|
| 238 |
-
- **New document types**: The document classifier learns them automatically. Run warmup after your first example of each type.
|
|
|
|
| 99 |
### 3. Set up Supabase
|
| 100 |
|
| 101 |
1. Create a project at [supabase.com](https://supabase.com)
|
| 102 |
+
2. Enable the `vector`, `uuid-ossp`, and `pgcrypto` extensions first (Database → Extensions)
|
| 103 |
+
3. Go to the SQL editor and run `supabase/schema_backup.sql` in full
|
| 104 |
+
4. Create two Storage buckets:
|
| 105 |
+
- `rag-images` as a public bucket
|
| 106 |
+
- `rag-models` as a private bucket
|
| 107 |
+
5. Optional but recommended: run `supabase/rls/rpc_security_rls_audit.sql`
|
| 108 |
+
|
| 109 |
+
For an existing older database, apply the files in `supabase/migrations/` and
|
| 110 |
+
the scripts in `supabase/rls/` as incremental upgrade helpers. For a brand-new
|
| 111 |
+
project, `schema_backup.sql` is already the full source of truth.
|
| 112 |
|
| 113 |
### 4. Configure environment
|
| 114 |
|
|
|
|
| 242 |
- **Query concurrency**: Increase Gunicorn workers in `render.yaml` from `-w 1` to `-w 2` or more when moving off the free tier.
|
| 243 |
- **Embedding cost**: Swap `EMBEDDING_MODEL` in `config.py` — the `FallbackEmbeddings` wrapper handles the rest.
|
| 244 |
- **LLM cost**: All model lists live in `config.py`. Replace free OpenRouter models with paid ones for higher rate limits.
|
| 245 |
+
- **New document types**: The document classifier learns them automatically. Run warmup after your first example of each type.
|
backend/api/query.py
CHANGED
|
@@ -5,7 +5,11 @@ import asyncio
|
|
| 5 |
from fastapi import APIRouter, Header, Depends, Request
|
| 6 |
from fastapi.responses import StreamingResponse
|
| 7 |
from shared.types import QueryRequest, SourceChunk
|
| 8 |
-
from backend.core.pipeline import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from backend.core.auth_utils import require_auth_token
|
| 10 |
from backend.main import limiter
|
| 11 |
|
|
@@ -13,6 +17,49 @@ log = logging.getLogger("morpheus.api.query")
|
|
| 13 |
router = APIRouter()
|
| 14 |
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
@router.post("")
|
| 17 |
@limiter.limit("60/hour")
|
| 18 |
async def query(
|
|
@@ -60,7 +107,7 @@ async def query(
|
|
| 60 |
loop = asyncio.get_event_loop()
|
| 61 |
chunks = await loop.run_in_executor(
|
| 62 |
None,
|
| 63 |
-
lambda:
|
| 64 |
effective_query,
|
| 65 |
k=req.k,
|
| 66 |
category=category,
|
|
@@ -75,6 +122,7 @@ async def query(
|
|
| 75 |
|
| 76 |
# ── Step 3: Stream answer tokens ──────────────────────────────────
|
| 77 |
images = []
|
|
|
|
| 78 |
# 🚀 Define the boolean once for readability
|
| 79 |
is_eval = x_eval_mode == "true"
|
| 80 |
async for event in generate_answer_stream(
|
|
@@ -90,27 +138,12 @@ async def query(
|
|
| 90 |
yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
|
| 91 |
elif event["type"] == "done":
|
| 92 |
images = event.get("images", [])
|
|
|
|
| 93 |
|
| 94 |
# ── Step 4: Emit sources + images ─────────────────────────────────
|
| 95 |
-
sources =
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
orig = meta.get("original_content", "{}")
|
| 99 |
-
if isinstance(orig, str):
|
| 100 |
-
try: orig = json.loads(orig) # noqa: E701
|
| 101 |
-
except: orig = {} # noqa: E701, E722
|
| 102 |
-
full_text = orig.get("raw_text") or chunk.page_content
|
| 103 |
-
snippet_text = full_text if x_eval_mode == "true" else full_text[:200]
|
| 104 |
-
sources.append(
|
| 105 |
-
SourceChunk(
|
| 106 |
-
source=meta.get("source", "Unknown"),
|
| 107 |
-
score=meta.get("relevance_score"),
|
| 108 |
-
chunk=meta.get("chunk_index"),
|
| 109 |
-
snippet=snippet_text,
|
| 110 |
-
doc_type=meta.get("document_type"),
|
| 111 |
-
pages=meta.get("page_numbers"),
|
| 112 |
-
).dict()
|
| 113 |
-
)
|
| 114 |
|
| 115 |
yield "data: " + json.dumps({
|
| 116 |
"type": "done",
|
|
@@ -144,4 +177,4 @@ async def query(
|
|
| 144 |
"X-Accel-Buffering": "no",
|
| 145 |
"Access-Control-Allow-Origin": "*",
|
| 146 |
}
|
| 147 |
-
)
|
|
|
|
| 5 |
from fastapi import APIRouter, Header, Depends, Request
|
| 6 |
from fastapi.responses import StreamingResponse
|
| 7 |
from shared.types import QueryRequest, SourceChunk
|
| 8 |
+
from backend.core.pipeline import (
|
| 9 |
+
retrieve_chunks_routed,
|
| 10 |
+
generate_answer_stream,
|
| 11 |
+
analyse_intent,
|
| 12 |
+
)
|
| 13 |
from backend.core.auth_utils import require_auth_token
|
| 14 |
from backend.main import limiter
|
| 15 |
|
|
|
|
| 17 |
router = APIRouter()
|
| 18 |
|
| 19 |
|
| 20 |
+
def _normalise_original_content(raw):
|
| 21 |
+
"""Best-effort decode for metadata that may already be dict or JSON string."""
|
| 22 |
+
if isinstance(raw, dict):
|
| 23 |
+
return raw
|
| 24 |
+
if isinstance(raw, str):
|
| 25 |
+
candidate = raw
|
| 26 |
+
for _ in range(2):
|
| 27 |
+
try:
|
| 28 |
+
candidate = json.loads(candidate)
|
| 29 |
+
except Exception:
|
| 30 |
+
return {}
|
| 31 |
+
if isinstance(candidate, dict):
|
| 32 |
+
return candidate
|
| 33 |
+
if not isinstance(candidate, str):
|
| 34 |
+
break
|
| 35 |
+
return {}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _build_sources_from_chunks(chunks, include_full_text: bool = False):
|
| 39 |
+
sources = []
|
| 40 |
+
for chunk in chunks:
|
| 41 |
+
try:
|
| 42 |
+
if getattr(chunk, "page_content", None) == "__CACHE_HIT__":
|
| 43 |
+
continue
|
| 44 |
+
meta = getattr(chunk, "metadata", {}) or {}
|
| 45 |
+
orig = _normalise_original_content(meta.get("original_content", {}))
|
| 46 |
+
full_text = orig.get("raw_text") or chunk.page_content
|
| 47 |
+
snippet_text = full_text if include_full_text else full_text[:200]
|
| 48 |
+
sources.append(
|
| 49 |
+
SourceChunk(
|
| 50 |
+
source=meta.get("source", "Unknown"),
|
| 51 |
+
score=meta.get("relevance_score"),
|
| 52 |
+
chunk=meta.get("chunk_index"),
|
| 53 |
+
snippet=snippet_text,
|
| 54 |
+
doc_type=meta.get("document_type"),
|
| 55 |
+
pages=meta.get("page_numbers"),
|
| 56 |
+
).dict()
|
| 57 |
+
)
|
| 58 |
+
except Exception as exc:
|
| 59 |
+
log.warning("Skipping source serialization for chunk: %s", exc)
|
| 60 |
+
return sources
|
| 61 |
+
|
| 62 |
+
|
| 63 |
@router.post("")
|
| 64 |
@limiter.limit("60/hour")
|
| 65 |
async def query(
|
|
|
|
| 107 |
loop = asyncio.get_event_loop()
|
| 108 |
chunks = await loop.run_in_executor(
|
| 109 |
None,
|
| 110 |
+
lambda: retrieve_chunks_routed(
|
| 111 |
effective_query,
|
| 112 |
k=req.k,
|
| 113 |
category=category,
|
|
|
|
| 122 |
|
| 123 |
# ── Step 3: Stream answer tokens ──────────────────────────────────
|
| 124 |
images = []
|
| 125 |
+
done_sources = []
|
| 126 |
# 🚀 Define the boolean once for readability
|
| 127 |
is_eval = x_eval_mode == "true"
|
| 128 |
async for event in generate_answer_stream(
|
|
|
|
| 138 |
yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
|
| 139 |
elif event["type"] == "done":
|
| 140 |
images = event.get("images", [])
|
| 141 |
+
done_sources = event.get("sources", []) or []
|
| 142 |
|
| 143 |
# ── Step 4: Emit sources + images ─────────────────────────────────
|
| 144 |
+
sources = done_sources or _build_sources_from_chunks(
|
| 145 |
+
chunks, include_full_text=is_eval
|
| 146 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
yield "data: " + json.dumps({
|
| 149 |
"type": "done",
|
|
|
|
| 177 |
"X-Accel-Buffering": "no",
|
| 178 |
"Access-Control-Allow-Origin": "*",
|
| 179 |
}
|
| 180 |
+
)
|
backend/core/classifier.py
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
-
# PASTE classifier.py HERE
|
| 2 |
-
# Fix: 'import config' -> 'from backend.core import config'
|
| 3 |
"""
|
| 4 |
-
|
| 5 |
-
==========================================================
|
| 6 |
|
| 7 |
3-stage cascade:
|
| 8 |
Stage 1: Embedding nearest-centroid (cosine similarity, no API calls after warmup)
|
|
@@ -15,14 +12,9 @@ before the normal pipeline (catches periodic tables, reference charts etc.)
|
|
| 15 |
Each stage only activates if the previous stage's confidence is below its threshold.
|
| 16 |
Centroid embeddings are persisted to Supabase so they survive restarts.
|
| 17 |
|
| 18 |
-
|
| 19 |
-
from classifier import DocumentClassifier
|
| 20 |
clf = DocumentClassifier()
|
| 21 |
result = clf.classify(sample_text, elements)
|
| 22 |
-
# result.document_type → "machine_learning_paper"
|
| 23 |
-
# result.confidence → 0.91
|
| 24 |
-
# result.stage_used → "centroid"
|
| 25 |
-
# result.is_new_type → False
|
| 26 |
"""
|
| 27 |
|
| 28 |
import re
|
|
@@ -672,4 +664,4 @@ JSON:"""
|
|
| 672 |
if score > best_score:
|
| 673 |
best_score = score
|
| 674 |
best_type = cat
|
| 675 |
-
return (best_type, best_score) if best_type else (None, None)
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Hierarchical ensemble document classifier.
|
|
|
|
| 3 |
|
| 4 |
3-stage cascade:
|
| 5 |
Stage 1: Embedding nearest-centroid (cosine similarity, no API calls after warmup)
|
|
|
|
| 12 |
Each stage only activates if the previous stage's confidence is below its threshold.
|
| 13 |
Centroid embeddings are persisted to Supabase so they survive restarts.
|
| 14 |
|
| 15 |
+
Typical usage:
|
|
|
|
| 16 |
clf = DocumentClassifier()
|
| 17 |
result = clf.classify(sample_text, elements)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
import re
|
|
|
|
| 664 |
if score > best_score:
|
| 665 |
best_score = score
|
| 666 |
best_type = cat
|
| 667 |
+
return (best_type, best_score) if best_type else (None, None)
|
backend/core/pipeline.py
CHANGED
|
@@ -1,24 +1,7 @@
|
|
| 1 |
-
# PASTE cl.py HERE
|
| 2 |
-
# Fix: 'import config' -> 'from backend.core import config'
|
| 3 |
-
# Fix: 'from classifier import DocumentClassifier' -> 'from backend.core.classifier import DocumentClassifier'
|
| 4 |
"""
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
python cl.py --ingest --pdf path/to/file.pdf
|
| 9 |
-
python cl.py --ingest --pdf file.pdf --force
|
| 10 |
-
python cl.py --query "your question here"
|
| 11 |
-
python cl.py --ingest --pdf file.pdf --export
|
| 12 |
-
|
| 13 |
-
Improvements in this version:
|
| 14 |
-
- FIX: process_chunks now uses graph_data.document_type (not .categories[0])
|
| 15 |
-
- FIX: is_file_already_ingested now hits ingested_files registry table (O(1))
|
| 16 |
-
- NEW: In-memory embedding cache for repeated queries (thread-safe LRU via functools)
|
| 17 |
-
- NEW: ingested_files registry insert after successful upload
|
| 18 |
-
- NEW: relevance_score surfaced in metadata for UI badge display
|
| 19 |
-
- NEW: Source deduplication by chunk content hash (prevents near-duplicate passages)
|
| 20 |
-
- NEW: Graceful empty-query guard in generate_sub_queries
|
| 21 |
-
- NEW: MMR-style post-rerank diversity filter to stop one source dominating
|
| 22 |
"""
|
| 23 |
|
| 24 |
import os
|
|
@@ -631,6 +614,7 @@ def process_chunks(
|
|
| 631 |
file_path: str,
|
| 632 |
file_hash: str,
|
| 633 |
graph_data: DocumentGraphMetadata,
|
|
|
|
| 634 |
pdf_images=None,
|
| 635 |
) -> tuple[List[Document], List[str]]:
|
| 636 |
"""Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
|
|
@@ -718,7 +702,7 @@ def process_chunks(
|
|
| 718 |
)
|
| 719 |
docs.append(doc)
|
| 720 |
|
| 721 |
-
unique_string = f"{file_hash}
|
| 722 |
chunk_id = str(uuid.uuid5(NAMESPACE, unique_string))
|
| 723 |
ids.append(chunk_id)
|
| 724 |
|
|
@@ -727,7 +711,7 @@ def process_chunks(
|
|
| 727 |
|
| 728 |
|
| 729 |
def build_raptor_tree(
|
| 730 |
-
leaf_docs: List[Document], leaf_ids: List[str]
|
| 731 |
) -> tuple[List[Document], List[str]]:
|
| 732 |
"""
|
| 733 |
RAPTOR implementation: recursively clusters documents and generates
|
|
@@ -801,15 +785,23 @@ def build_raptor_tree(
|
|
| 801 |
# Generate deterministic ID for the parent
|
| 802 |
import hashlib
|
| 803 |
|
|
|
|
|
|
|
|
|
|
| 804 |
parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
|
|
|
|
| 805 |
parent_id = str(
|
| 806 |
-
uuid.uuid5(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
)
|
| 808 |
|
| 809 |
# Create the parent document
|
| 810 |
# Inherit metadata from the first child (source array, file hash, document type)
|
| 811 |
-
base_meta = cluster[0].metadata
|
| 812 |
-
|
| 813 |
# Gather all unique page numbers from children
|
| 814 |
all_pages = set()
|
| 815 |
for c in cluster:
|
|
@@ -925,12 +917,6 @@ def _register_ingested_file(
|
|
| 925 |
log.warning("Could not register in ingested_files: %s", exc)
|
| 926 |
|
| 927 |
|
| 928 |
-
# =========================================================================== #
|
| 929 |
-
# ADD THESE TWO FUNCTIONS TO cl.py #
|
| 930 |
-
# Place them right after _register_ingested_file() #
|
| 931 |
-
# =========================================================================== #
|
| 932 |
-
|
| 933 |
-
|
| 934 |
def _apply_category_override(
|
| 935 |
file_hash: str, new_category: str, access_token: str = None
|
| 936 |
) -> None:
|
|
@@ -1265,7 +1251,29 @@ def run_ingestion(
|
|
| 1265 |
if already_exists and not force:
|
| 1266 |
log.info("SKIPPING — already ingested.")
|
| 1267 |
return "already_ingested"
|
| 1268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1269 |
# 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
|
| 1270 |
# RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
|
| 1271 |
if already_exists or force:
|
|
@@ -1276,30 +1284,13 @@ def run_ingestion(
|
|
| 1276 |
"user_id", user_id
|
| 1277 |
).contains("metadata", {"file_hash": file_hash}).execute()
|
| 1278 |
# 2. Clear the registry
|
| 1279 |
-
supabase.table("ingested_files").delete().eq("
|
|
|
|
|
|
|
| 1280 |
# 3. Clear the tree if it exists
|
| 1281 |
-
supabase.table("document_trees").delete().eq("
|
| 1282 |
-
|
| 1283 |
-
|
| 1284 |
-
# If so, skip the classifier and use their choice directly.
|
| 1285 |
-
forced_category = None
|
| 1286 |
-
try:
|
| 1287 |
-
_sb = _build_supabase_client(access_token)
|
| 1288 |
-
_existing = (
|
| 1289 |
-
_sb.table("ingested_files")
|
| 1290 |
-
.select("document_type, user_overridden")
|
| 1291 |
-
.eq("file_hash", file_hash)
|
| 1292 |
-
.limit(1)
|
| 1293 |
-
.execute()
|
| 1294 |
-
)
|
| 1295 |
-
if _existing.data and _existing.data[0].get("user_overridden"):
|
| 1296 |
-
forced_category = _existing.data[0]["document_type"]
|
| 1297 |
-
log.info(
|
| 1298 |
-
"User override active — forcing category '%s', skipping classifier.",
|
| 1299 |
-
forced_category,
|
| 1300 |
-
)
|
| 1301 |
-
except Exception as _exc:
|
| 1302 |
-
log.warning("Could not check user override: %s", _exc)
|
| 1303 |
|
| 1304 |
_progress(2, "Partitioning PDF (OCR + layout detection)…")
|
| 1305 |
elements = partition_document(pdf_path)
|
|
@@ -1332,8 +1323,9 @@ def run_ingestion(
|
|
| 1332 |
doc_tree = _build_document_tree(elements)
|
| 1333 |
|
| 1334 |
sb = _build_service_supabase_client()
|
| 1335 |
-
sb.table("document_trees").
|
| 1336 |
-
{"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree}
|
|
|
|
| 1337 |
).execute()
|
| 1338 |
log.info("✅ PageIndex tree saved to Supabase.")
|
| 1339 |
except Exception as e:
|
|
@@ -1347,12 +1339,12 @@ def run_ingestion(
|
|
| 1347 |
else:
|
| 1348 |
pdf_path_for_naming = pdf_path
|
| 1349 |
docs, ids = process_chunks(
|
| 1350 |
-
chunks, elements, pdf_path_for_naming, file_hash, graph_data, pdf_images
|
| 1351 |
)
|
| 1352 |
|
| 1353 |
# --- NATIVE RAPTOR INDEXING ---
|
| 1354 |
_progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
|
| 1355 |
-
docs, ids = build_raptor_tree(docs, ids)
|
| 1356 |
|
| 1357 |
smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
|
| 1358 |
if export_json:
|
|
@@ -1374,7 +1366,7 @@ def run_ingestion(
|
|
| 1374 |
"document_type": graph_data.document_type,
|
| 1375 |
"chunk_count": len(docs),
|
| 1376 |
},
|
| 1377 |
-
on_conflict="file_hash",
|
| 1378 |
).execute()
|
| 1379 |
except Exception as e:
|
| 1380 |
log.error("Failed to register file: %s", e)
|
|
@@ -1936,10 +1928,12 @@ def retrieve_chunks(
|
|
| 1936 |
max_per_source=max_per_source,
|
| 1937 |
)
|
| 1938 |
|
| 1939 |
-
docs = [
|
| 1940 |
-
|
| 1941 |
-
|
| 1942 |
-
|
|
|
|
|
|
|
| 1943 |
log.info(
|
| 1944 |
"Dropped %d low-relevance/duplicate chunks.",
|
| 1945 |
len(all_candidates) - len(docs),
|
|
@@ -2269,12 +2263,6 @@ def generate_answer(
|
|
| 2269 |
log.error("Answer generation failed: %s", exc)
|
| 2270 |
return f"Failed to generate answer: {exc}", []
|
| 2271 |
|
| 2272 |
-
|
| 2273 |
-
# ── ADD THIS FUNCTION to pipeline.py right after generate_answer() ──────────
|
| 2274 |
-
# Also add these imports at the top of pipeline.py if not already there:
|
| 2275 |
-
# from typing import AsyncGenerator
|
| 2276 |
-
|
| 2277 |
-
|
| 2278 |
async def generate_answer_stream(
|
| 2279 |
chunks: List[Document],
|
| 2280 |
query: str,
|
|
@@ -2568,13 +2556,12 @@ def _save_to_memory(
|
|
| 2568 |
{
|
| 2569 |
"query_embedding": query_vector,
|
| 2570 |
"match_session_id": session_id,
|
| 2571 |
-
"match_threshold": 0.98, # 98% similarity = practically identical
|
| 2572 |
"match_count": 1,
|
| 2573 |
},
|
| 2574 |
).execute()
|
| 2575 |
|
| 2576 |
# If we found a nearly identical query in this session, skip saving!
|
| 2577 |
-
if dup_check.data:
|
| 2578 |
log.info(
|
| 2579 |
"🧠 Memory Bouncer: Duplicate query detected in session %s. Skipping save.",
|
| 2580 |
session_id[:8],
|
|
@@ -2713,7 +2700,9 @@ def _should_use_tree_path(query: str) -> bool:
|
|
| 2713 |
return False
|
| 2714 |
|
| 2715 |
|
| 2716 |
-
def tree_search(
|
|
|
|
|
|
|
| 2717 |
"""
|
| 2718 |
Navigates the structural JSON trees in Supabase to answer highly specific
|
| 2719 |
'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
|
|
@@ -2761,6 +2750,21 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
|
|
| 2761 |
if not res.data:
|
| 2762 |
return []
|
| 2763 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2764 |
matched_chunks = []
|
| 2765 |
|
| 2766 |
# 3. Recursive Tree Traversal
|
|
@@ -2789,6 +2793,8 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
|
|
| 2789 |
|
| 2790 |
# 4. Execute traversal across all trees
|
| 2791 |
for tree_row in res.data:
|
|
|
|
|
|
|
| 2792 |
_traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
|
| 2793 |
|
| 2794 |
log.info(
|
|
@@ -2803,6 +2809,50 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
|
|
| 2803 |
return []
|
| 2804 |
|
| 2805 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2806 |
def run_query(
|
| 2807 |
query: str,
|
| 2808 |
k: int = 3,
|
|
@@ -2815,35 +2865,16 @@ def run_query(
|
|
| 2815 |
) -> Tuple[str, List[str]]:
|
| 2816 |
|
| 2817 |
# 1. Document Retrieval (Routed)
|
| 2818 |
-
|
| 2819 |
-
|
| 2820 |
-
|
| 2821 |
-
|
| 2822 |
-
|
| 2823 |
-
|
| 2824 |
-
|
| 2825 |
-
|
| 2826 |
-
|
| 2827 |
-
|
| 2828 |
-
query,
|
| 2829 |
-
k=k,
|
| 2830 |
-
source_file=source_file,
|
| 2831 |
-
category=category,
|
| 2832 |
-
alpha=alpha,
|
| 2833 |
-
session_id=session_id,
|
| 2834 |
-
access_token=access_token,
|
| 2835 |
-
)
|
| 2836 |
-
else:
|
| 2837 |
-
log.info("🌊 Semantic path triggered: Query routed to vector search.")
|
| 2838 |
-
chunks = retrieve_chunks(
|
| 2839 |
-
query,
|
| 2840 |
-
k=k,
|
| 2841 |
-
source_file=source_file,
|
| 2842 |
-
category=category,
|
| 2843 |
-
alpha=alpha,
|
| 2844 |
-
session_id=session_id,
|
| 2845 |
-
access_token=access_token,
|
| 2846 |
-
)
|
| 2847 |
|
| 2848 |
# 2. Retrieve Episodic Memory (Semantic Search)
|
| 2849 |
past_memories = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Core ingestion, retrieval, generation, memory, and tree-search pipeline.
|
| 3 |
+
|
| 4 |
+
This module backs the FastAPI routes and Celery ingestion worker for Morpheus.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
|
|
|
| 614 |
file_path: str,
|
| 615 |
file_hash: str,
|
| 616 |
graph_data: DocumentGraphMetadata,
|
| 617 |
+
user_id: str,
|
| 618 |
pdf_images=None,
|
| 619 |
) -> tuple[List[Document], List[str]]:
|
| 620 |
"""Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
|
|
|
|
| 702 |
)
|
| 703 |
docs.append(doc)
|
| 704 |
|
| 705 |
+
unique_string = f"{user_id}:{file_hash}:chunk:{i}"
|
| 706 |
chunk_id = str(uuid.uuid5(NAMESPACE, unique_string))
|
| 707 |
ids.append(chunk_id)
|
| 708 |
|
|
|
|
| 711 |
|
| 712 |
|
| 713 |
def build_raptor_tree(
|
| 714 |
+
leaf_docs: List[Document], leaf_ids: List[str], user_id: str
|
| 715 |
) -> tuple[List[Document], List[str]]:
|
| 716 |
"""
|
| 717 |
RAPTOR implementation: recursively clusters documents and generates
|
|
|
|
| 785 |
# Generate deterministic ID for the parent
|
| 786 |
import hashlib
|
| 787 |
|
| 788 |
+
child_node_ids = [
|
| 789 |
+
str(c.metadata.get("node_id", "")) for c in cluster if c.metadata.get("node_id")
|
| 790 |
+
]
|
| 791 |
parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
|
| 792 |
+
base_meta = cluster[0].metadata
|
| 793 |
parent_id = str(
|
| 794 |
+
uuid.uuid5(
|
| 795 |
+
uuid.NAMESPACE_DNS,
|
| 796 |
+
(
|
| 797 |
+
f"{user_id}:raptor:{base_meta.get('file_hash', '')}:"
|
| 798 |
+
f"{current_level}:{'|'.join(child_node_ids)}:{parent_hash}"
|
| 799 |
+
),
|
| 800 |
+
)
|
| 801 |
)
|
| 802 |
|
| 803 |
# Create the parent document
|
| 804 |
# Inherit metadata from the first child (source array, file hash, document type)
|
|
|
|
|
|
|
| 805 |
# Gather all unique page numbers from children
|
| 806 |
all_pages = set()
|
| 807 |
for c in cluster:
|
|
|
|
| 917 |
log.warning("Could not register in ingested_files: %s", exc)
|
| 918 |
|
| 919 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
def _apply_category_override(
|
| 921 |
file_hash: str, new_category: str, access_token: str = None
|
| 922 |
) -> None:
|
|
|
|
| 1251 |
if already_exists and not force:
|
| 1252 |
log.info("SKIPPING — already ingested.")
|
| 1253 |
return "already_ingested"
|
| 1254 |
+
|
| 1255 |
+
# NEW: Check if user has previously overridden the category for this file.
|
| 1256 |
+
# If so, skip the classifier and use their choice directly.
|
| 1257 |
+
forced_category = None
|
| 1258 |
+
if already_exists or force:
|
| 1259 |
+
try:
|
| 1260 |
+
_sb = _build_supabase_client(access_token)
|
| 1261 |
+
_existing = (
|
| 1262 |
+
_sb.table("ingested_files")
|
| 1263 |
+
.select("document_type, user_overridden")
|
| 1264 |
+
.eq("file_hash", file_hash)
|
| 1265 |
+
.limit(1)
|
| 1266 |
+
.execute()
|
| 1267 |
+
)
|
| 1268 |
+
if _existing.data and _existing.data[0].get("user_overridden"):
|
| 1269 |
+
forced_category = _existing.data[0]["document_type"]
|
| 1270 |
+
log.info(
|
| 1271 |
+
"User override active — forcing category '%s', skipping classifier.",
|
| 1272 |
+
forced_category,
|
| 1273 |
+
)
|
| 1274 |
+
except Exception as _exc:
|
| 1275 |
+
log.warning("Could not check user override: %s", _exc)
|
| 1276 |
+
|
| 1277 |
# 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
|
| 1278 |
# RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
|
| 1279 |
if already_exists or force:
|
|
|
|
| 1284 |
"user_id", user_id
|
| 1285 |
).contains("metadata", {"file_hash": file_hash}).execute()
|
| 1286 |
# 2. Clear the registry
|
| 1287 |
+
supabase.table("ingested_files").delete().eq("user_id", user_id).eq(
|
| 1288 |
+
"file_hash", file_hash
|
| 1289 |
+
).execute()
|
| 1290 |
# 3. Clear the tree if it exists
|
| 1291 |
+
supabase.table("document_trees").delete().eq("user_id", user_id).eq(
|
| 1292 |
+
"file_hash", file_hash
|
| 1293 |
+
).execute()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1294 |
|
| 1295 |
_progress(2, "Partitioning PDF (OCR + layout detection)…")
|
| 1296 |
elements = partition_document(pdf_path)
|
|
|
|
| 1323 |
doc_tree = _build_document_tree(elements)
|
| 1324 |
|
| 1325 |
sb = _build_service_supabase_client()
|
| 1326 |
+
sb.table("document_trees").upsert(
|
| 1327 |
+
{"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
|
| 1328 |
+
on_conflict="user_id,file_hash",
|
| 1329 |
).execute()
|
| 1330 |
log.info("✅ PageIndex tree saved to Supabase.")
|
| 1331 |
except Exception as e:
|
|
|
|
| 1339 |
else:
|
| 1340 |
pdf_path_for_naming = pdf_path
|
| 1341 |
docs, ids = process_chunks(
|
| 1342 |
+
chunks, elements, pdf_path_for_naming, file_hash, graph_data, user_id, pdf_images
|
| 1343 |
)
|
| 1344 |
|
| 1345 |
# --- NATIVE RAPTOR INDEXING ---
|
| 1346 |
_progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
|
| 1347 |
+
docs, ids = build_raptor_tree(docs, ids, user_id)
|
| 1348 |
|
| 1349 |
smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
|
| 1350 |
if export_json:
|
|
|
|
| 1366 |
"document_type": graph_data.document_type,
|
| 1367 |
"chunk_count": len(docs),
|
| 1368 |
},
|
| 1369 |
+
on_conflict="user_id,file_hash",
|
| 1370 |
).execute()
|
| 1371 |
except Exception as e:
|
| 1372 |
log.error("Failed to register file: %s", e)
|
|
|
|
| 1928 |
max_per_source=max_per_source,
|
| 1929 |
)
|
| 1930 |
|
| 1931 |
+
docs = []
|
| 1932 |
+
for c in diverse:
|
| 1933 |
+
meta = dict(c.get("metadata", {}) or {})
|
| 1934 |
+
if c.get("id") is not None:
|
| 1935 |
+
meta["id"] = str(c["id"])
|
| 1936 |
+
docs.append(Document(page_content=c["content"], metadata=meta))
|
| 1937 |
log.info(
|
| 1938 |
"Dropped %d low-relevance/duplicate chunks.",
|
| 1939 |
len(all_candidates) - len(docs),
|
|
|
|
| 2263 |
log.error("Answer generation failed: %s", exc)
|
| 2264 |
return f"Failed to generate answer: {exc}", []
|
| 2265 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2266 |
async def generate_answer_stream(
|
| 2267 |
chunks: List[Document],
|
| 2268 |
query: str,
|
|
|
|
| 2556 |
{
|
| 2557 |
"query_embedding": query_vector,
|
| 2558 |
"match_session_id": session_id,
|
|
|
|
| 2559 |
"match_count": 1,
|
| 2560 |
},
|
| 2561 |
).execute()
|
| 2562 |
|
| 2563 |
# If we found a nearly identical query in this session, skip saving!
|
| 2564 |
+
if dup_check.data and float(dup_check.data[0].get("similarity", 0.0)) >= 0.98:
|
| 2565 |
log.info(
|
| 2566 |
"🧠 Memory Bouncer: Duplicate query detected in session %s. Skipping save.",
|
| 2567 |
session_id[:8],
|
|
|
|
| 2700 |
return False
|
| 2701 |
|
| 2702 |
|
| 2703 |
+
def tree_search(
|
| 2704 |
+
query: str, access_token: str = None, category: str = None
|
| 2705 |
+
) -> List[Document]:
|
| 2706 |
"""
|
| 2707 |
Navigates the structural JSON trees in Supabase to answer highly specific
|
| 2708 |
'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
|
|
|
|
| 2750 |
if not res.data:
|
| 2751 |
return []
|
| 2752 |
|
| 2753 |
+
allowed_hashes = None
|
| 2754 |
+
if category and category != "All":
|
| 2755 |
+
try:
|
| 2756 |
+
allowed_res = (
|
| 2757 |
+
sb.table("ingested_files")
|
| 2758 |
+
.select("file_hash")
|
| 2759 |
+
.eq("document_type", category)
|
| 2760 |
+
.execute()
|
| 2761 |
+
)
|
| 2762 |
+
allowed_hashes = {
|
| 2763 |
+
row.get("file_hash") for row in (allowed_res.data or []) if row.get("file_hash")
|
| 2764 |
+
}
|
| 2765 |
+
except Exception as exc:
|
| 2766 |
+
log.warning("Could not apply tree-search category filter: %s", exc)
|
| 2767 |
+
|
| 2768 |
matched_chunks = []
|
| 2769 |
|
| 2770 |
# 3. Recursive Tree Traversal
|
|
|
|
| 2793 |
|
| 2794 |
# 4. Execute traversal across all trees
|
| 2795 |
for tree_row in res.data:
|
| 2796 |
+
if allowed_hashes is not None and tree_row.get("file_hash") not in allowed_hashes:
|
| 2797 |
+
continue
|
| 2798 |
_traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
|
| 2799 |
|
| 2800 |
log.info(
|
|
|
|
| 2809 |
return []
|
| 2810 |
|
| 2811 |
|
| 2812 |
+
def retrieve_chunks_routed(
|
| 2813 |
+
query: str,
|
| 2814 |
+
k: int = 3,
|
| 2815 |
+
source_file: str = None,
|
| 2816 |
+
category: str = None,
|
| 2817 |
+
alpha: float = 0.5,
|
| 2818 |
+
session_id: str = "default_session",
|
| 2819 |
+
access_token: str = None,
|
| 2820 |
+
user_id: str = None,
|
| 2821 |
+
original_query: str = None,
|
| 2822 |
+
eval_mode: bool = False,
|
| 2823 |
+
) -> List[Document]:
|
| 2824 |
+
"""
|
| 2825 |
+
Live request-path retrieval entrypoint.
|
| 2826 |
+
Routes structural queries to the tree index first, then falls back to vector retrieval.
|
| 2827 |
+
"""
|
| 2828 |
+
routing_query = (original_query or query or "").strip()
|
| 2829 |
+
if routing_query and _should_use_tree_path(routing_query):
|
| 2830 |
+
log.info("🎯 PageIndex triggered: query routed to structural tree path.")
|
| 2831 |
+
tree_chunks = tree_search(
|
| 2832 |
+
routing_query, access_token=access_token, category=category
|
| 2833 |
+
)
|
| 2834 |
+
if tree_chunks:
|
| 2835 |
+
if session_id:
|
| 2836 |
+
session_key = _session_cache_key(session_id, user_id=user_id)
|
| 2837 |
+
with _last_chunks_lock:
|
| 2838 |
+
_last_chunks[session_key] = tree_chunks
|
| 2839 |
+
return tree_chunks
|
| 2840 |
+
log.info("Tree search yielded 0 results. Falling back to vector search.")
|
| 2841 |
+
|
| 2842 |
+
return retrieve_chunks(
|
| 2843 |
+
query,
|
| 2844 |
+
k=k,
|
| 2845 |
+
source_file=source_file,
|
| 2846 |
+
category=category,
|
| 2847 |
+
alpha=alpha,
|
| 2848 |
+
session_id=session_id,
|
| 2849 |
+
access_token=access_token,
|
| 2850 |
+
user_id=user_id,
|
| 2851 |
+
original_query=original_query,
|
| 2852 |
+
eval_mode=eval_mode,
|
| 2853 |
+
)
|
| 2854 |
+
|
| 2855 |
+
|
| 2856 |
def run_query(
|
| 2857 |
query: str,
|
| 2858 |
k: int = 3,
|
|
|
|
| 2865 |
) -> Tuple[str, List[str]]:
|
| 2866 |
|
| 2867 |
# 1. Document Retrieval (Routed)
|
| 2868 |
+
chunks = retrieve_chunks_routed(
|
| 2869 |
+
query,
|
| 2870 |
+
k=k,
|
| 2871 |
+
source_file=source_file,
|
| 2872 |
+
category=category,
|
| 2873 |
+
alpha=alpha,
|
| 2874 |
+
session_id=session_id,
|
| 2875 |
+
access_token=access_token,
|
| 2876 |
+
original_query=query,
|
| 2877 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2878 |
|
| 2879 |
# 2. Retrieve Episodic Memory (Semantic Search)
|
| 2880 |
past_memories = []
|
frontend/js/chat.js
CHANGED
|
@@ -203,6 +203,16 @@ async function sendChat() {
|
|
| 203 |
el.scrollTop = el.scrollHeight;
|
| 204 |
},
|
| 205 |
onError(errMsg) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
|
| 207 |
},
|
| 208 |
});
|
|
@@ -306,4 +316,4 @@ function handleChatKey(e) {
|
|
| 306 |
function autoResize(el) {
|
| 307 |
el.style.height = 'auto';
|
| 308 |
el.style.height = Math.min(el.scrollHeight, 120) + 'px';
|
| 309 |
-
}
|
|
|
|
| 203 |
el.scrollTop = el.scrollHeight;
|
| 204 |
},
|
| 205 |
onError(errMsg) {
|
| 206 |
+
if (fullText.trim()) {
|
| 207 |
+
if (!assistantDiv.querySelector('.stream-error-note')) {
|
| 208 |
+
const note = document.createElement('div');
|
| 209 |
+
note.className = 'stream-error-note';
|
| 210 |
+
note.innerHTML = `<p class="msg-p" style="color:var(--red);margin-top:10px">${esc(errMsg)}</p>`;
|
| 211 |
+
assistantDiv.appendChild(note);
|
| 212 |
+
}
|
| 213 |
+
toast(errMsg, 'error');
|
| 214 |
+
return;
|
| 215 |
+
}
|
| 216 |
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
|
| 217 |
},
|
| 218 |
});
|
|
|
|
| 316 |
function autoResize(el) {
|
| 317 |
el.style.height = 'auto';
|
| 318 |
el.style.height = Math.min(el.scrollHeight, 120) + 'px';
|
| 319 |
+
}
|
supabase/migrations/0004_hierarchical_nodes.sql
CHANGED
|
@@ -22,7 +22,7 @@ CREATE OR REPLACE FUNCTION public.insert_document_chunk(
|
|
| 22 |
p_id uuid,
|
| 23 |
p_content text,
|
| 24 |
p_metadata jsonb,
|
| 25 |
-
p_embedding
|
| 26 |
p_user_id uuid,
|
| 27 |
p_node_type text DEFAULT 'leaf',
|
| 28 |
p_parent_node_id uuid DEFAULT NULL,
|
|
@@ -58,7 +58,7 @@ $$;
|
|
| 58 |
|
| 59 |
CREATE OR REPLACE FUNCTION public.hybrid_search(
|
| 60 |
query_text text,
|
| 61 |
-
query_embedding
|
| 62 |
match_count integer DEFAULT 10,
|
| 63 |
filter jsonb DEFAULT '{}'::jsonb,
|
| 64 |
semantic_weight double precision DEFAULT 0.7,
|
|
@@ -72,10 +72,10 @@ begin
|
|
| 72 |
semantic as (
|
| 73 |
select
|
| 74 |
d.id, d.content, d.metadata,
|
| 75 |
-
(1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as score
|
| 76 |
from documents d
|
| 77 |
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 78 |
-
order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
|
| 79 |
limit match_count * 3
|
| 80 |
),
|
| 81 |
keyword as (
|
|
|
|
| 22 |
p_id uuid,
|
| 23 |
p_content text,
|
| 24 |
p_metadata jsonb,
|
| 25 |
+
p_embedding extensions.vector,
|
| 26 |
p_user_id uuid,
|
| 27 |
p_node_type text DEFAULT 'leaf',
|
| 28 |
p_parent_node_id uuid DEFAULT NULL,
|
|
|
|
| 58 |
|
| 59 |
CREATE OR REPLACE FUNCTION public.hybrid_search(
|
| 60 |
query_text text,
|
| 61 |
+
query_embedding extensions.vector,
|
| 62 |
match_count integer DEFAULT 10,
|
| 63 |
filter jsonb DEFAULT '{}'::jsonb,
|
| 64 |
semantic_weight double precision DEFAULT 0.7,
|
|
|
|
| 72 |
semantic as (
|
| 73 |
select
|
| 74 |
d.id, d.content, d.metadata,
|
| 75 |
+
(1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as score
|
| 76 |
from documents d
|
| 77 |
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 78 |
+
order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
|
| 79 |
limit match_count * 3
|
| 80 |
),
|
| 81 |
keyword as (
|
supabase/migrations/0006_multi_tenant_file_uniqueness.sql
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Migration 0006: Tenant-safe duplicate file support
|
| 2 |
+
--
|
| 3 |
+
-- The application now derives document node IDs from (user_id + file_hash + chunk),
|
| 4 |
+
-- so duplicate PDFs across tenants no longer collide in public.documents.
|
| 5 |
+
-- This migration fixes the remaining per-file tables that still used global
|
| 6 |
+
-- file_hash uniqueness.
|
| 7 |
+
|
| 8 |
+
-- Allow the same file_hash to exist for multiple tenants in ingested_files.
|
| 9 |
+
ALTER TABLE public.ingested_files
|
| 10 |
+
DROP CONSTRAINT IF EXISTS ingested_files_file_hash_key;
|
| 11 |
+
|
| 12 |
+
CREATE UNIQUE INDEX IF NOT EXISTS ingested_files_user_file_hash_uidx
|
| 13 |
+
ON public.ingested_files (user_id, file_hash);
|
| 14 |
+
|
| 15 |
+
-- Allow the same file_hash to exist for multiple tenants in document_trees.
|
| 16 |
+
ALTER TABLE public.document_trees
|
| 17 |
+
DROP CONSTRAINT IF EXISTS document_trees_pkey;
|
| 18 |
+
|
| 19 |
+
CREATE UNIQUE INDEX IF NOT EXISTS document_trees_user_file_hash_uidx
|
| 20 |
+
ON public.document_trees (user_id, file_hash);
|
| 21 |
+
|
| 22 |
+
CREATE INDEX IF NOT EXISTS document_trees_user_id_idx
|
| 23 |
+
ON public.document_trees (user_id);
|
supabase/schema_backup.sql
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
-
--
|
| 2 |
-
-- PostgreSQL database dump
|
| 3 |
-
--
|
| 4 |
-
|
| 5 |
-
\restrict D5DLkgreJkSzmm3K8XWpdjuj7WrATcgxPzakxedCgOsd9eMGt0ScgsbXbeWVrAx
|
| 6 |
|
| 7 |
-- Dumped from database version 17.6
|
| 8 |
-- Dumped by pg_dump version 18.3
|
|
@@ -23,7 +21,7 @@ SET row_security = off;
|
|
| 23 |
-- Name: public; Type: SCHEMA; Schema: -; Owner: -
|
| 24 |
--
|
| 25 |
|
| 26 |
-
CREATE SCHEMA public;
|
| 27 |
|
| 28 |
|
| 29 |
--
|
|
@@ -77,10 +75,10 @@ $$;
|
|
| 77 |
|
| 78 |
|
| 79 |
--
|
| 80 |
-
-- Name: hybrid_search(text,
|
| 81 |
--
|
| 82 |
|
| 83 |
-
CREATE FUNCTION public.hybrid_search(query_text text, query_embedding
|
| 84 |
LANGUAGE plpgsql
|
| 85 |
AS $$
|
| 86 |
begin
|
|
@@ -89,10 +87,10 @@ begin
|
|
| 89 |
semantic as (
|
| 90 |
select
|
| 91 |
d.id, d.content, d.metadata,
|
| 92 |
-
(1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as score
|
| 93 |
from documents d
|
| 94 |
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 95 |
-
order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
|
| 96 |
limit match_count * 3
|
| 97 |
),
|
| 98 |
keyword as (
|
|
@@ -137,10 +135,10 @@ $$;
|
|
| 137 |
|
| 138 |
|
| 139 |
--
|
| 140 |
-
-- Name: insert_document_chunk(uuid, text, jsonb,
|
| 141 |
--
|
| 142 |
|
| 143 |
-
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding
|
| 144 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 145 |
AS $$
|
| 146 |
BEGIN
|
|
@@ -155,10 +153,10 @@ $$;
|
|
| 155 |
|
| 156 |
|
| 157 |
--
|
| 158 |
-
-- Name: insert_document_chunk(uuid, text, jsonb,
|
| 159 |
--
|
| 160 |
|
| 161 |
-
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding
|
| 162 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 163 |
AS $$
|
| 164 |
BEGIN
|
|
@@ -182,10 +180,10 @@ $$;
|
|
| 182 |
|
| 183 |
|
| 184 |
--
|
| 185 |
-
-- Name: match_documents(
|
| 186 |
--
|
| 187 |
|
| 188 |
-
CREATE FUNCTION public.match_documents(query_embedding
|
| 189 |
LANGUAGE plpgsql
|
| 190 |
AS $$
|
| 191 |
begin
|
|
@@ -194,20 +192,20 @@ begin
|
|
| 194 |
d.id,
|
| 195 |
d.content,
|
| 196 |
d.metadata,
|
| 197 |
-
|
| 198 |
from documents d
|
| 199 |
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 200 |
-
|
| 201 |
limit match_count;
|
| 202 |
end;
|
| 203 |
$$;
|
| 204 |
|
| 205 |
|
| 206 |
--
|
| 207 |
-
-- Name: match_memory(
|
| 208 |
--
|
| 209 |
|
| 210 |
-
CREATE FUNCTION public.match_memory(query_embedding
|
| 211 |
LANGUAGE plpgsql
|
| 212 |
AS $$
|
| 213 |
BEGIN
|
|
@@ -266,7 +264,7 @@ CREATE TABLE public.chat_memory (
|
|
| 266 |
session_id text NOT NULL,
|
| 267 |
role text NOT NULL,
|
| 268 |
content text NOT NULL,
|
| 269 |
-
embedding
|
| 270 |
created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
|
| 271 |
user_id uuid DEFAULT auth.uid()
|
| 272 |
);
|
|
@@ -292,7 +290,7 @@ CREATE TABLE public.documents (
|
|
| 292 |
id uuid DEFAULT gen_random_uuid() NOT NULL,
|
| 293 |
content text,
|
| 294 |
metadata jsonb,
|
| 295 |
-
embedding
|
| 296 |
user_id uuid DEFAULT auth.uid(),
|
| 297 |
node_type text DEFAULT 'leaf'::text,
|
| 298 |
parent_node_id uuid,
|
|
@@ -512,11 +510,11 @@ ALTER TABLE ONLY public.chat_memory
|
|
| 512 |
|
| 513 |
|
| 514 |
--
|
| 515 |
-
-- Name: document_trees
|
| 516 |
-
--
|
| 517 |
-
|
| 518 |
-
ALTER TABLE ONLY public.document_trees
|
| 519 |
-
ADD CONSTRAINT
|
| 520 |
|
| 521 |
|
| 522 |
--
|
|
@@ -536,11 +534,11 @@ ALTER TABLE ONLY public.evaluation_logs
|
|
| 536 |
|
| 537 |
|
| 538 |
--
|
| 539 |
-
-- Name: ingested_files
|
| 540 |
-
--
|
| 541 |
-
|
| 542 |
-
ALTER TABLE ONLY public.ingested_files
|
| 543 |
-
ADD CONSTRAINT
|
| 544 |
|
| 545 |
|
| 546 |
--
|
|
@@ -621,7 +619,7 @@ CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvecto
|
|
| 621 |
-- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
|
| 622 |
--
|
| 623 |
|
| 624 |
-
CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::
|
| 625 |
|
| 626 |
|
| 627 |
--
|
|
@@ -939,9 +937,7 @@ ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
|
|
| 939 |
CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
|
| 940 |
|
| 941 |
|
| 942 |
-
--
|
| 943 |
-
-- PostgreSQL database dump complete
|
| 944 |
-
--
|
| 945 |
-
|
| 946 |
-
\unrestrict D5DLkgreJkSzmm3K8XWpdjuj7WrATcgxPzakxedCgOsd9eMGt0ScgsbXbeWVrAx
|
| 947 |
|
|
|
|
| 1 |
+
--
|
| 2 |
+
-- PostgreSQL database dump
|
| 3 |
+
--
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-- Dumped from database version 17.6
|
| 6 |
-- Dumped by pg_dump version 18.3
|
|
|
|
| 21 |
-- Name: public; Type: SCHEMA; Schema: -; Owner: -
|
| 22 |
--
|
| 23 |
|
| 24 |
+
CREATE SCHEMA IF NOT EXISTS public;
|
| 25 |
|
| 26 |
|
| 27 |
--
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
--
|
| 78 |
+
-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
|
| 79 |
--
|
| 80 |
|
| 81 |
+
CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
|
| 82 |
LANGUAGE plpgsql
|
| 83 |
AS $$
|
| 84 |
begin
|
|
|
|
| 87 |
semantic as (
|
| 88 |
select
|
| 89 |
d.id, d.content, d.metadata,
|
| 90 |
+
(1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as score
|
| 91 |
from documents d
|
| 92 |
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 93 |
+
order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
|
| 94 |
limit match_count * 3
|
| 95 |
),
|
| 96 |
keyword as (
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
--
|
| 138 |
+
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
|
| 139 |
--
|
| 140 |
|
| 141 |
+
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
|
| 142 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 143 |
AS $$
|
| 144 |
BEGIN
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
--
|
| 156 |
+
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
|
| 157 |
--
|
| 158 |
|
| 159 |
+
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
|
| 160 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 161 |
AS $$
|
| 162 |
BEGIN
|
|
|
|
| 180 |
|
| 181 |
|
| 182 |
--
|
| 183 |
+
-- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
|
| 184 |
--
|
| 185 |
|
| 186 |
+
CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
|
| 187 |
LANGUAGE plpgsql
|
| 188 |
AS $$
|
| 189 |
begin
|
|
|
|
| 192 |
d.id,
|
| 193 |
d.content,
|
| 194 |
d.metadata,
|
| 195 |
+
(1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as similarity
|
| 196 |
from documents d
|
| 197 |
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 198 |
+
order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
|
| 199 |
limit match_count;
|
| 200 |
end;
|
| 201 |
$$;
|
| 202 |
|
| 203 |
|
| 204 |
--
|
| 205 |
+
-- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
|
| 206 |
--
|
| 207 |
|
| 208 |
+
CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
|
| 209 |
LANGUAGE plpgsql
|
| 210 |
AS $$
|
| 211 |
BEGIN
|
|
|
|
| 264 |
session_id text NOT NULL,
|
| 265 |
role text NOT NULL,
|
| 266 |
content text NOT NULL,
|
| 267 |
+
embedding extensions.vector(2048),
|
| 268 |
created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
|
| 269 |
user_id uuid DEFAULT auth.uid()
|
| 270 |
);
|
|
|
|
| 290 |
id uuid DEFAULT gen_random_uuid() NOT NULL,
|
| 291 |
content text,
|
| 292 |
metadata jsonb,
|
| 293 |
+
embedding extensions.vector(2048),
|
| 294 |
user_id uuid DEFAULT auth.uid(),
|
| 295 |
node_type text DEFAULT 'leaf'::text,
|
| 296 |
parent_node_id uuid,
|
|
|
|
| 510 |
|
| 511 |
|
| 512 |
--
|
| 513 |
+
-- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
|
| 514 |
+
--
|
| 515 |
+
|
| 516 |
+
ALTER TABLE ONLY public.document_trees
|
| 517 |
+
ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
|
| 518 |
|
| 519 |
|
| 520 |
--
|
|
|
|
| 534 |
|
| 535 |
|
| 536 |
--
|
| 537 |
+
-- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
|
| 538 |
+
--
|
| 539 |
+
|
| 540 |
+
ALTER TABLE ONLY public.ingested_files
|
| 541 |
+
ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
|
| 542 |
|
| 543 |
|
| 544 |
--
|
|
|
|
| 619 |
-- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
|
| 620 |
--
|
| 621 |
|
| 622 |
+
CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
|
| 623 |
|
| 624 |
|
| 625 |
--
|
|
|
|
| 937 |
CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
|
| 938 |
|
| 939 |
|
| 940 |
+
--
|
| 941 |
+
-- PostgreSQL database dump complete
|
| 942 |
+
--
|
|
|
|
|
|
|
| 943 |
|