nothex commited on
Commit
ff7e045
·
1 Parent(s): 9f2ae84

fix: harden supabase setup and clean stale retrieval/ingestion code

Browse files
.env.example CHANGED
@@ -10,6 +10,7 @@ GEMINI_API_KEY=****
10
  SUPABASE_URL=https://example.supabase.co
11
  SUPABASE_SERVICE_KEY=****
12
  SUPABASE_ANON_KEY=****
 
13
 
14
  # Cohere
15
  COHERE_API_KEY=****
@@ -23,7 +24,7 @@ REDIS_URL=redis://localhost:6379/0
23
  MASTER_ADMIN_KEY=****
24
 
25
  # Local dev: ALLOWED_ORIGINS=*
26
- # Production: ALLOWED_ORIGINS=https://your-nexus.vercel.app
27
  ALLOWED_ORIGINS=*
28
 
29
  # Docs enabled in dev, disabled in prod
 
10
  SUPABASE_URL=https://example.supabase.co
11
  SUPABASE_SERVICE_KEY=****
12
  SUPABASE_ANON_KEY=****
13
+ SUPABASE_JWT_SECRET=****
14
 
15
  # Cohere
16
  COHERE_API_KEY=****
 
24
  MASTER_ADMIN_KEY=****
25
 
26
  # Local dev: ALLOWED_ORIGINS=*
27
+ # Production: ALLOWED_ORIGINS=https://your-morpheus.vercel.app
28
  ALLOWED_ORIGINS=*
29
 
30
  # Docs enabled in dev, disabled in prod
.github/workflows/smoke.yml CHANGED
@@ -25,13 +25,13 @@ jobs:
25
 
26
  - name: Build assets (light mode)
27
  env:
28
- NEXUS_DISABLE_INTENT_BOOTSTRAP: "true"
29
- NEXUS_BUILD_ASSETS_MODE: "light"
30
  run: |
31
  python -m backend.core.build_ml_assets
32
 
33
  - name: Intent classifier smoke predict
34
  env:
35
- NEXUS_DISABLE_INTENT_BOOTSTRAP: "true"
36
  run: |
37
  python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"
 
25
 
26
  - name: Build assets (light mode)
27
  env:
28
+ MORPHEUS_DISABLE_INTENT_BOOTSTRAP: "true"
29
+ MORPHEUS_BUILD_ASSETS_MODE: "light"
30
  run: |
31
  python -m backend.core.build_ml_assets
32
 
33
  - name: Intent classifier smoke predict
34
  env:
35
+ MORPHEUS_DISABLE_INTENT_BOOTSTRAP: "true"
36
  run: |
37
  python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"
README.md CHANGED
@@ -99,9 +99,16 @@ pip install -r requirements.txt
99
  ### 3. Set up Supabase
100
 
101
  1. Create a project at [supabase.com](https://supabase.com)
102
- 2. Go to the SQL editor and run `supabase/schema_backup.sql` in full
103
- 3. Run `supabase/rls/multi_tenancy_rls.sql` to enable row-level security
104
- 4. Enable the `pgvector` extension (Database → Extensions)
 
 
 
 
 
 
 
105
 
106
  ### 4. Configure environment
107
 
@@ -235,4 +242,4 @@ The architecture is designed for independent layer scaling:
235
  - **Query concurrency**: Increase Gunicorn workers in `render.yaml` from `-w 1` to `-w 2` or more when moving off the free tier.
236
  - **Embedding cost**: Swap `EMBEDDING_MODEL` in `config.py` — the `FallbackEmbeddings` wrapper handles the rest.
237
  - **LLM cost**: All model lists live in `config.py`. Replace free OpenRouter models with paid ones for higher rate limits.
238
- - **New document types**: The document classifier learns them automatically. Run warmup after your first example of each type.
 
99
  ### 3. Set up Supabase
100
 
101
  1. Create a project at [supabase.com](https://supabase.com)
102
+ 2. Enable the `vector`, `uuid-ossp`, and `pgcrypto` extensions first (Database → Extensions)
103
+ 3. Go to the SQL editor and run `supabase/schema_backup.sql` in full
104
+ 4. Create two Storage buckets:
105
+ - `rag-images` as a public bucket
106
+ - `rag-models` as a private bucket
107
+ 5. Optional but recommended: run `supabase/rls/rpc_security_rls_audit.sql`
108
+
109
+ For an existing older database, apply the files in `supabase/migrations/` and
110
+ the scripts in `supabase/rls/` as incremental upgrade helpers. For a brand-new
111
+ project, `schema_backup.sql` is already the full source of truth.
112
 
113
  ### 4. Configure environment
114
 
 
242
  - **Query concurrency**: Increase Gunicorn workers in `render.yaml` from `-w 1` to `-w 2` or more when moving off the free tier.
243
  - **Embedding cost**: Swap `EMBEDDING_MODEL` in `config.py` — the `FallbackEmbeddings` wrapper handles the rest.
244
  - **LLM cost**: All model lists live in `config.py`. Replace free OpenRouter models with paid ones for higher rate limits.
245
+ - **New document types**: The document classifier learns them automatically. Run warmup after your first example of each type.
backend/api/query.py CHANGED
@@ -5,7 +5,11 @@ import asyncio
5
  from fastapi import APIRouter, Header, Depends, Request
6
  from fastapi.responses import StreamingResponse
7
  from shared.types import QueryRequest, SourceChunk
8
- from backend.core.pipeline import retrieve_chunks, generate_answer_stream, analyse_intent
 
 
 
 
9
  from backend.core.auth_utils import require_auth_token
10
  from backend.main import limiter
11
 
@@ -13,6 +17,49 @@ log = logging.getLogger("morpheus.api.query")
13
  router = APIRouter()
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  @router.post("")
17
  @limiter.limit("60/hour")
18
  async def query(
@@ -60,7 +107,7 @@ async def query(
60
  loop = asyncio.get_event_loop()
61
  chunks = await loop.run_in_executor(
62
  None,
63
- lambda: retrieve_chunks(
64
  effective_query,
65
  k=req.k,
66
  category=category,
@@ -75,6 +122,7 @@ async def query(
75
 
76
  # ── Step 3: Stream answer tokens ──────────────────────────────────
77
  images = []
 
78
  # 🚀 Define the boolean once for readability
79
  is_eval = x_eval_mode == "true"
80
  async for event in generate_answer_stream(
@@ -90,27 +138,12 @@ async def query(
90
  yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
91
  elif event["type"] == "done":
92
  images = event.get("images", [])
 
93
 
94
  # ── Step 4: Emit sources + images ─────────────────────────────────
95
- sources = []
96
- for chunk in chunks:
97
- meta = chunk.metadata
98
- orig = meta.get("original_content", "{}")
99
- if isinstance(orig, str):
100
- try: orig = json.loads(orig) # noqa: E701
101
- except: orig = {} # noqa: E701, E722
102
- full_text = orig.get("raw_text") or chunk.page_content
103
- snippet_text = full_text if x_eval_mode == "true" else full_text[:200]
104
- sources.append(
105
- SourceChunk(
106
- source=meta.get("source", "Unknown"),
107
- score=meta.get("relevance_score"),
108
- chunk=meta.get("chunk_index"),
109
- snippet=snippet_text,
110
- doc_type=meta.get("document_type"),
111
- pages=meta.get("page_numbers"),
112
- ).dict()
113
- )
114
 
115
  yield "data: " + json.dumps({
116
  "type": "done",
@@ -144,4 +177,4 @@ async def query(
144
  "X-Accel-Buffering": "no",
145
  "Access-Control-Allow-Origin": "*",
146
  }
147
- )
 
5
  from fastapi import APIRouter, Header, Depends, Request
6
  from fastapi.responses import StreamingResponse
7
  from shared.types import QueryRequest, SourceChunk
8
+ from backend.core.pipeline import (
9
+ retrieve_chunks_routed,
10
+ generate_answer_stream,
11
+ analyse_intent,
12
+ )
13
  from backend.core.auth_utils import require_auth_token
14
  from backend.main import limiter
15
 
 
17
  router = APIRouter()
18
 
19
 
20
+ def _normalise_original_content(raw):
21
+ """Best-effort decode for metadata that may already be dict or JSON string."""
22
+ if isinstance(raw, dict):
23
+ return raw
24
+ if isinstance(raw, str):
25
+ candidate = raw
26
+ for _ in range(2):
27
+ try:
28
+ candidate = json.loads(candidate)
29
+ except Exception:
30
+ return {}
31
+ if isinstance(candidate, dict):
32
+ return candidate
33
+ if not isinstance(candidate, str):
34
+ break
35
+ return {}
36
+
37
+
38
+ def _build_sources_from_chunks(chunks, include_full_text: bool = False):
39
+ sources = []
40
+ for chunk in chunks:
41
+ try:
42
+ if getattr(chunk, "page_content", None) == "__CACHE_HIT__":
43
+ continue
44
+ meta = getattr(chunk, "metadata", {}) or {}
45
+ orig = _normalise_original_content(meta.get("original_content", {}))
46
+ full_text = orig.get("raw_text") or chunk.page_content
47
+ snippet_text = full_text if include_full_text else full_text[:200]
48
+ sources.append(
49
+ SourceChunk(
50
+ source=meta.get("source", "Unknown"),
51
+ score=meta.get("relevance_score"),
52
+ chunk=meta.get("chunk_index"),
53
+ snippet=snippet_text,
54
+ doc_type=meta.get("document_type"),
55
+ pages=meta.get("page_numbers"),
56
+ ).dict()
57
+ )
58
+ except Exception as exc:
59
+ log.warning("Skipping source serialization for chunk: %s", exc)
60
+ return sources
61
+
62
+
63
  @router.post("")
64
  @limiter.limit("60/hour")
65
  async def query(
 
107
  loop = asyncio.get_event_loop()
108
  chunks = await loop.run_in_executor(
109
  None,
110
+ lambda: retrieve_chunks_routed(
111
  effective_query,
112
  k=req.k,
113
  category=category,
 
122
 
123
  # ── Step 3: Stream answer tokens ──────────────────────────────────
124
  images = []
125
+ done_sources = []
126
  # 🚀 Define the boolean once for readability
127
  is_eval = x_eval_mode == "true"
128
  async for event in generate_answer_stream(
 
138
  yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
139
  elif event["type"] == "done":
140
  images = event.get("images", [])
141
+ done_sources = event.get("sources", []) or []
142
 
143
  # ── Step 4: Emit sources + images ─────────────────────────────────
144
+ sources = done_sources or _build_sources_from_chunks(
145
+ chunks, include_full_text=is_eval
146
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  yield "data: " + json.dumps({
149
  "type": "done",
 
177
  "X-Accel-Buffering": "no",
178
  "Access-Control-Allow-Origin": "*",
179
  }
180
+ )
backend/core/classifier.py CHANGED
@@ -1,8 +1,5 @@
1
- # PASTE classifier.py HERE
2
- # Fix: 'import config' -> 'from backend.core import config'
3
  """
4
- classifier.py — Hierarchical Ensemble Document Classifier
5
- ==========================================================
6
 
7
  3-stage cascade:
8
  Stage 1: Embedding nearest-centroid (cosine similarity, no API calls after warmup)
@@ -15,14 +12,9 @@ before the normal pipeline (catches periodic tables, reference charts etc.)
15
  Each stage only activates if the previous stage's confidence is below its threshold.
16
  Centroid embeddings are persisted to Supabase so they survive restarts.
17
 
18
- Usage (from cl.py):
19
- from classifier import DocumentClassifier
20
  clf = DocumentClassifier()
21
  result = clf.classify(sample_text, elements)
22
- # result.document_type → "machine_learning_paper"
23
- # result.confidence → 0.91
24
- # result.stage_used → "centroid"
25
- # result.is_new_type → False
26
  """
27
 
28
  import re
@@ -672,4 +664,4 @@ JSON:"""
672
  if score > best_score:
673
  best_score = score
674
  best_type = cat
675
- return (best_type, best_score) if best_type else (None, None)
 
 
 
1
  """
2
+ Hierarchical ensemble document classifier.
 
3
 
4
  3-stage cascade:
5
  Stage 1: Embedding nearest-centroid (cosine similarity, no API calls after warmup)
 
12
  Each stage only activates if the previous stage's confidence is below its threshold.
13
  Centroid embeddings are persisted to Supabase so they survive restarts.
14
 
15
+ Typical usage:
 
16
  clf = DocumentClassifier()
17
  result = clf.classify(sample_text, elements)
 
 
 
 
18
  """
19
 
20
  import re
 
664
  if score > best_score:
665
  best_score = score
666
  best_type = cat
667
+ return (best_type, best_score) if best_type else (None, None)
backend/core/pipeline.py CHANGED
@@ -1,24 +1,7 @@
1
- # PASTE cl.py HERE
2
- # Fix: 'import config' -> 'from backend.core import config'
3
- # Fix: 'from classifier import DocumentClassifier' -> 'from backend.core.classifier import DocumentClassifier'
4
  """
5
- RAG Ingestion & Retrieval Pipeline (cl.py)
6
- ============================================
7
- Run modes:
8
- python cl.py --ingest --pdf path/to/file.pdf
9
- python cl.py --ingest --pdf file.pdf --force
10
- python cl.py --query "your question here"
11
- python cl.py --ingest --pdf file.pdf --export
12
-
13
- Improvements in this version:
14
- - FIX: process_chunks now uses graph_data.document_type (not .categories[0])
15
- - FIX: is_file_already_ingested now hits ingested_files registry table (O(1))
16
- - NEW: In-memory embedding cache for repeated queries (thread-safe LRU via functools)
17
- - NEW: ingested_files registry insert after successful upload
18
- - NEW: relevance_score surfaced in metadata for UI badge display
19
- - NEW: Source deduplication by chunk content hash (prevents near-duplicate passages)
20
- - NEW: Graceful empty-query guard in generate_sub_queries
21
- - NEW: MMR-style post-rerank diversity filter to stop one source dominating
22
  """
23
 
24
  import os
@@ -631,6 +614,7 @@ def process_chunks(
631
  file_path: str,
632
  file_hash: str,
633
  graph_data: DocumentGraphMetadata,
 
634
  pdf_images=None,
635
  ) -> tuple[List[Document], List[str]]:
636
  """Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
@@ -718,7 +702,7 @@ def process_chunks(
718
  )
719
  docs.append(doc)
720
 
721
- unique_string = f"{file_hash}_chunk_{i}"
722
  chunk_id = str(uuid.uuid5(NAMESPACE, unique_string))
723
  ids.append(chunk_id)
724
 
@@ -727,7 +711,7 @@ def process_chunks(
727
 
728
 
729
  def build_raptor_tree(
730
- leaf_docs: List[Document], leaf_ids: List[str]
731
  ) -> tuple[List[Document], List[str]]:
732
  """
733
  RAPTOR implementation: recursively clusters documents and generates
@@ -801,15 +785,23 @@ def build_raptor_tree(
801
  # Generate deterministic ID for the parent
802
  import hashlib
803
 
 
 
 
804
  parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
 
805
  parent_id = str(
806
- uuid.uuid5(uuid.NAMESPACE_DNS, f"raptor_{current_level}_{parent_hash}")
 
 
 
 
 
 
807
  )
808
 
809
  # Create the parent document
810
  # Inherit metadata from the first child (source array, file hash, document type)
811
- base_meta = cluster[0].metadata
812
-
813
  # Gather all unique page numbers from children
814
  all_pages = set()
815
  for c in cluster:
@@ -925,12 +917,6 @@ def _register_ingested_file(
925
  log.warning("Could not register in ingested_files: %s", exc)
926
 
927
 
928
- # =========================================================================== #
929
- # ADD THESE TWO FUNCTIONS TO cl.py #
930
- # Place them right after _register_ingested_file() #
931
- # =========================================================================== #
932
-
933
-
934
  def _apply_category_override(
935
  file_hash: str, new_category: str, access_token: str = None
936
  ) -> None:
@@ -1265,7 +1251,29 @@ def run_ingestion(
1265
  if already_exists and not force:
1266
  log.info("SKIPPING — already ingested.")
1267
  return "already_ingested"
1268
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1269
  # 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
1270
  # RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
1271
  if already_exists or force:
@@ -1276,30 +1284,13 @@ def run_ingestion(
1276
  "user_id", user_id
1277
  ).contains("metadata", {"file_hash": file_hash}).execute()
1278
  # 2. Clear the registry
1279
- supabase.table("ingested_files").delete().eq("file_hash", file_hash).execute()
 
 
1280
  # 3. Clear the tree if it exists
1281
- supabase.table("document_trees").delete().eq("file_hash", file_hash).execute()
1282
-
1283
- # NEW: Check if user has previously overridden the category for this file.
1284
- # If so, skip the classifier and use their choice directly.
1285
- forced_category = None
1286
- try:
1287
- _sb = _build_supabase_client(access_token)
1288
- _existing = (
1289
- _sb.table("ingested_files")
1290
- .select("document_type, user_overridden")
1291
- .eq("file_hash", file_hash)
1292
- .limit(1)
1293
- .execute()
1294
- )
1295
- if _existing.data and _existing.data[0].get("user_overridden"):
1296
- forced_category = _existing.data[0]["document_type"]
1297
- log.info(
1298
- "User override active — forcing category '%s', skipping classifier.",
1299
- forced_category,
1300
- )
1301
- except Exception as _exc:
1302
- log.warning("Could not check user override: %s", _exc)
1303
 
1304
  _progress(2, "Partitioning PDF (OCR + layout detection)…")
1305
  elements = partition_document(pdf_path)
@@ -1332,8 +1323,9 @@ def run_ingestion(
1332
  doc_tree = _build_document_tree(elements)
1333
 
1334
  sb = _build_service_supabase_client()
1335
- sb.table("document_trees").insert(
1336
- {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree}
 
1337
  ).execute()
1338
  log.info("✅ PageIndex tree saved to Supabase.")
1339
  except Exception as e:
@@ -1347,12 +1339,12 @@ def run_ingestion(
1347
  else:
1348
  pdf_path_for_naming = pdf_path
1349
  docs, ids = process_chunks(
1350
- chunks, elements, pdf_path_for_naming, file_hash, graph_data, pdf_images
1351
  )
1352
 
1353
  # --- NATIVE RAPTOR INDEXING ---
1354
  _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
1355
- docs, ids = build_raptor_tree(docs, ids)
1356
 
1357
  smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
1358
  if export_json:
@@ -1374,7 +1366,7 @@ def run_ingestion(
1374
  "document_type": graph_data.document_type,
1375
  "chunk_count": len(docs),
1376
  },
1377
- on_conflict="file_hash",
1378
  ).execute()
1379
  except Exception as e:
1380
  log.error("Failed to register file: %s", e)
@@ -1936,10 +1928,12 @@ def retrieve_chunks(
1936
  max_per_source=max_per_source,
1937
  )
1938
 
1939
- docs = [
1940
- Document(page_content=c["content"], metadata=c.get("metadata", {}))
1941
- for c in diverse
1942
- ]
 
 
1943
  log.info(
1944
  "Dropped %d low-relevance/duplicate chunks.",
1945
  len(all_candidates) - len(docs),
@@ -2269,12 +2263,6 @@ def generate_answer(
2269
  log.error("Answer generation failed: %s", exc)
2270
  return f"Failed to generate answer: {exc}", []
2271
 
2272
-
2273
- # ── ADD THIS FUNCTION to pipeline.py right after generate_answer() ──────────
2274
- # Also add these imports at the top of pipeline.py if not already there:
2275
- # from typing import AsyncGenerator
2276
-
2277
-
2278
  async def generate_answer_stream(
2279
  chunks: List[Document],
2280
  query: str,
@@ -2568,13 +2556,12 @@ def _save_to_memory(
2568
  {
2569
  "query_embedding": query_vector,
2570
  "match_session_id": session_id,
2571
- "match_threshold": 0.98, # 98% similarity = practically identical
2572
  "match_count": 1,
2573
  },
2574
  ).execute()
2575
 
2576
  # If we found a nearly identical query in this session, skip saving!
2577
- if dup_check.data:
2578
  log.info(
2579
  "🧠 Memory Bouncer: Duplicate query detected in session %s. Skipping save.",
2580
  session_id[:8],
@@ -2713,7 +2700,9 @@ def _should_use_tree_path(query: str) -> bool:
2713
  return False
2714
 
2715
 
2716
- def tree_search(query: str, access_token: str = None) -> List[Document]:
 
 
2717
  """
2718
  Navigates the structural JSON trees in Supabase to answer highly specific
2719
  'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
@@ -2761,6 +2750,21 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
2761
  if not res.data:
2762
  return []
2763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2764
  matched_chunks = []
2765
 
2766
  # 3. Recursive Tree Traversal
@@ -2789,6 +2793,8 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
2789
 
2790
  # 4. Execute traversal across all trees
2791
  for tree_row in res.data:
 
 
2792
  _traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
2793
 
2794
  log.info(
@@ -2803,6 +2809,50 @@ def tree_search(query: str, access_token: str = None) -> List[Document]:
2803
  return []
2804
 
2805
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2806
  def run_query(
2807
  query: str,
2808
  k: int = 3,
@@ -2815,35 +2865,16 @@ def run_query(
2815
  ) -> Tuple[str, List[str]]:
2816
 
2817
  # 1. Document Retrieval (Routed)
2818
- use_tree_path = _should_use_tree_path(query)
2819
-
2820
- if use_tree_path:
2821
- log.info("🎯 PageIndex triggered: Query routed to structural tree path.")
2822
- chunks = tree_search(query, access_token=access_token)
2823
-
2824
- # Failsafe: If the tree search somehow comes up empty, fall back to vector search
2825
- if not chunks:
2826
- log.info("Tree search yielded 0 results. Falling back to vector search.")
2827
- chunks = retrieve_chunks(
2828
- query,
2829
- k=k,
2830
- source_file=source_file,
2831
- category=category,
2832
- alpha=alpha,
2833
- session_id=session_id,
2834
- access_token=access_token,
2835
- )
2836
- else:
2837
- log.info("🌊 Semantic path triggered: Query routed to vector search.")
2838
- chunks = retrieve_chunks(
2839
- query,
2840
- k=k,
2841
- source_file=source_file,
2842
- category=category,
2843
- alpha=alpha,
2844
- session_id=session_id,
2845
- access_token=access_token,
2846
- )
2847
 
2848
  # 2. Retrieve Episodic Memory (Semantic Search)
2849
  past_memories = []
 
 
 
 
1
  """
2
+ Core ingestion, retrieval, generation, memory, and tree-search pipeline.
3
+
4
+ This module backs the FastAPI routes and Celery ingestion worker for Morpheus.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import os
 
614
  file_path: str,
615
  file_hash: str,
616
  graph_data: DocumentGraphMetadata,
617
+ user_id: str,
618
  pdf_images=None,
619
  ) -> tuple[List[Document], List[str]]:
620
  """Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
 
702
  )
703
  docs.append(doc)
704
 
705
+ unique_string = f"{user_id}:{file_hash}:chunk:{i}"
706
  chunk_id = str(uuid.uuid5(NAMESPACE, unique_string))
707
  ids.append(chunk_id)
708
 
 
711
 
712
 
713
  def build_raptor_tree(
714
+ leaf_docs: List[Document], leaf_ids: List[str], user_id: str
715
  ) -> tuple[List[Document], List[str]]:
716
  """
717
  RAPTOR implementation: recursively clusters documents and generates
 
785
  # Generate deterministic ID for the parent
786
  import hashlib
787
 
788
+ child_node_ids = [
789
+ str(c.metadata.get("node_id", "")) for c in cluster if c.metadata.get("node_id")
790
+ ]
791
  parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
792
+ base_meta = cluster[0].metadata
793
  parent_id = str(
794
+ uuid.uuid5(
795
+ uuid.NAMESPACE_DNS,
796
+ (
797
+ f"{user_id}:raptor:{base_meta.get('file_hash', '')}:"
798
+ f"{current_level}:{'|'.join(child_node_ids)}:{parent_hash}"
799
+ ),
800
+ )
801
  )
802
 
803
  # Create the parent document
804
  # Inherit metadata from the first child (source array, file hash, document type)
 
 
805
  # Gather all unique page numbers from children
806
  all_pages = set()
807
  for c in cluster:
 
917
  log.warning("Could not register in ingested_files: %s", exc)
918
 
919
 
 
 
 
 
 
 
920
  def _apply_category_override(
921
  file_hash: str, new_category: str, access_token: str = None
922
  ) -> None:
 
1251
  if already_exists and not force:
1252
  log.info("SKIPPING — already ingested.")
1253
  return "already_ingested"
1254
+
1255
+ # NEW: Check if user has previously overridden the category for this file.
1256
+ # If so, skip the classifier and use their choice directly.
1257
+ forced_category = None
1258
+ if already_exists or force:
1259
+ try:
1260
+ _sb = _build_supabase_client(access_token)
1261
+ _existing = (
1262
+ _sb.table("ingested_files")
1263
+ .select("document_type, user_overridden")
1264
+ .eq("file_hash", file_hash)
1265
+ .limit(1)
1266
+ .execute()
1267
+ )
1268
+ if _existing.data and _existing.data[0].get("user_overridden"):
1269
+ forced_category = _existing.data[0]["document_type"]
1270
+ log.info(
1271
+ "User override active — forcing category '%s', skipping classifier.",
1272
+ forced_category,
1273
+ )
1274
+ except Exception as _exc:
1275
+ log.warning("Could not check user override: %s", _exc)
1276
+
1277
  # 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
1278
  # RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
1279
  if already_exists or force:
 
1284
  "user_id", user_id
1285
  ).contains("metadata", {"file_hash": file_hash}).execute()
1286
  # 2. Clear the registry
1287
+ supabase.table("ingested_files").delete().eq("user_id", user_id).eq(
1288
+ "file_hash", file_hash
1289
+ ).execute()
1290
  # 3. Clear the tree if it exists
1291
+ supabase.table("document_trees").delete().eq("user_id", user_id).eq(
1292
+ "file_hash", file_hash
1293
+ ).execute()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1294
 
1295
  _progress(2, "Partitioning PDF (OCR + layout detection)…")
1296
  elements = partition_document(pdf_path)
 
1323
  doc_tree = _build_document_tree(elements)
1324
 
1325
  sb = _build_service_supabase_client()
1326
+ sb.table("document_trees").upsert(
1327
+ {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
1328
+ on_conflict="user_id,file_hash",
1329
  ).execute()
1330
  log.info("✅ PageIndex tree saved to Supabase.")
1331
  except Exception as e:
 
1339
  else:
1340
  pdf_path_for_naming = pdf_path
1341
  docs, ids = process_chunks(
1342
+ chunks, elements, pdf_path_for_naming, file_hash, graph_data, user_id, pdf_images
1343
  )
1344
 
1345
  # --- NATIVE RAPTOR INDEXING ---
1346
  _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
1347
+ docs, ids = build_raptor_tree(docs, ids, user_id)
1348
 
1349
  smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
1350
  if export_json:
 
1366
  "document_type": graph_data.document_type,
1367
  "chunk_count": len(docs),
1368
  },
1369
+ on_conflict="user_id,file_hash",
1370
  ).execute()
1371
  except Exception as e:
1372
  log.error("Failed to register file: %s", e)
 
1928
  max_per_source=max_per_source,
1929
  )
1930
 
1931
+ docs = []
1932
+ for c in diverse:
1933
+ meta = dict(c.get("metadata", {}) or {})
1934
+ if c.get("id") is not None:
1935
+ meta["id"] = str(c["id"])
1936
+ docs.append(Document(page_content=c["content"], metadata=meta))
1937
  log.info(
1938
  "Dropped %d low-relevance/duplicate chunks.",
1939
  len(all_candidates) - len(docs),
 
2263
  log.error("Answer generation failed: %s", exc)
2264
  return f"Failed to generate answer: {exc}", []
2265
 
 
 
 
 
 
 
2266
  async def generate_answer_stream(
2267
  chunks: List[Document],
2268
  query: str,
 
2556
  {
2557
  "query_embedding": query_vector,
2558
  "match_session_id": session_id,
 
2559
  "match_count": 1,
2560
  },
2561
  ).execute()
2562
 
2563
  # If we found a nearly identical query in this session, skip saving!
2564
+ if dup_check.data and float(dup_check.data[0].get("similarity", 0.0)) >= 0.98:
2565
  log.info(
2566
  "🧠 Memory Bouncer: Duplicate query detected in session %s. Skipping save.",
2567
  session_id[:8],
 
2700
  return False
2701
 
2702
 
2703
+ def tree_search(
2704
+ query: str, access_token: str = None, category: str = None
2705
+ ) -> List[Document]:
2706
  """
2707
  Navigates the structural JSON trees in Supabase to answer highly specific
2708
  'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
 
2750
  if not res.data:
2751
  return []
2752
 
2753
+ allowed_hashes = None
2754
+ if category and category != "All":
2755
+ try:
2756
+ allowed_res = (
2757
+ sb.table("ingested_files")
2758
+ .select("file_hash")
2759
+ .eq("document_type", category)
2760
+ .execute()
2761
+ )
2762
+ allowed_hashes = {
2763
+ row.get("file_hash") for row in (allowed_res.data or []) if row.get("file_hash")
2764
+ }
2765
+ except Exception as exc:
2766
+ log.warning("Could not apply tree-search category filter: %s", exc)
2767
+
2768
  matched_chunks = []
2769
 
2770
  # 3. Recursive Tree Traversal
 
2793
 
2794
  # 4. Execute traversal across all trees
2795
  for tree_row in res.data:
2796
+ if allowed_hashes is not None and tree_row.get("file_hash") not in allowed_hashes:
2797
+ continue
2798
  _traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
2799
 
2800
  log.info(
 
2809
  return []
2810
 
2811
 
2812
+ def retrieve_chunks_routed(
2813
+ query: str,
2814
+ k: int = 3,
2815
+ source_file: str = None,
2816
+ category: str = None,
2817
+ alpha: float = 0.5,
2818
+ session_id: str = "default_session",
2819
+ access_token: str = None,
2820
+ user_id: str = None,
2821
+ original_query: str = None,
2822
+ eval_mode: bool = False,
2823
+ ) -> List[Document]:
2824
+ """
2825
+ Live request-path retrieval entrypoint.
2826
+ Routes structural queries to the tree index first, then falls back to vector retrieval.
2827
+ """
2828
+ routing_query = (original_query or query or "").strip()
2829
+ if routing_query and _should_use_tree_path(routing_query):
2830
+ log.info("🎯 PageIndex triggered: query routed to structural tree path.")
2831
+ tree_chunks = tree_search(
2832
+ routing_query, access_token=access_token, category=category
2833
+ )
2834
+ if tree_chunks:
2835
+ if session_id:
2836
+ session_key = _session_cache_key(session_id, user_id=user_id)
2837
+ with _last_chunks_lock:
2838
+ _last_chunks[session_key] = tree_chunks
2839
+ return tree_chunks
2840
+ log.info("Tree search yielded 0 results. Falling back to vector search.")
2841
+
2842
+ return retrieve_chunks(
2843
+ query,
2844
+ k=k,
2845
+ source_file=source_file,
2846
+ category=category,
2847
+ alpha=alpha,
2848
+ session_id=session_id,
2849
+ access_token=access_token,
2850
+ user_id=user_id,
2851
+ original_query=original_query,
2852
+ eval_mode=eval_mode,
2853
+ )
2854
+
2855
+
2856
  def run_query(
2857
  query: str,
2858
  k: int = 3,
 
2865
  ) -> Tuple[str, List[str]]:
2866
 
2867
  # 1. Document Retrieval (Routed)
2868
+ chunks = retrieve_chunks_routed(
2869
+ query,
2870
+ k=k,
2871
+ source_file=source_file,
2872
+ category=category,
2873
+ alpha=alpha,
2874
+ session_id=session_id,
2875
+ access_token=access_token,
2876
+ original_query=query,
2877
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2878
 
2879
  # 2. Retrieve Episodic Memory (Semantic Search)
2880
  past_memories = []
frontend/js/chat.js CHANGED
@@ -203,6 +203,16 @@ async function sendChat() {
203
  el.scrollTop = el.scrollHeight;
204
  },
205
  onError(errMsg) {
 
 
 
 
 
 
 
 
 
 
206
  bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
207
  },
208
  });
@@ -306,4 +316,4 @@ function handleChatKey(e) {
306
  function autoResize(el) {
307
  el.style.height = 'auto';
308
  el.style.height = Math.min(el.scrollHeight, 120) + 'px';
309
- }
 
203
  el.scrollTop = el.scrollHeight;
204
  },
205
  onError(errMsg) {
206
+ if (fullText.trim()) {
207
+ if (!assistantDiv.querySelector('.stream-error-note')) {
208
+ const note = document.createElement('div');
209
+ note.className = 'stream-error-note';
210
+ note.innerHTML = `<p class="msg-p" style="color:var(--red);margin-top:10px">${esc(errMsg)}</p>`;
211
+ assistantDiv.appendChild(note);
212
+ }
213
+ toast(errMsg, 'error');
214
+ return;
215
+ }
216
  bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
217
  },
218
  });
 
316
  function autoResize(el) {
317
  el.style.height = 'auto';
318
  el.style.height = Math.min(el.scrollHeight, 120) + 'px';
319
+ }
supabase/migrations/0004_hierarchical_nodes.sql CHANGED
@@ -22,7 +22,7 @@ CREATE OR REPLACE FUNCTION public.insert_document_chunk(
22
  p_id uuid,
23
  p_content text,
24
  p_metadata jsonb,
25
- p_embedding public.vector,
26
  p_user_id uuid,
27
  p_node_type text DEFAULT 'leaf',
28
  p_parent_node_id uuid DEFAULT NULL,
@@ -58,7 +58,7 @@ $$;
58
 
59
  CREATE OR REPLACE FUNCTION public.hybrid_search(
60
  query_text text,
61
- query_embedding public.vector,
62
  match_count integer DEFAULT 10,
63
  filter jsonb DEFAULT '{}'::jsonb,
64
  semantic_weight double precision DEFAULT 0.7,
@@ -72,10 +72,10 @@ begin
72
  semantic as (
73
  select
74
  d.id, d.content, d.metadata,
75
- (1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as score
76
  from documents d
77
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
78
- order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
79
  limit match_count * 3
80
  ),
81
  keyword as (
 
22
  p_id uuid,
23
  p_content text,
24
  p_metadata jsonb,
25
+ p_embedding extensions.vector,
26
  p_user_id uuid,
27
  p_node_type text DEFAULT 'leaf',
28
  p_parent_node_id uuid DEFAULT NULL,
 
58
 
59
  CREATE OR REPLACE FUNCTION public.hybrid_search(
60
  query_text text,
61
+ query_embedding extensions.vector,
62
  match_count integer DEFAULT 10,
63
  filter jsonb DEFAULT '{}'::jsonb,
64
  semantic_weight double precision DEFAULT 0.7,
 
72
  semantic as (
73
  select
74
  d.id, d.content, d.metadata,
75
+ (1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as score
76
  from documents d
77
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
78
+ order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
79
  limit match_count * 3
80
  ),
81
  keyword as (
supabase/migrations/0006_multi_tenant_file_uniqueness.sql ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Migration 0006: Tenant-safe duplicate file support
2
+ --
3
+ -- The application now derives document node IDs from (user_id + file_hash + chunk),
4
+ -- so duplicate PDFs across tenants no longer collide in public.documents.
5
+ -- This migration fixes the remaining per-file tables that still used global
6
+ -- file_hash uniqueness.
7
+
8
+ -- Allow the same file_hash to exist for multiple tenants in ingested_files.
9
+ ALTER TABLE public.ingested_files
10
+ DROP CONSTRAINT IF EXISTS ingested_files_file_hash_key;
11
+
12
+ CREATE UNIQUE INDEX IF NOT EXISTS ingested_files_user_file_hash_uidx
13
+ ON public.ingested_files (user_id, file_hash);
14
+
15
+ -- Allow the same file_hash to exist for multiple tenants in document_trees.
16
+ ALTER TABLE public.document_trees
17
+ DROP CONSTRAINT IF EXISTS document_trees_pkey;
18
+
19
+ CREATE UNIQUE INDEX IF NOT EXISTS document_trees_user_file_hash_uidx
20
+ ON public.document_trees (user_id, file_hash);
21
+
22
+ CREATE INDEX IF NOT EXISTS document_trees_user_id_idx
23
+ ON public.document_trees (user_id);
supabase/schema_backup.sql CHANGED
@@ -1,8 +1,6 @@
1
- --
2
- -- PostgreSQL database dump
3
- --
4
-
5
- \restrict D5DLkgreJkSzmm3K8XWpdjuj7WrATcgxPzakxedCgOsd9eMGt0ScgsbXbeWVrAx
6
 
7
  -- Dumped from database version 17.6
8
  -- Dumped by pg_dump version 18.3
@@ -23,7 +21,7 @@ SET row_security = off;
23
  -- Name: public; Type: SCHEMA; Schema: -; Owner: -
24
  --
25
 
26
- CREATE SCHEMA public;
27
 
28
 
29
  --
@@ -77,10 +75,10 @@ $$;
77
 
78
 
79
  --
80
- -- Name: hybrid_search(text, public.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
81
  --
82
 
83
- CREATE FUNCTION public.hybrid_search(query_text text, query_embedding public.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
84
  LANGUAGE plpgsql
85
  AS $$
86
  begin
@@ -89,10 +87,10 @@ begin
89
  semantic as (
90
  select
91
  d.id, d.content, d.metadata,
92
- (1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as score
93
  from documents d
94
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
95
- order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
96
  limit match_count * 3
97
  ),
98
  keyword as (
@@ -137,10 +135,10 @@ $$;
137
 
138
 
139
  --
140
- -- Name: insert_document_chunk(uuid, text, jsonb, public.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
141
  --
142
 
143
- CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding public.vector, p_user_id uuid) RETURNS void
144
  LANGUAGE plpgsql SECURITY DEFINER
145
  AS $$
146
  BEGIN
@@ -155,10 +153,10 @@ $$;
155
 
156
 
157
  --
158
- -- Name: insert_document_chunk(uuid, text, jsonb, public.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
159
  --
160
 
161
- CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding public.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
162
  LANGUAGE plpgsql SECURITY DEFINER
163
  AS $$
164
  BEGIN
@@ -182,10 +180,10 @@ $$;
182
 
183
 
184
  --
185
- -- Name: match_documents(public.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
186
  --
187
 
188
- CREATE FUNCTION public.match_documents(query_embedding public.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
189
  LANGUAGE plpgsql
190
  AS $$
191
  begin
@@ -194,20 +192,20 @@ begin
194
  d.id,
195
  d.content,
196
  d.metadata,
197
- (1 - (d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)))::float as similarity
198
  from documents d
199
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
200
- order by d.embedding::halfvec(2048) <=> query_embedding::halfvec(2048)
201
  limit match_count;
202
  end;
203
  $$;
204
 
205
 
206
  --
207
- -- Name: match_memory(public.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
208
  --
209
 
210
- CREATE FUNCTION public.match_memory(query_embedding public.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
211
  LANGUAGE plpgsql
212
  AS $$
213
  BEGIN
@@ -266,7 +264,7 @@ CREATE TABLE public.chat_memory (
266
  session_id text NOT NULL,
267
  role text NOT NULL,
268
  content text NOT NULL,
269
- embedding public.vector(2048),
270
  created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
271
  user_id uuid DEFAULT auth.uid()
272
  );
@@ -292,7 +290,7 @@ CREATE TABLE public.documents (
292
  id uuid DEFAULT gen_random_uuid() NOT NULL,
293
  content text,
294
  metadata jsonb,
295
- embedding public.vector(2048),
296
  user_id uuid DEFAULT auth.uid(),
297
  node_type text DEFAULT 'leaf'::text,
298
  parent_node_id uuid,
@@ -512,11 +510,11 @@ ALTER TABLE ONLY public.chat_memory
512
 
513
 
514
  --
515
- -- Name: document_trees document_trees_pkey; Type: CONSTRAINT; Schema: public; Owner: -
516
- --
517
-
518
- ALTER TABLE ONLY public.document_trees
519
- ADD CONSTRAINT document_trees_pkey PRIMARY KEY (file_hash);
520
 
521
 
522
  --
@@ -536,11 +534,11 @@ ALTER TABLE ONLY public.evaluation_logs
536
 
537
 
538
  --
539
- -- Name: ingested_files ingested_files_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
540
- --
541
-
542
- ALTER TABLE ONLY public.ingested_files
543
- ADD CONSTRAINT ingested_files_file_hash_key UNIQUE (file_hash);
544
 
545
 
546
  --
@@ -621,7 +619,7 @@ CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvecto
621
  -- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
622
  --
623
 
624
- CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::public.halfvec(2048)) public.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
625
 
626
 
627
  --
@@ -939,9 +937,7 @@ ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
939
  CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
940
 
941
 
942
- --
943
- -- PostgreSQL database dump complete
944
- --
945
-
946
- \unrestrict D5DLkgreJkSzmm3K8XWpdjuj7WrATcgxPzakxedCgOsd9eMGt0ScgsbXbeWVrAx
947
 
 
1
+ --
2
+ -- PostgreSQL database dump
3
+ --
 
 
4
 
5
  -- Dumped from database version 17.6
6
  -- Dumped by pg_dump version 18.3
 
21
  -- Name: public; Type: SCHEMA; Schema: -; Owner: -
22
  --
23
 
24
+ CREATE SCHEMA IF NOT EXISTS public;
25
 
26
 
27
  --
 
75
 
76
 
77
  --
78
+ -- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
79
  --
80
 
81
+ CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
82
  LANGUAGE plpgsql
83
  AS $$
84
  begin
 
87
  semantic as (
88
  select
89
  d.id, d.content, d.metadata,
90
+ (1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as score
91
  from documents d
92
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
93
+ order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
94
  limit match_count * 3
95
  ),
96
  keyword as (
 
135
 
136
 
137
  --
138
+ -- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
139
  --
140
 
141
+ CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
142
  LANGUAGE plpgsql SECURITY DEFINER
143
  AS $$
144
  BEGIN
 
153
 
154
 
155
  --
156
+ -- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
157
  --
158
 
159
+ CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
160
  LANGUAGE plpgsql SECURITY DEFINER
161
  AS $$
162
  BEGIN
 
180
 
181
 
182
  --
183
+ -- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
184
  --
185
 
186
+ CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
187
  LANGUAGE plpgsql
188
  AS $$
189
  begin
 
192
  d.id,
193
  d.content,
194
  d.metadata,
195
+ (1 - (d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)))::float as similarity
196
  from documents d
197
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
198
+ order by d.embedding::extensions.halfvec(2048) <=> query_embedding::extensions.halfvec(2048)
199
  limit match_count;
200
  end;
201
  $$;
202
 
203
 
204
  --
205
+ -- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
206
  --
207
 
208
+ CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
209
  LANGUAGE plpgsql
210
  AS $$
211
  BEGIN
 
264
  session_id text NOT NULL,
265
  role text NOT NULL,
266
  content text NOT NULL,
267
+ embedding extensions.vector(2048),
268
  created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
269
  user_id uuid DEFAULT auth.uid()
270
  );
 
290
  id uuid DEFAULT gen_random_uuid() NOT NULL,
291
  content text,
292
  metadata jsonb,
293
+ embedding extensions.vector(2048),
294
  user_id uuid DEFAULT auth.uid(),
295
  node_type text DEFAULT 'leaf'::text,
296
  parent_node_id uuid,
 
510
 
511
 
512
  --
513
+ -- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
514
+ --
515
+
516
+ ALTER TABLE ONLY public.document_trees
517
+ ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
518
 
519
 
520
  --
 
534
 
535
 
536
  --
537
+ -- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
538
+ --
539
+
540
+ ALTER TABLE ONLY public.ingested_files
541
+ ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
542
 
543
 
544
  --
 
619
  -- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
620
  --
621
 
622
+ CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
623
 
624
 
625
  --
 
937
  CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
938
 
939
 
940
+ --
941
+ -- PostgreSQL database dump complete
942
+ --
 
 
943