nothex committed on
Commit
4abd98f
·
1 Parent(s): ca5846e

Harden ingestion and retrieval reliability across the pipeline

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. ARCHITECTURE.md +9 -1
  2. backend/api/admin.py +286 -3
  3. backend/api/auth.py +75 -4
  4. backend/api/frontend_config.py +8 -2
  5. backend/api/ingest.py +54 -12
  6. backend/api/query.py +116 -4
  7. backend/core/auth_utils.py +40 -1
  8. backend/core/classifier.py +10 -15
  9. backend/core/config.py +52 -2
  10. backend/core/pipeline.py +0 -0
  11. backend/core/pipeline_ambiguity.py +221 -0
  12. backend/core/pipeline_generation.py +54 -0
  13. backend/core/pipeline_ingestion.py +465 -0
  14. backend/core/pipeline_memory.py +23 -0
  15. backend/core/pipeline_pageindex.py +263 -0
  16. backend/core/pipeline_retrieval.py +83 -0
  17. backend/core/pipeline_routing.py +149 -0
  18. backend/core/pipeline_supabase.py +46 -0
  19. backend/core/pipeline_types.py +65 -0
  20. backend/core/rate_limit.py +39 -0
  21. backend/core/tasks.py +60 -19
  22. backend/core/warmup_classifier.py +5 -1
  23. backend/eval/run_eval.py +66 -0
  24. backend/main.py +12 -15
  25. frontend/index.html +72 -20
  26. frontend/js/admin.js +234 -0
  27. frontend/js/api.js +126 -24
  28. frontend/js/chat.js +173 -30
  29. frontend/js/config.js +39 -5
  30. frontend/js/corpus.js +79 -3
  31. frontend/js/graph.js +98 -66
  32. frontend/js/main.js +312 -63
  33. frontend/js/state.js +21 -12
  34. recent_changes.txt +0 -0
  35. requirements.txt +2 -1
  36. scripts/rebuild_pageindex.py +83 -0
  37. shared/types.py +41 -4
  38. supabase/migrations/0010_query_traces_feedback_graph.sql +131 -0
  39. supabase/migrations/0011_admin_review_eval_workflow.sql +38 -0
  40. supabase/migrations/0012_lock_down_evaluation_datasets.sql +14 -0
  41. supabase/migrations/0013_backend_owned_retrieval_hardening.sql +260 -0
  42. supabase/migrations/0014_drop_legacy_category_centroid_policies.sql +20 -0
  43. supabase/migrations/0015_ingested_file_identity_json.sql +2 -0
  44. supabase/migrations/0016_ingestion_file_hash_checkpoints.sql +5 -0
  45. supabase/schema_backup.before_0013.sql +0 -0
  46. supabase/schema_backup.sql +1349 -908
  47. tests/test_guest_mode.py +74 -0
  48. tests/test_ingest_api.py +156 -0
  49. tests/test_pipeline_regressions.py +1831 -0
  50. tests/test_routing_stress_matrix.py +98 -0
ARCHITECTURE.md CHANGED
@@ -133,7 +133,7 @@ morpheus/
133
 
134
  | Function | Purpose |
135
  |----------|---------|
136
- | `hybrid_search(query_text, query_embedding, match_count, filter, semantic_weight, keyword_weight)` | Combined BM25 + pgvector search |
137
  | `match_memory(query_embedding, match_session_id, match_count)` | Semantic search over chat history |
138
  | `insert_document_chunk(p_id, p_content, p_metadata, p_embedding, p_user_id)` | Secure insert with explicit user_id |
139
  | `get_document_types()` | Returns distinct categories for this tenant |
@@ -221,6 +221,12 @@ Step 1: Intent analysis (analyse_intent)
221
  Reference queries ("summarise it"): replaced with previous query
222
  Every query logged to intent_feedback for online retraining
223
 
 
 
 
 
 
 
224
  Step 2: Query routing
225
  Structural queries (table of contents, numbered items, specific codes)?
226
  → tree_search(): recursive traversal of document_trees for this user
@@ -231,6 +237,8 @@ Step 3: retrieve_chunks() — vector path
231
  a) Follow-up detection
232
  Query ≤8 words with pronouns (it/this/that/they)?
233
  Reuse _last_chunks[session_key] — no re-search
 
 
234
 
235
  b) Semantic cache check
236
  Embed query (256-entry in-memory LRU cache)
 
133
 
134
  | Function | Purpose |
135
  |----------|---------|
136
+ | `hybrid_search(query_text, query_embedding, match_count, filter, semantic_weight, keyword_weight, p_user_id)` | Combined BM25 + pgvector search (tenant-scoped overload) |
137
  | `match_memory(query_embedding, match_session_id, match_count)` | Semantic search over chat history |
138
  | `insert_document_chunk(p_id, p_content, p_metadata, p_embedding, p_user_id)` | Secure insert with explicit user_id |
139
  | `get_document_types()` | Returns distinct categories for this tenant |
 
221
  Reference queries ("summarise it"): replaced with previous query
222
  Every query logged to intent_feedback for online retraining
223
 
224
+ Step 1.5: Ambiguity / scope safety (check_query_ambiguity)
225
+ If the user has NOT pinned a document:
226
+ - If **multiple docs are in scope** and the query is **identity/page-scoped** (owner/title/publisher/cover/first page), Morpheus **asks the user to pick a document** (never guesses).
227
+ - Otherwise, Morpheus may ask a clarification question for generic queries when multiple docs match.
228
+ Implementation detail: ambiguity scoring uses `hybrid_search(..., p_user_id=...)` to avoid PostgREST overload ambiguity.
229
+
230
  Step 2: Query routing
231
  Structural queries (table of contents, numbered items, specific codes)?
232
  → tree_search(): recursive traversal of document_trees for this user
 
237
  a) Follow-up detection
238
  Query ≤8 words with pronouns (it/this/that/they)?
239
  Reuse _last_chunks[session_key] — no re-search
240
+ Safety guard: ordinal follow-ups like "the second one" must have an explicit referent (a list);
241
+ otherwise the API asks for clarification instead of guessing.
242
 
243
  b) Semantic cache check
244
  Embed query (256-entry in-memory LRU cache)
backend/api/admin.py CHANGED
@@ -1,11 +1,16 @@
1
  """backend/api/admin.py — Admin endpoints, protected by X-Admin-Key header."""
2
 
3
  import os, hmac, logging # noqa: E401
 
 
 
 
4
  from fastapi import APIRouter, HTTPException, Header, Depends
 
 
5
  from backend.core.auth_utils import require_auth_token
6
  from backend.core.warmup_classifier import warmup, warmup_cross_encoder
7
- from datetime import datetime, timedelta, timezone
8
- from collections import Counter
9
 
10
  log = logging.getLogger("morpheus.api.admin")
11
  router = APIRouter()
@@ -19,6 +24,78 @@ def _check_admin(key: str):
19
  raise HTTPException(status_code=403, detail="Invalid admin key.")
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  @router.post("/warmup")
23
  def run_warmup(x_admin_key: str = Header(..., alias="X-Admin-Key")):
24
  _check_admin(x_admin_key)
@@ -105,4 +182,210 @@ def get_corpus_health(
105
  "recommendation": "Prompt user to upload documents regarding content gaps."
106
  if missing_topics
107
  else "Corpus coverage is sufficient.",
108
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """backend/api/admin.py — Admin endpoints, protected by X-Admin-Key header."""
2
 
3
  import os, hmac, logging # noqa: E401
4
+ from datetime import datetime, timedelta, timezone
5
+ from collections import Counter
6
+ from typing import Optional
7
+
8
  from fastapi import APIRouter, HTTPException, Header, Depends
9
+ from pydantic import BaseModel
10
+
11
  from backend.core.auth_utils import require_auth_token
12
  from backend.core.warmup_classifier import warmup, warmup_cross_encoder
13
+ from backend.core.pipeline import _build_service_supabase_client
 
14
 
15
  log = logging.getLogger("morpheus.api.admin")
16
  router = APIRouter()
 
24
  raise HTTPException(status_code=403, detail="Invalid admin key.")
25
 
26
 
27
+ class ReviewPayload(BaseModel):
28
+ review_state: str = "reviewed"
29
+ review_notes: Optional[str] = None
30
+
31
+
32
+ def _admin_client():
33
+ return _build_service_supabase_client()
34
+
35
+
36
+ def _trace_sort_key(row: dict):
37
+ return row.get("created_at") or ""
38
+
39
+
40
+ def _feedback_sort_key(row: dict):
41
+ return row.get("created_at") or ""
42
+
43
+
44
+ def _load_recent_traces(*, limit: int = 100) -> list[dict]:
45
+ rows = (
46
+ _admin_client()
47
+ .table("query_traces")
48
+ .select(
49
+ "trace_id, question, route_mode, selected_experts, expert_weights, "
50
+ "document_types, doc_diagnostics, failure_modes, quality_metrics, "
51
+ "answer_preview, latency_ms, review_state, review_notes, reviewed_at, "
52
+ "reviewed_by, promoted_to_eval, created_at"
53
+ )
54
+ .limit(limit)
55
+ .execute()
56
+ .data
57
+ or []
58
+ )
59
+ return sorted(rows, key=_trace_sort_key, reverse=True)
60
+
61
+
62
+ def _load_recent_feedback(*, limit: int = 100) -> list[dict]:
63
+ rows = (
64
+ _admin_client()
65
+ .table("answer_feedback")
66
+ .select(
67
+ "id, trace_id, helpful, accepted, reason_code, correction_text, "
68
+ "promote_to_eval, review_state, review_notes, reviewed_at, reviewed_by, "
69
+ "promoted_at, created_at, user_id"
70
+ )
71
+ .limit(limit)
72
+ .execute()
73
+ .data
74
+ or []
75
+ )
76
+ return sorted(rows, key=_feedback_sort_key, reverse=True)
77
+
78
+
79
+ def _build_eval_dataset_row(trace_row: dict, feedback_row: dict) -> dict:
80
+ correction_text = (feedback_row.get("correction_text") or "").strip()
81
+ answer_preview = (trace_row.get("answer_preview") or "").strip()
82
+ return {
83
+ "trace_id": trace_row.get("trace_id"),
84
+ "source": "feedback_trace",
85
+ "question": trace_row.get("question"),
86
+ "gold_context_refs": [],
87
+ "gold_evidence_text": correction_text or answer_preview,
88
+ "is_answerable": bool(
89
+ feedback_row.get("accepted")
90
+ or feedback_row.get("helpful")
91
+ ),
92
+ "failure_modes": trace_row.get("failure_modes") or [],
93
+ "doc_diagnostics": trace_row.get("doc_diagnostics") or [],
94
+ "reason_code": feedback_row.get("reason_code"),
95
+ "is_active": False,
96
+ }
97
+
98
+
99
  @router.post("/warmup")
100
  def run_warmup(x_admin_key: str = Header(..., alias="X-Admin-Key")):
101
  _check_admin(x_admin_key)
 
182
  "recommendation": "Prompt user to upload documents regarding content gaps."
183
  if missing_topics
184
  else "Corpus coverage is sufficient.",
185
+ }
186
+
187
+
188
+ @router.get("/traces")
189
+ def list_query_traces(
190
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
191
+ limit: int = 50,
192
+ route_mode: Optional[str] = None,
193
+ failure_mode: Optional[str] = None,
194
+ category: Optional[str] = None,
195
+ hours: int = 168,
196
+ review_state: Optional[str] = None,
197
+ ):
198
+ _check_admin(x_admin_key)
199
+ traces = _load_recent_traces(limit=max(limit * 3, 100))
200
+ cutoff = datetime.now(timezone.utc) - timedelta(hours=max(1, hours))
201
+ filtered = []
202
+ for row in traces:
203
+ created_raw = row.get("created_at")
204
+ created_at = None
205
+ if isinstance(created_raw, str):
206
+ try:
207
+ created_at = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
208
+ except Exception:
209
+ created_at = None
210
+ if created_at and created_at < cutoff:
211
+ continue
212
+ if route_mode and row.get("route_mode") != route_mode:
213
+ continue
214
+ if failure_mode and failure_mode not in (row.get("failure_modes") or []):
215
+ continue
216
+ if review_state and row.get("review_state") != review_state:
217
+ continue
218
+ if category and category not in (row.get("document_types") or []):
219
+ continue
220
+ filtered.append(row)
221
+ return {"items": filtered[:limit]}
222
+
223
+
224
+ @router.get("/traces/{trace_id}")
225
+ def get_query_trace(
226
+ trace_id: str,
227
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
228
+ ):
229
+ _check_admin(x_admin_key)
230
+ sb = _admin_client()
231
+ trace_rows = (
232
+ sb.table("query_traces")
233
+ .select("*")
234
+ .eq("trace_id", trace_id)
235
+ .limit(1)
236
+ .execute()
237
+ .data
238
+ or []
239
+ )
240
+ if not trace_rows:
241
+ raise HTTPException(status_code=404, detail="Trace not found.")
242
+ feedback_rows = (
243
+ sb.table("answer_feedback")
244
+ .select("*")
245
+ .eq("trace_id", trace_id)
246
+ .execute()
247
+ .data
248
+ or []
249
+ )
250
+ return {"trace": trace_rows[0], "feedback": sorted(feedback_rows, key=_feedback_sort_key, reverse=True)}
251
+
252
+
253
+ @router.post("/traces/{trace_id}/review")
254
+ def review_query_trace(
255
+ trace_id: str,
256
+ payload: ReviewPayload,
257
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
258
+ ):
259
+ _check_admin(x_admin_key)
260
+ now_iso = datetime.now(timezone.utc).isoformat()
261
+ _admin_client().table("query_traces").update(
262
+ {
263
+ "review_state": payload.review_state,
264
+ "review_notes": payload.review_notes,
265
+ "reviewed_at": now_iso,
266
+ "reviewed_by": "admin",
267
+ }
268
+ ).eq("trace_id", trace_id).execute()
269
+ return {"ok": True}
270
+
271
+
272
+ @router.get("/feedback")
273
+ def list_feedback(
274
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
275
+ limit: int = 50,
276
+ review_state: Optional[str] = None,
277
+ promote_only: bool = False,
278
+ ):
279
+ _check_admin(x_admin_key)
280
+ rows = _load_recent_feedback(limit=max(limit * 3, 100))
281
+ filtered = []
282
+ for row in rows:
283
+ if review_state and row.get("review_state") != review_state:
284
+ continue
285
+ if promote_only and not row.get("promote_to_eval"):
286
+ continue
287
+ filtered.append(row)
288
+ return {"items": filtered[:limit]}
289
+
290
+
291
+ @router.get("/feedback/{feedback_id}")
292
+ def get_feedback_detail(
293
+ feedback_id: int,
294
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
295
+ ):
296
+ _check_admin(x_admin_key)
297
+ sb = _admin_client()
298
+ rows = (
299
+ sb.table("answer_feedback")
300
+ .select("*")
301
+ .eq("id", feedback_id)
302
+ .limit(1)
303
+ .execute()
304
+ .data
305
+ or []
306
+ )
307
+ if not rows:
308
+ raise HTTPException(status_code=404, detail="Feedback not found.")
309
+ feedback = rows[0]
310
+ trace_rows = (
311
+ sb.table("query_traces")
312
+ .select("*")
313
+ .eq("trace_id", feedback.get("trace_id"))
314
+ .limit(1)
315
+ .execute()
316
+ .data
317
+ or []
318
+ )
319
+ return {"feedback": feedback, "trace": trace_rows[0] if trace_rows else None}
320
+
321
+
322
+ @router.post("/feedback/{feedback_id}/review")
323
+ def review_feedback(
324
+ feedback_id: int,
325
+ payload: ReviewPayload,
326
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
327
+ ):
328
+ _check_admin(x_admin_key)
329
+ now_iso = datetime.now(timezone.utc).isoformat()
330
+ _admin_client().table("answer_feedback").update(
331
+ {
332
+ "review_state": payload.review_state,
333
+ "review_notes": payload.review_notes,
334
+ "reviewed_at": now_iso,
335
+ "reviewed_by": "admin",
336
+ }
337
+ ).eq("id", feedback_id).execute()
338
+ return {"ok": True}
339
+
340
+
341
+ @router.post("/feedback/{feedback_id}/promote")
342
+ def promote_feedback_to_eval(
343
+ feedback_id: int,
344
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
345
+ ):
346
+ _check_admin(x_admin_key)
347
+ sb = _admin_client()
348
+ feedback_rows = (
349
+ sb.table("answer_feedback")
350
+ .select("*")
351
+ .eq("id", feedback_id)
352
+ .limit(1)
353
+ .execute()
354
+ .data
355
+ or []
356
+ )
357
+ if not feedback_rows:
358
+ raise HTTPException(status_code=404, detail="Feedback not found.")
359
+ feedback = feedback_rows[0]
360
+ trace_rows = (
361
+ sb.table("query_traces")
362
+ .select("*")
363
+ .eq("trace_id", feedback.get("trace_id"))
364
+ .limit(1)
365
+ .execute()
366
+ .data
367
+ or []
368
+ )
369
+ if not trace_rows:
370
+ raise HTTPException(status_code=404, detail="Trace not found.")
371
+ trace = trace_rows[0]
372
+ row = _build_eval_dataset_row(trace, feedback)
373
+ sb.table("evaluation_datasets").upsert(row, on_conflict="trace_id").execute()
374
+ now_iso = datetime.now(timezone.utc).isoformat()
375
+ sb.table("answer_feedback").update(
376
+ {
377
+ "review_state": "promoted",
378
+ "promoted_at": now_iso,
379
+ "reviewed_at": now_iso,
380
+ "reviewed_by": "admin",
381
+ }
382
+ ).eq("id", feedback_id).execute()
383
+ sb.table("query_traces").update(
384
+ {
385
+ "review_state": "promoted",
386
+ "promoted_to_eval": True,
387
+ "reviewed_at": now_iso,
388
+ "reviewed_by": "admin",
389
+ }
390
+ ).eq("trace_id", trace.get("trace_id")).execute()
391
+ return {"ok": True}
backend/api/auth.py CHANGED
@@ -7,10 +7,11 @@ declare `auth: AuthContext = Depends(require_auth)` — see the pattern
7
  at the bottom of this file and replicate it in each router.
8
  """
9
 
10
- from fastapi import APIRouter, Depends
11
 
12
- from backend.core.auth_utils import require_auth_token
13
- from backend.services.auth import get_daily_password, verify_admin_key, verify_password
 
14
  from shared.types import AuthRequest, AuthResponse
15
 
16
  router = APIRouter()
@@ -31,7 +32,7 @@ def verify(req: AuthRequest):
31
  @router.post("/admin", response_model=AuthResponse)
32
  def admin_verify(req: AuthRequest):
33
  if verify_admin_key(req.password):
34
- return AuthResponse(valid=True, token=get_daily_password(), message="Admin verified.")
35
  return AuthResponse(valid=False, message="Invalid admin key.")
36
 
37
 
@@ -40,3 +41,73 @@ def admin_verify(req: AuthRequest):
40
  async def get_me(user_id: str = Depends(require_auth_token)):
41
  return {"user_id": user_id, "authenticated": True}
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  at the bottom of this file and replicate it in each router.
8
  """
9
 
10
+ from fastapi import APIRouter, Depends, Header, HTTPException
11
 
12
+ from backend.core.auth_utils import is_guest_token, require_auth_token
13
+ from backend.core.pipeline import _build_service_supabase_client
14
+ from backend.services.auth import verify_admin_key, verify_password
15
  from shared.types import AuthRequest, AuthResponse
16
 
17
  router = APIRouter()
 
32
  @router.post("/admin", response_model=AuthResponse)
33
  def admin_verify(req: AuthRequest):
34
  if verify_admin_key(req.password):
35
+ return AuthResponse(valid=True, message="Admin verified.")
36
  return AuthResponse(valid=False, message="Invalid admin key.")
37
 
38
 
 
41
  async def get_me(user_id: str = Depends(require_auth_token)):
42
  return {"user_id": user_id, "authenticated": True}
43
 
44
+
45
+ @router.delete("/guest-workspace")
46
+ async def clear_guest_workspace(
47
+ user_id: str = Depends(require_auth_token),
48
+ x_auth_token: str = Header(None, alias="X-Auth-Token"),
49
+ ):
50
+ if not is_guest_token(x_auth_token):
51
+ raise HTTPException(status_code=403, detail="Guest workspace cleanup is only for guest sessions.")
52
+
53
+ sb = _build_service_supabase_client()
54
+
55
+ # Preserve anonymized adaptive signals while removing the guest's actual workspace.
56
+ try:
57
+ sb.table("query_traces").update(
58
+ {
59
+ "user_id": None,
60
+ "session_id": "guest_archived",
61
+ "question": "[guest session removed]",
62
+ "pinned_file_hashes": [],
63
+ "selected_chunk_ids": [],
64
+ "doc_diagnostics": [],
65
+ "answer_preview": None,
66
+ "document_types": [],
67
+ }
68
+ ).eq("user_id", user_id).execute()
69
+ except Exception:
70
+ pass
71
+
72
+ try:
73
+ sb.table("answer_feedback").update(
74
+ {
75
+ "user_id": None,
76
+ "correction_text": None,
77
+ }
78
+ ).eq("user_id", user_id).execute()
79
+ except Exception:
80
+ pass
81
+
82
+ try:
83
+ sb.table("evaluation_logs").update(
84
+ {
85
+ "user_id": None,
86
+ "question": "[guest session removed]",
87
+ }
88
+ ).eq("user_id", user_id).execute()
89
+ except Exception:
90
+ pass
91
+
92
+ def _purge(table_name: str) -> None:
93
+ try:
94
+ sb.table(table_name).delete().eq("user_id", user_id).execute()
95
+ except Exception:
96
+ # Optional/older tables should not break guest cleanup.
97
+ pass
98
+
99
+ # Delete child/content tables first, then registry-ish tables.
100
+ for table_name in (
101
+ "documents",
102
+ "document_trees",
103
+ "chat_memory",
104
+ "ingestion_retry_logs",
105
+ "rerank_feedback",
106
+ "intent_feedback",
107
+ "graph_edges",
108
+ "graph_nodes",
109
+ "ingested_files",
110
+ ):
111
+ _purge(table_name)
112
+
113
+ return {"ok": True, "message": "Guest workspace cleared."}
backend/api/frontend_config.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import APIRouter
2
  from backend.core import config
3
 
4
  router = APIRouter()
@@ -9,7 +9,13 @@ def get_frontend_config():
9
  Returns public config values the frontend needs.
10
  Only exposes the anon key (safe by design) — never the service key.
11
  """
 
 
 
 
 
12
  return {
13
  "supabase_url": config.SUPABASE_URL,
14
  "supabase_anon": config.SUPABASE_ANON_KEY,
15
- }
 
 
1
+ from fastapi import APIRouter, HTTPException
2
  from backend.core import config
3
 
4
  router = APIRouter()
 
9
  Returns public config values the frontend needs.
10
  Only exposes the anon key (safe by design) — never the service key.
11
  """
12
+ if not config.SUPABASE_URL or not config.SUPABASE_ANON_KEY:
13
+ raise HTTPException(
14
+ status_code=503,
15
+ detail="Supabase frontend config is missing on the server.",
16
+ )
17
  return {
18
  "supabase_url": config.SUPABASE_URL,
19
  "supabase_anon": config.SUPABASE_ANON_KEY,
20
+ "guest_enabled": config.GUEST_MODE_ENABLED,
21
+ }
backend/api/ingest.py CHANGED
@@ -1,8 +1,10 @@
1
  import os
2
  import tempfile
3
  import logging
4
- from fastapi import APIRouter, UploadFile, File, HTTPException, Header, Depends
5
- from backend.core.auth_utils import require_auth_token
 
 
6
  from backend.core.tasks import process_pdf_task
7
  from backend.core.tasks import celery_app
8
 
@@ -10,15 +12,39 @@ log = logging.getLogger("morpheus.api.ingest")
10
  router = APIRouter()
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  @router.post("/upload")
 
14
  async def upload(
 
15
  file: UploadFile = File(...),
16
  user_id: str = Depends(require_auth_token),
17
  x_auth_token: str = Header(None, alias="X-Auth-Token"),
18
  ):
 
19
  if not file.filename.lower().endswith(".pdf"):
20
  raise HTTPException(status_code=400, detail="Only PDF files are supported.")
21
 
 
 
22
  # NEW: Secure file signature validation using python-magic
23
  import magic
24
 
@@ -33,6 +59,8 @@ async def upload(
33
  )
34
 
35
  # ── Per-user document limit ───────────────────────────────────────────────
 
 
36
  try:
37
  from backend.core.pipeline import _build_supabase_client
38
 
@@ -43,20 +71,33 @@ async def upload(
43
  .eq("user_id", user_id)
44
  .execute()
45
  )
46
- if (result.count or 0) >= 50:
47
  raise HTTPException(
48
- status_code=429, detail="Document limit reached (50 max)."
 
49
  )
50
  except HTTPException:
51
  raise
52
- except Exception:
53
- pass # don't block upload if count check fails
 
 
 
 
 
 
54
 
55
  # Safely save to disk as before
56
  tmp_fd, tmp_path = tempfile.mkstemp(suffix=f"_{file.filename}")
57
  os.close(tmp_fd) # close fd immediately, manage file separately
58
  try:
59
  contents = await file.read()
 
 
 
 
 
 
60
  with open(tmp_path, "wb") as f:
61
  f.write(contents)
62
  task = process_pdf_task.delay(tmp_path, file.filename, x_auth_token)
@@ -65,18 +106,19 @@ async def upload(
65
  "task_id": task.id,
66
  "filename": file.filename,
67
  }
 
 
 
68
  except Exception as e:
69
- log.error("Failed to queue file: %s", e)
70
- try:
71
- os.unlink(tmp_path)
72
- except OSError:
73
- pass
74
- raise HTTPException(status_code=500, detail="Failed to queue file.")
75
 
76
 
77
  # NEW ROUTE: The frontend will poll this every 2 seconds
78
  @router.get("/status/{task_id}")
79
  def get_ingest_status(task_id: str):
 
80
  task_result = celery_app.AsyncResult(task_id)
81
 
82
  if task_result.state == "PENDING":
 
1
  import os
2
  import tempfile
3
  import logging
4
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Header, Depends, Request
5
+ from backend.core import config
6
+ from backend.core.auth_utils import is_guest_token, require_auth_token
7
+ from backend.core.rate_limit import limiter
8
  from backend.core.tasks import process_pdf_task
9
  from backend.core.tasks import celery_app
10
 
 
12
  router = APIRouter()
13
 
14
 
15
+ def _cleanup_temp_upload(tmp_path: str) -> None:
16
+ if not tmp_path:
17
+ return
18
+ try:
19
+ os.unlink(tmp_path)
20
+ except FileNotFoundError:
21
+ return
22
+ except OSError as exc:
23
+ log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
24
+
25
+
26
+ def _ensure_ingest_worker_available() -> None:
27
+ if celery_app is None or not hasattr(process_pdf_task, "delay"):
28
+ raise HTTPException(
29
+ status_code=503,
30
+ detail="Background ingestion worker is unavailable.",
31
+ )
32
+
33
+
34
  @router.post("/upload")
35
+ @limiter.limit("12/hour")
36
  async def upload(
37
+ request: Request,
38
  file: UploadFile = File(...),
39
  user_id: str = Depends(require_auth_token),
40
  x_auth_token: str = Header(None, alias="X-Auth-Token"),
41
  ):
42
+ del request
43
  if not file.filename.lower().endswith(".pdf"):
44
  raise HTTPException(status_code=400, detail="Only PDF files are supported.")
45
 
46
+ guest_workspace = is_guest_token(x_auth_token)
47
+
48
  # NEW: Secure file signature validation using python-magic
49
  import magic
50
 
 
59
  )
60
 
61
  # ── Per-user document limit ───────────────────────────────────────────────
62
+ doc_limit = config.GUEST_MAX_DOCS if guest_workspace else config.MAX_DOCS_PER_USER
63
+
64
  try:
65
  from backend.core.pipeline import _build_supabase_client
66
 
 
71
  .eq("user_id", user_id)
72
  .execute()
73
  )
74
+ if (result.count or 0) >= doc_limit:
75
  raise HTTPException(
76
+ status_code=429,
77
+ detail=f"Document limit reached ({doc_limit} max).",
78
  )
79
  except HTTPException:
80
  raise
81
+ except Exception as exc:
82
+ log.error("Upload limit check failed for user %s: %s", user_id, exc)
83
+ raise HTTPException(
84
+ status_code=503,
85
+ detail="Could not verify upload limits right now. Please try again.",
86
+ ) from exc
87
+
88
+ _ensure_ingest_worker_available()
89
 
90
  # Safely save to disk as before
91
  tmp_fd, tmp_path = tempfile.mkstemp(suffix=f"_{file.filename}")
92
  os.close(tmp_fd) # close fd immediately, manage file separately
93
  try:
94
  contents = await file.read()
95
+ max_upload_mb = config.GUEST_MAX_UPLOAD_MB if guest_workspace else config.MAX_UPLOAD_MB
96
+ if len(contents) > max_upload_mb * 1024 * 1024:
97
+ raise HTTPException(
98
+ status_code=413,
99
+ detail=f"File too large ({max_upload_mb} MB max).",
100
+ )
101
  with open(tmp_path, "wb") as f:
102
  f.write(contents)
103
  task = process_pdf_task.delay(tmp_path, file.filename, x_auth_token)
 
106
  "task_id": task.id,
107
  "filename": file.filename,
108
  }
109
+ except HTTPException:
110
+ _cleanup_temp_upload(tmp_path)
111
+ raise
112
  except Exception as e:
113
+ log.exception("Failed to queue file: %s", e)
114
+ _cleanup_temp_upload(tmp_path)
115
+ raise HTTPException(status_code=500, detail="Failed to queue file.") from e
 
 
 
116
 
117
 
118
  # NEW ROUTE: The frontend will poll this every 2 seconds
119
  @router.get("/status/{task_id}")
120
  def get_ingest_status(task_id: str):
121
+ _ensure_ingest_worker_available()
122
  task_result = celery_app.AsyncResult(task_id)
123
 
124
  if task_result.state == "PENDING":
backend/api/query.py CHANGED
@@ -2,21 +2,58 @@
2
  import json
3
  import logging
4
  import asyncio
5
- from fastapi import APIRouter, Header, Depends, Request
6
  from fastapi.responses import StreamingResponse
7
- from shared.types import QueryRequest, SourceChunk
8
  from backend.core.pipeline import (
9
  retrieve_chunks_routed,
10
  generate_answer_stream,
11
  analyse_intent,
 
 
12
  )
13
  from backend.core.auth_utils import require_auth_token
14
- from backend.main import limiter
15
 
16
  log = logging.getLogger("morpheus.api.query")
17
  router = APIRouter()
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def _normalise_original_content(raw):
21
  """Best-effort decode for metadata that may already be dict or JSON string."""
22
  if isinstance(raw, dict):
@@ -91,14 +128,68 @@ async def query(
91
  user_id = user_id,
92
  )
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  if not intent.get("is_clear"):
95
  # Stream clarification question as a normal assistant message
96
  # User answers it → next turn history resolves the subject
97
  question = intent.get("clarification_question", "Could you clarify?")
98
  yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
99
- yield "data: " + json.dumps({"type": "done", "sources": [], "images": []}) + "\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  return
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  # ── Step 2: Retrieve using enriched query ─────────────────────────
103
  # enriched_query has better embedding signal (category/history injected)
104
  # but we answer with the ORIGINAL query so the response sounds natural
@@ -117,12 +208,15 @@ async def query(
117
  user_id=user_id,
118
  original_query=req.query,
119
  eval_mode=(x_eval_mode == "true"),
 
120
  ),
121
  )
122
 
123
  # ── Step 3: Stream answer tokens ──────────────────────────────────
124
  images = []
125
  done_sources = []
 
 
126
  # 🚀 Define the boolean once for readability
127
  is_eval = x_eval_mode == "true"
128
  async for event in generate_answer_stream(
@@ -133,12 +227,15 @@ async def query(
133
  access_token=x_auth_token,
134
  category=category,
135
  eval_mode=is_eval,
 
136
  ):
137
  if event["type"] == "token":
138
  yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
139
  elif event["type"] == "done":
140
  images = event.get("images", [])
141
  done_sources = event.get("sources", []) or []
 
 
142
 
143
  # ── Step 4: Emit sources + images ─────────────────────────────────
144
  sources = done_sources or _build_sources_from_chunks(
@@ -149,6 +246,8 @@ async def query(
149
  "type": "done",
150
  "sources": sources,
151
  "images": images,
 
 
152
  }) + "\n\n"
153
 
154
  except Exception as e:
@@ -178,3 +277,16 @@ async def query(
178
  "Access-Control-Allow-Origin": "*",
179
  }
180
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import json
3
  import logging
4
  import asyncio
5
+ from fastapi import APIRouter, Header, Depends, Request, HTTPException
6
  from fastapi.responses import StreamingResponse
7
+ from shared.types import AnswerFeedback, QueryRequest, SourceChunk
8
  from backend.core.pipeline import (
9
  retrieve_chunks_routed,
10
  generate_answer_stream,
11
  analyse_intent,
12
+ check_query_ambiguity,
13
+ record_answer_feedback,
14
  )
15
  from backend.core.auth_utils import require_auth_token
16
+ from backend.core.rate_limit import limiter
17
 
18
  log = logging.getLogger("morpheus.api.query")
19
  router = APIRouter()
20
 
21
 
22
+ def _contains_ordinal_followup(query: str) -> bool:
23
+ q = (query or "").strip().lower()
24
+ if not q:
25
+ return False
26
+ return any(
27
+ phrase in q
28
+ for phrase in (
29
+ "the second one",
30
+ "the first one",
31
+ "the other one",
32
+ "second one",
33
+ "first one",
34
+ "other one",
35
+ )
36
+ )
37
+
38
+
39
+ def _history_has_explicit_enumeration(history: list[dict]) -> bool:
40
+ """
41
+ Heuristic: if the last assistant message contains an explicit list, then
42
+ ordinal follow-ups (\"second one\") can be resolved. Otherwise, ask.
43
+ """
44
+ for msg in reversed(history or []):
45
+ if (msg.get("role") or "").lower() != "assistant":
46
+ continue
47
+ content = str(msg.get("content") or "")
48
+ if not content.strip():
49
+ return False
50
+ # Common enumeration patterns (numbers, bullets).
51
+ if any(token in content for token in ("\n1.", "\n2.", "\n- ", "\n• ")):
52
+ return True
53
+ return False
54
+ return False
55
+
56
+
57
  def _normalise_original_content(raw):
58
  """Best-effort decode for metadata that may already be dict or JSON string."""
59
  if isinstance(raw, dict):
 
128
  user_id = user_id,
129
  )
130
 
131
+ if intent.get("route_class") == "no_retrieval":
132
+ yield "data: " + json.dumps({
133
+ "type": "token",
134
+ "content": "Ask me about your uploaded documents or a topic inside them, and I’ll dig in.",
135
+ }) + "\n\n"
136
+ yield "data: " + json.dumps({
137
+ "type": "done",
138
+ "sources": [],
139
+ "images": [],
140
+ "trace_id": None,
141
+ "doc_diagnostics": [],
142
+ }) + "\n\n"
143
+ return
144
+
145
  if not intent.get("is_clear"):
146
  # Stream clarification question as a normal assistant message
147
  # User answers it → next turn history resolves the subject
148
  question = intent.get("clarification_question", "Could you clarify?")
149
  yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
150
+ yield "data: " + json.dumps({"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}) + "\n\n"
151
+ return
152
+
153
+ # Guardrail: ordinal follow-ups without an explicit referent should not guess.
154
+ if (
155
+ intent.get("route_class") == "follow_up"
156
+ and _contains_ordinal_followup(req.query)
157
+ and not _history_has_explicit_enumeration(history)
158
+ ):
159
+ yield "data: " + json.dumps(
160
+ {
161
+ "type": "token",
162
+ "content": "Second one of what? Please reference the items you mean (e.g., paste the list or restate the names).",
163
+ }
164
+ ) + "\n\n"
165
+ yield "data: " + json.dumps(
166
+ {"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}
167
+ ) + "\n\n"
168
  return
169
 
170
+ # ── Step 1.5: Phase 2 Ambiguity Detection ────────────────────────
171
+ # If no manual pin is active, check if the query is too ambiguous
172
+ if not req.priority_file_hashes:
173
+ ambiguity_res = check_query_ambiguity(
174
+ req.query,
175
+ access_token=x_auth_token,
176
+ category=req.category,
177
+ )
178
+ if ambiguity_res.get("is_ambiguous"):
179
+ question = ambiguity_res.get("clarification_question", "Which document do you mean?")
180
+ # Use a distinct identifier so the frontend understands it's a structural prompt
181
+ yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
182
+
183
+ options = ambiguity_res.get("clarification_options")
184
+ if options:
185
+ yield "data: " + json.dumps({"type": "clarification_options", "options": options}) + "\n\n"
186
+
187
+ yield "data: " + json.dumps({"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}) + "\n\n"
188
+ return
189
+ if ambiguity_res.get("top_file_hash") and not ambiguity_res.get("is_ambiguous"):
190
+ req.priority_file_hashes = [ambiguity_res["top_file_hash"]]
191
+ log.info("Auto-pinned file hash: %s", ambiguity_res["top_file_hash"])
192
+
193
  # ── Step 2: Retrieve using enriched query ─────────────────────────
194
  # enriched_query has better embedding signal (category/history injected)
195
  # but we answer with the ORIGINAL query so the response sounds natural
 
208
  user_id=user_id,
209
  original_query=req.query,
210
  eval_mode=(x_eval_mode == "true"),
211
+ priority_file_hashes=req.priority_file_hashes or None,
212
  ),
213
  )
214
 
215
  # ── Step 3: Stream answer tokens ──────────────────────────────────
216
  images = []
217
  done_sources = []
218
+ trace_id = None
219
+ doc_diagnostics = []
220
  # 🚀 Define the boolean once for readability
221
  is_eval = x_eval_mode == "true"
222
  async for event in generate_answer_stream(
 
227
  access_token=x_auth_token,
228
  category=category,
229
  eval_mode=is_eval,
230
+ priority_file_hashes=req.priority_file_hashes or None,
231
  ):
232
  if event["type"] == "token":
233
  yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
234
  elif event["type"] == "done":
235
  images = event.get("images", [])
236
  done_sources = event.get("sources", []) or []
237
+ trace_id = event.get("trace_id")
238
+ doc_diagnostics = event.get("doc_diagnostics", []) or []
239
 
240
  # ── Step 4: Emit sources + images ─────────────────────────────────
241
  sources = done_sources or _build_sources_from_chunks(
 
246
  "type": "done",
247
  "sources": sources,
248
  "images": images,
249
+ "trace_id": trace_id,
250
+ "doc_diagnostics": doc_diagnostics,
251
  }) + "\n\n"
252
 
253
  except Exception as e:
 
277
  "Access-Control-Allow-Origin": "*",
278
  }
279
  )
280
+
281
+
282
@router.post("/feedback")
async def submit_feedback(
    payload: AnswerFeedback,
    user_id: str = Depends(require_auth_token),
    x_auth_token: str = Header(None, alias="X-Auth-Token"),
):
    """
    Persist user feedback for a generated answer.

    Auth is enforced by the `require_auth_token` dependency before the body
    runs; the resolved user id itself is not needed here, so it is discarded.

    Raises:
        HTTPException: 500 when `record_answer_feedback` reports failure.
    """
    del user_id  # dependency already enforced auth; value unused
    # NOTE(review): `payload.dict()` is the Pydantic v1 API — confirm the
    # project pins v1 (v2 renamed this to `model_dump()`).
    ok = record_answer_feedback(payload.dict(), access_token=x_auth_token)
    if not ok:
        raise HTTPException(status_code=500, detail="Could not record answer feedback.")
    return {"ok": True}
backend/core/auth_utils.py CHANGED
@@ -12,7 +12,7 @@ TASK 1 — Auth Bridge:
12
 
13
  import jwt
14
  import logging
15
- from typing import Optional
16
  from backend.core import config
17
  from fastapi import Header, HTTPException, status
18
 
@@ -22,6 +22,45 @@ from fastapi import Header, HTTPException, status
22
  log = logging.getLogger("morpheus.auth")
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def extract_jwt_sub(access_token: str) -> str:
26
  """
27
  Extract the Supabase user id (JWT `sub`) while strictly verifying the signature.
 
12
 
13
  import jwt
14
  import logging
15
+ from typing import Any, Optional
16
  from backend.core import config
17
  from fastapi import Header, HTTPException, status
18
 
 
22
  log = logging.getLogger("morpheus.auth")
23
 
24
 
25
def _decode_unverified_claims(access_token: Optional[str]) -> dict[str, Any]:
    """Peek at JWT claims without verifying the signature for non-security decisions."""
    if not access_token:
        return {}
    try:
        decoded = jwt.decode(
            access_token,
            options={
                "verify_signature": False,
                "verify_exp": False,
                "verify_aud": False,
            },
            algorithms=["ES256", "HS256", "RS256"],
        )
    except Exception:
        # Malformed/undecodable tokens simply yield no claims.
        return {}
    return decoded if isinstance(decoded, dict) else {}
42
+
43
+
44
def is_guest_token(access_token: Optional[str]) -> bool:
    """
    Supabase anonymous users still get real JWTs.
    We treat them as guest workspaces for UI/limits/rate-limiting.
    """
    claims = _decode_unverified_claims(access_token)
    if not claims:
        return False

    app_meta = claims.get("app_metadata") or {}
    provider = str(app_meta.get("provider") or "").strip().lower()
    providers = app_meta.get("providers") or []

    if claims.get("is_anonymous") or app_meta.get("is_anonymous"):
        return True
    return provider == "anonymous" or "anonymous" in providers
62
+
63
+
64
  def extract_jwt_sub(access_token: str) -> str:
65
  """
66
  Extract the Supabase user id (JWT `sub`) while strictly verifying the signature.
backend/core/classifier.py CHANGED
@@ -167,8 +167,9 @@ class CentroidStore:
167
  self._access_token = access_token
168
  self._user_id = None
169
  if access_token:
170
- from backend.core.auth_utils import extract_jwt_sub
171
- self._user_id = extract_jwt_sub(access_token)
 
172
  self._cache: Dict[str, Dict] = {}
173
  self._lock = threading.Lock()
174
  self._client = None
@@ -176,23 +177,17 @@ class CentroidStore:
176
 
177
  def _get_client(self):
178
  if self._client is None:
179
- # Tenant-scoped client (anon + access token) is required for RLS isolation.
180
- if self._access_token:
181
- if not config.SUPABASE_ANON_KEY:
182
- raise RuntimeError("SUPABASE_ANON_KEY is not set but access_token was provided.")
183
- self._client = create_client(
184
- config.SUPABASE_URL,
185
- config.SUPABASE_ANON_KEY,
186
- )
187
- self._client.postgrest.auth(self._access_token)
188
- else:
189
- # Admin / legacy fallback (bypasses RLS via service role).
190
- self._client = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
191
  return self._client
192
 
193
  def _load_from_db(self):
194
  try:
195
- result = self._get_client().table(self.TABLE).select("*").execute()
 
 
 
196
  for row in (result.data or []):
197
  self._cache[row["document_type"]] = {
198
  "vector": np.array(row["centroid_vector"], dtype=np.float32),
 
167
  self._access_token = access_token
168
  self._user_id = None
169
  if access_token:
170
+ from backend.core.auth_utils import safe_extract_jwt_sub
171
+
172
+ self._user_id = safe_extract_jwt_sub(access_token)
173
  self._cache: Dict[str, Dict] = {}
174
  self._lock = threading.Lock()
175
  self._client = None
 
177
 
178
def _get_client(self):
    """Lazily build and cache the service-role Supabase client."""
    if self._client is not None:
        return self._client
    # Backend-owned access model: always use the service-role client and
    # scope rows explicitly by user_id where applicable.
    self._client = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
    return self._client
184
 
185
  def _load_from_db(self):
186
  try:
187
+ query = self._get_client().table(self.TABLE).select("*")
188
+ if self._user_id:
189
+ query = query.eq("user_id", self._user_id)
190
+ result = query.execute()
191
  for row in (result.data or []):
192
  self._cache[row["document_type"]] = {
193
  "vector": np.array(row["centroid_vector"], dtype=np.float32),
backend/core/config.py CHANGED
@@ -19,6 +19,15 @@ SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
19
  SUPABASE_JWT_SECRET = os.getenv("SUPABASE_JWT_SECRET")
20
  VECTOR_TABLE_NAME = "documents"
21
  IMAGE_STORAGE_BUCKET = "rag-images"
 
 
 
 
 
 
 
 
 
22
 
23
  # ==================== API KEYS ====================
24
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
@@ -37,9 +46,19 @@ OLLAMA_MODELS = ["llama3.2", "mistral"]
37
  EMBEDDING_MODEL = "nvidia/llama-nemotron-embed-vl-1b-v2:free"
38
  EMBEDDING_DIMENSIONS = 2048
39
  EMBEDDING_DEVICE = "cuda"
 
 
 
 
 
 
40
  EMBEDDING_MODELS = [
41
- "nvidia/llama-nemotron-embed-vl-1b-v2:free",
42
- "text-embedding-3-small", # OpenRouter fallback
 
 
 
 
43
  ]
44
 
45
  # ==================== GROQ MODELS ====================
@@ -119,6 +138,17 @@ UPLOAD_RETRY_MAX_ATTEMPTS = int(os.getenv("UPLOAD_RETRY_MAX_ATTEMPTS", "4"))
119
  UPLOAD_RETRY_BASE_SLEEP_S = float(os.getenv("UPLOAD_RETRY_BASE_SLEEP_S", "2"))
120
  UPLOAD_RETRY_MAX_SLEEP_S = float(os.getenv("UPLOAD_RETRY_MAX_SLEEP_S", "20"))
121
 
 
 
 
 
 
 
 
 
 
 
 
122
  # ==================== RETRIEVAL ====================
123
  CHAT_MEMORY_TURNS = 3
124
  EMBEDDING_CACHE_SIZE = 256
@@ -127,6 +157,26 @@ RELEVANCE_THRESHOLD = 0.35
127
  LLM_MAX_TOKENS = 4096
128
  MAX_CONTEXT_CHARS = 14000
129
  CATEGORY_SLOTS = 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  # ==================== LOGGING ====================
132
  LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
 
19
  SUPABASE_JWT_SECRET = os.getenv("SUPABASE_JWT_SECRET")
20
  VECTOR_TABLE_NAME = "documents"
21
  IMAGE_STORAGE_BUCKET = "rag-images"
22
+ GUEST_MODE_ENABLED = os.getenv("GUEST_MODE_ENABLED", "true").lower() in {
23
+ "1",
24
+ "true",
25
+ "yes",
26
+ }
27
+ MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "25"))
28
+ GUEST_MAX_UPLOAD_MB = int(os.getenv("GUEST_MAX_UPLOAD_MB", "10"))
29
+ MAX_DOCS_PER_USER = int(os.getenv("MAX_DOCS_PER_USER", "50"))
30
+ GUEST_MAX_DOCS = int(os.getenv("GUEST_MAX_DOCS", "10"))
31
 
32
  # ==================== API KEYS ====================
33
  OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 
46
  EMBEDDING_MODEL = "nvidia/llama-nemotron-embed-vl-1b-v2:free"
47
  EMBEDDING_DIMENSIONS = 2048
48
  EMBEDDING_DEVICE = "cuda"
49
+ RETRIEVAL_EMBEDDING_VARIANT = os.getenv(
50
+ "RETRIEVAL_EMBEDDING_VARIANT", "control"
51
+ ).strip().lower()
52
+ RETRIEVAL_EMBEDDING_MODEL_OVERRIDE = os.getenv(
53
+ "RETRIEVAL_EMBEDDING_MODEL_OVERRIDE", ""
54
+ ).strip()
55
  EMBEDDING_MODELS = [
56
+ model
57
+ for model in [
58
+ RETRIEVAL_EMBEDDING_MODEL_OVERRIDE or EMBEDDING_MODEL,
59
+ EMBEDDING_MODEL if RETRIEVAL_EMBEDDING_MODEL_OVERRIDE else "",
60
+ ]
61
+ if model
62
  ]
63
 
64
  # ==================== GROQ MODELS ====================
 
138
  UPLOAD_RETRY_BASE_SLEEP_S = float(os.getenv("UPLOAD_RETRY_BASE_SLEEP_S", "2"))
139
  UPLOAD_RETRY_MAX_SLEEP_S = float(os.getenv("UPLOAD_RETRY_MAX_SLEEP_S", "20"))
140
 
141
+ # ==================== CELERY / REDIS ====================
142
+ CELERY_VISIBILITY_TIMEOUT_S = int(os.getenv("CELERY_VISIBILITY_TIMEOUT_S", "7200"))
143
+ CELERY_BROKER_HEARTBEAT_S = int(os.getenv("CELERY_BROKER_HEARTBEAT_S", "30"))
144
+ CELERY_BROKER_POOL_LIMIT = int(os.getenv("CELERY_BROKER_POOL_LIMIT", "1"))
145
+ CELERY_REDIS_SOCKET_TIMEOUT_S = float(
146
+ os.getenv("CELERY_REDIS_SOCKET_TIMEOUT_S", "30")
147
+ )
148
+ CELERY_REDIS_HEALTH_CHECK_INTERVAL_S = int(
149
+ os.getenv("CELERY_REDIS_HEALTH_CHECK_INTERVAL_S", "30")
150
+ )
151
+
152
  # ==================== RETRIEVAL ====================
153
  CHAT_MEMORY_TURNS = 3
154
  EMBEDDING_CACHE_SIZE = 256
 
157
  LLM_MAX_TOKENS = 4096
158
  MAX_CONTEXT_CHARS = 14000
159
  CATEGORY_SLOTS = 2
160
+ ENABLE_STRICT_OUTPUT_SANITIZER = os.getenv(
161
+ "ENABLE_STRICT_OUTPUT_SANITIZER", "true"
162
+ ).lower() in {"1", "true", "yes"}
163
+ ENABLE_DUPLICATE_CHUNK_COLLAPSE = os.getenv(
164
+ "ENABLE_DUPLICATE_CHUNK_COLLAPSE", "true"
165
+ ).lower() in {"1", "true", "yes"}
166
+ ENABLE_HYDE = os.getenv("ENABLE_HYDE", "false").lower() in {"1", "true", "yes"}
167
+ ENABLE_RETRIEVE_THEN_STUFF = os.getenv(
168
+ "ENABLE_RETRIEVE_THEN_STUFF", "true"
169
+ ).lower() in {"1", "true", "yes"}
170
+ ENABLE_CONTEXTUAL_CHUNKING = os.getenv(
171
+ "ENABLE_CONTEXTUAL_CHUNKING", "false"
172
+ ).lower() in {"1", "true", "yes"}
173
+ FOLLOWUP_SESSION_TTL_S = int(os.getenv("FOLLOWUP_SESSION_TTL_S", "1800"))
174
+ HISTORY_RECENT_TURNS = int(os.getenv("HISTORY_RECENT_TURNS", "3"))
175
+ HISTORY_IMPORTANT_MAX = int(os.getenv("HISTORY_IMPORTANT_MAX", "6"))
176
+ RETRIEVE_THEN_STUFF_K = int(os.getenv("RETRIEVE_THEN_STUFF_K", "12"))
177
+ RETRIEVE_THEN_STUFF_FETCH_K = int(
178
+ os.getenv("RETRIEVE_THEN_STUFF_FETCH_K", "20")
179
+ )
180
 
181
  # ==================== LOGGING ====================
182
  LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
backend/core/pipeline.py CHANGED
The diff for this file is too large to render. See raw diff
 
backend/core/pipeline_ambiguity.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ambiguity / scope safety logic.
3
+
4
+ Extracted from `backend/core/pipeline.py` to isolate multi-doc clarification
5
+ rules and reduce coupling with retrieval/generation.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+
12
+ log = logging.getLogger("rag_pipeline")
13
+
14
+
15
def check_query_ambiguity(
    query: str,
    access_token: str = None,
    category: str = None,
) -> dict:
    """
    Decide whether *query* clearly targets one of the user's documents.

    Returns a dict with keys:
      - is_ambiguous: True when the user should be asked to pick a document.
      - clarification_question: question text to stream, or None.
      - clarification_options: up to 3 pickable documents, or None.
      - top_file_hash: file hash to auto-pin when unambiguous, or None.
    """
    # Deferred import: facade still owns several helpers during the
    # de-monolith refactor; importing at module load would create a cycle.
    from backend.core import pipeline as pipeline_facade

    # Tuning knobs: required score gap between top-2 documents, minimum
    # absolute top score, and the word count above which a query counts
    # as "specific enough" to skip the per-document scoring pass.
    AMBIGUITY_GAP = 0.12
    MIN_MATCH_SCORE = 0.05
    MIN_WORDS_FOR_SPECIFICITY = 10

    words = query.strip().split()
    if len(words) > MIN_WORDS_FOR_SPECIFICITY and not pipeline_facade._is_generic_ambiguous_query(query):
        # Still check if category resolves to a single file — if so, auto-pin it
        try:
            supabase = pipeline_facade._build_supabase_client(access_token)
            user_id = None
            if access_token:
                from backend.core.auth_utils import safe_extract_jwt_sub

                user_id = safe_extract_jwt_sub(access_token)
            files_q = supabase.table("ingested_files").select("file_hash, filename")
            if user_id:
                files_q = files_q.eq("user_id", user_id)
            if category and category != "All":
                files_q = files_q.eq("document_type", category)
            files_resp = files_q.execute()
            files = files_resp.data or []
            if len(files) == 1:
                single_hash = files[0]["file_hash"]
            else:
                single_hash = None

            # Identity-style queries ("who is X") across multiple documents
            # always require the user to pick; we never guess the person.
            if len(files) > 1 and pipeline_facade._query_requires_identity_lookup(query):
                # Sort by display name (fall back to hash) for stable options.
                top_files = sorted(
                    (
                        (str(f.get("file_hash") or "").strip(), str(f.get("filename") or "").strip())
                        for f in files
                    ),
                    key=lambda x: (x[1] or x[0]),
                )
                top_files = [(h, n) for h, n in top_files if h][:3]
                options = [
                    {
                        "mode": "single",
                        "label": (name or fhash).replace(".pdf", ""),
                        "file_hash": fhash,
                    }
                    for fhash, name in top_files
                ]
                return {
                    "is_ambiguous": True,
                    "clarification_question": "Which document do you mean? Please pick one.",
                    "clarification_options": options,
                    "top_file_hash": None,
                }
        except Exception:
            # Best-effort: any DB/auth failure simply disables auto-pinning.
            single_hash = None
        return {
            "is_ambiguous": False,
            "clarification_question": None,
            "clarification_options": None,
            "top_file_hash": single_hash,
        }

    try:
        supabase = pipeline_facade._build_supabase_client(access_token)
        user_id = None
        if access_token:
            from backend.core.auth_utils import safe_extract_jwt_sub

            user_id = safe_extract_jwt_sub(access_token)

        files_q = supabase.table("ingested_files").select("file_hash, filename")
        if user_id:
            files_q = files_q.eq("user_id", user_id)
        if category and category != "All":
            files_q = files_q.eq("document_type", category)
        files_resp = files_q.execute()
        files = files_resp.data or []
        # No corpus → nothing to disambiguate; one file → auto-pin it.
        if len(files) == 0:
            return {
                "is_ambiguous": False,
                "clarification_question": None,
                "clarification_options": None,
                "top_file_hash": None,
            }
        if len(files) == 1:
            return {
                "is_ambiguous": False,
                "clarification_question": None,
                "clarification_options": None,
                "top_file_hash": files[0]["file_hash"],
            }

        # Identity lookups across multiple files: always ask, never guess.
        if pipeline_facade._query_requires_identity_lookup(query):
            top_files = sorted(
                (
                    (str(f.get("file_hash") or "").strip(), str(f.get("filename") or "").strip())
                    for f in files
                ),
                key=lambda x: (x[1] or x[0]),
            )
            top_files = [(h, n) for h, n in top_files if h][:3]
            options = [
                {
                    "mode": "single",
                    "label": (name or fhash).replace(".pdf", ""),
                    "file_hash": fhash,
                }
                for fhash, name in top_files
            ]
            return {
                "is_ambiguous": True,
                "clarification_question": "Which document do you mean? Please pick one.",
                "clarification_options": options,
                "top_file_hash": None,
            }

        # Score each document by its single best hybrid-search hit for the
        # query, then compare the top two scores to decide ambiguity.
        query_vec = pipeline_facade.get_cached_embedding(query)
        file_scores: list[tuple[str, str, float]] = []  # (file_hash, label, best_score)

        for f in files:
            fhash = f.get("file_hash")
            fname = (f.get("filename") or fhash or "Untitled").strip()
            if not fhash:
                continue
            try:
                resp = supabase.rpc(
                    "hybrid_search",
                    {
                        "query_text": query,
                        "query_embedding": query_vec,
                        "match_count": 1,
                        "filter": {"file_hash": fhash},
                        "semantic_weight": 0.7,
                        "keyword_weight": 0.3,
                        "p_user_id": user_id,
                    },
                ).execute()
                rows = resp.data or []
                if rows:
                    score = float(rows[0].get("combined_score", 0.0))
                    file_scores.append((fhash, fname, score))
            except Exception as exc:
                # Per-file RPC failure is non-fatal; that file just gets no score.
                log.warning("Ambiguity check RPC error for %s: %s", str(fhash)[:8], exc)

        # Fewer than two scored files → cannot compare, treat as unambiguous
        # (but without an auto-pin, since we lack a trustworthy winner).
        if len(file_scores) < 2:
            return {
                "is_ambiguous": False,
                "clarification_question": None,
                "clarification_options": None,
                "top_file_hash": None,
            }

        file_scores.sort(key=lambda x: x[2], reverse=True)
        top_hash, top_name, top_score = file_scores[0]
        second_hash, second_name, second_score = file_scores[1]
        gap = top_score - second_score
        generic = pipeline_facade._is_generic_ambiguous_query(query)

        log.info(
            "Ambiguity check: top=%r (%.3f), 2nd=%r (%.3f), gap=%.3f, generic=%s, category=%r",
            top_name,
            top_score,
            second_name,
            second_score,
            gap,
            generic,
            category,
        )

        # Clear winner with a real score and a non-generic query → auto-pin.
        if gap >= AMBIGUITY_GAP and top_score >= MIN_MATCH_SCORE and not generic:
            return {
                "is_ambiguous": False,
                "clarification_question": None,
                "clarification_options": None,
                "top_file_hash": top_hash,
            }

        # Otherwise offer the top three candidates as clarification options.
        options = []
        for fhash, fname, score in file_scores[:3]:
            options.append(
                {
                    "mode": "single",
                    "label": (fname or fhash).replace(".pdf", ""),
                    "file_hash": fhash,
                    "score": round(float(score), 4),
                }
            )

        return {
            "is_ambiguous": True,
            "clarification_question": "Which document do you mean? Please pick one.",
            "clarification_options": options,
            "top_file_hash": None,
        }

    except Exception as e:
        # Fail open: ambiguity detection is advisory, never a hard gate.
        log.warning("Ambiguity detector failed: %s", e)
        return {
            "is_ambiguous": False,
            "clarification_question": None,
            "clarification_options": None,
            "top_file_hash": None,
        }
221
+
backend/core/pipeline_generation.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generation / streaming facade functions.
3
+
4
+ The implementation lives in `pipeline.py` during migration; this module makes
5
+ generation a distinct unit for debugging and future refactors.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, AsyncGenerator, List, Optional, Tuple
11
+
12
+ from langchain_core.documents import Document
13
+
14
+
15
def generate_answer(
    chunks: List[Document],
    query: str,
    chat_history: Optional[List[dict]] = None,
    past_memories: Optional[List[dict]] = None,
) -> Tuple[str, List[str]]:
    """Synchronous answer generation — thin facade over the pipeline impl."""
    # Deferred import avoids a circular dependency at module load time.
    from backend.core import pipeline as _pipeline

    impl = _pipeline._generate_answer_impl
    return impl(
        chunks=chunks,
        query=query,
        chat_history=chat_history,
        past_memories=past_memories,
    )
29
+
30
+
31
async def generate_answer_stream(
    chunks: List[Document],
    query: str,
    chat_history: Optional[List[dict]] = None,
    session_id: str = "default_session",
    access_token: str = None,
    category: str = None,
    eval_mode: bool = False,
    priority_file_hashes: List[str] = None,
) -> AsyncGenerator[dict, None]:
    """Streaming answer generation — re-yields events from the pipeline impl."""
    # Deferred import avoids a circular dependency at module load time.
    from backend.core import pipeline as _pipeline

    stream = _pipeline._generate_answer_stream_impl(
        chunks=chunks,
        query=query,
        chat_history=chat_history,
        session_id=session_id,
        access_token=access_token,
        category=category,
        eval_mode=eval_mode,
        priority_file_hashes=priority_file_hashes,
    )
    async for event in stream:
        yield event
54
+
backend/core/pipeline_ingestion.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingestion entrypoints and helpers.
3
+
4
+ This module intentionally keeps imports lightweight where possible and defers
5
+ heavy dependencies to function scope. It is part of the gradual de-monolith
6
+ refactor: `backend/core/pipeline.py` remains a stable facade.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ import logging
14
+ import os
15
+ import time
16
+ from types import SimpleNamespace
17
+ from typing import List, Optional
18
+
19
+ try:
20
+ import fitz
21
+ except Exception: # optional at import time (only used for PDF/image helpers)
22
+ fitz = None
23
+
24
+ from backend.core.cache_manager import invalidate_user_cache
25
+ from backend.core.pipeline_supabase import _build_service_supabase_client, _build_supabase_client
26
+
27
+ log = logging.getLogger("rag_pipeline")
28
+
29
+
30
def get_file_fingerprint(file_path: str) -> str:
    """SHA-256 hash — collision-resistant dedup key."""
    digest = hashlib.sha256()
    # Stream in 64 KiB blocks so large files never load fully into memory.
    with open(file_path, "rb") as handle:
        while block := handle.read(65536):
            digest.update(block)
    return digest.hexdigest()
37
+
38
+
39
def extract_images_from_pdf(file_path: str) -> dict:
    """
    Extract images per page using PyMuPDF, skipping junk images
    (tiny icons and extreme-aspect-ratio banners/logos).

    Returns dict: {page_number: [base64_string, ...]} — pages are 1-indexed
    to match the `page_numbers` metadata used elsewhere in ingestion.
    """
    if fitz is None:
        log.warning("PyMuPDF (fitz) not installed; skipping image extraction.")
        return {}

    import base64  # hoisted out of the per-image loop

    page_images: dict = {}
    doc = None
    try:
        doc = fitz.open(file_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            images = []
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                if not (base_image and base_image.get("image")):
                    continue
                # --- Junk Image Filter ---
                w = base_image.get("width", 0)
                h = base_image.get("height", 0)
                # 1. Skip tiny icons (e.g., smaller than 100x100 pixels)
                if w < 100 or h < 100:
                    continue
                # 2. Skip extreme aspect ratios (e.g., skinny banners/logos)
                aspect_ratio = w / h if h > 0 else 0
                if aspect_ratio > 5.0 or aspect_ratio < 0.2:
                    continue
                images.append(base64.b64encode(base_image["image"]).decode("utf-8"))
            if images:
                page_images[page_num + 1] = images  # 1-indexed to match page_numbers
        log.info("PyMuPDF extracted images from %d pages", len(page_images))
    except Exception as exc:
        # Best-effort: a failed extraction degrades to "no images", not an error.
        log.warning("PyMuPDF image extraction failed: %s", exc)
    finally:
        # Bugfix: previously the document was only closed on the success path,
        # leaking the file handle whenever extraction raised mid-way.
        if doc is not None:
            doc.close()
    return page_images
82
+
83
+
84
def _has_text_layer(pdf_path: str) -> bool:
    """Check if the PDF has native digital text to skip expensive OCR."""
    if fitz is None:
        # Without PyMuPDF we can't cheaply inspect the text layer.
        return False
    try:
        doc = fitz.open(pdf_path)
        try:
            # True as soon as any page exposes non-whitespace text.
            return any(page.get_text().strip() for page in doc)
        finally:
            # Bugfix: the document was never closed before (handle leak on
            # every call, including the early-return path).
            doc.close()
    except Exception:
        # Unreadable/corrupt PDFs are treated as image-only → caller uses OCR.
        return False
97
+
98
+
99
+
100
+ def _extract_element_metrics(elements: list) -> dict[str, float]:
101
+ page_numbers = {
102
+ getattr(getattr(el, "metadata", None), "page_number", None)
103
+ for el in elements
104
+ if getattr(getattr(el, "metadata", None), "page_number", None) is not None
105
+ }
106
+ page_count = max(1, len(page_numbers))
107
+ text_chars = sum(len(el.text) for el in elements if hasattr(el, "text") and el.text)
108
+ element_count = len(elements)
109
+ chars_per_page = text_chars / max(1, page_count)
110
+ return {
111
+ "text_chars": text_chars,
112
+ "element_count": element_count,
113
+ "page_count": page_count,
114
+ "chars_per_page": chars_per_page,
115
+ }
116
+
117
+
118
+ def _should_retry_with_hi_res(
119
+ strategy: str,
120
+ metrics: dict[str, float],
121
+ ) -> bool:
122
+ return (
123
+ strategy == "fast"
124
+ and metrics["chars_per_page"] < 200
125
+ and metrics["element_count"] < 10
126
+ )
127
+
128
+
129
def partition_document(file_path: str) -> list:
    """
    Partition a PDF into unstructured elements, choosing the cheapest viable
    strategy: "fast" when a native text layer exists, otherwise "hi_res"
    (OCR-style). When a fast pass looks suspiciously thin, retry once with
    hi_res and keep whichever pass recovered more content.
    """
    # Dynamic OCR routing + guarded high-resolution retry for suspiciously thin extraction
    # Use facade symbol so tests can monkeypatch `backend.core.pipeline._has_text_layer`.
    from backend.core import pipeline as pipeline_facade

    # Prefer a facade-provided partition_pdf (monkeypatch point for tests);
    # otherwise import the real one lazily so `unstructured` stays optional.
    partition_pdf = getattr(pipeline_facade, "partition_pdf", None)
    if not callable(partition_pdf):
        try:
            from unstructured.partition.pdf import partition_pdf as _partition_pdf
        except Exception as exc:
            raise RuntimeError(
                "Missing dependency 'unstructured'. Install it to ingest PDFs."
            ) from exc
        partition_pdf = _partition_pdf

    has_text = pipeline_facade._has_text_layer(file_path)
    strategy = "fast" if has_text else "hi_res"
    log.info("PDF text layer detected: %s. Using partition strategy: %s", has_text, strategy)
    elements = partition_pdf(
        filename=file_path,
        strategy=strategy,
        infer_table_structure=True,
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True,
    )
    metrics = _extract_element_metrics(elements)
    log.info(
        "%d elements extracted (text_chars=%d, page_count=%d, chars_per_page=%.1f)",
        len(elements),
        metrics["text_chars"],
        metrics["page_count"],
        metrics["chars_per_page"],
    )

    # Retry only applies to "fast" passes whose coverage looks too sparse.
    if _should_retry_with_hi_res(strategy, metrics):
        log.info(
            "Extraction looked suspiciously thin (chars_per_page=%.1f, elements=%d) — retrying once with hi_res.",
            metrics["chars_per_page"],
            metrics["element_count"],
        )
        hi_res_elements = partition_pdf(
            filename=file_path,
            strategy="hi_res",
            infer_table_structure=True,
            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,
        )
        hi_res_metrics = _extract_element_metrics(hi_res_elements)
        # Keep the expensive pass only when it demonstrably recovered more.
        if (
            hi_res_metrics["text_chars"] > metrics["text_chars"]
            or hi_res_metrics["element_count"] > metrics["element_count"]
        ):
            log.info(
                "Using hi_res extraction instead (text_chars=%d, elements=%d).",
                hi_res_metrics["text_chars"],
                hi_res_metrics["element_count"],
            )
            return hi_res_elements
        log.info("Keeping fast extraction — hi_res did not improve coverage.")

    return elements
190
+
191
+
192
+ def _build_document_tree(elements: list) -> dict:
193
+ """
194
+ Converts a flat list of unstructured elements into a nested JSON tree.
195
+ Titles become parent nodes, and Text/Tables become their children.
196
+ """
197
+ tree = {"title": "Document Root", "type": "root", "children": []}
198
+ current_section = tree
199
+
200
+ for el in elements:
201
+ category = getattr(el, "category", "Text")
202
+ text = str(el).strip()
203
+ if not text:
204
+ continue
205
+ page_num = getattr(getattr(el, "metadata", None), "page_number", None)
206
+ try:
207
+ page_num = int(page_num) if page_num is not None else None
208
+ except Exception:
209
+ page_num = None
210
+
211
+ if category == "Title":
212
+ new_section = {
213
+ "type": "section",
214
+ "title": text[:150], # Keep titles concise
215
+ "content": text,
216
+ "children": [],
217
+ }
218
+ tree["children"].append(new_section)
219
+ current_section = new_section
220
+ elif category in ("Table", "Text", "NarrativeText", "ListItem"):
221
+ child = {"type": category, "content": text}
222
+ if page_num is not None:
223
+ child["page_numbers"] = [page_num]
224
+ current_section["children"].append(child)
225
+
226
+ return tree
227
+
228
+
229
def run_ingestion(
    pdf_path: str,
    export_json: bool = False,
    force: bool = False,
    progress_callback=None,
    original_filename: "str | None" = None,
    access_token: "str | None" = None,
) -> "str | dict":
    """
    Ingestion orchestrator: fingerprint -> partition -> classify -> chunk ->
    RAPTOR tree -> embed/upload -> registry bookkeeping.

    Args:
        pdf_path: Local path of the PDF to ingest.
        export_json: When True, also dump processed chunks to a local JSON file.
        force: Re-ingest even when the fingerprint is already registered.
        progress_callback: Optional callable(step, total_steps, message) for UI progress.
        original_filename: Client-supplied name used for chunk naming (falls back to pdf_path).
        access_token: Tenant JWT; when absent, a fixed all-zeros user id is used.

    Returns:
        "already_ingested" when skipped; whatever
        `_recover_or_prepare_orphaned_upload` returned when it recovers an
        orphaned upload (truthy); otherwise a dict describing the newly
        ingested document (marked pending_review).

    Raises:
        ValueError: When the PDF is blank/unreadable, has almost no text, or is
            rejected by the classifier (`is_allowed` is False).

    Note: during the de-monolith refactor, some collaborators still live on the
    facade module. We import them lazily to avoid circular imports at module load.
    """
    from backend.core.auth_utils import extract_jwt_sub
    from backend.core import pipeline as pipeline_facade

    STEPS = 6
    # Per-stage wall-clock timings, logged and returned via the retry-event log.
    stage_timings_ms: dict[str, int] = {}

    def _progress(step: int, msg: str) -> None:
        # Log locally and mirror to the caller-supplied callback (if any).
        log.info("[%d/%d] %s", step, STEPS, msg)
        if progress_callback:
            progress_callback(step, STEPS, msg)

    def _record_stage_timing(stage_name: str, started_at: float) -> None:
        # Record elapsed wall-clock ms for a stage and emit a telemetry row.
        elapsed_ms = max(0, int((time.perf_counter() - started_at) * 1000))
        stage_timings_ms[stage_name] = elapsed_ms
        log.info("Ingestion stage '%s' completed in %d ms", stage_name, elapsed_ms)
        pipeline_facade._log_ingestion_retry_event(
            user_id=user_id,
            # Defensive: `file_hash` is a closure variable that is only bound
            # after step 1; the locals() guard passes None before then.
            file_hash=file_hash if "file_hash" in locals() else None,
            batch_num=0,
            total_batches=0,
            attempt=1,
            event_type="stage_timing",
            # Truncated to 500 chars to respect the event log's column size.
            message=json.dumps({"stage": stage_name, "elapsed_ms": elapsed_ms})[:500],
            sleep_s=0,
        )

    log.info("=" * 50)
    log.info("Starting ingestion: %s", pdf_path)

    # Anonymous/guest runs fall back to an all-zeros UUID.
    user_id = (
        extract_jwt_sub(access_token)
        if access_token
        else "00000000-0000-0000-0000-000000000000"
    )

    _progress(1, "Computing file fingerprint…")
    # Use facade symbol so tests can monkeypatch `backend.core.pipeline.get_file_fingerprint`.
    file_hash = pipeline_facade.get_file_fingerprint(pdf_path)
    already_exists = pipeline_facade.is_file_already_ingested(file_hash, access_token=access_token)
    if not already_exists:
        # A previous run may have uploaded partial data without registering the
        # file; try to recover (or clean up) before doing a full re-ingest.
        recovered_existing = pipeline_facade._recover_or_prepare_orphaned_upload(
            file_hash,
            user_id=user_id,
            access_token=access_token,
            filename_hint=original_filename or os.path.basename(pdf_path),
            force=force,
        )
        if recovered_existing:
            return recovered_existing
    if already_exists and not force:
        log.info("SKIPPING — already ingested.")
        return "already_ingested"

    # On re-ingest, honor a user-set category override so the classifier
    # doesn't silently revert it.
    forced_category = None
    if already_exists or force:
        try:
            _sb = pipeline_facade._build_supabase_client(access_token)
            _existing = (
                _sb.table("ingested_files")
                .select("document_type, user_overridden")
                .eq("user_id", user_id)
                .eq("file_hash", file_hash)
                .limit(1)
                .execute()
            )
            if _existing.data and _existing.data[0].get("user_overridden"):
                forced_category = _existing.data[0]["document_type"]
                log.info(
                    "User override active — forcing category '%s', skipping classifier.",
                    forced_category,
                )
        except Exception as _exc:
            # Best-effort: a failed lookup just means the classifier runs normally.
            log.warning("Could not check user override: %s", _exc)

    if already_exists or force:
        # Remove leftover chunks/trees from the earlier ingestion before rebuilding.
        pipeline_facade._cleanup_existing_ingestion_fragments(
            file_hash,
            user_id=user_id,
            access_token=access_token,
        )

    _progress(2, "Partitioning PDF (OCR + layout detection)…")
    stage_started = time.perf_counter()
    # Use facade symbols so tests can monkeypatch these helpers.
    elements = pipeline_facade.partition_document(pdf_path)
    pdf_images = pipeline_facade.extract_images_from_pdf(pdf_path)
    if not elements:
        raise ValueError(
            "The PDF appears blank or unreadable. "
            "If scanned, ensure tesseract-ocr is installed."
        )
    # Sanity check extraction quality before spending money on LLM calls.
    text_chars = sum(len(el.text) for el in elements if hasattr(el, "text") and el.text)
    coverage_metrics = _extract_element_metrics(elements)
    if text_chars < 50:
        raise ValueError(
            f"PDF contains almost no readable text ({text_chars} chars). "
            "May be corrupted or image-only without OCR layer."
        )
    identity_json = pipeline_facade._identity_json_from_elements(
        elements,
        fallback_title=pipeline_facade._extract_pdf_title(elements, os.path.basename(pdf_path)),
    )
    _record_stage_timing("partition", stage_started)

    _progress(3, "Classifying document and building taxonomy…")
    stage_started = time.perf_counter()
    graph_data = pipeline_facade.extract_document_entities(
        elements,
        access_token=access_token,
        forced_category=forced_category,
    )
    if not graph_data.is_allowed:
        raise ValueError("Document rejected: appears blank, spam, or unreadable.")
    log.info("Category: '%s'", graph_data.document_type)
    _record_stage_timing("classify", stage_started)

    # Structural PageIndex tree is best-effort: retrieval still works without it.
    try:
        log.info("🌳 Generating structural PageIndex tree...")
        doc_tree = pipeline_facade._build_document_tree(elements)
        sb = pipeline_facade._build_service_supabase_client()
        sb.table("document_trees").upsert(
            {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
            on_conflict="user_id,file_hash",
        ).execute()
        log.info("✅ PageIndex tree saved to Supabase.")
    except Exception as e:
        log.warning("⚠️ Failed to generate/save document tree: %s", e)

    _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
    stage_started = time.perf_counter()
    chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
    # Prefer the client-supplied filename for human-facing chunk naming.
    pdf_path_for_naming = original_filename if original_filename else pdf_path
    docs, ids = pipeline_facade.process_chunks(
        chunks,
        elements,
        pdf_path_for_naming,
        file_hash,
        graph_data,
        user_id,
        pdf_images,
        coverage_metrics=coverage_metrics,
    )
    _record_stage_timing("chunk_process", stage_started)

    _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
    stage_started = time.perf_counter()
    # RAPTOR appends summary nodes, so docs/ids are rebound to the expanded set.
    docs, ids = pipeline_facade.build_raptor_tree(docs, ids, user_id)
    pipeline_facade._persist_graph_foundation(
        user_id=user_id,
        file_hash=file_hash,
        docs=docs,
        graph_data=graph_data,
    )
    _record_stage_timing("raptor", stage_started)

    # process_chunks stamps the display name on each doc's metadata["source"].
    smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
    if export_json:
        log.info("💾 Exporting processed chunks to local JSON...")
        pipeline_facade.export_to_json(docs)

    _progress(6, f"Embedding and uploading {len(docs)} tree nodes…")
    stage_started = time.perf_counter()
    pipeline_facade.upload_to_supabase(docs, ids, access_token=access_token)
    _record_stage_timing("upload", stage_started)

    # Register the file only AFTER the upload succeeded, so a crash mid-upload
    # leaves the file unregistered and recoverable on the next attempt.
    try:
        sb = pipeline_facade._build_service_supabase_client()
        sb.table("ingested_files").upsert(
            {
                "user_id": user_id,
                "file_hash": file_hash,
                "filename": smart_name,
                "document_type": graph_data.document_type,
                "chunk_count": len(docs),
                "identity_json": identity_json,
            },
            on_conflict="user_id,file_hash",
        ).execute()
        pipeline_facade._log_ingestion_retry_event(
            user_id=user_id,
            file_hash=file_hash,
            batch_num=0,
            total_batches=0,
            attempt=1,
            event_type="registry_saved",
            message="Registered ingested file after successful upload.",
        )
    except Exception as e:
        # Registry failure is logged but not fatal: the data is already uploaded.
        log.error("Failed to register file: %s", e)
        pipeline_facade._log_ingestion_retry_event(
            user_id=user_id,
            file_hash=file_hash,
            batch_num=0,
            total_batches=0,
            attempt=1,
            event_type="registry_failed",
            message=str(e),
        )

    if access_token:
        # Best-effort cache bust so the user's next query sees the new document.
        try:
            invalidate_user_cache(user_id, reason="new_document_ingested")
        except Exception:
            pass

    log.info("Ingestion complete!")
    pipeline_facade._log_ingestion_retry_event(
        user_id=user_id,
        file_hash=file_hash,
        batch_num=0,
        total_batches=0,
        attempt=1,
        event_type="ingestion_complete",
        message="Ingestion completed successfully.",
    )
    log.info("Ingestion stage timings (ms): %s", stage_timings_ms)
    return {
        "pending_review": True,
        "document_type": graph_data.document_type,
        "filename": smart_name,
        "file_hash": file_hash,
    }
465
+
backend/core/pipeline_memory.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Memory & prefetch facade functions.
3
+
4
+ The implementation lives in `pipeline.py` during migration; this module gives
5
+ it a clear ownership boundary and makes it easy to feature-flag later.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+
11
def _predict_and_prefetch(
    original_query: str, answer: str, category: str, session_id: str, access_token: str
):
    """
    Stable entrypoint for prefetch prediction.

    Thin forwarder: the heavy implementation still lives on the pipeline
    facade during the de-monolith migration, so we import it lazily.
    """
    from backend.core import pipeline as pipeline_facade

    call_kwargs = {
        "original_query": original_query,
        "answer": answer,
        "category": category,
        "session_id": session_id,
        "access_token": access_token,
    }
    return pipeline_facade._predict_and_prefetch_impl(**call_kwargs)
23
+
backend/core/pipeline_pageindex.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PageIndex / structural tree path retrieval.
3
+
4
+ This module isolates TOC/page lookup heuristics and Supabase `document_trees`
5
+ traversal so issues in structural retrieval don't churn the main retrieval path.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import re
12
+ import time
13
+ from typing import List
14
+
15
+ from langchain_core.documents import Document
16
+
17
+
18
+ log = logging.getLogger("rag_pipeline")
19
+
20
+
21
+ def _should_use_tree_path(query: str) -> bool:
22
+ """
23
+ Zero-latency heuristic to route structured/specific queries to PageIndex
24
+ instead of the standard vector semantic search.
25
+ """
26
+ # 1. Regex match for Course Codes (e.g., DSN4097, CSE2001, ENG101)
27
+ if re.search(r"\b[A-Z]{2,4}\s?[0-9]{3,4}\b", query, re.IGNORECASE):
28
+ return True
29
+
30
+ q = (query or "").lower()
31
+
32
+ # 2. Structured-document intents that benefit from PageIndex.
33
+ # Keep this conservative: over-triggering PageIndex causes irrelevant “structural” hits.
34
+ if "table of contents" in q or ("contents" in q and "page" in q):
35
+ return True
36
+ trigger_words = {"list", "exactly", "code"}
37
+ query_words = set(q.split())
38
+ if query_words.intersection(trigger_words):
39
+ return True
40
+
41
+ return False
42
+
43
+
44
def tree_search(
    query: str,
    access_token: str = None,
    category: str = None,
    priority_file_hashes: List[str] = None,
) -> List[Document]:
    """
    Navigates the structural JSON trees in Supabase to answer highly specific
    'Needle in a Haystack' queries (e.g., course codes, exact table lookups).

    Args:
        query: The user's raw query text.
        access_token: Tenant JWT; without it a fixed all-zeros user id is used.
        category: Optional document category filter ("All" disables filtering).
        priority_file_hashes: Optional pinned file hashes that further restrict
            which trees are searched.

    Returns:
        Up to 5 Documents built from matching structural nodes, sorted by
        descending relevance_score. Returns [] on no match or any failure
        (this path is strictly best-effort).
    """
    log.info("🔍 Executing Tree Search for query: %s", query)

    q = (query or "").strip()
    q_lower = q.lower()

    def _norm_for_match(s: str) -> str:
        """
        Lightweight normalization to make TOC matching robust across OCR/partition quirks:
        - normalize curly quotes/apostrophes to ASCII
        - lowercase
        - collapse whitespace
        """
        s = str(s or "")
        s = (
            s.replace("’", "'")
            .replace("‘", "'")
            .replace("“", '"')
            .replace("”", '"')
            .replace("`", "'")
        )
        s = s.lower()
        s = re.sub(r"\s+", " ", s).strip()
        return s

    # 1. Extract the specific targets from the query (e.g., Course Codes)
    targets = set(re.findall(r"\b[A-Z]{2,4}\s?[0-9]{3,4}\b", q, re.IGNORECASE))

    # Special-case: Table of contents lookups ("what page is X on?").
    # Extract the section title inside quotes if present, otherwise fall back
    # to a small target set to avoid matching the entire tree.
    toc_lookup = ("table of contents" in q_lower) or ("contents" in q_lower and "page" in q_lower)
    toc_target = None
    if toc_lookup and not targets:
        m = re.search(r"[\"'“”‘’](.+?)[\"'“”‘’]", q)
        if m:
            toc_target = m.group(1).strip()
        if toc_target:
            # Add normalized variants so "What’s New" matches "What's New" / "Whats New".
            norm = _norm_for_match(toc_target)
            if norm:
                targets = {norm, norm.replace("'", "")}

    # Fallback: extract important keywords if no explicit course code is found
    if not targets:
        trigger_words = {"table", "contents", "list", "exactly", "code", "section", "capstone", "credits"}
        stopwords = {
            "what",
            "is",
            "the",
            "how",
            "many",
            "for",
            "in",
            "a",
            "of",
            "to",
            "on",
            "only",
            "page",
        }
        words = {w.strip(".,:;!?()[]{}") for w in q_lower.split()}
        words = {w for w in words if w}
        targets = words - trigger_words - stopwords

    if not targets:
        log.info("No specific targets extracted for tree search.")
        return []

    try:
        from backend.core.auth_utils import extract_jwt_sub
        from backend.core import pipeline as pipeline_facade

        user_id = (
            extract_jwt_sub(access_token)
            if access_token
            else "00000000-0000-0000-0000-000000000000"
        )
        # Use facade symbol so tests can monkeypatch `backend.core.pipeline._build_supabase_client`.
        sb = pipeline_facade._build_supabase_client(access_token)

        # 2. Fetch all structural trees for this user
        res = (
            sb.table("document_trees")
            .select("file_hash, tree_json")
            .eq("user_id", user_id)
            .execute()
        )
        if not res.data:
            return []

        # Build the allow-list of file hashes from category filter and pins.
        # None means "no restriction".
        allowed_hashes = None
        if category and category != "All":
            try:
                allowed_res = (
                    sb.table("ingested_files")
                    .select("file_hash")
                    .eq("document_type", category)
                    .execute()
                )
                allowed_hashes = {
                    row.get("file_hash") for row in (allowed_res.data or []) if row.get("file_hash")
                }
            except Exception as exc:
                # Filter failure degrades to "search all trees" rather than erroring.
                log.warning("Could not apply tree-search category filter: %s", exc)
        if priority_file_hashes:
            pinned_hashes = {h for h in priority_file_hashes if h}
            if pinned_hashes:
                # Pins intersect with the category filter when both are present.
                allowed_hashes = (
                    pinned_hashes
                    if allowed_hashes is None
                    else allowed_hashes.intersection(pinned_hashes)
                )

        matched_chunks: list[Document] = []

        # 3. Recursive Tree Traversal
        def _traverse(node, parent_title="", file_hash=""):
            title = str(node.get("title", "") or "")
            content = str(node.get("content", "") or "")
            node_text = _norm_for_match(title + " " + content)

            # If the node contains our target noun/code, we capture it
            norm_targets = [_norm_for_match(t) for t in targets]
            is_match = any(t and t in node_text for t in norm_targets)

            if is_match and content:
                parent_chain = f"{parent_title} {title}".strip().lower()

                # TOC lookups should only match TOC entries (not random headers/sections that mention the phrase).
                # NOTE: Many PDFs don't label the TOC as a distinct "Title" element during partitioning,
                # so TOC rows can end up under "Document Root" or a different parent. We therefore treat
                # "in TOC section" as a relevance boost (not a hard filter) and rely on the stricter
                # "dotted leader -> page number" extraction below to keep TOC matches precise.
                in_toc_section = False
                if toc_lookup:
                    in_toc_section = ("table of contents" in parent_chain) or (
                        parent_chain.startswith("contents") or "contents" in parent_chain
                    )

                # Score matches: prefer nodes that contain the full target phrase and a TOC-like page number.
                score = 0.2
                if toc_lookup and in_toc_section:
                    score += 0.15
                if toc_target and _norm_for_match(toc_target) in node_text:
                    score += 0.6
                score += 0.2 if any(t and t in node_text for t in norm_targets) else 0.0

                # Attempt to extract page numbers from TOC lines ("..... 6") or "Page 6".
                page_numbers: list[int] = []
                # TOC dotted leaders can appear as "..... 6" or ". . . . 6"
                toc_page_match = re.search(r"(?:\.\s*){2,}(\d{1,3})\b", content)
                if toc_page_match:
                    page_numbers.append(int(toc_page_match.group(1)))
                    score += 0.3
                elif toc_lookup:
                    # Looser leader patterns (middle dots, ellipsis) anchored at line end.
                    leader_page = re.search(
                        r"(?:[.\u00b7\u2026]\s*){1,}(\d{1,3})\s*$", content
                    )
                    if leader_page:
                        page_numbers.append(int(leader_page.group(1)))
                        score += 0.25
                    else:
                        # Last resort for TOC rows: trailing number after wide spacing.
                        spaced_page = re.search(r"\s{2,}(\d{1,3})\s*$", content)
                        if spaced_page:
                            page_numbers.append(int(spaced_page.group(1)))
                            score += 0.2
                elif not toc_lookup:
                    # Non-TOC queries: an inline "page N" mention still helps.
                    page_hint = re.search(
                        r"\bpage\s+(\d{1,3})\b", content, flags=re.IGNORECASE
                    )
                    if page_hint:
                        page_numbers.append(int(page_hint.group(1)))
                        score += 0.2

                # TOC lookups without an extractable page number are dropped:
                # a TOC answer is useless without the page.
                if toc_lookup and not page_numbers:
                    return

                matched_chunks.append(
                    Document(
                        page_content=f"Section Context: {parent_title} -> {title}\n\n{content}",
                        metadata={
                            "source": "PageIndex Tree Structure",
                            "file_hash": file_hash,
                            "type": "structural_node",
                            "page_numbers": page_numbers,
                            "relevance_score": round(min(1.0, max(0.0, score)), 4),
                            "retrieved_at_ms": int(time.time() * 1000),
                        },
                    )
                )

            for child in node.get("children", []):
                _traverse(child, node.get("title", parent_title), file_hash)

        for tree_row in res.data:
            if allowed_hashes is not None and tree_row.get("file_hash") not in allowed_hashes:
                continue
            _traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])

        log.info("✅ Tree search found %d matching structural nodes.", len(matched_chunks))

        # Highest-scoring nodes first; cap at 5 to bound prompt size downstream.
        matched_chunks.sort(
            key=lambda d: float((d.metadata or {}).get("relevance_score") or 0.0), reverse=True
        )
        return matched_chunks[:5]

    except Exception as e:
        log.warning("⚠️ Tree Search failed, falling back to empty chunks: %s", e)
        return []
263
+
backend/core/pipeline_retrieval.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Retrieval facade functions.
3
+
4
+ During the gradual de-monolith refactor, we keep the heavy implementations in
5
+ `pipeline.py` (renamed to *_impl) and provide stable entrypoints here. This
6
+ lets API/tests import retrieval without pulling generation/ingestion concerns.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import List
12
+
13
+ from langchain_core.documents import Document
14
+
15
+
16
def generate_sub_queries(original_query: str, *, route_class: str = "factoid") -> List[str]:
    """Stable entrypoint for sub-query generation; delegates to the facade."""
    from backend.core import pipeline as pipeline_facade

    impl = pipeline_facade._generate_sub_queries_impl
    return impl(original_query, route_class=route_class)
23
+
24
+
25
def retrieve_chunks(
    query: str,
    k: int = 3,
    source_file: str = None,
    category: str = None,
    alpha: float = 0.5,
    session_id: str = "default_session",
    access_token: str = None,
    user_id: str = None,
    original_query: str = None,
    eval_mode: bool = False,
    priority_file_hashes: List[str] = None,
) -> List[Document]:
    """
    Stable retrieval entrypoint.

    Forwards every option unchanged to the facade implementation, which still
    owns the heavy retrieval logic during the de-monolith migration.
    """
    from backend.core import pipeline as pipeline_facade

    options = dict(
        k=k,
        source_file=source_file,
        category=category,
        alpha=alpha,
        session_id=session_id,
        access_token=access_token,
        user_id=user_id,
        original_query=original_query,
        eval_mode=eval_mode,
        priority_file_hashes=priority_file_hashes,
    )
    return pipeline_facade._retrieve_chunks_impl(query, **options)
53
+
54
+
55
def retrieve_chunks_routed(
    query: str,
    k: int = 3,
    source_file: str = None,
    category: str = None,
    alpha: float = 0.5,
    session_id: str = "default_session",
    access_token: str = None,
    user_id: str = None,
    original_query: str = None,
    eval_mode: bool = False,
    priority_file_hashes: List[str] = None,
) -> List[Document]:
    """
    Routed retrieval entrypoint (router decides the retrieval strategy).

    Mirrors `retrieve_chunks` but delegates to the routed implementation on
    the facade; every option is forwarded unchanged.
    """
    from backend.core import pipeline as pipeline_facade

    options = dict(
        k=k,
        source_file=source_file,
        category=category,
        alpha=alpha,
        session_id=session_id,
        access_token=access_token,
        user_id=user_id,
        original_query=original_query,
        eval_mode=eval_mode,
        priority_file_hashes=priority_file_hashes,
    )
    return pipeline_facade._retrieve_chunks_routed_impl(query, **options)
83
+
backend/core/pipeline_routing.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Routing and expert selection logic.
3
+
4
+ This module is extracted from `backend/core/pipeline.py` as part of the
5
+ de-monolith refactor. The facade still owns many helpers; we import them
6
+ lazily to avoid circular imports during migration.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from typing import List, Optional
13
+
14
+ from backend.core.pipeline_types import RouteDecision
15
+
16
+ log = logging.getLogger("rag_pipeline")
17
+
18
+
19
def _classify_query_route_decision(
    query: str,
    *,
    session_id: Optional[str] = None,
    user_id: Optional[str] = None,
    priority_file_hashes: Optional[List[str]] = None,
) -> RouteDecision:
    """
    Decide the retrieval route for a query.

    Gates are evaluated in strict priority order: empty input, greetings,
    page-scoped lookups, exact-fact lookups, follow-ups, then the
    compare/multi-part/summary/relational heuristics, an LLM classifier as a
    tie-breaker, and finally a plain "factoid" default.
    """
    from backend.core import pipeline as pipeline_facade

    normalized = (query or "").strip().lower()
    if not normalized:
        return RouteDecision(route_class="factoid", route_reason="empty_query")
    if normalized in {"hi", "hello", "hey", "thanks", "thank you"}:
        return RouteDecision(route_class="no_retrieval", route_reason="greeting")

    page_scope = pipeline_facade._detect_page_scope(normalized)
    exact_field = pipeline_facade._detect_identity_field(normalized)

    if page_scope:
        # Page-scoped lookups must keep the literal query and skip memory.
        return RouteDecision(
            route_class="page_scoped",
            route_reason=f"page_scope:{page_scope}",
            preserve_query=True,
            disable_memory=True,
            page_scope=page_scope,
            exact_field=exact_field,
        )

    if exact_field or pipeline_facade._is_exact_fact_query(normalized):
        return RouteDecision(
            route_class="exact_fact",
            route_reason=f"identity_field:{exact_field or 'generic'}",
            preserve_query=True,
            disable_memory=True,
            exact_field=exact_field,
        )

    if pipeline_facade._is_follow_up_reference(query, session_id=session_id, user_id=user_id):
        return RouteDecision(
            route_class="follow_up",
            route_reason="session_reference",
            preserve_query=False,
            disable_memory=False,
        )

    # Multiple pinned documents are treated as an implicit comparison request.
    multi_doc_pinned = bool(priority_file_hashes and len(priority_file_hashes) > 1)
    if pipeline_facade._is_compare_like_query(query) or multi_doc_pinned:
        return RouteDecision(route_class="compare", route_reason="compare_keywords")
    if pipeline_facade._is_multi_part_query(query):
        return RouteDecision(route_class="multi_part", route_reason="multi_part_keywords")
    if pipeline_facade._is_summary_like_query(query):
        return RouteDecision(route_class="summary", route_reason="summary_keywords")
    if pipeline_facade._is_relational_query(query):
        return RouteDecision(route_class="relational", route_reason="relational_keywords")

    # Cheap heuristics were inconclusive — ask the LLM route classifier.
    llm_decision = pipeline_facade._llm_route_classifier(
        query,
        session_id=session_id,
        user_id=user_id,
        priority_file_hashes=priority_file_hashes,
    )
    if llm_decision and llm_decision.route_class:
        return llm_decision

    return RouteDecision(route_class="factoid", route_reason="heuristic_default")
80
+
81
+
82
def _route_query_experts(
    query: str,
    *,
    session_id: Optional[str] = None,
    user_id: Optional[str] = None,
    priority_file_hashes: Optional[List[str]] = None,
) -> dict:
    """
    Score and select retrieval "experts" for a query.

    Combines an embedding-similarity stage (query vs. per-expert prototype
    texts) with cheap keyword/session feature boosts, normalizes the result
    into weights, and falls back to an LLM router when the top-2 gap is small.

    Returns:
        dict with:
        - "expert_weights": normalized weight per expert
        - "selected_experts": up to 3 experts with weight >= 0.18, best first
        - "confidence": gap between the top two weights (rounded to 4 dp)
    """
    from backend.core import pipeline as pipeline_facade

    q = (query or "").strip()
    q_lower = q.lower()
    # Stage 1: cosine similarity between the query embedding and each expert's
    # prototype embeddings, averaged per expert and clamped at >= 0.
    embedding_scores: dict[str, float] = {}
    try:
        query_vec = pipeline_facade.get_cached_embedding(q or "general document information")
        for expert, prototypes in pipeline_facade._ROUTER_PROTOTYPES.items():
            sims = [
                pipeline_facade._vector_cosine(
                    query_vec, pipeline_facade.get_cached_embedding(proto)
                )
                for proto in prototypes
            ]
            embedding_scores[expert] = max(0.0, sum(sims) / max(1, len(sims)))
    except Exception as exc:
        # Embedding backend unavailable: fall back to a flat uniform prior.
        log.debug("Router embedding stage unavailable: %s", exc)
        embedding_scores = {expert: 0.2 for expert in pipeline_facade._ROUTER_PROTOTYPES}

    # Stage 2: keyword/session feature boosts layered on top of the embeddings.
    feature_scores = {expert: 0.0 for expert in pipeline_facade._ROUTER_PROTOTYPES}
    if pipeline_facade._is_summary_like_query(q_lower):
        feature_scores["raptor_summary"] += 0.35
    if pipeline_facade._is_compare_like_query(q_lower):
        feature_scores["hybrid_compare"] += 0.45
        feature_scores["graph_traversal"] += 0.10
    if any(
        token in q_lower
        for token in ("relationship", "connected", "connection", "link", "linked", "why", "cause")
    ):
        feature_scores["graph_traversal"] += 0.35
    if priority_file_hashes and len(priority_file_hashes) > 1:
        # Multiple pinned documents hint at a cross-document comparison.
        feature_scores["hybrid_compare"] += 0.15
    if session_id:
        # Anaphora ("it", "that", ...) plus cached session chunks suggests a
        # follow-up that episodic memory can answer.
        session_key = pipeline_facade._session_cache_key(session_id, user_id=user_id)
        if session_key in pipeline_facade._last_chunks and any(
            token in q_lower for token in ("it", "this", "that", "previous", "above", "earlier")
        ):
            feature_scores["episodic_memory"] += 0.35
    if not priority_file_hashes:
        feature_scores["dense_chunk"] += 0.10

    # Blend: embeddings dominate (0.65) with features as a secondary signal (0.35).
    combined = {
        expert: (embedding_scores.get(expert, 0.0) * 0.65)
        + (feature_scores.get(expert, 0.0) * 0.35)
        for expert in pipeline_facade._ROUTER_PROTOTYPES
    }
    weights = pipeline_facade._normalize_weight_map(combined)
    ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
    confidence_gap = ranked[0][1] - ranked[1][1] if len(ranked) > 1 else ranked[0][1]
    # Low-confidence tie on a non-trivial query: let the LLM router override.
    if confidence_gap < 0.06 and len(q.split()) >= 4:
        llm_weights = pipeline_facade._llm_router_fallback(q)
        if llm_weights:
            weights = llm_weights
            ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
            confidence_gap = ranked[0][1] - ranked[1][1] if len(ranked) > 1 else ranked[0][1]
    return {
        "expert_weights": weights,
        "selected_experts": [expert for expert, score in ranked if score >= 0.18][:3],
        "confidence": round(confidence_gap, 4),
    }
149
+
backend/core/pipeline_supabase.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Supabase client builders and small DB helpers for the RAG pipeline.
3
+
4
+ Separated so API/worker code can use Supabase utilities without importing the
5
+ entire pipeline (LLMs, unstructured, etc.).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ try:
11
+ from supabase.client import create_client
12
+ except Exception: # optional at import time
13
+ create_client = None
14
+
15
+ from backend.core import config
16
+
17
+
18
+ def _build_service_supabase_client():
19
+ """Service-role client (bypasses RLS). Use only for admin/bootstrap paths."""
20
+ if create_client is None:
21
+ raise RuntimeError("Missing dependency 'supabase'. Install supabase-py to use DB features.")
22
+ return create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
23
+
24
+
25
+ def _build_user_supabase_client(access_token: str):
26
+ if create_client is None:
27
+ raise RuntimeError("Missing dependency 'supabase'. Install supabase-py to use DB features.")
28
+ if not config.SUPABASE_ANON_KEY:
29
+ raise RuntimeError(
30
+ "SUPABASE_ANON_KEY is not set but a tenant access_token was provided."
31
+ )
32
+ client = create_client(config.SUPABASE_URL, config.SUPABASE_ANON_KEY)
33
+ # supabase-py v2: set JWT for RLS via postgrest auth header
34
+ client.postgrest.auth(access_token)
35
+ return client
36
+
37
+
38
+ def _build_supabase_client(access_token: str = None):
39
+ """
40
+ Default to service role for legacy/internal call paths.
41
+ API routes should pass access_token so RLS is enforced.
42
+ """
43
+ if access_token:
44
+ return _build_user_supabase_client(access_token)
45
+ return _build_service_supabase_client()
46
+
backend/core/pipeline_types.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared Pydantic schemas and lightweight types for the RAG pipeline.
3
+
4
+ Kept in a separate module so API/worker code can import types without pulling
5
+ in the full pipeline runtime (LLM clients, unstructured, etc.).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import List, Optional
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
+
15
+ class DocumentGraphMetadata(BaseModel):
16
+ """
17
+ Dynamic taxonomy classification.
18
+ All fields have safe defaults so partial LLM responses never raise.
19
+ """
20
+
21
+ is_allowed: bool = Field(
22
+ default=True,
23
+ description=(
24
+ "True for any real document with meaningful content. "
25
+ "False ONLY for blank/empty files, pure spam, or completely unreadable content."
26
+ ),
27
+ )
28
+ document_type: str = Field(
29
+ default="general_document",
30
+ description=(
31
+ "A snake_case category label. Choose from the existing list if a good match exists. "
32
+ "Otherwise invent a concise new label e.g. 'machine_learning_paper', 'legal_contract'."
33
+ ),
34
+ )
35
+ key_entities: List[str] = Field(
36
+ default_factory=list,
37
+ description="Names of algorithms, people, organizations, places, or technologies mentioned.",
38
+ )
39
+ primary_topics: List[str] = Field(
40
+ default_factory=list,
41
+ description="The 2-3 broad themes of the document.",
42
+ )
43
+ brief_summary: str = Field(
44
+ default="No summary available.",
45
+ description="A one-sentence summary of what this document is about.",
46
+ )
47
+ # Absorb extra fields older LLM responses include — prevents Pydantic crash
48
+ categories: Optional[List[str]] = Field(default=None, exclude=True)
49
+ audience: Optional[str] = Field(default=None, exclude=True)
50
+
51
+
52
+ class QueryVariants(BaseModel):
53
+ sub_queries: List[str] = Field(
54
+ description="1-3 highly optimized, distinct search queries broken down from the original prompt."
55
+ )
56
+
57
+
58
+ class RouteDecision(BaseModel):
59
+ route_class: str = Field(default="factoid")
60
+ route_reason: str = Field(default="heuristic_default")
61
+ preserve_query: bool = Field(default=False)
62
+ disable_memory: bool = Field(default=False)
63
+ page_scope: Optional[str] = Field(default=None)
64
+ exact_field: Optional[str] = Field(default=None)
65
+
backend/core/rate_limit.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from starlette.requests import Request

from backend.core.auth_utils import is_guest_token

# slowapi is an optional dependency: minimal/test environments may not have it
# installed, so degrade to stand-ins instead of failing at import time.
try:
    from slowapi import Limiter, _rate_limit_exceeded_handler
    from slowapi.errors import RateLimitExceeded
    from slowapi.util import get_remote_address
except Exception:  # optional in minimal/test envs
    Limiter = None
    RateLimitExceeded = Exception
    _rate_limit_exceeded_handler = None

    def get_remote_address(request):  # type: ignore
        # Minimal replacement for slowapi.util.get_remote_address:
        # client IP when available, otherwise a stable placeholder.
        return request.client.host if getattr(request, "client", None) else "unknown"
16
+
17
+
18
def _rate_limit_key(request: Request) -> str:
    """Use stricter IP limits for guest workspaces, user token limits otherwise."""
    # Prefer the explicit X-Auth-Token header, then a standard Authorization header.
    credential = request.headers.get("X-Auth-Token") or request.headers.get("Authorization")
    if credential:
        # Strip a "Bearer " scheme prefix so the raw token keys the bucket.
        if credential.startswith("Bearer "):
            credential = credential.partition(" ")[2]
        # Authenticated (non-guest) tokens get their own per-user bucket.
        if credential and not is_guest_token(credential):
            return credential
    # Guests and anonymous callers share the per-IP bucket.
    return get_remote_address(request)
26
+
27
+
28
# Export a module-level `limiter` regardless of whether slowapi is installed:
# the no-op variant keeps `@limiter.limit(...)` decorations valid while
# enforcing nothing, so routers import one name either way.
if Limiter is not None:
    limiter = Limiter(key_func=_rate_limit_key)
else:
    class _NoopLimiter:
        def limit(self, *_args, **_kwargs):
            # Mirror slowapi's Limiter.limit: return a decorator that leaves
            # the wrapped endpoint unchanged.
            def _decorator(fn):
                return fn

            return _decorator

    limiter = _NoopLimiter()
39
+
backend/core/tasks.py CHANGED
@@ -1,14 +1,60 @@
 
1
  import os
2
- from celery import Celery
 
3
  from backend.core.pipeline import run_ingestion
4
 
 
 
5
  # Initialize Celery pointing to your Redis broker
6
  REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
7
 
8
- celery_app = Celery("morpheus_worker", broker=REDIS_URL, backend=REDIS_URL)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- @celery_app.task(bind=True)
11
- def process_pdf_task(self, tmp_path: str, original_filename: str, access_token: str):
 
 
 
 
 
 
 
 
 
 
 
12
  """
13
  This runs in a completely separate background process!
14
  We pass a progress_callback to run_ingestion so it can report its status.
@@ -21,23 +67,18 @@ def process_pdf_task(self, tmp_path: str, original_filename: str, access_token:
21
  )
22
 
23
  try:
24
- # Call your existing pipeline
25
- result = run_ingestion(
26
  pdf_path=tmp_path,
27
  original_filename=original_filename,
28
  progress_callback=update_progress,
29
  access_token=access_token,
30
  )
31
-
32
- # Cleanup the temp file after the heavy ML job is done
33
- try: os.unlink(tmp_path) # noqa: E701
34
- except OSError: pass # noqa: E701
35
-
36
- return result
37
- except Exception as e:
38
- try: os.unlink(tmp_path) # noqa: E701
39
- except OSError: pass # noqa: E701
40
- # Reraising the exception tells Celery the task failed
41
- raise Exception(str(e))
42
-
43
-
 
1
+ import logging
2
  import os
3
+
4
+ from backend.core import config
5
  from backend.core.pipeline import run_ingestion
6
 
7
+ log = logging.getLogger("morpheus.tasks")
8
+
9
  # Initialize Celery pointing to your Redis broker
10
  REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
11
 
12
# Celery is optional: minimal/test environments run without it, in which case
# `celery_app` is None and the task stub below raises a clear error.
try:
    from celery import Celery
except Exception:
    Celery = None

if Celery is not None:
    celery_app = Celery("morpheus_worker", broker=REDIS_URL, backend=REDIS_URL)
    celery_app.conf.update(
        task_track_started=True,
        # Ack only after completion and requeue on worker loss, so a crashed
        # worker does not silently drop an in-flight ingestion.
        task_acks_late=True,
        task_reject_on_worker_lost=True,
        worker_cancel_long_running_tasks_on_connection_loss=True,
        # Retry broker connections forever rather than failing startup.
        broker_connection_retry_on_startup=True,
        broker_connection_max_retries=None,
        broker_heartbeat=config.CELERY_BROKER_HEARTBEAT_S,
        broker_pool_limit=config.CELERY_BROKER_POOL_LIMIT,
        # Redis transport tuning: keep sockets alive and re-deliver messages
        # whose visibility timeout lapses (e.g. after a worker stall).
        broker_transport_options={
            "visibility_timeout": config.CELERY_VISIBILITY_TIMEOUT_S,
            "socket_keepalive": True,
            "socket_timeout": config.CELERY_REDIS_SOCKET_TIMEOUT_S,
            "socket_connect_timeout": config.CELERY_REDIS_SOCKET_TIMEOUT_S,
            "retry_on_timeout": True,
            "health_check_interval": config.CELERY_REDIS_HEALTH_CHECK_INTERVAL_S,
        },
        result_backend_transport_options={
            "visibility_timeout": config.CELERY_VISIBILITY_TIMEOUT_S,
            "retry_policy": {"timeout": config.CELERY_REDIS_SOCKET_TIMEOUT_S},
            "health_check_interval": config.CELERY_REDIS_HEALTH_CHECK_INTERVAL_S,
        },
    )
else:
    celery_app = None
44
 
45
+
46
+ def _cleanup_temp_upload(tmp_path: str) -> None:
47
+ if not tmp_path:
48
+ return
49
+ try:
50
+ os.unlink(tmp_path)
51
+ except FileNotFoundError:
52
+ return
53
+ except OSError as exc:
54
+ log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
55
+
56
+
57
+ def _process_pdf_task_impl(self, tmp_path: str, original_filename: str, access_token: str):
58
  """
59
  This runs in a completely separate background process!
60
  We pass a progress_callback to run_ingestion so it can report its status.
 
67
  )
68
 
69
  try:
70
+ return run_ingestion(
 
71
  pdf_path=tmp_path,
72
  original_filename=original_filename,
73
  progress_callback=update_progress,
74
  access_token=access_token,
75
  )
76
+ finally:
77
+ _cleanup_temp_upload(tmp_path)
78
+
79
+
80
# Register the Celery task only when Celery is importable; otherwise expose a
# stub under the same name so imports still succeed and callers get a clear
# runtime error instead of an AttributeError.
if celery_app is not None:
    process_pdf_task = celery_app.task(bind=True)(_process_pdf_task_impl)
else:
    def process_pdf_task(*_args, **_kwargs):
        raise RuntimeError("Celery not installed; background ingestion is unavailable.")
 
 
 
 
backend/core/warmup_classifier.py CHANGED
@@ -19,7 +19,6 @@ Usage:
19
 
20
  import numpy as np
21
  import logging
22
- from supabase.client import create_client
23
  from dotenv import load_dotenv
24
  from backend.core import config
25
 
@@ -32,6 +31,11 @@ log = logging.getLogger("warmup")
32
 
33
 
34
  def warmup():
 
 
 
 
 
35
  supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
36
 
37
  # Step 1 — find which categories already have centroids
 
19
 
20
  import numpy as np
21
  import logging
 
22
  from dotenv import load_dotenv
23
  from backend.core import config
24
 
 
31
 
32
 
33
  def warmup():
34
+ try:
35
+ from supabase.client import create_client
36
+ except Exception as exc:
37
+ raise RuntimeError("Missing dependency 'supabase'. Install supabase-py to warm up classifier.") from exc
38
+
39
  supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
40
 
41
  # Step 1 — find which categories already have centroids
backend/eval/run_eval.py CHANGED
@@ -75,6 +75,72 @@ def _load_from_supabase(
75
  return res.data or []
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def _parse_csv_floats(s: str) -> List[float]:
79
  return [float(x.strip()) for x in s.split(",") if x.strip()]
80
 
 
75
  return res.data or []
76
 
77
 
78
def load_feedback_dataset_candidates(
    access_token: Optional[str],
    user_id: Optional[str],
    *,
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """
    Promote explicit user feedback into dataset-shaped rows for offline eval curation.
    These candidates are intentionally separate from `evaluation_datasets` so we can
    review them before activation.

    Args:
        access_token: Currently unused — NOTE(review): queries run through the
            service-role client below; confirm this parameter is kept only for
            signature parity with the other loaders.
        user_id: When provided, restrict to feedback rows from that user.
        limit: Maximum number of feedback rows to pull (pre-dedup).

    Returns:
        Dataset-shaped dicts (question, gold evidence, answerability flag,
        plus trace diagnostics), at most one per distinct trace_id.
    """
    # Local import avoids a circular import at module load time.
    from backend.core.pipeline import _build_service_supabase_client

    sb = _build_service_supabase_client()
    # Only feedback explicitly flagged for eval promotion is considered.
    feedback_q = (
        sb.table("answer_feedback")
        .select("trace_id, helpful, accepted, reason_code, correction_text, promote_to_eval, user_id")
        .eq("promote_to_eval", True)
        .limit(limit)
    )
    if user_id:
        feedback_q = feedback_q.eq("user_id", user_id)
    feedback_rows = feedback_q.execute().data or []
    trace_ids = [row.get("trace_id") for row in feedback_rows if row.get("trace_id")]
    if not trace_ids:
        return []

    # Batch-fetch the originating traces so each feedback row can be joined
    # to its question and diagnostics in memory.
    trace_rows = (
        sb.table("query_traces")
        .select("trace_id, question, doc_diagnostics, failure_modes, answer_preview")
        .in_("trace_id", trace_ids)
        .execute()
        .data
        or []
    )
    trace_map = {row.get("trace_id"): row for row in trace_rows if row.get("trace_id")}

    dataset_rows: List[Dict[str, Any]] = []
    seen_trace_ids = set()
    for feedback in feedback_rows:
        trace_id = feedback.get("trace_id")
        # Keep only the first feedback row per trace.
        if trace_id in seen_trace_ids:
            continue
        trace = trace_map.get(trace_id, {})
        question = (trace.get("question") or "").strip()
        # A candidate without a question is unusable for eval.
        if not question:
            continue
        seen_trace_ids.add(trace_id)
        correction_text = (feedback.get("correction_text") or "").strip()
        answer_preview = (trace.get("answer_preview") or "").strip()
        dataset_rows.append(
            {
                "question": question,
                "gold_context_refs": [],
                # Prefer the user's explicit correction over the original answer.
                "gold_evidence_text": correction_text or answer_preview,
                "is_answerable": bool(feedback.get("accepted") or feedback.get("helpful")),
                "trace_id": trace_id,
                "failure_modes": trace.get("failure_modes") or [],
                "doc_diagnostics": trace.get("doc_diagnostics") or [],
                "reason_code": feedback.get("reason_code"),
                "source": "feedback_trace",
            }
        )
    return dataset_rows
142
+
143
+
144
  def _parse_csv_floats(s: str) -> List[float]:
145
  return [float(x.strip()) for x in s.split(",") if x.strip()]
146
 
backend/main.py CHANGED
@@ -7,19 +7,6 @@ Production: gunicorn -w 1 -k uvicorn.workers.UvicornWorker backend.main:app --b
7
 
8
  import os
9
  import sys
10
- from slowapi import Limiter, _rate_limit_exceeded_handler
11
- from slowapi.util import get_remote_address
12
- from slowapi.errors import RateLimitExceeded
13
- from starlette.requests import Request
14
-
15
-
16
- def _rate_limit_key(request: Request) -> str:
17
- """Key rate limits by JWT token (per-user), fall back to IP."""
18
- token = request.headers.get("X-Auth-Token") or request.headers.get("Authorization")
19
- return token or get_remote_address(request)
20
-
21
-
22
- limiter = Limiter(key_func=_rate_limit_key)
23
  import logging # noqa: E402
24
  import subprocess # noqa: E402
25
  from contextlib import asynccontextmanager # noqa: E402
@@ -32,8 +19,17 @@ from dotenv import load_dotenv # noqa: E402
32
 
33
  load_dotenv()
34
 
 
 
 
 
 
35
  from backend.api import auth, corpus, ingest, query, admin, frontend_config # noqa: E402
36
- from backend.core.intent_classifier import get_intent_classifier_status # noqa: E402
 
 
 
 
37
 
38
  log = logging.getLogger("morpheus.main")
39
 
@@ -87,7 +83,8 @@ app = FastAPI(
87
 
88
  # ── Rate limiting ─────────────────────────────────────────────────────────────
89
  app.state.limiter = limiter
90
- app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
 
91
 
92
  _origins = [
93
  o.strip() for o in os.getenv("ALLOWED_ORIGINS", "*").split(",") if o.strip()
 
7
 
8
  import os
9
  import sys
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import logging # noqa: E402
11
  import subprocess # noqa: E402
12
  from contextlib import asynccontextmanager # noqa: E402
 
19
 
20
  load_dotenv()
21
 
22
+ from backend.core.rate_limit import ( # noqa: E402
23
+ RateLimitExceeded,
24
+ _rate_limit_exceeded_handler,
25
+ limiter,
26
+ )
27
  from backend.api import auth, corpus, ingest, query, admin, frontend_config # noqa: E402
28
+ try: # noqa: E402
29
+ from backend.core.intent_classifier import get_intent_classifier_status
30
+ except Exception:
31
+ def get_intent_classifier_status(): # type: ignore
32
+ return {"ok": False, "reason": "intent_classifier_unavailable"}
33
 
34
  log = logging.getLogger("morpheus.main")
35
 
 
83
 
84
  # ── Rate limiting ─────────────────────────────────────────────────────────────
85
  app.state.limiter = limiter
86
+ if _rate_limit_exceeded_handler is not None:
87
+ app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
88
 
89
  _origins = [
90
  o.strip() for o in os.getenv("ALLOWED_ORIGINS", "*").split(",") if o.strip()
frontend/index.html CHANGED
@@ -212,24 +212,26 @@
212
  onclick="submitLogin()"
213
  >SIGN IN →</button>
214
 
215
- <!-- Admin panel — collapsed by default -->
216
- <details style="margin-top:20px; width:100%;">
217
- <summary style="
218
- font-family:var(--font-mono); font-size:0.58rem;
219
- color:var(--muted); cursor:pointer; letter-spacing:0.12em;
220
- text-transform:uppercase; list-style:none; text-align:center;
221
- ">▸ Admin access</summary>
222
- <div style="margin-top:10px; display:flex; flex-direction:column; gap:6px;">
223
- <input type="password" id="adminKey" placeholder="Master admin key…" style="width:100%;box-sizing:border-box;"/>
224
- <button class="btn-secondary" onclick="submitAdmin()">GET TODAY'S CODE</button>
225
- <div id="adminResult" style="font-family:var(--font-mono);font-size:0.7rem;color:var(--phosphor);min-height:14px;text-align:center;"></div>
226
- <div id="auth-toggle-panel" style="display:none; margin-top:12px;">
227
- <div class="section-label">AUTH GATE</div>
228
- <p id="auth-toggle-label" style="font-size:0.72rem;color:var(--muted);margin-bottom:10px;"></p>
229
- <button id="auth-toggle-btn" onclick="toggleAuth()" style="width:100%;padding:9px;border-radius:6px;border:1px solid;font-family:var(--font-mono);font-size:0.72rem;cursor:pointer;letter-spacing:0.08em;transition:all 0.15s;">DISABLE AUTH</button>
230
- </div>
231
- </div>
232
- </details>
 
 
233
  </div>
234
  </div>
235
 
@@ -251,8 +253,19 @@
251
  <button class="nav-btn" id="nav-chat" onclick="switchView('chat')">
252
  CHAT
253
  </button>
 
 
 
 
 
 
 
 
254
  </nav>
255
  <div class="topbar-right">
 
 
 
256
  <div class="stat-pill">
257
  DOCS <span class="val" id="stat-docs">0</span>
258
  </div>
@@ -263,6 +276,7 @@
263
  <div class="conn-dot offline" id="conn-dot"></div>
264
  <span id="conn-label">OFFLINE</span>
265
  </div>
 
266
  <button onclick="signOut()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">SIGN OUT</button>
267
  </div>
268
  </header>
@@ -486,6 +500,43 @@
486
  </button>
487
  </div>
488
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  </aside>
490
  <!-- Mobile bottom navigation — must be inside #app for grid to work -->
491
  <div id="mobile-nav">
@@ -525,7 +576,8 @@
525
  <script src="js/corpus.js"></script>
526
  <script src="js/inspect.js"></script>
527
  <script src="js/chat.js?v=3"></script>
528
- <script src="js/main.js"></script>
 
529
  <script>
530
  function mobileNav(tab) {
531
  document
@@ -563,4 +615,4 @@
563
  }
564
  </script>
565
  </body>
566
- </html>
 
212
  onclick="submitLogin()"
213
  >SIGN IN →</button>
214
 
215
+ <button
216
+ id="guestBtn"
217
+ class="btn-secondary"
218
+ style="width:100%; letter-spacing:0.1em; margin-top:10px; display:none;"
219
+ onclick="submitGuest()"
220
+ >CONTINUE AS GUEST</button>
221
+ <label
222
+ id="guestPersistWrap"
223
+ style="display:none;width:100%;margin-top:10px;font-size:0.68rem;color:var(--muted);line-height:1.45;text-align:left;"
224
+ >
225
+ <input type="checkbox" id="guestPersist" style="margin-right:8px;accent-color:var(--phosphor);" />
226
+ Keep this guest workspace on this device
227
+ </label>
228
+ <div
229
+ id="guestInfo"
230
+ style="display:none;font-size:0.68rem;color:var(--muted);text-align:center;margin-top:8px;line-height:1.5;"
231
+ >
232
Guest mode is isolated and rate-limited. By default, the guest workspace expires when the guest session ends.
233
+ </div>
234
+
235
  </div>
236
  </div>
237
 
 
253
  <button class="nav-btn" id="nav-chat" onclick="switchView('chat')">
254
  CHAT
255
  </button>
256
+ <button
257
+ class="nav-btn"
258
+ id="nav-admin"
259
+ onclick="switchView('admin')"
260
+ style="display: none"
261
+ >
262
+ ADMIN
263
+ </button>
264
  </nav>
265
  <div class="topbar-right">
266
+ <div class="stat-pill" id="session-mode-pill" style="display:none;">
267
+ MODE <span class="val" id="session-mode-label">GUEST</span>
268
+ </div>
269
  <div class="stat-pill">
270
  DOCS <span class="val" id="stat-docs">0</span>
271
  </div>
 
276
  <div class="conn-dot offline" id="conn-dot"></div>
277
  <span id="conn-label">OFFLINE</span>
278
  </div>
279
+ <button onclick="unlockOperatorTools()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">OPERATOR</button>
280
  <button onclick="signOut()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">SIGN OUT</button>
281
  </div>
282
  </header>
 
500
  </button>
501
  </div>
502
  </div>
503
+
504
+ <!-- ── ADMIN VIEW ── -->
505
+ <div class="view" id="view-admin">
506
+ <div class="view-header">
507
+ <div class="view-title">ADMIN REVIEW</div>
508
+ <div class="view-subtitle">Trace triage, feedback, and eval promotion</div>
509
+ </div>
510
+ <div class="view-body" style="padding-top: 12px">
511
+ <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:14px;">
512
+ <input type="text" id="adminTraceFailure" placeholder="failure mode" style="flex:1;min-width:120px;" />
513
+ <input type="text" id="adminTraceCategory" placeholder="category" style="flex:1;min-width:120px;" />
514
+ <select id="adminTraceRoute" style="flex:1;min-width:120px;">
515
+ <option value="">All routes</option>
516
+ <option value="default">default</option>
517
+ <option value="single">single</option>
518
+ <option value="generic_pinned">generic_pinned</option>
519
+ <option value="explicit_compare">explicit_compare</option>
520
+ </select>
521
+ <button class="btn-secondary" onclick="refreshAdminDashboard()">REFRESH</button>
522
+ </div>
523
+ <div id="adminSummary" style="font-size:0.78rem;color:var(--muted);margin-bottom:12px;"></div>
524
+ <div style="display:grid;gap:14px;">
525
+ <div>
526
+ <div class="section-label">Recent Traces</div>
527
+ <div id="adminTraceList"></div>
528
+ </div>
529
+ <div>
530
+ <div class="section-label">Trace Detail</div>
531
+ <div id="adminTraceDetail"></div>
532
+ </div>
533
+ <div>
534
+ <div class="section-label">Recent Feedback</div>
535
+ <div id="adminFeedbackList"></div>
536
+ </div>
537
+ </div>
538
+ </div>
539
+ </div>
540
  </aside>
541
  <!-- Mobile bottom navigation — must be inside #app for grid to work -->
542
  <div id="mobile-nav">
 
576
  <script src="js/corpus.js"></script>
577
  <script src="js/inspect.js"></script>
578
  <script src="js/chat.js?v=3"></script>
579
+ <script src="js/admin.js?v=1"></script>
580
+ <script src="js/main.js?v=1"></script>
581
  <script>
582
  function mobileNav(tab) {
583
  document
 
615
  }
616
  </script>
617
  </body>
618
+ </html>
frontend/js/admin.js ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Render a small pill badge; `tone` selects the accent colour
// ('danger' = red, 'success' = green, anything else = blue).
function _adminBadge(text, tone = 'muted') {
  let color = '#93c5fd';
  if (tone === 'danger') {
    color = '#fb7185';
  } else if (tone === 'success') {
    color = '#34d399';
  }
  return `<span style="display:inline-block;padding:2px 8px;border:1px solid ${color};border-radius:999px;font-size:0.72rem;color:${color};margin-right:6px;">${esc(text)}</span>`;
}
5
+
6
// Format a list of page numbers for display; 'none' when absent or empty.
function _adminPages(pages) {
  const hasPages = Array.isArray(pages) && pages.length > 0;
  return hasPages ? pages.join(', ') : 'none';
}
10
+
11
// Build the row of signal badges for a trace's quality metrics; optional
// signals only render when present, dangerous ones render in red.
function _adminSignalBadges(quality) {
  let html = _adminBadge(`route ${quality.route_class || 'factoid'}`);
  if (quality.route_reason) html += _adminBadge(`reason ${quality.route_reason}`);
  const identityTone = quality.identity_store_hit ? 'success' : 'muted';
  html += _adminBadge(`identity ${quality.identity_store_hit ? 'hit' : 'miss'}`, identityTone);
  if (quality.history_injected) html += _adminBadge('history injected', 'danger');
  if (quality.memory_injected) html += _adminBadge('memory injected', 'danger');
  if (quality.sanitizer_triggered) {
    html += _adminBadge(`sanitized ${Number(quality.sanitized_token_count || 0)}`, 'danger');
  }
  if (quality.page_scope_required) {
    const scopeTone = quality.page_scope_supported ? 'success' : 'danger';
    html += _adminBadge(`pages ${quality.page_scope_supported ? 'supported' : 'violated'}`, scopeTone);
  }
  return html;
}
22
+
23
// Render up to 8 rerank-audit entries (pre/post rank movement per candidate)
// as HTML cards; falls back to a placeholder when no audit was captured.
function _adminRerankAudit(quality) {
  const deltas = Array.isArray(quality.rerank_deltas) ? quality.rerank_deltas : [];
  if (!deltas.length) return '<div class="confirm-zone">No rerank audit captured.</div>';
  return deltas.slice(0, 8).map(delta => `
    <div style="padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
      <div style="font-weight:600;color:#dbeafe">${esc(delta.chunk_id || delta.source || 'candidate')}</div>
      <div style="font-size:0.78rem;color:#94a3b8;">
        pre ${Number(delta.pre_rank ?? -1)} → post ${Number(delta.post_rank ?? -1)} ·
        branch ${esc(delta.branch || 'unknown')} ·
        score ${Number(delta.score ?? 0).toFixed(2)} ·
        pages ${esc(_adminPages(delta.page_numbers || []))}
      </div>
    </div>
  `).join('');
}
38
+
39
// Render one trace as a summary card for the admin trace list: question,
// route/review state, expert + signal + failure badges, headline metrics,
// and OPEN / MARK REVIEWED / REJECT action buttons.
function _renderTraceSummary(trace) {
  const failures = Array.isArray(trace.failure_modes) ? trace.failure_modes : [];
  const experts = Array.isArray(trace.selected_experts) ? trace.selected_experts : [];
  const quality = trace.quality_metrics || {};
  return `
    <div style="padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);margin-bottom:10px;">
      <div style="display:flex;justify-content:space-between;gap:12px;align-items:flex-start;">
        <div>
          <div style="font-weight:600;color:#e2e8f0;">${esc(trace.question || 'Untitled trace')}</div>
          <div style="font-size:0.78rem;color:#94a3b8;margin-top:4px;">${esc(trace.trace_id || '')}</div>
        </div>
        <div style="font-size:0.76rem;color:#94a3b8;text-align:right;">
          <div>${esc(trace.route_mode || 'default')} · ${esc(quality.route_class || 'factoid')}</div>
          <div>${esc(trace.review_state || 'pending')}</div>
        </div>
      </div>
      <div style="margin-top:10px;">${experts.map(exp => _adminBadge(exp)).join('')}</div>
      <div style="margin-top:8px;">${_adminSignalBadges(quality)}</div>
      <div style="margin-top:8px;">${failures.length ? failures.map(f => _adminBadge(f, 'danger')).join('') : _adminBadge('no failure flags', 'success')}</div>
      <div style="font-size:0.78rem;color:#cbd5e1;margin-top:10px;">
        relevance ${Number(quality.retrieval_relevance_proxy ?? 0).toFixed(2)} ·
        balance ${Number(quality.document_balance ?? 0).toFixed(2)} ·
        thin docs ${Number(quality.thin_doc_count ?? 0)} ·
        pages ${esc(_adminPages(quality.selected_page_numbers || []))}
      </div>
      <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
        <button class="btn-secondary" onclick="selectAdminTrace('${esc(trace.trace_id)}')">OPEN</button>
        <button class="btn-secondary" onclick="reviewAdminTrace('${esc(trace.trace_id)}','reviewed')">MARK REVIEWED</button>
        <button class="btn-danger" onclick="reviewAdminTrace('${esc(trace.trace_id)}','rejected')">REJECT</button>
      </div>
    </div>
  `;
}
72
+
73
// Render the full detail pane for a selected trace: question, failure and
// signal badges, raw expert/quality JSON, rerank audit, per-document
// diagnostics, the answer preview, and any linked feedback rows with their
// review/promote actions. Returns placeholder HTML when no trace is selected.
function _renderTraceDetail(trace, feedbackRows) {
  if (!trace) return '<div class="confirm-zone">No trace selected yet.</div>';
  const quality = trace.quality_metrics || {};
  // Linked feedback cards, each with REVIEW / REJECT / PROMOTE actions.
  const feedbackHtml = (feedbackRows || []).map(row => `
    <div style="padding:10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
      <div style="font-size:0.76rem;color:#94a3b8;">Feedback #${row.id} · ${esc(row.review_state || 'pending')}</div>
      <div style="font-size:0.86rem;color:#e2e8f0;margin-top:4px;">
        helpful=${String(row.helpful)} · accepted=${String(row.accepted)} · reason=${esc(row.reason_code || 'none')}
      </div>
      ${row.correction_text ? `<div style="margin-top:6px;color:#cbd5e1;">${esc(row.correction_text)}</div>` : ''}
      <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
        <button class="btn-secondary" onclick="reviewAdminFeedback(${row.id},'reviewed')">REVIEW</button>
        <button class="btn-danger" onclick="reviewAdminFeedback(${row.id},'rejected')">REJECT</button>
        <button class="btn-primary" onclick="promoteAdminFeedback(${row.id})">PROMOTE TO EVAL</button>
      </div>
    </div>
  `).join('');

  // Per-document retrieval diagnostics captured during the query.
  const diagnostics = (trace.doc_diagnostics || []).map(diag => `
    <div style="padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
      <div style="font-weight:600;color:#dbeafe">${esc(diag.source || diag.file_hash || 'Unknown')}</div>
      <div style="font-size:0.78rem;color:#94a3b8;">${esc(diag.reason || 'unknown')} · support ${esc(diag.support_label || diag.confidence_label || 'unknown')} · candidates ${Number(diag.candidate_count ?? 0)}</div>
    </div>
  `).join('');

  return `
    <div style="padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);">
      <div style="font-size:0.78rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Question</div>
      <div style="color:#e2e8f0;margin-top:6px;">${esc(trace.question || '')}</div>
      <div style="font-size:0.78rem;color:#94a3b8;margin-top:10px;">${esc(trace.trace_id || '')}</div>
      <div style="margin-top:12px;">
        ${(trace.failure_modes || []).map(flag => _adminBadge(flag, 'danger')).join('')}
      </div>
      <div style="margin-top:10px;">${_adminSignalBadges(quality)}</div>
      <div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Trace Signals</div>
      <div style="margin-top:8px;padding:10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);color:#cbd5e1;">
        <div>route ${esc(quality.route_class || 'factoid')} · ${esc(quality.route_reason || 'heuristic_default')}</div>
        <div style="margin-top:4px;">identity store ${esc(quality.identity_store_hit ? 'hit' : 'miss')} · history ${esc(quality.history_injected ? 'yes' : 'no')} · memory ${esc(quality.memory_injected ? 'yes' : 'no')}</div>
        <div style="margin-top:4px;">pages ${esc(_adminPages(quality.selected_page_numbers || []))} · opening candidates ${Number(quality.opening_page_candidate_count ?? 0)} · opening selected ${Number(quality.opening_page_selected_count ?? 0)}</div>
        <div style="margin-top:4px;">page scope ${esc(quality.page_scope_required ? 'required' : 'not required')} · ${esc(quality.page_scope_supported ? 'supported' : 'violated')}</div>
        <div style="margin-top:4px;">sanitizer ${esc(quality.sanitizer_triggered ? 'triggered' : 'clean')} · tokens removed ${Number(quality.sanitized_token_count ?? 0)}</div>
      </div>
      <div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Experts</div>
      <pre style="white-space:pre-wrap;background:rgba(2,6,23,0.9);padding:10px;border-radius:8px;border:1px solid #1e293b;color:#cbd5e1;">${esc(JSON.stringify({
        selected_experts: trace.selected_experts || [],
        expert_weights: trace.expert_weights || {},
        quality_metrics: quality,
        selected_chunk_ids: trace.selected_chunk_ids || [],
      }, null, 2))}</pre>
      <div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Rerank Audit</div>
      ${_adminRerankAudit(quality)}
      <div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Diagnostics</div>
      ${diagnostics || '<div class="confirm-zone">No diagnostics captured.</div>'}
      <div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Answer Preview</div>
      <div style="margin-top:8px;padding:10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);color:#cbd5e1;white-space:pre-wrap;">${esc(trace.answer_preview || '')}</div>
      <div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Linked Feedback</div>
      ${feedbackHtml || '<div class="confirm-zone">No linked feedback yet.</div>'}
    </div>
  `;
}
133
+
134
// Render the "Recent Feedback" list: one card per feedback row with its
// verdict fields, optional correction text, and OPEN TRACE / REVIEW /
// REJECT / PROMOTE actions.
function _renderFeedbackList(rows) {
  if (!rows.length) return '<div class="confirm-zone">No feedback captured yet.</div>';
  return rows.map(row => `
    <div style="padding:10px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);margin-bottom:10px;">
      <div style="font-size:0.76rem;color:#94a3b8;">Feedback #${row.id} · trace ${esc(row.trace_id || '')}</div>
      <div style="color:#e2e8f0;margin-top:4px;">helpful=${String(row.helpful)} · accepted=${String(row.accepted)} · ${esc(row.reason_code || 'no reason')}</div>
      ${row.correction_text ? `<div style="margin-top:6px;color:#cbd5e1;">${esc(row.correction_text)}</div>` : ''}
      <div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
        <button class="btn-secondary" onclick="openAdminFeedbackTrace('${esc(row.trace_id || '')}')">OPEN TRACE</button>
        <button class="btn-secondary" onclick="reviewAdminFeedback(${row.id},'reviewed')">REVIEW</button>
        <button class="btn-danger" onclick="reviewAdminFeedback(${row.id},'rejected')">REJECT</button>
        <button class="btn-primary" onclick="promoteAdminFeedback(${row.id})">PROMOTE</button>
      </div>
    </div>
  `).join('');
}
150
+
151
// Reload the admin review pane: fetch recent traces (with the current
// filter inputs) and recent feedback in parallel, then re-render both lists
// and the selected trace detail (or a placeholder when none is selected).
// Errors are surfaced via toast so inline `onclick` callers never produce
// an unhandled promise rejection.
async function refreshAdminDashboard() {
  if (!STATE.adminUnlocked || !STATE.adminKey) return;
  const params = {
    limit: 20,
    failure_mode: document.getElementById('adminTraceFailure')?.value || '',
    category: document.getElementById('adminTraceCategory')?.value || '',
    route_mode: document.getElementById('adminTraceRoute')?.value || '',
  };
  try {
    const [traceRes, feedbackRes] = await Promise.all([
      apiAdminListTraces(STATE.adminKey, params),
      apiAdminListFeedback(STATE.adminKey, { limit: 20 }),
    ]);
    STATE.adminTraces = traceRes.items || [];
    STATE.adminFeedback = feedbackRes.items || [];
    // Guard each DOM write: the admin view may not be mounted yet.
    const summaryEl = document.getElementById('adminSummary');
    if (summaryEl) {
      summaryEl.textContent =
        `${STATE.adminTraces.length} trace(s), ${STATE.adminFeedback.length} feedback row(s) loaded.`;
    }
    const traceListEl = document.getElementById('adminTraceList');
    if (traceListEl) traceListEl.innerHTML = STATE.adminTraces.map(_renderTraceSummary).join('');
    const feedbackListEl = document.getElementById('adminFeedbackList');
    if (feedbackListEl) feedbackListEl.innerHTML = _renderFeedbackList(STATE.adminFeedback);
    if (STATE.selectedTraceId) {
      await selectAdminTrace(STATE.selectedTraceId);
    } else {
      const detailEl = document.getElementById('adminTraceDetail');
      if (detailEl) detailEl.innerHTML = '<div class="confirm-zone">Select a trace to inspect it.</div>';
    }
  } catch (err) {
    // Previously a failed API call from the REFRESH button's onclick left an
    // unhandled rejection; surface it to the operator instead.
    toast(`Admin dashboard failed: ${err.message}`, 'error');
  }
}
175
+
176
// Load one trace's full detail and render it into the detail pane; records
// the selection in STATE so refreshes keep it open.
async function selectAdminTrace(traceId) {
  if (!STATE.adminUnlocked || !STATE.adminKey || !traceId) return;
  STATE.selectedTraceId = traceId;
  const detail = await apiAdminGetTrace(STATE.adminKey, traceId);
  document.getElementById('adminTraceDetail').innerHTML = _renderTraceDetail(detail.trace, detail.feedback || []);
}
182
+
183
// Jump from a feedback row to its originating trace in the admin view.
async function openAdminFeedbackTrace(traceId) {
  if (!traceId) return;
  await selectAdminTrace(traceId);
  switchView('admin');
}
188
+
189
// Set a trace's review state, optionally attaching reviewer notes, then
// refresh the dashboard so the change is visible immediately.
async function reviewAdminTrace(traceId, reviewState) {
  if (!STATE.adminKey) return;
  // Empty/cancelled prompt is stored as null, not ''.
  const notes = window.prompt(`Notes for ${reviewState}?`, '');
  const payload = { review_state: reviewState, review_notes: notes || null };
  await apiAdminReviewTrace(STATE.adminKey, traceId, payload);
  toast(`Trace marked ${reviewState}.`, 'success');
  await refreshAdminDashboard();
}
199
+
200
// Set a feedback row's review state, optionally attaching reviewer notes,
// then refresh the dashboard so the change is visible immediately.
async function reviewAdminFeedback(feedbackId, reviewState) {
  if (!STATE.adminKey) return;
  // Empty/cancelled prompt is stored as null, not ''.
  const notes = window.prompt(`Notes for ${reviewState}?`, '');
  const payload = { review_state: reviewState, review_notes: notes || null };
  await apiAdminReviewFeedback(STATE.adminKey, feedbackId, payload);
  toast(`Feedback marked ${reviewState}.`, 'success');
  await refreshAdminDashboard();
}
210
+
211
// Promote an approved feedback row into evaluation_datasets, then refresh.
async function promoteAdminFeedback(feedbackId) {
  if (!STATE.adminKey) return;
  await apiAdminPromoteFeedback(STATE.adminKey, feedbackId);
  toast('Feedback promoted to evaluation_datasets.', 'success');
  await refreshAdminDashboard();
}
216
+ }
217
+
218
// Unlock the admin pane: store the operator key, reveal the ADMIN nav tab,
// and kick off the first dashboard refresh (errors surfaced via toast).
function enableAdminReview(adminKey) {
  STATE.adminKey = adminKey;
  STATE.adminUnlocked = true;
  const navButton = document.getElementById('nav-admin');
  if (navButton) {
    navButton.style.display = '';
  }
  refreshAdminDashboard().catch((err) => {
    toast(`Admin dashboard failed: ${err.message}`, 'error');
  });
}
227
+
228
// Expose handlers on window for the inline onclick attributes in index.html.
window.refreshAdminDashboard = refreshAdminDashboard;
window.selectAdminTrace = selectAdminTrace;
window.openAdminFeedbackTrace = openAdminFeedbackTrace;
window.reviewAdminTrace = reviewAdminTrace;
window.reviewAdminFeedback = reviewAdminFeedback;
window.promoteAdminFeedback = promoteAdminFeedback;
window.enableAdminReview = enableAdminReview;
frontend/js/api.js CHANGED
@@ -18,13 +18,38 @@
18
  */
19
  async function getSupabaseToken() {
20
  try {
21
- const { data } = await supabaseClient.auth.getSession();
 
 
22
  return data.session?.access_token ?? null;
23
  } catch {
24
  return null;
25
  }
26
  }
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  // ── Core fetch wrapper ────────────────────────────────────────────────────────
29
  async function apiFetch(path, opts = {}) {
30
  // Always pull a fresh token — Supabase auto-refreshes silently.
@@ -41,7 +66,7 @@ async function apiFetch(path, opts = {}) {
41
 
42
  if (!res.ok) {
43
  let detail = `HTTP ${res.status}`;
44
- try { detail = (await res.json()).detail || detail; } catch {}
45
  throw new Error(detail);
46
  }
47
 
@@ -55,7 +80,7 @@ async function apiVerifyPassword(password) {
55
  // Token injection is handled by apiFetch — no sessionStorage involved.
56
  const data = await apiFetch('/api/v1/auth/verify', {
57
  method: 'POST',
58
- body: JSON.stringify({ password }),
59
  });
60
  return data;
61
  }
@@ -63,7 +88,68 @@ async function apiVerifyPassword(password) {
63
  async function apiVerifyAdmin(key) {
64
  return apiFetch('/api/v1/auth/admin', {
65
  method: 'POST',
66
- body: JSON.stringify({ password: key }),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  });
68
  }
69
 
@@ -75,14 +161,14 @@ async function apiLoadFiles() {
75
  async function apiOverrideCategory(fileHash, newCategory) {
76
  return apiFetch('/api/v1/corpus/recategorise', {
77
  method: 'POST',
78
- body: JSON.stringify({ file_hash: fileHash, new_category: newCategory }),
79
  });
80
  }
81
 
82
  async function apiRenameDocument(fileHash, newName) {
83
  return apiFetch('/api/v1/corpus/rename', {
84
  method: 'POST',
85
- body: JSON.stringify({ file_hash: fileHash, new_name: newName }),
86
  });
87
  }
88
 
@@ -90,6 +176,13 @@ async function apiDeleteDocument(fileHash) {
90
  return apiFetch(`/api/v1/corpus/${fileHash}`, { method: 'DELETE' });
91
  }
92
 
 
 
 
 
 
 
 
93
  // ── Ingest ────────────────────────────────────────────────────────────────────
94
  async function apiIngestFile(file) {
95
  // multipart/form-data — cannot go through apiFetch (no JSON body),
@@ -100,15 +193,15 @@ async function apiIngestFile(file) {
100
  formData.append('file', file);
101
 
102
  const res = await fetch(`${CONFIG.API_URL}/api/v1/ingest/upload`, {
103
- method: 'POST',
104
  headers: token ? { 'X-Auth-Token': token } : {},
105
- body: formData,
106
  });
107
 
108
  if (res.status === 409) throw new Error('already_ingested');
109
  if (!res.ok) {
110
  let detail = `HTTP ${res.status}`;
111
- try { detail = (await res.json()).detail || detail; } catch {}
112
  throw new Error(detail);
113
  }
114
  return res.json();
@@ -119,41 +212,42 @@ async function apiIngestStatus(taskId) {
119
  }
120
 
121
  // ── Query ─────────────────────────────────────────────────────────────────────
122
- async function apiQuery(query, category, history, sessionId, alpha, callbacks) {
123
  /**
124
  * SSE streaming query.
125
  * callbacks = {
126
  * onToken(text) — called for each streamed token
127
- * onDone(sources, images) — called when stream ends
128
  * onError(msg) — called on error
129
  * }
130
  */
131
  const token = await getSupabaseToken(); // ← Supabase JWT
132
 
133
  const res = await fetch(`${CONFIG.API_URL}/api/v1/query`, {
134
- method: 'POST',
135
  headers: {
136
  'Content-Type': 'application/json',
137
  ...(token ? { 'X-Auth-Token': token } : {}),
138
  },
139
  body: JSON.stringify({
140
  query,
141
- category: category || 'All',
142
- history: history || [],
143
- session_id: sessionId || 'default_session',
144
- alpha: alpha ?? 0.5,
 
145
  }),
146
  });
147
 
148
  if (!res.ok) {
149
  let detail = `HTTP ${res.status}`;
150
- try { detail = (await res.json()).detail || detail; } catch {}
151
  throw new Error(detail);
152
  }
153
 
154
- const reader = res.body.getReader();
155
  const decoder = new TextDecoder();
156
- let buffer = '';
157
 
158
  while (true) {
159
  const { done, value } = await reader.read();
@@ -169,10 +263,18 @@ async function apiQuery(query, category, history, sessionId, alpha, callbacks) {
169
  if (!raw) continue;
170
  try {
171
  const event = JSON.parse(raw);
172
- if (event.type === 'token' && callbacks?.onToken) callbacks.onToken(event.content);
173
- else if (event.type === 'done' && callbacks?.onDone) callbacks.onDone(event.sources || [], event.images || []);
174
- else if (event.type === 'error' && callbacks?.onError) callbacks.onError(event.content);
175
- } catch {}
 
 
 
 
 
 
 
 
176
  }
177
  }
178
- }
 
18
  */
19
  async function getSupabaseToken() {
20
  try {
21
+ const client = await initSupabase();
22
+ if (!client?.auth) return null;
23
+ const { data } = await client.auth.getSession();
24
  return data.session?.access_token ?? null;
25
  } catch {
26
  return null;
27
  }
28
  }
29
 
30
+ async function getSupabaseSession() {
31
+ try {
32
+ const client = await initSupabase();
33
+ if (!client?.auth) return null;
34
+ const { data } = await client.auth.getSession();
35
+ return data.session ?? null;
36
+ } catch {
37
+ return null;
38
+ }
39
+ }
40
+
41
+ async function isGuestSession() {
42
+ const session = await getSupabaseSession();
43
+ const appMeta = session?.user?.app_metadata || {};
44
+ const provider = String(appMeta.provider || '').toLowerCase();
45
+ return Boolean(
46
+ session?.user?.is_anonymous ||
47
+ appMeta.is_anonymous ||
48
+ provider === 'anonymous' ||
49
+ (Array.isArray(appMeta.providers) && appMeta.providers.includes('anonymous'))
50
+ );
51
+ }
52
+
53
  // ── Core fetch wrapper ────────────────────────────────────────────────────────
54
  async function apiFetch(path, opts = {}) {
55
  // Always pull a fresh token — Supabase auto-refreshes silently.
 
66
 
67
  if (!res.ok) {
68
  let detail = `HTTP ${res.status}`;
69
+ try { detail = (await res.json()).detail || detail; } catch { }
70
  throw new Error(detail);
71
  }
72
 
 
80
  // Token injection is handled by apiFetch — no sessionStorage involved.
81
  const data = await apiFetch('/api/v1/auth/verify', {
82
  method: 'POST',
83
+ body: JSON.stringify({ password }),
84
  });
85
  return data;
86
  }
 
88
  async function apiVerifyAdmin(key) {
89
  return apiFetch('/api/v1/auth/admin', {
90
  method: 'POST',
91
+ body: JSON.stringify({ password: key }),
92
+ });
93
+ }
94
+
95
+ async function apiCleanupGuestWorkspace() {
96
+ return apiFetch('/api/v1/auth/guest-workspace', {
97
+ method: 'DELETE',
98
+ });
99
+ }
100
+
101
+ async function apiAdminFetch(path, adminKey, opts = {}) {
102
+ if (!adminKey) throw new Error('Admin key required.');
103
+ return apiFetch(path, {
104
+ ...opts,
105
+ headers: {
106
+ 'X-Admin-Key': adminKey,
107
+ ...(opts.headers || {}),
108
+ },
109
+ });
110
+ }
111
+
112
+ async function apiAdminListTraces(adminKey, params = {}) {
113
+ const qs = new URLSearchParams();
114
+ Object.entries(params).forEach(([key, value]) => {
115
+ if (value !== null && value !== undefined && value !== '') qs.set(key, String(value));
116
+ });
117
+ return apiAdminFetch(`/api/v1/admin/traces${qs.toString() ? `?${qs}` : ''}`, adminKey);
118
+ }
119
+
120
+ async function apiAdminGetTrace(adminKey, traceId) {
121
+ return apiAdminFetch(`/api/v1/admin/traces/${traceId}`, adminKey);
122
+ }
123
+
124
+ async function apiAdminReviewTrace(adminKey, traceId, payload) {
125
+ return apiAdminFetch(`/api/v1/admin/traces/${traceId}/review`, adminKey, {
126
+ method: 'POST',
127
+ body: JSON.stringify(payload),
128
+ });
129
+ }
130
+
131
+ async function apiAdminListFeedback(adminKey, params = {}) {
132
+ const qs = new URLSearchParams();
133
+ Object.entries(params).forEach(([key, value]) => {
134
+ if (value !== null && value !== undefined && value !== '') qs.set(key, String(value));
135
+ });
136
+ return apiAdminFetch(`/api/v1/admin/feedback${qs.toString() ? `?${qs}` : ''}`, adminKey);
137
+ }
138
+
139
+ async function apiAdminGetFeedback(adminKey, feedbackId) {
140
+ return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}`, adminKey);
141
+ }
142
+
143
+ async function apiAdminReviewFeedback(adminKey, feedbackId, payload) {
144
+ return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}/review`, adminKey, {
145
+ method: 'POST',
146
+ body: JSON.stringify(payload),
147
+ });
148
+ }
149
+
150
+ async function apiAdminPromoteFeedback(adminKey, feedbackId) {
151
+ return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}/promote`, adminKey, {
152
+ method: 'POST',
153
  });
154
  }
155
 
 
161
  async function apiOverrideCategory(fileHash, newCategory) {
162
  return apiFetch('/api/v1/corpus/recategorise', {
163
  method: 'POST',
164
+ body: JSON.stringify({ file_hash: fileHash, new_category: newCategory }),
165
  });
166
  }
167
 
168
  async function apiRenameDocument(fileHash, newName) {
169
  return apiFetch('/api/v1/corpus/rename', {
170
  method: 'POST',
171
+ body: JSON.stringify({ file_hash: fileHash, new_name: newName }),
172
  });
173
  }
174
 
 
176
  return apiFetch(`/api/v1/corpus/${fileHash}`, { method: 'DELETE' });
177
  }
178
 
179
+ async function apiSubmitAnswerFeedback(payload) {
180
+ return apiFetch('/api/v1/query/feedback', {
181
+ method: 'POST',
182
+ body: JSON.stringify(payload),
183
+ });
184
+ }
185
+
186
  // ── Ingest ────────────────────────────────────────────────────────────────────
187
  async function apiIngestFile(file) {
188
  // multipart/form-data — cannot go through apiFetch (no JSON body),
 
193
  formData.append('file', file);
194
 
195
  const res = await fetch(`${CONFIG.API_URL}/api/v1/ingest/upload`, {
196
+ method: 'POST',
197
  headers: token ? { 'X-Auth-Token': token } : {},
198
+ body: formData,
199
  });
200
 
201
  if (res.status === 409) throw new Error('already_ingested');
202
  if (!res.ok) {
203
  let detail = `HTTP ${res.status}`;
204
+ try { detail = (await res.json()).detail || detail; } catch { }
205
  throw new Error(detail);
206
  }
207
  return res.json();
 
212
  }
213
 
214
  // ── Query ─────────────────────────────────────────────────────────────────────
215
+ async function apiQuery(query, category, history, sessionId, alpha, callbacks, pinnedFiles) {
216
  /**
217
  * SSE streaming query.
218
  * callbacks = {
219
  * onToken(text) — called for each streamed token
220
+ * onDone({ sources, images, traceId, docDiagnostics }) — called when stream ends
221
  * onError(msg) — called on error
222
  * }
223
  */
224
  const token = await getSupabaseToken(); // ← Supabase JWT
225
 
226
  const res = await fetch(`${CONFIG.API_URL}/api/v1/query`, {
227
+ method: 'POST',
228
  headers: {
229
  'Content-Type': 'application/json',
230
  ...(token ? { 'X-Auth-Token': token } : {}),
231
  },
232
  body: JSON.stringify({
233
  query,
234
+ category: category || 'All',
235
+ history: history || [],
236
+ session_id: sessionId || 'default_session',
237
+ alpha: alpha ?? 0.5,
238
+ priority_file_hashes: pinnedFiles || [],
239
  }),
240
  });
241
 
242
  if (!res.ok) {
243
  let detail = `HTTP ${res.status}`;
244
+ try { detail = (await res.json()).detail || detail; } catch { }
245
  throw new Error(detail);
246
  }
247
 
248
+ const reader = res.body.getReader();
249
  const decoder = new TextDecoder();
250
+ let buffer = '';
251
 
252
  while (true) {
253
  const { done, value } = await reader.read();
 
263
  if (!raw) continue;
264
  try {
265
  const event = JSON.parse(raw);
266
+ if (event.type === 'token' && callbacks?.onToken) callbacks.onToken(event.content);
267
+ else if (event.type === 'done' && callbacks?.onDone) {
268
+ callbacks.onDone({
269
+ sources: event.sources || [],
270
+ images: event.images || [],
271
+ traceId: event.trace_id || null,
272
+ docDiagnostics: event.doc_diagnostics || [],
273
+ });
274
+ }
275
+ else if (event.type === 'error' && callbacks?.onError) callbacks.onError(event.content);
276
+ else if (event.type === 'clarification_options' && callbacks?.onOptions) callbacks.onOptions(event.options);
277
+ } catch { }
278
  }
279
  }
280
+ }
frontend/js/chat.js CHANGED
@@ -10,7 +10,7 @@
10
  lb.style.cssText = `display:none;position:fixed;inset:0;background:rgba(0,0,0,0.88);
11
  z-index:9998;align-items:center;justify-content:center;cursor:zoom-out;
12
  backdrop-filter:blur(4px);`;
13
- lb.innerHTML = `
14
  <button id="img-lightbox-close"
15
  onclick="event.stopPropagation(); document.getElementById('img-lightbox').style.display='none'">
16
 
@@ -34,14 +34,14 @@ function renderMarkdown(text) {
34
  let inUL = false;
35
  let inOL = false;
36
 
37
- const closeUL = () => { if (inUL) { html += '</ul>'; inUL = false; } };
38
- const closeOL = () => { if (inOL) { html += '</ol>'; inOL = false; } };
39
 
40
  const inline = (str) => str
41
  .replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
42
  .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
43
- .replace(/\*(.+?)\*/g, '<em>$1</em>')
44
- .replace(/`([^`]+)`/g, '<code class="inline-code">$1</code>')
45
  .replace(/\[Source (\d+)\]/g,
46
  '<span class="source-ref">[S$1]</span>');
47
 
@@ -100,6 +100,74 @@ function renderMarkdown(text) {
100
  return html;
101
  }
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  // ── Chat core ─────────────────────────────────────────────
104
 
105
  // Debounce guard — prevents double-submit on rapid Enter + button click
@@ -111,7 +179,7 @@ async function sendChat() {
111
  _lastSendTime = now;
112
 
113
  const input = document.getElementById('chatInput');
114
- const msg = input.value.trim();
115
  if (!msg || STATE.isThinking) return;
116
  input.value = '';
117
  autoResize(input);
@@ -122,15 +190,15 @@ async function sendChat() {
122
  document.getElementById('chatSend').disabled = true;
123
 
124
  const category = document.getElementById('chatFilterSelect').value;
125
- const history = STATE.chatHistory.slice(-CONFIG.CHAT_HISTORY_TURNS);
126
 
127
  // Create assistant bubble immediately — will be filled by stream
128
  const assistantDiv = appendMsg('assistant', '', [], []);
129
- const bubble = assistantDiv.querySelector('.msg-bubble');
130
- bubble.innerHTML = '<div class="thinking-dots"><span></span><span></span><span></span></div>';
131
 
132
- let fullText = '';
133
- let started = false;
134
 
135
  try {
136
  await apiQuery(msg, category, history, STATE.sessionId, STATE.alpha, {
@@ -142,11 +210,11 @@ async function sendChat() {
142
  fullText += token;
143
  bubble.innerHTML = renderMarkdown(fullText);
144
  // Auto scroll
145
- document.getElementById('chatMessages').scrollTop =
146
- document.getElementById('chatMessages').scrollHeight;
147
- await new Promise(r => setTimeout(r, 0));
148
  },
149
- onDone(sources, images) {
150
  // Finalize markdown render
151
  bubble.innerHTML = renderMarkdown(fullText);
152
  STATE.chatHistory.push({ role: 'assistant', content: fullText });
@@ -162,11 +230,11 @@ async function sendChat() {
162
 
163
  // Append sources
164
  if (visibleSources.length > 0) {
165
- const n = visibleSources.length;
166
  const chips = visibleSources.map(s => {
167
- const score = s.score != null ? Math.round(s.score * 100) : null;
168
  const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
169
- const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
170
  return `<div class="source-chip ${cls}">
171
  <div class="source-chip-header">
172
  <span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
@@ -180,12 +248,18 @@ async function sendChat() {
180
  <button class="sources-toggle" onclick="
181
  const p=this.nextElementSibling;
182
  const open=p.classList.toggle('open');
183
- this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n>1?'s':''}';
184
  ">▼ show ${n} source${n > 1 ? 's' : ''}</button>
185
  <div class="sources-panel">${chips}</div>`;
186
  assistantDiv.appendChild(srcEl);
187
  }
188
 
 
 
 
 
 
 
189
  // Append images
190
  if (images.length > 0) {
191
  const uniqueImages = [...new Set(images)];
@@ -199,6 +273,75 @@ async function sendChat() {
199
  assistantDiv.appendChild(imgEl);
200
  }
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  const el = document.getElementById('chatMessages');
203
  el.scrollTop = el.scrollHeight;
204
  },
@@ -215,7 +358,7 @@ async function sendChat() {
215
  }
216
  bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
217
  },
218
- });
219
  } catch (e) {
220
  bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">Request failed: ${esc(e.message)}</p>`;
221
  } finally {
@@ -225,7 +368,7 @@ async function sendChat() {
225
  }
226
 
227
  function appendMsg(role, text, sources = [], images = []) {
228
- const el = document.getElementById('chatMessages');
229
  const div = document.createElement('div');
230
  div.className = `msg ${role}`;
231
  const n = sources.length;
@@ -237,11 +380,11 @@ function appendMsg(role, text, sources = [], images = []) {
237
  imgHtml = `
238
  <div style="display:flex; flex-direction:row; gap:10px; margin-top:12px; width:100%; overflow-x:auto; padding-bottom:8px;">
239
  ${uniqueImages.map(img => {
240
- const src = img.startsWith('data:') || img.startsWith('http')
241
- ? img
242
- : `data:image/jpeg;base64,${img}`;
243
- return `<img src="${src}" style="max-height: 220px; max-width: 100%; object-fit: contain; border-radius: 8px; background: white; border: 1px solid #334155; cursor: zoom-in;" onclick="openLightbox(this.src)">`;
244
- }).join('')}
245
  </div>`;
246
  }
247
 
@@ -249,9 +392,9 @@ function appendMsg(role, text, sources = [], images = []) {
249
  let srcHtml = '';
250
  if (n > 0) {
251
  const chips = sources.map(s => {
252
- const score = s.score != null ? Math.round(s.score * 100) : null;
253
  const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
254
- const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
255
  return `<div class="source-chip ${cls}">
256
  <div class="source-chip-header">
257
  <span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
@@ -264,7 +407,7 @@ function appendMsg(role, text, sources = [], images = []) {
264
  <button class="sources-toggle" onclick="
265
  const p=this.nextElementSibling;
266
  const open=p.classList.toggle('open');
267
- this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n>1?'s':''}';
268
  ">▼ show ${n} source${n > 1 ? 's' : ''}</button>
269
  <div class="sources-panel">${chips}</div>`;
270
  }
@@ -284,7 +427,7 @@ function appendMsg(role, text, sources = [], images = []) {
284
  }
285
 
286
  function appendThinking() {
287
- const el = document.getElementById('chatMessages');
288
  const div = document.createElement('div');
289
  div.className = 'msg assistant';
290
  div.innerHTML = `
 
10
  lb.style.cssText = `display:none;position:fixed;inset:0;background:rgba(0,0,0,0.88);
11
  z-index:9998;align-items:center;justify-content:center;cursor:zoom-out;
12
  backdrop-filter:blur(4px);`;
13
+ lb.innerHTML = `
14
  <button id="img-lightbox-close"
15
  onclick="event.stopPropagation(); document.getElementById('img-lightbox').style.display='none'">
16
 
 
34
  let inUL = false;
35
  let inOL = false;
36
 
37
+ const closeUL = () => { if (inUL) { html += '</ul>'; inUL = false; } };
38
+ const closeOL = () => { if (inOL) { html += '</ol>'; inOL = false; } };
39
 
40
  const inline = (str) => str
41
  .replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
42
  .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
43
+ .replace(/\*(.+?)\*/g, '<em>$1</em>')
44
+ .replace(/`([^`]+)`/g, '<code class="inline-code">$1</code>')
45
  .replace(/\[Source (\d+)\]/g,
46
  '<span class="source-ref">[S$1]</span>');
47
 
 
100
  return html;
101
  }
102
 
103
+ function renderDocDiagnostics(docDiagnostics) {
104
+ if (!Array.isArray(docDiagnostics) || docDiagnostics.length === 0) return '';
105
+ const rows = docDiagnostics.map(diag => {
106
+ const score = diag.doc_score != null ? `${Math.round(diag.doc_score * 100)}%` : 'n/a';
107
+ const reason = diag.reason || 'unknown';
108
+ const status = diag.included ? 'included' : 'excluded';
109
+ return `
110
+ <div style="display:flex;justify-content:space-between;gap:12px;padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
111
+ <div>
112
+ <div style="font-weight:600;color:#dbeafe">${esc(diag.source || diag.file_hash || 'Unknown')}</div>
113
+ <div style="font-size:0.85em;color:#94a3b8">${esc(status)} · ${esc(reason)} · candidates ${Number(diag.candidate_count ?? 0)}</div>
114
+ </div>
115
+ <div style="font-size:0.85em;color:#cbd5e1;white-space:nowrap">${esc(diag.confidence_label || 'unknown')} · ${esc(score)}</div>
116
+ </div>
117
+ `;
118
+ }).join('');
119
+ return `
120
+ <div style="margin-top:12px;padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);">
121
+ <div style="font-size:0.8em;letter-spacing:0.14em;text-transform:uppercase;color:#7dd3fc;">Retrieval Diagnostics</div>
122
+ ${rows}
123
+ </div>
124
+ `;
125
+ }
126
+
127
+ function attachFeedbackControls(container, traceId) {
128
+ if (!traceId) return;
129
+ const bar = document.createElement('div');
130
+ bar.style.cssText = 'display:flex;flex-wrap:wrap;gap:8px;margin-top:12px;';
131
+
132
+ const disableAll = () => {
133
+ Array.from(bar.querySelectorAll('button')).forEach(btn => { btn.disabled = true; btn.style.opacity = '0.65'; });
134
+ };
135
+
136
+ const makeBtn = (label, handler) => {
137
+ const btn = document.createElement('button');
138
+ btn.textContent = label;
139
+ btn.style.cssText = 'background:rgba(255,255,255,0.05);border:1px solid #334155;color:var(--fg);padding:7px 12px;border-radius:8px;font-size:0.85em;cursor:pointer;';
140
+ btn.onclick = async () => {
141
+ try {
142
+ await handler();
143
+ disableAll();
144
+ toast('Feedback saved.', 'success');
145
+ } catch (err) {
146
+ toast(err?.message || 'Could not save feedback.', 'error');
147
+ }
148
+ };
149
+ return btn;
150
+ };
151
+
152
+ bar.appendChild(makeBtn('Helpful', async () => {
153
+ await apiSubmitAnswerFeedback({ trace_id: traceId, helpful: true });
154
+ }));
155
+ bar.appendChild(makeBtn('Not Helpful', async () => {
156
+ const note = window.prompt('What went wrong? You can add a short reason or a correction.', '') || '';
157
+ await apiSubmitAnswerFeedback({
158
+ trace_id: traceId,
159
+ helpful: false,
160
+ reason_code: note ? 'user_reported_issue' : 'needs_improvement',
161
+ correction_text: note || null,
162
+ });
163
+ }));
164
+ bar.appendChild(makeBtn('Save Answer', async () => {
165
+ await apiSubmitAnswerFeedback({ trace_id: traceId, helpful: true, accepted: true });
166
+ }));
167
+
168
+ container.appendChild(bar);
169
+ }
170
+
171
  // ── Chat core ─────────────────────────────────────────────
172
 
173
  // Debounce guard — prevents double-submit on rapid Enter + button click
 
179
  _lastSendTime = now;
180
 
181
  const input = document.getElementById('chatInput');
182
+ const msg = input.value.trim();
183
  if (!msg || STATE.isThinking) return;
184
  input.value = '';
185
  autoResize(input);
 
190
  document.getElementById('chatSend').disabled = true;
191
 
192
  const category = document.getElementById('chatFilterSelect').value;
193
+ const history = STATE.chatHistory.slice(-CONFIG.CHAT_HISTORY_TURNS);
194
 
195
  // Create assistant bubble immediately — will be filled by stream
196
  const assistantDiv = appendMsg('assistant', '', [], []);
197
+ const bubble = assistantDiv.querySelector('.msg-bubble');
198
+ bubble.innerHTML = '<div class="thinking-dots"><span></span><span></span><span></span></div>';
199
 
200
+ let fullText = '';
201
+ let started = false;
202
 
203
  try {
204
  await apiQuery(msg, category, history, STATE.sessionId, STATE.alpha, {
 
210
  fullText += token;
211
  bubble.innerHTML = renderMarkdown(fullText);
212
  // Auto scroll
213
+ document.getElementById('chatMessages').scrollTop =
214
+ document.getElementById('chatMessages').scrollHeight;
215
+ await new Promise(r => setTimeout(r, 0));
216
  },
217
+ onDone({ sources, images, traceId, docDiagnostics }) {
218
  // Finalize markdown render
219
  bubble.innerHTML = renderMarkdown(fullText);
220
  STATE.chatHistory.push({ role: 'assistant', content: fullText });
 
230
 
231
  // Append sources
232
  if (visibleSources.length > 0) {
233
+ const n = visibleSources.length;
234
  const chips = visibleSources.map(s => {
235
+ const score = s.score != null ? Math.round(s.score * 100) : null;
236
  const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
237
+ const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
238
  return `<div class="source-chip ${cls}">
239
  <div class="source-chip-header">
240
  <span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
 
248
  <button class="sources-toggle" onclick="
249
  const p=this.nextElementSibling;
250
  const open=p.classList.toggle('open');
251
+ this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n > 1 ? 's' : ''}';
252
  ">▼ show ${n} source${n > 1 ? 's' : ''}</button>
253
  <div class="sources-panel">${chips}</div>`;
254
  assistantDiv.appendChild(srcEl);
255
  }
256
 
257
+ if (docDiagnostics && docDiagnostics.length > 0) {
258
+ const diagEl = document.createElement('div');
259
+ diagEl.innerHTML = renderDocDiagnostics(docDiagnostics);
260
+ assistantDiv.appendChild(diagEl);
261
+ }
262
+
263
  // Append images
264
  if (images.length > 0) {
265
  const uniqueImages = [...new Set(images)];
 
273
  assistantDiv.appendChild(imgEl);
274
  }
275
 
276
+ attachFeedbackControls(assistantDiv, traceId);
277
+
278
+ const el = document.getElementById('chatMessages');
279
+ el.scrollTop = el.scrollHeight;
280
+ },
281
+ onOptions(options) {
282
+ // Render inline choice buttons
283
+ const btnContainer = document.createElement('div');
284
+ btnContainer.style.cssText = 'display:flex;flex-direction:row;flex-wrap:wrap;gap:8px;margin-top:12px;';
285
+
286
+ const syncGraphPinStyles = () => {
287
+ const d3 = window.d3;
288
+ if (!d3) return;
289
+
290
+ d3.selectAll('.node')
291
+ .filter(d => d && d.type === 'document')
292
+ .select('circle')
293
+ .attr('stroke', d => STATE.pinnedFiles.includes(d.file_hash) ? '#ffffff' : d.color)
294
+ .attr('stroke-width', d => STATE.pinnedFiles.includes(d.file_hash) ? 3 : 1.5)
295
+ .attr('filter', d => {
296
+ if (!STATE.pinnedFiles.includes(d.file_hash)) return null;
297
+ const idx = STATE.categories.indexOf(d.category);
298
+ return idx >= 0 ? `url(#glow-${idx})` : null;
299
+ });
300
+ };
301
+
302
+ options.forEach(opt => {
303
+ const btn = document.createElement('button');
304
+ btn.textContent = opt.label;
305
+ btn.style.cssText = `
306
+ background: rgba(255, 255, 255, 0.05);
307
+ border: 1px solid #334155;
308
+ color: var(--fg);
309
+ padding: 8px 16px;
310
+ border-radius: 6px;
311
+ font-size: 0.9em;
312
+ cursor: pointer;
313
+ transition: all 0.2s;
314
+ `;
315
+ btn.onmouseover = () => {
316
+ btn.style.background = 'rgba(255, 255, 255, 0.1)';
317
+ btn.style.borderColor = 'var(--text-glow)';
318
+ };
319
+ btn.onmouseout = () => {
320
+ btn.style.background = 'rgba(255, 255, 255, 0.05)';
321
+ btn.style.borderColor = '#334155';
322
+ };
323
+
324
+ btn.onclick = () => {
325
+ // 1) Apply selected routing scope (single-doc or multi-doc)
326
+ const selectedHashes = opt.mode === 'all'
327
+ ? (Array.isArray(opt.file_hashes) ? opt.file_hashes.filter(Boolean) : [])
328
+ : (opt.file_hash ? [opt.file_hash] : []);
329
+ STATE.pinnedFiles = [...new Set(selectedHashes)];
330
+ syncGraphPinStyles();
331
+
332
+ // 2. Hide the buttons
333
+ btnContainer.style.display = 'none';
334
+
335
+ // 3. Resubmit the query now that it has a pin
336
+ const input = document.getElementById('chatInput');
337
+ input.value = msg; // original msg
338
+ document.getElementById('chatSend').click();
339
+ };
340
+ btnContainer.appendChild(btn);
341
+ });
342
+
343
+ assistantDiv.appendChild(btnContainer);
344
+
345
  const el = document.getElementById('chatMessages');
346
  el.scrollTop = el.scrollHeight;
347
  },
 
358
  }
359
  bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
360
  },
361
+ }, STATE.pinnedFiles);
362
  } catch (e) {
363
  bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">Request failed: ${esc(e.message)}</p>`;
364
  } finally {
 
368
  }
369
 
370
  function appendMsg(role, text, sources = [], images = []) {
371
+ const el = document.getElementById('chatMessages');
372
  const div = document.createElement('div');
373
  div.className = `msg ${role}`;
374
  const n = sources.length;
 
380
  imgHtml = `
381
  <div style="display:flex; flex-direction:row; gap:10px; margin-top:12px; width:100%; overflow-x:auto; padding-bottom:8px;">
382
  ${uniqueImages.map(img => {
383
+ const src = img.startsWith('data:') || img.startsWith('http')
384
+ ? img
385
+ : `data:image/jpeg;base64,${img}`;
386
+ return `<img src="${src}" style="max-height: 220px; max-width: 100%; object-fit: contain; border-radius: 8px; background: white; border: 1px solid #334155; cursor: zoom-in;" onclick="openLightbox(this.src)">`;
387
+ }).join('')}
388
  </div>`;
389
  }
390
 
 
392
  let srcHtml = '';
393
  if (n > 0) {
394
  const chips = sources.map(s => {
395
+ const score = s.score != null ? Math.round(s.score * 100) : null;
396
  const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
397
+ const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
398
  return `<div class="source-chip ${cls}">
399
  <div class="source-chip-header">
400
  <span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
 
407
  <button class="sources-toggle" onclick="
408
  const p=this.nextElementSibling;
409
  const open=p.classList.toggle('open');
410
+ this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n > 1 ? 's' : ''}';
411
  ">▼ show ${n} source${n > 1 ? 's' : ''}</button>
412
  <div class="sources-panel">${chips}</div>`;
413
  }
 
427
  }
428
 
429
  function appendThinking() {
430
+ const el = document.getElementById('chatMessages');
431
  const div = document.createElement('div');
432
  div.className = 'msg assistant';
433
  div.innerHTML = `
frontend/js/config.js CHANGED
@@ -2,14 +2,48 @@ const CONFIG = {
2
  API_URL: '',
3
  CAT_PALETTE: ['#00ff88','#4a9eff','#f5a623','#ff6b9d','#a78bfa','#34d399','#fb923c','#60a5fa'],
4
  CHAT_HISTORY_TURNS: 6,
 
5
  };
6
 
7
  // Supabase client — keys loaded from backend, never hardcoded here
8
  let supabaseClient = null;
 
9
 
10
  async function initSupabase() {
11
- const res = await fetch('/api/v1/config');
12
- const cfg = await res.json();
13
- const { createClient } = supabase;
14
- supabaseClient = createClient(cfg.supabase_url, cfg.supabase_anon);
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  API_URL: '',
3
  CAT_PALETTE: ['#00ff88','#4a9eff','#f5a623','#ff6b9d','#a78bfa','#34d399','#fb923c','#60a5fa'],
4
  CHAT_HISTORY_TURNS: 6,
5
+ GUEST_ENABLED: true,
6
  };
7
 
8
  // Supabase client — keys loaded from backend, never hardcoded here
9
  let supabaseClient = null;
10
+ let supabaseReady = null;
11
 
12
  async function initSupabase() {
13
+ if (supabaseClient?.auth) return supabaseClient;
14
+ if (supabaseReady) return supabaseReady;
15
+
16
+ supabaseReady = (async () => {
17
+ try {
18
+ const res = await fetch('/api/v1/config', { cache: 'no-store' });
19
+ if (!res.ok) {
20
+ throw new Error(`Config endpoint failed (${res.status})`);
21
+ }
22
+
23
+ const cfg = await res.json();
24
+ const createClient = window.supabase?.createClient;
25
+ if (typeof createClient !== 'function') {
26
+ throw new Error('Supabase browser SDK failed to load.');
27
+ }
28
+ if (!cfg?.supabase_url || !cfg?.supabase_anon) {
29
+ throw new Error('Supabase frontend config is missing.');
30
+ }
31
+
32
+ CONFIG.GUEST_ENABLED = cfg?.guest_enabled !== false;
33
+ const client = createClient(cfg.supabase_url, cfg.supabase_anon);
34
+ if (!client?.auth) {
35
+ throw new Error('Supabase auth client failed to initialize.');
36
+ }
37
+
38
+ supabaseClient = client;
39
+ window.supabaseClient = client;
40
+ return client;
41
+ } catch (err) {
42
+ supabaseClient = null;
43
+ supabaseReady = null;
44
+ throw err;
45
+ }
46
+ })();
47
+
48
+ return supabaseReady;
49
+ }
frontend/js/corpus.js CHANGED
@@ -3,6 +3,30 @@
3
  * Document list, upload (real FastAPI call), category review.
4
  */
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  // ── Doc list ──────────────────────────────────────────────────────────────────
7
  function renderDocList() {
8
  const el = document.getElementById('docList');
@@ -81,6 +105,7 @@ async function processUpload(file) {
81
  try {
82
  const queued = await apiIngestFile(file);
83
  // queued = {task_id, filename, message}
 
84
 
85
  setProgress(20, 'Queued — processing in background…');
86
 
@@ -92,20 +117,71 @@ async function processUpload(file) {
92
 
93
  setProgress(100, 'Complete!');
94
  setTimeout(() => pc.classList.remove('visible'), 1500);
 
95
 
96
- if (result && result.file_hash) {
 
 
97
  showCategoryReview(result.file_hash, result.filename, result.document_type);
98
  }
99
  await refreshCorpus();
100
 
101
  } catch (err) {
102
- pc.classList.remove('visible');
 
 
 
103
  if (err.message === 'already_ingested') toast('Already ingested — skipped', 'error');
104
  else toast('Ingestion failed: ' + err.message, 'error');
105
  }
106
  document.getElementById('fileInput').value = '';
107
  }
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  async function pollIngestStatus(taskId, onProgress) {
110
  // No hard timeout — poll until COMPLETED or FAILED.
111
  // A large PDF with AI vision summaries can take 5-10 minutes on free-tier
@@ -218,4 +294,4 @@ function populateFilterDropdowns() {
218
  const sel = document.getElementById('chatFilterSelect');
219
  sel.innerHTML = '<option value="All">All Categories</option>' +
220
  STATE.categories.map(c => `<option value="${c}">${c.replace(/_/g,' ')}</option>`).join('');
221
- }
 
3
  * Document list, upload (real FastAPI call), category review.
4
  */
5
 
6
// localStorage bookkeeping for an in-flight ingestion task so a page
// reload can reattach to the same backend task instead of re-uploading.
const ACTIVE_INGEST_KEY = 'morpheus_active_ingest';
let ACTIVE_INGEST_PROMISE = null;

// Persist the task we are currently polling (plus when we saved it).
function saveActiveIngest(taskId, filename) {
  const record = { taskId, filename, savedAt: Date.now() };
  localStorage.setItem(ACTIVE_INGEST_KEY, JSON.stringify(record));
}

// Read the persisted task; tolerate missing keys, corrupt JSON, and
// storage access being blocked entirely.
function loadActiveIngest() {
  try {
    const raw = localStorage.getItem(ACTIVE_INGEST_KEY);
    if (!raw) return null;
    return JSON.parse(raw);
  } catch {
    return null;
  }
}

// Forget the persisted task (ingestion finished or failed terminally).
function clearActiveIngest() {
  localStorage.removeItem(ACTIVE_INGEST_KEY);
}
29
+
30
  // ── Doc list ──────────────────────────────────────────────────────────────────
31
  function renderDocList() {
32
  const el = document.getElementById('docList');
 
105
  try {
106
  const queued = await apiIngestFile(file);
107
  // queued = {task_id, filename, message}
108
+ saveActiveIngest(queued.task_id, queued.filename || file.name);
109
 
110
  setProgress(20, 'Queued — processing in background…');
111
 
 
117
 
118
  setProgress(100, 'Complete!');
119
  setTimeout(() => pc.classList.remove('visible'), 1500);
120
+ clearActiveIngest();
121
 
122
+ if (result && result.recovered_existing) {
123
+ toast('Recovered previous upload without recomputing.', 'success');
124
+ } else if (result && result.file_hash) {
125
  showCategoryReview(result.file_hash, result.filename, result.document_type);
126
  }
127
  await refreshCorpus();
128
 
129
  } catch (err) {
130
+ if (err.message === 'already_ingested' || err.message === 'Ingestion failed') {
131
+ clearActiveIngest();
132
+ pc.classList.remove('visible');
133
+ }
134
  if (err.message === 'already_ingested') toast('Already ingested — skipped', 'error');
135
  else toast('Ingestion failed: ' + err.message, 'error');
136
  }
137
  document.getElementById('fileInput').value = '';
138
  }
139
 
140
// Reattach to a previously started ingestion after a page reload.
// Returns null when nothing is pending; otherwise shares a single
// in-flight promise so repeated calls never double-poll the backend.
async function resumeActiveIngestionIfNeeded() {
  if (ACTIVE_INGEST_PROMISE) return ACTIVE_INGEST_PROMISE;

  const active = loadActiveIngest();
  if (!active || !active.taskId) return null;

  const card = document.getElementById('progressCard');
  card.classList.add('visible');
  document.getElementById('progressFilename').textContent =
    active.filename || 'Uploading PDF';
  setProgress(25, 'Reconnecting to active ingestion…');

  const run = async () => {
    try {
      const result = await pollIngestStatus(active.taskId, (step, total, msg) => {
        setProgress(Math.round((step / total) * 80) + 20, msg);
      });

      clearActiveIngest();
      setProgress(100, 'Complete!');
      setTimeout(() => card.classList.remove('visible'), 1500);

      if (result && result.recovered_existing) {
        toast('Recovered previous upload without recomputing.', 'success');
      } else if (result && result.file_hash) {
        showCategoryReview(result.file_hash, result.filename, result.document_type);
      }

      await refreshCorpus();
      return result;
    } catch (err) {
      // Terminal outcomes clear the saved task; transient errors keep it
      // so a later reload can try to reattach again.
      const terminal =
        err.message === 'already_ingested' || err.message === 'Ingestion failed';
      if (terminal) {
        clearActiveIngest();
        card.classList.remove('visible');
      }
      if (err.message === 'already_ingested') {
        await refreshCorpus();
      }
      throw err;
    } finally {
      ACTIVE_INGEST_PROMISE = null;
    }
  };

  ACTIVE_INGEST_PROMISE = run();
  return ACTIVE_INGEST_PROMISE;
}
184
+
185
  async function pollIngestStatus(taskId, onProgress) {
186
  // No hard timeout — poll until COMPLETED or FAILED.
187
  // A large PDF with AI vision summaries can take 5-10 minutes on free-tier
 
294
  const sel = document.getElementById('chatFilterSelect');
295
  sel.innerHTML = '<option value="All">All Categories</option>' +
296
  STATE.categories.map(c => `<option value="${c}">${c.replace(/_/g,' ')}</option>`).join('');
297
+ }
frontend/js/graph.js CHANGED
@@ -17,10 +17,10 @@
17
  */
18
 
19
  function renderGraph() {
20
- const svg = d3.select('#graph-svg');
21
  const panel = document.getElementById('graph-panel');
22
- const W = panel.clientWidth;
23
- const H = panel.clientHeight;
24
  const empty = document.getElementById('graph-empty');
25
 
26
  svg.selectAll('*').remove();
@@ -37,12 +37,12 @@ function renderGraph() {
37
 
38
  STATE.categories.forEach(cat => {
39
  nodes.push({
40
- id: `cat::${cat}`,
41
- type: 'category',
42
  label: cat.replace(/_/g, ' '),
43
- raw: cat,
44
  color: STATE.catColors[cat],
45
- r: 26,
46
  pinned: false,
47
  count: STATE.files.filter(f => (f.document_type || 'uncategorised') === cat).length,
48
  });
@@ -51,26 +51,26 @@ function renderGraph() {
51
  STATE.files.forEach(f => {
52
  const cat = f.document_type || 'uncategorised';
53
  nodes.push({
54
- id: `doc::${f.file_hash}`,
55
- type: 'document',
56
- label: f.filename,
57
  file_hash: f.file_hash,
58
- category: cat,
59
- color: STATE.catColors[cat] || '#4a9eff',
60
- r: 7,
61
- pinned: false,
62
- chunks: f.chunk_count,
63
- ingested: (f.ingested_at || '').slice(0, 10),
64
  });
65
  links.push({ source: `cat::${cat}`, target: `doc::${f.file_hash}` });
66
  });
67
 
68
  // ── Zoom + pan ─────────────────────────────────────────
69
- const g = svg.append('g');
70
  const zoom = d3.zoom()
71
- .scaleExtent([0.3, 3])
72
- // scroll to zoom only, no drag-to-pan
73
- .on('zoom', e => g.attr('transform', e.transform));
74
  svg.call(zoom).on('dblclick.zoom', null);
75
  STATE.svgZoom = { zoom, svg };
76
 
@@ -102,29 +102,56 @@ function renderGraph() {
102
  .style('cursor', 'pointer')
103
  .call(d3.drag()
104
  .on('start', (e, d) => {
105
- if (!e.active) STATE.simulation.alphaTarget(0.3).restart();
106
- d.fx = d.x; d.fy = d.y;
107
- d._lastX = d.x; d._lastY = d.y;
108
- })
109
  .on('drag', (e, d) => {
110
- d._vx = e.x - (d._lastX || e.x);
111
- d._vy = e.y - (d._lastY || e.y);
112
- d._lastX = e.x; d._lastY = e.y;
113
- d.fx = e.x; d.fy = e.y;
114
- })
115
  .on('end', (e, d) => {
116
- if (!e.active) STATE.simulation.alphaTarget(0.05);
117
- if (!d.pinned) {
118
- d.fx = null; d.fy = null;
119
- d.vx = (d._vx || 0) * 3;
120
- d.vy = (d._vy || 0) * 3;
121
- STATE.simulation.alphaTarget(0.3).restart();
122
- setTimeout(() => STATE.simulation.alphaTarget(0.05), 2000);
123
- }
124
- })
125
  )
126
  .on('click', (event, d) => {
127
  event.stopPropagation();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  onNodeClick(d);
129
  })
130
  .on('contextmenu', (event, d) => {
@@ -149,7 +176,7 @@ function renderGraph() {
149
  node.filter(d => d.type === 'category')
150
  .append('circle')
151
  .attr('r', 26)
152
- .attr('fill', d => d.color + '18')
153
  .attr('stroke', d => d.color)
154
  .attr('stroke-width', 2)
155
  .attr('filter', d => {
@@ -170,9 +197,14 @@ function renderGraph() {
170
  node.filter(d => d.type === 'document')
171
  .append('circle')
172
  .attr('r', 7)
173
- .attr('fill', d => d.color + '55')
174
- .attr('stroke', d => d.color)
175
- .attr('stroke-width', 1.5);
 
 
 
 
 
176
 
177
  // Labels
178
  node.append('text')
@@ -180,14 +212,14 @@ function renderGraph() {
180
  .attr('dy', d => d.type === 'category' ? -32 : -12)
181
  .attr('text-anchor', 'middle')
182
  .attr('fill', d => d.type === 'category' ? d.color : 'rgba(200,216,244,0.7)')
183
- .attr('font-size', d => d.type === 'category' ? '10px' : '8px')
184
  .attr('font-family', 'Syne Mono, monospace')
185
  .attr('font-weight', d => d.type === 'category' ? '600' : '400')
186
  .text(d => trunc(d.label, d.type === 'category' ? 18 : 16))
187
  .style('pointer-events', 'none')
188
  .style('user-select', 'none');
189
 
190
- svg.on('click', () => {});
191
 
192
  // ── Simulation — Obsidian style ────────────────────────
193
  STATE.simulation = d3.forceSimulation(nodes)
@@ -207,25 +239,25 @@ function renderGraph() {
207
  .alphaDecay(0.02)
208
  .velocityDecay(0.4)
209
  .on('tick', () => {
210
- const liveW = document.getElementById('graph-panel').clientWidth;
211
- const liveH = document.getElementById('graph-panel').clientHeight;
212
- nodes.forEach(d => {
213
- if (d.fx == null) {
214
- const pad = 40;
215
- if (d.x < pad) { d.x = pad; d.vx = Math.abs(d.vx) * 0.7; }
216
- if (d.x > liveW - pad) { d.x = liveW - pad; d.vx = -Math.abs(d.vx) * 0.7; }
217
- if (d.y < pad) { d.y = pad; d.vy = Math.abs(d.vy) * 0.7; }
218
- if (d.y > liveH - pad) { d.y = liveH - pad; d.vy = -Math.abs(d.vy) * 0.7; }
219
- }
220
- });
221
- link
222
- .attr('x1', d => d.source.x).attr('y1', d => d.source.y)
223
- .attr('x2', d => d.target.x).attr('y2', d => d.target.y);
224
- node.attr('transform', d => `translate(${d.x},${d.y})`);
225
-
226
- const maxV = Math.max(...nodes.map(d => Math.abs(d.vx||0) + Math.abs(d.vy||0)));
227
- if (maxV > 0.5) STATE.simulation.alphaTarget(0.1).restart();
228
- });
229
 
230
  setTimeout(() => STATE.simulation.alphaTarget(0.05), 3000);
231
  }
@@ -352,7 +384,7 @@ function setupGraphObservers() {
352
  }
353
  });
354
  mo.observe(panel, {
355
- attributes: true,
356
  attributeFilter: ['style', 'class'],
357
  });
358
 
@@ -366,7 +398,7 @@ function setupGraphObservers() {
366
  if (W && H) graphReheat();
367
  });
368
  moParent.observe(panel.parentElement, {
369
- attributes: true,
370
  attributeFilter: ['style', 'class'],
371
  });
372
  }
@@ -377,4 +409,4 @@ function setupGraphObservers() {
377
  window.addEventListener('resize', () => graphReheat());
378
  }
379
 
380
- setupGraphObservers();
 
17
  */
18
 
19
  function renderGraph() {
20
+ const svg = d3.select('#graph-svg');
21
  const panel = document.getElementById('graph-panel');
22
+ const W = panel.clientWidth;
23
+ const H = panel.clientHeight;
24
  const empty = document.getElementById('graph-empty');
25
 
26
  svg.selectAll('*').remove();
 
37
 
38
  STATE.categories.forEach(cat => {
39
  nodes.push({
40
+ id: `cat::${cat}`,
41
+ type: 'category',
42
  label: cat.replace(/_/g, ' '),
43
+ raw: cat,
44
  color: STATE.catColors[cat],
45
+ r: 26,
46
  pinned: false,
47
  count: STATE.files.filter(f => (f.document_type || 'uncategorised') === cat).length,
48
  });
 
51
  STATE.files.forEach(f => {
52
  const cat = f.document_type || 'uncategorised';
53
  nodes.push({
54
+ id: `doc::${f.file_hash}`,
55
+ type: 'document',
56
+ label: f.filename,
57
  file_hash: f.file_hash,
58
+ category: cat,
59
+ color: STATE.catColors[cat] || '#4a9eff',
60
+ r: 7,
61
+ pinned: false,
62
+ chunks: f.chunk_count,
63
+ ingested: (f.ingested_at || '').slice(0, 10),
64
  });
65
  links.push({ source: `cat::${cat}`, target: `doc::${f.file_hash}` });
66
  });
67
 
68
  // ── Zoom + pan ─────────────────────────────────────────
69
+ const g = svg.append('g');
70
  const zoom = d3.zoom()
71
+ .scaleExtent([0.3, 3])
72
+ // scroll to zoom only, no drag-to-pan
73
+ .on('zoom', e => g.attr('transform', e.transform));
74
  svg.call(zoom).on('dblclick.zoom', null);
75
  STATE.svgZoom = { zoom, svg };
76
 
 
102
  .style('cursor', 'pointer')
103
  .call(d3.drag()
104
  .on('start', (e, d) => {
105
+ if (!e.active) STATE.simulation.alphaTarget(0.3).restart();
106
+ d.fx = d.x; d.fy = d.y;
107
+ d._lastX = d.x; d._lastY = d.y;
108
+ })
109
  .on('drag', (e, d) => {
110
+ d._vx = e.x - (d._lastX || e.x);
111
+ d._vy = e.y - (d._lastY || e.y);
112
+ d._lastX = e.x; d._lastY = e.y;
113
+ d.fx = e.x; d.fy = e.y;
114
+ })
115
  .on('end', (e, d) => {
116
+ if (!e.active) STATE.simulation.alphaTarget(0.05);
117
+ if (!d.pinned) {
118
+ d.fx = null; d.fy = null;
119
+ d.vx = (d._vx || 0) * 3;
120
+ d.vy = (d._vy || 0) * 3;
121
+ STATE.simulation.alphaTarget(0.3).restart();
122
+ setTimeout(() => STATE.simulation.alphaTarget(0.05), 2000);
123
+ }
124
+ })
125
  )
126
  .on('click', (event, d) => {
127
  event.stopPropagation();
128
+
129
+ if (d.type === 'document') {
130
+ // Toggle this document's file_hash in the pinned set
131
+ const idx = STATE.pinnedFiles.indexOf(d.file_hash);
132
+ if (idx >= 0) {
133
+ STATE.pinnedFiles.splice(idx, 1);
134
+ } else {
135
+ STATE.pinnedFiles.push(d.file_hash);
136
+ }
137
+ // Visual: bright white stroke when pinned, original colour when not
138
+ node.filter(n => n && n.type === 'document').select('circle')
139
+ .attr('stroke', n => STATE.pinnedFiles.includes(n.file_hash) ? '#ffffff' : n.color)
140
+ .attr('stroke-width', n => STATE.pinnedFiles.includes(n.file_hash) ? 3 : 1.5)
141
+ .attr('filter', n => {
142
+ if (!STATE.pinnedFiles.includes(n.file_hash)) return null;
143
+ const glowIdx = STATE.categories.indexOf(n.category);
144
+ return glowIdx >= 0 ? `url(#glow-${glowIdx})` : null;
145
+ });
146
+ } else if (d.type === 'category') {
147
+ // Clicking a category node clears ALL pins
148
+ STATE.pinnedFiles = [];
149
+ node.filter(n => n && n.type === 'document').select('circle')
150
+ .attr('stroke', n => n.color)
151
+ .attr('stroke-width', 1.5)
152
+ .attr('filter', null);
153
+ }
154
+
155
  onNodeClick(d);
156
  })
157
  .on('contextmenu', (event, d) => {
 
176
  node.filter(d => d.type === 'category')
177
  .append('circle')
178
  .attr('r', 26)
179
+ .attr('fill', d => d.color + '18')
180
  .attr('stroke', d => d.color)
181
  .attr('stroke-width', 2)
182
  .attr('filter', d => {
 
197
  node.filter(d => d.type === 'document')
198
  .append('circle')
199
  .attr('r', 7)
200
+ .attr('fill', d => d.color + '55')
201
+ .attr('stroke', d => STATE.pinnedFiles.includes(d.file_hash) ? '#ffffff' : d.color)
202
+ .attr('stroke-width', d => STATE.pinnedFiles.includes(d.file_hash) ? 3 : 1.5)
203
+ .attr('filter', d => {
204
+ if (!STATE.pinnedFiles.includes(d.file_hash)) return null;
205
+ const glowIdx = STATE.categories.indexOf(d.category);
206
+ return glowIdx >= 0 ? `url(#glow-${glowIdx})` : null;
207
+ });
208
 
209
  // Labels
210
  node.append('text')
 
212
  .attr('dy', d => d.type === 'category' ? -32 : -12)
213
  .attr('text-anchor', 'middle')
214
  .attr('fill', d => d.type === 'category' ? d.color : 'rgba(200,216,244,0.7)')
215
+ .attr('font-size', d => d.type === 'category' ? '10px' : '8px')
216
  .attr('font-family', 'Syne Mono, monospace')
217
  .attr('font-weight', d => d.type === 'category' ? '600' : '400')
218
  .text(d => trunc(d.label, d.type === 'category' ? 18 : 16))
219
  .style('pointer-events', 'none')
220
  .style('user-select', 'none');
221
 
222
+ svg.on('click', () => { });
223
 
224
  // ── Simulation — Obsidian style ────────────────────────
225
  STATE.simulation = d3.forceSimulation(nodes)
 
239
  .alphaDecay(0.02)
240
  .velocityDecay(0.4)
241
  .on('tick', () => {
242
+ const liveW = document.getElementById('graph-panel').clientWidth;
243
+ const liveH = document.getElementById('graph-panel').clientHeight;
244
+ nodes.forEach(d => {
245
+ if (d.fx == null) {
246
+ const pad = 40;
247
+ if (d.x < pad) { d.x = pad; d.vx = Math.abs(d.vx) * 0.7; }
248
+ if (d.x > liveW - pad) { d.x = liveW - pad; d.vx = -Math.abs(d.vx) * 0.7; }
249
+ if (d.y < pad) { d.y = pad; d.vy = Math.abs(d.vy) * 0.7; }
250
+ if (d.y > liveH - pad) { d.y = liveH - pad; d.vy = -Math.abs(d.vy) * 0.7; }
251
+ }
252
+ });
253
+ link
254
+ .attr('x1', d => d.source.x).attr('y1', d => d.source.y)
255
+ .attr('x2', d => d.target.x).attr('y2', d => d.target.y);
256
+ node.attr('transform', d => `translate(${d.x},${d.y})`);
257
+
258
+ const maxV = Math.max(...nodes.map(d => Math.abs(d.vx || 0) + Math.abs(d.vy || 0)));
259
+ if (maxV > 0.5) STATE.simulation.alphaTarget(0.1).restart();
260
+ });
261
 
262
  setTimeout(() => STATE.simulation.alphaTarget(0.05), 3000);
263
  }
 
384
  }
385
  });
386
  mo.observe(panel, {
387
+ attributes: true,
388
  attributeFilter: ['style', 'class'],
389
  });
390
 
 
398
  if (W && H) graphReheat();
399
  });
400
  moParent.observe(panel.parentElement, {
401
+ attributes: true,
402
  attributeFilter: ['style', 'class'],
403
  });
404
  }
 
409
  window.addEventListener('resize', () => graphReheat());
410
  }
411
 
412
+ setupGraphObservers();
frontend/js/main.js CHANGED
@@ -6,17 +6,126 @@
6
  * On success, supabase-js stores the session in localStorage automatically.
7
  * getSupabaseToken() in api.js reads it on every request.
8
  *
9
- * The daily-password system is kept ONLY for the admin panel (getting today's
10
- * code). It no longer gates the main app Supabase JWT does that now.
11
  *
12
- * Set AUTH_DISABLED = true to skip login during local dev.
 
13
  */
14
 
15
- const AUTH_DISABLED = false; // set false in production
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  window.addEventListener('DOMContentLoaded', async () => {
18
  try {
19
- await initSupabase();
 
20
 
21
  if (AUTH_DISABLED) {
22
  showApp();
@@ -34,27 +143,71 @@ window.addEventListener('DOMContentLoaded', async () => {
34
  // once with INITIAL_SESSION (with or without a session), then again on
35
  // SIGNED_IN / SIGNED_OUT. No polling, no timeouts.
36
  let booted = false;
37
- supabaseClient.auth.onAuthStateChange((event, session) => {
38
- if (event === 'INITIAL_SESSION') {
39
- if (session) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  booted = true;
41
  showApp();
42
  bootApp();
43
- } else {
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  showLogin();
45
  }
46
- } else if (event === 'SIGNED_IN' && !booted) {
47
- booted = true;
48
- showApp();
49
- bootApp();
50
- } else if (event === 'SIGNED_OUT') {
51
- booted = false;
52
  showLogin();
53
- }
54
  });
55
 
56
  } catch (err) {
57
  console.error("Boot failed:", err);
 
 
58
  showLogin();
59
  }
60
  });
@@ -86,7 +239,12 @@ async function submitLogin() {
86
  err.textContent = '';
87
 
88
  try {
89
- const {error } = await supabaseClient.auth.signInWithPassword({
 
 
 
 
 
90
  email,
91
  password: pw,
92
  });
@@ -94,12 +252,14 @@ async function submitLogin() {
94
  if (error) {
95
  err.textContent = error.message || 'Invalid credentials.';
96
  btn.disabled = false;
97
- btn.textContent = 'UNLOCK →';
98
  return;
99
  }
100
  // EXPLICIT UI TAKEOVER:
101
  // Wait 500ms to guarantee local storage has the token, then force the system online.
102
  STATE.authenticated = true;
 
 
103
  showApp();
104
 
105
  setTimeout(() => {
@@ -111,7 +271,60 @@ async function submitLogin() {
111
  } catch (e) {
112
  err.textContent = 'Server unreachable: ' + e.message;
113
  btn.disabled = false;
114
- btn.textContent = 'UNLOCK →';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  }
116
  }
117
 
@@ -163,7 +376,12 @@ async function submitSignup() {
163
  btn.textContent = 'CREATING ACCOUNT…';
164
 
165
  try {
166
- const { data, error } = await supabaseClient.auth.signUp({ email, password: pw });
 
 
 
 
 
167
 
168
  if (error) {
169
  err.textContent = error.message || 'Sign-up failed.';
@@ -187,55 +405,43 @@ async function submitSignup() {
187
  }
188
  }
189
 
190
- // ── Admin panel daily code (unchanged, still uses master key) ───────────────
191
- async function submitAdmin() {
192
- const key = document.getElementById('adminKey').value.trim();
193
- if (!key) return;
194
  try {
195
  const res = await apiVerifyAdmin(key);
196
  if (res.valid) {
197
- document.getElementById('adminResult').textContent =
198
- `Today's code: ${res.token}`;
199
- document.getElementById('auth-toggle-panel').style.display = 'block';
200
- const locked = localStorage.getItem('nexus_auth_locked') !== 'false';
201
- updateToggleUI(locked);
 
 
 
 
 
 
 
 
 
 
 
202
  } else {
203
- document.getElementById('adminResult').textContent = 'Invalid admin key.';
204
  }
205
  } catch (e) {
206
- document.getElementById('adminResult').textContent = 'Error: ' + e.message;
207
  }
 
208
  }
209
 
210
- // ── Auth toggle (admin only) ──────────────────────────────────────────────────
211
- function updateToggleUI(locked) {
212
- const btn = document.getElementById('auth-toggle-btn');
213
- const label = document.getElementById('auth-toggle-label');
214
- if (locked) {
215
- btn.textContent = 'DISABLE AUTH';
216
- btn.style.background = 'rgba(255,71,87,0.15)';
217
- btn.style.borderColor = 'var(--red)';
218
- btn.style.color = 'var(--red)';
219
- label.textContent = 'Auth is ON — users must sign in';
220
- } else {
221
- btn.textContent = 'ENABLE AUTH';
222
- btn.style.background = 'rgba(0,255,136,0.08)';
223
- btn.style.borderColor = 'var(--phosphor)';
224
- btn.style.color = 'var(--phosphor)';
225
- label.textContent = 'Auth is OFF — anyone can access';
226
- }
227
- }
228
-
229
- function toggleAuth() {
230
- const current = localStorage.getItem('nexus_auth_locked') !== 'false';
231
- const next = !current;
232
- localStorage.setItem('nexus_auth_locked', next ? 'true' : 'false');
233
- updateToggleUI(next);
234
- toast(
235
- next ? 'Auth enabled — sign-in required on next visit'
236
- : 'Auth disabled — open access',
237
- next ? 'error' : 'success',
238
- );
239
  }
240
 
241
  function handleLoginKey(e) {
@@ -247,10 +453,38 @@ function handleLoginKey(e) {
247
 
248
  // ── Sign out ──────────────────────────────────────────────────────────────────
249
  async function signOut() {
250
- await supabaseClient.auth.signOut();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  STATE.authenticated = false;
 
 
 
 
 
 
252
  STATE.files = [];
253
  STATE.categories = [];
 
 
 
 
 
254
  showLogin();
255
  authTab('signin');
256
  }
@@ -262,7 +496,22 @@ async function bootApp() {
262
  setOnline(true);
263
  try {
264
  await refreshCorpus();
265
- switchView('corpus');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  } catch (e) {
267
  setOnline(false);
268
  toast('Could not reach backend: ' + e.message, 'error');
@@ -292,4 +541,4 @@ async function refreshCorpus() {
292
  }
293
  };
294
  }, 50);
295
- })();
 
6
  * On success, supabase-js stores the session in localStorage automatically.
7
  * getSupabaseToken() in api.js reads it on every request.
8
  *
9
+ * Legacy daily-password UI has been removed. Supabase JWT gates the main app,
10
+ * while the admin key only unlocks operator review tools.
11
  *
12
+ * AUTH_DISABLED is a local-dev escape hatch only.
13
+ * Product guest access should use Supabase anonymous sessions instead.
14
  */
15
 
16
const AUTH_DISABLED = false; // local dev only — keep false in real use
const GUEST_PERSIST_KEY = 'morpheus_guest_persist';
const GUEST_TAB_KEY = 'morpheus_guest_tab_alive';
const GUEST_LAST_SEEN_KEY = 'morpheus_guest_last_seen_at';
const GUEST_ACTIVITY_WINDOW_MS = 45000;
let guestHeartbeatTimer = null;

// True when the user opted to keep their guest workspace across visits.
function shouldPersistGuestWorkspace() {
  return localStorage.getItem(GUEST_PERSIST_KEY) === '1';
}

// Record the keep/discard choice and mirror it into app state.
function setGuestPersistPreference(keep) {
  const flag = keep ? '1' : '0';
  localStorage.setItem(GUEST_PERSIST_KEY, flag);
  STATE.guestPersist = Boolean(keep);
}

// Per-tab marker: survives reloads within this tab, not new tabs.
function markGuestTabAlive() {
  sessionStorage.setItem(GUEST_TAB_KEY, '1');
}

// Wipe both the tab marker and the cross-tab heartbeat timestamp.
function clearGuestSessionMarkers() {
  sessionStorage.removeItem(GUEST_TAB_KEY);
  localStorage.removeItem(GUEST_LAST_SEEN_KEY);
}

function hasGuestTabMarker() {
  return sessionStorage.getItem(GUEST_TAB_KEY) === '1';
}

// Cross-tab heartbeat: epoch-ms timestamp of last guest activity.
function touchGuestHeartbeat() {
  localStorage.setItem(GUEST_LAST_SEEN_KEY, String(Date.now()));
}

// True when some tab reported guest activity inside the activity window.
function hasRecentGuestHeartbeat() {
  const lastSeen = Number(localStorage.getItem(GUEST_LAST_SEEN_KEY) || 0);
  if (!Number.isFinite(lastSeen) || lastSeen <= 0) return false;
  return (Date.now() - lastSeen) < GUEST_ACTIVITY_WINDOW_MS;
}
53
+
54
// Start (or restart) the periodic guest heartbeat: one immediate touch,
// then every 15s for as long as a guest session remains active.
function startGuestHeartbeat() {
  stopGuestHeartbeat();
  touchGuestHeartbeat();
  guestHeartbeatTimer = window.setInterval(() => {
    if (STATE.isGuest) touchGuestHeartbeat();
  }, 15000);
}

// Idempotent teardown of the heartbeat interval.
function stopGuestHeartbeat() {
  if (guestHeartbeatTimer) {
    clearInterval(guestHeartbeatTimer);
    guestHeartbeatTimer = null;
  }
}
68
+
69
// Show or hide the guest-mode UI depending on backend-reported availability.
function setGuestControlsVisibility() {
  const enabled = Boolean(CONFIG.GUEST_ENABLED);

  // Apply a shown/hidden display value to an element, if it exists.
  const apply = (id, shownDisplay) => {
    const el = document.getElementById(id);
    if (el) el.style.display = enabled ? shownDisplay : 'none';
  };

  apply('guestBtn', '');
  apply('guestInfo', 'block');
  apply('guestPersistWrap', 'block');
}
78
+
79
// Classify the current session as GUEST or ACCOUNT, then update both the
// shared state flags and the session-mode pill in the header.
function setSessionMode(session) {
  const meta = session?.user?.app_metadata || {};
  const providerName = String(meta.provider || '').toLowerCase();
  const providerList = Array.isArray(meta.providers) ? meta.providers : [];

  // Supabase reports anonymity in several places depending on SDK version.
  const anonymous =
    Boolean(session?.user?.is_anonymous) ||
    Boolean(meta.is_anonymous) ||
    providerName === 'anonymous' ||
    providerList.includes('anonymous');

  STATE.isGuest = anonymous;
  STATE.guestPersist = anonymous ? shouldPersistGuestWorkspace() : false;

  const pill = document.getElementById('session-mode-pill');
  if (pill) pill.style.display = anonymous ? '' : 'none';
  const label = document.getElementById('session-mode-label');
  if (label) label.textContent = anonymous ? 'GUEST' : 'ACCOUNT';
}
95
+
96
// A restored session counts as a stale temporary-guest resume when the
// user never asked to keep the workspace AND neither this tab's marker
// nor a recent cross-tab heartbeat shows the guest session is still live.
function isTemporaryGuestResume(session) {
  if (!session) return false;
  if (!STATE.isGuest || STATE.guestPersist) return false;
  return !(hasGuestTabMarker() || hasRecentGuestHeartbeat());
}
100
+
101
// Tear down an expired temporary guest session: best-effort backend
// cleanup, best-effort sign-out, wipe local markers, then return the
// user to the login view with an explanatory message.
async function expireTemporaryGuestSession(client) {
  try {
    await apiCleanupGuestWorkspace();
  } catch {
    // best effort only
  }
  try {
    await client.auth.signOut();
  } catch {
    // best effort only
  }

  localStorage.removeItem(GUEST_PERSIST_KEY);
  clearGuestSessionMarkers();

  STATE.isGuest = false;
  STATE.guestPersist = false;
  setSessionMode(null);
  showLogin();

  const info = document.getElementById('loginInfo');
  if (info) {
    info.textContent = 'Temporary guest workspace expired after the previous guest session ended.';
    info.style.display = 'block';
  }
}
124
 
125
  window.addEventListener('DOMContentLoaded', async () => {
126
  try {
127
+ const client = await initSupabase();
128
+ setGuestControlsVisibility();
129
 
130
  if (AUTH_DISABLED) {
131
  showApp();
 
143
  // once with INITIAL_SESSION (with or without a session), then again on
144
  // SIGNED_IN / SIGNED_OUT. No polling, no timeouts.
145
  let booted = false;
146
+ client.auth.onAuthStateChange((event, session) => {
147
+ const handle = async () => {
148
+ if (event === 'INITIAL_SESSION') {
149
+ if (session) {
150
+ setSessionMode(session);
151
+ if (isTemporaryGuestResume(session)) {
152
+ booted = false;
153
+ await expireTemporaryGuestSession(client);
154
+ return;
155
+ }
156
+ if (STATE.isGuest) {
157
+ markGuestTabAlive();
158
+ startGuestHeartbeat();
159
+ } else {
160
+ stopGuestHeartbeat();
161
+ }
162
+ booted = true;
163
+ showApp();
164
+ bootApp();
165
+ } else {
166
+ stopGuestHeartbeat();
167
+ STATE.isGuest = false;
168
+ STATE.guestPersist = false;
169
+ showLogin();
170
+ }
171
+ } else if (event === 'SIGNED_IN' && !booted) {
172
+ setSessionMode(session);
173
+ if (STATE.isGuest) {
174
+ markGuestTabAlive();
175
+ startGuestHeartbeat();
176
+ } else {
177
+ stopGuestHeartbeat();
178
+ }
179
  booted = true;
180
  showApp();
181
  bootApp();
182
+ } else if (event === 'SIGNED_IN') {
183
+ setSessionMode(session);
184
+ if (STATE.isGuest) {
185
+ markGuestTabAlive();
186
+ startGuestHeartbeat();
187
+ } else {
188
+ stopGuestHeartbeat();
189
+ }
190
+ } else if (event === 'SIGNED_OUT') {
191
+ booted = false;
192
+ stopGuestHeartbeat();
193
+ STATE.isGuest = false;
194
+ STATE.guestPersist = false;
195
+ setSessionMode(null);
196
  showLogin();
197
  }
198
+ };
199
+
200
+ handle().catch(err => {
201
+ console.error('Auth transition failed:', err);
202
+ stopGuestHeartbeat();
 
203
  showLogin();
204
+ });
205
  });
206
 
207
  } catch (err) {
208
  console.error("Boot failed:", err);
209
+ const errEl = document.getElementById('loginError');
210
+ if (errEl) errEl.textContent = 'Auth init failed: ' + err.message;
211
  showLogin();
212
  }
213
  });
 
239
  err.textContent = '';
240
 
241
  try {
242
+ const client = await initSupabase();
243
+ if (!client?.auth) {
244
+ throw new Error('Supabase auth client is unavailable.');
245
+ }
246
+
247
+ const {error } = await client.auth.signInWithPassword({
248
  email,
249
  password: pw,
250
  });
 
252
  if (error) {
253
  err.textContent = error.message || 'Invalid credentials.';
254
  btn.disabled = false;
255
+ btn.textContent = 'SIGN IN →';
256
  return;
257
  }
258
  // EXPLICIT UI TAKEOVER:
259
  // Wait 500ms to guarantee local storage has the token, then force the system online.
260
  STATE.authenticated = true;
261
+ const session = await getSupabaseSession();
262
+ setSessionMode(session);
263
  showApp();
264
 
265
  setTimeout(() => {
 
271
  } catch (e) {
272
  err.textContent = 'Server unreachable: ' + e.message;
273
  btn.disabled = false;
274
+ btn.textContent = 'SIGN IN →';
275
+ }
276
+ }
277
+
278
// Start an anonymous (guest) Supabase session and boot the app.
// Honors the "keep workspace" checkbox, and translates the SDK's
// "anonymous sign-ins disabled" error into a friendlier message.
async function submitGuest() {
  const btn = document.getElementById('guestBtn');
  const err = document.getElementById('loginError');
  const info = document.getElementById('loginInfo');
  const keepWorkspace = Boolean(document.getElementById('guestPersist')?.checked);

  err.textContent = '';
  if (info) {
    info.style.display = 'none';
    info.textContent = '';
  }

  btn.disabled = true;
  btn.textContent = 'STARTING GUEST WORKSPACE…';

  try {
    const client = await initSupabase();
    if (!client?.auth) {
      throw new Error('Supabase auth client is unavailable.');
    }

    const { error } = await client.auth.signInAnonymously();
    if (error) throw error;

    setGuestPersistPreference(keepWorkspace);
    setSessionMode(await getSupabaseSession());
    markGuestTabAlive();
    startGuestHeartbeat();

    STATE.authenticated = true;
    showApp();

    // Short delay so supabase-js has flushed the session to storage.
    setTimeout(() => {
      setOnline(true);
      bootApp();
      toast(
        keepWorkspace
          ? 'Guest workspace ready. It will stay on this device until you end it.'
          : 'Temporary guest workspace ready. It will expire after the guest session truly ends.',
        'success',
      );
    }, 300);
  } catch (e) {
    err.textContent = e?.message || 'Could not start guest workspace.';
    if (/anonymous/i.test(err.textContent)) {
      err.textContent = 'Guest mode is disabled in Supabase Auth settings.';
    }
  } finally {
    btn.disabled = false;
    btn.textContent = 'CONTINUE AS GUEST';
  }
}
 
 
376
  btn.textContent = 'CREATING ACCOUNT…';
377
 
378
  try {
379
+ const client = await initSupabase();
380
+ if (!client?.auth) {
381
+ throw new Error('Supabase auth client is unavailable.');
382
+ }
383
+
384
+ const { data, error } = await client.auth.signUp({ email, password: pw });
385
 
386
  if (error) {
387
  err.textContent = error.message || 'Sign-up failed.';
 
405
  }
406
  }
407
 
408
+ // ── Operator tools unlock ──────────────────────────────────────────────────────
409
+ async function submitAdmin(adminKey) {
410
+ const key = String(adminKey || '').trim();
411
+ if (!key) return false;
412
  try {
413
  const res = await apiVerifyAdmin(key);
414
  if (res.valid) {
415
+ if (typeof window.enableAdminReview === 'function') {
416
+ window.enableAdminReview(key);
417
+ STATE.adminPendingView = true;
418
+ if (document.getElementById('app')?.style.display !== 'none') {
419
+ switchView('admin');
420
+ } else {
421
+ const info = document.getElementById('loginInfo');
422
+ if (info) {
423
+ info.textContent = 'Admin dashboard unlocked. Sign in to open it.';
424
+ info.style.display = 'block';
425
+ }
426
+ }
427
+ } else {
428
+ toast('Admin dashboard assets are stale. Hard refresh with Ctrl+Shift+R.', 'error');
429
+ }
430
+ return true;
431
  } else {
432
+ toast('Invalid operator key.', 'error');
433
  }
434
  } catch (e) {
435
+ toast('Operator unlock failed: ' + e.message, 'error');
436
  }
437
+ return false;
438
  }
439
 
440
+ async function unlockOperatorTools() {
441
+ const key = window.prompt('Enter operator key to open review tools:', '') || '';
442
+ if (!key.trim()) return;
443
+ const ok = await submitAdmin(key);
444
+ if (ok) toast('Operator tools unlocked.', 'success');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  }
446
 
447
  function handleLoginKey(e) {
 
453
 
454
  // ── Sign out ──────────────────────────────────────────────────────────────────
455
  async function signOut() {
456
+ const client = await initSupabase();
457
+ if (!client?.auth) {
458
+ throw new Error('Supabase auth client is unavailable.');
459
+ }
460
+ if (STATE.isGuest) {
461
+ if (STATE.guestPersist) {
462
+ const shouldEnd = window.confirm(
463
+ 'This guest workspace is set to stay on this device. Click OK to end and delete it now, or Cancel to keep it and just close the tab later.'
464
+ );
465
+ if (!shouldEnd) return;
466
+ }
467
+ try {
468
+ await apiCleanupGuestWorkspace();
469
+ } catch (err) {
470
+ toast('Guest workspace cleanup failed: ' + err.message, 'error');
471
+ }
472
+ }
473
+ await client.auth.signOut();
474
  STATE.authenticated = false;
475
+ STATE.isGuest = false;
476
+ STATE.guestPersist = false;
477
+ stopGuestHeartbeat();
478
+ clearGuestSessionMarkers();
479
+ localStorage.removeItem(GUEST_PERSIST_KEY);
480
+ setSessionMode(null);
481
  STATE.files = [];
482
  STATE.categories = [];
483
+ STATE.adminUnlocked = false;
484
+ STATE.adminKey = '';
485
+ STATE.adminPendingView = false;
486
+ const navAdmin = document.getElementById('nav-admin');
487
+ if (navAdmin) navAdmin.style.display = 'none';
488
  showLogin();
489
  authTab('signin');
490
  }
 
496
  setOnline(true);
497
  try {
498
  await refreshCorpus();
499
+ if (typeof resumeActiveIngestionIfNeeded === 'function') {
500
+ resumeActiveIngestionIfNeeded().catch(err => {
501
+ console.warn('Ingestion resume failed:', err?.message || err);
502
+ });
503
+ }
504
+ if (STATE.adminUnlocked && STATE.adminPendingView) {
505
+ switchView('admin');
506
+ STATE.adminPendingView = false;
507
+ if (typeof refreshAdminDashboard === 'function') {
508
+ refreshAdminDashboard().catch(err => {
509
+ toast('Admin dashboard failed: ' + err.message, 'error');
510
+ });
511
+ }
512
+ } else {
513
+ switchView('corpus');
514
+ }
515
  } catch (e) {
516
  setOnline(false);
517
  toast('Could not reach backend: ' + e.message, 'error');
 
541
  }
542
  };
543
  }, 50);
544
+ })();
frontend/js/state.js CHANGED
@@ -3,19 +3,28 @@
3
  * Single source of truth. All data flows through api.js, never direct Supabase.
4
  */
5
  const STATE = {
6
- authenticated: false,
7
- files: [],
8
- categories: [],
9
- catColors: {},
10
- simulation: null,
11
- svgZoom: null,
12
- selectedNode: null,
13
  deleteConfirmed: false,
14
- pendingReview: null,
15
- chatHistory: [],
16
- isThinking: false,
17
- sessionId: crypto.randomUUID(),
18
  alpha: 0.5,
 
 
 
 
 
 
 
 
 
19
  };
20
 
21
  function stateRefreshCategories() {
@@ -29,7 +38,7 @@ function stateRefreshCategories() {
29
  }
30
 
31
  async function stateLoadCorpus() {
32
- const data = await apiLoadFiles();
33
  STATE.files = data.files || [];
34
  stateRefreshCategories();
35
  document.getElementById('stat-docs').textContent = STATE.files.length;
 
3
  * Single source of truth. All data flows through api.js, never direct Supabase.
4
  */
5
  const STATE = {
6
+ authenticated: false,
7
+ files: [],
8
+ categories: [],
9
+ catColors: {},
10
+ simulation: null,
11
+ svgZoom: null,
12
+ selectedNode: null,
13
  deleteConfirmed: false,
14
+ pendingReview: null,
15
+ chatHistory: [],
16
+ isThinking: false,
17
+ sessionId: crypto.randomUUID(),
18
  alpha: 0.5,
19
+ pinnedFiles: [], // file_hashes of graph-pinned documents
20
+ adminKey: '',
21
+ adminUnlocked: false,
22
+ adminTraces: [],
23
+ adminFeedback: [],
24
+ selectedTraceId: null,
25
+ adminPendingView: false,
26
+ isGuest: false,
27
+ guestPersist: false,
28
  };
29
 
30
  function stateRefreshCategories() {
 
38
  }
39
 
40
  async function stateLoadCorpus() {
41
+ const data = await apiLoadFiles();
42
  STATE.files = data.files || [];
43
  stateRefreshCategories();
44
  document.getElementById('stat-docs').textContent = STATE.files.length;
recent_changes.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -22,4 +22,5 @@ celery[redis]
22
  scikit-learn
23
  joblib
24
  sentence-transformers
25
- python-magic
 
 
22
  scikit-learn
23
  joblib
24
  sentence-transformers
25
+ python-magic
26
+ pytest
scripts/rebuild_pageindex.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rebuild the PageIndex (document_trees) for an already-ingested PDF.
3
+
4
+ Why this exists:
5
+ - Ingestion deletes the uploaded temp PDF after processing.
6
+ - PageIndex behavior evolves (better TOC handling, page_numbers, etc.).
7
+ - You may want to refresh only the structural index without re-embedding/re-uploading chunks.
8
+
9
+ Usage (PowerShell):
10
+ conda activate rag_env
11
+ python scripts/rebuild_pageindex.py --pdf "C:\path\to\file.pdf" --access-token "<JWT>"
12
+
13
+ Notes:
14
+ - This only rewrites `document_trees` (and optionally `identity_json` if you choose to extend it).
15
+ - It does NOT touch the vector store, RAPTOR summaries, or ingested_files registry.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import os
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ # Ensure repo root is on sys.path so `import backend...` works when executed as a script.
26
+ REPO_ROOT = Path(__file__).resolve().parents[1]
27
+ if str(REPO_ROOT) not in sys.path:
28
+ sys.path.insert(0, str(REPO_ROOT))
29
+
30
+ from backend.core.pipeline import (
31
+ _build_document_tree,
32
+ _build_service_supabase_client,
33
+ get_file_fingerprint,
34
+ partition_document,
35
+ )
36
+ from backend.core.auth_utils import extract_jwt_sub
37
+
38
+
39
+ def main() -> int:
40
+ parser = argparse.ArgumentParser(description="Rebuild PageIndex tree for a PDF.")
41
+ parser.add_argument("--pdf", required=True, help="Path to local PDF file.")
42
+ parser.add_argument(
43
+ "--access-token",
44
+ required=False,
45
+ default=None,
46
+ help="User JWT (same X-Auth-Token used by the API). Optional if --user-id is provided.",
47
+ )
48
+ parser.add_argument(
49
+ "--user-id",
50
+ required=False,
51
+ default=None,
52
+ help="Supabase auth user_id (sub). Use this if you don't want to paste a JWT.",
53
+ )
54
+ args = parser.parse_args()
55
+
56
+ pdf_path = os.path.abspath(args.pdf)
57
+ if not os.path.exists(pdf_path):
58
+ raise SystemExit(f"PDF not found: {pdf_path}")
59
+
60
+ if args.user_id:
61
+ user_id = str(args.user_id).strip()
62
+ elif args.access_token:
63
+ user_id = extract_jwt_sub(args.access_token)
64
+ else:
65
+ raise SystemExit("Provide either --user-id or --access-token.")
66
+ file_hash = get_file_fingerprint(pdf_path)
67
+
68
+ elements = partition_document(pdf_path)
69
+ doc_tree = _build_document_tree(elements)
70
+
71
+ sb = _build_service_supabase_client()
72
+ sb.table("document_trees").upsert(
73
+ {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
74
+ on_conflict="user_id,file_hash",
75
+ ).execute()
76
+
77
+ print(f"Rebuilt PageIndex tree for file_hash={file_hash} user_id={user_id}")
78
+ return 0
79
+
80
+
81
+ if __name__ == "__main__":
82
+ raise SystemExit(main())
83
+
shared/types.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
2
  from pydantic import BaseModel, Field
3
 
4
  class IngestResponse(BaseModel):
@@ -24,16 +24,53 @@ class ChatMessage(BaseModel):
24
  role: str; content: str
25
 
26
  class QueryRequest(BaseModel):
27
- query: str; category: str = "All"
28
- history: List[ChatMessage] = Field(default_factory=list); k: int = 3
 
 
29
  session_id: str = "default_session"
30
- alpha: float = 0.5
 
31
 
32
  class SourceChunk(BaseModel):
33
  source: str; score: Optional[float]=None; chunk: Optional[int | str] = None
34
  snippet: Optional[str]=None; doc_type: Optional[str]=None
35
  pages: Optional[List[int]]=None
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  class QueryResponse(BaseModel):
38
  answer: str; sources: List[SourceChunk] = Field(default_factory=list)
39
  images: List[str] = []
 
1
+ from typing import Any, Dict, List, Optional
2
  from pydantic import BaseModel, Field
3
 
4
  class IngestResponse(BaseModel):
 
24
  role: str; content: str
25
 
26
  class QueryRequest(BaseModel):
27
+ query: str
28
+ category: str = "All"
29
+ history: List[ChatMessage] = Field(default_factory=list)
30
+ k: int = 3
31
  session_id: str = "default_session"
32
+ alpha: float = 0.5
33
+ priority_file_hashes: List[str] = Field(default_factory=list)
34
 
35
  class SourceChunk(BaseModel):
36
  source: str; score: Optional[float]=None; chunk: Optional[int | str] = None
37
  snippet: Optional[str]=None; doc_type: Optional[str]=None
38
  pages: Optional[List[int]]=None
39
 
40
+ class DocDiagnostic(BaseModel):
41
+ file_hash: str
42
+ source: str
43
+ included: bool = True
44
+ candidate_count: int = 0
45
+ doc_score: Optional[float] = None
46
+ confidence_label: Optional[str] = None
47
+ reason: Optional[str] = None
48
+ support_label: Optional[str] = None
49
+ thin_doc: Optional[bool] = None
50
+
51
+ class QueryTrace(BaseModel):
52
+ trace_id: str
53
+ query: str
54
+ session_id: str
55
+ route_mode: str
56
+ selected_experts: List[str] = Field(default_factory=list)
57
+ expert_weights: Dict[str, float] = Field(default_factory=dict)
58
+ pinned_file_hashes: List[str] = Field(default_factory=list)
59
+ candidate_counts: Dict[str, int] = Field(default_factory=dict)
60
+ selected_chunk_ids: List[str] = Field(default_factory=list)
61
+ doc_diagnostics: List[DocDiagnostic] = Field(default_factory=list)
62
+ failure_modes: List[str] = Field(default_factory=list)
63
+ quality_metrics: Dict[str, Any] = Field(default_factory=dict)
64
+ latency_ms: Optional[int] = None
65
+ answer_hash: Optional[str] = None
66
+
67
+ class AnswerFeedback(BaseModel):
68
+ trace_id: str
69
+ helpful: Optional[bool] = None
70
+ accepted: Optional[bool] = None
71
+ reason_code: Optional[str] = None
72
+ correction_text: Optional[str] = None
73
+
74
  class QueryResponse(BaseModel):
75
  answer: str; sources: List[SourceChunk] = Field(default_factory=list)
76
  images: List[str] = []
supabase/migrations/0010_query_traces_feedback_graph.sql ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ create table if not exists public.query_traces (
2
+ trace_id uuid primary key default gen_random_uuid(),
3
+ user_id uuid null,
4
+ session_id text not null default 'default_session',
5
+ question text not null,
6
+ route_mode text not null default 'default',
7
+ selected_experts jsonb not null default '[]'::jsonb,
8
+ expert_weights jsonb not null default '{}'::jsonb,
9
+ pinned_file_hashes jsonb not null default '[]'::jsonb,
10
+ candidate_counts jsonb not null default '{}'::jsonb,
11
+ selected_chunk_ids jsonb not null default '[]'::jsonb,
12
+ doc_diagnostics jsonb not null default '[]'::jsonb,
13
+ failure_modes jsonb not null default '[]'::jsonb,
14
+ quality_metrics jsonb not null default '{}'::jsonb,
15
+ answer_hash text null,
16
+ answer_preview text null,
17
+ latency_ms integer null,
18
+ created_at timestamptz not null default timezone('utc', now())
19
+ );
20
+
21
+ create index if not exists idx_query_traces_user_created
22
+ on public.query_traces (user_id, created_at desc);
23
+
24
+ create index if not exists idx_query_traces_session_created
25
+ on public.query_traces (session_id, created_at desc);
26
+
27
+ alter table public.query_traces enable row level security;
28
+
29
+ drop policy if exists query_traces_select_own on public.query_traces;
30
+ create policy query_traces_select_own
31
+ on public.query_traces
32
+ for select
33
+ to authenticated
34
+ using (auth.uid() = user_id);
35
+
36
+ drop policy if exists query_traces_insert_own on public.query_traces;
37
+ create policy query_traces_insert_own
38
+ on public.query_traces
39
+ for insert
40
+ to authenticated
41
+ with check (auth.uid() = user_id);
42
+
43
+
44
+ create table if not exists public.answer_feedback (
45
+ id bigint generated by default as identity primary key,
46
+ trace_id uuid not null references public.query_traces(trace_id) on delete cascade,
47
+ user_id uuid null,
48
+ helpful boolean null,
49
+ accepted boolean null,
50
+ reason_code text null,
51
+ correction_text text null,
52
+ promote_to_eval boolean not null default false,
53
+ created_at timestamptz not null default timezone('utc', now())
54
+ );
55
+
56
+ create index if not exists idx_answer_feedback_trace_created
57
+ on public.answer_feedback (trace_id, created_at desc);
58
+
59
+ create index if not exists idx_answer_feedback_user_created
60
+ on public.answer_feedback (user_id, created_at desc);
61
+
62
+ alter table public.answer_feedback enable row level security;
63
+
64
+ drop policy if exists answer_feedback_select_own on public.answer_feedback;
65
+ create policy answer_feedback_select_own
66
+ on public.answer_feedback
67
+ for select
68
+ to authenticated
69
+ using (auth.uid() = user_id);
70
+
71
+ drop policy if exists answer_feedback_insert_own on public.answer_feedback;
72
+ create policy answer_feedback_insert_own
73
+ on public.answer_feedback
74
+ for insert
75
+ to authenticated
76
+ with check (auth.uid() = user_id);
77
+
78
+
79
+ create table if not exists public.graph_nodes (
80
+ id bigint generated by default as identity primary key,
81
+ user_id uuid null,
82
+ node_key text not null,
83
+ node_type text not null,
84
+ label text not null,
85
+ payload jsonb not null default '{}'::jsonb,
86
+ created_at timestamptz not null default timezone('utc', now()),
87
+ unique (user_id, node_key)
88
+ );
89
+
90
+ create index if not exists idx_graph_nodes_user_type
91
+ on public.graph_nodes (user_id, node_type);
92
+
93
+ create index if not exists idx_graph_nodes_user_label
94
+ on public.graph_nodes (user_id, label);
95
+
96
+ alter table public.graph_nodes enable row level security;
97
+
98
+ drop policy if exists graph_nodes_select_own on public.graph_nodes;
99
+ create policy graph_nodes_select_own
100
+ on public.graph_nodes
101
+ for select
102
+ to authenticated
103
+ using (auth.uid() = user_id);
104
+
105
+
106
+ create table if not exists public.graph_edges (
107
+ id bigint generated by default as identity primary key,
108
+ user_id uuid null,
109
+ source_node_key text not null,
110
+ target_node_key text not null,
111
+ edge_type text not null,
112
+ weight double precision not null default 1.0,
113
+ payload jsonb not null default '{}'::jsonb,
114
+ created_at timestamptz not null default timezone('utc', now()),
115
+ unique (user_id, source_node_key, target_node_key, edge_type)
116
+ );
117
+
118
+ create index if not exists idx_graph_edges_user_source
119
+ on public.graph_edges (user_id, source_node_key);
120
+
121
+ create index if not exists idx_graph_edges_user_target
122
+ on public.graph_edges (user_id, target_node_key);
123
+
124
+ alter table public.graph_edges enable row level security;
125
+
126
+ drop policy if exists graph_edges_select_own on public.graph_edges;
127
+ create policy graph_edges_select_own
128
+ on public.graph_edges
129
+ for select
130
+ to authenticated
131
+ using (auth.uid() = user_id);
supabase/migrations/0011_admin_review_eval_workflow.sql ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ alter table public.query_traces
2
+ add column if not exists review_state text not null default 'pending',
3
+ add column if not exists review_notes text null,
4
+ add column if not exists reviewed_at timestamptz null,
5
+ add column if not exists reviewed_by text null,
6
+ add column if not exists promoted_to_eval boolean not null default false,
7
+ add column if not exists document_types jsonb not null default '[]'::jsonb;
8
+
9
+ create index if not exists idx_query_traces_review_state_created
10
+ on public.query_traces (review_state, created_at desc);
11
+
12
+ alter table public.answer_feedback
13
+ add column if not exists review_state text not null default 'pending',
14
+ add column if not exists review_notes text null,
15
+ add column if not exists reviewed_at timestamptz null,
16
+ add column if not exists reviewed_by text null,
17
+ add column if not exists promoted_at timestamptz null;
18
+
19
+ create index if not exists idx_answer_feedback_review_state_created
20
+ on public.answer_feedback (review_state, created_at desc);
21
+
22
+ create table if not exists public.evaluation_datasets (
23
+ id bigint generated by default as identity primary key,
24
+ trace_id uuid unique null references public.query_traces(trace_id) on delete set null,
25
+ source text not null default 'feedback_trace',
26
+ question text not null,
27
+ gold_context_refs jsonb not null default '[]'::jsonb,
28
+ gold_evidence_text text null,
29
+ is_answerable boolean not null default true,
30
+ failure_modes jsonb not null default '[]'::jsonb,
31
+ doc_diagnostics jsonb not null default '[]'::jsonb,
32
+ reason_code text null,
33
+ is_active boolean not null default false,
34
+ created_at timestamptz not null default timezone('utc', now())
35
+ );
36
+
37
+ create index if not exists idx_evaluation_datasets_active_created
38
+ on public.evaluation_datasets (is_active, created_at desc);
supabase/migrations/0012_lock_down_evaluation_datasets.sql ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ alter table public.evaluation_datasets
2
+ enable row level security;
3
+
4
+ revoke all on public.evaluation_datasets from anon, authenticated;
5
+
6
+ drop policy if exists evaluation_datasets_select_own on public.evaluation_datasets;
7
+ drop policy if exists evaluation_datasets_insert_own on public.evaluation_datasets;
8
+ drop policy if exists evaluation_datasets_update_own on public.evaluation_datasets;
9
+ drop policy if exists evaluation_datasets_delete_own on public.evaluation_datasets;
10
+
11
+ -- evaluation_datasets is an internal curation/evaluation table.
12
+ -- The app reads/writes it via service-role admin/eval paths only.
13
+ -- With RLS enabled and no anon/authenticated policies, normal clients cannot
14
+ -- access it through PostgREST even though it lives in the public schema.
supabase/migrations/0013_backend_owned_retrieval_hardening.sql ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Migration 0013: backend-owned retrieval hardening
2
+ --
3
+ -- Goals:
4
+ -- 1. Add a bulk chunk insert RPC for ingestion throughput.
5
+ -- 2. Move retrieval/memory RPCs to explicit user_id scoping so the backend can
6
+ -- call them with the service role instead of relying on browser RLS.
7
+ -- 3. Lock internal telemetry/eval tables down to backend-only access.
8
+
9
+ CREATE OR REPLACE FUNCTION public.insert_document_chunks_batch(
10
+ p_rows jsonb
11
+ ) RETURNS void
12
+ LANGUAGE plpgsql
13
+ SECURITY DEFINER
14
+ SET search_path = ''
15
+ AS $$
16
+ BEGIN
17
+ IF p_rows IS NULL OR jsonb_typeof(p_rows) <> 'array' THEN
18
+ RETURN;
19
+ END IF;
20
+
21
+ INSERT INTO public.documents (
22
+ id,
23
+ content,
24
+ metadata,
25
+ embedding,
26
+ user_id,
27
+ node_type,
28
+ parent_node_id,
29
+ node_level
30
+ )
31
+ SELECT
32
+ (row->>'id')::uuid,
33
+ row->>'content',
34
+ COALESCE(row->'metadata', '{}'::jsonb),
35
+ (row->'embedding')::text::extensions.vector,
36
+ (row->>'user_id')::uuid,
37
+ COALESCE(NULLIF(row->>'node_type', ''), 'leaf'),
38
+ NULLIF(row->>'parent_node_id', '')::uuid,
39
+ COALESCE(NULLIF(row->>'node_level', '')::integer, 0)
40
+ FROM jsonb_array_elements(p_rows) AS row
41
+ ON CONFLICT (id) DO UPDATE
42
+ SET content = EXCLUDED.content,
43
+ metadata = EXCLUDED.metadata,
44
+ embedding = EXCLUDED.embedding,
45
+ user_id = EXCLUDED.user_id,
46
+ node_type = EXCLUDED.node_type,
47
+ parent_node_id = EXCLUDED.parent_node_id,
48
+ node_level = EXCLUDED.node_level;
49
+ END;
50
+ $$;
51
+
52
+
53
+ CREATE OR REPLACE FUNCTION public.hybrid_search(
54
+ query_text text,
55
+ query_embedding extensions.vector,
56
+ match_count integer DEFAULT 10,
57
+ filter jsonb DEFAULT '{}'::jsonb,
58
+ semantic_weight double precision DEFAULT 0.7,
59
+ keyword_weight double precision DEFAULT 0.3,
60
+ p_user_id uuid DEFAULT NULL::uuid
61
+ ) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
62
+ LANGUAGE plpgsql
63
+ SET search_path = ''
64
+ AS $$
65
+ BEGIN
66
+ RETURN QUERY
67
+ WITH
68
+ semantic AS (
69
+ SELECT
70
+ d.id,
71
+ d.content,
72
+ d.metadata,
73
+ (
74
+ 1 - (
75
+ d.embedding::extensions.halfvec(2048)
76
+ OPERATOR(extensions.<=>)
77
+ query_embedding::extensions.halfvec(2048)
78
+ )
79
+ )::float AS score
80
+ FROM public.documents AS d
81
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
82
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
83
+ ORDER BY d.embedding::extensions.halfvec(2048)
84
+ OPERATOR(extensions.<=>)
85
+ query_embedding::extensions.halfvec(2048)
86
+ LIMIT match_count * 3
87
+ ),
88
+ keyword AS (
89
+ SELECT
90
+ d.id,
91
+ d.content,
92
+ d.metadata,
93
+ pg_catalog.ts_rank(
94
+ pg_catalog.to_tsvector('english', d.content),
95
+ pg_catalog.plainto_tsquery('english', query_text)
96
+ )::float AS raw_score
97
+ FROM public.documents AS d
98
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
99
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
100
+ AND pg_catalog.to_tsvector('english', d.content)
101
+ @@ pg_catalog.plainto_tsquery('english', query_text)
102
+ ORDER BY raw_score DESC
103
+ LIMIT match_count * 3
104
+ ),
105
+ keyword_norm AS (
106
+ SELECT
107
+ k.id,
108
+ k.content,
109
+ k.metadata,
110
+ CASE
111
+ WHEN max(k.raw_score) OVER () = 0 THEN 0::float
112
+ ELSE (k.raw_score / max(k.raw_score) OVER ())::float
113
+ END AS score
114
+ FROM keyword AS k
115
+ ),
116
+ blended AS (
117
+ SELECT
118
+ COALESCE(s.id, kn.id) AS id,
119
+ COALESCE(s.content, kn.content) AS content,
120
+ COALESCE(s.metadata, kn.metadata) AS metadata,
121
+ (
122
+ COALESCE(s.score, 0::float) * semantic_weight +
123
+ COALESCE(kn.score, 0::float) * keyword_weight
124
+ ) AS combined_score
125
+ FROM semantic AS s
126
+ FULL OUTER JOIN keyword_norm AS kn ON s.id = kn.id
127
+ )
128
+ SELECT
129
+ b.id,
130
+ b.content,
131
+ b.metadata,
132
+ b.combined_score
133
+ FROM blended AS b
134
+ ORDER BY b.combined_score DESC
135
+ LIMIT match_count;
136
+ END;
137
+ $$;
138
+
139
+
140
+ CREATE OR REPLACE FUNCTION public.match_documents(
141
+ query_embedding extensions.vector,
142
+ match_count integer DEFAULT 5,
143
+ filter jsonb DEFAULT '{}'::jsonb,
144
+ p_user_id uuid DEFAULT NULL::uuid
145
+ ) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
146
+ LANGUAGE plpgsql
147
+ SET search_path = ''
148
+ AS $$
149
+ BEGIN
150
+ RETURN QUERY
151
+ SELECT
152
+ d.id,
153
+ d.content,
154
+ d.metadata,
155
+ (
156
+ 1 - (
157
+ d.embedding::extensions.halfvec(2048)
158
+ OPERATOR(extensions.<=>)
159
+ query_embedding::extensions.halfvec(2048)
160
+ )
161
+ )::float AS similarity
162
+ FROM public.documents AS d
163
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
164
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
165
+ ORDER BY d.embedding::extensions.halfvec(2048)
166
+ OPERATOR(extensions.<=>)
167
+ query_embedding::extensions.halfvec(2048)
168
+ LIMIT match_count;
169
+ END;
170
+ $$;
171
+
172
+
173
+ CREATE OR REPLACE FUNCTION public.match_memory(
174
+ query_embedding extensions.vector,
175
+ match_session_id text,
176
+ match_count integer DEFAULT 4,
177
+ p_user_id uuid DEFAULT NULL::uuid
178
+ ) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
179
+ LANGUAGE plpgsql
180
+ SET search_path = ''
181
+ AS $$
182
+ BEGIN
183
+ RETURN QUERY
184
+ SELECT
185
+ cm.id,
186
+ cm.role,
187
+ cm.content,
188
+ 1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
189
+ FROM public.chat_memory AS cm
190
+ WHERE cm.session_id = match_session_id
191
+ AND (p_user_id IS NULL OR cm.user_id = p_user_id)
192
+ ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
193
+ LIMIT match_count;
194
+ END;
195
+ $$;
196
+
197
+
198
+ DO $$
199
+ BEGIN
200
+ IF to_regclass('public.query_traces') IS NOT NULL THEN
201
+ EXECUTE 'ALTER TABLE public.query_traces ENABLE ROW LEVEL SECURITY';
202
+ EXECUTE 'REVOKE ALL ON TABLE public.query_traces FROM anon, authenticated';
203
+ EXECUTE 'DROP POLICY IF EXISTS query_traces_select_own ON public.query_traces';
204
+ EXECUTE 'DROP POLICY IF EXISTS query_traces_insert_own ON public.query_traces';
205
+ END IF;
206
+
207
+ IF to_regclass('public.answer_feedback') IS NOT NULL THEN
208
+ EXECUTE 'ALTER TABLE public.answer_feedback ENABLE ROW LEVEL SECURITY';
209
+ EXECUTE 'REVOKE ALL ON TABLE public.answer_feedback FROM anon, authenticated';
210
+ EXECUTE 'DROP POLICY IF EXISTS answer_feedback_select_own ON public.answer_feedback';
211
+ EXECUTE 'DROP POLICY IF EXISTS answer_feedback_insert_own ON public.answer_feedback';
212
+ END IF;
213
+
214
+ IF to_regclass('public.evaluation_logs') IS NOT NULL THEN
215
+ EXECUTE 'ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY';
216
+ EXECUTE 'REVOKE ALL ON TABLE public.evaluation_logs FROM anon, authenticated';
217
+ EXECUTE 'DROP POLICY IF EXISTS evaluation_logs_insert_own ON public.evaluation_logs';
218
+ EXECUTE 'DROP POLICY IF EXISTS evaluation_logs_select_own ON public.evaluation_logs';
219
+ END IF;
220
+
221
+ IF to_regclass('public.intent_feedback') IS NOT NULL THEN
222
+ EXECUTE 'ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY';
223
+ EXECUTE 'REVOKE ALL ON TABLE public.intent_feedback FROM anon, authenticated';
224
+ EXECUTE 'DROP POLICY IF EXISTS intent_feedback_select_own ON public.intent_feedback';
225
+ EXECUTE 'DROP POLICY IF EXISTS intent_feedback_insert_own ON public.intent_feedback';
226
+ END IF;
227
+
228
+ IF to_regclass('public.rerank_feedback') IS NOT NULL THEN
229
+ EXECUTE 'ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY';
230
+ EXECUTE 'REVOKE ALL ON TABLE public.rerank_feedback FROM anon, authenticated';
231
+ EXECUTE 'DROP POLICY IF EXISTS rerank_feedback_select_own ON public.rerank_feedback';
232
+ END IF;
233
+
234
+ IF to_regclass('public.graph_nodes') IS NOT NULL THEN
235
+ EXECUTE 'ALTER TABLE public.graph_nodes ENABLE ROW LEVEL SECURITY';
236
+ EXECUTE 'REVOKE ALL ON TABLE public.graph_nodes FROM anon, authenticated';
237
+ EXECUTE 'DROP POLICY IF EXISTS graph_nodes_select_own ON public.graph_nodes';
238
+ END IF;
239
+
240
+ IF to_regclass('public.graph_edges') IS NOT NULL THEN
241
+ EXECUTE 'ALTER TABLE public.graph_edges ENABLE ROW LEVEL SECURITY';
242
+ EXECUTE 'REVOKE ALL ON TABLE public.graph_edges FROM anon, authenticated';
243
+ EXECUTE 'DROP POLICY IF EXISTS graph_edges_select_own ON public.graph_edges';
244
+ END IF;
245
+
246
+ IF to_regclass('public.category_centroids') IS NOT NULL THEN
247
+ EXECUTE 'ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY';
248
+ EXECUTE 'REVOKE ALL ON TABLE public.category_centroids FROM anon, authenticated';
249
+ END IF;
250
+
251
+ IF to_regclass('public.ingestion_retry_logs') IS NOT NULL THEN
252
+ EXECUTE 'ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY';
253
+ EXECUTE 'REVOKE ALL ON TABLE public.ingestion_retry_logs FROM anon, authenticated';
254
+ EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_select_own ON public.ingestion_retry_logs';
255
+ EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_insert_own ON public.ingestion_retry_logs';
256
+ EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_update_own ON public.ingestion_retry_logs';
257
+ EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_delete_own ON public.ingestion_retry_logs';
258
+ END IF;
259
+ END;
260
+ $$;
supabase/migrations/0014_drop_legacy_category_centroid_policies.sql ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -- Migration 0014: drop legacy category_centroids user-facing RLS policies
2
+ --
3
+ -- 0013 moved centroid access to backend-owned service-role calls with
4
+ -- explicit user_id filtering, but it did not remove the older auth.uid()
5
+ -- policies. Those stale policies keep Security Advisor warning about
6
+ -- anonymous access on public.category_centroids and also keep schema dumps
7
+ -- out of sync with the intended access model.
8
+
9
+ DO $$
10
+ BEGIN
11
+ IF to_regclass('public.category_centroids') IS NOT NULL THEN
12
+ EXECUTE 'ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY';
13
+ EXECUTE 'REVOKE ALL ON TABLE public.category_centroids FROM anon, authenticated';
14
+ EXECUTE 'DROP POLICY IF EXISTS centroids_select_own ON public.category_centroids';
15
+ EXECUTE 'DROP POLICY IF EXISTS centroids_insert_own ON public.category_centroids';
16
+ EXECUTE 'DROP POLICY IF EXISTS centroids_update_own ON public.category_centroids';
17
+ EXECUTE 'DROP POLICY IF EXISTS centroids_delete_own ON public.category_centroids';
18
+ END IF;
19
+ END
20
+ $$;
supabase/migrations/0015_ingested_file_identity_json.sql ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ alter table public.ingested_files
2
+ add column if not exists identity_json jsonb not null default '{}'::jsonb;
supabase/migrations/0016_ingestion_file_hash_checkpoints.sql ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ALTER TABLE public.ingestion_retry_logs
2
+ ADD COLUMN IF NOT EXISTS file_hash text;
3
+
4
+ CREATE INDEX IF NOT EXISTS ingestion_retry_logs_user_file_event_idx
5
+ ON public.ingestion_retry_logs (user_id, file_hash, event_type, created_at DESC);
supabase/schema_backup.before_0013.sql ADDED
File without changes
supabase/schema_backup.sql CHANGED
@@ -1,74 +1,60 @@
1
  --
2
  -- PostgreSQL database dump
3
  --
4
-
5
- -- Dumped from database version 17.6
6
- -- Dumped by pg_dump version 18.3
7
-
8
- SET statement_timeout = 0;
9
- SET lock_timeout = 0;
10
- SET idle_in_transaction_session_timeout = 0;
11
- SET transaction_timeout = 0;
12
- SET client_encoding = 'UTF8';
13
- SET standard_conforming_strings = on;
14
- SELECT pg_catalog.set_config('search_path', '', false);
15
- SET check_function_bodies = false;
16
- SET xmloption = content;
17
- SET client_min_messages = warning;
18
- SET row_security = off;
19
-
20
- --
21
- -- Name: public; Type: SCHEMA; Schema: -; Owner: -
22
- --
23
-
24
- CREATE SCHEMA IF NOT EXISTS public;
25
-
26
-
27
- --
28
- -- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: -
29
- --
30
-
31
- COMMENT ON SCHEMA public IS 'standard public schema';
32
-
33
-
34
- --
35
- -- Name: _trg_refresh_mv_document_types(); Type: FUNCTION; Schema: public; Owner: -
36
- --
37
-
38
- -- CREATE FUNCTION public._trg_refresh_mv_document_types() RETURNS trigger
39
- -- LANGUAGE plpgsql
40
- -- AS $$
41
- -- begin
42
- -- -- Fire-and-forget: refresh in background via pg_notify
43
- -- -- (avoids blocking the INSERT transaction itself)
44
- -- perform pg_notify('refresh_mv', 'document_types');
45
- -- return new;
46
- -- end;
47
- -- $$;
48
-
49
-
50
- --
51
- -- Name: _trg_set_updated_at(); Type: FUNCTION; Schema: public; Owner: -
52
- --
53
-
54
  CREATE FUNCTION public._trg_set_updated_at() RETURNS trigger
55
  LANGUAGE plpgsql
56
- SET search_path = ''
57
- AS $$
58
- begin
59
  new.updated_at = pg_catalog.now();
60
- return new;
61
- end;
62
- $$;
63
-
64
-
65
- --
66
- -- Name: get_document_types(); Type: FUNCTION; Schema: public; Owner: -
67
- --
68
-
69
  CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
70
  LANGUAGE sql STABLE
71
- SET search_path = ''
72
  AS $$
73
  select distinct f.document_type
74
  from public.ingested_files as f
@@ -76,23 +62,25 @@ CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
76
  and f.document_type is not null
77
  and f.document_type <> 'unknown'
78
  order by f.document_type;
79
- $$;
80
-
81
-
82
- --
83
  -- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
84
- --
85
-
86
  CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
87
  LANGUAGE plpgsql
88
- SET search_path = ''
89
  AS $$
90
- begin
91
- return query
92
- with
93
- semantic as (
94
- select
95
- d.id, d.content, d.metadata,
 
 
96
  (
97
  1 - (
98
  d.embedding::extensions.halfvec(2048)
@@ -101,937 +89,1390 @@ begin
101
  )
102
  )::float as score
103
  from public.documents d
104
- where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
105
  order by d.embedding::extensions.halfvec(2048)
106
  OPERATOR(extensions.<=>)
107
  query_embedding::extensions.halfvec(2048)
108
- limit match_count * 3
109
- ),
110
- keyword as (
111
- select
112
- d.id, d.content, d.metadata,
 
 
113
  pg_catalog.ts_rank(
114
  pg_catalog.to_tsvector('english', d.content),
115
  pg_catalog.plainto_tsquery('english', query_text)
116
- )::float as raw_score
117
  from public.documents d
118
- where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
119
  and pg_catalog.to_tsvector('english', d.content) @@ pg_catalog.plainto_tsquery('english', query_text)
120
- order by raw_score desc
121
- limit match_count * 3
122
- ),
123
- keyword_norm as (
124
- select k.id, k.content, k.metadata,
125
- case
126
- when max(k.raw_score) over () = 0 then 0::float
127
- else (k.raw_score / max(k.raw_score) over ())::float
128
- end as score
129
- from keyword k
130
- ),
131
- blended as (
132
- select
133
- coalesce(s.id, kn.id) as id,
134
- coalesce(s.content, kn.content) as content,
135
- coalesce(s.metadata, kn.metadata) as metadata,
136
- (
137
- coalesce(s.score, 0::float) * semantic_weight +
138
- coalesce(kn.score, 0::float) * keyword_weight
139
- ) as combined_score
140
- from semantic s
141
- full outer join keyword_norm kn on s.id = kn.id
142
- )
143
- select b.id, b.content, b.metadata, b.combined_score
144
- from blended b
145
- order by b.combined_score desc
146
- limit match_count;
147
- end;
148
- $$;
149
-
150
-
151
- --
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  -- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
153
- --
154
-
155
  CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
156
  LANGUAGE plpgsql SECURITY DEFINER
157
- SET search_path = ''
158
  AS $$
159
- BEGIN
160
  INSERT INTO public.documents (id, content, metadata, embedding, user_id)
161
- VALUES (p_id, p_content, p_metadata, p_embedding, p_user_id)
162
- ON CONFLICT (id) DO UPDATE
163
- SET content = EXCLUDED.content,
164
- metadata = EXCLUDED.metadata,
165
- embedding = EXCLUDED.embedding;
166
- END;
167
- $$;
168
-
169
-
170
- --
171
  -- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
172
- --
173
-
174
  CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
175
  LANGUAGE plpgsql SECURITY DEFINER
176
- SET search_path = ''
177
  AS $$
178
- BEGIN
179
  INSERT INTO public.documents (
180
- id, content, metadata, embedding, user_id,
181
- node_type, parent_node_id, node_level
182
- )
183
- VALUES (
184
- p_id, p_content, p_metadata, p_embedding, p_user_id,
185
- p_node_type, p_parent_node_id, p_node_level
186
- )
187
- ON CONFLICT (id) DO UPDATE
188
- SET content = EXCLUDED.content,
189
- metadata = EXCLUDED.metadata,
190
- embedding = EXCLUDED.embedding,
191
- node_type = EXCLUDED.node_type,
192
- parent_node_id = EXCLUDED.parent_node_id,
193
- node_level = EXCLUDED.node_level;
194
- END;
195
- $$;
196
-
197
-
198
- --
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  -- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
200
- --
201
-
202
  CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
203
  LANGUAGE plpgsql
204
- SET search_path = ''
205
  AS $$
206
- begin
207
- return query
208
- select
209
- d.id,
210
- d.content,
211
- d.metadata,
212
- (
213
- 1 - (
214
- d.embedding::extensions.halfvec(2048)
215
- OPERATOR(extensions.<=>)
216
- query_embedding::extensions.halfvec(2048)
217
- )
218
- )::float as similarity
219
  from public.documents d
220
- where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
221
- order by d.embedding::extensions.halfvec(2048)
222
- OPERATOR(extensions.<=>)
223
- query_embedding::extensions.halfvec(2048)
224
- limit match_count;
225
- end;
226
- $$;
227
-
228
-
229
- --
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  -- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
231
- --
232
-
233
  CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
234
  LANGUAGE plpgsql
235
- SET search_path = ''
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  AS $$
237
- BEGIN
238
- RETURN QUERY
239
- SELECT
240
  cm.id,
241
  cm.role,
242
  cm.content,
243
  1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
244
  FROM public.chat_memory AS cm
245
  WHERE cm.session_id = match_session_id
 
246
  ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
247
- LIMIT match_count;
248
- END;
249
- $$;
250
-
251
-
252
- --
253
- -- Name: refresh_document_types_mv(); Type: FUNCTION; Schema: public; Owner: -
254
- --
255
-
256
- -- CREATE FUNCTION public.refresh_document_types_mv() RETURNS void
257
- -- LANGUAGE plpgsql
258
- -- AS $$
259
- -- begin
260
- -- refresh materialized view concurrently mv_document_types;
261
- -- end;
262
- -- $$;
263
-
264
-
265
- SET default_tablespace = '';
266
-
267
- SET default_table_access_method = heap;
268
-
269
- --
270
- -- Name: category_centroids; Type: TABLE; Schema: public; Owner: -
271
- --
272
-
273
- CREATE TABLE public.category_centroids (
274
- id uuid DEFAULT gen_random_uuid() NOT NULL,
275
- document_type text NOT NULL,
276
- centroid_vector double precision[] NOT NULL,
277
- document_count integer DEFAULT 1,
278
- created_at timestamp with time zone DEFAULT now(),
279
- updated_at timestamp with time zone DEFAULT now(),
280
- user_id uuid DEFAULT auth.uid()
281
- );
282
-
283
-
284
- --
285
- -- Name: chat_memory; Type: TABLE; Schema: public; Owner: -
286
- --
287
-
288
- CREATE TABLE public.chat_memory (
289
- id uuid DEFAULT extensions.uuid_generate_v4() NOT NULL,
290
- session_id text NOT NULL,
291
- role text NOT NULL,
292
- content text NOT NULL,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  embedding extensions.vector(2048),
294
- created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
295
- user_id uuid DEFAULT auth.uid()
296
- );
297
-
298
-
299
- --
300
- -- Name: document_trees; Type: TABLE; Schema: public; Owner: -
301
- --
302
-
303
- CREATE TABLE public.document_trees (
304
- file_hash text NOT NULL,
305
- user_id uuid NOT NULL,
306
- tree_json jsonb NOT NULL,
307
- created_at timestamp with time zone DEFAULT timezone('utc'::text, now())
308
- );
309
-
310
-
311
- --
312
- -- Name: documents; Type: TABLE; Schema: public; Owner: -
313
- --
314
-
315
- CREATE TABLE public.documents (
316
- id uuid DEFAULT gen_random_uuid() NOT NULL,
317
- content text,
318
- metadata jsonb,
319
  embedding extensions.vector(2048),
320
- user_id uuid DEFAULT auth.uid(),
321
- node_type text DEFAULT 'leaf'::text,
322
- parent_node_id uuid,
323
- node_level integer DEFAULT 0
324
- );
325
-
326
-
327
- --
328
- -- Name: evaluation_logs; Type: TABLE; Schema: public; Owner: -
329
- --
330
-
331
- CREATE TABLE public.evaluation_logs (
332
- id uuid DEFAULT gen_random_uuid() NOT NULL,
333
- run_label text,
334
- evaluated_at timestamp with time zone,
335
- alpha double precision,
336
- k integer,
337
- question text,
338
- is_answerable boolean,
339
- precision_at_k double precision,
340
- faithfulness_proxy double precision,
341
- relevance_proxy double precision,
342
- local_reward double precision,
343
- llm_judge_score double precision,
344
- judge_a_verdict boolean,
345
- judge_b_verdict boolean,
346
- judge_a_model text,
347
- judge_b_model text,
348
- calibration_score double precision,
349
- final_score double precision,
350
- requires_manual_review boolean DEFAULT false,
351
- disagreement_note text DEFAULT ''::text,
352
- user_id uuid
353
- );
354
-
355
-
356
- --
357
- -- Name: ingested_files; Type: TABLE; Schema: public; Owner: -
358
- --
359
-
360
- CREATE TABLE public.ingested_files (
361
- id uuid DEFAULT gen_random_uuid() NOT NULL,
362
- file_hash text NOT NULL,
363
- filename text NOT NULL,
364
- document_type text,
365
- chunk_count integer DEFAULT 0,
366
- ingested_at timestamp with time zone DEFAULT now(),
367
- user_id uuid DEFAULT auth.uid(),
368
- user_overridden boolean DEFAULT false
369
- );
370
-
371
-
372
- --
373
- -- Name: ingestion_retry_logs; Type: TABLE; Schema: public; Owner: -
374
- --
375
-
376
- CREATE TABLE public.ingestion_retry_logs (
377
- id bigint NOT NULL,
378
- created_at timestamp with time zone DEFAULT now() NOT NULL,
379
- user_id uuid,
380
- batch_num integer NOT NULL,
381
- total_batches integer NOT NULL,
382
- attempt integer NOT NULL,
383
- event_type text NOT NULL,
384
- message text,
385
- sleep_s double precision DEFAULT 0
386
- );
387
-
388
-
389
- --
390
- -- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE; Schema: public; Owner: -
391
- --
392
-
393
- CREATE SEQUENCE public.ingestion_retry_logs_id_seq
394
- START WITH 1
395
- INCREMENT BY 1
396
- NO MINVALUE
397
- NO MAXVALUE
398
- CACHE 1;
399
-
400
-
401
- --
402
- -- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
403
- --
404
-
405
- ALTER SEQUENCE public.ingestion_retry_logs_id_seq OWNED BY public.ingestion_retry_logs.id;
406
-
407
-
408
- --
409
- -- Name: intent_feedback; Type: TABLE; Schema: public; Owner: -
410
- --
411
-
412
- CREATE TABLE public.intent_feedback (
413
- id bigint NOT NULL,
414
- user_id uuid,
415
- query text NOT NULL,
416
- has_category boolean DEFAULT false NOT NULL,
417
- has_history boolean DEFAULT false NOT NULL,
418
- label integer NOT NULL,
419
- created_at timestamp with time zone DEFAULT now() NOT NULL,
420
- CONSTRAINT intent_feedback_label_check CHECK ((label = ANY (ARRAY[0, 1])))
421
- );
422
-
423
-
424
- --
425
- -- Name: intent_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
426
- --
427
-
428
- CREATE SEQUENCE public.intent_feedback_id_seq
429
- START WITH 1
430
- INCREMENT BY 1
431
- NO MINVALUE
432
- NO MAXVALUE
433
- CACHE 1;
434
-
435
-
436
- --
437
- -- Name: intent_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
438
- --
439
-
440
- ALTER SEQUENCE public.intent_feedback_id_seq OWNED BY public.intent_feedback.id;
441
-
442
-
443
- --
444
- -- Name: mv_document_types; Type: MATERIALIZED VIEW; Schema: public; Owner: -
445
- --
446
-
447
- -- CREATE MATERIALIZED VIEW public.mv_document_types AS
448
- -- SELECT DISTINCT (metadata ->> 'document_type'::text) AS document_type
449
- -- FROM public.documents
450
- -- WHERE (((metadata ->> 'document_type'::text) IS NOT NULL) AND ((metadata ->> 'document_type'::text) <> 'unknown'::text))
451
- -- ORDER BY (metadata ->> 'document_type'::text)
452
- -- WITH NO DATA;
453
-
454
-
455
- --
456
- -- Name: rerank_feedback; Type: TABLE; Schema: public; Owner: -
457
- --
458
-
459
- CREATE TABLE public.rerank_feedback (
460
- id bigint NOT NULL,
461
- user_id uuid,
462
- query_hash text NOT NULL,
463
- chunk_id uuid,
464
- chunk_hash text NOT NULL,
465
- document_type text,
466
- cohere_score real NOT NULL,
467
- was_selected boolean NOT NULL,
468
- created_at timestamp with time zone DEFAULT now() NOT NULL,
469
- query_text text,
470
- chunk_text text
471
- );
472
-
473
-
474
- --
475
- -- Name: rerank_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
476
- --
477
-
478
- CREATE SEQUENCE public.rerank_feedback_id_seq
479
- START WITH 1
480
- INCREMENT BY 1
481
- NO MINVALUE
482
- NO MAXVALUE
483
- CACHE 1;
484
-
485
-
486
- --
487
- -- Name: rerank_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
488
- --
489
-
490
- ALTER SEQUENCE public.rerank_feedback_id_seq OWNED BY public.rerank_feedback.id;
491
-
492
-
493
- --
494
- -- Name: ingestion_retry_logs id; Type: DEFAULT; Schema: public; Owner: -
495
- --
496
-
497
- ALTER TABLE ONLY public.ingestion_retry_logs ALTER COLUMN id SET DEFAULT nextval('public.ingestion_retry_logs_id_seq'::regclass);
498
-
499
-
500
- --
501
- -- Name: intent_feedback id; Type: DEFAULT; Schema: public; Owner: -
502
- --
503
-
504
- ALTER TABLE ONLY public.intent_feedback ALTER COLUMN id SET DEFAULT nextval('public.intent_feedback_id_seq'::regclass);
505
-
506
-
507
- --
508
- -- Name: rerank_feedback id; Type: DEFAULT; Schema: public; Owner: -
509
- --
510
-
511
- ALTER TABLE ONLY public.rerank_feedback ALTER COLUMN id SET DEFAULT nextval('public.rerank_feedback_id_seq'::regclass);
512
-
513
-
514
- --
515
- -- Name: category_centroids category_centroids_document_type_key; Type: CONSTRAINT; Schema: public; Owner: -
516
- --
517
-
518
- ALTER TABLE ONLY public.category_centroids
519
- ADD CONSTRAINT category_centroids_document_type_key UNIQUE (document_type);
520
-
521
-
522
- --
523
- -- Name: category_centroids category_centroids_pkey; Type: CONSTRAINT; Schema: public; Owner: -
524
- --
525
-
526
- ALTER TABLE ONLY public.category_centroids
527
- ADD CONSTRAINT category_centroids_pkey PRIMARY KEY (id);
528
-
529
-
530
- --
531
- -- Name: chat_memory chat_memory_pkey; Type: CONSTRAINT; Schema: public; Owner: -
532
- --
533
-
534
- ALTER TABLE ONLY public.chat_memory
535
- ADD CONSTRAINT chat_memory_pkey PRIMARY KEY (id);
536
-
537
-
538
- --
539
- -- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
540
  --
541
 
542
- ALTER TABLE ONLY public.document_trees
543
- ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
544
-
545
-
546
- --
547
- -- Name: documents documents_pkey; Type: CONSTRAINT; Schema: public; Owner: -
548
- --
549
-
550
- ALTER TABLE ONLY public.documents
551
- ADD CONSTRAINT documents_pkey PRIMARY KEY (id);
552
-
553
-
554
- --
555
- -- Name: evaluation_logs evaluation_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
556
- --
557
-
558
- ALTER TABLE ONLY public.evaluation_logs
559
- ADD CONSTRAINT evaluation_logs_pkey PRIMARY KEY (id);
560
-
561
-
562
- --
563
- -- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
564
  --
565
 
566
- ALTER TABLE ONLY public.ingested_files
567
- ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
568
-
569
-
570
- --
571
- -- Name: ingested_files ingested_files_pkey; Type: CONSTRAINT; Schema: public; Owner: -
572
- --
573
-
574
- ALTER TABLE ONLY public.ingested_files
575
- ADD CONSTRAINT ingested_files_pkey PRIMARY KEY (id);
576
-
577
-
578
- --
579
- -- Name: ingestion_retry_logs ingestion_retry_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
580
- --
581
-
582
- ALTER TABLE ONLY public.ingestion_retry_logs
583
- ADD CONSTRAINT ingestion_retry_logs_pkey PRIMARY KEY (id);
584
-
585
-
586
- --
587
- -- Name: intent_feedback intent_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
588
- --
589
-
590
- ALTER TABLE ONLY public.intent_feedback
591
- ADD CONSTRAINT intent_feedback_pkey PRIMARY KEY (id);
592
-
593
-
594
- --
595
- -- Name: rerank_feedback rerank_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
596
- --
597
-
598
- ALTER TABLE ONLY public.rerank_feedback
599
- ADD CONSTRAINT rerank_feedback_pkey PRIMARY KEY (id);
600
-
601
-
602
- --
603
- -- Name: category_centroids_type_idx; Type: INDEX; Schema: public; Owner: -
604
- --
605
-
606
- CREATE INDEX category_centroids_type_idx ON public.category_centroids USING btree (document_type);
607
-
608
-
609
- --
610
- -- Name: category_centroids_user_id_idx; Type: INDEX; Schema: public; Owner: -
611
- --
612
-
613
- CREATE INDEX category_centroids_user_id_idx ON public.category_centroids USING btree (user_id);
614
-
615
-
616
- --
617
- -- Name: category_centroids_user_type_uidx; Type: INDEX; Schema: public; Owner: -
618
- --
619
-
620
- CREATE UNIQUE INDEX category_centroids_user_type_uidx ON public.category_centroids USING btree (user_id, document_type);
621
-
622
-
623
- --
624
- -- Name: chat_memory_user_id_idx; Type: INDEX; Schema: public; Owner: -
625
- --
626
-
627
- CREATE INDEX chat_memory_user_id_idx ON public.chat_memory USING btree (user_id);
628
-
629
-
630
- --
631
- -- Name: doc_node_type_idx; Type: INDEX; Schema: public; Owner: -
632
- --
633
-
634
- CREATE INDEX doc_node_type_idx ON public.documents USING btree (node_type);
635
-
636
-
637
- --
638
- -- Name: documents_content_fts_idx; Type: INDEX; Schema: public; Owner: -
639
- --
640
-
641
- CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvector('english'::regconfig, content));
642
-
643
-
644
- --
645
- -- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
646
- --
647
-
648
- CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
649
-
650
-
651
- --
652
- -- Name: documents_metadata_filehash_idx; Type: INDEX; Schema: public; Owner: -
653
- --
654
-
655
- CREATE INDEX documents_metadata_filehash_idx ON public.documents USING btree (((metadata ->> 'file_hash'::text)));
656
-
657
-
658
- --
659
- -- Name: documents_metadata_idx; Type: INDEX; Schema: public; Owner: -
660
- --
661
-
662
- CREATE INDEX documents_metadata_idx ON public.documents USING gin (metadata);
663
-
664
-
665
- --
666
- -- Name: documents_user_id_idx; Type: INDEX; Schema: public; Owner: -
667
- --
668
-
669
- CREATE INDEX documents_user_id_idx ON public.documents USING btree (user_id);
670
-
671
-
672
- --
673
- -- Name: evaluation_logs_evaluated_at_idx; Type: INDEX; Schema: public; Owner: -
674
- --
675
-
676
- CREATE INDEX evaluation_logs_evaluated_at_idx ON public.evaluation_logs USING btree (evaluated_at DESC);
677
-
678
-
679
- --
680
- -- Name: evaluation_logs_run_label_idx; Type: INDEX; Schema: public; Owner: -
681
- --
682
-
683
- CREATE INDEX evaluation_logs_run_label_idx ON public.evaluation_logs USING btree (run_label);
684
-
685
-
686
- --
687
- -- Name: idx_chat_memory_session; Type: INDEX; Schema: public; Owner: -
688
- --
689
-
690
- CREATE INDEX idx_chat_memory_session ON public.chat_memory USING btree (session_id);
691
-
692
-
693
- --
694
- -- Name: idx_document_trees_json; Type: INDEX; Schema: public; Owner: -
695
- --
696
-
697
- CREATE INDEX idx_document_trees_json ON public.document_trees USING gin (tree_json);
698
-
699
-
700
- --
701
- -- Name: ingested_files_hash_idx; Type: INDEX; Schema: public; Owner: -
702
- --
703
-
704
- CREATE INDEX ingested_files_hash_idx ON public.ingested_files USING btree (file_hash);
705
-
706
-
707
- --
708
- -- Name: ingested_files_user_file_hash_uidx; Type: INDEX; Schema: public; Owner: -
709
- --
710
-
711
- CREATE UNIQUE INDEX ingested_files_user_file_hash_uidx ON public.ingested_files USING btree (user_id, file_hash);
712
-
713
-
714
- --
715
- -- Name: ingested_files_user_id_idx; Type: INDEX; Schema: public; Owner: -
716
- --
717
-
718
- CREATE INDEX ingested_files_user_id_idx ON public.ingested_files USING btree (user_id);
719
-
720
-
721
- --
722
- -- Name: ingestion_retry_logs_created_at_idx; Type: INDEX; Schema: public; Owner: -
723
- --
724
-
725
- CREATE INDEX ingestion_retry_logs_created_at_idx ON public.ingestion_retry_logs USING btree (created_at DESC);
726
-
727
-
728
- --
729
- -- Name: ingestion_retry_logs_user_id_idx; Type: INDEX; Schema: public; Owner: -
730
- --
731
-
732
- CREATE INDEX ingestion_retry_logs_user_id_idx ON public.ingestion_retry_logs USING btree (user_id);
733
-
734
-
735
- --
736
- -- Name: intent_feedback_user_id_idx; Type: INDEX; Schema: public; Owner: -
737
- --
738
-
739
- CREATE INDEX intent_feedback_user_id_idx ON public.intent_feedback USING btree (user_id);
740
-
741
-
742
- --
743
- -- Name: mv_document_types_idx; Type: INDEX; Schema: public; Owner: -
744
- --
745
-
746
- -- CREATE UNIQUE INDEX mv_document_types_idx ON public.mv_document_types USING btree (document_type);
747
-
748
-
749
- --
750
- -- Name: rerank_feedback_doc_type_idx; Type: INDEX; Schema: public; Owner: -
751
- --
752
-
753
- CREATE INDEX rerank_feedback_doc_type_idx ON public.rerank_feedback USING btree (document_type);
754
-
755
-
756
- --
757
- -- Name: rerank_feedback_user_created_idx; Type: INDEX; Schema: public; Owner: -
758
- --
759
-
760
- CREATE INDEX rerank_feedback_user_created_idx ON public.rerank_feedback USING btree (user_id, created_at DESC);
761
-
762
-
763
- --
764
- -- Name: category_centroids trg_centroids_updated_at; Type: TRIGGER; Schema: public; Owner: -
765
- --
766
-
767
- CREATE TRIGGER trg_centroids_updated_at BEFORE UPDATE ON public.category_centroids FOR EACH ROW EXECUTE FUNCTION public._trg_set_updated_at();
768
-
769
-
770
- --
771
- -- Name: documents trg_refresh_mv_document_types; Type: TRIGGER; Schema: public; Owner: -
772
- --
773
-
774
- -- CREATE TRIGGER trg_refresh_mv_document_types AFTER INSERT ON public.documents FOR EACH STATEMENT EXECUTE FUNCTION public._trg_refresh_mv_document_types();
775
-
776
-
777
- --
778
- -- Name: category_centroids; Type: ROW SECURITY; Schema: public; Owner: -
779
- --
780
-
781
- ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY;
782
-
783
- --
784
- -- Name: category_centroids centroids_delete_own; Type: POLICY; Schema: public; Owner: -
785
- --
786
-
787
- CREATE POLICY centroids_delete_own ON public.category_centroids FOR DELETE USING ((user_id = auth.uid()));
788
-
789
-
790
- --
791
- -- Name: category_centroids centroids_insert_own; Type: POLICY; Schema: public; Owner: -
792
- --
793
-
794
- CREATE POLICY centroids_insert_own ON public.category_centroids FOR INSERT WITH CHECK ((user_id = auth.uid()));
795
-
796
-
797
- --
798
- -- Name: category_centroids centroids_select_own; Type: POLICY; Schema: public; Owner: -
799
- --
800
-
801
- CREATE POLICY centroids_select_own ON public.category_centroids FOR SELECT USING ((user_id = auth.uid()));
802
-
803
-
804
- --
805
- -- Name: category_centroids centroids_update_own; Type: POLICY; Schema: public; Owner: -
806
- --
807
-
808
- CREATE POLICY centroids_update_own ON public.category_centroids FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
809
-
810
-
811
- --
812
- -- Name: chat_memory; Type: ROW SECURITY; Schema: public; Owner: -
813
- --
814
-
815
- ALTER TABLE public.chat_memory ENABLE ROW LEVEL SECURITY;
816
-
817
- --
818
- -- Name: chat_memory chat_memory_delete_own; Type: POLICY; Schema: public; Owner: -
819
- --
820
-
821
- CREATE POLICY chat_memory_delete_own ON public.chat_memory FOR DELETE USING ((user_id = auth.uid()));
822
-
823
-
824
- --
825
- -- Name: chat_memory chat_memory_insert_own; Type: POLICY; Schema: public; Owner: -
826
- --
827
-
828
- CREATE POLICY chat_memory_insert_own ON public.chat_memory FOR INSERT WITH CHECK ((user_id = auth.uid()));
829
-
830
-
831
- --
832
- -- Name: chat_memory chat_memory_select_own; Type: POLICY; Schema: public; Owner: -
833
- --
834
-
835
- CREATE POLICY chat_memory_select_own ON public.chat_memory FOR SELECT USING ((user_id = auth.uid()));
836
-
837
-
838
- --
839
- -- Name: chat_memory chat_memory_update_own; Type: POLICY; Schema: public; Owner: -
840
- --
841
-
842
- CREATE POLICY chat_memory_update_own ON public.chat_memory FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
843
-
844
-
845
- --
846
- -- Name: documents; Type: ROW SECURITY; Schema: public; Owner: -
847
- --
848
-
849
- ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
850
 
851
  --
852
- -- Name: documents documents_delete_own; Type: POLICY; Schema: public; Owner: -
853
- --
854
-
855
- CREATE POLICY documents_delete_own ON public.documents FOR DELETE USING ((user_id = auth.uid()));
856
-
857
-
858
- --
859
- -- Name: documents documents_insert_own; Type: POLICY; Schema: public; Owner: -
860
- --
861
-
862
- CREATE POLICY documents_insert_own ON public.documents FOR INSERT WITH CHECK ((user_id = auth.uid()));
863
-
864
-
865
- --
866
- -- Name: documents documents_select_own; Type: POLICY; Schema: public; Owner: -
867
- --
868
-
869
- CREATE POLICY documents_select_own ON public.documents FOR SELECT USING ((user_id = auth.uid()));
870
-
871
-
872
- --
873
- -- Name: documents documents_update_own; Type: POLICY; Schema: public; Owner: -
874
- --
875
-
876
- CREATE POLICY documents_update_own ON public.documents FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
 
877
 
878
 
879
  --
880
- -- Name: document_trees; Type: ROW SECURITY; Schema: public; Owner: -
881
  --
882
 
883
- ALTER TABLE public.document_trees ENABLE ROW LEVEL SECURITY;
 
 
 
 
 
 
 
 
 
 
884
 
885
  --
886
- -- Name: document_trees document_trees_delete_own; Type: POLICY; Schema: public; Owner: -
887
  --
888
 
889
- CREATE POLICY document_trees_delete_own ON public.document_trees FOR DELETE USING ((user_id = auth.uid()));
 
 
 
 
 
 
 
890
 
891
 
892
  --
893
- -- Name: document_trees document_trees_insert_own; Type: POLICY; Schema: public; Owner: -
894
  --
895
 
896
- CREATE POLICY document_trees_insert_own ON public.document_trees FOR INSERT WITH CHECK ((user_id = auth.uid()));
 
 
 
 
 
 
 
 
897
 
898
 
899
  --
900
- -- Name: document_trees document_trees_select_own; Type: POLICY; Schema: public; Owner: -
901
  --
902
 
903
- CREATE POLICY document_trees_select_own ON public.document_trees FOR SELECT USING ((user_id = auth.uid()));
 
 
 
 
 
 
 
904
 
905
 
906
  --
907
- -- Name: document_trees document_trees_update_own; Type: POLICY; Schema: public; Owner: -
908
  --
909
 
910
- CREATE POLICY document_trees_update_own ON public.document_trees FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
 
 
 
 
 
 
 
 
 
 
911
 
912
 
913
  --
914
- -- Name: evaluation_logs; Type: ROW SECURITY; Schema: public; Owner: -
915
  --
916
 
917
- ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY;
918
-
919
- --
920
- -- Name: evaluation_logs evaluation_logs_insert_own; Type: POLICY; Schema: public; Owner: -
921
- --
922
-
923
- CREATE POLICY evaluation_logs_insert_own ON public.evaluation_logs FOR INSERT WITH CHECK ((user_id = auth.uid()));
924
-
925
-
926
- --
927
- -- Name: evaluation_logs evaluation_logs_select_own; Type: POLICY; Schema: public; Owner: -
928
- --
929
-
930
- CREATE POLICY evaluation_logs_select_own ON public.evaluation_logs FOR SELECT USING ((user_id = auth.uid()));
931
 
932
 
933
  --
934
- -- Name: ingestion_retry_logs; Type: ROW SECURITY; Schema: public; Owner: -
935
  --
936
 
937
- ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY;
 
 
 
 
 
 
938
 
939
  --
940
- -- Name: ingestion_retry_logs ingestion_retry_logs_delete_own; Type: POLICY; Schema: public; Owner: -
941
  --
942
 
943
- CREATE POLICY ingestion_retry_logs_delete_own ON public.ingestion_retry_logs FOR DELETE USING ((user_id = auth.uid()));
944
 
945
 
946
  --
947
- -- Name: ingestion_retry_logs ingestion_retry_logs_insert_own; Type: POLICY; Schema: public; Owner: -
948
  --
949
 
950
- CREATE POLICY ingestion_retry_logs_insert_own ON public.ingestion_retry_logs FOR INSERT WITH CHECK ((user_id = auth.uid()));
 
 
 
 
 
 
 
 
 
951
 
952
 
953
  --
954
- -- Name: ingestion_retry_logs ingestion_retry_logs_select_own; Type: POLICY; Schema: public; Owner: -
955
  --
956
 
957
- CREATE POLICY ingestion_retry_logs_select_own ON public.ingestion_retry_logs FOR SELECT USING ((user_id = auth.uid()));
 
 
 
 
 
958
 
959
 
960
  --
961
- -- Name: ingestion_retry_logs ingestion_retry_logs_update_own; Type: POLICY; Schema: public; Owner: -
962
  --
963
 
964
- CREATE POLICY ingestion_retry_logs_update_own ON public.ingestion_retry_logs FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
965
 
966
 
967
  --
968
- -- Name: ingested_files; Type: ROW SECURITY; Schema: public; Owner: -
969
  --
970
 
971
- ALTER TABLE public.ingested_files ENABLE ROW LEVEL SECURITY;
972
-
973
- --
974
- -- Name: ingested_files ingested_files_delete_own; Type: POLICY; Schema: public; Owner: -
975
- --
976
-
977
- CREATE POLICY ingested_files_delete_own ON public.ingested_files FOR DELETE USING ((user_id = auth.uid()));
978
-
979
-
980
- --
981
- -- Name: ingested_files ingested_files_insert_own; Type: POLICY; Schema: public; Owner: -
982
- --
983
-
984
- CREATE POLICY ingested_files_insert_own ON public.ingested_files FOR INSERT WITH CHECK ((user_id = auth.uid()));
985
-
986
-
987
- --
988
- -- Name: ingested_files ingested_files_select_own; Type: POLICY; Schema: public; Owner: -
989
- --
990
-
991
- CREATE POLICY ingested_files_select_own ON public.ingested_files FOR SELECT USING ((user_id = auth.uid()));
992
-
993
-
994
- --
995
- -- Name: ingested_files ingested_files_update_own; Type: POLICY; Schema: public; Owner: -
996
- --
997
-
998
- CREATE POLICY ingested_files_update_own ON public.ingested_files FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
999
-
1000
-
1001
- --
1002
- -- Name: intent_feedback; Type: ROW SECURITY; Schema: public; Owner: -
1003
- --
1004
-
1005
- ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY;
1006
-
1007
- --
1008
- -- Name: intent_feedback intent_feedback_insert_own; Type: POLICY; Schema: public; Owner: -
1009
- --
1010
-
1011
- CREATE POLICY intent_feedback_insert_own ON public.intent_feedback FOR INSERT WITH CHECK ((user_id = auth.uid()));
1012
-
1013
-
1014
- --
1015
- -- Name: intent_feedback intent_feedback_select_own; Type: POLICY; Schema: public; Owner: -
1016
- --
1017
-
1018
- CREATE POLICY intent_feedback_select_own ON public.intent_feedback FOR SELECT USING ((user_id = auth.uid()));
1019
-
1020
-
1021
- --
1022
- -- Name: rerank_feedback; Type: ROW SECURITY; Schema: public; Owner: -
1023
- --
1024
-
1025
- ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
1026
-
1027
- --
1028
- -- Name: rerank_feedback rerank_feedback_select_own; Type: POLICY; Schema: public; Owner: -
1029
- --
1030
-
1031
- CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
1032
-
1033
-
1034
  --
1035
- -- PostgreSQL database dump complete
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1036
  --
1037
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  --
2
  -- PostgreSQL database dump
3
  --
4
+
5
+ \restrict 32urOXpOnsQS0zoo7jGTkIs0BeRgGPyJVLWPDJ6IexS9GSsM4lpkxJaAg6FM0Ua
6
+
7
+ -- Dumped from database version 17.6
8
+ -- Dumped by pg_dump version 18.3
9
+
10
+ SET statement_timeout = 0;
11
+ SET lock_timeout = 0;
12
+ SET idle_in_transaction_session_timeout = 0;
13
+ SET transaction_timeout = 0;
14
+ SET client_encoding = 'UTF8';
15
+ SET standard_conforming_strings = on;
16
+ SELECT pg_catalog.set_config('search_path', '', false);
17
+ SET check_function_bodies = false;
18
+ SET xmloption = content;
19
+ SET client_min_messages = warning;
20
+ SET row_security = off;
21
+
22
+ --
23
+ -- Name: public; Type: SCHEMA; Schema: -; Owner: -
24
+ --
25
+
26
+ CREATE SCHEMA public;
27
+
28
+
29
+ --
30
+ -- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: -
31
+ --
32
+
33
+ COMMENT ON SCHEMA public IS 'standard public schema';
34
+
35
+
36
+ --
37
+ -- Name: _trg_set_updated_at(); Type: FUNCTION; Schema: public; Owner: -
38
+ --
39
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  CREATE FUNCTION public._trg_set_updated_at() RETURNS trigger
41
  LANGUAGE plpgsql
42
+ SET search_path TO ''
43
+ AS $$
44
+ begin
45
  new.updated_at = pg_catalog.now();
46
+ return new;
47
+ end;
48
+ $$;
49
+
50
+
51
+ --
52
+ -- Name: get_document_types(); Type: FUNCTION; Schema: public; Owner: -
53
+ --
54
+
55
  CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
56
  LANGUAGE sql STABLE
57
+ SET search_path TO ''
58
  AS $$
59
  select distinct f.document_type
60
  from public.ingested_files as f
 
62
  and f.document_type is not null
63
  and f.document_type <> 'unknown'
64
  order by f.document_type;
65
+ $$;
66
+
67
+
68
+ --
69
  -- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
70
+ --
71
+
72
  CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
73
  LANGUAGE plpgsql
74
+ SET search_path TO ''
75
  AS $$
76
+ begin
77
+ return query
78
+ with
79
+ semantic as (
80
+ select
81
+ d.id,
82
+ d.content,
83
+ d.metadata,
84
  (
85
  1 - (
86
  d.embedding::extensions.halfvec(2048)
 
89
  )
90
  )::float as score
91
  from public.documents d
92
+ where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
93
  order by d.embedding::extensions.halfvec(2048)
94
  OPERATOR(extensions.<=>)
95
  query_embedding::extensions.halfvec(2048)
96
+ limit match_count * 3
97
+ ),
98
+ keyword as (
99
+ select
100
+ d.id,
101
+ d.content,
102
+ d.metadata,
103
  pg_catalog.ts_rank(
104
  pg_catalog.to_tsvector('english', d.content),
105
  pg_catalog.plainto_tsquery('english', query_text)
106
+ )::float as raw_score
107
  from public.documents d
108
+ where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
109
  and pg_catalog.to_tsvector('english', d.content) @@ pg_catalog.plainto_tsquery('english', query_text)
110
+ order by raw_score desc
111
+ limit match_count * 3
112
+ ),
113
+ keyword_norm as (
114
+ select
115
+ k.id,
116
+ k.content,
117
+ k.metadata,
118
+ case
119
+ when max(k.raw_score) over () = 0 then 0::float
120
+ else (k.raw_score / max(k.raw_score) over ())::float
121
+ end as score
122
+ from keyword k
123
+ ),
124
+ blended as (
125
+ select
126
+ coalesce(s.id, kn.id) as id,
127
+ coalesce(s.content, kn.content) as content,
128
+ coalesce(s.metadata, kn.metadata) as metadata,
129
+ (
130
+ coalesce(s.score, 0::float) * semantic_weight +
131
+ coalesce(kn.score, 0::float) * keyword_weight
132
+ ) as combined_score
133
+ from semantic s
134
+ full outer join keyword_norm kn on s.id = kn.id
135
+ )
136
+ select
137
+ b.id,
138
+ b.content,
139
+ b.metadata,
140
+ b.combined_score
141
+ from blended b
142
+ order by b.combined_score desc
143
+ limit match_count;
144
+ end;
145
+ $$;
146
+
147
+
148
+ --
149
+ -- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision, uuid); Type: FUNCTION; Schema: public; Owner: -
150
+ --
151
+
152
+ CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
153
+ LANGUAGE plpgsql
154
+ SET search_path TO ''
155
+ AS $$
156
+ BEGIN
157
+ RETURN QUERY
158
+ WITH
159
+ semantic AS (
160
+ SELECT
161
+ d.id,
162
+ d.content,
163
+ d.metadata,
164
+ (
165
+ 1 - (
166
+ d.embedding::extensions.halfvec(2048)
167
+ OPERATOR(extensions.<=>)
168
+ query_embedding::extensions.halfvec(2048)
169
+ )
170
+ )::float AS score
171
+ FROM public.documents AS d
172
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
173
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
174
+ ORDER BY d.embedding::extensions.halfvec(2048)
175
+ OPERATOR(extensions.<=>)
176
+ query_embedding::extensions.halfvec(2048)
177
+ LIMIT match_count * 3
178
+ ),
179
+ keyword AS (
180
+ SELECT
181
+ d.id,
182
+ d.content,
183
+ d.metadata,
184
+ pg_catalog.ts_rank(
185
+ pg_catalog.to_tsvector('english', d.content),
186
+ pg_catalog.plainto_tsquery('english', query_text)
187
+ )::float AS raw_score
188
+ FROM public.documents AS d
189
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
190
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
191
+ AND pg_catalog.to_tsvector('english', d.content)
192
+ @@ pg_catalog.plainto_tsquery('english', query_text)
193
+ ORDER BY raw_score DESC
194
+ LIMIT match_count * 3
195
+ ),
196
+ keyword_norm AS (
197
+ SELECT
198
+ k.id,
199
+ k.content,
200
+ k.metadata,
201
+ CASE
202
+ WHEN max(k.raw_score) OVER () = 0 THEN 0::float
203
+ ELSE (k.raw_score / max(k.raw_score) OVER ())::float
204
+ END AS score
205
+ FROM keyword AS k
206
+ ),
207
+ blended AS (
208
+ SELECT
209
+ COALESCE(s.id, kn.id) AS id,
210
+ COALESCE(s.content, kn.content) AS content,
211
+ COALESCE(s.metadata, kn.metadata) AS metadata,
212
+ (
213
+ COALESCE(s.score, 0::float) * semantic_weight +
214
+ COALESCE(kn.score, 0::float) * keyword_weight
215
+ ) AS combined_score
216
+ FROM semantic AS s
217
+ FULL OUTER JOIN keyword_norm AS kn ON s.id = kn.id
218
+ )
219
+ SELECT
220
+ b.id,
221
+ b.content,
222
+ b.metadata,
223
+ b.combined_score
224
+ FROM blended AS b
225
+ ORDER BY b.combined_score DESC
226
+ LIMIT match_count;
227
+ END;
228
+ $$;
229
+
230
+
231
+ --
232
  -- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
233
+ --
234
+
235
  CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
236
  LANGUAGE plpgsql SECURITY DEFINER
237
+ SET search_path TO ''
238
  AS $$
239
+ BEGIN
240
  INSERT INTO public.documents (id, content, metadata, embedding, user_id)
241
+ VALUES (p_id, p_content, p_metadata, p_embedding, p_user_id)
242
+ ON CONFLICT (id) DO UPDATE
243
+ SET content = EXCLUDED.content,
244
+ metadata = EXCLUDED.metadata,
245
+ embedding = EXCLUDED.embedding;
246
+ END;
247
+ $$;
248
+
249
+
250
+ --
251
  -- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
252
+ --
253
+
254
  CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
255
  LANGUAGE plpgsql SECURITY DEFINER
256
+ SET search_path TO ''
257
  AS $$
258
+ BEGIN
259
  INSERT INTO public.documents (
260
+ id, content, metadata, embedding, user_id,
261
+ node_type, parent_node_id, node_level
262
+ )
263
+ VALUES (
264
+ p_id, p_content, p_metadata, p_embedding, p_user_id,
265
+ p_node_type, p_parent_node_id, p_node_level
266
+ )
267
+ ON CONFLICT (id) DO UPDATE
268
+ SET content = EXCLUDED.content,
269
+ metadata = EXCLUDED.metadata,
270
+ embedding = EXCLUDED.embedding,
271
+ node_type = EXCLUDED.node_type,
272
+ parent_node_id = EXCLUDED.parent_node_id,
273
+ node_level = EXCLUDED.node_level;
274
+ END;
275
+ $$;
276
+
277
+
278
+ --
279
+ -- Name: insert_document_chunks_batch(jsonb); Type: FUNCTION; Schema: public; Owner: -
280
+ --
281
+
282
+ CREATE FUNCTION public.insert_document_chunks_batch(p_rows jsonb) RETURNS void
283
+ LANGUAGE plpgsql SECURITY DEFINER
284
+ SET search_path TO ''
285
+ AS $$
286
+ BEGIN
287
+ IF p_rows IS NULL OR jsonb_typeof(p_rows) <> 'array' THEN
288
+ RETURN;
289
+ END IF;
290
+
291
+ INSERT INTO public.documents (
292
+ id,
293
+ content,
294
+ metadata,
295
+ embedding,
296
+ user_id,
297
+ node_type,
298
+ parent_node_id,
299
+ node_level
300
+ )
301
+ SELECT
302
+ (row->>'id')::uuid,
303
+ row->>'content',
304
+ COALESCE(row->'metadata', '{}'::jsonb),
305
+ (row->'embedding')::text::extensions.vector,
306
+ (row->>'user_id')::uuid,
307
+ COALESCE(NULLIF(row->>'node_type', ''), 'leaf'),
308
+ NULLIF(row->>'parent_node_id', '')::uuid,
309
+ COALESCE(NULLIF(row->>'node_level', '')::integer, 0)
310
+ FROM jsonb_array_elements(p_rows) AS row
311
+ ON CONFLICT (id) DO UPDATE
312
+ SET content = EXCLUDED.content,
313
+ metadata = EXCLUDED.metadata,
314
+ embedding = EXCLUDED.embedding,
315
+ user_id = EXCLUDED.user_id,
316
+ node_type = EXCLUDED.node_type,
317
+ parent_node_id = EXCLUDED.parent_node_id,
318
+ node_level = EXCLUDED.node_level;
319
+ END;
320
+ $$;
321
+
322
+
323
+ --
324
  -- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
325
+ --
326
+
327
  CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
328
  LANGUAGE plpgsql
329
+ SET search_path TO ''
330
  AS $$
331
+ begin
332
+ return query
333
+ select
334
+ d.id,
335
+ d.content,
336
+ d.metadata,
337
+ (
338
+ 1 - (
339
+ d.embedding::extensions.halfvec(2048)
340
+ OPERATOR(extensions.<=>)
341
+ query_embedding::extensions.halfvec(2048)
342
+ )
343
+ )::float as similarity
344
  from public.documents d
345
+ where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
346
+ order by d.embedding::extensions.halfvec(2048)
347
+ OPERATOR(extensions.<=>)
348
+ query_embedding::extensions.halfvec(2048)
349
+ limit match_count;
350
+ end;
351
+ $$;
352
+
353
+
354
+ --
355
+ -- Name: match_documents(extensions.vector, integer, jsonb, uuid); Type: FUNCTION; Schema: public; Owner: -
356
+ --
357
+
358
+ CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
359
+ LANGUAGE plpgsql
360
+ SET search_path TO ''
361
+ AS $$
362
+ BEGIN
363
+ RETURN QUERY
364
+ SELECT
365
+ d.id,
366
+ d.content,
367
+ d.metadata,
368
+ (
369
+ 1 - (
370
+ d.embedding::extensions.halfvec(2048)
371
+ OPERATOR(extensions.<=>)
372
+ query_embedding::extensions.halfvec(2048)
373
+ )
374
+ )::float AS similarity
375
+ FROM public.documents AS d
376
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
377
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
378
+ ORDER BY d.embedding::extensions.halfvec(2048)
379
+ OPERATOR(extensions.<=>)
380
+ query_embedding::extensions.halfvec(2048)
381
+ LIMIT match_count;
382
+ END;
383
+ $$;
384
+
385
+
386
+ --
387
  -- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
388
+ --
389
+
390
  CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
391
  LANGUAGE plpgsql
392
+ SET search_path TO ''
393
+ AS $$
394
+ BEGIN
395
+ RETURN QUERY
396
+ SELECT
397
+ cm.id,
398
+ cm.role,
399
+ cm.content,
400
+ 1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
401
+ FROM public.chat_memory AS cm
402
+ WHERE cm.session_id = match_session_id
403
+ ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
404
+ LIMIT match_count;
405
+ END;
406
+ $$;
407
+
408
+
409
+ --
410
+ -- Name: match_memory(extensions.vector, text, integer, uuid); Type: FUNCTION; Schema: public; Owner: -
411
+ --
412
+
413
+ CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
414
+ LANGUAGE plpgsql
415
+ SET search_path TO ''
416
  AS $$
417
+ BEGIN
418
+ RETURN QUERY
419
+ SELECT
420
  cm.id,
421
  cm.role,
422
  cm.content,
423
  1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
424
  FROM public.chat_memory AS cm
425
  WHERE cm.session_id = match_session_id
426
+ AND (p_user_id IS NULL OR cm.user_id = p_user_id)
427
  ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
428
+ LIMIT match_count;
429
+ END;
430
+ $$;
431
+
432
+
433
+ --
434
+ -- Name: rls_auto_enable(); Type: FUNCTION; Schema: public; Owner: -
435
+ --
436
+
437
+ CREATE FUNCTION public.rls_auto_enable() RETURNS event_trigger
438
+ LANGUAGE plpgsql SECURITY DEFINER
439
+ SET search_path TO 'pg_catalog'
440
+ AS $$
441
+ DECLARE
442
+ cmd record;
443
+ BEGIN
444
+ FOR cmd IN
445
+ SELECT *
446
+ FROM pg_event_trigger_ddl_commands()
447
+ WHERE command_tag IN ('CREATE TABLE', 'CREATE TABLE AS', 'SELECT INTO')
448
+ AND object_type IN ('table','partitioned table')
449
+ LOOP
450
+ IF cmd.schema_name IS NOT NULL AND cmd.schema_name IN ('public') AND cmd.schema_name NOT IN ('pg_catalog','information_schema') AND cmd.schema_name NOT LIKE 'pg_toast%' AND cmd.schema_name NOT LIKE 'pg_temp%' THEN
451
+ BEGIN
452
+ EXECUTE format('alter table if exists %s enable row level security', cmd.object_identity);
453
+ RAISE LOG 'rls_auto_enable: enabled RLS on %', cmd.object_identity;
454
+ EXCEPTION
455
+ WHEN OTHERS THEN
456
+ RAISE LOG 'rls_auto_enable: failed to enable RLS on %', cmd.object_identity;
457
+ END;
458
+ ELSE
459
+ RAISE LOG 'rls_auto_enable: skip % (either system schema or not in enforced list: %.)', cmd.object_identity, cmd.schema_name;
460
+ END IF;
461
+ END LOOP;
462
+ END;
463
+ $$;
464
+
465
+
466
+ SET default_tablespace = '';
467
+
468
+ SET default_table_access_method = heap;
469
+
470
+ --
471
+ -- Name: answer_feedback; Type: TABLE; Schema: public; Owner: -
472
+ --
473
+
474
+ CREATE TABLE public.answer_feedback (
475
+ id bigint NOT NULL,
476
+ trace_id uuid NOT NULL,
477
+ user_id uuid,
478
+ helpful boolean,
479
+ accepted boolean,
480
+ reason_code text,
481
+ correction_text text,
482
+ promote_to_eval boolean DEFAULT false NOT NULL,
483
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
484
+ review_state text DEFAULT 'pending'::text NOT NULL,
485
+ review_notes text,
486
+ reviewed_at timestamp with time zone,
487
+ reviewed_by text,
488
+ promoted_at timestamp with time zone
489
+ );
490
+
491
+
492
+ --
493
+ -- Name: answer_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
494
+ --
495
+
496
+ ALTER TABLE public.answer_feedback ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
497
+ SEQUENCE NAME public.answer_feedback_id_seq
498
+ START WITH 1
499
+ INCREMENT BY 1
500
+ NO MINVALUE
501
+ NO MAXVALUE
502
+ CACHE 1
503
+ );
504
+
505
+
506
+ --
507
+ -- Name: category_centroids; Type: TABLE; Schema: public; Owner: -
508
+ --
509
+
510
+ CREATE TABLE public.category_centroids (
511
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
512
+ document_type text NOT NULL,
513
+ centroid_vector double precision[] NOT NULL,
514
+ document_count integer DEFAULT 1,
515
+ created_at timestamp with time zone DEFAULT now(),
516
+ updated_at timestamp with time zone DEFAULT now(),
517
+ user_id uuid DEFAULT auth.uid()
518
+ );
519
+
520
+
521
+ --
522
+ -- Name: chat_memory; Type: TABLE; Schema: public; Owner: -
523
+ --
524
+
525
+ CREATE TABLE public.chat_memory (
526
+ id uuid DEFAULT extensions.uuid_generate_v4() NOT NULL,
527
+ session_id text NOT NULL,
528
+ role text NOT NULL,
529
+ content text NOT NULL,
530
  embedding extensions.vector(2048),
531
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
532
+ user_id uuid DEFAULT auth.uid()
533
+ );
534
+
535
+
536
+ --
537
+ -- Name: document_trees; Type: TABLE; Schema: public; Owner: -
538
+ --
539
+
540
+ CREATE TABLE public.document_trees (
541
+ file_hash text NOT NULL,
542
+ user_id uuid NOT NULL,
543
+ tree_json jsonb NOT NULL,
544
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now())
545
+ );
546
+
547
+
548
+ --
549
+ -- Name: documents; Type: TABLE; Schema: public; Owner: -
550
+ --
551
+
552
+ CREATE TABLE public.documents (
553
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
554
+ content text,
555
+ metadata jsonb,
556
  embedding extensions.vector(2048),
557
+ user_id uuid DEFAULT auth.uid(),
558
+ node_type text DEFAULT 'leaf'::text,
559
+ parent_node_id uuid,
560
+ node_level integer DEFAULT 0
561
+ );
562
+
563
+
564
+ --
565
+ -- Name: evaluation_datasets; Type: TABLE; Schema: public; Owner: -
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
  --
567
 
568
+ CREATE TABLE public.evaluation_datasets (
569
+ id bigint NOT NULL,
570
+ trace_id uuid,
571
+ source text DEFAULT 'feedback_trace'::text NOT NULL,
572
+ question text NOT NULL,
573
+ gold_context_refs jsonb DEFAULT '[]'::jsonb NOT NULL,
574
+ gold_evidence_text text,
575
+ is_answerable boolean DEFAULT true NOT NULL,
576
+ failure_modes jsonb DEFAULT '[]'::jsonb NOT NULL,
577
+ doc_diagnostics jsonb DEFAULT '[]'::jsonb NOT NULL,
578
+ reason_code text,
579
+ is_active boolean DEFAULT false NOT NULL,
580
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
581
+ );
582
+
583
+
584
+ --
585
+ -- Name: evaluation_datasets_id_seq; Type: SEQUENCE; Schema: public; Owner: -
 
 
 
 
586
  --
587
 
588
+ ALTER TABLE public.evaluation_datasets ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
589
+ SEQUENCE NAME public.evaluation_datasets_id_seq
590
+ START WITH 1
591
+ INCREMENT BY 1
592
+ NO MINVALUE
593
+ NO MAXVALUE
594
+ CACHE 1
595
+ );
596
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
 
598
  --
599
+ -- Name: evaluation_logs; Type: TABLE; Schema: public; Owner: -
600
+ --
601
+
602
+ CREATE TABLE public.evaluation_logs (
603
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
604
+ run_label text,
605
+ evaluated_at timestamp with time zone,
606
+ alpha double precision,
607
+ k integer,
608
+ question text,
609
+ is_answerable boolean,
610
+ precision_at_k double precision,
611
+ faithfulness_proxy double precision,
612
+ relevance_proxy double precision,
613
+ local_reward double precision,
614
+ llm_judge_score double precision,
615
+ judge_a_verdict boolean,
616
+ judge_b_verdict boolean,
617
+ judge_a_model text,
618
+ judge_b_model text,
619
+ calibration_score double precision,
620
+ final_score double precision,
621
+ requires_manual_review boolean DEFAULT false,
622
+ disagreement_note text DEFAULT ''::text,
623
+ user_id uuid
624
+ );
625
 
626
 
627
  --
628
+ -- Name: graph_edges; Type: TABLE; Schema: public; Owner: -
629
  --
630
 
631
+ CREATE TABLE public.graph_edges (
632
+ id bigint NOT NULL,
633
+ user_id uuid,
634
+ source_node_key text NOT NULL,
635
+ target_node_key text NOT NULL,
636
+ edge_type text NOT NULL,
637
+ weight double precision DEFAULT 1.0 NOT NULL,
638
+ payload jsonb DEFAULT '{}'::jsonb NOT NULL,
639
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
640
+ );
641
+
642
 
643
  --
644
+ -- Name: graph_edges_id_seq; Type: SEQUENCE; Schema: public; Owner: -
645
  --
646
 
647
+ ALTER TABLE public.graph_edges ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
648
+ SEQUENCE NAME public.graph_edges_id_seq
649
+ START WITH 1
650
+ INCREMENT BY 1
651
+ NO MINVALUE
652
+ NO MAXVALUE
653
+ CACHE 1
654
+ );
655
 
656
 
657
  --
658
+ -- Name: graph_nodes; Type: TABLE; Schema: public; Owner: -
659
  --
660
 
661
+ CREATE TABLE public.graph_nodes (
662
+ id bigint NOT NULL,
663
+ user_id uuid,
664
+ node_key text NOT NULL,
665
+ node_type text NOT NULL,
666
+ label text NOT NULL,
667
+ payload jsonb DEFAULT '{}'::jsonb NOT NULL,
668
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
669
+ );
670
 
671
 
672
  --
673
+ -- Name: graph_nodes_id_seq; Type: SEQUENCE; Schema: public; Owner: -
674
  --
675
 
676
+ ALTER TABLE public.graph_nodes ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
677
+ SEQUENCE NAME public.graph_nodes_id_seq
678
+ START WITH 1
679
+ INCREMENT BY 1
680
+ NO MINVALUE
681
+ NO MAXVALUE
682
+ CACHE 1
683
+ );
684
 
685
 
686
  --
687
+ -- Name: ingested_files; Type: TABLE; Schema: public; Owner: -
688
  --
689
 
690
+ CREATE TABLE public.ingested_files (
691
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
692
+ file_hash text NOT NULL,
693
+ filename text NOT NULL,
694
+ document_type text,
695
+ chunk_count integer DEFAULT 0,
696
+ ingested_at timestamp with time zone DEFAULT now(),
697
+ user_id uuid DEFAULT auth.uid(),
698
+ user_overridden boolean DEFAULT false,
699
+ identity_json jsonb DEFAULT '{}'::jsonb NOT NULL
700
+ );
701
 
702
 
703
  --
704
+ -- Name: ingestion_retry_logs; Type: TABLE; Schema: public; Owner: -
705
  --
706
 
707
+ CREATE TABLE public.ingestion_retry_logs (
708
+ id bigint NOT NULL,
709
+ created_at timestamp with time zone DEFAULT now() NOT NULL,
710
+ user_id uuid,
711
+ file_hash text,
712
+ batch_num integer NOT NULL,
713
+ total_batches integer NOT NULL,
714
+ attempt integer NOT NULL,
715
+ event_type text NOT NULL,
716
+ message text,
717
+ sleep_s double precision DEFAULT 0
718
+ );
 
 
719
 
720
 
721
  --
722
+ -- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE; Schema: public; Owner: -
723
  --
724
 
725
+ CREATE SEQUENCE public.ingestion_retry_logs_id_seq
726
+ START WITH 1
727
+ INCREMENT BY 1
728
+ NO MINVALUE
729
+ NO MAXVALUE
730
+ CACHE 1;
731
+
732
 
733
  --
734
+ -- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
735
  --
736
 
737
+ ALTER SEQUENCE public.ingestion_retry_logs_id_seq OWNED BY public.ingestion_retry_logs.id;
738
 
739
 
740
  --
741
+ -- Name: intent_feedback; Type: TABLE; Schema: public; Owner: -
742
  --
743
 
744
+ CREATE TABLE public.intent_feedback (
745
+ id bigint NOT NULL,
746
+ user_id uuid,
747
+ query text NOT NULL,
748
+ has_category boolean DEFAULT false NOT NULL,
749
+ has_history boolean DEFAULT false NOT NULL,
750
+ label integer NOT NULL,
751
+ created_at timestamp with time zone DEFAULT now() NOT NULL,
752
+ CONSTRAINT intent_feedback_label_check CHECK ((label = ANY (ARRAY[0, 1])))
753
+ );
754
 
755
 
756
  --
757
+ -- Name: intent_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
758
  --
759
 
760
+ CREATE SEQUENCE public.intent_feedback_id_seq
761
+ START WITH 1
762
+ INCREMENT BY 1
763
+ NO MINVALUE
764
+ NO MAXVALUE
765
+ CACHE 1;
766
 
767
 
768
  --
769
+ -- Name: intent_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
770
  --
771
 
772
+ ALTER SEQUENCE public.intent_feedback_id_seq OWNED BY public.intent_feedback.id;
773
 
774
 
775
  --
776
+ -- Name: query_traces; Type: TABLE; Schema: public; Owner: -
777
  --
778
 
779
+ CREATE TABLE public.query_traces (
780
+ trace_id uuid DEFAULT gen_random_uuid() NOT NULL,
781
+ user_id uuid,
782
+ session_id text DEFAULT 'default_session'::text NOT NULL,
783
+ question text NOT NULL,
784
+ route_mode text DEFAULT 'default'::text NOT NULL,
785
+ selected_experts jsonb DEFAULT '[]'::jsonb NOT NULL,
786
+ expert_weights jsonb DEFAULT '{}'::jsonb NOT NULL,
787
+ pinned_file_hashes jsonb DEFAULT '[]'::jsonb NOT NULL,
788
+ candidate_counts jsonb DEFAULT '{}'::jsonb NOT NULL,
789
+ selected_chunk_ids jsonb DEFAULT '[]'::jsonb NOT NULL,
790
+ doc_diagnostics jsonb DEFAULT '[]'::jsonb NOT NULL,
791
+ failure_modes jsonb DEFAULT '[]'::jsonb NOT NULL,
792
+ quality_metrics jsonb DEFAULT '{}'::jsonb NOT NULL,
793
+ answer_hash text,
794
+ answer_preview text,
795
+ latency_ms integer,
796
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
797
+ review_state text DEFAULT 'pending'::text NOT NULL,
798
+ review_notes text,
799
+ reviewed_at timestamp with time zone,
800
+ reviewed_by text,
801
+ promoted_to_eval boolean DEFAULT false NOT NULL,
802
+ document_types jsonb DEFAULT '[]'::jsonb NOT NULL
803
+ );
804
+
805
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
806
  --
807
+ -- Name: rerank_feedback; Type: TABLE; Schema: public; Owner: -
808
+ --
809
+
810
+ CREATE TABLE public.rerank_feedback (
811
+ id bigint NOT NULL,
812
+ user_id uuid,
813
+ query_hash text NOT NULL,
814
+ chunk_id uuid,
815
+ chunk_hash text NOT NULL,
816
+ document_type text,
817
+ cohere_score real NOT NULL,
818
+ was_selected boolean NOT NULL,
819
+ created_at timestamp with time zone DEFAULT now() NOT NULL,
820
+ query_text text,
821
+ chunk_text text
822
+ );
823
+
824
+
825
+ --
826
+ -- Name: rerank_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
827
+ --
828
+
829
+ CREATE SEQUENCE public.rerank_feedback_id_seq
830
+ START WITH 1
831
+ INCREMENT BY 1
832
+ NO MINVALUE
833
+ NO MAXVALUE
834
+ CACHE 1;
835
+
836
+
837
+ --
838
+ -- Name: rerank_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
839
+ --
840
+
841
+ ALTER SEQUENCE public.rerank_feedback_id_seq OWNED BY public.rerank_feedback.id;
842
+
843
+
844
+ --
845
+ -- Name: ingestion_retry_logs id; Type: DEFAULT; Schema: public; Owner: -
846
+ --
847
+
848
+ ALTER TABLE ONLY public.ingestion_retry_logs ALTER COLUMN id SET DEFAULT nextval('public.ingestion_retry_logs_id_seq'::regclass);
849
+
850
+
851
+ --
852
+ -- Name: intent_feedback id; Type: DEFAULT; Schema: public; Owner: -
853
+ --
854
+
855
+ ALTER TABLE ONLY public.intent_feedback ALTER COLUMN id SET DEFAULT nextval('public.intent_feedback_id_seq'::regclass);
856
+
857
+
858
+ --
859
+ -- Name: rerank_feedback id; Type: DEFAULT; Schema: public; Owner: -
860
+ --
861
+
862
+ ALTER TABLE ONLY public.rerank_feedback ALTER COLUMN id SET DEFAULT nextval('public.rerank_feedback_id_seq'::regclass);
863
+
864
+
865
+ --
866
+ -- Name: answer_feedback answer_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
867
+ --
868
+
869
+ ALTER TABLE ONLY public.answer_feedback
870
+ ADD CONSTRAINT answer_feedback_pkey PRIMARY KEY (id);
871
+
872
+
873
+ --
874
+ -- Name: category_centroids category_centroids_document_type_key; Type: CONSTRAINT; Schema: public; Owner: -
875
+ --
876
+
877
+ ALTER TABLE ONLY public.category_centroids
878
+ ADD CONSTRAINT category_centroids_document_type_key UNIQUE (document_type);
879
+
880
+
881
+ --
882
+ -- Name: category_centroids category_centroids_pkey; Type: CONSTRAINT; Schema: public; Owner: -
883
+ --
884
+
885
+ ALTER TABLE ONLY public.category_centroids
886
+ ADD CONSTRAINT category_centroids_pkey PRIMARY KEY (id);
887
+
888
+
889
+ --
890
+ -- Name: chat_memory chat_memory_pkey; Type: CONSTRAINT; Schema: public; Owner: -
891
+ --
892
+
893
+ ALTER TABLE ONLY public.chat_memory
894
+ ADD CONSTRAINT chat_memory_pkey PRIMARY KEY (id);
895
+
896
+
897
+ --
898
+ -- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
899
+ --
900
+
901
+ ALTER TABLE ONLY public.document_trees
902
+ ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
903
+
904
+
905
+ --
906
+ -- Name: documents documents_pkey; Type: CONSTRAINT; Schema: public; Owner: -
907
+ --
908
+
909
+ ALTER TABLE ONLY public.documents
910
+ ADD CONSTRAINT documents_pkey PRIMARY KEY (id);
911
+
912
+
913
+ --
914
+ -- Name: evaluation_datasets evaluation_datasets_pkey; Type: CONSTRAINT; Schema: public; Owner: -
915
+ --
916
+
917
+ ALTER TABLE ONLY public.evaluation_datasets
918
+ ADD CONSTRAINT evaluation_datasets_pkey PRIMARY KEY (id);
919
+
920
+
921
+ --
922
+ -- Name: evaluation_datasets evaluation_datasets_trace_id_key; Type: CONSTRAINT; Schema: public; Owner: -
923
+ --
924
+
925
+ ALTER TABLE ONLY public.evaluation_datasets
926
+ ADD CONSTRAINT evaluation_datasets_trace_id_key UNIQUE (trace_id);
927
+
928
+
929
  --
930
+ -- Name: evaluation_logs evaluation_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
931
+ --
932
+
933
+ ALTER TABLE ONLY public.evaluation_logs
934
+ ADD CONSTRAINT evaluation_logs_pkey PRIMARY KEY (id);
935
+
936
+
937
+ --
938
+ -- Name: graph_edges graph_edges_pkey; Type: CONSTRAINT; Schema: public; Owner: -
939
+ --
940
+
941
+ ALTER TABLE ONLY public.graph_edges
942
+ ADD CONSTRAINT graph_edges_pkey PRIMARY KEY (id);
943
+
944
+
945
+ --
946
+ -- Name: graph_edges graph_edges_user_id_source_node_key_target_node_key_edge_ty_key; Type: CONSTRAINT; Schema: public; Owner: -
947
+ --
948
+
949
+ ALTER TABLE ONLY public.graph_edges
950
+ ADD CONSTRAINT graph_edges_user_id_source_node_key_target_node_key_edge_ty_key UNIQUE (user_id, source_node_key, target_node_key, edge_type);
951
+
952
+
953
+ --
954
+ -- Name: graph_nodes graph_nodes_pkey; Type: CONSTRAINT; Schema: public; Owner: -
955
+ --
956
+
957
+ ALTER TABLE ONLY public.graph_nodes
958
+ ADD CONSTRAINT graph_nodes_pkey PRIMARY KEY (id);
959
+
960
+
961
+ --
962
+ -- Name: graph_nodes graph_nodes_user_id_node_key_key; Type: CONSTRAINT; Schema: public; Owner: -
963
+ --
964
+
965
+ ALTER TABLE ONLY public.graph_nodes
966
+ ADD CONSTRAINT graph_nodes_user_id_node_key_key UNIQUE (user_id, node_key);
967
+
968
+
969
+ --
970
+ -- Name: ingested_files ingested_files_pkey; Type: CONSTRAINT; Schema: public; Owner: -
971
+ --
972
+
973
+ ALTER TABLE ONLY public.ingested_files
974
+ ADD CONSTRAINT ingested_files_pkey PRIMARY KEY (id);
975
+
976
+
977
+ --
978
+ -- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
979
+ --
980
+
981
+ ALTER TABLE ONLY public.ingested_files
982
+ ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
983
+
984
+
985
+ --
986
+ -- Name: ingestion_retry_logs ingestion_retry_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
987
+ --
988
+
989
+ ALTER TABLE ONLY public.ingestion_retry_logs
990
+ ADD CONSTRAINT ingestion_retry_logs_pkey PRIMARY KEY (id);
991
+
992
+
993
+ --
994
+ -- Name: intent_feedback intent_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
995
+ --
996
+
997
+ ALTER TABLE ONLY public.intent_feedback
998
+ ADD CONSTRAINT intent_feedback_pkey PRIMARY KEY (id);
999
+
1000
+
1001
+ --
1002
+ -- Name: query_traces query_traces_pkey; Type: CONSTRAINT; Schema: public; Owner: -
1003
+ --
1004
+
1005
+ ALTER TABLE ONLY public.query_traces
1006
+ ADD CONSTRAINT query_traces_pkey PRIMARY KEY (trace_id);
1007
+
1008
+
1009
+ --
1010
+ -- Name: rerank_feedback rerank_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
1011
+ --
1012
+
1013
+ ALTER TABLE ONLY public.rerank_feedback
1014
+ ADD CONSTRAINT rerank_feedback_pkey PRIMARY KEY (id);
1015
+
1016
+
1017
+ --
1018
+ -- Name: category_centroids_type_idx; Type: INDEX; Schema: public; Owner: -
1019
+ --
1020
+
1021
+ CREATE INDEX category_centroids_type_idx ON public.category_centroids USING btree (document_type);
1022
+
1023
+
1024
+ --
1025
+ -- Name: category_centroids_user_id_idx; Type: INDEX; Schema: public; Owner: -
1026
+ --
1027
+
1028
+ CREATE INDEX category_centroids_user_id_idx ON public.category_centroids USING btree (user_id);
1029
+
1030
+
1031
+ --
1032
+ -- Name: category_centroids_user_type_uidx; Type: INDEX; Schema: public; Owner: -
1033
+ --
1034
+
1035
+ CREATE UNIQUE INDEX category_centroids_user_type_uidx ON public.category_centroids USING btree (user_id, document_type);
1036
+
1037
+
1038
+ --
1039
+ -- Name: chat_memory_user_id_idx; Type: INDEX; Schema: public; Owner: -
1040
+ --
1041
+
1042
+ CREATE INDEX chat_memory_user_id_idx ON public.chat_memory USING btree (user_id);
1043
+
1044
+
1045
+ --
1046
+ -- Name: doc_node_type_idx; Type: INDEX; Schema: public; Owner: -
1047
+ --
1048
+
1049
+ CREATE INDEX doc_node_type_idx ON public.documents USING btree (node_type);
1050
+
1051
+
1052
+ --
1053
+ -- Name: documents_content_fts_idx; Type: INDEX; Schema: public; Owner: -
1054
+ --
1055
+
1056
+ CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvector('english'::regconfig, content));
1057
+
1058
+
1059
+ --
1060
+ -- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
1061
+ --
1062
+
1063
+ CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
1064
+
1065
+
1066
+ --
1067
+ -- Name: documents_metadata_filehash_idx; Type: INDEX; Schema: public; Owner: -
1068
+ --
1069
+
1070
+ CREATE INDEX documents_metadata_filehash_idx ON public.documents USING btree (((metadata ->> 'file_hash'::text)));
1071
+
1072
+
1073
+ --
1074
+ -- Name: documents_metadata_idx; Type: INDEX; Schema: public; Owner: -
1075
+ --
1076
+
1077
+ CREATE INDEX documents_metadata_idx ON public.documents USING gin (metadata);
1078
+
1079
+
1080
+ --
1081
+ -- Name: documents_user_id_idx; Type: INDEX; Schema: public; Owner: -
1082
+ --
1083
+
1084
+ CREATE INDEX documents_user_id_idx ON public.documents USING btree (user_id);
1085
+
1086
+
1087
+ --
1088
+ -- Name: evaluation_logs_evaluated_at_idx; Type: INDEX; Schema: public; Owner: -
1089
+ --
1090
+
1091
+ CREATE INDEX evaluation_logs_evaluated_at_idx ON public.evaluation_logs USING btree (evaluated_at DESC);
1092
+
1093
+
1094
+ --
1095
+ -- Name: evaluation_logs_run_label_idx; Type: INDEX; Schema: public; Owner: -
1096
+ --
1097
+
1098
+ CREATE INDEX evaluation_logs_run_label_idx ON public.evaluation_logs USING btree (run_label);
1099
+
1100
+
1101
+ --
1102
+ -- Name: idx_answer_feedback_review_state_created; Type: INDEX; Schema: public; Owner: -
1103
+ --
1104
+
1105
+ CREATE INDEX idx_answer_feedback_review_state_created ON public.answer_feedback USING btree (review_state, created_at DESC);
1106
+
1107
+
1108
+ --
1109
+ -- Name: idx_answer_feedback_trace_created; Type: INDEX; Schema: public; Owner: -
1110
+ --
1111
+
1112
+ CREATE INDEX idx_answer_feedback_trace_created ON public.answer_feedback USING btree (trace_id, created_at DESC);
1113
+
1114
+
1115
+ --
1116
+ -- Name: idx_answer_feedback_user_created; Type: INDEX; Schema: public; Owner: -
1117
+ --
1118
+
1119
+ CREATE INDEX idx_answer_feedback_user_created ON public.answer_feedback USING btree (user_id, created_at DESC);
1120
+
1121
+
1122
+ --
1123
+ -- Name: idx_chat_memory_session; Type: INDEX; Schema: public; Owner: -
1124
+ --
1125
+
1126
+ CREATE INDEX idx_chat_memory_session ON public.chat_memory USING btree (session_id);
1127
+
1128
+
1129
+ --
1130
+ -- Name: idx_document_trees_json; Type: INDEX; Schema: public; Owner: -
1131
+ --
1132
+
1133
+ CREATE INDEX idx_document_trees_json ON public.document_trees USING gin (tree_json);
1134
+
1135
+
1136
+ --
1137
+ -- Name: idx_evaluation_datasets_active_created; Type: INDEX; Schema: public; Owner: -
1138
+ --
1139
+
1140
+ CREATE INDEX idx_evaluation_datasets_active_created ON public.evaluation_datasets USING btree (is_active, created_at DESC);
1141
+
1142
+
1143
+ --
1144
+ -- Name: idx_graph_edges_user_source; Type: INDEX; Schema: public; Owner: -
1145
+ --
1146
+
1147
+ CREATE INDEX idx_graph_edges_user_source ON public.graph_edges USING btree (user_id, source_node_key);
1148
+
1149
+
1150
+ --
1151
+ -- Name: idx_graph_edges_user_target; Type: INDEX; Schema: public; Owner: -
1152
+ --
1153
+
1154
+ CREATE INDEX idx_graph_edges_user_target ON public.graph_edges USING btree (user_id, target_node_key);
1155
+
1156
+
1157
+ --
1158
+ -- Name: idx_graph_nodes_user_label; Type: INDEX; Schema: public; Owner: -
1159
+ --
1160
+
1161
+ CREATE INDEX idx_graph_nodes_user_label ON public.graph_nodes USING btree (user_id, label);
1162
+
1163
+
1164
+ --
1165
+ -- Name: idx_graph_nodes_user_type; Type: INDEX; Schema: public; Owner: -
1166
+ --
1167
+
1168
+ CREATE INDEX idx_graph_nodes_user_type ON public.graph_nodes USING btree (user_id, node_type);
1169
+
1170
+
1171
+ --
1172
+ -- Name: idx_query_traces_review_state_created; Type: INDEX; Schema: public; Owner: -
1173
+ --
1174
+
1175
+ CREATE INDEX idx_query_traces_review_state_created ON public.query_traces USING btree (review_state, created_at DESC);
1176
+
1177
+
1178
+ --
1179
+ -- Name: idx_query_traces_session_created; Type: INDEX; Schema: public; Owner: -
1180
+ --
1181
+
1182
+ CREATE INDEX idx_query_traces_session_created ON public.query_traces USING btree (session_id, created_at DESC);
1183
+
1184
+
1185
+ --
1186
+ -- Name: idx_query_traces_user_created; Type: INDEX; Schema: public; Owner: -
1187
+ --
1188
+
1189
+ CREATE INDEX idx_query_traces_user_created ON public.query_traces USING btree (user_id, created_at DESC);
1190
+
1191
+
1192
+ --
1193
+ -- Name: ingested_files_hash_idx; Type: INDEX; Schema: public; Owner: -
1194
+ --
1195
+
1196
+ CREATE INDEX ingested_files_hash_idx ON public.ingested_files USING btree (file_hash);
1197
+
1198
+
1199
+ --
1200
+ -- Name: ingested_files_user_file_hash_uidx; Type: INDEX; Schema: public; Owner: -
1201
+ --
1202
+
1203
+ CREATE UNIQUE INDEX ingested_files_user_file_hash_uidx ON public.ingested_files USING btree (user_id, file_hash);
1204
+
1205
+
1206
+ --
1207
+ -- Name: ingested_files_user_id_idx; Type: INDEX; Schema: public; Owner: -
1208
+ --
1209
+
1210
+ CREATE INDEX ingested_files_user_id_idx ON public.ingested_files USING btree (user_id);
1211
+
1212
+
1213
+ --
1214
+ -- Name: ingestion_retry_logs_created_at_idx; Type: INDEX; Schema: public; Owner: -
1215
+ --
1216
+
1217
+ CREATE INDEX ingestion_retry_logs_created_at_idx ON public.ingestion_retry_logs USING btree (created_at DESC);
1218
+
1219
+
1220
+ --
1221
+ -- Name: ingestion_retry_logs_user_file_event_idx; Type: INDEX; Schema: public; Owner: -
1222
+ --
1223
+
1224
+ CREATE INDEX ingestion_retry_logs_user_file_event_idx ON public.ingestion_retry_logs USING btree (user_id, file_hash, event_type, created_at DESC);
1225
+
1226
+
1227
+ --
1228
+ -- Name: ingestion_retry_logs_user_id_idx; Type: INDEX; Schema: public; Owner: -
1229
+ --
1230
+
1231
+ CREATE INDEX ingestion_retry_logs_user_id_idx ON public.ingestion_retry_logs USING btree (user_id);
1232
+
1233
+
1234
+ --
1235
+ -- Name: intent_feedback_user_id_idx; Type: INDEX; Schema: public; Owner: -
1236
+ --
1237
+
1238
+ CREATE INDEX intent_feedback_user_id_idx ON public.intent_feedback USING btree (user_id);
1239
+
1240
+
1241
+ --
1242
+ -- Name: rerank_feedback_doc_type_idx; Type: INDEX; Schema: public; Owner: -
1243
+ --
1244
+
1245
+ CREATE INDEX rerank_feedback_doc_type_idx ON public.rerank_feedback USING btree (document_type);
1246
+
1247
+
1248
+ --
1249
+ -- Name: rerank_feedback_user_created_idx; Type: INDEX; Schema: public; Owner: -
1250
+ --
1251
+
1252
+ CREATE INDEX rerank_feedback_user_created_idx ON public.rerank_feedback USING btree (user_id, created_at DESC);
1253
+
1254
+
1255
+ --
1256
+ -- Name: category_centroids trg_centroids_updated_at; Type: TRIGGER; Schema: public; Owner: -
1257
+ --
1258
+
1259
+ CREATE TRIGGER trg_centroids_updated_at BEFORE UPDATE ON public.category_centroids FOR EACH ROW EXECUTE FUNCTION public._trg_set_updated_at();
1260
+
1261
+
1262
+ --
1263
+ -- Name: answer_feedback answer_feedback_trace_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: -
1264
+ --
1265
+
1266
+ ALTER TABLE ONLY public.answer_feedback
1267
+ ADD CONSTRAINT answer_feedback_trace_id_fkey FOREIGN KEY (trace_id) REFERENCES public.query_traces(trace_id) ON DELETE CASCADE;
1268
+
1269
+
1270
+ --
1271
+ -- Name: evaluation_datasets evaluation_datasets_trace_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: -
1272
+ --
1273
+
1274
+ ALTER TABLE ONLY public.evaluation_datasets
1275
+ ADD CONSTRAINT evaluation_datasets_trace_id_fkey FOREIGN KEY (trace_id) REFERENCES public.query_traces(trace_id) ON DELETE SET NULL;
1276
+
1277
+
1278
+ --
1279
+ -- Name: answer_feedback; Type: ROW SECURITY; Schema: public; Owner: -
1280
+ --
1281
+
1282
+ ALTER TABLE public.answer_feedback ENABLE ROW LEVEL SECURITY;
1283
+
1284
+ --
1285
+ -- Name: category_centroids; Type: ROW SECURITY; Schema: public; Owner: -
1286
+ --
1287
+
1288
+ ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY;
1289
+
1290
+ --
1291
+ -- Name: chat_memory; Type: ROW SECURITY; Schema: public; Owner: -
1292
+ --
1293
+
1294
+ ALTER TABLE public.chat_memory ENABLE ROW LEVEL SECURITY;
1295
+
1296
+ --
1297
+ -- Name: chat_memory chat_memory_delete_own; Type: POLICY; Schema: public; Owner: -
1298
+ --
1299
+
1300
+ CREATE POLICY chat_memory_delete_own ON public.chat_memory FOR DELETE USING ((user_id = auth.uid()));
1301
+
1302
+
1303
+ --
1304
+ -- Name: chat_memory chat_memory_insert_own; Type: POLICY; Schema: public; Owner: -
1305
+ --
1306
+
1307
+ CREATE POLICY chat_memory_insert_own ON public.chat_memory FOR INSERT WITH CHECK ((user_id = auth.uid()));
1308
+
1309
+
1310
+ --
1311
+ -- Name: chat_memory chat_memory_select_own; Type: POLICY; Schema: public; Owner: -
1312
+ --
1313
+
1314
+ CREATE POLICY chat_memory_select_own ON public.chat_memory FOR SELECT USING ((user_id = auth.uid()));
1315
+
1316
+
1317
+ --
1318
+ -- Name: chat_memory chat_memory_update_own; Type: POLICY; Schema: public; Owner: -
1319
+ --
1320
+
1321
+ CREATE POLICY chat_memory_update_own ON public.chat_memory FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
1322
+
1323
+
1324
+ --
1325
+ -- Name: document_trees; Type: ROW SECURITY; Schema: public; Owner: -
1326
+ --
1327
+
1328
+ ALTER TABLE public.document_trees ENABLE ROW LEVEL SECURITY;
1329
+
1330
+ --
1331
+ -- Name: document_trees document_trees_delete_own; Type: POLICY; Schema: public; Owner: -
1332
+ --
1333
+
1334
+ CREATE POLICY document_trees_delete_own ON public.document_trees FOR DELETE USING ((user_id = auth.uid()));
1335
+
1336
+
1337
+ --
1338
+ -- Name: document_trees document_trees_insert_own; Type: POLICY; Schema: public; Owner: -
1339
+ --
1340
+
1341
+ CREATE POLICY document_trees_insert_own ON public.document_trees FOR INSERT WITH CHECK ((user_id = auth.uid()));
1342
+
1343
+
1344
+ --
1345
+ -- Name: document_trees document_trees_select_own; Type: POLICY; Schema: public; Owner: -
1346
+ --
1347
+
1348
+ CREATE POLICY document_trees_select_own ON public.document_trees FOR SELECT USING ((user_id = auth.uid()));
1349
+
1350
+
1351
+ --
1352
+ -- Name: document_trees document_trees_update_own; Type: POLICY; Schema: public; Owner: -
1353
+ --
1354
+
1355
+ CREATE POLICY document_trees_update_own ON public.document_trees FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
1356
+
1357
+
1358
+ --
1359
+ -- Name: documents; Type: ROW SECURITY; Schema: public; Owner: -
1360
+ --
1361
+
1362
+ ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
1363
+
1364
+ --
1365
+ -- Name: documents documents_delete_own; Type: POLICY; Schema: public; Owner: -
1366
+ --
1367
+
1368
+ CREATE POLICY documents_delete_own ON public.documents FOR DELETE USING ((user_id = auth.uid()));
1369
+
1370
+
1371
+ --
1372
+ -- Name: documents documents_insert_own; Type: POLICY; Schema: public; Owner: -
1373
+ --
1374
+
1375
+ CREATE POLICY documents_insert_own ON public.documents FOR INSERT WITH CHECK ((user_id = auth.uid()));
1376
+
1377
+
1378
+ --
1379
+ -- Name: documents documents_select_own; Type: POLICY; Schema: public; Owner: -
1380
+ --
1381
+
1382
+ CREATE POLICY documents_select_own ON public.documents FOR SELECT USING ((user_id = auth.uid()));
1383
+
1384
+
1385
+ --
1386
+ -- Name: documents documents_update_own; Type: POLICY; Schema: public; Owner: -
1387
+ --
1388
+
1389
+ CREATE POLICY documents_update_own ON public.documents FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
1390
+
1391
+
1392
+ --
1393
+ -- Name: evaluation_datasets; Type: ROW SECURITY; Schema: public; Owner: -
1394
+ --
1395
+
1396
+ ALTER TABLE public.evaluation_datasets ENABLE ROW LEVEL SECURITY;
1397
+
1398
+ --
1399
+ -- Name: evaluation_logs; Type: ROW SECURITY; Schema: public; Owner: -
1400
+ --
1401
+
1402
+ ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY;
1403
+
1404
+ --
1405
+ -- Name: graph_edges; Type: ROW SECURITY; Schema: public; Owner: -
1406
+ --
1407
+
1408
+ ALTER TABLE public.graph_edges ENABLE ROW LEVEL SECURITY;
1409
+
1410
+ --
1411
+ -- Name: graph_nodes; Type: ROW SECURITY; Schema: public; Owner: -
1412
+ --
1413
+
1414
+ ALTER TABLE public.graph_nodes ENABLE ROW LEVEL SECURITY;
1415
+
1416
+ --
1417
+ -- Name: ingested_files; Type: ROW SECURITY; Schema: public; Owner: -
1418
+ --
1419
+
1420
+ ALTER TABLE public.ingested_files ENABLE ROW LEVEL SECURITY;
1421
+
1422
+ --
1423
+ -- Name: ingested_files ingested_files_delete_own; Type: POLICY; Schema: public; Owner: -
1424
+ --
1425
+
1426
+ CREATE POLICY ingested_files_delete_own ON public.ingested_files FOR DELETE USING ((user_id = auth.uid()));
1427
+
1428
+
1429
+ --
1430
+ -- Name: ingested_files ingested_files_insert_own; Type: POLICY; Schema: public; Owner: -
1431
+ --
1432
+
1433
+ CREATE POLICY ingested_files_insert_own ON public.ingested_files FOR INSERT WITH CHECK ((user_id = auth.uid()));
1434
+
1435
+
1436
+ --
1437
+ -- Name: ingested_files ingested_files_select_own; Type: POLICY; Schema: public; Owner: -
1438
+ --
1439
+
1440
+ CREATE POLICY ingested_files_select_own ON public.ingested_files FOR SELECT USING ((user_id = auth.uid()));
1441
+
1442
+
1443
+ --
1444
+ -- Name: ingested_files ingested_files_update_own; Type: POLICY; Schema: public; Owner: -
1445
+ --
1446
+
1447
+ CREATE POLICY ingested_files_update_own ON public.ingested_files FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
1448
+
1449
+
1450
+ --
1451
+ -- Name: ingestion_retry_logs; Type: ROW SECURITY; Schema: public; Owner: -
1452
+ --
1453
+
1454
+ ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY;
1455
+
1456
+ --
1457
+ -- Name: intent_feedback; Type: ROW SECURITY; Schema: public; Owner: -
1458
+ --
1459
+
1460
+ ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY;
1461
+
1462
+ --
1463
+ -- Name: query_traces; Type: ROW SECURITY; Schema: public; Owner: -
1464
+ --
1465
+
1466
+ ALTER TABLE public.query_traces ENABLE ROW LEVEL SECURITY;
1467
+
1468
+ --
1469
+ -- Name: rerank_feedback; Type: ROW SECURITY; Schema: public; Owner: -
1470
+ --
1471
+
1472
+ ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
1473
+
1474
+ --
1475
+ -- PostgreSQL database dump complete
1476
+ --
1477
+
1478
+ \unrestrict 32urOXpOnsQS0zoo7jGTkIs0BeRgGPyJVLWPDJ6IexS9GSsM4lpkxJaAg6FM0Ua
tests/test_guest_mode.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import jwt
2
+ from starlette.requests import Request
3
+
4
+ from backend.core.auth_utils import is_guest_token
5
+ from backend.main import _rate_limit_key
6
+
7
+ _TEST_GUEST_KEY = "guest-secret-key-that-is-long-enough"
8
+ _TEST_USER_KEY = "user-secret-key-that-is-long-enough"
9
+
10
+
11
def _make_request(headers: dict[str, str], client_ip: str = "127.0.0.1") -> Request:
    """Build a bare Starlette Request carrying the given headers and client IP."""
    # Starlette expects raw ASGI headers: lowercase latin-1 encoded byte pairs.
    encoded_headers = [
        (name.lower().encode("latin-1"), value.encode("latin-1"))
        for name, value in headers.items()
    ]
    scope = {
        "type": "http",
        "method": "GET",
        "path": "/",
        "scheme": "http",
        "client": (client_ip, 4321),
        "server": ("testserver", 80),
        "headers": encoded_headers,
    }
    return Request(scope)
25
+
26
+
27
def test_is_guest_token_detects_anonymous_provider():
    """A Supabase anonymous-provider JWT must be classified as a guest token."""
    claims = {
        "sub": "11111111-1111-1111-1111-111111111111",
        "app_metadata": {"provider": "anonymous", "providers": ["anonymous"]},
    }
    token = jwt.encode(claims, _TEST_GUEST_KEY, algorithm="HS256")
    assert is_guest_token(token) is True
37
+
38
+
39
def test_is_guest_token_ignores_regular_authenticated_user():
    """An email-provider JWT belongs to a real account, not a guest."""
    claims = {
        "sub": "22222222-2222-2222-2222-222222222222",
        "app_metadata": {"provider": "email", "providers": ["email"]},
    }
    token = jwt.encode(claims, _TEST_USER_KEY, algorithm="HS256")
    assert is_guest_token(token) is False
49
+
50
+
51
def test_rate_limit_key_uses_ip_for_guest_tokens():
    """Guest requests are rate-limited per client IP, not per ephemeral token."""
    claims = {
        "sub": "33333333-3333-3333-3333-333333333333",
        "app_metadata": {"provider": "anonymous"},
    }
    token = jwt.encode(claims, _TEST_GUEST_KEY, algorithm="HS256")
    request = _make_request({"X-Auth-Token": token}, client_ip="10.0.0.8")
    assert _rate_limit_key(request) == "10.0.0.8"
62
+
63
+
64
def test_rate_limit_key_uses_token_for_regular_users():
    """Authenticated requests are rate-limited per bearer token, not per IP."""
    claims = {
        "sub": "44444444-4444-4444-4444-444444444444",
        "app_metadata": {"provider": "email"},
    }
    token = jwt.encode(claims, _TEST_USER_KEY, algorithm="HS256")
    request = _make_request({"Authorization": f"Bearer {token}"}, client_ip="10.0.0.8")
    assert _rate_limit_key(request) == token
tests/test_ingest_api.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import sys
4
+ import tempfile
5
+ from types import SimpleNamespace
6
+
7
+ import pytest
8
+ from fastapi import HTTPException
9
+ from starlette.requests import Request
10
+
11
+ from backend.api import ingest as ingest_api
12
+ from backend.core import pipeline, tasks
13
+
14
+
15
class FakeUploadFile:
    """In-memory async stand-in for FastAPI's UploadFile."""

    def __init__(self, filename: str, content: bytes):
        self.filename = filename
        self._content = content
        self._cursor = 0  # current read offset into the buffer

    async def read(self, size: int = -1) -> bytes:
        """Return up to *size* bytes from the cursor; negative/None means the rest."""
        remaining = len(self._content) - self._cursor
        count = remaining if size is None or size < 0 else min(size, remaining)
        chunk = self._content[self._cursor : self._cursor + count]
        self._cursor += count
        return chunk

    async def seek(self, offset: int) -> None:
        """Move the cursor, clamping negative offsets to the buffer start."""
        self._cursor = max(0, offset)
31
+
32
+
33
class FakeCountQuery:
    """Chainable query-builder stub that always reports a fixed row count."""

    def __init__(self, count: int):
        self.count = count

    def select(self, *_args, **_kwargs):
        # Projection is irrelevant to the stub; just keep the chain going.
        return self

    def eq(self, *_args, **_kwargs):
        # Filters are ignored; the canned count is returned regardless.
        return self

    def execute(self):
        # Mimics the supabase-py response shape: an object with a .count field.
        return SimpleNamespace(count=self.count)
45
+
46
+
47
class FakeCountSupabase:
    """Supabase client stub whose every table reports the same row count."""

    def __init__(self, count: int = 0):
        self.count = count

    def table(self, _name: str):
        # The table name is ignored; all tables share the configured count.
        return FakeCountQuery(self.count)
53
+
54
+
55
+ def _install_fake_magic(monkeypatch):
56
+ monkeypatch.setitem(
57
+ sys.modules,
58
+ "magic",
59
+ SimpleNamespace(from_buffer=lambda *_args, **_kwargs: "application/pdf"),
60
+ )
61
+
62
+
63
def _fake_request() -> Request:
    """Build a minimal Starlette Request aimed at the upload endpoint."""
    return Request(
        {
            "type": "http",
            "method": "POST",
            "path": "/api/v1/ingest/upload",
            "headers": [],
            "client": ("127.0.0.1", 12345),
        }
    )
73
+
74
+
75
def test_upload_rejects_large_pdf_with_original_http_status(monkeypatch):
    """An oversized PDF must be rejected with 413 before any task is queued."""
    _install_fake_magic(monkeypatch)
    monkeypatch.setattr(
        pipeline,
        "_build_supabase_client",
        lambda **_kwargs: FakeCountSupabase(count=0),
    )
    # A truthy celery_app makes the worker look available, so the size check
    # is the only thing that can reject this upload.
    monkeypatch.setattr(ingest_api, "celery_app", SimpleNamespace())

    def _refuse_queue(*_args, **_kwargs):
        raise AssertionError("should not queue")

    monkeypatch.setattr(
        ingest_api, "process_pdf_task", SimpleNamespace(delay=_refuse_queue)
    )
    monkeypatch.setattr(ingest_api.config, "MAX_UPLOAD_MB", 1, raising=False)
    monkeypatch.setattr(ingest_api.config, "GUEST_MAX_UPLOAD_MB", 1, raising=False)

    # 2 MiB payload against a 1 MB cap.
    oversized = FakeUploadFile("guide.pdf", b"%PDF-1.4\n" + (b"x" * (2 * 1024 * 1024)))

    with pytest.raises(HTTPException) as exc_info:
        asyncio.run(
            ingest_api.upload(
                request=_fake_request(),
                file=oversized,
                user_id="user-1",
                x_auth_token="token",
            )
        )

    assert exc_info.value.status_code == 413
104
+
105
+
106
def test_upload_returns_503_when_worker_is_unavailable(monkeypatch):
    """With no Celery app configured, uploads must fail fast with 503."""
    _install_fake_magic(monkeypatch)
    monkeypatch.setattr(
        pipeline,
        "_build_supabase_client",
        lambda **_kwargs: FakeCountSupabase(count=0),
    )
    monkeypatch.setattr(ingest_api, "celery_app", None)
    monkeypatch.setattr(ingest_api, "process_pdf_task", SimpleNamespace())

    small_pdf = FakeUploadFile("guide.pdf", b"%PDF-1.4\nsmall")

    with pytest.raises(HTTPException) as exc_info:
        asyncio.run(
            ingest_api.upload(
                request=_fake_request(),
                file=small_pdf,
                user_id="user-1",
                x_auth_token="token",
            )
        )

    assert exc_info.value.status_code == 503
    assert "worker is unavailable" in exc_info.value.detail.lower()
130
+
131
+
132
def test_get_ingest_status_requires_available_worker(monkeypatch):
    """Status polling must surface a 503 when the Celery worker is missing."""
    monkeypatch.setattr(ingest_api, "celery_app", None)

    with pytest.raises(HTTPException) as exc_info:
        ingest_api.get_ingest_status("task-1")

    assert exc_info.value.status_code == 503
139
+
140
+
141
def test_process_pdf_task_impl_preserves_original_exception_and_cleans_temp_file(monkeypatch):
    """The task wrapper must re-raise the original error and delete its temp file."""
    fd, tmp_path = tempfile.mkstemp(suffix="_guide.pdf")
    os.close(fd)

    def _boom(**_kwargs):
        raise ValueError("boom")

    monkeypatch.setattr(tasks, "run_ingestion", _boom)

    fake_task = SimpleNamespace(update_state=lambda **_kwargs: None)

    with pytest.raises(ValueError, match="boom"):
        tasks._process_pdf_task_impl(fake_task, tmp_path, "guide.pdf", "token")

    # The temp file must be removed even though ingestion blew up.
    assert not os.path.exists(tmp_path)
tests/test_pipeline_regressions.py ADDED
@@ -0,0 +1,1831 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ from types import SimpleNamespace
5
+
6
+ from langchain_core.documents import Document
7
+
8
+ from backend.api import admin
9
+ from backend.api import query as query_api
10
+ from backend.core import auth_utils, pipeline, providers
11
+ from backend.eval import run_eval
12
+
13
+
14
class FakeElement:
    """Stand-in for a parsed document element (text + category + page metadata)."""

    def __init__(self, text: str, category: str = "Text", page_number: int = 1):
        self.text = text
        self.category = category
        # Real elements expose the page number through a metadata object.
        self.metadata = SimpleNamespace(page_number=page_number)

    def __str__(self) -> str:
        return self.text
22
+
23
+
24
class FakeIngestionTable:
    """Chainable Supabase table stub that logs operations on its parent client.

    Every builder call returns ``self`` so chains like
    ``table.select(...).eq(...).execute()`` work; ``execute`` records the
    operation on the owning FakeIngestionSupabase.
    """

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.action = None
        self.filters = {}
        self.payload = None

    def select(self, *_args):
        self.action = "select"
        return self

    def delete(self):
        self.action = "delete"
        return self

    def upsert(self, payload, on_conflict=None):
        self.action = "upsert"
        self.payload = payload
        self.on_conflict = on_conflict
        return self

    def insert(self, payload):
        self.action = "insert"
        self.payload = payload
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def contains(self, key, value):
        # Containment filters are recorded the same way as equality filters.
        self.filters[key] = value
        return self

    def limit(self, value):
        self.filters["limit"] = value
        return self

    def execute(self):
        # Snapshot the filters so later mutation cannot rewrite history.
        self.supabase.ops.append((self.name, self.action, dict(self.filters)))
        if self.action == "insert":
            self.supabase.inserts.append((self.name, self.payload))
        if self.action == "select" and self.name == "ingested_files":
            # Canned row: the user has manually overridden the document type.
            canned = [{"document_type": "short_story", "user_overridden": True}]
            return SimpleNamespace(data=canned)
        return SimpleNamespace(data=[])
72
+
73
+
74
class FakeIngestionSupabase:
    """Fake supabase client that hands out recording ingestion tables.

    ``ops`` and ``inserts`` accumulate everything the tables execute.
    """

    def __init__(self):
        self.ops = []
        self.inserts = []

    def table(self, name: str):
        # Each call returns a fresh chainable table bound back to this client.
        return FakeIngestionTable(self, name)
81
+
82
+
83
class FakeRecoveryTable:
    """Fake table serving canned ``documents`` / ``ingestion_retry_logs`` rows.

    Selects filter the rows seeded on the owning client; writes are recorded
    on ``supabase.upserts`` / ``supabase.inserts`` at call time.
    """

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.action = None
        self.filters = {}
        self.payload = None
        self.limit_value = None

    def select(self, *_args):
        self.action = "select"
        return self

    def upsert(self, payload, on_conflict=None):
        self.action = "upsert"
        self.payload = payload
        self.on_conflict = on_conflict
        self.supabase.upserts.append((self.name, payload, on_conflict))
        return self

    def insert(self, payload):
        self.action = "insert"
        self.payload = payload
        self.supabase.inserts.append((self.name, payload))
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def contains(self, key, value):
        self.filters[key] = value
        return self

    def limit(self, value):
        self.limit_value = value
        return self

    def _capped(self, rows):
        # Mimic PostgREST's limit: only applied when limit() was called.
        return rows if self.limit_value is None else rows[: self.limit_value]

    def execute(self):
        if self.action == "select" and self.name == "documents":
            wanted_hash = (self.filters.get("metadata") or {}).get("file_hash")
            owner = self.filters.get("user_id")
            matched = [
                row
                for row in self.supabase.documents
                if (not owner or row.get("user_id") == owner)
                and (row.get("metadata") or {}).get("file_hash") == wanted_hash
            ]
            return SimpleNamespace(data=self._capped(matched))
        if self.action == "select" and self.name == "ingestion_retry_logs":
            # Each constraint is only enforced when a truthy value was recorded.
            constraints = [
                (key, self.filters.get(key))
                for key in ("user_id", "file_hash", "event_type")
            ]
            matched = [
                row
                for row in self.supabase.ingestion_logs
                if all(not value or row.get(key) == value for key, value in constraints)
            ]
            return SimpleNamespace(data=self._capped(matched))
        return SimpleNamespace(data=[])
147
+
148
+
149
class FakeRecoverySupabase:
    """Fake client pre-seeded with document and retry-log rows.

    Copies the seed lists defensively so callers' fixtures stay untouched.
    """

    def __init__(self, *, documents=None, ingestion_logs=None):
        self.documents = list(documents or [])
        self.ingestion_logs = list(ingestion_logs or [])
        self.upserts = []
        self.inserts = []

    def table(self, name: str):
        return FakeRecoveryTable(self, name)
158
+
159
+
160
class FakeRetrieveTable:
    """Fake ``ingested_files`` table resolving file hashes to canned filenames."""

    # Canonical rows served for the two known hashes.
    _KNOWN = {
        "A": {"file_hash": "A", "filename": "About Love Anton Chekhov"},
        "B": {"file_hash": "B", "filename": "BEYOND BOUNDS"},
    }

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.filters = {}

    def select(self, *_args):
        return self

    def in_(self, key, values):
        self.filters[key] = tuple(values)
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def execute(self):
        if self.name == "ingested_files":
            requested = self.filters.get("file_hash", ())
            # Preserve request order; copy rows so callers can't mutate the canon.
            data = [dict(self._KNOWN[h]) for h in requested if h in self._KNOWN]
            return SimpleNamespace(data=data)
        return SimpleNamespace(data=[])
188
+
189
+
190
class FakeRetrieveRpc:
    """Fake chunk-matching RPC: hash "A" yields two chunks, anything else one."""

    def __init__(self, supabase, params):
        self.supabase = supabase
        self.params = params

    @staticmethod
    def _chunk(chunk_id, body, file_hash, source, index, pages):
        # Shape mirrors a row returned by the real match-documents RPC.
        return {
            "id": chunk_id,
            "content": body,
            "metadata": {
                "file_hash": file_hash,
                "source": source,
                "chunk_index": index,
                "document_type": "short_story",
                "page_numbers": pages,
            },
        }

    def execute(self):
        file_hash = self.params["filter"]["file_hash"]
        if file_hash == "A":
            rows = [
                self._chunk("A-1", "A" * 400, "A", "About Love Anton Chekhov", 1, [1]),
                self._chunk("A-2", "B" * 400, "A", "About Love Anton Chekhov", 2, [2]),
            ]
        else:
            rows = [self._chunk("B-1", "C" * 200, "B", "BEYOND BOUNDS", 1, [1])]
        return SimpleNamespace(data=rows)
239
+
240
+
241
class FakeRetrieveSupabase:
    """Fake client wiring the retrieval table and RPC fakes together."""

    def table(self, name: str):
        # Tables resolve filenames for file hashes.
        return FakeRetrieveTable(self, name)

    def rpc(self, _name: str, params):
        # RPC name is irrelevant here; only the params drive the canned data.
        return FakeRetrieveRpc(self, params)
247
+
248
+
249
class FakeServiceTable:
    """In-memory fake of a service-role supabase table.

    Backs ``query_traces``, ``answer_feedback``, and ``evaluation_datasets``
    with plain lists held on the owning fake client. Reads filter those
    lists; writes mutate them. ``execute`` deliberately falls through the
    write branches (insert/update/upsert) and always returns an empty
    result set for them, matching how the production code ignores write
    responses.
    """

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.filters = {}
        self.action = None
        self.payload = None

    def insert(self, payload):
        self.action = "insert"
        self.payload = payload
        # Recorded eagerly so tests can assert on inserts even without execute().
        self.supabase.inserts.append((self.name, payload))
        return self

    def update(self, payload):
        self.action = "update"
        self.payload = payload
        return self

    def upsert(self, payload, on_conflict=None):
        self.action = "upsert"
        self.payload = payload
        self.on_conflict = on_conflict
        self.supabase.upserts.append((self.name, payload, on_conflict))
        return self

    def select(self, *_args):
        self.action = "select"
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def in_(self, key, values):
        # Stored as a tuple so execute() can distinguish in_() from eq().
        self.filters[key] = tuple(values)
        return self

    def limit(self, value):
        self.filters["limit"] = value
        return self

    def execute(self):
        if self.name == "query_traces" and self.action == "select":
            trace_ids = self.filters.get("trace_id")
            # trace_id may be absent (no filter), a tuple (from in_), or a scalar (from eq).
            data = [
                row
                for row in self.supabase.trace_rows
                if trace_ids is None
                or (
                    isinstance(trace_ids, tuple)
                    and row.get("trace_id") in trace_ids
                )
                or row.get("trace_id") == trace_ids
            ]
            if "user_id" in self.filters:
                data = [row for row in data if row.get("user_id") == self.filters["user_id"]]
            if "session_id" in self.filters:
                data = [row for row in data if row.get("session_id") == self.filters["session_id"]]
            return SimpleNamespace(data=data[: self.filters.get("limit", len(data))])
        if self.name == "answer_feedback" and self.action == "select":
            # NOTE: promote_to_eval is compared with `is`, so only exact True/False
            # sentinels match — mirrors a strict boolean column filter.
            rows = [
                row
                for row in self.supabase.feedback_rows
                if ("promote_to_eval" not in self.filters or row.get("promote_to_eval") is self.filters["promote_to_eval"])
            ]
            if "user_id" in self.filters:
                rows = [row for row in rows if row.get("user_id") == self.filters["user_id"]]
            if "trace_id" in self.filters:
                rows = [row for row in rows if row.get("trace_id") == self.filters["trace_id"]]
            if "id" in self.filters:
                rows = [row for row in rows if row.get("id") == self.filters["id"]]
            return SimpleNamespace(data=rows[: self.filters.get("limit", len(rows))])
        if self.name == "evaluation_datasets" and self.action == "select":
            rows = list(self.supabase.eval_rows)
            return SimpleNamespace(data=rows[: self.filters.get("limit", len(rows))])
        if self.name == "query_traces" and self.action == "insert":
            self.supabase.trace_rows.append(self.payload)
        if self.name == "answer_feedback" and self.action == "insert":
            self.supabase.feedback_rows.append(self.payload)
        if self.name == "query_traces" and self.action == "update":
            # Updates apply to every row matching ALL recorded filters.
            for row in self.supabase.trace_rows:
                if all(row.get(k) == v for k, v in self.filters.items()):
                    row.update(self.payload)
        if self.name == "answer_feedback" and self.action == "update":
            for row in self.supabase.feedback_rows:
                if all(row.get(k) == v for k, v in self.filters.items()):
                    row.update(self.payload)
        if self.name == "evaluation_datasets" and self.action == "upsert":
            # Upsert keyed on trace_id: merge into the existing row or append.
            trace_id = self.payload.get("trace_id")
            existing = next(
                (row for row in self.supabase.eval_rows if row.get("trace_id") == trace_id),
                None,
            )
            if existing:
                existing.update(self.payload)
            else:
                self.supabase.eval_rows.append(self.payload)
        return SimpleNamespace(data=[])
348
+
349
+
350
class FakeServiceSupabase:
    """Fake service-role client backed by in-memory trace/feedback/eval rows."""

    def __init__(self):
        self.inserts = []
        self.upserts = []
        self.trace_rows = []
        self.feedback_rows = []
        self.eval_rows = []

    def table(self, name: str):
        return FakeServiceTable(self, name)

    def rpc(self, _name: str, _params):
        # RPCs are irrelevant to these tests; answer with an empty result set.
        return SimpleNamespace(execute=lambda: SimpleNamespace(data=[]))
363
+
364
+
365
class FakeGraphServiceTable(FakeServiceTable):
    """Service table fake that additionally serves graph node/edge rows."""

    def execute(self):
        if self.action == "select" and self.name in ("graph_nodes", "graph_edges"):
            source = (
                self.supabase.graph_nodes
                if self.name == "graph_nodes"
                else self.supabase.graph_edges
            )
            rows = list(source)
            if "user_id" in self.filters:
                owner = self.filters["user_id"]
                rows = [row for row in rows if row.get("user_id") == owner]
            return SimpleNamespace(data=rows)
        # Everything else keeps the base-class trace/feedback/eval behavior.
        return super().execute()
378
+
379
+
380
class FakeGraphServiceSupabase(FakeServiceSupabase):
    """Service client fake extended with graph node and edge storage."""

    def __init__(self):
        super().__init__()
        # Seeded directly by tests; served via FakeGraphServiceTable selects.
        self.graph_nodes = []
        self.graph_edges = []

    def table(self, name: str):
        return FakeGraphServiceTable(self, name)
388
+
389
+
390
class FakeGraphVectorTable:
    """Fake vector-store table filtering rows by user and metadata containment."""

    def __init__(self, rows):
        self.rows = rows
        self.filters = {}

    def select(self, *_args):
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def contains(self, key, value):
        self.filters[key] = value
        return self

    def execute(self):
        remaining = list(self.rows)
        if "user_id" in self.filters:
            owner = self.filters["user_id"]
            remaining = [row for row in remaining if row.get("user_id") == owner]
        needle = self.filters.get("metadata")
        if needle:
            # "contains" semantics: every key/value in the needle must match.
            remaining = [
                row
                for row in remaining
                if all(
                    (row.get("metadata", {}) or {}).get(key) == value
                    for key, value in needle.items()
                )
            ]
        return SimpleNamespace(data=remaining)
417
+
418
+
419
class FakeGraphVectorSupabase:
    """Fake client whose every table serves the same seeded vector rows."""

    def __init__(self, rows):
        self.rows = rows

    def table(self, _name: str):
        # Table name is irrelevant; all tables share one row set.
        return FakeGraphVectorTable(self.rows)
425
+
426
+
427
class FakeRerankResult:
    """One rerank hit: the document's original index plus its relevance score."""

    def __init__(self, index: int, relevance_score: float):
        self.index = index
        self.relevance_score = relevance_score
431
+
432
+
433
class FakeCohereClient:
    """Fake Cohere reranker: two documents score high, any other count scores low."""

    def __init__(self, *_args, **_kwargs):
        # Accept and ignore whatever the real client constructor takes.
        pass

    def rerank(self, model, query, documents, top_n):
        del model, query, top_n
        scores = [0.9, 0.8] if len(documents) == 2 else [0.2]
        hits = [
            FakeRerankResult(index=position, relevance_score=score)
            for position, score in enumerate(scores[: len(documents)])
        ]
        return SimpleNamespace(results=hits)
449
+
450
+
451
def test_create_chunks_uses_short_document_settings(monkeypatch):
    """Documents under the size threshold get the smaller chunking knobs."""
    captured = {}

    def record_chunk_kwargs(elements, **kwargs):
        captured["kwargs"] = kwargs
        return list(elements)

    monkeypatch.setattr(pipeline, "chunk_by_title", record_chunk_kwargs)

    result = pipeline.create_chunks([FakeElement("short text")], text_chars=5_000)

    assert len(result) == 1
    assert captured["kwargs"] == {
        "max_characters": 3000,
        "new_after_n_chars": 2500,
        "combine_text_under_n_chars": 300,
    }
468
+
469
+
470
def test_create_chunks_keeps_large_document_settings(monkeypatch):
    """Large documents retain the default (bigger) chunking settings."""
    captured = {}

    def record_chunk_kwargs(elements, **kwargs):
        captured["kwargs"] = kwargs
        return list(elements)

    monkeypatch.setattr(pipeline, "chunk_by_title", record_chunk_kwargs)

    result = pipeline.create_chunks([FakeElement("large text")], text_chars=40_000)

    assert len(result) == 1
    assert captured["kwargs"] == {
        "max_characters": 8000,
        "new_after_n_chars": 7000,
        "combine_text_under_n_chars": 500,
    }
487
+
488
+
489
def test_predict_and_prefetch_uses_rewriter_provider(monkeypatch):
    """Prefetch should build its LLM with purpose="rewriter" and warm each
    predicted follow-up query through retrieve_chunks."""
    seen_purposes = []
    warmed_queries = []

    class FakeLLM:
        def invoke(self, _messages):
            # JSON array of predicted follow-up questions, as the rewriter returns.
            return SimpleNamespace(content='["follow-up question"]')

    def fake_build_chat_llm(*, purpose="text", **_kwargs):
        seen_purposes.append(purpose)
        return FakeLLM()

    monkeypatch.setattr(
        providers.ProviderFactory, "build_chat_llm", staticmethod(fake_build_chat_llm)
    )
    monkeypatch.setattr(
        pipeline,
        "retrieve_chunks",
        # Record the warmed query; `or []` keeps the fake returning a list.
        lambda **kwargs: warmed_queries.append(kwargs["query"]) or [],
    )

    pipeline._predict_and_prefetch(
        original_query="original",
        answer="answer",
        category="short_story",
        session_id="session-1",
        access_token="token",
    )

    assert seen_purposes == ["rewriter"]
    assert warmed_queries == ["follow-up question"]
520
+
521
+
522
def test_generate_answer_stream_marks_summary_nodes(monkeypatch):
    """Streaming generation should label RAPTOR summary nodes in the prompt
    and fall back to a leaf chunk's original raw text."""
    captured = {}

    class FakeLLM:
        async def astream(self, messages):
            # Capture the fully-assembled prompt text for later assertions.
            captured["prompt"] = messages[0].content[0]["text"]
            yield SimpleNamespace(content="ok")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(pipeline, "_get_episodic_memory", lambda *args, **kwargs: "")
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)

    summary_chunk = Document(
        page_content="Summary body",
        metadata={
            "source": "About Love Anton Chekhov",
            "node_type": "summary",
            "node_level": 2,
            "chunk_index": "summary-1",
            "document_type": "short_story",
            "relevance_score": 0.8,
        },
    )
    leaf_chunk = Document(
        page_content="Leaf fallback",
        metadata={
            "source": "About Love Anton Chekhov",
            "chunk_index": 1,
            "document_type": "short_story",
            "relevance_score": 0.6,
            "original_content": {"raw_text": "Leaf raw text", "tables_html": []},
        },
    )

    async def collect():
        events = []
        async for event in pipeline.generate_answer_stream(
            chunks=[summary_chunk, leaf_chunk],
            query="summarise this",
            access_token=None,
            category="short_story",
            priority_file_hashes=None,
        ):
            events.append(event)
        return events

    events = asyncio.run(collect())

    assert any(event["type"] == "done" for event in events)
    # Summary nodes are flagged with their level; leaves use their raw text.
    assert "[SYNTHESIZED CHAPTER SUMMARY - LEVEL 2]" in captured["prompt"]
    assert "TEXT:\nSummary body" in captured["prompt"]
    assert "TEXT:\nLeaf raw text" in captured["prompt"]
575
+
576
+
577
def test_run_ingestion_preserves_user_override_before_cleanup(monkeypatch):
    """Re-ingesting with force=True must read the stored user override
    (document_type) BEFORE deleting the old ingested_files row, and feed it
    back into classification as forced_category."""
    fake_supabase = FakeIngestionSupabase()
    captured = {}

    monkeypatch.setattr(auth_utils, "extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda _path: "file-hash")
    monkeypatch.setattr(
        pipeline, "is_file_already_ingested", lambda *_args, **_kwargs: True
    )
    monkeypatch.setattr(
        pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake_supabase
    )
    monkeypatch.setattr(
        pipeline,
        "_build_service_supabase_client",
        lambda *_args, **_kwargs: fake_supabase,
    )
    monkeypatch.setattr(
        pipeline, "partition_document", lambda _path: [FakeElement("x" * 100)]
    )
    monkeypatch.setattr(pipeline, "extract_images_from_pdf", lambda _path: {})

    def fake_extract_document_entities(
        elements, access_token=None, forced_category=None
    ):
        del elements, access_token
        # Capture what the pipeline passed as the user-overridden category.
        captured["forced_category"] = forced_category
        return SimpleNamespace(is_allowed=True, document_type=forced_category)

    monkeypatch.setattr(
        pipeline, "extract_document_entities", fake_extract_document_entities
    )
    monkeypatch.setattr(
        pipeline, "create_chunks", lambda elements, text_chars=None: ["chunk"]
    )
    monkeypatch.setattr(
        pipeline,
        "process_chunks",
        lambda *args, **kwargs: (
            [Document(page_content="body", metadata={"source": "Test Doc"})],
            ["doc-1"],
        ),
    )
    monkeypatch.setattr(
        pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids)
    )
    monkeypatch.setattr(pipeline, "upload_to_supabase", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        pipeline, "invalidate_user_cache", lambda *args, **kwargs: None
    )

    result = pipeline.run_ingestion(
        pdf_path="file.pdf",
        force=True,
        original_filename="file.pdf",
        access_token="token",
    )

    # The fake ingested_files select returns document_type="short_story".
    assert result["document_type"] == "short_story"
    assert captured["forced_category"] == "short_story"

    # The override must be read (select) strictly before the cleanup (delete).
    select_idx = fake_supabase.ops.index(
        ("ingested_files", "select", {"user_id": "user-1", "file_hash": "file-hash", "limit": 1})
    )
    delete_idx = fake_supabase.ops.index(
        ("ingested_files", "delete", {"user_id": "user-1", "file_hash": "file-hash"})
    )
    assert select_idx < delete_idx
645
+
646
+
647
def test_upload_to_supabase_uses_batch_rpc_and_skips_success_sleep(monkeypatch):
    """Chunk upload should go through one batched insert RPC and must not
    sleep on the success path."""
    calls = []
    sleeps = []

    class FakeRpc:
        def __init__(self, name, params):
            self.name = name
            self.params = params

        def execute(self):
            calls.append((self.name, self.params))
            return SimpleNamespace(data=[])

    class FakeBatchSupabase:
        def rpc(self, name, params):
            return FakeRpc(name, params)

    class FakeEmbedder:
        def embed_documents(self, texts):
            # Deterministic stand-in vectors; shape doesn't matter here.
            return [[float(i), float(len(text))] for i, text in enumerate(texts, 1)]

    monkeypatch.setattr(auth_utils, "safe_extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "_build_embeddings", lambda: FakeEmbedder())
    monkeypatch.setattr(
        pipeline,
        "_build_service_supabase_client",
        lambda *_args, **_kwargs: FakeBatchSupabase(),
    )
    # Record any sleep so the success path can be asserted sleep-free.
    monkeypatch.setattr(pipeline.time, "sleep", lambda seconds: sleeps.append(seconds))

    docs = [
        Document(page_content="alpha", metadata={"source": "A", "node_type": "leaf"}),
        Document(page_content="beta", metadata={"source": "B", "node_type": "summary"}),
    ]

    pipeline.upload_to_supabase(
        docs,
        ["doc-1", "doc-2"],
        access_token="token",
    )

    assert calls
    assert calls[0][0] == "insert_document_chunks_batch"
    assert len(calls[0][1]["p_rows"]) == 2
    assert sleeps == []
692
+
693
+
694
def test_run_ingestion_records_stage_timing_events(monkeypatch):
    """A full ingestion run should log a stage_timing event for every major
    pipeline stage (partition, classify, chunk_process, raptor, upload)."""
    fake_supabase = FakeIngestionSupabase()

    monkeypatch.setattr(auth_utils, "extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda _path: "file-hash")
    monkeypatch.setattr(
        pipeline, "is_file_already_ingested", lambda *_args, **_kwargs: False
    )
    monkeypatch.setattr(
        pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake_supabase
    )
    monkeypatch.setattr(
        pipeline,
        "_build_service_supabase_client",
        lambda *_args, **_kwargs: fake_supabase,
    )
    monkeypatch.setattr(
        pipeline, "partition_document", lambda _path: [FakeElement("x" * 120)]
    )
    monkeypatch.setattr(pipeline, "extract_images_from_pdf", lambda _path: {})
    monkeypatch.setattr(
        pipeline,
        "extract_document_entities",
        lambda *args, **kwargs: SimpleNamespace(
            is_allowed=True,
            document_type="short_story",
            primary_topics=[],
            brief_summary="Short story",
            key_entities=[],
        ),
    )
    monkeypatch.setattr(pipeline, "create_chunks", lambda elements, text_chars=None: ["chunk"])
    monkeypatch.setattr(
        pipeline,
        "process_chunks",
        lambda *args, **kwargs: (
            [Document(page_content="body", metadata={"source": "Test Doc"})],
            ["doc-1"],
        ),
    )
    monkeypatch.setattr(
        pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids)
    )
    monkeypatch.setattr(pipeline, "upload_to_supabase", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        pipeline, "invalidate_user_cache", lambda *args, **kwargs: None
    )

    pipeline.run_ingestion(
        pdf_path="file.pdf",
        force=False,
        original_filename="file.pdf",
        access_token="token",
    )

    # stage_timing rows land in ingestion_retry_logs with a JSON message payload.
    stage_rows = [
        payload
        for name, payload in fake_supabase.inserts
        if name == "ingestion_retry_logs" and payload.get("event_type") == "stage_timing"
    ]
    stages = {payload["message"] for payload in stage_rows}
    assert any('"stage": "partition"' in stage for stage in stages)
    assert any('"stage": "classify"' in stage for stage in stages)
    assert any('"stage": "chunk_process"' in stage for stage in stages)
    assert any('"stage": "raptor"' in stage for stage in stages)
    assert any('"stage": "upload"' in stage for stage in stages)
760
+
761
+
762
def test_recover_or_prepare_orphaned_upload_repairs_completed_upload(monkeypatch):
    """When chunks exist and an upload_complete log is present but the
    ingested_files row is missing, recovery should rebuild that row from the
    stored chunks (filename/document_type from metadata, chunk_count)."""
    fake_service = FakeRecoverySupabase(
        documents=[
            {
                "user_id": "user-1",
                "content": "Abdul Manan — Deep Foundations Guide",
                "metadata": {
                    "file_hash": "file-hash",
                    "source": "Recovered Guide",
                    "document_type": "technical_guide",
                },
            },
            {
                "user_id": "user-1",
                "content": 'The "Why Before What" Bible for ML/DL/AI Engineering',
                "metadata": {
                    "file_hash": "file-hash",
                    "source": "Recovered Guide",
                    "document_type": "technical_guide",
                },
            },
        ],
        ingestion_logs=[
            {
                "user_id": "user-1",
                "file_hash": "file-hash",
                "event_type": "upload_complete",
            }
        ],
    )
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_service
    )

    result = pipeline._recover_or_prepare_orphaned_upload(
        "file-hash",
        user_id="user-1",
        filename_hint="fallback.pdf",
    )

    assert result["recovered_existing"] is True
    # The repaired registry row is upserted with metadata from the chunks.
    upsert = next(item for item in fake_service.upserts if item[0] == "ingested_files")
    assert upsert[1]["file_hash"] == "file-hash"
    assert upsert[1]["document_type"] == "technical_guide"
    assert upsert[1]["chunk_count"] == 2
807
+
808
+
809
def test_run_ingestion_short_circuits_on_recovered_existing_upload(monkeypatch):
    """If orphan recovery reports an existing upload, run_ingestion must
    return immediately without re-partitioning the document."""
    monkeypatch.setattr(auth_utils, "extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda _path: "file-hash")
    monkeypatch.setattr(
        pipeline, "is_file_already_ingested", lambda *_args, **_kwargs: False
    )
    monkeypatch.setattr(
        pipeline,
        "_recover_or_prepare_orphaned_upload",
        lambda *_args, **_kwargs: {
            "pending_review": False,
            "document_type": "technical_guide",
            "filename": "Recovered Guide",
            "file_hash": "file-hash",
            "recovered_existing": True,
        },
    )
    monkeypatch.setattr(
        pipeline,
        "partition_document",
        # Throwing generator trick: any call to partition_document fails the test.
        lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("should not recompute")),
    )

    result = pipeline.run_ingestion(
        pdf_path="file.pdf",
        force=False,
        original_filename="file.pdf",
        access_token="token",
    )

    assert result["recovered_existing"] is True
    assert result["file_hash"] == "file-hash"
841
+
842
+
843
def test_identity_json_extracts_cover_metadata():
    """First-page elements should map to identity fields: title line,
    subtitle, named owner, and opening-page summary."""
    identity = pipeline._identity_json_from_elements(
        [
            FakeElement("Abdul Manan — Deep Foundations Guide", page_number=1),
            FakeElement('The "Why Before What" Bible for ML/DL/AI Engineering', page_number=1),
            FakeElement(
                "This guide exists because knowing definitions is not enough. Most people learn ML backwards.",
                page_number=1,
            ),
        ],
        fallback_title="Fallback Guide",
    )

    # Display title should be the actual title line; the personalized cover-owner line is stored separately.
    assert identity["display_title"] == 'The "Why Before What" Bible for ML/DL/AI Engineering'
    assert identity["subtitle"] == "This guide exists because knowing definitions is not enough. Most people learn ML backwards."
    assert identity["named_owner"] == "Abdul Manan"
    assert "knowing definitions is not enough" in identity["opening_page_summary"].lower()
    # No publisher line on the cover, so field_presence must say so.
    assert identity["field_presence"]["publisher"] is False
862
+
863
+
864
def test_identity_json_strips_null_bytes_from_opening_page_fields():
    """NUL bytes in cover text must be stripped from every identity field so
    the JSON can be stored (Postgres rejects \\u0000 in jsonb)."""
    identity = pipeline._identity_json_from_elements(
        [
            FakeElement("Abdul\x00 Manan — Deep Foundations Guide", page_number=1),
            FakeElement('The "Why Before What"\x00 Bible for ML/DL/AI Engineering', page_number=1),
            FakeElement("Publisher:\x00 Not stated", page_number=1),
        ],
        fallback_title="Fallback Guide",
    )

    serialized = json.dumps(identity)
    assert "\u0000" not in serialized
    assert "\x00" not in identity["display_title"]
    assert "\x00" not in identity["subtitle"]
    assert "\x00" not in identity["cover_text"]
879
+
880
+
881
def test_identity_json_from_docs_dedupes_repeated_opening_page_content():
    """Identical opening-page rows (e.g. duplicated chunks) must not repeat
    their lines in the assembled cover text or opening-page summary."""
    repeated_row = {
        "content": (
            "Abdul Manan — Deep Foundations Guide\n"
            'The "Why Before What" Bible for ML/DL/AI Engineering\n'
            "This guide exists because knowing definitions is not enough."
        ),
        "metadata": {
            "page_numbers": [1],
            "original_content": {
                "raw_text": (
                    "Abdul Manan — Deep Foundations Guide\n"
                    'The "Why Before What" Bible for ML/DL/AI Engineering\n'
                    "This guide exists because knowing definitions is not enough."
                )
            },
        },
    }

    # Feed the exact same row twice to simulate duplicated chunk storage.
    identity = pipeline._identity_json_from_docs(
        [repeated_row, repeated_row],
        fallback_title="Fallback Guide",
    )

    assert identity["cover_text"].count("Abdul Manan — Deep Foundations Guide") == 1
    assert (
        identity["opening_page_summary"].count(
            'The "Why Before What" Bible for ML/DL/AI Engineering'
        )
        == 1
    )
912
+
913
+
914
def test_classify_query_route_decision_marks_exact_fact_query():
    """An ownership question asking for the verbatim name routes as exact_fact."""
    question = "Whose guide is this? Answer using the exact name written in the document."

    decision = pipeline._classify_query_route_decision(question)

    assert (decision.route_class, decision.exact_field) == ("exact_fact", "owner")
    assert decision.preserve_query is True
    assert decision.disable_memory is True
923
+
924
+
925
def test_classify_query_route_decision_marks_page_scoped_query():
    """A first-page-only summary request routes as page_scoped."""
    question = "Summarize only the first page, not the whole guide."

    decision = pipeline._classify_query_route_decision(question)

    assert (decision.route_class, decision.page_scope) == ("page_scoped", "first_page")
    assert decision.preserve_query is True
    assert decision.disable_memory is True
934
+
935
+
936
class FakeAmbiguityTable:
    """Fake corpus table: records filters but always returns every seeded row."""

    def __init__(self, rows):
        self.rows = rows
        self.filters = {}
        self.action = None

    def select(self, *_args):
        self.action = "select"
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def execute(self):
        # Filters are deliberately ignored; the full corpus listing is returned.
        return SimpleNamespace(data=list(self.rows))
952
+
953
+
954
class FakeAmbiguityRpc:
    """Fake ambiguity-scoring RPC: file hash "A" always outscores the rest."""

    def __init__(self, supabase, params):
        self.supabase = supabase
        self.params = params

    def execute(self):
        # Record the call so tests can assert on scoring parameters.
        self.supabase.rpc_calls.append(self.params)
        target = (self.params.get("filter") or {}).get("file_hash")
        if target == "A":
            score = 0.22
        else:
            score = 0.11
        return SimpleNamespace(data=[{"combined_score": score}])
964
+
965
+
966
class FakeAmbiguitySupabase:
    """Fake client for ambiguity checks: fixed corpus rows plus recorded RPCs."""

    def __init__(self, rows):
        self.rows = rows
        self.rpc_calls = []

    def table(self, _name: str):
        return FakeAmbiguityTable(self.rows)

    def rpc(self, _name: str, params):
        return FakeAmbiguityRpc(self, params)
976
+
977
+
978
def test_check_query_ambiguity_forces_clarification_for_identity_queries_in_multi_doc_scope(monkeypatch):
    """Identity questions ("Whose guide is this?") across multiple documents
    must force clarification without even calling the scoring RPC."""
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "A", "filename": "Guide A.pdf"},
            {"file_hash": "B", "filename": "Guide B.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity("Whose guide is this?", access_token=None, category="All")

    assert res["is_ambiguous"] is True
    assert res["top_file_hash"] is None
    assert res["clarification_options"]
    # Identity queries skip scoring entirely.
    assert fake.rpc_calls == []
993
+
994
+
995
def test_check_query_ambiguity_rpc_includes_p_user_id_to_avoid_overload(monkeypatch):
    """Every ambiguity-scoring RPC call must pass p_user_id so Postgres can
    resolve the right function overload."""
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "A", "filename": "Doc A.pdf"},
            {"file_hash": "B", "filename": "Doc B.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity("summarize the document", access_token=None, category="All")

    assert res["is_ambiguous"] is True
    assert fake.rpc_calls, "Expected ambiguity scoring RPC calls"
    assert all("p_user_id" in call for call in fake.rpc_calls)
1009
+
1010
+
1011
def test_check_query_ambiguity_autopins_single_doc_in_category_even_for_identity_query(monkeypatch):
    """With exactly one document in scope there is nothing to clarify: even an
    identity query should auto-pin to that single file hash."""
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "ONLY", "filename": "Only Doc.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity(
        "Whose guide is this?",
        access_token=None,
        category="technical_guide",
    )

    assert res["is_ambiguous"] is False
    assert res["top_file_hash"] == "ONLY"
1027
+
1028
+
1029
def test_check_query_ambiguity_lists_only_three_options_when_many_docs(monkeypatch):
    """Clarification options are capped at three even when more docs are in scope."""
    stub_client = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "A", "filename": "A.pdf"},
            {"file_hash": "B", "filename": "B.pdf"},
            {"file_hash": "C", "filename": "C.pdf"},
            {"file_hash": "D", "filename": "D.pdf"},
            {"file_hash": "E", "filename": "E.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: stub_client)

    outcome = pipeline.check_query_ambiguity("What is the exact full title?", access_token=None, category="All")

    assert outcome["is_ambiguous"] is True
    options = outcome.get("clarification_options") or []
    assert len(options) == 3
1045
+
1046
+
1047
def test_query_followup_guard_detects_ordinal_without_enumeration():
    """An ordinal follow-up with no prior enumerated answer should be flagged."""
    assert query_api._contains_ordinal_followup("What about the second one?") is True
    prior_turns = [{"role": "assistant", "content": "No list here."}]
    assert query_api._history_has_explicit_enumeration(prior_turns) is False
1052
+
1053
+
1054
def test_query_followup_guard_allows_ordinal_when_prior_answer_lists_items():
    """Ordinal references are allowed once the assistant has enumerated items."""
    prior_turns = [
        {"role": "assistant", "content": "1. Alice\n2. Bob\n"},
    ]
    assert query_api._contains_ordinal_followup("What about the second one?") is True
    assert query_api._history_has_explicit_enumeration(prior_turns) is True
1060
+
1061
+
1062
def test_generate_sub_queries_skips_rewrite_for_exact_fact(monkeypatch):
    """exact_fact routes must bypass the LLM rewriter entirely."""

    def _fail_if_built(**_kwargs):
        # The rewriter must never be constructed on this route.
        raise AssertionError("rewriter should not be called")

    monkeypatch.setattr(
        providers.ProviderFactory,
        "build_chat_llm",
        staticmethod(_fail_if_built),
    )

    sub_queries = pipeline.generate_sub_queries(
        "What is the exact full title of this guide?",
        route_class="exact_fact",
    )

    assert sub_queries == ["What is the exact full title of this guide?"]
1075
+
1076
+
1077
def test_identity_documents_for_query_answers_not_stated_publisher():
    """A missing identity field should yield a 'not stated' identity document."""
    identity_row = {
        "filename": "Guide.pdf",
        "identity_json": {
            "display_title": "Abdul Manan — Deep Foundations Guide",
            "field_presence": {"publisher": False},
            "source_pages": [1],
        },
    }
    decision = pipeline.RouteDecision(
        route_class="exact_fact",
        route_reason="identity_field:publisher",
        exact_field="publisher",
    )

    identity_docs = pipeline._identity_documents_for_query(
        identity_row,
        query="Does this guide explicitly name a publisher on the opening pages? If not, say not stated.",
        route_decision=decision,
    )

    assert len(identity_docs) == 1
    assert "not stated on the opening pages" in identity_docs[0].page_content.lower()
    assert identity_docs[0].metadata["retrieval_branch"] == "identity_store"
1101
+
1102
+
1103
def test_build_history_block_returns_structured_state_without_role_labels():
    """History blocks must expose structured state, never raw role-labelled turns."""
    history_block = pipeline._build_history_block(
        [
            {"role": "user", "content": "Whose guide is this?"},
            {"role": "assistant", "content": "ASSISTANT: Abdul Manan — Deep Foundations Guide."},
        ],
        route_class="factoid",
        eval_mode=False,
    )

    assert "CONVERSATION STATE:" in history_block
    assert "previous_user_intent:" in history_block
    assert "previous_answer_summary:" in history_block
    # Raw role labels must not leak through from the source turns.
    assert "ASSISTANT:" not in history_block
    assert "USER:" not in history_block
1118
+
1119
+
1120
def test_save_to_memory_writes_structured_payloads(monkeypatch):
    """Memory writes should persist structured user/assistant JSON payloads."""
    service_stub = FakeServiceSupabase()
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: service_stub
    )
    monkeypatch.setattr(pipeline, "_stable_user_id", lambda *_args, **_kwargs: "user-1")
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _text: [0.1, 0.2])

    retrieved_chunks = [
        Document(
            page_content="body",
            metadata={"file_hash": "file-1", "document_type": "machine_learning_guide"},
        )
    ]

    pipeline._save_to_memory(
        "session-1",
        "Whose guide is this?",
        "Abdul Manan — Deep Foundations Guide\n\n---\n**Sources:**\n[Source 1]",
        access_token=None,
        route_class="factoid",
        chunks=retrieved_chunks,
    )

    assert len(service_stub.inserts) == 2
    user_payload = json.loads(service_stub.inserts[0][1]["content"])
    answer_payload = json.loads(service_stub.inserts[1][1]["content"])
    assert user_payload["kind"] == "user_query"
    assert answer_payload["kind"] == "assistant_fact"
    assert answer_payload["file_hashes"] == ["file-1"]
    # The sources footer must be stripped before the summary is stored.
    assert "Sources" not in answer_payload["summary"]
1151
+
1152
+
1153
def test_generate_answer_stream_eval_mode_skips_history_and_memory_injection(monkeypatch):
    """Eval mode must keep conversation state and session memory out of the prompt."""
    captured = {}

    class FakeLLM:
        async def astream(self, messages):
            captured["prompt"] = messages[0].content[0]["text"]
            yield SimpleNamespace(content="clean answer")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(
        pipeline,
        "_get_episodic_memory",
        lambda *args, **kwargs: "" if kwargs.get("eval_mode") else "SESSION FACTS:\n- prior answer: x\n",
    )
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_persist_query_trace", lambda **_kwargs: "trace-1")

    retrieved = Document(
        page_content="Body text",
        metadata={
            "source": "Guide",
            "chunk_index": 1,
            "document_type": "machine_learning_guide",
            "relevance_score": 0.9,
            "route_class": "factoid",
            "original_content": {"raw_text": "Body text", "tables_html": []},
        },
    )

    async def collect():
        seen = []
        async for event in pipeline.generate_answer_stream(
            chunks=[retrieved],
            query="Tell me more",
            chat_history=[
                {"role": "user", "content": "Who is this guide for?"},
                {"role": "assistant", "content": "It is personalized."},
            ],
            session_id="session-1",
            eval_mode=True,
        ):
            seen.append(event)
        return seen

    events = asyncio.run(collect())

    assert any(event["type"] == "done" for event in events)
    assert "CONVERSATION STATE:" not in captured["prompt"]
    assert "SESSION FACTS:" not in captured["prompt"]
1203
+
1204
+
1205
def test_persist_query_trace_marks_output_echo_and_contamination(monkeypatch):
    """Echoed role labels plus injected context must be flagged as failure modes."""
    service_stub = FakeServiceSupabase()
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: service_stub
    )
    monkeypatch.setattr(pipeline, "_persist_trace_graph_enrichment", lambda *args, **kwargs: None)

    trace_chunks = [
        Document(
            page_content="body",
            metadata={
                "trace_id": "trace-echo",
                "route_class": "factoid",
                "route_mode": "default",
                "source": "Guide",
                "document_type": "machine_learning_guide",
                "trace_quality": {
                    "retrieval_relevance_proxy": 0.8,
                    "history_injected": True,
                    "memory_injected": True,
                },
            },
        )
    ]

    pipeline._persist_query_trace(
        query="Why does this guide say it exists?",
        session_id="session-1",
        chunks=trace_chunks,
        answer="ASSISTANT: This guide exists because knowing definitions is not enough.",
        access_token=None,
    )

    trace_upsert = next(item for item in service_stub.upserts if item[0] == "query_traces")
    failure_modes = set(trace_upsert[1]["failure_modes"])
    assert {"output_echo", "history_contamination", "memory_contamination"} <= failure_modes
1241
+
1242
+
1243
def test_retrieve_chunks_exact_fact_prefers_identity_store(monkeypatch):
    """exact_fact identity questions should be answered from the identity store."""
    monkeypatch.setattr(pipeline, "_stable_user_id", lambda *_args, **_kwargs: "user-1")
    monkeypatch.setattr(
        pipeline,
        "_route_query_experts",
        lambda *args, **kwargs: {
            "selected_experts": ["dense_chunk"],
            "expert_weights": {"dense_chunk": 1.0},
            "confidence": 0.9,
        },
    )
    monkeypatch.setattr(
        pipeline,
        "_load_or_backfill_identity_row",
        lambda *args, **kwargs: {
            "filename": "Guide.pdf",
            "identity_json": {
                "display_title": "Abdul Manan — Deep Foundations Guide",
                "subtitle": 'The "Why Before What" Bible for ML/DL/AI Engineering',
                "named_owner": "Abdul Manan",
                "field_presence": {"owner": True},
                "source_pages": [1],
            },
        },
    )

    results = pipeline.retrieve_chunks(
        query="Whose guide is this? Answer using the exact name written in the document.",
        original_query="Whose guide is this? Answer using the exact name written in the document.",
        user_id="user-1",
        priority_file_hashes=["file-1"],
    )

    assert len(results) == 1
    assert results[0].metadata["retrieval_branch"] == "identity_store"
    assert "Abdul Manan" in results[0].page_content
1275
+
1276
+
1277
def test_multi_doc_context_budget_preserves_one_chunk_per_pinned_doc(monkeypatch):
    """A tight context budget must still keep at least one chunk per pinned doc."""
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: FakeRetrieveSupabase()
    )
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _query: [0.1, 0.2])
    monkeypatch.setattr(pipeline.cohere, "Client", FakeCohereClient)
    monkeypatch.setattr(pipeline, "_log_rerank_feedback", lambda *args, **kwargs: None)
    # Deliberately tiny budget to force the preservation logic.
    monkeypatch.setattr(pipeline.config, "MAX_CONTEXT_CHARS", 700, raising=False)

    results = pipeline.retrieve_chunks(
        query="compare the themes of both stories",
        category="short_story",
        access_token="token",
        priority_file_hashes=["A", "B"],
    )

    assert len(results) == 2
    assert {doc.metadata["file_hash"] for doc in results} == {"A", "B"}
1295
+
1296
+
1297
def test_build_pinned_query_plan_scopes_title_queries_to_own_doc():
    """Title-bearing sub-queries must target only the document they name."""
    pinned_docs = [
        {"file_hash": "A", "filename": "About Love Anton Chekhov"},
        {"file_hash": "B", "filename": "BEYOND BOUNDS"},
    ]
    plan = pipeline._build_pinned_query_plan(
        "summarise the story short story",
        pinned_docs,
        "generic_pinned",
    )

    about_entries = [entry for entry in plan if "About Love Anton Chekhov" in entry["query_text"]]
    beyond_entries = [entry for entry in plan if "BEYOND BOUNDS" in entry["query_text"]]

    assert about_entries
    assert beyond_entries
    assert all(entry["target_file_hashes"] == ["A"] for entry in about_entries)
    assert all(entry["target_file_hashes"] == ["B"] for entry in beyond_entries)
1314
+
1315
+
1316
def test_partition_document_retries_with_hi_res_when_fast_is_suspiciously_thin(monkeypatch):
    """A suspiciously thin 'fast' parse must trigger a 'hi_res' retry."""
    strategies_used = []

    def fake_partition_pdf(*, filename, strategy, **_kwargs):
        del filename
        strategies_used.append(strategy)
        if strategy == "fast":
            # Thin output: should be rejected and retried.
            return [FakeElement("x" * 50, page_number=1)]
        return [
            FakeElement("x" * 500, page_number=1),
            FakeElement("y" * 500, page_number=1),
            FakeElement("z" * 500, page_number=1),
        ]

    monkeypatch.setattr(pipeline, "_has_text_layer", lambda _path: True)
    monkeypatch.setattr(pipeline, "partition_pdf", fake_partition_pdf)

    parsed_elements = pipeline.partition_document("file.pdf")

    assert strategies_used == ["fast", "hi_res"]
    assert len(parsed_elements) == 3
1337
+
1338
+
1339
def test_create_chunks_splits_single_thin_narrative(monkeypatch):
    """One oversized narrative chunk should be split into multiple pieces."""
    long_text = (
        '"Every single night..." Lee said softly. '
        "The same demons kept returning, and the weight of them was unbearable. "
        "She kept remembering the dream, the corridor, the whispering, and the crushing fear. "
        "Classes were slipping away from her, and every conversation with the doctor felt more urgent. "
        "Still, she tried to describe what she saw, heard, and felt in careful detail."
    ) * 3

    def fake_chunk_by_title(elements, **_kwargs):
        del elements
        single_chunk = SimpleNamespace(
            text=long_text,
            metadata=SimpleNamespace(orig_elements=[FakeElement(long_text)]),
        )
        return [single_chunk]

    monkeypatch.setattr(pipeline, "chunk_by_title", fake_chunk_by_title)

    chunks = pipeline.create_chunks([FakeElement(long_text)], text_chars=len(long_text))

    assert len(chunks) >= 2
    assert all(getattr(chunk, "text", "") for chunk in chunks)
1363
+
1364
+
1365
def test_build_raptor_tree_synthesizes_root_for_single_leaf(monkeypatch):
    """A single-leaf tree still gets a synthesized summary root node."""

    class FakeLLM:
        def invoke(self, _messages):
            return SimpleNamespace(content="Root summary")

    monkeypatch.setattr(pipeline, "_build_llm", lambda **_kwargs: FakeLLM())
    sole_leaf = Document(
        page_content="Leaf content",
        metadata={
            "source": "BEYOND BOUNDS",
            "file_hash": "B",
            "document_type": "short_story",
            "summary": "Leaf summary",
            "chunk_index": 1,
            "page_numbers": [1],
        },
    )

    tree_docs, tree_ids = pipeline.build_raptor_tree([sole_leaf], ["leaf-1"], "user-1")

    assert len(tree_docs) == 2
    assert len(tree_ids) == 2
    assert any(doc.metadata.get("node_type") == "summary" for doc in tree_docs)
1388
+
1389
+
1390
def test_generic_multi_doc_mode_keeps_weak_doc_with_candidates(monkeypatch):
    """Generic pinned mode keeps a low-scoring doc as long as it has candidates."""
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: FakeRetrieveSupabase()
    )
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _query: [0.1, 0.2])
    monkeypatch.setattr(pipeline.cohere, "Client", FakeCohereClient)
    monkeypatch.setattr(pipeline, "_log_rerank_feedback", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline.config, "MAX_CONTEXT_CHARS", 2_000, raising=False)

    results = pipeline.retrieve_chunks(
        query="summarise the story short story",
        category="short_story",
        access_token="token",
        priority_file_hashes=["A", "B"],
        original_query="summarise the story",
    )

    assert {doc.metadata["file_hash"] for doc in results} == {"A", "B"}
    assert results[0].metadata["route_mode"] == "generic_pinned"
    assert len(results[0].metadata["doc_diagnostics"]) == 2
1410
+
1411
+
1412
def test_weighted_doc_prior_fusion_does_not_saturate_scores():
    """Doc-prior fusion should lift local scores without pinning them to 1.0."""
    fused_score = pipeline._combine_local_and_doc_score(0.95, 1.0, 0.2)
    assert fused_score < 1.0
    assert fused_score == 0.96
1416
+
1417
+
1418
def test_generate_answer_stream_done_event_includes_trace_metadata(monkeypatch):
    """The terminal 'done' event carries the trace id and per-doc diagnostics."""

    class FakeLLM:
        async def astream(self, _messages):
            yield SimpleNamespace(content="ok")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(pipeline, "_get_episodic_memory", lambda *args, **kwargs: "")
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_persist_query_trace", lambda **_kwargs: "trace-123")

    diagnostics = [
        {
            "file_hash": "A",
            "source": "About Love Anton Chekhov",
            "included": True,
            "candidate_count": 2,
            "doc_score": 0.6,
            "confidence_label": "high",
            "reason": "supported",
        }
    ]
    retrieved = Document(
        page_content="Leaf fallback",
        metadata={
            "source": "About Love Anton Chekhov",
            "chunk_index": 1,
            "document_type": "short_story",
            "relevance_score": 0.6,
            "original_content": {"raw_text": "Leaf raw text", "tables_html": []},
            "trace_id": "trace-123",
            "route_mode": "explicit_compare",
            "doc_diagnostics": diagnostics,
        },
    )

    async def collect():
        seen = []
        async for event in pipeline.generate_answer_stream(
            chunks=[retrieved],
            query="compare the themes",
            access_token=None,
            category="short_story",
            priority_file_hashes=["A", "B"],
        ):
            seen.append(event)
        return seen

    events = asyncio.run(collect())
    done_event = next(event for event in events if event["type"] == "done")

    assert done_event["trace_id"] == "trace-123"
    assert done_event["doc_diagnostics"][0]["source"] == "About Love Anton Chekhov"
1460
+
1461
+
1462
def test_generate_answer_stream_sanitizes_template_tokens_and_records_metrics(monkeypatch):
    """Template control tokens are stripped from the stream and counted in metrics."""
    captured = {}

    def fake_persist_query_trace(**kwargs):
        captured["kwargs"] = kwargs
        return "trace-xyz"

    class FakeLLM:
        async def astream(self, _messages):
            # Deliberately leak chat-template control tokens into the stream.
            yield SimpleNamespace(content="assistant<|header_end|>Hello")
            yield SimpleNamespace(content=" there<|eot_id|>")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(pipeline, "_get_episodic_memory", lambda *args, **kwargs: "")
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_persist_query_trace", fake_persist_query_trace)

    retrieved = Document(
        page_content="Leaf fallback",
        metadata={
            "source": "About Love Anton Chekhov",
            "chunk_index": 1,
            "document_type": "short_story",
            "relevance_score": 0.6,
            "original_content": {"raw_text": "Leaf raw text", "tables_html": []},
            "trace_id": "trace-xyz",
            "route_mode": "default",
        },
    )

    async def collect():
        seen = []
        async for event in pipeline.generate_answer_stream(
            chunks=[retrieved],
            query="hello",
            access_token=None,
            category="short_story",
        ):
            seen.append(event)
        return seen

    events = asyncio.run(collect())
    streamed_text = "".join(event["content"] for event in events if event["type"] == "token")

    assert "<|" not in streamed_text
    assert "Hello there" in streamed_text
    sanitizer_metrics = captured["kwargs"]["sanitizer_metrics"]
    assert sanitizer_metrics["sanitizer_triggered"] is True
    assert sanitizer_metrics["sanitized_token_count"] > 0
1511
+
1512
+
1513
def test_duplicate_chunk_collapse_removes_overlap():
    """Near-duplicate candidates within one doc collapse to a single survivor."""
    candidates = [
        {
            "id": "a",
            "content": "Alpha beta gamma delta epsilon zeta",
            "metadata": {"file_hash": "doc-a", "source": "Doc A"},
        },
        {
            "id": "b",
            "content": "Alpha beta gamma delta epsilon zeta eta",
            "metadata": {"file_hash": "doc-a", "source": "Doc A"},
        },
        {
            "id": "c",
            "content": "Completely different content",
            "metadata": {"file_hash": "doc-a", "source": "Doc A"},
        },
    ]

    survivors, collapsed_count = pipeline._collapse_near_duplicate_candidates(candidates)

    assert collapsed_count == 1
    assert [row["id"] for row in survivors] == ["a", "c"]
1536
+
1537
+
1538
def test_analyse_intent_rewrites_follow_up_query(monkeypatch):
    """A bare ordinal follow-up is rewritten against the cached previous query.

    Fix: the original version popped the module-level cache entries *after* the
    assertions, so a failing assertion leaked session state into other tests.
    Cleanup now runs in a ``finally`` block.
    """
    monkeypatch.setattr(
        pipeline.intent_classifier,
        "predict",
        lambda *_args, **_kwargs: {
            "needs_clarification": False,
            "confidence": 0.95,
        },
    )
    monkeypatch.setattr(
        pipeline.intent_classifier,
        "record_feedback",
        lambda *args, **kwargs: None,
    )

    session_key = pipeline._session_cache_key("sess-1", user_id="user-1")
    pipeline._last_query_context[session_key] = {
        "query": "Compare About Love and BEYOND BOUNDS",
        "updated_at": pipeline.time.time(),
    }
    pipeline._last_chunks[session_key] = [Document(page_content="cached", metadata={})]

    try:
        result = pipeline.analyse_intent(
            query="What about the second one?",
            category="All",
            chat_history=[
                {"role": "user", "content": "Compare About Love and BEYOND BOUNDS"},
                {"role": "assistant", "content": "Here is the comparison."},
            ],
            session_id="sess-1",
            user_id="user-1",
        )

        assert result["route_class"] == "follow_up"
        assert "follow-up about: Compare About Love and BEYOND BOUNDS" in result["enriched_query"]
    finally:
        # Always evict the cache entries this test created, even on failure.
        pipeline._last_query_context.pop(session_key, None)
        pipeline._last_chunks.pop(session_key, None)
1576
+
1577
+
1578
def test_record_answer_feedback_persists_feedback_and_promotes(monkeypatch):
    """Negative feedback is persisted, promoted to eval, and mirrored into the graph."""
    service_stub = FakeServiceSupabase()
    service_stub.trace_rows.append(
        {"trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111", "session_id": "sess-1", "question": "What is common?"}
    )
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: service_stub
    )

    accepted = pipeline.record_answer_feedback(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "helpful": False,
            "reason_code": "needs_improvement",
            "correction_text": "The two stories should not be merged.",
        },
        access_token=None,
    )

    assert accepted is True
    feedback_insert = next(item for item in service_stub.inserts if item[0] == "answer_feedback")
    assert feedback_insert[1]["promote_to_eval"] is True
    assert any(item[0] == "graph_nodes" for item in service_stub.upserts)
    assert any(item[0] == "graph_edges" for item in service_stub.upserts)
1602
+
1603
+
1604
def test_load_feedback_dataset_candidates_promotes_feedback_traces(monkeypatch):
    """Promoted feedback rows are joined with traces into eval dataset candidates."""
    service_stub = FakeServiceSupabase()
    service_stub.feedback_rows.append(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "helpful": False,
            "accepted": False,
            "reason_code": "unsupported_commonality",
            "correction_text": "Insufficient evidence for commonality.",
            "promote_to_eval": True,
            "user_id": "user-1",
        }
    )
    service_stub.trace_rows.append(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "question": "What is common between these two documents?",
            "doc_diagnostics": [{"source": "BEYOND BOUNDS", "reason": "low_scoped_confidence"}],
            "failure_modes": ["unsupported_commonality"],
            "answer_preview": "The documents both explore emotion.",
        }
    )
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: service_stub
    )

    candidates = run_eval.load_feedback_dataset_candidates(None, "user-1", limit=10)

    assert len(candidates) == 1
    assert candidates[0]["trace_id"] == "8f8c1f3f-bcb6-43a8-b10d-85f31a917111"
    assert candidates[0]["gold_evidence_text"] == "Insufficient evidence for commonality."
1635
+
1636
+
1637
def test_router_weights_trigger_summary_branch_filters(monkeypatch):
    """Summary-weighted router output must issue RPCs filtered on summary nodes."""

    class TrackingRpc:
        """RPC stub that records the filter and answers per node_type."""

        def __init__(self, supabase):
            self.supabase = supabase

        def execute(self):
            self.supabase.rpc_filters.append(self.supabase.params["filter"])
            node_type = self.supabase.params["filter"].get("node_type")
            if node_type == "summary":
                return SimpleNamespace(
                    data=[
                        {
                            "id": "sum-1",
                            "content": "Synthetic summary content",
                            "metadata": {
                                "file_hash": "A",
                                "source": "About Love Anton Chekhov",
                                "chunk_index": "1-4",
                                "document_type": "short_story",
                                "node_type": "summary",
                                "node_level": 1,
                            },
                        }
                    ]
                )
            return SimpleNamespace(
                data=[
                    {
                        "id": "leaf-1",
                        "content": "Leaf content",
                        "metadata": {
                            "file_hash": "A",
                            "source": "About Love Anton Chekhov",
                            "chunk_index": 1,
                            "document_type": "short_story",
                            "node_type": "leaf",
                        },
                    }
                ]
            )

    class TrackingSupabase:
        """Supabase double that remembers every RPC filter it was asked for."""

        def __init__(self):
            self.rpc_filters = []
            self.params = {}

        def table(self, _name: str):
            return FakeRetrieveTable(self, "ingested_files")

        def rpc(self, _name: str, params):
            self.params = params
            return TrackingRpc(self)

    tracking_client = TrackingSupabase()
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: tracking_client
    )
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _query: [0.1, 0.2])
    monkeypatch.setattr(
        pipeline,
        "_route_query_experts",
        lambda *args, **kwargs: {
            "expert_weights": {
                "dense_chunk": 0.3,
                "raptor_summary": 0.4,
                "graph_traversal": 0.1,
                "episodic_memory": 0.1,
                "hybrid_compare": 0.1,
            },
            "selected_experts": ["dense_chunk", "raptor_summary"],
            "confidence": 0.4,
        },
    )
    monkeypatch.setattr(pipeline.cohere, "Client", FakeCohereClient)
    monkeypatch.setattr(pipeline, "_log_rerank_feedback", lambda *args, **kwargs: None)

    results = pipeline.retrieve_chunks(
        query="tell me more",
        access_token="token",
        original_query="tell me more",
    )

    assert results
    assert any(f.get("node_type") == "summary" for f in tracking_client.rpc_filters)
1717
+
1718
+
1719
def test_thin_doc_overview_prefers_synthetic_root_summary():
    """Overview queries over a thin doc should surface the synthetic root first."""
    leaf_doc = Document(
        page_content="Leaf content",
        metadata={
            "file_hash": "B",
            "source": "BEYOND BOUNDS",
            "node_type": "leaf",
            "relevance_score": 0.9,
        },
    )
    root_doc = Document(
        page_content="Synthetic root summary",
        metadata={
            "file_hash": "B",
            "source": "BEYOND BOUNDS",
            "node_type": "summary",
            "synthetic_root_summary": True,
            "relevance_score": 0.4,
        },
    )

    ordered, buckets, policy = pipeline._materialize_evidence_buckets(
        [leaf_doc, root_doc],
        query="summarise the story",
        route_mode="single",
        doc_title_map={"B": "BEYOND BOUNDS"},
    )

    # The synthetic root outranks the higher-scoring leaf for overview intent.
    assert ordered[0].metadata["synthetic_root_summary"] is True
    assert buckets[0]["thin_doc"] is True
    assert policy["summary_like"] is True
1750
+
1751
+
1752
def test_graph_candidates_return_two_hop_related_chunks(monkeypatch):
    """Graph traversal should surface chunks reachable within two hops of an entity."""
    graph_stub = FakeGraphServiceSupabase()
    graph_stub.graph_nodes = [
        {"user_id": "user-1", "node_key": "entity:alehin", "node_type": "entity", "label": "Alehin", "payload": {"file_hash": "A"}},
        {"user_id": "user-1", "node_key": "summary:root-a", "node_type": "summary", "label": "About Love Anton Chekhov :: 1-4", "payload": {"file_hash": "A", "chunk_index": "1-4"}},
        {"user_id": "user-1", "node_key": "document:a", "node_type": "document", "label": "About Love Anton Chekhov", "payload": {"file_hash": "A"}},
    ]
    graph_stub.graph_edges = [
        {"user_id": "user-1", "source_node_key": "entity:alehin", "target_node_key": "summary:root-a", "edge_type": "mentions", "weight": 1.0, "payload": {}},
        {"user_id": "user-1", "source_node_key": "summary:root-a", "target_node_key": "document:a", "edge_type": "part_of", "weight": 1.0, "payload": {}},
    ]
    vector_rows = [
        {
            "id": "sum-1",
            "user_id": "user-1",
            "content": "Alehin appears in About Love.",
            "metadata": {
                "file_hash": "A",
                "source": "About Love Anton Chekhov",
                "node_type": "summary",
                "chunk_index": "1-4",
            },
        }
    ]

    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: graph_stub
    )
    monkeypatch.setattr(
        pipeline, "_build_supabase_client", lambda *_args, **_kwargs: FakeGraphVectorSupabase(vector_rows)
    )

    candidate_rows = pipeline._retrieve_graph_candidates(
        "which one talks about Alehin",
        route_mode="explicit_compare",
        access_token="token",
        user_id="user-1",
        priority_file_hashes=["A"],
    )

    assert len(candidate_rows) == 1
    assert candidate_rows[0]["metadata"]["retrieval_branch"] == "graph_traversal"
    assert candidate_rows[0]["metadata"]["graph_hit_depth"] >= 0
1795
+
1796
+
1797
def test_admin_promote_feedback_creates_eval_dataset(monkeypatch):
    """Admin promotion turns pending feedback into an eval dataset row."""
    service_stub = FakeServiceSupabase()
    service_stub.trace_rows.append(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "question": "What is common between these two documents?",
            "doc_diagnostics": [{"source": "BEYOND BOUNDS", "reason": "insufficient_coverage"}],
            "failure_modes": ["unsupported_commonality"],
            "answer_preview": "The documents both explore emotion.",
            "review_state": "pending",
        }
    )
    service_stub.feedback_rows.append(
        {
            "id": 7,
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "helpful": False,
            "accepted": False,
            "reason_code": "unsupported_commonality",
            "correction_text": "Insufficient evidence for commonality.",
            "promote_to_eval": True,
            "review_state": "pending",
        }
    )

    monkeypatch.setattr(admin, "_admin_client", lambda: service_stub)
    monkeypatch.setenv("MASTER_ADMIN_KEY", "secret")

    result = admin.promote_feedback_to_eval(7, x_admin_key="secret")

    assert result["ok"] is True
    assert len(service_stub.eval_rows) == 1
    assert service_stub.eval_rows[0]["trace_id"] == "8f8c1f3f-bcb6-43a8-b10d-85f31a917111"
    # Both the trace and the feedback row flip to the promoted review state.
    assert service_stub.trace_rows[0]["review_state"] == "promoted"
    assert service_stub.feedback_rows[0]["review_state"] == "promoted"
tests/test_routing_stress_matrix.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from types import SimpleNamespace
2
+
3
+ from backend.core import pipeline
4
+
5
+
6
class FakeFilesTable:
    """Minimal stand-in for a Supabase table query builder.

    Mimics the fluent ``select(...).eq(...).execute()`` chain: filters are
    recorded for later inspection and ``execute()`` echoes back a copy of
    the preloaded rows.
    """

    def __init__(self, rows):
        # Result set that execute() will serve, copied on each call.
        self.rows = rows
        # Equality filters captured from eq() calls (key -> value).
        self.filters = {}

    def select(self, *_args):
        """Ignore the requested columns; keep the chain fluent."""
        return self

    def eq(self, key, value):
        """Record an equality filter; keep the chain fluent."""
        self.filters[key] = value
        return self

    def execute(self):
        """Return a response-shaped object carrying a fresh copy of the rows."""
        return SimpleNamespace(data=list(self.rows))
20
+
21
+
22
class FakeRpc:
    """Stand-in for a pending Supabase RPC call.

    On execute(), logs its params on the owning fake client's ``rpc_calls``
    list and returns one canned scoring row.
    """

    def __init__(self, supabase, params):
        self.supabase = supabase  # owning fake client; must expose rpc_calls
        self.params = params      # payload originally passed to rpc()

    def execute(self):
        """Record this call on the owner, then return a canned scoring row."""
        self.supabase.rpc_calls.append(self.params)
        # Always return a row so ambiguity code can compute file_scores
        return SimpleNamespace(data=[{"combined_score": 0.2}])
31
+
32
+
33
class FakeSupabase:
    """Fake Supabase client wiring the table and RPC stand-ins together."""

    def __init__(self, rows):
        self.rows = rows       # rows served by every table() builder
        self.rpc_calls = []    # params captured from every rpc(...).execute()

    def table(self, _name: str):
        """Serve the same preloaded rows regardless of the table name."""
        return FakeFilesTable(self.rows)

    def rpc(self, _name: str, params):
        """Return a pending RPC that will record its params on this client."""
        return FakeRpc(self, params)
43
+
44
+
45
def test_stress_matrix_identity_queries_never_guess_in_multi_doc_all_scope(monkeypatch):
    """
    Invariant: if multiple docs exist and the user hasn't pinned a doc (category=All),
    identity/page-scoped queries must force clarification instead of falling through.
    """
    client = FakeSupabase(
        rows=[
            {"file_hash": "A", "filename": "Guide A.pdf"},
            {"file_hash": "B", "filename": "Guide B.pdf"},
            {"file_hash": "C", "filename": "Guide C.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: client)

    identity_like_queries = (
        "Whose guide is this?",
        "What is the exact full title of this guide?",
        "What exact wording on the cover shows this guide is personalized?",
        "Summarize only the first page, not the whole guide.",
        "Does this guide explicitly name a publisher on the opening pages? If not, say not stated.",
        "Publisher on the opening pages?",
        "Cover wording?",
        "Page 1 summary only.",
    )
    for query in identity_like_queries:
        outcome = pipeline.check_query_ambiguity(query, access_token=None, category="All")
        assert outcome["is_ambiguous"] is True, query
        assert outcome["top_file_hash"] is None, query

    # For identity/page-scoped safety, we should not do per-file scoring RPC calls.
    assert client.rpc_calls == []
76
+
77
+
78
def test_stress_matrix_generic_queries_may_use_scoring_and_include_p_user_id(monkeypatch):
    """Generic multi-doc queries may resolve either way, but must go through
    per-file scoring RPCs, and every scoring call must carry ``p_user_id``.
    """
    client = FakeSupabase(
        rows=[
            {"file_hash": "A", "filename": "Doc A.pdf"},
            {"file_hash": "B", "filename": "Doc B.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: client)

    generic_queries = (
        "summarize the document",
        "give me an overview",
        "explain what this is about",
    )
    for query in generic_queries:
        outcome = pipeline.check_query_ambiguity(query, access_token=None, category="All")
        # Either resolution is acceptable for generic phrasing; just require a boolean answer.
        assert outcome["is_ambiguous"] in {True, False}

    assert client.rpc_calls, "Expected scoring calls for generic multi-doc queries"
    assert all("p_user_id" in call for call in client.rpc_calls)
+