Spaces:
Running
Running
nothex commited on
Commit ·
4abd98f
1
Parent(s): ca5846e
Harden ingestion and retrieval reliability across the pipeline
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- ARCHITECTURE.md +9 -1
- backend/api/admin.py +286 -3
- backend/api/auth.py +75 -4
- backend/api/frontend_config.py +8 -2
- backend/api/ingest.py +54 -12
- backend/api/query.py +116 -4
- backend/core/auth_utils.py +40 -1
- backend/core/classifier.py +10 -15
- backend/core/config.py +52 -2
- backend/core/pipeline.py +0 -0
- backend/core/pipeline_ambiguity.py +221 -0
- backend/core/pipeline_generation.py +54 -0
- backend/core/pipeline_ingestion.py +465 -0
- backend/core/pipeline_memory.py +23 -0
- backend/core/pipeline_pageindex.py +263 -0
- backend/core/pipeline_retrieval.py +83 -0
- backend/core/pipeline_routing.py +149 -0
- backend/core/pipeline_supabase.py +46 -0
- backend/core/pipeline_types.py +65 -0
- backend/core/rate_limit.py +39 -0
- backend/core/tasks.py +60 -19
- backend/core/warmup_classifier.py +5 -1
- backend/eval/run_eval.py +66 -0
- backend/main.py +12 -15
- frontend/index.html +72 -20
- frontend/js/admin.js +234 -0
- frontend/js/api.js +126 -24
- frontend/js/chat.js +173 -30
- frontend/js/config.js +39 -5
- frontend/js/corpus.js +79 -3
- frontend/js/graph.js +98 -66
- frontend/js/main.js +312 -63
- frontend/js/state.js +21 -12
- recent_changes.txt +0 -0
- requirements.txt +2 -1
- scripts/rebuild_pageindex.py +83 -0
- shared/types.py +41 -4
- supabase/migrations/0010_query_traces_feedback_graph.sql +131 -0
- supabase/migrations/0011_admin_review_eval_workflow.sql +38 -0
- supabase/migrations/0012_lock_down_evaluation_datasets.sql +14 -0
- supabase/migrations/0013_backend_owned_retrieval_hardening.sql +260 -0
- supabase/migrations/0014_drop_legacy_category_centroid_policies.sql +20 -0
- supabase/migrations/0015_ingested_file_identity_json.sql +2 -0
- supabase/migrations/0016_ingestion_file_hash_checkpoints.sql +5 -0
- supabase/schema_backup.before_0013.sql +0 -0
- supabase/schema_backup.sql +1349 -908
- tests/test_guest_mode.py +74 -0
- tests/test_ingest_api.py +156 -0
- tests/test_pipeline_regressions.py +1831 -0
- tests/test_routing_stress_matrix.py +98 -0
ARCHITECTURE.md
CHANGED
|
@@ -133,7 +133,7 @@ morpheus/
|
|
| 133 |
|
| 134 |
| Function | Purpose |
|
| 135 |
|----------|---------|
|
| 136 |
-
| `hybrid_search(query_text, query_embedding, match_count, filter, semantic_weight, keyword_weight)` | Combined BM25 + pgvector search |
|
| 137 |
| `match_memory(query_embedding, match_session_id, match_count)` | Semantic search over chat history |
|
| 138 |
| `insert_document_chunk(p_id, p_content, p_metadata, p_embedding, p_user_id)` | Secure insert with explicit user_id |
|
| 139 |
| `get_document_types()` | Returns distinct categories for this tenant |
|
|
@@ -221,6 +221,12 @@ Step 1: Intent analysis (analyse_intent)
|
|
| 221 |
Reference queries ("summarise it"): replaced with previous query
|
| 222 |
Every query logged to intent_feedback for online retraining
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
Step 2: Query routing
|
| 225 |
Structural queries (table of contents, numbered items, specific codes)?
|
| 226 |
→ tree_search(): recursive traversal of document_trees for this user
|
|
@@ -231,6 +237,8 @@ Step 3: retrieve_chunks() — vector path
|
|
| 231 |
a) Follow-up detection
|
| 232 |
Query ≤8 words with pronouns (it/this/that/they)?
|
| 233 |
Reuse _last_chunks[session_key] — no re-search
|
|
|
|
|
|
|
| 234 |
|
| 235 |
b) Semantic cache check
|
| 236 |
Embed query (256-entry in-memory LRU cache)
|
|
|
|
| 133 |
|
| 134 |
| Function | Purpose |
|
| 135 |
|----------|---------|
|
| 136 |
+
| `hybrid_search(query_text, query_embedding, match_count, filter, semantic_weight, keyword_weight, p_user_id)` | Combined BM25 + pgvector search (tenant-scoped overload) |
|
| 137 |
| `match_memory(query_embedding, match_session_id, match_count)` | Semantic search over chat history |
|
| 138 |
| `insert_document_chunk(p_id, p_content, p_metadata, p_embedding, p_user_id)` | Secure insert with explicit user_id |
|
| 139 |
| `get_document_types()` | Returns distinct categories for this tenant |
|
|
|
|
| 221 |
Reference queries ("summarise it"): replaced with previous query
|
| 222 |
Every query logged to intent_feedback for online retraining
|
| 223 |
|
| 224 |
+
Step 1.5: Ambiguity / scope safety (check_query_ambiguity)
|
| 225 |
+
If the user has NOT pinned a document:
|
| 226 |
+
- If **multiple docs are in scope** and the query is **identity/page-scoped** (owner/title/publisher/cover/first page), Morpheus **asks the user to pick a document** (never guesses).
|
| 227 |
+
- Otherwise, Morpheus may ask a clarification question for generic queries when multiple docs match.
|
| 228 |
+
Implementation detail: ambiguity scoring uses `hybrid_search(..., p_user_id=...)` to avoid PostgREST overload ambiguity.
|
| 229 |
+
|
| 230 |
Step 2: Query routing
|
| 231 |
Structural queries (table of contents, numbered items, specific codes)?
|
| 232 |
→ tree_search(): recursive traversal of document_trees for this user
|
|
|
|
| 237 |
a) Follow-up detection
|
| 238 |
Query ≤8 words with pronouns (it/this/that/they)?
|
| 239 |
Reuse _last_chunks[session_key] — no re-search
|
| 240 |
+
Safety guard: ordinal follow-ups like "the second one" must have an explicit referent (a list);
|
| 241 |
+
otherwise the API asks for clarification instead of guessing.
|
| 242 |
|
| 243 |
b) Semantic cache check
|
| 244 |
Embed query (256-entry in-memory LRU cache)
|
backend/api/admin.py
CHANGED
|
@@ -1,11 +1,16 @@
|
|
| 1 |
"""backend/api/admin.py — Admin endpoints, protected by X-Admin-Key header."""
|
| 2 |
|
| 3 |
import os, hmac, logging # noqa: E401
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from fastapi import APIRouter, HTTPException, Header, Depends
|
|
|
|
|
|
|
| 5 |
from backend.core.auth_utils import require_auth_token
|
| 6 |
from backend.core.warmup_classifier import warmup, warmup_cross_encoder
|
| 7 |
-
from
|
| 8 |
-
from collections import Counter
|
| 9 |
|
| 10 |
log = logging.getLogger("morpheus.api.admin")
|
| 11 |
router = APIRouter()
|
|
@@ -19,6 +24,78 @@ def _check_admin(key: str):
|
|
| 19 |
raise HTTPException(status_code=403, detail="Invalid admin key.")
|
| 20 |
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
@router.post("/warmup")
|
| 23 |
def run_warmup(x_admin_key: str = Header(..., alias="X-Admin-Key")):
|
| 24 |
_check_admin(x_admin_key)
|
|
@@ -105,4 +182,210 @@ def get_corpus_health(
|
|
| 105 |
"recommendation": "Prompt user to upload documents regarding content gaps."
|
| 106 |
if missing_topics
|
| 107 |
else "Corpus coverage is sufficient.",
|
| 108 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""backend/api/admin.py — Admin endpoints, protected by X-Admin-Key header."""
|
| 2 |
|
| 3 |
import os, hmac, logging # noqa: E401
|
| 4 |
+
from datetime import datetime, timedelta, timezone
|
| 5 |
+
from collections import Counter
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
from fastapi import APIRouter, HTTPException, Header, Depends
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
|
| 11 |
from backend.core.auth_utils import require_auth_token
|
| 12 |
from backend.core.warmup_classifier import warmup, warmup_cross_encoder
|
| 13 |
+
from backend.core.pipeline import _build_service_supabase_client
|
|
|
|
| 14 |
|
| 15 |
log = logging.getLogger("morpheus.api.admin")
|
| 16 |
router = APIRouter()
|
|
|
|
| 24 |
raise HTTPException(status_code=403, detail="Invalid admin key.")
|
| 25 |
|
| 26 |
|
| 27 |
+
class ReviewPayload(BaseModel):
|
| 28 |
+
review_state: str = "reviewed"
|
| 29 |
+
review_notes: Optional[str] = None
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _admin_client():
|
| 33 |
+
return _build_service_supabase_client()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _trace_sort_key(row: dict):
|
| 37 |
+
return row.get("created_at") or ""
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _feedback_sort_key(row: dict):
|
| 41 |
+
return row.get("created_at") or ""
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _load_recent_traces(*, limit: int = 100) -> list[dict]:
|
| 45 |
+
rows = (
|
| 46 |
+
_admin_client()
|
| 47 |
+
.table("query_traces")
|
| 48 |
+
.select(
|
| 49 |
+
"trace_id, question, route_mode, selected_experts, expert_weights, "
|
| 50 |
+
"document_types, doc_diagnostics, failure_modes, quality_metrics, "
|
| 51 |
+
"answer_preview, latency_ms, review_state, review_notes, reviewed_at, "
|
| 52 |
+
"reviewed_by, promoted_to_eval, created_at"
|
| 53 |
+
)
|
| 54 |
+
.limit(limit)
|
| 55 |
+
.execute()
|
| 56 |
+
.data
|
| 57 |
+
or []
|
| 58 |
+
)
|
| 59 |
+
return sorted(rows, key=_trace_sort_key, reverse=True)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _load_recent_feedback(*, limit: int = 100) -> list[dict]:
|
| 63 |
+
rows = (
|
| 64 |
+
_admin_client()
|
| 65 |
+
.table("answer_feedback")
|
| 66 |
+
.select(
|
| 67 |
+
"id, trace_id, helpful, accepted, reason_code, correction_text, "
|
| 68 |
+
"promote_to_eval, review_state, review_notes, reviewed_at, reviewed_by, "
|
| 69 |
+
"promoted_at, created_at, user_id"
|
| 70 |
+
)
|
| 71 |
+
.limit(limit)
|
| 72 |
+
.execute()
|
| 73 |
+
.data
|
| 74 |
+
or []
|
| 75 |
+
)
|
| 76 |
+
return sorted(rows, key=_feedback_sort_key, reverse=True)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _build_eval_dataset_row(trace_row: dict, feedback_row: dict) -> dict:
|
| 80 |
+
correction_text = (feedback_row.get("correction_text") or "").strip()
|
| 81 |
+
answer_preview = (trace_row.get("answer_preview") or "").strip()
|
| 82 |
+
return {
|
| 83 |
+
"trace_id": trace_row.get("trace_id"),
|
| 84 |
+
"source": "feedback_trace",
|
| 85 |
+
"question": trace_row.get("question"),
|
| 86 |
+
"gold_context_refs": [],
|
| 87 |
+
"gold_evidence_text": correction_text or answer_preview,
|
| 88 |
+
"is_answerable": bool(
|
| 89 |
+
feedback_row.get("accepted")
|
| 90 |
+
or feedback_row.get("helpful")
|
| 91 |
+
),
|
| 92 |
+
"failure_modes": trace_row.get("failure_modes") or [],
|
| 93 |
+
"doc_diagnostics": trace_row.get("doc_diagnostics") or [],
|
| 94 |
+
"reason_code": feedback_row.get("reason_code"),
|
| 95 |
+
"is_active": False,
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
@router.post("/warmup")
|
| 100 |
def run_warmup(x_admin_key: str = Header(..., alias="X-Admin-Key")):
|
| 101 |
_check_admin(x_admin_key)
|
|
|
|
| 182 |
"recommendation": "Prompt user to upload documents regarding content gaps."
|
| 183 |
if missing_topics
|
| 184 |
else "Corpus coverage is sufficient.",
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
@router.get("/traces")
|
| 189 |
+
def list_query_traces(
|
| 190 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 191 |
+
limit: int = 50,
|
| 192 |
+
route_mode: Optional[str] = None,
|
| 193 |
+
failure_mode: Optional[str] = None,
|
| 194 |
+
category: Optional[str] = None,
|
| 195 |
+
hours: int = 168,
|
| 196 |
+
review_state: Optional[str] = None,
|
| 197 |
+
):
|
| 198 |
+
_check_admin(x_admin_key)
|
| 199 |
+
traces = _load_recent_traces(limit=max(limit * 3, 100))
|
| 200 |
+
cutoff = datetime.now(timezone.utc) - timedelta(hours=max(1, hours))
|
| 201 |
+
filtered = []
|
| 202 |
+
for row in traces:
|
| 203 |
+
created_raw = row.get("created_at")
|
| 204 |
+
created_at = None
|
| 205 |
+
if isinstance(created_raw, str):
|
| 206 |
+
try:
|
| 207 |
+
created_at = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
|
| 208 |
+
except Exception:
|
| 209 |
+
created_at = None
|
| 210 |
+
if created_at and created_at < cutoff:
|
| 211 |
+
continue
|
| 212 |
+
if route_mode and row.get("route_mode") != route_mode:
|
| 213 |
+
continue
|
| 214 |
+
if failure_mode and failure_mode not in (row.get("failure_modes") or []):
|
| 215 |
+
continue
|
| 216 |
+
if review_state and row.get("review_state") != review_state:
|
| 217 |
+
continue
|
| 218 |
+
if category and category not in (row.get("document_types") or []):
|
| 219 |
+
continue
|
| 220 |
+
filtered.append(row)
|
| 221 |
+
return {"items": filtered[:limit]}
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
@router.get("/traces/{trace_id}")
|
| 225 |
+
def get_query_trace(
|
| 226 |
+
trace_id: str,
|
| 227 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 228 |
+
):
|
| 229 |
+
_check_admin(x_admin_key)
|
| 230 |
+
sb = _admin_client()
|
| 231 |
+
trace_rows = (
|
| 232 |
+
sb.table("query_traces")
|
| 233 |
+
.select("*")
|
| 234 |
+
.eq("trace_id", trace_id)
|
| 235 |
+
.limit(1)
|
| 236 |
+
.execute()
|
| 237 |
+
.data
|
| 238 |
+
or []
|
| 239 |
+
)
|
| 240 |
+
if not trace_rows:
|
| 241 |
+
raise HTTPException(status_code=404, detail="Trace not found.")
|
| 242 |
+
feedback_rows = (
|
| 243 |
+
sb.table("answer_feedback")
|
| 244 |
+
.select("*")
|
| 245 |
+
.eq("trace_id", trace_id)
|
| 246 |
+
.execute()
|
| 247 |
+
.data
|
| 248 |
+
or []
|
| 249 |
+
)
|
| 250 |
+
return {"trace": trace_rows[0], "feedback": sorted(feedback_rows, key=_feedback_sort_key, reverse=True)}
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
@router.post("/traces/{trace_id}/review")
|
| 254 |
+
def review_query_trace(
|
| 255 |
+
trace_id: str,
|
| 256 |
+
payload: ReviewPayload,
|
| 257 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 258 |
+
):
|
| 259 |
+
_check_admin(x_admin_key)
|
| 260 |
+
now_iso = datetime.now(timezone.utc).isoformat()
|
| 261 |
+
_admin_client().table("query_traces").update(
|
| 262 |
+
{
|
| 263 |
+
"review_state": payload.review_state,
|
| 264 |
+
"review_notes": payload.review_notes,
|
| 265 |
+
"reviewed_at": now_iso,
|
| 266 |
+
"reviewed_by": "admin",
|
| 267 |
+
}
|
| 268 |
+
).eq("trace_id", trace_id).execute()
|
| 269 |
+
return {"ok": True}
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
@router.get("/feedback")
|
| 273 |
+
def list_feedback(
|
| 274 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 275 |
+
limit: int = 50,
|
| 276 |
+
review_state: Optional[str] = None,
|
| 277 |
+
promote_only: bool = False,
|
| 278 |
+
):
|
| 279 |
+
_check_admin(x_admin_key)
|
| 280 |
+
rows = _load_recent_feedback(limit=max(limit * 3, 100))
|
| 281 |
+
filtered = []
|
| 282 |
+
for row in rows:
|
| 283 |
+
if review_state and row.get("review_state") != review_state:
|
| 284 |
+
continue
|
| 285 |
+
if promote_only and not row.get("promote_to_eval"):
|
| 286 |
+
continue
|
| 287 |
+
filtered.append(row)
|
| 288 |
+
return {"items": filtered[:limit]}
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
@router.get("/feedback/{feedback_id}")
|
| 292 |
+
def get_feedback_detail(
|
| 293 |
+
feedback_id: int,
|
| 294 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 295 |
+
):
|
| 296 |
+
_check_admin(x_admin_key)
|
| 297 |
+
sb = _admin_client()
|
| 298 |
+
rows = (
|
| 299 |
+
sb.table("answer_feedback")
|
| 300 |
+
.select("*")
|
| 301 |
+
.eq("id", feedback_id)
|
| 302 |
+
.limit(1)
|
| 303 |
+
.execute()
|
| 304 |
+
.data
|
| 305 |
+
or []
|
| 306 |
+
)
|
| 307 |
+
if not rows:
|
| 308 |
+
raise HTTPException(status_code=404, detail="Feedback not found.")
|
| 309 |
+
feedback = rows[0]
|
| 310 |
+
trace_rows = (
|
| 311 |
+
sb.table("query_traces")
|
| 312 |
+
.select("*")
|
| 313 |
+
.eq("trace_id", feedback.get("trace_id"))
|
| 314 |
+
.limit(1)
|
| 315 |
+
.execute()
|
| 316 |
+
.data
|
| 317 |
+
or []
|
| 318 |
+
)
|
| 319 |
+
return {"feedback": feedback, "trace": trace_rows[0] if trace_rows else None}
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
@router.post("/feedback/{feedback_id}/review")
|
| 323 |
+
def review_feedback(
|
| 324 |
+
feedback_id: int,
|
| 325 |
+
payload: ReviewPayload,
|
| 326 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 327 |
+
):
|
| 328 |
+
_check_admin(x_admin_key)
|
| 329 |
+
now_iso = datetime.now(timezone.utc).isoformat()
|
| 330 |
+
_admin_client().table("answer_feedback").update(
|
| 331 |
+
{
|
| 332 |
+
"review_state": payload.review_state,
|
| 333 |
+
"review_notes": payload.review_notes,
|
| 334 |
+
"reviewed_at": now_iso,
|
| 335 |
+
"reviewed_by": "admin",
|
| 336 |
+
}
|
| 337 |
+
).eq("id", feedback_id).execute()
|
| 338 |
+
return {"ok": True}
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
@router.post("/feedback/{feedback_id}/promote")
|
| 342 |
+
def promote_feedback_to_eval(
|
| 343 |
+
feedback_id: int,
|
| 344 |
+
x_admin_key: str = Header(..., alias="X-Admin-Key"),
|
| 345 |
+
):
|
| 346 |
+
_check_admin(x_admin_key)
|
| 347 |
+
sb = _admin_client()
|
| 348 |
+
feedback_rows = (
|
| 349 |
+
sb.table("answer_feedback")
|
| 350 |
+
.select("*")
|
| 351 |
+
.eq("id", feedback_id)
|
| 352 |
+
.limit(1)
|
| 353 |
+
.execute()
|
| 354 |
+
.data
|
| 355 |
+
or []
|
| 356 |
+
)
|
| 357 |
+
if not feedback_rows:
|
| 358 |
+
raise HTTPException(status_code=404, detail="Feedback not found.")
|
| 359 |
+
feedback = feedback_rows[0]
|
| 360 |
+
trace_rows = (
|
| 361 |
+
sb.table("query_traces")
|
| 362 |
+
.select("*")
|
| 363 |
+
.eq("trace_id", feedback.get("trace_id"))
|
| 364 |
+
.limit(1)
|
| 365 |
+
.execute()
|
| 366 |
+
.data
|
| 367 |
+
or []
|
| 368 |
+
)
|
| 369 |
+
if not trace_rows:
|
| 370 |
+
raise HTTPException(status_code=404, detail="Trace not found.")
|
| 371 |
+
trace = trace_rows[0]
|
| 372 |
+
row = _build_eval_dataset_row(trace, feedback)
|
| 373 |
+
sb.table("evaluation_datasets").upsert(row, on_conflict="trace_id").execute()
|
| 374 |
+
now_iso = datetime.now(timezone.utc).isoformat()
|
| 375 |
+
sb.table("answer_feedback").update(
|
| 376 |
+
{
|
| 377 |
+
"review_state": "promoted",
|
| 378 |
+
"promoted_at": now_iso,
|
| 379 |
+
"reviewed_at": now_iso,
|
| 380 |
+
"reviewed_by": "admin",
|
| 381 |
+
}
|
| 382 |
+
).eq("id", feedback_id).execute()
|
| 383 |
+
sb.table("query_traces").update(
|
| 384 |
+
{
|
| 385 |
+
"review_state": "promoted",
|
| 386 |
+
"promoted_to_eval": True,
|
| 387 |
+
"reviewed_at": now_iso,
|
| 388 |
+
"reviewed_by": "admin",
|
| 389 |
+
}
|
| 390 |
+
).eq("trace_id", trace.get("trace_id")).execute()
|
| 391 |
+
return {"ok": True}
|
backend/api/auth.py
CHANGED
|
@@ -7,10 +7,11 @@ declare `auth: AuthContext = Depends(require_auth)` — see the pattern
|
|
| 7 |
at the bottom of this file and replicate it in each router.
|
| 8 |
"""
|
| 9 |
|
| 10 |
-
from fastapi import APIRouter, Depends
|
| 11 |
|
| 12 |
-
from backend.core.auth_utils import require_auth_token
|
| 13 |
-
from backend.
|
|
|
|
| 14 |
from shared.types import AuthRequest, AuthResponse
|
| 15 |
|
| 16 |
router = APIRouter()
|
|
@@ -31,7 +32,7 @@ def verify(req: AuthRequest):
|
|
| 31 |
@router.post("/admin", response_model=AuthResponse)
|
| 32 |
def admin_verify(req: AuthRequest):
|
| 33 |
if verify_admin_key(req.password):
|
| 34 |
-
return AuthResponse(valid=True,
|
| 35 |
return AuthResponse(valid=False, message="Invalid admin key.")
|
| 36 |
|
| 37 |
|
|
@@ -40,3 +41,73 @@ def admin_verify(req: AuthRequest):
|
|
| 40 |
async def get_me(user_id: str = Depends(require_auth_token)):
|
| 41 |
return {"user_id": user_id, "authenticated": True}
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
at the bottom of this file and replicate it in each router.
|
| 8 |
"""
|
| 9 |
|
| 10 |
+
from fastapi import APIRouter, Depends, Header, HTTPException
|
| 11 |
|
| 12 |
+
from backend.core.auth_utils import is_guest_token, require_auth_token
|
| 13 |
+
from backend.core.pipeline import _build_service_supabase_client
|
| 14 |
+
from backend.services.auth import verify_admin_key, verify_password
|
| 15 |
from shared.types import AuthRequest, AuthResponse
|
| 16 |
|
| 17 |
router = APIRouter()
|
|
|
|
| 32 |
@router.post("/admin", response_model=AuthResponse)
|
| 33 |
def admin_verify(req: AuthRequest):
|
| 34 |
if verify_admin_key(req.password):
|
| 35 |
+
return AuthResponse(valid=True, message="Admin verified.")
|
| 36 |
return AuthResponse(valid=False, message="Invalid admin key.")
|
| 37 |
|
| 38 |
|
|
|
|
| 41 |
async def get_me(user_id: str = Depends(require_auth_token)):
|
| 42 |
return {"user_id": user_id, "authenticated": True}
|
| 43 |
|
| 44 |
+
|
| 45 |
+
@router.delete("/guest-workspace")
|
| 46 |
+
async def clear_guest_workspace(
|
| 47 |
+
user_id: str = Depends(require_auth_token),
|
| 48 |
+
x_auth_token: str = Header(None, alias="X-Auth-Token"),
|
| 49 |
+
):
|
| 50 |
+
if not is_guest_token(x_auth_token):
|
| 51 |
+
raise HTTPException(status_code=403, detail="Guest workspace cleanup is only for guest sessions.")
|
| 52 |
+
|
| 53 |
+
sb = _build_service_supabase_client()
|
| 54 |
+
|
| 55 |
+
# Preserve anonymized adaptive signals while removing the guest's actual workspace.
|
| 56 |
+
try:
|
| 57 |
+
sb.table("query_traces").update(
|
| 58 |
+
{
|
| 59 |
+
"user_id": None,
|
| 60 |
+
"session_id": "guest_archived",
|
| 61 |
+
"question": "[guest session removed]",
|
| 62 |
+
"pinned_file_hashes": [],
|
| 63 |
+
"selected_chunk_ids": [],
|
| 64 |
+
"doc_diagnostics": [],
|
| 65 |
+
"answer_preview": None,
|
| 66 |
+
"document_types": [],
|
| 67 |
+
}
|
| 68 |
+
).eq("user_id", user_id).execute()
|
| 69 |
+
except Exception:
|
| 70 |
+
pass
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
sb.table("answer_feedback").update(
|
| 74 |
+
{
|
| 75 |
+
"user_id": None,
|
| 76 |
+
"correction_text": None,
|
| 77 |
+
}
|
| 78 |
+
).eq("user_id", user_id).execute()
|
| 79 |
+
except Exception:
|
| 80 |
+
pass
|
| 81 |
+
|
| 82 |
+
try:
|
| 83 |
+
sb.table("evaluation_logs").update(
|
| 84 |
+
{
|
| 85 |
+
"user_id": None,
|
| 86 |
+
"question": "[guest session removed]",
|
| 87 |
+
}
|
| 88 |
+
).eq("user_id", user_id).execute()
|
| 89 |
+
except Exception:
|
| 90 |
+
pass
|
| 91 |
+
|
| 92 |
+
def _purge(table_name: str) -> None:
|
| 93 |
+
try:
|
| 94 |
+
sb.table(table_name).delete().eq("user_id", user_id).execute()
|
| 95 |
+
except Exception:
|
| 96 |
+
# Optional/older tables should not break guest cleanup.
|
| 97 |
+
pass
|
| 98 |
+
|
| 99 |
+
# Delete child/content tables first, then registry-ish tables.
|
| 100 |
+
for table_name in (
|
| 101 |
+
"documents",
|
| 102 |
+
"document_trees",
|
| 103 |
+
"chat_memory",
|
| 104 |
+
"ingestion_retry_logs",
|
| 105 |
+
"rerank_feedback",
|
| 106 |
+
"intent_feedback",
|
| 107 |
+
"graph_edges",
|
| 108 |
+
"graph_nodes",
|
| 109 |
+
"ingested_files",
|
| 110 |
+
):
|
| 111 |
+
_purge(table_name)
|
| 112 |
+
|
| 113 |
+
return {"ok": True, "message": "Guest workspace cleared."}
|
backend/api/frontend_config.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from fastapi import APIRouter
|
| 2 |
from backend.core import config
|
| 3 |
|
| 4 |
router = APIRouter()
|
|
@@ -9,7 +9,13 @@ def get_frontend_config():
|
|
| 9 |
Returns public config values the frontend needs.
|
| 10 |
Only exposes the anon key (safe by design) — never the service key.
|
| 11 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
return {
|
| 13 |
"supabase_url": config.SUPABASE_URL,
|
| 14 |
"supabase_anon": config.SUPABASE_ANON_KEY,
|
| 15 |
-
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException
|
| 2 |
from backend.core import config
|
| 3 |
|
| 4 |
router = APIRouter()
|
|
|
|
| 9 |
Returns public config values the frontend needs.
|
| 10 |
Only exposes the anon key (safe by design) — never the service key.
|
| 11 |
"""
|
| 12 |
+
if not config.SUPABASE_URL or not config.SUPABASE_ANON_KEY:
|
| 13 |
+
raise HTTPException(
|
| 14 |
+
status_code=503,
|
| 15 |
+
detail="Supabase frontend config is missing on the server.",
|
| 16 |
+
)
|
| 17 |
return {
|
| 18 |
"supabase_url": config.SUPABASE_URL,
|
| 19 |
"supabase_anon": config.SUPABASE_ANON_KEY,
|
| 20 |
+
"guest_enabled": config.GUEST_MODE_ENABLED,
|
| 21 |
+
}
|
backend/api/ingest.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
import os
|
| 2 |
import tempfile
|
| 3 |
import logging
|
| 4 |
-
from fastapi import APIRouter, UploadFile, File, HTTPException, Header, Depends
|
| 5 |
-
from backend.core
|
|
|
|
|
|
|
| 6 |
from backend.core.tasks import process_pdf_task
|
| 7 |
from backend.core.tasks import celery_app
|
| 8 |
|
|
@@ -10,15 +12,39 @@ log = logging.getLogger("morpheus.api.ingest")
|
|
| 10 |
router = APIRouter()
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
@router.post("/upload")
|
|
|
|
| 14 |
async def upload(
|
|
|
|
| 15 |
file: UploadFile = File(...),
|
| 16 |
user_id: str = Depends(require_auth_token),
|
| 17 |
x_auth_token: str = Header(None, alias="X-Auth-Token"),
|
| 18 |
):
|
|
|
|
| 19 |
if not file.filename.lower().endswith(".pdf"):
|
| 20 |
raise HTTPException(status_code=400, detail="Only PDF files are supported.")
|
| 21 |
|
|
|
|
|
|
|
| 22 |
# NEW: Secure file signature validation using python-magic
|
| 23 |
import magic
|
| 24 |
|
|
@@ -33,6 +59,8 @@ async def upload(
|
|
| 33 |
)
|
| 34 |
|
| 35 |
# ── Per-user document limit ───────────────────────────────────────────────
|
|
|
|
|
|
|
| 36 |
try:
|
| 37 |
from backend.core.pipeline import _build_supabase_client
|
| 38 |
|
|
@@ -43,20 +71,33 @@ async def upload(
|
|
| 43 |
.eq("user_id", user_id)
|
| 44 |
.execute()
|
| 45 |
)
|
| 46 |
-
if (result.count or 0) >=
|
| 47 |
raise HTTPException(
|
| 48 |
-
status_code=429,
|
|
|
|
| 49 |
)
|
| 50 |
except HTTPException:
|
| 51 |
raise
|
| 52 |
-
except Exception:
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
# Safely save to disk as before
|
| 56 |
tmp_fd, tmp_path = tempfile.mkstemp(suffix=f"_{file.filename}")
|
| 57 |
os.close(tmp_fd) # close fd immediately, manage file separately
|
| 58 |
try:
|
| 59 |
contents = await file.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
with open(tmp_path, "wb") as f:
|
| 61 |
f.write(contents)
|
| 62 |
task = process_pdf_task.delay(tmp_path, file.filename, x_auth_token)
|
|
@@ -65,18 +106,19 @@ async def upload(
|
|
| 65 |
"task_id": task.id,
|
| 66 |
"filename": file.filename,
|
| 67 |
}
|
|
|
|
|
|
|
|
|
|
| 68 |
except Exception as e:
|
| 69 |
-
log.
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
except OSError:
|
| 73 |
-
pass
|
| 74 |
-
raise HTTPException(status_code=500, detail="Failed to queue file.")
|
| 75 |
|
| 76 |
|
| 77 |
# NEW ROUTE: The frontend will poll this every 2 seconds
|
| 78 |
@router.get("/status/{task_id}")
|
| 79 |
def get_ingest_status(task_id: str):
|
|
|
|
| 80 |
task_result = celery_app.AsyncResult(task_id)
|
| 81 |
|
| 82 |
if task_result.state == "PENDING":
|
|
|
|
| 1 |
import os
|
| 2 |
import tempfile
|
| 3 |
import logging
|
| 4 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Header, Depends, Request
|
| 5 |
+
from backend.core import config
|
| 6 |
+
from backend.core.auth_utils import is_guest_token, require_auth_token
|
| 7 |
+
from backend.core.rate_limit import limiter
|
| 8 |
from backend.core.tasks import process_pdf_task
|
| 9 |
from backend.core.tasks import celery_app
|
| 10 |
|
|
|
|
| 12 |
router = APIRouter()
|
| 13 |
|
| 14 |
|
| 15 |
+
def _cleanup_temp_upload(tmp_path: str) -> None:
|
| 16 |
+
if not tmp_path:
|
| 17 |
+
return
|
| 18 |
+
try:
|
| 19 |
+
os.unlink(tmp_path)
|
| 20 |
+
except FileNotFoundError:
|
| 21 |
+
return
|
| 22 |
+
except OSError as exc:
|
| 23 |
+
log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _ensure_ingest_worker_available() -> None:
|
| 27 |
+
if celery_app is None or not hasattr(process_pdf_task, "delay"):
|
| 28 |
+
raise HTTPException(
|
| 29 |
+
status_code=503,
|
| 30 |
+
detail="Background ingestion worker is unavailable.",
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
@router.post("/upload")
|
| 35 |
+
@limiter.limit("12/hour")
|
| 36 |
async def upload(
|
| 37 |
+
request: Request,
|
| 38 |
file: UploadFile = File(...),
|
| 39 |
user_id: str = Depends(require_auth_token),
|
| 40 |
x_auth_token: str = Header(None, alias="X-Auth-Token"),
|
| 41 |
):
|
| 42 |
+
del request
|
| 43 |
if not file.filename.lower().endswith(".pdf"):
|
| 44 |
raise HTTPException(status_code=400, detail="Only PDF files are supported.")
|
| 45 |
|
| 46 |
+
guest_workspace = is_guest_token(x_auth_token)
|
| 47 |
+
|
| 48 |
# NEW: Secure file signature validation using python-magic
|
| 49 |
import magic
|
| 50 |
|
|
|
|
| 59 |
)
|
| 60 |
|
| 61 |
# ── Per-user document limit ───────────────────────────────────────────────
|
| 62 |
+
doc_limit = config.GUEST_MAX_DOCS if guest_workspace else config.MAX_DOCS_PER_USER
|
| 63 |
+
|
| 64 |
try:
|
| 65 |
from backend.core.pipeline import _build_supabase_client
|
| 66 |
|
|
|
|
| 71 |
.eq("user_id", user_id)
|
| 72 |
.execute()
|
| 73 |
)
|
| 74 |
+
if (result.count or 0) >= doc_limit:
|
| 75 |
raise HTTPException(
|
| 76 |
+
status_code=429,
|
| 77 |
+
detail=f"Document limit reached ({doc_limit} max).",
|
| 78 |
)
|
| 79 |
except HTTPException:
|
| 80 |
raise
|
| 81 |
+
except Exception as exc:
|
| 82 |
+
log.error("Upload limit check failed for user %s: %s", user_id, exc)
|
| 83 |
+
raise HTTPException(
|
| 84 |
+
status_code=503,
|
| 85 |
+
detail="Could not verify upload limits right now. Please try again.",
|
| 86 |
+
) from exc
|
| 87 |
+
|
| 88 |
+
_ensure_ingest_worker_available()
|
| 89 |
|
| 90 |
# Safely save to disk as before
|
| 91 |
tmp_fd, tmp_path = tempfile.mkstemp(suffix=f"_{file.filename}")
|
| 92 |
os.close(tmp_fd) # close fd immediately, manage file separately
|
| 93 |
try:
|
| 94 |
contents = await file.read()
|
| 95 |
+
max_upload_mb = config.GUEST_MAX_UPLOAD_MB if guest_workspace else config.MAX_UPLOAD_MB
|
| 96 |
+
if len(contents) > max_upload_mb * 1024 * 1024:
|
| 97 |
+
raise HTTPException(
|
| 98 |
+
status_code=413,
|
| 99 |
+
detail=f"File too large ({max_upload_mb} MB max).",
|
| 100 |
+
)
|
| 101 |
with open(tmp_path, "wb") as f:
|
| 102 |
f.write(contents)
|
| 103 |
task = process_pdf_task.delay(tmp_path, file.filename, x_auth_token)
|
|
|
|
| 106 |
"task_id": task.id,
|
| 107 |
"filename": file.filename,
|
| 108 |
}
|
| 109 |
+
except HTTPException:
|
| 110 |
+
_cleanup_temp_upload(tmp_path)
|
| 111 |
+
raise
|
| 112 |
except Exception as e:
|
| 113 |
+
log.exception("Failed to queue file: %s", e)
|
| 114 |
+
_cleanup_temp_upload(tmp_path)
|
| 115 |
+
raise HTTPException(status_code=500, detail="Failed to queue file.") from e
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
# NEW ROUTE: The frontend will poll this every 2 seconds
|
| 119 |
@router.get("/status/{task_id}")
|
| 120 |
def get_ingest_status(task_id: str):
|
| 121 |
+
_ensure_ingest_worker_available()
|
| 122 |
task_result = celery_app.AsyncResult(task_id)
|
| 123 |
|
| 124 |
if task_result.state == "PENDING":
|
backend/api/query.py
CHANGED
|
@@ -2,21 +2,58 @@
|
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import asyncio
|
| 5 |
-
from fastapi import APIRouter, Header, Depends, Request
|
| 6 |
from fastapi.responses import StreamingResponse
|
| 7 |
-
from shared.types import QueryRequest, SourceChunk
|
| 8 |
from backend.core.pipeline import (
|
| 9 |
retrieve_chunks_routed,
|
| 10 |
generate_answer_stream,
|
| 11 |
analyse_intent,
|
|
|
|
|
|
|
| 12 |
)
|
| 13 |
from backend.core.auth_utils import require_auth_token
|
| 14 |
-
from backend.
|
| 15 |
|
| 16 |
log = logging.getLogger("morpheus.api.query")
|
| 17 |
router = APIRouter()
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def _normalise_original_content(raw):
|
| 21 |
"""Best-effort decode for metadata that may already be dict or JSON string."""
|
| 22 |
if isinstance(raw, dict):
|
|
@@ -91,14 +128,68 @@ async def query(
|
|
| 91 |
user_id = user_id,
|
| 92 |
)
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
if not intent.get("is_clear"):
|
| 95 |
# Stream clarification question as a normal assistant message
|
| 96 |
# User answers it → next turn history resolves the subject
|
| 97 |
question = intent.get("clarification_question", "Could you clarify?")
|
| 98 |
yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
|
| 99 |
-
yield "data: " + json.dumps({"type": "done", "sources": [], "images": []}) + "\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
return
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
# ── Step 2: Retrieve using enriched query ─────────────────────────
|
| 103 |
# enriched_query has better embedding signal (category/history injected)
|
| 104 |
# but we answer with the ORIGINAL query so the response sounds natural
|
|
@@ -117,12 +208,15 @@ async def query(
|
|
| 117 |
user_id=user_id,
|
| 118 |
original_query=req.query,
|
| 119 |
eval_mode=(x_eval_mode == "true"),
|
|
|
|
| 120 |
),
|
| 121 |
)
|
| 122 |
|
| 123 |
# ── Step 3: Stream answer tokens ──────────────────────────────────
|
| 124 |
images = []
|
| 125 |
done_sources = []
|
|
|
|
|
|
|
| 126 |
# 🚀 Define the boolean once for readability
|
| 127 |
is_eval = x_eval_mode == "true"
|
| 128 |
async for event in generate_answer_stream(
|
|
@@ -133,12 +227,15 @@ async def query(
|
|
| 133 |
access_token=x_auth_token,
|
| 134 |
category=category,
|
| 135 |
eval_mode=is_eval,
|
|
|
|
| 136 |
):
|
| 137 |
if event["type"] == "token":
|
| 138 |
yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
|
| 139 |
elif event["type"] == "done":
|
| 140 |
images = event.get("images", [])
|
| 141 |
done_sources = event.get("sources", []) or []
|
|
|
|
|
|
|
| 142 |
|
| 143 |
# ── Step 4: Emit sources + images ─────────────────────────────────
|
| 144 |
sources = done_sources or _build_sources_from_chunks(
|
|
@@ -149,6 +246,8 @@ async def query(
|
|
| 149 |
"type": "done",
|
| 150 |
"sources": sources,
|
| 151 |
"images": images,
|
|
|
|
|
|
|
| 152 |
}) + "\n\n"
|
| 153 |
|
| 154 |
except Exception as e:
|
|
@@ -178,3 +277,16 @@ async def query(
|
|
| 178 |
"Access-Control-Allow-Origin": "*",
|
| 179 |
}
|
| 180 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import json
|
| 3 |
import logging
|
| 4 |
import asyncio
|
| 5 |
+
from fastapi import APIRouter, Header, Depends, Request, HTTPException
|
| 6 |
from fastapi.responses import StreamingResponse
|
| 7 |
+
from shared.types import AnswerFeedback, QueryRequest, SourceChunk
|
| 8 |
from backend.core.pipeline import (
|
| 9 |
retrieve_chunks_routed,
|
| 10 |
generate_answer_stream,
|
| 11 |
analyse_intent,
|
| 12 |
+
check_query_ambiguity,
|
| 13 |
+
record_answer_feedback,
|
| 14 |
)
|
| 15 |
from backend.core.auth_utils import require_auth_token
|
| 16 |
+
from backend.core.rate_limit import limiter
|
| 17 |
|
| 18 |
log = logging.getLogger("morpheus.api.query")
|
| 19 |
router = APIRouter()
|
| 20 |
|
| 21 |
|
| 22 |
+
def _contains_ordinal_followup(query: str) -> bool:
|
| 23 |
+
q = (query or "").strip().lower()
|
| 24 |
+
if not q:
|
| 25 |
+
return False
|
| 26 |
+
return any(
|
| 27 |
+
phrase in q
|
| 28 |
+
for phrase in (
|
| 29 |
+
"the second one",
|
| 30 |
+
"the first one",
|
| 31 |
+
"the other one",
|
| 32 |
+
"second one",
|
| 33 |
+
"first one",
|
| 34 |
+
"other one",
|
| 35 |
+
)
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _history_has_explicit_enumeration(history: list[dict]) -> bool:
|
| 40 |
+
"""
|
| 41 |
+
Heuristic: if the last assistant message contains an explicit list, then
|
| 42 |
+
ordinal follow-ups (\"second one\") can be resolved. Otherwise, ask.
|
| 43 |
+
"""
|
| 44 |
+
for msg in reversed(history or []):
|
| 45 |
+
if (msg.get("role") or "").lower() != "assistant":
|
| 46 |
+
continue
|
| 47 |
+
content = str(msg.get("content") or "")
|
| 48 |
+
if not content.strip():
|
| 49 |
+
return False
|
| 50 |
+
# Common enumeration patterns (numbers, bullets).
|
| 51 |
+
if any(token in content for token in ("\n1.", "\n2.", "\n- ", "\n• ")):
|
| 52 |
+
return True
|
| 53 |
+
return False
|
| 54 |
+
return False
|
| 55 |
+
|
| 56 |
+
|
| 57 |
def _normalise_original_content(raw):
|
| 58 |
"""Best-effort decode for metadata that may already be dict or JSON string."""
|
| 59 |
if isinstance(raw, dict):
|
|
|
|
| 128 |
user_id = user_id,
|
| 129 |
)
|
| 130 |
|
| 131 |
+
if intent.get("route_class") == "no_retrieval":
|
| 132 |
+
yield "data: " + json.dumps({
|
| 133 |
+
"type": "token",
|
| 134 |
+
"content": "Ask me about your uploaded documents or a topic inside them, and I’ll dig in.",
|
| 135 |
+
}) + "\n\n"
|
| 136 |
+
yield "data: " + json.dumps({
|
| 137 |
+
"type": "done",
|
| 138 |
+
"sources": [],
|
| 139 |
+
"images": [],
|
| 140 |
+
"trace_id": None,
|
| 141 |
+
"doc_diagnostics": [],
|
| 142 |
+
}) + "\n\n"
|
| 143 |
+
return
|
| 144 |
+
|
| 145 |
if not intent.get("is_clear"):
|
| 146 |
# Stream clarification question as a normal assistant message
|
| 147 |
# User answers it → next turn history resolves the subject
|
| 148 |
question = intent.get("clarification_question", "Could you clarify?")
|
| 149 |
yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
|
| 150 |
+
yield "data: " + json.dumps({"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}) + "\n\n"
|
| 151 |
+
return
|
| 152 |
+
|
| 153 |
+
# Guardrail: ordinal follow-ups without an explicit referent should not guess.
|
| 154 |
+
if (
|
| 155 |
+
intent.get("route_class") == "follow_up"
|
| 156 |
+
and _contains_ordinal_followup(req.query)
|
| 157 |
+
and not _history_has_explicit_enumeration(history)
|
| 158 |
+
):
|
| 159 |
+
yield "data: " + json.dumps(
|
| 160 |
+
{
|
| 161 |
+
"type": "token",
|
| 162 |
+
"content": "Second one of what? Please reference the items you mean (e.g., paste the list or restate the names).",
|
| 163 |
+
}
|
| 164 |
+
) + "\n\n"
|
| 165 |
+
yield "data: " + json.dumps(
|
| 166 |
+
{"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}
|
| 167 |
+
) + "\n\n"
|
| 168 |
return
|
| 169 |
|
| 170 |
+
# ── Step 1.5: Phase 2 Ambiguity Detection ────────────────────────
|
| 171 |
+
# If no manual pin is active, check if the query is too ambiguous
|
| 172 |
+
if not req.priority_file_hashes:
|
| 173 |
+
ambiguity_res = check_query_ambiguity(
|
| 174 |
+
req.query,
|
| 175 |
+
access_token=x_auth_token,
|
| 176 |
+
category=req.category,
|
| 177 |
+
)
|
| 178 |
+
if ambiguity_res.get("is_ambiguous"):
|
| 179 |
+
question = ambiguity_res.get("clarification_question", "Which document do you mean?")
|
| 180 |
+
# Use a distinct identifier so the frontend understands it's a structural prompt
|
| 181 |
+
yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
|
| 182 |
+
|
| 183 |
+
options = ambiguity_res.get("clarification_options")
|
| 184 |
+
if options:
|
| 185 |
+
yield "data: " + json.dumps({"type": "clarification_options", "options": options}) + "\n\n"
|
| 186 |
+
|
| 187 |
+
yield "data: " + json.dumps({"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}) + "\n\n"
|
| 188 |
+
return
|
| 189 |
+
if ambiguity_res.get("top_file_hash") and not ambiguity_res.get("is_ambiguous"):
|
| 190 |
+
req.priority_file_hashes = [ambiguity_res["top_file_hash"]]
|
| 191 |
+
log.info("Auto-pinned file hash: %s", ambiguity_res["top_file_hash"])
|
| 192 |
+
|
| 193 |
# ── Step 2: Retrieve using enriched query ─────────────────────────
|
| 194 |
# enriched_query has better embedding signal (category/history injected)
|
| 195 |
# but we answer with the ORIGINAL query so the response sounds natural
|
|
|
|
| 208 |
user_id=user_id,
|
| 209 |
original_query=req.query,
|
| 210 |
eval_mode=(x_eval_mode == "true"),
|
| 211 |
+
priority_file_hashes=req.priority_file_hashes or None,
|
| 212 |
),
|
| 213 |
)
|
| 214 |
|
| 215 |
# ── Step 3: Stream answer tokens ──────────────────────────────────
|
| 216 |
images = []
|
| 217 |
done_sources = []
|
| 218 |
+
trace_id = None
|
| 219 |
+
doc_diagnostics = []
|
| 220 |
# 🚀 Define the boolean once for readability
|
| 221 |
is_eval = x_eval_mode == "true"
|
| 222 |
async for event in generate_answer_stream(
|
|
|
|
| 227 |
access_token=x_auth_token,
|
| 228 |
category=category,
|
| 229 |
eval_mode=is_eval,
|
| 230 |
+
priority_file_hashes=req.priority_file_hashes or None,
|
| 231 |
):
|
| 232 |
if event["type"] == "token":
|
| 233 |
yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
|
| 234 |
elif event["type"] == "done":
|
| 235 |
images = event.get("images", [])
|
| 236 |
done_sources = event.get("sources", []) or []
|
| 237 |
+
trace_id = event.get("trace_id")
|
| 238 |
+
doc_diagnostics = event.get("doc_diagnostics", []) or []
|
| 239 |
|
| 240 |
# ── Step 4: Emit sources + images ─────────────────────────────────
|
| 241 |
sources = done_sources or _build_sources_from_chunks(
|
|
|
|
| 246 |
"type": "done",
|
| 247 |
"sources": sources,
|
| 248 |
"images": images,
|
| 249 |
+
"trace_id": trace_id,
|
| 250 |
+
"doc_diagnostics": doc_diagnostics,
|
| 251 |
}) + "\n\n"
|
| 252 |
|
| 253 |
except Exception as e:
|
|
|
|
| 277 |
"Access-Control-Allow-Origin": "*",
|
| 278 |
}
|
| 279 |
)
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
@router.post("/feedback")
|
| 283 |
+
async def submit_feedback(
|
| 284 |
+
payload: AnswerFeedback,
|
| 285 |
+
user_id: str = Depends(require_auth_token),
|
| 286 |
+
x_auth_token: str = Header(None, alias="X-Auth-Token"),
|
| 287 |
+
):
|
| 288 |
+
del user_id
|
| 289 |
+
ok = record_answer_feedback(payload.dict(), access_token=x_auth_token)
|
| 290 |
+
if not ok:
|
| 291 |
+
raise HTTPException(status_code=500, detail="Could not record answer feedback.")
|
| 292 |
+
return {"ok": True}
|
backend/core/auth_utils.py
CHANGED
|
@@ -12,7 +12,7 @@ TASK 1 — Auth Bridge:
|
|
| 12 |
|
| 13 |
import jwt
|
| 14 |
import logging
|
| 15 |
-
from typing import Optional
|
| 16 |
from backend.core import config
|
| 17 |
from fastapi import Header, HTTPException, status
|
| 18 |
|
|
@@ -22,6 +22,45 @@ from fastapi import Header, HTTPException, status
|
|
| 22 |
log = logging.getLogger("morpheus.auth")
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def extract_jwt_sub(access_token: str) -> str:
|
| 26 |
"""
|
| 27 |
Extract the Supabase user id (JWT `sub`) while strictly verifying the signature.
|
|
|
|
| 12 |
|
| 13 |
import jwt
|
| 14 |
import logging
|
| 15 |
+
from typing import Any, Optional
|
| 16 |
from backend.core import config
|
| 17 |
from fastapi import Header, HTTPException, status
|
| 18 |
|
|
|
|
| 22 |
log = logging.getLogger("morpheus.auth")
|
| 23 |
|
| 24 |
|
| 25 |
+
def _decode_unverified_claims(access_token: Optional[str]) -> dict[str, Any]:
|
| 26 |
+
"""Peek at JWT claims without verifying the signature for non-security decisions."""
|
| 27 |
+
if not access_token:
|
| 28 |
+
return {}
|
| 29 |
+
try:
|
| 30 |
+
claims = jwt.decode(
|
| 31 |
+
access_token,
|
| 32 |
+
options={
|
| 33 |
+
"verify_signature": False,
|
| 34 |
+
"verify_exp": False,
|
| 35 |
+
"verify_aud": False,
|
| 36 |
+
},
|
| 37 |
+
algorithms=["ES256", "HS256", "RS256"],
|
| 38 |
+
)
|
| 39 |
+
return claims if isinstance(claims, dict) else {}
|
| 40 |
+
except Exception:
|
| 41 |
+
return {}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def is_guest_token(access_token: Optional[str]) -> bool:
|
| 45 |
+
"""
|
| 46 |
+
Supabase anonymous users still get real JWTs.
|
| 47 |
+
We treat them as guest workspaces for UI/limits/rate-limiting.
|
| 48 |
+
"""
|
| 49 |
+
claims = _decode_unverified_claims(access_token)
|
| 50 |
+
if not claims:
|
| 51 |
+
return False
|
| 52 |
+
|
| 53 |
+
app_meta = claims.get("app_metadata") or {}
|
| 54 |
+
provider = str(app_meta.get("provider") or "").strip().lower()
|
| 55 |
+
providers = app_meta.get("providers") or []
|
| 56 |
+
return bool(
|
| 57 |
+
claims.get("is_anonymous")
|
| 58 |
+
or app_meta.get("is_anonymous")
|
| 59 |
+
or provider == "anonymous"
|
| 60 |
+
or "anonymous" in providers
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
def extract_jwt_sub(access_token: str) -> str:
|
| 65 |
"""
|
| 66 |
Extract the Supabase user id (JWT `sub`) while strictly verifying the signature.
|
backend/core/classifier.py
CHANGED
|
@@ -167,8 +167,9 @@ class CentroidStore:
|
|
| 167 |
self._access_token = access_token
|
| 168 |
self._user_id = None
|
| 169 |
if access_token:
|
| 170 |
-
from backend.core.auth_utils import
|
| 171 |
-
|
|
|
|
| 172 |
self._cache: Dict[str, Dict] = {}
|
| 173 |
self._lock = threading.Lock()
|
| 174 |
self._client = None
|
|
@@ -176,23 +177,17 @@ class CentroidStore:
|
|
| 176 |
|
| 177 |
def _get_client(self):
|
| 178 |
if self._client is None:
|
| 179 |
-
#
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
raise RuntimeError("SUPABASE_ANON_KEY is not set but access_token was provided.")
|
| 183 |
-
self._client = create_client(
|
| 184 |
-
config.SUPABASE_URL,
|
| 185 |
-
config.SUPABASE_ANON_KEY,
|
| 186 |
-
)
|
| 187 |
-
self._client.postgrest.auth(self._access_token)
|
| 188 |
-
else:
|
| 189 |
-
# Admin / legacy fallback (bypasses RLS via service role).
|
| 190 |
-
self._client = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 191 |
return self._client
|
| 192 |
|
| 193 |
def _load_from_db(self):
|
| 194 |
try:
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
for row in (result.data or []):
|
| 197 |
self._cache[row["document_type"]] = {
|
| 198 |
"vector": np.array(row["centroid_vector"], dtype=np.float32),
|
|
|
|
| 167 |
self._access_token = access_token
|
| 168 |
self._user_id = None
|
| 169 |
if access_token:
|
| 170 |
+
from backend.core.auth_utils import safe_extract_jwt_sub
|
| 171 |
+
|
| 172 |
+
self._user_id = safe_extract_jwt_sub(access_token)
|
| 173 |
self._cache: Dict[str, Dict] = {}
|
| 174 |
self._lock = threading.Lock()
|
| 175 |
self._client = None
|
|
|
|
| 177 |
|
| 178 |
def _get_client(self):
|
| 179 |
if self._client is None:
|
| 180 |
+
# Backend-owned access model: always use the service-role client and
|
| 181 |
+
# scope rows explicitly by user_id where applicable.
|
| 182 |
+
self._client = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
return self._client
|
| 184 |
|
| 185 |
def _load_from_db(self):
|
| 186 |
try:
|
| 187 |
+
query = self._get_client().table(self.TABLE).select("*")
|
| 188 |
+
if self._user_id:
|
| 189 |
+
query = query.eq("user_id", self._user_id)
|
| 190 |
+
result = query.execute()
|
| 191 |
for row in (result.data or []):
|
| 192 |
self._cache[row["document_type"]] = {
|
| 193 |
"vector": np.array(row["centroid_vector"], dtype=np.float32),
|
backend/core/config.py
CHANGED
|
@@ -19,6 +19,15 @@ SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
|
|
| 19 |
SUPABASE_JWT_SECRET = os.getenv("SUPABASE_JWT_SECRET")
|
| 20 |
VECTOR_TABLE_NAME = "documents"
|
| 21 |
IMAGE_STORAGE_BUCKET = "rag-images"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# ==================== API KEYS ====================
|
| 24 |
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
|
@@ -37,9 +46,19 @@ OLLAMA_MODELS = ["llama3.2", "mistral"]
|
|
| 37 |
EMBEDDING_MODEL = "nvidia/llama-nemotron-embed-vl-1b-v2:free"
|
| 38 |
EMBEDDING_DIMENSIONS = 2048
|
| 39 |
EMBEDDING_DEVICE = "cuda"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
EMBEDDING_MODELS = [
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
]
|
| 44 |
|
| 45 |
# ==================== GROQ MODELS ====================
|
|
@@ -119,6 +138,17 @@ UPLOAD_RETRY_MAX_ATTEMPTS = int(os.getenv("UPLOAD_RETRY_MAX_ATTEMPTS", "4"))
|
|
| 119 |
UPLOAD_RETRY_BASE_SLEEP_S = float(os.getenv("UPLOAD_RETRY_BASE_SLEEP_S", "2"))
|
| 120 |
UPLOAD_RETRY_MAX_SLEEP_S = float(os.getenv("UPLOAD_RETRY_MAX_SLEEP_S", "20"))
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
# ==================== RETRIEVAL ====================
|
| 123 |
CHAT_MEMORY_TURNS = 3
|
| 124 |
EMBEDDING_CACHE_SIZE = 256
|
|
@@ -127,6 +157,26 @@ RELEVANCE_THRESHOLD = 0.35
|
|
| 127 |
LLM_MAX_TOKENS = 4096
|
| 128 |
MAX_CONTEXT_CHARS = 14000
|
| 129 |
CATEGORY_SLOTS = 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
# ==================== LOGGING ====================
|
| 132 |
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
|
|
|
| 19 |
SUPABASE_JWT_SECRET = os.getenv("SUPABASE_JWT_SECRET")
|
| 20 |
VECTOR_TABLE_NAME = "documents"
|
| 21 |
IMAGE_STORAGE_BUCKET = "rag-images"
|
| 22 |
+
GUEST_MODE_ENABLED = os.getenv("GUEST_MODE_ENABLED", "true").lower() in {
|
| 23 |
+
"1",
|
| 24 |
+
"true",
|
| 25 |
+
"yes",
|
| 26 |
+
}
|
| 27 |
+
MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "25"))
|
| 28 |
+
GUEST_MAX_UPLOAD_MB = int(os.getenv("GUEST_MAX_UPLOAD_MB", "10"))
|
| 29 |
+
MAX_DOCS_PER_USER = int(os.getenv("MAX_DOCS_PER_USER", "50"))
|
| 30 |
+
GUEST_MAX_DOCS = int(os.getenv("GUEST_MAX_DOCS", "10"))
|
| 31 |
|
| 32 |
# ==================== API KEYS ====================
|
| 33 |
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
|
|
|
| 46 |
EMBEDDING_MODEL = "nvidia/llama-nemotron-embed-vl-1b-v2:free"
|
| 47 |
EMBEDDING_DIMENSIONS = 2048
|
| 48 |
EMBEDDING_DEVICE = "cuda"
|
| 49 |
+
RETRIEVAL_EMBEDDING_VARIANT = os.getenv(
|
| 50 |
+
"RETRIEVAL_EMBEDDING_VARIANT", "control"
|
| 51 |
+
).strip().lower()
|
| 52 |
+
RETRIEVAL_EMBEDDING_MODEL_OVERRIDE = os.getenv(
|
| 53 |
+
"RETRIEVAL_EMBEDDING_MODEL_OVERRIDE", ""
|
| 54 |
+
).strip()
|
| 55 |
EMBEDDING_MODELS = [
|
| 56 |
+
model
|
| 57 |
+
for model in [
|
| 58 |
+
RETRIEVAL_EMBEDDING_MODEL_OVERRIDE or EMBEDDING_MODEL,
|
| 59 |
+
EMBEDDING_MODEL if RETRIEVAL_EMBEDDING_MODEL_OVERRIDE else "",
|
| 60 |
+
]
|
| 61 |
+
if model
|
| 62 |
]
|
| 63 |
|
| 64 |
# ==================== GROQ MODELS ====================
|
|
|
|
| 138 |
UPLOAD_RETRY_BASE_SLEEP_S = float(os.getenv("UPLOAD_RETRY_BASE_SLEEP_S", "2"))
|
| 139 |
UPLOAD_RETRY_MAX_SLEEP_S = float(os.getenv("UPLOAD_RETRY_MAX_SLEEP_S", "20"))
|
| 140 |
|
| 141 |
+
# ==================== CELERY / REDIS ====================
|
| 142 |
+
CELERY_VISIBILITY_TIMEOUT_S = int(os.getenv("CELERY_VISIBILITY_TIMEOUT_S", "7200"))
|
| 143 |
+
CELERY_BROKER_HEARTBEAT_S = int(os.getenv("CELERY_BROKER_HEARTBEAT_S", "30"))
|
| 144 |
+
CELERY_BROKER_POOL_LIMIT = int(os.getenv("CELERY_BROKER_POOL_LIMIT", "1"))
|
| 145 |
+
CELERY_REDIS_SOCKET_TIMEOUT_S = float(
|
| 146 |
+
os.getenv("CELERY_REDIS_SOCKET_TIMEOUT_S", "30")
|
| 147 |
+
)
|
| 148 |
+
CELERY_REDIS_HEALTH_CHECK_INTERVAL_S = int(
|
| 149 |
+
os.getenv("CELERY_REDIS_HEALTH_CHECK_INTERVAL_S", "30")
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
# ==================== RETRIEVAL ====================
|
| 153 |
CHAT_MEMORY_TURNS = 3
|
| 154 |
EMBEDDING_CACHE_SIZE = 256
|
|
|
|
| 157 |
LLM_MAX_TOKENS = 4096
|
| 158 |
MAX_CONTEXT_CHARS = 14000
|
| 159 |
CATEGORY_SLOTS = 2
|
| 160 |
+
ENABLE_STRICT_OUTPUT_SANITIZER = os.getenv(
|
| 161 |
+
"ENABLE_STRICT_OUTPUT_SANITIZER", "true"
|
| 162 |
+
).lower() in {"1", "true", "yes"}
|
| 163 |
+
ENABLE_DUPLICATE_CHUNK_COLLAPSE = os.getenv(
|
| 164 |
+
"ENABLE_DUPLICATE_CHUNK_COLLAPSE", "true"
|
| 165 |
+
).lower() in {"1", "true", "yes"}
|
| 166 |
+
ENABLE_HYDE = os.getenv("ENABLE_HYDE", "false").lower() in {"1", "true", "yes"}
|
| 167 |
+
ENABLE_RETRIEVE_THEN_STUFF = os.getenv(
|
| 168 |
+
"ENABLE_RETRIEVE_THEN_STUFF", "true"
|
| 169 |
+
).lower() in {"1", "true", "yes"}
|
| 170 |
+
ENABLE_CONTEXTUAL_CHUNKING = os.getenv(
|
| 171 |
+
"ENABLE_CONTEXTUAL_CHUNKING", "false"
|
| 172 |
+
).lower() in {"1", "true", "yes"}
|
| 173 |
+
FOLLOWUP_SESSION_TTL_S = int(os.getenv("FOLLOWUP_SESSION_TTL_S", "1800"))
|
| 174 |
+
HISTORY_RECENT_TURNS = int(os.getenv("HISTORY_RECENT_TURNS", "3"))
|
| 175 |
+
HISTORY_IMPORTANT_MAX = int(os.getenv("HISTORY_IMPORTANT_MAX", "6"))
|
| 176 |
+
RETRIEVE_THEN_STUFF_K = int(os.getenv("RETRIEVE_THEN_STUFF_K", "12"))
|
| 177 |
+
RETRIEVE_THEN_STUFF_FETCH_K = int(
|
| 178 |
+
os.getenv("RETRIEVE_THEN_STUFF_FETCH_K", "20")
|
| 179 |
+
)
|
| 180 |
|
| 181 |
# ==================== LOGGING ====================
|
| 182 |
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
backend/core/pipeline.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
backend/core/pipeline_ambiguity.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Ambiguity / scope safety logic.
|
| 3 |
+
|
| 4 |
+
Extracted from `backend/core/pipeline.py` to isolate multi-doc clarification
|
| 5 |
+
rules and reduce coupling with retrieval/generation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
log = logging.getLogger("rag_pipeline")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def check_query_ambiguity(
|
| 16 |
+
query: str,
|
| 17 |
+
access_token: str = None,
|
| 18 |
+
category: str = None,
|
| 19 |
+
) -> dict:
|
| 20 |
+
from backend.core import pipeline as pipeline_facade
|
| 21 |
+
|
| 22 |
+
AMBIGUITY_GAP = 0.12
|
| 23 |
+
MIN_MATCH_SCORE = 0.05
|
| 24 |
+
MIN_WORDS_FOR_SPECIFICITY = 10
|
| 25 |
+
|
| 26 |
+
words = query.strip().split()
|
| 27 |
+
if len(words) > MIN_WORDS_FOR_SPECIFICITY and not pipeline_facade._is_generic_ambiguous_query(query):
|
| 28 |
+
# Still check if category resolves to a single file — if so, auto-pin it
|
| 29 |
+
try:
|
| 30 |
+
supabase = pipeline_facade._build_supabase_client(access_token)
|
| 31 |
+
user_id = None
|
| 32 |
+
if access_token:
|
| 33 |
+
from backend.core.auth_utils import safe_extract_jwt_sub
|
| 34 |
+
|
| 35 |
+
user_id = safe_extract_jwt_sub(access_token)
|
| 36 |
+
files_q = supabase.table("ingested_files").select("file_hash, filename")
|
| 37 |
+
if user_id:
|
| 38 |
+
files_q = files_q.eq("user_id", user_id)
|
| 39 |
+
if category and category != "All":
|
| 40 |
+
files_q = files_q.eq("document_type", category)
|
| 41 |
+
files_resp = files_q.execute()
|
| 42 |
+
files = files_resp.data or []
|
| 43 |
+
if len(files) == 1:
|
| 44 |
+
single_hash = files[0]["file_hash"]
|
| 45 |
+
else:
|
| 46 |
+
single_hash = None
|
| 47 |
+
|
| 48 |
+
if len(files) > 1 and pipeline_facade._query_requires_identity_lookup(query):
|
| 49 |
+
top_files = sorted(
|
| 50 |
+
(
|
| 51 |
+
(str(f.get("file_hash") or "").strip(), str(f.get("filename") or "").strip())
|
| 52 |
+
for f in files
|
| 53 |
+
),
|
| 54 |
+
key=lambda x: (x[1] or x[0]),
|
| 55 |
+
)
|
| 56 |
+
top_files = [(h, n) for h, n in top_files if h][:3]
|
| 57 |
+
options = [
|
| 58 |
+
{
|
| 59 |
+
"mode": "single",
|
| 60 |
+
"label": (name or fhash).replace(".pdf", ""),
|
| 61 |
+
"file_hash": fhash,
|
| 62 |
+
}
|
| 63 |
+
for fhash, name in top_files
|
| 64 |
+
]
|
| 65 |
+
return {
|
| 66 |
+
"is_ambiguous": True,
|
| 67 |
+
"clarification_question": "Which document do you mean? Please pick one.",
|
| 68 |
+
"clarification_options": options,
|
| 69 |
+
"top_file_hash": None,
|
| 70 |
+
}
|
| 71 |
+
except Exception:
|
| 72 |
+
single_hash = None
|
| 73 |
+
return {
|
| 74 |
+
"is_ambiguous": False,
|
| 75 |
+
"clarification_question": None,
|
| 76 |
+
"clarification_options": None,
|
| 77 |
+
"top_file_hash": single_hash,
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
try:
|
| 81 |
+
supabase = pipeline_facade._build_supabase_client(access_token)
|
| 82 |
+
user_id = None
|
| 83 |
+
if access_token:
|
| 84 |
+
from backend.core.auth_utils import safe_extract_jwt_sub
|
| 85 |
+
|
| 86 |
+
user_id = safe_extract_jwt_sub(access_token)
|
| 87 |
+
|
| 88 |
+
files_q = supabase.table("ingested_files").select("file_hash, filename")
|
| 89 |
+
if user_id:
|
| 90 |
+
files_q = files_q.eq("user_id", user_id)
|
| 91 |
+
if category and category != "All":
|
| 92 |
+
files_q = files_q.eq("document_type", category)
|
| 93 |
+
files_resp = files_q.execute()
|
| 94 |
+
files = files_resp.data or []
|
| 95 |
+
if len(files) == 0:
|
| 96 |
+
return {
|
| 97 |
+
"is_ambiguous": False,
|
| 98 |
+
"clarification_question": None,
|
| 99 |
+
"clarification_options": None,
|
| 100 |
+
"top_file_hash": None,
|
| 101 |
+
}
|
| 102 |
+
if len(files) == 1:
|
| 103 |
+
return {
|
| 104 |
+
"is_ambiguous": False,
|
| 105 |
+
"clarification_question": None,
|
| 106 |
+
"clarification_options": None,
|
| 107 |
+
"top_file_hash": files[0]["file_hash"],
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
if pipeline_facade._query_requires_identity_lookup(query):
|
| 111 |
+
top_files = sorted(
|
| 112 |
+
(
|
| 113 |
+
(str(f.get("file_hash") or "").strip(), str(f.get("filename") or "").strip())
|
| 114 |
+
for f in files
|
| 115 |
+
),
|
| 116 |
+
key=lambda x: (x[1] or x[0]),
|
| 117 |
+
)
|
| 118 |
+
top_files = [(h, n) for h, n in top_files if h][:3]
|
| 119 |
+
options = [
|
| 120 |
+
{
|
| 121 |
+
"mode": "single",
|
| 122 |
+
"label": (name or fhash).replace(".pdf", ""),
|
| 123 |
+
"file_hash": fhash,
|
| 124 |
+
}
|
| 125 |
+
for fhash, name in top_files
|
| 126 |
+
]
|
| 127 |
+
return {
|
| 128 |
+
"is_ambiguous": True,
|
| 129 |
+
"clarification_question": "Which document do you mean? Please pick one.",
|
| 130 |
+
"clarification_options": options,
|
| 131 |
+
"top_file_hash": None,
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
query_vec = pipeline_facade.get_cached_embedding(query)
|
| 135 |
+
file_scores: list[tuple[str, str, float]] = [] # (file_hash, label, best_score)
|
| 136 |
+
|
| 137 |
+
for f in files:
|
| 138 |
+
fhash = f.get("file_hash")
|
| 139 |
+
fname = (f.get("filename") or fhash or "Untitled").strip()
|
| 140 |
+
if not fhash:
|
| 141 |
+
continue
|
| 142 |
+
try:
|
| 143 |
+
resp = supabase.rpc(
|
| 144 |
+
"hybrid_search",
|
| 145 |
+
{
|
| 146 |
+
"query_text": query,
|
| 147 |
+
"query_embedding": query_vec,
|
| 148 |
+
"match_count": 1,
|
| 149 |
+
"filter": {"file_hash": fhash},
|
| 150 |
+
"semantic_weight": 0.7,
|
| 151 |
+
"keyword_weight": 0.3,
|
| 152 |
+
"p_user_id": user_id,
|
| 153 |
+
},
|
| 154 |
+
).execute()
|
| 155 |
+
rows = resp.data or []
|
| 156 |
+
if rows:
|
| 157 |
+
score = float(rows[0].get("combined_score", 0.0))
|
| 158 |
+
file_scores.append((fhash, fname, score))
|
| 159 |
+
except Exception as exc:
|
| 160 |
+
log.warning("Ambiguity check RPC error for %s: %s", str(fhash)[:8], exc)
|
| 161 |
+
|
| 162 |
+
if len(file_scores) < 2:
|
| 163 |
+
return {
|
| 164 |
+
"is_ambiguous": False,
|
| 165 |
+
"clarification_question": None,
|
| 166 |
+
"clarification_options": None,
|
| 167 |
+
"top_file_hash": None,
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
file_scores.sort(key=lambda x: x[2], reverse=True)
|
| 171 |
+
top_hash, top_name, top_score = file_scores[0]
|
| 172 |
+
second_hash, second_name, second_score = file_scores[1]
|
| 173 |
+
gap = top_score - second_score
|
| 174 |
+
generic = pipeline_facade._is_generic_ambiguous_query(query)
|
| 175 |
+
|
| 176 |
+
log.info(
|
| 177 |
+
"Ambiguity check: top=%r (%.3f), 2nd=%r (%.3f), gap=%.3f, generic=%s, category=%r",
|
| 178 |
+
top_name,
|
| 179 |
+
top_score,
|
| 180 |
+
second_name,
|
| 181 |
+
second_score,
|
| 182 |
+
gap,
|
| 183 |
+
generic,
|
| 184 |
+
category,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
if gap >= AMBIGUITY_GAP and top_score >= MIN_MATCH_SCORE and not generic:
|
| 188 |
+
return {
|
| 189 |
+
"is_ambiguous": False,
|
| 190 |
+
"clarification_question": None,
|
| 191 |
+
"clarification_options": None,
|
| 192 |
+
"top_file_hash": top_hash,
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
options = []
|
| 196 |
+
for fhash, fname, score in file_scores[:3]:
|
| 197 |
+
options.append(
|
| 198 |
+
{
|
| 199 |
+
"mode": "single",
|
| 200 |
+
"label": (fname or fhash).replace(".pdf", ""),
|
| 201 |
+
"file_hash": fhash,
|
| 202 |
+
"score": round(float(score), 4),
|
| 203 |
+
}
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
return {
|
| 207 |
+
"is_ambiguous": True,
|
| 208 |
+
"clarification_question": "Which document do you mean? Please pick one.",
|
| 209 |
+
"clarification_options": options,
|
| 210 |
+
"top_file_hash": None,
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
except Exception as e:
|
| 214 |
+
log.warning("Ambiguity detector failed: %s", e)
|
| 215 |
+
return {
|
| 216 |
+
"is_ambiguous": False,
|
| 217 |
+
"clarification_question": None,
|
| 218 |
+
"clarification_options": None,
|
| 219 |
+
"top_file_hash": None,
|
| 220 |
+
}
|
| 221 |
+
|
backend/core/pipeline_generation.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generation / streaming facade functions.
|
| 3 |
+
|
| 4 |
+
The implementation lives in `pipeline.py` during migration; this module makes
|
| 5 |
+
generation a distinct unit for debugging and future refactors.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Any, AsyncGenerator, List, Optional, Tuple
|
| 11 |
+
|
| 12 |
+
from langchain_core.documents import Document
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def generate_answer(
    chunks: List[Document],
    query: str,
    chat_history: Optional[List[dict]] = None,
    past_memories: Optional[List[dict]] = None,
) -> Tuple[str, List[str]]:
    """Thin facade: forward synchronous answer generation to the pipeline module.

    The import is deferred to call time so this module does not participate in
    the facade's import cycle during the ongoing de-monolith migration.
    """
    from backend.core import pipeline as pipeline_facade

    impl = pipeline_facade._generate_answer_impl
    return impl(
        chunks=chunks,
        query=query,
        chat_history=chat_history,
        past_memories=past_memories,
    )
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
async def generate_answer_stream(
    chunks: List[Document],
    query: str,
    chat_history: Optional[List[dict]] = None,
    session_id: str = "default_session",
    access_token: str = None,
    category: str = None,
    eval_mode: bool = False,
    priority_file_hashes: List[str] = None,
) -> AsyncGenerator[dict, None]:
    """Thin async facade: re-yield every event from the pipeline stream impl.

    Lazy import keeps this module free of the facade's import cycle while the
    implementation still lives in `pipeline.py`.
    """
    from backend.core import pipeline as pipeline_facade

    stream = pipeline_facade._generate_answer_stream_impl(
        chunks=chunks,
        query=query,
        chat_history=chat_history,
        session_id=session_id,
        access_token=access_token,
        category=category,
        eval_mode=eval_mode,
        priority_file_hashes=priority_file_hashes,
    )
    async for event in stream:
        yield event
|
| 54 |
+
|
backend/core/pipeline_ingestion.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Ingestion entrypoints and helpers.
|
| 3 |
+
|
| 4 |
+
This module intentionally keeps imports lightweight where possible and defers
|
| 5 |
+
heavy dependencies to function scope. It is part of the gradual de-monolith
|
| 6 |
+
refactor: `backend/core/pipeline.py` remains a stable facade.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import hashlib
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
import os
|
| 15 |
+
import time
|
| 16 |
+
from types import SimpleNamespace
|
| 17 |
+
from typing import List, Optional
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
import fitz
|
| 21 |
+
except Exception: # optional at import time (only used for PDF/image helpers)
|
| 22 |
+
fitz = None
|
| 23 |
+
|
| 24 |
+
from backend.core.cache_manager import invalidate_user_cache
|
| 25 |
+
from backend.core.pipeline_supabase import _build_service_supabase_client, _build_supabase_client
|
| 26 |
+
|
| 27 |
+
log = logging.getLogger("rag_pipeline")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_file_fingerprint(file_path: str) -> str:
    """Return the SHA-256 hex digest of *file_path* — collision-resistant dedup key.

    Streams the file in 64 KiB chunks so arbitrarily large uploads never need
    to be held in memory at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while chunk := stream.read(65536):
            digest.update(chunk)
    return digest.hexdigest()
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def extract_images_from_pdf(file_path: str) -> dict:
    """
    Extract images per page using PyMuPDF.

    Junk images are filtered out: tiny icons (< 100x100 px) and extreme aspect
    ratios (> 5:1 or < 1:5, i.e. skinny banners/logos).

    Returns dict: {page_number: [base64_string, ...]} with 1-indexed page keys
    (matching the `page_numbers` metadata convention used elsewhere).
    Best-effort: any PyMuPDF failure is logged and whatever was collected so
    far is returned.
    """
    if fitz is None:
        log.warning("PyMuPDF (fitz) not installed; skipping image extraction.")
        return {}

    # Hoisted: this was previously re-imported on every image inside the loop.
    import base64

    page_images: dict = {}
    doc = None
    try:
        doc = fitz.open(file_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            images = []
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                if not (base_image and base_image.get("image")):
                    continue
                # --- Junk Image Filter ---
                w = base_image.get("width", 0)
                h = base_image.get("height", 0)
                # 1. Skip tiny icons (e.g., smaller than 100x100 pixels)
                if w < 100 or h < 100:
                    continue
                # 2. Skip extreme aspect ratios (e.g., skinny banners/logos)
                aspect_ratio = w / h if h > 0 else 0
                if aspect_ratio > 5.0 or aspect_ratio < 0.2:
                    continue
                images.append(base64.b64encode(base_image["image"]).decode("utf-8"))
            if images:
                page_images[page_num + 1] = images  # 1-indexed to match page_numbers
        log.info("PyMuPDF extracted images from %d pages", len(page_images))
    except Exception as exc:
        log.warning("PyMuPDF image extraction failed: %s", exc)
    finally:
        # Fix: the document handle used to leak when an exception fired
        # mid-loop — `doc.close()` only ran on the fully-successful path.
        if doc is not None:
            doc.close()
    return page_images
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _has_text_layer(pdf_path: str) -> bool:
    """Check if the PDF has native digital text to skip expensive OCR.

    Returns True as soon as any page yields non-whitespace text; False when no
    page does, when PyMuPDF is unavailable, or when the file cannot be opened.
    """
    if fitz is None:
        # Without PyMuPDF we can't cheaply inspect the text layer.
        return False
    try:
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                if page.get_text().strip():
                    return True
            return False
        finally:
            # Fix: the document handle was previously never closed on any path.
            doc.close()
    except Exception:
        # Unreadable/corrupt file — treat as "no text layer" so the caller
        # falls back to the hi_res (OCR) partition strategy.
        return False
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _extract_element_metrics(elements: list) -> dict[str, float]:
|
| 101 |
+
page_numbers = {
|
| 102 |
+
getattr(getattr(el, "metadata", None), "page_number", None)
|
| 103 |
+
for el in elements
|
| 104 |
+
if getattr(getattr(el, "metadata", None), "page_number", None) is not None
|
| 105 |
+
}
|
| 106 |
+
page_count = max(1, len(page_numbers))
|
| 107 |
+
text_chars = sum(len(el.text) for el in elements if hasattr(el, "text") and el.text)
|
| 108 |
+
element_count = len(elements)
|
| 109 |
+
chars_per_page = text_chars / max(1, page_count)
|
| 110 |
+
return {
|
| 111 |
+
"text_chars": text_chars,
|
| 112 |
+
"element_count": element_count,
|
| 113 |
+
"page_count": page_count,
|
| 114 |
+
"chars_per_page": chars_per_page,
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _should_retry_with_hi_res(
|
| 119 |
+
strategy: str,
|
| 120 |
+
metrics: dict[str, float],
|
| 121 |
+
) -> bool:
|
| 122 |
+
return (
|
| 123 |
+
strategy == "fast"
|
| 124 |
+
and metrics["chars_per_page"] < 200
|
| 125 |
+
and metrics["element_count"] < 10
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def partition_document(file_path: str) -> list:
    """Partition a PDF into unstructured elements with dynamic OCR routing.

    Strategy selection: use the cheap 'fast' strategy when a native text layer
    is detected, otherwise 'hi_res' (OCR). If a 'fast' pass looks suspiciously
    thin (see `_should_retry_with_hi_res`), retry once with 'hi_res' and keep
    whichever extraction covered more text/elements.
    """
    # Dynamic OCR routing + guarded high-resolution retry for suspiciously thin extraction
    # Use facade symbol so tests can monkeypatch `backend.core.pipeline._has_text_layer`.
    from backend.core import pipeline as pipeline_facade

    # Prefer the facade's `partition_pdf` (tests patch it there); fall back to
    # importing `unstructured` directly, with a clear error if it's missing.
    partition_pdf = getattr(pipeline_facade, "partition_pdf", None)
    if not callable(partition_pdf):
        try:
            from unstructured.partition.pdf import partition_pdf as _partition_pdf
        except Exception as exc:
            raise RuntimeError(
                "Missing dependency 'unstructured'. Install it to ingest PDFs."
            ) from exc
        partition_pdf = _partition_pdf

    has_text = pipeline_facade._has_text_layer(file_path)
    strategy = "fast" if has_text else "hi_res"
    log.info("PDF text layer detected: %s. Using partition strategy: %s", has_text, strategy)
    elements = partition_pdf(
        filename=file_path,
        strategy=strategy,
        infer_table_structure=True,
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True,
    )
    metrics = _extract_element_metrics(elements)
    log.info(
        "%d elements extracted (text_chars=%d, page_count=%d, chars_per_page=%.1f)",
        len(elements),
        metrics["text_chars"],
        metrics["page_count"],
        metrics["chars_per_page"],
    )

    # Guarded retry: only a 'fast' pass can be retried, and only once.
    if _should_retry_with_hi_res(strategy, metrics):
        log.info(
            "Extraction looked suspiciously thin (chars_per_page=%.1f, elements=%d) — retrying once with hi_res.",
            metrics["chars_per_page"],
            metrics["element_count"],
        )
        hi_res_elements = partition_pdf(
            filename=file_path,
            strategy="hi_res",
            infer_table_structure=True,
            extract_image_block_types=["Image"],
            extract_image_block_to_payload=True,
        )
        hi_res_metrics = _extract_element_metrics(hi_res_elements)
        # Keep hi_res only if it strictly improved coverage on either axis.
        if (
            hi_res_metrics["text_chars"] > metrics["text_chars"]
            or hi_res_metrics["element_count"] > metrics["element_count"]
        ):
            log.info(
                "Using hi_res extraction instead (text_chars=%d, elements=%d).",
                hi_res_metrics["text_chars"],
                hi_res_metrics["element_count"],
            )
            return hi_res_elements
        log.info("Keeping fast extraction — hi_res did not improve coverage.")

    return elements
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _build_document_tree(elements: list) -> dict:
|
| 193 |
+
"""
|
| 194 |
+
Converts a flat list of unstructured elements into a nested JSON tree.
|
| 195 |
+
Titles become parent nodes, and Text/Tables become their children.
|
| 196 |
+
"""
|
| 197 |
+
tree = {"title": "Document Root", "type": "root", "children": []}
|
| 198 |
+
current_section = tree
|
| 199 |
+
|
| 200 |
+
for el in elements:
|
| 201 |
+
category = getattr(el, "category", "Text")
|
| 202 |
+
text = str(el).strip()
|
| 203 |
+
if not text:
|
| 204 |
+
continue
|
| 205 |
+
page_num = getattr(getattr(el, "metadata", None), "page_number", None)
|
| 206 |
+
try:
|
| 207 |
+
page_num = int(page_num) if page_num is not None else None
|
| 208 |
+
except Exception:
|
| 209 |
+
page_num = None
|
| 210 |
+
|
| 211 |
+
if category == "Title":
|
| 212 |
+
new_section = {
|
| 213 |
+
"type": "section",
|
| 214 |
+
"title": text[:150], # Keep titles concise
|
| 215 |
+
"content": text,
|
| 216 |
+
"children": [],
|
| 217 |
+
}
|
| 218 |
+
tree["children"].append(new_section)
|
| 219 |
+
current_section = new_section
|
| 220 |
+
elif category in ("Table", "Text", "NarrativeText", "ListItem"):
|
| 221 |
+
child = {"type": category, "content": text}
|
| 222 |
+
if page_num is not None:
|
| 223 |
+
child["page_numbers"] = [page_num]
|
| 224 |
+
current_section["children"].append(child)
|
| 225 |
+
|
| 226 |
+
return tree
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def run_ingestion(
    pdf_path: str,
    export_json: bool = False,
    force: bool = False,
    progress_callback=None,
    original_filename: str | None = None,
    access_token: str | None = None,
) -> str | dict:
    """
    Ingestion orchestrator.

    Pipeline: fingerprint -> partition (OCR/layout) -> classify -> PageIndex
    tree -> chunk/process -> RAPTOR tree -> embed/upload -> register file.

    Args:
        pdf_path: Local path of the PDF to ingest.
        export_json: When True, also dump processed chunks to a local JSON file.
        force: Re-ingest even if the file hash is already registered.
        progress_callback: Optional ``fn(step, total_steps, message)`` hook.
        original_filename: Upload-time filename used for display naming
            (falls back to ``pdf_path``'s basename).
        access_token: Supabase JWT; its ``sub`` claim scopes all rows. When
            absent, a fixed all-zeros UUID is used as the user id.

    Returns:
        ``"already_ingested"`` (str) when skipping a known file, the result of
        the orphaned-upload recovery when that short-circuits, or a dict
        ``{"pending_review", "document_type", "filename", "file_hash"}`` on a
        full successful run. (The original annotation said ``-> str``, which
        did not match the dict returned on success.)

    Raises:
        ValueError: When the PDF is blank/unreadable, has almost no text, or
            the classifier rejects it.

    Note: during the de-monolith refactor, some collaborators still live on the
    facade module. We import them lazily to avoid circular imports at module load.
    """
    from backend.core.auth_utils import extract_jwt_sub
    from backend.core import pipeline as pipeline_facade

    STEPS = 6
    stage_timings_ms: dict[str, int] = {}

    def _progress(step: int, msg: str):
        # Log locally and fan out to the caller's progress hook, if any.
        log.info("[%d/%d] %s", step, STEPS, msg)
        if progress_callback:
            progress_callback(step, STEPS, msg)

    def _record_stage_timing(stage_name: str, started_at: float) -> None:
        # Record wall-clock ms for a stage and mirror it into the retry-event log.
        elapsed_ms = max(0, int((time.perf_counter() - started_at) * 1000))
        stage_timings_ms[stage_name] = elapsed_ms
        log.info("Ingestion stage '%s' completed in %d ms", stage_name, elapsed_ms)
        pipeline_facade._log_ingestion_retry_event(
            user_id=user_id,
            # NOTE(review): `file_hash` is a closure variable here, not a true
            # local; relying on `"file_hash" in locals()` inside a nested
            # function is fragile. In practice this helper is only called
            # after `file_hash` is assigned below — confirm that invariant
            # holds before refactoring.
            file_hash=file_hash if "file_hash" in locals() else None,
            batch_num=0,
            total_batches=0,
            attempt=1,
            event_type="stage_timing",
            message=json.dumps({"stage": stage_name, "elapsed_ms": elapsed_ms})[:500],
            sleep_s=0,
        )

    log.info("=" * 50)
    log.info("Starting ingestion: %s", pdf_path)

    # Anonymous/unauthenticated ingestion falls back to a fixed all-zeros UUID.
    user_id = (
        extract_jwt_sub(access_token)
        if access_token
        else "00000000-0000-0000-0000-000000000000"
    )

    _progress(1, "Computing file fingerprint…")
    # Use facade symbol so tests can monkeypatch `backend.core.pipeline.get_file_fingerprint`.
    file_hash = pipeline_facade.get_file_fingerprint(pdf_path)
    already_exists = pipeline_facade.is_file_already_ingested(file_hash, access_token=access_token)
    if not already_exists:
        # Unknown hash: check for a previous upload that died mid-ingestion and
        # either recover it or clean the slate. A truthy result short-circuits.
        recovered_existing = pipeline_facade._recover_or_prepare_orphaned_upload(
            file_hash,
            user_id=user_id,
            access_token=access_token,
            filename_hint=original_filename or os.path.basename(pdf_path),
            force=force,
        )
        if recovered_existing:
            return recovered_existing
    if already_exists and not force:
        log.info("SKIPPING — already ingested.")
        return "already_ingested"

    # Re-ingestion path: honor a user's manual category override so the
    # classifier does not silently undo it.
    forced_category = None
    if already_exists or force:
        try:
            _sb = pipeline_facade._build_supabase_client(access_token)
            _existing = (
                _sb.table("ingested_files")
                .select("document_type, user_overridden")
                .eq("user_id", user_id)
                .eq("file_hash", file_hash)
                .limit(1)
                .execute()
            )
            if _existing.data and _existing.data[0].get("user_overridden"):
                forced_category = _existing.data[0]["document_type"]
                log.info(
                    "User override active — forcing category '%s', skipping classifier.",
                    forced_category,
                )
        except Exception as _exc:
            log.warning("Could not check user override: %s", _exc)

    if already_exists or force:
        # Remove stale chunks/trees from the earlier ingestion before re-running.
        pipeline_facade._cleanup_existing_ingestion_fragments(
            file_hash,
            user_id=user_id,
            access_token=access_token,
        )

    _progress(2, "Partitioning PDF (OCR + layout detection)…")
    stage_started = time.perf_counter()
    # Use facade symbols so tests can monkeypatch these helpers.
    elements = pipeline_facade.partition_document(pdf_path)
    pdf_images = pipeline_facade.extract_images_from_pdf(pdf_path)
    if not elements:
        raise ValueError(
            "The PDF appears blank or unreadable. "
            "If scanned, ensure tesseract-ocr is installed."
        )
    text_chars = sum(len(el.text) for el in elements if hasattr(el, "text") and el.text)
    coverage_metrics = _extract_element_metrics(elements)
    if text_chars < 50:
        # Reject near-empty extractions early, before any expensive LLM work.
        raise ValueError(
            f"PDF contains almost no readable text ({text_chars} chars). "
            "May be corrupted or image-only without OCR layer."
        )
    identity_json = pipeline_facade._identity_json_from_elements(
        elements,
        fallback_title=pipeline_facade._extract_pdf_title(elements, os.path.basename(pdf_path)),
    )
    _record_stage_timing("partition", stage_started)

    _progress(3, "Classifying document and building taxonomy…")
    stage_started = time.perf_counter()
    graph_data = pipeline_facade.extract_document_entities(
        elements,
        access_token=access_token,
        forced_category=forced_category,
    )
    if not graph_data.is_allowed:
        raise ValueError("Document rejected: appears blank, spam, or unreadable.")
    log.info("Category: '%s'", graph_data.document_type)
    _record_stage_timing("classify", stage_started)

    # PageIndex structural tree is best-effort: a failure here must not abort
    # the whole ingestion, so it is isolated in its own try/except.
    try:
        log.info("🌳 Generating structural PageIndex tree...")
        doc_tree = pipeline_facade._build_document_tree(elements)
        sb = pipeline_facade._build_service_supabase_client()
        sb.table("document_trees").upsert(
            {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
            on_conflict="user_id,file_hash",
        ).execute()
        log.info("✅ PageIndex tree saved to Supabase.")
    except Exception as e:
        log.warning("⚠️ Failed to generate/save document tree: %s", e)

    _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
    stage_started = time.perf_counter()
    chunks = pipeline_facade.create_chunks(elements, text_chars=text_chars)
    # Prefer the upload-time filename for display naming over the temp path.
    pdf_path_for_naming = original_filename if original_filename else pdf_path
    docs, ids = pipeline_facade.process_chunks(
        chunks,
        elements,
        pdf_path_for_naming,
        file_hash,
        graph_data,
        user_id,
        pdf_images,
        coverage_metrics=coverage_metrics,
    )
    _record_stage_timing("chunk_process", stage_started)

    _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
    stage_started = time.perf_counter()
    docs, ids = pipeline_facade.build_raptor_tree(docs, ids, user_id)
    pipeline_facade._persist_graph_foundation(
        user_id=user_id,
        file_hash=file_hash,
        docs=docs,
        graph_data=graph_data,
    )
    _record_stage_timing("raptor", stage_started)

    # The first doc's "source" metadata carries the smart display name.
    smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
    if export_json:
        log.info("💾 Exporting processed chunks to local JSON...")
        pipeline_facade.export_to_json(docs)

    _progress(6, f"Embedding and uploading {len(docs)} tree nodes…")
    stage_started = time.perf_counter()
    pipeline_facade.upload_to_supabase(docs, ids, access_token=access_token)
    _record_stage_timing("upload", stage_started)

    # Registry write happens only after a successful upload; failures are
    # logged (and mirrored to the retry-event log) but do not fail ingestion.
    try:
        sb = pipeline_facade._build_service_supabase_client()
        sb.table("ingested_files").upsert(
            {
                "user_id": user_id,
                "file_hash": file_hash,
                "filename": smart_name,
                "document_type": graph_data.document_type,
                "chunk_count": len(docs),
                "identity_json": identity_json,
            },
            on_conflict="user_id,file_hash",
        ).execute()
        pipeline_facade._log_ingestion_retry_event(
            user_id=user_id,
            file_hash=file_hash,
            batch_num=0,
            total_batches=0,
            attempt=1,
            event_type="registry_saved",
            message="Registered ingested file after successful upload.",
        )
    except Exception as e:
        log.error("Failed to register file: %s", e)
        pipeline_facade._log_ingestion_retry_event(
            user_id=user_id,
            file_hash=file_hash,
            batch_num=0,
            total_batches=0,
            attempt=1,
            event_type="registry_failed",
            message=str(e),
        )

    # Invalidate cached retrieval results so the new document is visible
    # immediately; cache invalidation is best-effort.
    if access_token:
        try:
            invalidate_user_cache(user_id, reason="new_document_ingested")
        except Exception:
            pass

    log.info("Ingestion complete!")
    pipeline_facade._log_ingestion_retry_event(
        user_id=user_id,
        file_hash=file_hash,
        batch_num=0,
        total_batches=0,
        attempt=1,
        event_type="ingestion_complete",
        message="Ingestion completed successfully.",
    )
    log.info("Ingestion stage timings (ms): %s", stage_timings_ms)
    return {
        "pending_review": True,
        "document_type": graph_data.document_type,
        "filename": smart_name,
        "file_hash": file_hash,
    }
|
| 465 |
+
|
backend/core/pipeline_memory.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Memory & prefetch facade functions.
|
| 3 |
+
|
| 4 |
+
The implementation lives in `pipeline.py` during migration; this module gives
|
| 5 |
+
it a clear ownership boundary and makes it easy to feature-flag later.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _predict_and_prefetch(
    original_query: str, answer: str, category: str, session_id: str, access_token: str
):
    """Delegate memory prediction/prefetch to the facade implementation.

    Deferred import avoids the facade's import cycle; this wrapper exists only
    to give the memory path a distinct ownership boundary.
    """
    from backend.core import pipeline as pipeline_facade

    impl = pipeline_facade._predict_and_prefetch_impl
    return impl(
        original_query=original_query,
        answer=answer,
        category=category,
        session_id=session_id,
        access_token=access_token,
    )
|
| 23 |
+
|
backend/core/pipeline_pageindex.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PageIndex / structural tree path retrieval.
|
| 3 |
+
|
| 4 |
+
This module isolates TOC/page lookup heuristics and Supabase `document_trees`
|
| 5 |
+
traversal so issues in structural retrieval don't churn the main retrieval path.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
import re
|
| 12 |
+
import time
|
| 13 |
+
from typing import List
|
| 14 |
+
|
| 15 |
+
from langchain_core.documents import Document
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
log = logging.getLogger("rag_pipeline")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _should_use_tree_path(query: str) -> bool:
|
| 22 |
+
"""
|
| 23 |
+
Zero-latency heuristic to route structured/specific queries to PageIndex
|
| 24 |
+
instead of the standard vector semantic search.
|
| 25 |
+
"""
|
| 26 |
+
# 1. Regex match for Course Codes (e.g., DSN4097, CSE2001, ENG101)
|
| 27 |
+
if re.search(r"\b[A-Z]{2,4}\s?[0-9]{3,4}\b", query, re.IGNORECASE):
|
| 28 |
+
return True
|
| 29 |
+
|
| 30 |
+
q = (query or "").lower()
|
| 31 |
+
|
| 32 |
+
# 2. Structured-document intents that benefit from PageIndex.
|
| 33 |
+
# Keep this conservative: over-triggering PageIndex causes irrelevant “structural” hits.
|
| 34 |
+
if "table of contents" in q or ("contents" in q and "page" in q):
|
| 35 |
+
return True
|
| 36 |
+
trigger_words = {"list", "exactly", "code"}
|
| 37 |
+
query_words = set(q.split())
|
| 38 |
+
if query_words.intersection(trigger_words):
|
| 39 |
+
return True
|
| 40 |
+
|
| 41 |
+
return False
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def tree_search(
|
| 45 |
+
query: str,
|
| 46 |
+
access_token: str = None,
|
| 47 |
+
category: str = None,
|
| 48 |
+
priority_file_hashes: List[str] = None,
|
| 49 |
+
) -> List[Document]:
|
| 50 |
+
"""
|
| 51 |
+
Navigates the structural JSON trees in Supabase to answer highly specific
|
| 52 |
+
'Needle in a Haystack' queries (e.g., course codes, exact table lookups).
|
| 53 |
+
"""
|
| 54 |
+
log.info("🔍 Executing Tree Search for query: %s", query)
|
| 55 |
+
|
| 56 |
+
q = (query or "").strip()
|
| 57 |
+
q_lower = q.lower()
|
| 58 |
+
|
| 59 |
+
def _norm_for_match(s: str) -> str:
|
| 60 |
+
"""
|
| 61 |
+
Lightweight normalization to make TOC matching robust across OCR/partition quirks:
|
| 62 |
+
- normalize curly quotes/apostrophes to ASCII
|
| 63 |
+
- lowercase
|
| 64 |
+
- collapse whitespace
|
| 65 |
+
"""
|
| 66 |
+
s = str(s or "")
|
| 67 |
+
s = (
|
| 68 |
+
s.replace("’", "'")
|
| 69 |
+
.replace("‘", "'")
|
| 70 |
+
.replace("“", '"')
|
| 71 |
+
.replace("”", '"')
|
| 72 |
+
.replace("`", "'")
|
| 73 |
+
)
|
| 74 |
+
s = s.lower()
|
| 75 |
+
s = re.sub(r"\s+", " ", s).strip()
|
| 76 |
+
return s
|
| 77 |
+
|
| 78 |
+
# 1. Extract the specific targets from the query (e.g., Course Codes)
|
| 79 |
+
targets = set(re.findall(r"\b[A-Z]{2,4}\s?[0-9]{3,4}\b", q, re.IGNORECASE))
|
| 80 |
+
|
| 81 |
+
# Special-case: Table of contents lookups (“what page is X on?”).
|
| 82 |
+
# Extract the section title inside quotes if present, otherwise fall back
|
| 83 |
+
# to a small target set to avoid matching the entire tree.
|
| 84 |
+
toc_lookup = ("table of contents" in q_lower) or ("contents" in q_lower and "page" in q_lower)
|
| 85 |
+
toc_target = None
|
| 86 |
+
if toc_lookup and not targets:
|
| 87 |
+
m = re.search(r"[\"'“”‘’](.+?)[\"'“”‘’]", q)
|
| 88 |
+
if m:
|
| 89 |
+
toc_target = m.group(1).strip()
|
| 90 |
+
if toc_target:
|
| 91 |
+
# Add normalized variants so “What’s New” matches "What's New" / "Whats New".
|
| 92 |
+
norm = _norm_for_match(toc_target)
|
| 93 |
+
if norm:
|
| 94 |
+
targets = {norm, norm.replace("'", "")}
|
| 95 |
+
|
| 96 |
+
# Fallback: extract important keywords if no explicit course code is found
|
| 97 |
+
if not targets:
|
| 98 |
+
trigger_words = {"table", "contents", "list", "exactly", "code", "section", "capstone", "credits"}
|
| 99 |
+
stopwords = {
|
| 100 |
+
"what",
|
| 101 |
+
"is",
|
| 102 |
+
"the",
|
| 103 |
+
"how",
|
| 104 |
+
"many",
|
| 105 |
+
"for",
|
| 106 |
+
"in",
|
| 107 |
+
"a",
|
| 108 |
+
"of",
|
| 109 |
+
"to",
|
| 110 |
+
"on",
|
| 111 |
+
"only",
|
| 112 |
+
"page",
|
| 113 |
+
}
|
| 114 |
+
words = {w.strip(".,:;!?()[]{}") for w in q_lower.split()}
|
| 115 |
+
words = {w for w in words if w}
|
| 116 |
+
targets = words - trigger_words - stopwords
|
| 117 |
+
|
| 118 |
+
if not targets:
|
| 119 |
+
log.info("No specific targets extracted for tree search.")
|
| 120 |
+
return []
|
| 121 |
+
|
| 122 |
+
try:
|
| 123 |
+
from backend.core.auth_utils import extract_jwt_sub
|
| 124 |
+
from backend.core import pipeline as pipeline_facade
|
| 125 |
+
|
| 126 |
+
user_id = (
|
| 127 |
+
extract_jwt_sub(access_token)
|
| 128 |
+
if access_token
|
| 129 |
+
else "00000000-0000-0000-0000-000000000000"
|
| 130 |
+
)
|
| 131 |
+
# Use facade symbol so tests can monkeypatch `backend.core.pipeline._build_supabase_client`.
|
| 132 |
+
sb = pipeline_facade._build_supabase_client(access_token)
|
| 133 |
+
|
| 134 |
+
# 2. Fetch all structural trees for this user
|
| 135 |
+
res = (
|
| 136 |
+
sb.table("document_trees")
|
| 137 |
+
.select("file_hash, tree_json")
|
| 138 |
+
.eq("user_id", user_id)
|
| 139 |
+
.execute()
|
| 140 |
+
)
|
| 141 |
+
if not res.data:
|
| 142 |
+
return []
|
| 143 |
+
|
| 144 |
+
allowed_hashes = None
|
| 145 |
+
if category and category != "All":
|
| 146 |
+
try:
|
| 147 |
+
allowed_res = (
|
| 148 |
+
sb.table("ingested_files")
|
| 149 |
+
.select("file_hash")
|
| 150 |
+
.eq("document_type", category)
|
| 151 |
+
.execute()
|
| 152 |
+
)
|
| 153 |
+
allowed_hashes = {
|
| 154 |
+
row.get("file_hash") for row in (allowed_res.data or []) if row.get("file_hash")
|
| 155 |
+
}
|
| 156 |
+
except Exception as exc:
|
| 157 |
+
log.warning("Could not apply tree-search category filter: %s", exc)
|
| 158 |
+
if priority_file_hashes:
|
| 159 |
+
pinned_hashes = {h for h in priority_file_hashes if h}
|
| 160 |
+
if pinned_hashes:
|
| 161 |
+
allowed_hashes = (
|
| 162 |
+
pinned_hashes
|
| 163 |
+
if allowed_hashes is None
|
| 164 |
+
else allowed_hashes.intersection(pinned_hashes)
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
matched_chunks: list[Document] = []
|
| 168 |
+
|
| 169 |
+
# 3. Recursive Tree Traversal
|
| 170 |
+
def _traverse(node, parent_title="", file_hash=""):
|
| 171 |
+
title = str(node.get("title", "") or "")
|
| 172 |
+
content = str(node.get("content", "") or "")
|
| 173 |
+
node_text = _norm_for_match(title + " " + content)
|
| 174 |
+
|
| 175 |
+
# If the node contains our target noun/code, we capture it
|
| 176 |
+
norm_targets = [_norm_for_match(t) for t in targets]
|
| 177 |
+
is_match = any(t and t in node_text for t in norm_targets)
|
| 178 |
+
|
| 179 |
+
if is_match and content:
|
| 180 |
+
parent_chain = f"{parent_title} {title}".strip().lower()
|
| 181 |
+
|
| 182 |
+
# TOC lookups should only match TOC entries (not random headers/sections that mention the phrase).
|
| 183 |
+
# NOTE: Many PDFs don't label the TOC as a distinct "Title" element during partitioning,
|
| 184 |
+
# so TOC rows can end up under "Document Root" or a different parent. We therefore treat
|
| 185 |
+
# "in TOC section" as a relevance boost (not a hard filter) and rely on the stricter
|
| 186 |
+
# "dotted leader -> page number" extraction below to keep TOC matches precise.
|
| 187 |
+
in_toc_section = False
|
| 188 |
+
if toc_lookup:
|
| 189 |
+
in_toc_section = ("table of contents" in parent_chain) or (
|
| 190 |
+
parent_chain.startswith("contents") or "contents" in parent_chain
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
# Score matches: prefer nodes that contain the full target phrase and a TOC-like page number.
|
| 194 |
+
score = 0.2
|
| 195 |
+
if toc_lookup and in_toc_section:
|
| 196 |
+
score += 0.15
|
| 197 |
+
if toc_target and _norm_for_match(toc_target) in node_text:
|
| 198 |
+
score += 0.6
|
| 199 |
+
score += 0.2 if any(t and t in node_text for t in norm_targets) else 0.0
|
| 200 |
+
|
| 201 |
+
# Attempt to extract page numbers from TOC lines ("..... 6") or "Page 6".
|
| 202 |
+
page_numbers: list[int] = []
|
| 203 |
+
# TOC dotted leaders can appear as "..... 6" or ". . . . 6"
|
| 204 |
+
toc_page_match = re.search(r"(?:\.\s*){2,}(\d{1,3})\b", content)
|
| 205 |
+
if toc_page_match:
|
| 206 |
+
page_numbers.append(int(toc_page_match.group(1)))
|
| 207 |
+
score += 0.3
|
| 208 |
+
elif toc_lookup:
|
| 209 |
+
leader_page = re.search(
|
| 210 |
+
r"(?:[.\u00b7\u2026]\s*){1,}(\d{1,3})\s*$", content
|
| 211 |
+
)
|
| 212 |
+
if leader_page:
|
| 213 |
+
page_numbers.append(int(leader_page.group(1)))
|
| 214 |
+
score += 0.25
|
| 215 |
+
else:
|
| 216 |
+
spaced_page = re.search(r"\s{2,}(\d{1,3})\s*$", content)
|
| 217 |
+
if spaced_page:
|
| 218 |
+
page_numbers.append(int(spaced_page.group(1)))
|
| 219 |
+
score += 0.2
|
| 220 |
+
elif not toc_lookup:
|
| 221 |
+
page_hint = re.search(
|
| 222 |
+
r"\bpage\s+(\d{1,3})\b", content, flags=re.IGNORECASE
|
| 223 |
+
)
|
| 224 |
+
if page_hint:
|
| 225 |
+
page_numbers.append(int(page_hint.group(1)))
|
| 226 |
+
score += 0.2
|
| 227 |
+
|
| 228 |
+
if toc_lookup and not page_numbers:
|
| 229 |
+
return
|
| 230 |
+
|
| 231 |
+
matched_chunks.append(
|
| 232 |
+
Document(
|
| 233 |
+
page_content=f"Section Context: {parent_title} -> {title}\n\n{content}",
|
| 234 |
+
metadata={
|
| 235 |
+
"source": "PageIndex Tree Structure",
|
| 236 |
+
"file_hash": file_hash,
|
| 237 |
+
"type": "structural_node",
|
| 238 |
+
"page_numbers": page_numbers,
|
| 239 |
+
"relevance_score": round(min(1.0, max(0.0, score)), 4),
|
| 240 |
+
"retrieved_at_ms": int(time.time() * 1000),
|
| 241 |
+
},
|
| 242 |
+
)
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
for child in node.get("children", []):
|
| 246 |
+
_traverse(child, node.get("title", parent_title), file_hash)
|
| 247 |
+
|
| 248 |
+
for tree_row in res.data:
|
| 249 |
+
if allowed_hashes is not None and tree_row.get("file_hash") not in allowed_hashes:
|
| 250 |
+
continue
|
| 251 |
+
_traverse(tree_row["tree_json"], file_hash=tree_row["file_hash"])
|
| 252 |
+
|
| 253 |
+
log.info("✅ Tree search found %d matching structural nodes.", len(matched_chunks))
|
| 254 |
+
|
| 255 |
+
matched_chunks.sort(
|
| 256 |
+
key=lambda d: float((d.metadata or {}).get("relevance_score") or 0.0), reverse=True
|
| 257 |
+
)
|
| 258 |
+
return matched_chunks[:5]
|
| 259 |
+
|
| 260 |
+
except Exception as e:
|
| 261 |
+
log.warning("⚠️ Tree Search failed, falling back to empty chunks: %s", e)
|
| 262 |
+
return []
|
| 263 |
+
|
backend/core/pipeline_retrieval.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Retrieval facade functions.
|
| 3 |
+
|
| 4 |
+
During the gradual de-monolith refactor, we keep the heavy implementations in
|
| 5 |
+
`pipeline.py` (renamed to *_impl) and provide stable entrypoints here. This
|
| 6 |
+
lets API/tests import retrieval without pulling generation/ingestion concerns.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import List
|
| 12 |
+
|
| 13 |
+
from langchain_core.documents import Document
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def generate_sub_queries(original_query: str, *, route_class: str = "factoid") -> List[str]:
    """Facade entrypoint: delegate sub-query expansion to the pipeline impl.

    The pipeline module is imported lazily so this facade can be imported
    without pulling the full pipeline runtime at module-load time.
    """
    from backend.core import pipeline as pipeline_facade

    return pipeline_facade._generate_sub_queries_impl(original_query, route_class=route_class)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def retrieve_chunks(
    query: str,
    k: int = 3,
    source_file: str | None = None,
    category: str | None = None,
    alpha: float = 0.5,
    session_id: str = "default_session",
    access_token: str | None = None,
    user_id: str | None = None,
    original_query: str | None = None,
    eval_mode: bool = False,
    priority_file_hashes: List[str] | None = None,
) -> List[Document]:
    """Facade entrypoint for chunk retrieval.

    Delegates to ``pipeline._retrieve_chunks_impl`` via a lazy import so the
    API/tests can import retrieval without loading generation/ingestion code.

    Args:
        query: The (possibly rewritten) search query.
        k: Number of chunks to return.
        source_file: Optional filename filter.
        category: Optional document-type filter.
        alpha: Hybrid dense/sparse weighting factor.
        session_id: Conversation session identifier used by memory features.
        access_token: Tenant JWT; when set, RLS is enforced downstream.
        user_id: Explicit tenant id override.
        original_query: The raw user query before any rewriting.
        eval_mode: When True, disables session-dependent behavior for evals.
        priority_file_hashes: File hashes to pin/prioritize during retrieval.

    Returns:
        The retrieved ``Document`` chunks.
    """
    # Fix: the parameters were annotated `str = None` / `List[str] = None`
    # (implicit Optional, rejected by PEP 484 type checkers). The file uses
    # `from __future__ import annotations`, so `X | None` is safe here.
    from backend.core import pipeline as pipeline_facade

    return pipeline_facade._retrieve_chunks_impl(
        query,
        k=k,
        source_file=source_file,
        category=category,
        alpha=alpha,
        session_id=session_id,
        access_token=access_token,
        user_id=user_id,
        original_query=original_query,
        eval_mode=eval_mode,
        priority_file_hashes=priority_file_hashes,
    )
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def retrieve_chunks_routed(
    query: str,
    k: int = 3,
    source_file: str | None = None,
    category: str | None = None,
    alpha: float = 0.5,
    session_id: str = "default_session",
    access_token: str | None = None,
    user_id: str | None = None,
    original_query: str | None = None,
    eval_mode: bool = False,
    priority_file_hashes: List[str] | None = None,
) -> List[Document]:
    """Facade entrypoint for routed (expert-selected) chunk retrieval.

    Same contract as :func:`retrieve_chunks`, but delegates to
    ``pipeline._retrieve_chunks_routed_impl``, which applies route
    classification/expert weighting before retrieving.

    Args:
        query: The (possibly rewritten) search query.
        k: Number of chunks to return.
        source_file: Optional filename filter.
        category: Optional document-type filter.
        alpha: Hybrid dense/sparse weighting factor.
        session_id: Conversation session identifier used by memory features.
        access_token: Tenant JWT; when set, RLS is enforced downstream.
        user_id: Explicit tenant id override.
        original_query: The raw user query before any rewriting.
        eval_mode: When True, disables session-dependent behavior for evals.
        priority_file_hashes: File hashes to pin/prioritize during retrieval.

    Returns:
        The retrieved ``Document`` chunks.
    """
    # Fix: replace implicit-Optional annotations (`str = None`) with explicit
    # `X | None` per PEP 484 (the module has `from __future__ import annotations`).
    from backend.core import pipeline as pipeline_facade

    return pipeline_facade._retrieve_chunks_routed_impl(
        query,
        k=k,
        source_file=source_file,
        category=category,
        alpha=alpha,
        session_id=session_id,
        access_token=access_token,
        user_id=user_id,
        original_query=original_query,
        eval_mode=eval_mode,
        priority_file_hashes=priority_file_hashes,
    )
|
| 83 |
+
|
backend/core/pipeline_routing.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Routing and expert selection logic.
|
| 3 |
+
|
| 4 |
+
This module is extracted from `backend/core/pipeline.py` as part of the
|
| 5 |
+
de-monolith refactor. The facade still owns many helpers; we import them
|
| 6 |
+
lazily to avoid circular imports during migration.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from typing import List, Optional
|
| 13 |
+
|
| 14 |
+
from backend.core.pipeline_types import RouteDecision
|
| 15 |
+
|
| 16 |
+
log = logging.getLogger("rag_pipeline")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _classify_query_route_decision(
    query: str,
    *,
    session_id: Optional[str] = None,
    user_id: Optional[str] = None,
    priority_file_hashes: Optional[List[str]] = None,
) -> RouteDecision:
    """Classify *query* into a retrieval route via an ordered guard cascade.

    Cheap deterministic checks run first (empty query, greetings, page scope,
    exact-fact lookups, follow-up references, compare/multi-part/summary/
    relational keywords). Only queries that survive every guard reach the LLM
    classifier; anything still undecided falls back to the 'factoid' route.
    """
    from backend.core import pipeline as pipeline_facade

    normalized = (query or "").strip().lower()
    if not normalized:
        return RouteDecision(route_class="factoid", route_reason="empty_query")

    if normalized in {"hi", "hello", "hey", "thanks", "thank you"}:
        return RouteDecision(route_class="no_retrieval", route_reason="greeting")

    page_scope = pipeline_facade._detect_page_scope(normalized)
    exact_field = pipeline_facade._detect_identity_field(normalized)

    if page_scope:
        # Page-scoped lookups must keep the literal query text and skip
        # memory-based rewriting so the page reference survives intact.
        return RouteDecision(
            route_class="page_scoped",
            route_reason=f"page_scope:{page_scope}",
            preserve_query=True,
            disable_memory=True,
            page_scope=page_scope,
            exact_field=exact_field,
        )

    if exact_field or pipeline_facade._is_exact_fact_query(normalized):
        return RouteDecision(
            route_class="exact_fact",
            route_reason=f"identity_field:{exact_field or 'generic'}",
            preserve_query=True,
            disable_memory=True,
            exact_field=exact_field,
        )

    if pipeline_facade._is_follow_up_reference(query, session_id=session_id, user_id=user_id):
        return RouteDecision(
            route_class="follow_up",
            route_reason="session_reference",
            preserve_query=False,
            disable_memory=False,
        )

    # Multiple pinned files imply a cross-document comparison even without
    # explicit compare keywords.
    pinned_multiple = bool(priority_file_hashes and len(priority_file_hashes) > 1)
    if pipeline_facade._is_compare_like_query(query) or pinned_multiple:
        return RouteDecision(route_class="compare", route_reason="compare_keywords")
    if pipeline_facade._is_multi_part_query(query):
        return RouteDecision(route_class="multi_part", route_reason="multi_part_keywords")
    if pipeline_facade._is_summary_like_query(query):
        return RouteDecision(route_class="summary", route_reason="summary_keywords")
    if pipeline_facade._is_relational_query(query):
        return RouteDecision(route_class="relational", route_reason="relational_keywords")

    llm_decision = pipeline_facade._llm_route_classifier(
        query,
        session_id=session_id,
        user_id=user_id,
        priority_file_hashes=priority_file_hashes,
    )
    if llm_decision and llm_decision.route_class:
        return llm_decision

    return RouteDecision(route_class="factoid", route_reason="heuristic_default")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _route_query_experts(
    query: str,
    *,
    session_id: Optional[str] = None,
    user_id: Optional[str] = None,
    priority_file_hashes: Optional[List[str]] = None,
) -> dict:
    """Score retrieval "experts" for *query* and select up to three to run.

    Blends two signals per expert: embedding similarity between the query and
    each expert's prototype phrases (weight 0.65), and keyword/session feature
    boosts (weight 0.35). If the gap between the top two normalized weights is
    small (< 0.06) on a non-trivial query (>= 4 words), an LLM fallback may
    override the weights entirely.

    Returns:
        dict with keys:
            "expert_weights": normalized weight map per expert.
            "selected_experts": experts with weight >= 0.18, at most 3.
            "confidence": gap between the top two weights, rounded to 4 dp.
    """
    from backend.core import pipeline as pipeline_facade

    q = (query or "").strip()
    q_lower = q.lower()
    embedding_scores: dict[str, float] = {}
    try:
        # Each expert's score is the mean cosine similarity of the query
        # embedding against that expert's prototype embeddings, floored at 0.
        query_vec = pipeline_facade.get_cached_embedding(q or "general document information")
        for expert, prototypes in pipeline_facade._ROUTER_PROTOTYPES.items():
            sims = [
                pipeline_facade._vector_cosine(
                    query_vec, pipeline_facade.get_cached_embedding(proto)
                )
                for proto in prototypes
            ]
            embedding_scores[expert] = max(0.0, sum(sims) / max(1, len(sims)))
    except Exception as exc:
        # Embedding backend unavailable: fall back to a flat prior so the
        # feature boosts below can still differentiate experts.
        log.debug("Router embedding stage unavailable: %s", exc)
        embedding_scores = {expert: 0.2 for expert in pipeline_facade._ROUTER_PROTOTYPES}

    # Keyword/session feature boosts (heuristic, additive per expert).
    feature_scores = {expert: 0.0 for expert in pipeline_facade._ROUTER_PROTOTYPES}
    if pipeline_facade._is_summary_like_query(q_lower):
        feature_scores["raptor_summary"] += 0.35
    if pipeline_facade._is_compare_like_query(q_lower):
        feature_scores["hybrid_compare"] += 0.45
        feature_scores["graph_traversal"] += 0.10
    if any(
        token in q_lower
        for token in ("relationship", "connected", "connection", "link", "linked", "why", "cause")
    ):
        feature_scores["graph_traversal"] += 0.35
    if priority_file_hashes and len(priority_file_hashes) > 1:
        # Multiple pinned files suggest a cross-document comparison.
        feature_scores["hybrid_compare"] += 0.15
    if session_id:
        # Pronoun-style references combined with cached chunks for this
        # session suggest a follow-up turn -> boost episodic memory.
        session_key = pipeline_facade._session_cache_key(session_id, user_id=user_id)
        if session_key in pipeline_facade._last_chunks and any(
            token in q_lower for token in ("it", "this", "that", "previous", "above", "earlier")
        ):
            feature_scores["episodic_memory"] += 0.35
    if not priority_file_hashes:
        feature_scores["dense_chunk"] += 0.10

    # Blend: embedding similarity dominates; features act as a tiebreaker.
    combined = {
        expert: (embedding_scores.get(expert, 0.0) * 0.65)
        + (feature_scores.get(expert, 0.0) * 0.35)
        for expert in pipeline_facade._ROUTER_PROTOTYPES
    }
    weights = pipeline_facade._normalize_weight_map(combined)
    ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
    # NOTE(review): assumes _ROUTER_PROTOTYPES is non-empty; ranked[0] would
    # raise IndexError otherwise — confirm the prototype table is static.
    confidence_gap = ranked[0][1] - ranked[1][1] if len(ranked) > 1 else ranked[0][1]
    if confidence_gap < 0.06 and len(q.split()) >= 4:
        # Ambiguous ranking on a substantive query: let the LLM re-weight.
        llm_weights = pipeline_facade._llm_router_fallback(q)
        if llm_weights:
            weights = llm_weights
            ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
            confidence_gap = ranked[0][1] - ranked[1][1] if len(ranked) > 1 else ranked[0][1]
    return {
        "expert_weights": weights,
        "selected_experts": [expert for expert, score in ranked if score >= 0.18][:3],
        "confidence": round(confidence_gap, 4),
    }
|
| 149 |
+
|
backend/core/pipeline_supabase.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Supabase client builders and small DB helpers for the RAG pipeline.
|
| 3 |
+
|
| 4 |
+
Separated so API/worker code can use Supabase utilities without importing the
|
| 5 |
+
entire pipeline (LLMs, unstructured, etc.).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from supabase.client import create_client
|
| 12 |
+
except Exception: # optional at import time
|
| 13 |
+
create_client = None
|
| 14 |
+
|
| 15 |
+
from backend.core import config
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _build_service_supabase_client():
    """Service-role client (bypasses RLS). Use only for admin/bootstrap paths."""
    # `create_client` is None when supabase-py failed to import at module load;
    # fail loudly here (at first use) rather than with an opaque AttributeError.
    if create_client is None:
        raise RuntimeError("Missing dependency 'supabase'. Install supabase-py to use DB features.")
    return create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _build_user_supabase_client(access_token: str):
    """Build a tenant-scoped client that enforces RLS via the user's JWT.

    Args:
        access_token: The tenant's JWT, attached to subsequent PostgREST calls.

    Raises:
        RuntimeError: If supabase-py is not installed or SUPABASE_ANON_KEY is unset.
    """
    if create_client is None:
        raise RuntimeError("Missing dependency 'supabase'. Install supabase-py to use DB features.")
    if not config.SUPABASE_ANON_KEY:
        raise RuntimeError(
            "SUPABASE_ANON_KEY is not set but a tenant access_token was provided."
        )
    client = create_client(config.SUPABASE_URL, config.SUPABASE_ANON_KEY)
    # supabase-py v2: set JWT for RLS via postgrest auth header
    client.postgrest.auth(access_token)
    return client
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _build_supabase_client(access_token: str | None = None):
    """Build a Supabase client appropriate for the caller.

    Defaults to the service-role client (RLS bypassed) for legacy/internal
    call paths. API routes should always pass ``access_token`` so a
    tenant-scoped client is used and RLS is enforced.

    Args:
        access_token: Tenant JWT; when truthy, selects the user-scoped client.

    Returns:
        A supabase client instance (user-scoped or service-role).
    """
    # Fix: the parameter was annotated `str = None` (implicit Optional, rejected
    # by PEP 484 checkers); the module has `from __future__ import annotations`,
    # so `str | None` is safe on all supported runtimes.
    if access_token:
        return _build_user_supabase_client(access_token)
    return _build_service_supabase_client()
|
| 46 |
+
|
backend/core/pipeline_types.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Shared Pydantic schemas and lightweight types for the RAG pipeline.
|
| 3 |
+
|
| 4 |
+
Kept in a separate module so API/worker code can import types without pulling
|
| 5 |
+
in the full pipeline runtime (LLM clients, unstructured, etc.).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import List, Optional
|
| 11 |
+
|
| 12 |
+
from pydantic import BaseModel, Field
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class DocumentGraphMetadata(BaseModel):
    """
    Dynamic taxonomy classification.
    All fields have safe defaults so partial LLM responses never raise.
    """

    # NOTE: the Field `description` strings below are part of the generated
    # JSON schema (presumably shown to the classifier LLM) — treat them as
    # prompt text, not mere documentation.
    is_allowed: bool = Field(
        default=True,
        description=(
            "True for any real document with meaningful content. "
            "False ONLY for blank/empty files, pure spam, or completely unreadable content."
        ),
    )
    # Open-ended snake_case taxonomy label; new labels may be invented.
    document_type: str = Field(
        default="general_document",
        description=(
            "A snake_case category label. Choose from the existing list if a good match exists. "
            "Otherwise invent a concise new label e.g. 'machine_learning_paper', 'legal_contract'."
        ),
    )
    # Named entities extracted from the document (defaults to empty list).
    key_entities: List[str] = Field(
        default_factory=list,
        description="Names of algorithms, people, organizations, places, or technologies mentioned.",
    )
    # Broad themes (defaults to empty list).
    primary_topics: List[str] = Field(
        default_factory=list,
        description="The 2-3 broad themes of the document.",
    )
    # One-sentence summary with a safe placeholder default.
    brief_summary: str = Field(
        default="No summary available.",
        description="A one-sentence summary of what this document is about.",
    )
    # Absorb extra fields older LLM responses include — prevents Pydantic crash
    categories: Optional[List[str]] = Field(default=None, exclude=True)
    audience: Optional[str] = Field(default=None, exclude=True)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class QueryVariants(BaseModel):
    # Structured-output shape for query decomposition; the Field description
    # is emitted into the JSON schema (likely consumed by the LLM prompt).
    sub_queries: List[str] = Field(
        description="1-3 highly optimized, distinct search queries broken down from the original prompt."
    )
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class RouteDecision(BaseModel):
    # Route label assigned by the classifier, e.g. "factoid", "no_retrieval",
    # "page_scoped", "exact_fact", "follow_up", "compare", "multi_part",
    # "summary", "relational".
    route_class: str = Field(default="factoid")
    # Short machine-readable explanation of why this route was chosen.
    route_reason: str = Field(default="heuristic_default")
    # When True, downstream should use the user's literal query (no rewriting).
    preserve_query: bool = Field(default=False)
    # When True, skip conversation-memory augmentation for this query.
    disable_memory: bool = Field(default=False)
    # Page reference extracted from the query (set by page-scoped routes only).
    page_scope: Optional[str] = Field(default=None)
    # Detected identity-field name for exact-fact lookups, if any.
    exact_field: Optional[str] = Field(default=None)
| 65 |
+
|
backend/core/rate_limit.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from starlette.requests import Request
|
| 2 |
+
|
| 3 |
+
from backend.core.auth_utils import is_guest_token
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 7 |
+
from slowapi.errors import RateLimitExceeded
|
| 8 |
+
from slowapi.util import get_remote_address
|
| 9 |
+
except Exception: # optional in minimal/test envs
|
| 10 |
+
Limiter = None
|
| 11 |
+
RateLimitExceeded = Exception
|
| 12 |
+
_rate_limit_exceeded_handler = None
|
| 13 |
+
|
| 14 |
+
def get_remote_address(request): # type: ignore
|
| 15 |
+
return request.client.host if getattr(request, "client", None) else "unknown"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _rate_limit_key(request: Request) -> str:
    """Key limits by the caller's auth token; guest/anonymous traffic keys by IP."""
    token = request.headers.get("X-Auth-Token") or request.headers.get("Authorization")
    if token and token.startswith("Bearer "):
        # Strip the scheme prefix so the bare JWT is the bucket key.
        token = token.partition(" ")[2]
    if token and not is_guest_token(token):
        return token
    # Guests and unauthenticated callers share per-IP buckets (stricter).
    return get_remote_address(request)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Prefer slowapi's Limiter; in minimal/test environments (slowapi missing)
# fall back to a no-op decorator so route modules still import cleanly.
if Limiter is not None:
    limiter = Limiter(key_func=_rate_limit_key)
else:
    class _NoopLimiter:
        # Mirrors the decorator-factory shape of slowapi's `Limiter.limit`.
        def limit(self, *_args, **_kwargs):
            def _decorator(fn):
                return fn

            return _decorator

    limiter = _NoopLimiter()
|
| 39 |
+
|
backend/core/tasks.py
CHANGED
|
@@ -1,14 +1,60 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
-
|
|
|
|
| 3 |
from backend.core.pipeline import run_ingestion
|
| 4 |
|
|
|
|
|
|
|
| 5 |
# Initialize Celery pointing to your Redis broker
|
| 6 |
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
This runs in a completely separate background process!
|
| 14 |
We pass a progress_callback to run_ingestion so it can report its status.
|
|
@@ -21,23 +67,18 @@ def process_pdf_task(self, tmp_path: str, original_filename: str, access_token:
|
|
| 21 |
)
|
| 22 |
|
| 23 |
try:
|
| 24 |
-
|
| 25 |
-
result = run_ingestion(
|
| 26 |
pdf_path=tmp_path,
|
| 27 |
original_filename=original_filename,
|
| 28 |
progress_callback=update_progress,
|
| 29 |
access_token=access_token,
|
| 30 |
)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
# Reraising the exception tells Celery the task failed
|
| 41 |
-
raise Exception(str(e))
|
| 42 |
-
|
| 43 |
-
|
|
|
|
| 1 |
+
import logging
|
| 2 |
import os
|
| 3 |
+
|
| 4 |
+
from backend.core import config
|
| 5 |
from backend.core.pipeline import run_ingestion
|
| 6 |
|
| 7 |
+
log = logging.getLogger("morpheus.tasks")
|
| 8 |
+
|
| 9 |
# Initialize Celery pointing to your Redis broker
|
| 10 |
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
|
| 11 |
|
| 12 |
+
# Celery is optional: in minimal/test environments the import may fail, in
# which case `celery_app` stays None and background ingestion is disabled.
try:
    from celery import Celery
except Exception:
    Celery = None

if Celery is not None:
    # Redis serves as both the broker and the result backend.
    celery_app = Celery("morpheus_worker", broker=REDIS_URL, backend=REDIS_URL)
    celery_app.conf.update(
        # Report STARTED state; ack late + reject-on-lost so tasks interrupted
        # by a worker crash are re-queued instead of silently dropped.
        task_track_started=True,
        task_acks_late=True,
        task_reject_on_worker_lost=True,
        worker_cancel_long_running_tasks_on_connection_loss=True,
        # Retry the broker connection forever instead of dying on startup races.
        broker_connection_retry_on_startup=True,
        broker_connection_max_retries=None,
        broker_heartbeat=config.CELERY_BROKER_HEARTBEAT_S,
        broker_pool_limit=config.CELERY_BROKER_POOL_LIMIT,
        broker_transport_options={
            # NOTE(review): visibility_timeout should exceed the longest
            # expected ingestion run, or Redis re-delivers in-flight tasks —
            # confirm the configured value covers worst-case PDFs.
            "visibility_timeout": config.CELERY_VISIBILITY_TIMEOUT_S,
            "socket_keepalive": True,
            "socket_timeout": config.CELERY_REDIS_SOCKET_TIMEOUT_S,
            "socket_connect_timeout": config.CELERY_REDIS_SOCKET_TIMEOUT_S,
            "retry_on_timeout": True,
            "health_check_interval": config.CELERY_REDIS_HEALTH_CHECK_INTERVAL_S,
        },
        result_backend_transport_options={
            "visibility_timeout": config.CELERY_VISIBILITY_TIMEOUT_S,
            "retry_policy": {"timeout": config.CELERY_REDIS_SOCKET_TIMEOUT_S},
            "health_check_interval": config.CELERY_REDIS_HEALTH_CHECK_INTERVAL_S,
        },
    )
else:
    celery_app = None
|
| 44 |
|
| 45 |
+
|
| 46 |
+
def _cleanup_temp_upload(tmp_path: str) -> None:
|
| 47 |
+
if not tmp_path:
|
| 48 |
+
return
|
| 49 |
+
try:
|
| 50 |
+
os.unlink(tmp_path)
|
| 51 |
+
except FileNotFoundError:
|
| 52 |
+
return
|
| 53 |
+
except OSError as exc:
|
| 54 |
+
log.warning("Could not remove temp upload %s: %s", tmp_path, exc)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _process_pdf_task_impl(self, tmp_path: str, original_filename: str, access_token: str):
|
| 58 |
"""
|
| 59 |
This runs in a completely separate background process!
|
| 60 |
We pass a progress_callback to run_ingestion so it can report its status.
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
try:
|
| 70 |
+
return run_ingestion(
|
|
|
|
| 71 |
pdf_path=tmp_path,
|
| 72 |
original_filename=original_filename,
|
| 73 |
progress_callback=update_progress,
|
| 74 |
access_token=access_token,
|
| 75 |
)
|
| 76 |
+
finally:
|
| 77 |
+
_cleanup_temp_upload(tmp_path)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
if celery_app is not None:
    # Register the implementation as a Celery task. bind=True makes Celery
    # pass the task instance as the first argument (`self`).
    process_pdf_task = celery_app.task(bind=True)(_process_pdf_task_impl)
else:
    # Keep the symbol importable so API modules fail at call time (with a
    # clear message) rather than at import time when Celery is absent.
    def process_pdf_task(*_args, **_kwargs):
        raise RuntimeError("Celery not installed; background ingestion is unavailable.")
|
|
|
|
|
|
|
|
|
|
|
|
backend/core/warmup_classifier.py
CHANGED
|
@@ -19,7 +19,6 @@ Usage:
|
|
| 19 |
|
| 20 |
import numpy as np
|
| 21 |
import logging
|
| 22 |
-
from supabase.client import create_client
|
| 23 |
from dotenv import load_dotenv
|
| 24 |
from backend.core import config
|
| 25 |
|
|
@@ -32,6 +31,11 @@ log = logging.getLogger("warmup")
|
|
| 32 |
|
| 33 |
|
| 34 |
def warmup():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 36 |
|
| 37 |
# Step 1 — find which categories already have centroids
|
|
|
|
| 19 |
|
| 20 |
import numpy as np
|
| 21 |
import logging
|
|
|
|
| 22 |
from dotenv import load_dotenv
|
| 23 |
from backend.core import config
|
| 24 |
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def warmup():
|
| 34 |
+
try:
|
| 35 |
+
from supabase.client import create_client
|
| 36 |
+
except Exception as exc:
|
| 37 |
+
raise RuntimeError("Missing dependency 'supabase'. Install supabase-py to warm up classifier.") from exc
|
| 38 |
+
|
| 39 |
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 40 |
|
| 41 |
# Step 1 — find which categories already have centroids
|
backend/eval/run_eval.py
CHANGED
|
@@ -75,6 +75,72 @@ def _load_from_supabase(
|
|
| 75 |
return res.data or []
|
| 76 |
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def _parse_csv_floats(s: str) -> List[float]:
|
| 79 |
return [float(x.strip()) for x in s.split(",") if x.strip()]
|
| 80 |
|
|
|
|
| 75 |
return res.data or []
|
| 76 |
|
| 77 |
|
| 78 |
+
def load_feedback_dataset_candidates(
|
| 79 |
+
access_token: Optional[str],
|
| 80 |
+
user_id: Optional[str],
|
| 81 |
+
*,
|
| 82 |
+
limit: int = 50,
|
| 83 |
+
) -> List[Dict[str, Any]]:
|
| 84 |
+
"""
|
| 85 |
+
Promote explicit user feedback into dataset-shaped rows for offline eval curation.
|
| 86 |
+
These candidates are intentionally separate from `evaluation_datasets` so we can
|
| 87 |
+
review them before activation.
|
| 88 |
+
"""
|
| 89 |
+
from backend.core.pipeline import _build_service_supabase_client
|
| 90 |
+
|
| 91 |
+
sb = _build_service_supabase_client()
|
| 92 |
+
feedback_q = (
|
| 93 |
+
sb.table("answer_feedback")
|
| 94 |
+
.select("trace_id, helpful, accepted, reason_code, correction_text, promote_to_eval, user_id")
|
| 95 |
+
.eq("promote_to_eval", True)
|
| 96 |
+
.limit(limit)
|
| 97 |
+
)
|
| 98 |
+
if user_id:
|
| 99 |
+
feedback_q = feedback_q.eq("user_id", user_id)
|
| 100 |
+
feedback_rows = feedback_q.execute().data or []
|
| 101 |
+
trace_ids = [row.get("trace_id") for row in feedback_rows if row.get("trace_id")]
|
| 102 |
+
if not trace_ids:
|
| 103 |
+
return []
|
| 104 |
+
|
| 105 |
+
trace_rows = (
|
| 106 |
+
sb.table("query_traces")
|
| 107 |
+
.select("trace_id, question, doc_diagnostics, failure_modes, answer_preview")
|
| 108 |
+
.in_("trace_id", trace_ids)
|
| 109 |
+
.execute()
|
| 110 |
+
.data
|
| 111 |
+
or []
|
| 112 |
+
)
|
| 113 |
+
trace_map = {row.get("trace_id"): row for row in trace_rows if row.get("trace_id")}
|
| 114 |
+
|
| 115 |
+
dataset_rows: List[Dict[str, Any]] = []
|
| 116 |
+
seen_trace_ids = set()
|
| 117 |
+
for feedback in feedback_rows:
|
| 118 |
+
trace_id = feedback.get("trace_id")
|
| 119 |
+
if trace_id in seen_trace_ids:
|
| 120 |
+
continue
|
| 121 |
+
trace = trace_map.get(trace_id, {})
|
| 122 |
+
question = (trace.get("question") or "").strip()
|
| 123 |
+
if not question:
|
| 124 |
+
continue
|
| 125 |
+
seen_trace_ids.add(trace_id)
|
| 126 |
+
correction_text = (feedback.get("correction_text") or "").strip()
|
| 127 |
+
answer_preview = (trace.get("answer_preview") or "").strip()
|
| 128 |
+
dataset_rows.append(
|
| 129 |
+
{
|
| 130 |
+
"question": question,
|
| 131 |
+
"gold_context_refs": [],
|
| 132 |
+
"gold_evidence_text": correction_text or answer_preview,
|
| 133 |
+
"is_answerable": bool(feedback.get("accepted") or feedback.get("helpful")),
|
| 134 |
+
"trace_id": trace_id,
|
| 135 |
+
"failure_modes": trace.get("failure_modes") or [],
|
| 136 |
+
"doc_diagnostics": trace.get("doc_diagnostics") or [],
|
| 137 |
+
"reason_code": feedback.get("reason_code"),
|
| 138 |
+
"source": "feedback_trace",
|
| 139 |
+
}
|
| 140 |
+
)
|
| 141 |
+
return dataset_rows
|
| 142 |
+
|
| 143 |
+
|
| 144 |
def _parse_csv_floats(s: str) -> List[float]:
|
| 145 |
return [float(x.strip()) for x in s.split(",") if x.strip()]
|
| 146 |
|
backend/main.py
CHANGED
|
@@ -7,19 +7,6 @@ Production: gunicorn -w 1 -k uvicorn.workers.UvicornWorker backend.main:app --b
|
|
| 7 |
|
| 8 |
import os
|
| 9 |
import sys
|
| 10 |
-
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 11 |
-
from slowapi.util import get_remote_address
|
| 12 |
-
from slowapi.errors import RateLimitExceeded
|
| 13 |
-
from starlette.requests import Request
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def _rate_limit_key(request: Request) -> str:
|
| 17 |
-
"""Key rate limits by JWT token (per-user), fall back to IP."""
|
| 18 |
-
token = request.headers.get("X-Auth-Token") or request.headers.get("Authorization")
|
| 19 |
-
return token or get_remote_address(request)
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
limiter = Limiter(key_func=_rate_limit_key)
|
| 23 |
import logging # noqa: E402
|
| 24 |
import subprocess # noqa: E402
|
| 25 |
from contextlib import asynccontextmanager # noqa: E402
|
|
@@ -32,8 +19,17 @@ from dotenv import load_dotenv # noqa: E402
|
|
| 32 |
|
| 33 |
load_dotenv()
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
from backend.api import auth, corpus, ingest, query, admin, frontend_config # noqa: E402
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
log = logging.getLogger("morpheus.main")
|
| 39 |
|
|
@@ -87,7 +83,8 @@ app = FastAPI(
|
|
| 87 |
|
| 88 |
# ── Rate limiting ─────────────────────────────────────────────────────────────
|
| 89 |
app.state.limiter = limiter
|
| 90 |
-
|
|
|
|
| 91 |
|
| 92 |
_origins = [
|
| 93 |
o.strip() for o in os.getenv("ALLOWED_ORIGINS", "*").split(",") if o.strip()
|
|
|
|
| 7 |
|
| 8 |
import os
|
| 9 |
import sys
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import logging # noqa: E402
|
| 11 |
import subprocess # noqa: E402
|
| 12 |
from contextlib import asynccontextmanager # noqa: E402
|
|
|
|
| 19 |
|
| 20 |
load_dotenv()
|
| 21 |
|
| 22 |
+
from backend.core.rate_limit import ( # noqa: E402
|
| 23 |
+
RateLimitExceeded,
|
| 24 |
+
_rate_limit_exceeded_handler,
|
| 25 |
+
limiter,
|
| 26 |
+
)
|
| 27 |
from backend.api import auth, corpus, ingest, query, admin, frontend_config # noqa: E402
|
| 28 |
+
try: # noqa: E402
|
| 29 |
+
from backend.core.intent_classifier import get_intent_classifier_status
|
| 30 |
+
except Exception:
|
| 31 |
+
def get_intent_classifier_status(): # type: ignore
|
| 32 |
+
return {"ok": False, "reason": "intent_classifier_unavailable"}
|
| 33 |
|
| 34 |
log = logging.getLogger("morpheus.main")
|
| 35 |
|
|
|
|
| 83 |
|
| 84 |
# ── Rate limiting ─────────────────────────────────────────────────────────────
|
| 85 |
app.state.limiter = limiter
|
| 86 |
+
if _rate_limit_exceeded_handler is not None:
|
| 87 |
+
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
|
| 88 |
|
| 89 |
_origins = [
|
| 90 |
o.strip() for o in os.getenv("ALLOWED_ORIGINS", "*").split(",") if o.strip()
|
frontend/index.html
CHANGED
|
@@ -212,24 +212,26 @@
|
|
| 212 |
onclick="submitLogin()"
|
| 213 |
>SIGN IN →</button>
|
| 214 |
|
| 215 |
-
<
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
</div>
|
| 234 |
</div>
|
| 235 |
|
|
@@ -251,8 +253,19 @@
|
|
| 251 |
<button class="nav-btn" id="nav-chat" onclick="switchView('chat')">
|
| 252 |
CHAT
|
| 253 |
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
</nav>
|
| 255 |
<div class="topbar-right">
|
|
|
|
|
|
|
|
|
|
| 256 |
<div class="stat-pill">
|
| 257 |
DOCS <span class="val" id="stat-docs">0</span>
|
| 258 |
</div>
|
|
@@ -263,6 +276,7 @@
|
|
| 263 |
<div class="conn-dot offline" id="conn-dot"></div>
|
| 264 |
<span id="conn-label">OFFLINE</span>
|
| 265 |
</div>
|
|
|
|
| 266 |
<button onclick="signOut()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">SIGN OUT</button>
|
| 267 |
</div>
|
| 268 |
</header>
|
|
@@ -486,6 +500,43 @@
|
|
| 486 |
</button>
|
| 487 |
</div>
|
| 488 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
</aside>
|
| 490 |
<!-- Mobile bottom navigation — must be inside #app for grid to work -->
|
| 491 |
<div id="mobile-nav">
|
|
@@ -525,7 +576,8 @@
|
|
| 525 |
<script src="js/corpus.js"></script>
|
| 526 |
<script src="js/inspect.js"></script>
|
| 527 |
<script src="js/chat.js?v=3"></script>
|
| 528 |
-
<script src="js/
|
|
|
|
| 529 |
<script>
|
| 530 |
function mobileNav(tab) {
|
| 531 |
document
|
|
@@ -563,4 +615,4 @@
|
|
| 563 |
}
|
| 564 |
</script>
|
| 565 |
</body>
|
| 566 |
-
</html>
|
|
|
|
| 212 |
onclick="submitLogin()"
|
| 213 |
>SIGN IN →</button>
|
| 214 |
|
| 215 |
+
<button
|
| 216 |
+
id="guestBtn"
|
| 217 |
+
class="btn-secondary"
|
| 218 |
+
style="width:100%; letter-spacing:0.1em; margin-top:10px; display:none;"
|
| 219 |
+
onclick="submitGuest()"
|
| 220 |
+
>CONTINUE AS GUEST</button>
|
| 221 |
+
<label
|
| 222 |
+
id="guestPersistWrap"
|
| 223 |
+
style="display:none;width:100%;margin-top:10px;font-size:0.68rem;color:var(--muted);line-height:1.45;text-align:left;"
|
| 224 |
+
>
|
| 225 |
+
<input type="checkbox" id="guestPersist" style="margin-right:8px;accent-color:var(--phosphor);" />
|
| 226 |
+
Keep this guest workspace on this device
|
| 227 |
+
</label>
|
| 228 |
+
<div
|
| 229 |
+
id="guestInfo"
|
| 230 |
+
style="display:none;font-size:0.68rem;color:var(--muted);text-align:center;margin-top:8px;line-height:1.5;"
|
| 231 |
+
>
|
| 232 |
+
Guest mode is isolated and rate-limited. By default it expires when the guest session truly ends.
|
| 233 |
+
</div>
|
| 234 |
+
|
| 235 |
</div>
|
| 236 |
</div>
|
| 237 |
|
|
|
|
| 253 |
<button class="nav-btn" id="nav-chat" onclick="switchView('chat')">
|
| 254 |
CHAT
|
| 255 |
</button>
|
| 256 |
+
<button
|
| 257 |
+
class="nav-btn"
|
| 258 |
+
id="nav-admin"
|
| 259 |
+
onclick="switchView('admin')"
|
| 260 |
+
style="display: none"
|
| 261 |
+
>
|
| 262 |
+
ADMIN
|
| 263 |
+
</button>
|
| 264 |
</nav>
|
| 265 |
<div class="topbar-right">
|
| 266 |
+
<div class="stat-pill" id="session-mode-pill" style="display:none;">
|
| 267 |
+
MODE <span class="val" id="session-mode-label">GUEST</span>
|
| 268 |
+
</div>
|
| 269 |
<div class="stat-pill">
|
| 270 |
DOCS <span class="val" id="stat-docs">0</span>
|
| 271 |
</div>
|
|
|
|
| 276 |
<div class="conn-dot offline" id="conn-dot"></div>
|
| 277 |
<span id="conn-label">OFFLINE</span>
|
| 278 |
</div>
|
| 279 |
+
<button onclick="unlockOperatorTools()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">OPERATOR</button>
|
| 280 |
<button onclick="signOut()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">SIGN OUT</button>
|
| 281 |
</div>
|
| 282 |
</header>
|
|
|
|
| 500 |
</button>
|
| 501 |
</div>
|
| 502 |
</div>
|
| 503 |
+
|
| 504 |
+
<!-- ── ADMIN VIEW ── -->
|
| 505 |
+
<div class="view" id="view-admin">
|
| 506 |
+
<div class="view-header">
|
| 507 |
+
<div class="view-title">ADMIN REVIEW</div>
|
| 508 |
+
<div class="view-subtitle">Trace triage, feedback, and eval promotion</div>
|
| 509 |
+
</div>
|
| 510 |
+
<div class="view-body" style="padding-top: 12px">
|
| 511 |
+
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:14px;">
|
| 512 |
+
<input type="text" id="adminTraceFailure" placeholder="failure mode" style="flex:1;min-width:120px;" />
|
| 513 |
+
<input type="text" id="adminTraceCategory" placeholder="category" style="flex:1;min-width:120px;" />
|
| 514 |
+
<select id="adminTraceRoute" style="flex:1;min-width:120px;">
|
| 515 |
+
<option value="">All routes</option>
|
| 516 |
+
<option value="default">default</option>
|
| 517 |
+
<option value="single">single</option>
|
| 518 |
+
<option value="generic_pinned">generic_pinned</option>
|
| 519 |
+
<option value="explicit_compare">explicit_compare</option>
|
| 520 |
+
</select>
|
| 521 |
+
<button class="btn-secondary" onclick="refreshAdminDashboard()">REFRESH</button>
|
| 522 |
+
</div>
|
| 523 |
+
<div id="adminSummary" style="font-size:0.78rem;color:var(--muted);margin-bottom:12px;"></div>
|
| 524 |
+
<div style="display:grid;gap:14px;">
|
| 525 |
+
<div>
|
| 526 |
+
<div class="section-label">Recent Traces</div>
|
| 527 |
+
<div id="adminTraceList"></div>
|
| 528 |
+
</div>
|
| 529 |
+
<div>
|
| 530 |
+
<div class="section-label">Trace Detail</div>
|
| 531 |
+
<div id="adminTraceDetail"></div>
|
| 532 |
+
</div>
|
| 533 |
+
<div>
|
| 534 |
+
<div class="section-label">Recent Feedback</div>
|
| 535 |
+
<div id="adminFeedbackList"></div>
|
| 536 |
+
</div>
|
| 537 |
+
</div>
|
| 538 |
+
</div>
|
| 539 |
+
</div>
|
| 540 |
</aside>
|
| 541 |
<!-- Mobile bottom navigation — must be inside #app for grid to work -->
|
| 542 |
<div id="mobile-nav">
|
|
|
|
| 576 |
<script src="js/corpus.js"></script>
|
| 577 |
<script src="js/inspect.js"></script>
|
| 578 |
<script src="js/chat.js?v=3"></script>
|
| 579 |
+
<script src="js/admin.js?v=1"></script>
|
| 580 |
+
<script src="js/main.js?v=1"></script>
|
| 581 |
<script>
|
| 582 |
function mobileNav(tab) {
|
| 583 |
document
|
|
|
|
| 615 |
}
|
| 616 |
</script>
|
| 617 |
</body>
|
| 618 |
+
</html>
|
frontend/js/admin.js
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
function _adminBadge(text, tone = 'muted') {
|
| 2 |
+
const color = tone === 'danger' ? '#fb7185' : tone === 'success' ? '#34d399' : '#93c5fd';
|
| 3 |
+
return `<span style="display:inline-block;padding:2px 8px;border:1px solid ${color};border-radius:999px;font-size:0.72rem;color:${color};margin-right:6px;">${esc(text)}</span>`;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
function _adminPages(pages) {
|
| 7 |
+
if (!Array.isArray(pages) || !pages.length) return 'none';
|
| 8 |
+
return pages.join(', ');
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
function _adminSignalBadges(quality) {
|
| 12 |
+
const badges = [];
|
| 13 |
+
badges.push(_adminBadge(`route ${quality.route_class || 'factoid'}`));
|
| 14 |
+
if (quality.route_reason) badges.push(_adminBadge(`reason ${quality.route_reason}`));
|
| 15 |
+
badges.push(_adminBadge(`identity ${quality.identity_store_hit ? 'hit' : 'miss'}`, quality.identity_store_hit ? 'success' : 'muted'));
|
| 16 |
+
if (quality.history_injected) badges.push(_adminBadge('history injected', 'danger'));
|
| 17 |
+
if (quality.memory_injected) badges.push(_adminBadge('memory injected', 'danger'));
|
| 18 |
+
if (quality.sanitizer_triggered) badges.push(_adminBadge(`sanitized ${Number(quality.sanitized_token_count || 0)}`, 'danger'));
|
| 19 |
+
if (quality.page_scope_required) badges.push(_adminBadge(`pages ${quality.page_scope_supported ? 'supported' : 'violated'}`, quality.page_scope_supported ? 'success' : 'danger'));
|
| 20 |
+
return badges.join('');
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
function _adminRerankAudit(quality) {
|
| 24 |
+
const deltas = Array.isArray(quality.rerank_deltas) ? quality.rerank_deltas : [];
|
| 25 |
+
if (!deltas.length) return '<div class="confirm-zone">No rerank audit captured.</div>';
|
| 26 |
+
return deltas.slice(0, 8).map(delta => `
|
| 27 |
+
<div style="padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
|
| 28 |
+
<div style="font-weight:600;color:#dbeafe">${esc(delta.chunk_id || delta.source || 'candidate')}</div>
|
| 29 |
+
<div style="font-size:0.78rem;color:#94a3b8;">
|
| 30 |
+
pre ${Number(delta.pre_rank ?? -1)} → post ${Number(delta.post_rank ?? -1)} ·
|
| 31 |
+
branch ${esc(delta.branch || 'unknown')} ·
|
| 32 |
+
score ${Number(delta.score ?? 0).toFixed(2)} ·
|
| 33 |
+
pages ${esc(_adminPages(delta.page_numbers || []))}
|
| 34 |
+
</div>
|
| 35 |
+
</div>
|
| 36 |
+
`).join('');
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
function _renderTraceSummary(trace) {
|
| 40 |
+
const failures = Array.isArray(trace.failure_modes) ? trace.failure_modes : [];
|
| 41 |
+
const experts = Array.isArray(trace.selected_experts) ? trace.selected_experts : [];
|
| 42 |
+
const quality = trace.quality_metrics || {};
|
| 43 |
+
return `
|
| 44 |
+
<div style="padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);margin-bottom:10px;">
|
| 45 |
+
<div style="display:flex;justify-content:space-between;gap:12px;align-items:flex-start;">
|
| 46 |
+
<div>
|
| 47 |
+
<div style="font-weight:600;color:#e2e8f0;">${esc(trace.question || 'Untitled trace')}</div>
|
| 48 |
+
<div style="font-size:0.78rem;color:#94a3b8;margin-top:4px;">${esc(trace.trace_id || '')}</div>
|
| 49 |
+
</div>
|
| 50 |
+
<div style="font-size:0.76rem;color:#94a3b8;text-align:right;">
|
| 51 |
+
<div>${esc(trace.route_mode || 'default')} · ${esc(quality.route_class || 'factoid')}</div>
|
| 52 |
+
<div>${esc(trace.review_state || 'pending')}</div>
|
| 53 |
+
</div>
|
| 54 |
+
</div>
|
| 55 |
+
<div style="margin-top:10px;">${experts.map(exp => _adminBadge(exp)).join('')}</div>
|
| 56 |
+
<div style="margin-top:8px;">${_adminSignalBadges(quality)}</div>
|
| 57 |
+
<div style="margin-top:8px;">${failures.length ? failures.map(f => _adminBadge(f, 'danger')).join('') : _adminBadge('no failure flags', 'success')}</div>
|
| 58 |
+
<div style="font-size:0.78rem;color:#cbd5e1;margin-top:10px;">
|
| 59 |
+
relevance ${Number(quality.retrieval_relevance_proxy ?? 0).toFixed(2)} ·
|
| 60 |
+
balance ${Number(quality.document_balance ?? 0).toFixed(2)} ·
|
| 61 |
+
thin docs ${Number(quality.thin_doc_count ?? 0)} ·
|
| 62 |
+
pages ${esc(_adminPages(quality.selected_page_numbers || []))}
|
| 63 |
+
</div>
|
| 64 |
+
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
|
| 65 |
+
<button class="btn-secondary" onclick="selectAdminTrace('${esc(trace.trace_id)}')">OPEN</button>
|
| 66 |
+
<button class="btn-secondary" onclick="reviewAdminTrace('${esc(trace.trace_id)}','reviewed')">MARK REVIEWED</button>
|
| 67 |
+
<button class="btn-danger" onclick="reviewAdminTrace('${esc(trace.trace_id)}','rejected')">REJECT</button>
|
| 68 |
+
</div>
|
| 69 |
+
</div>
|
| 70 |
+
`;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
function _renderTraceDetail(trace, feedbackRows) {
|
| 74 |
+
if (!trace) return '<div class="confirm-zone">No trace selected yet.</div>';
|
| 75 |
+
const quality = trace.quality_metrics || {};
|
| 76 |
+
const feedbackHtml = (feedbackRows || []).map(row => `
|
| 77 |
+
<div style="padding:10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
|
| 78 |
+
<div style="font-size:0.76rem;color:#94a3b8;">Feedback #${row.id} · ${esc(row.review_state || 'pending')}</div>
|
| 79 |
+
<div style="font-size:0.86rem;color:#e2e8f0;margin-top:4px;">
|
| 80 |
+
helpful=${String(row.helpful)} · accepted=${String(row.accepted)} · reason=${esc(row.reason_code || 'none')}
|
| 81 |
+
</div>
|
| 82 |
+
${row.correction_text ? `<div style="margin-top:6px;color:#cbd5e1;">${esc(row.correction_text)}</div>` : ''}
|
| 83 |
+
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
|
| 84 |
+
<button class="btn-secondary" onclick="reviewAdminFeedback(${row.id},'reviewed')">REVIEW</button>
|
| 85 |
+
<button class="btn-danger" onclick="reviewAdminFeedback(${row.id},'rejected')">REJECT</button>
|
| 86 |
+
<button class="btn-primary" onclick="promoteAdminFeedback(${row.id})">PROMOTE TO EVAL</button>
|
| 87 |
+
</div>
|
| 88 |
+
</div>
|
| 89 |
+
`).join('');
|
| 90 |
+
|
| 91 |
+
const diagnostics = (trace.doc_diagnostics || []).map(diag => `
|
| 92 |
+
<div style="padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
|
| 93 |
+
<div style="font-weight:600;color:#dbeafe">${esc(diag.source || diag.file_hash || 'Unknown')}</div>
|
| 94 |
+
<div style="font-size:0.78rem;color:#94a3b8;">${esc(diag.reason || 'unknown')} · support ${esc(diag.support_label || diag.confidence_label || 'unknown')} · candidates ${Number(diag.candidate_count ?? 0)}</div>
|
| 95 |
+
</div>
|
| 96 |
+
`).join('');
|
| 97 |
+
|
| 98 |
+
return `
|
| 99 |
+
<div style="padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);">
|
| 100 |
+
<div style="font-size:0.78rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Question</div>
|
| 101 |
+
<div style="color:#e2e8f0;margin-top:6px;">${esc(trace.question || '')}</div>
|
| 102 |
+
<div style="font-size:0.78rem;color:#94a3b8;margin-top:10px;">${esc(trace.trace_id || '')}</div>
|
| 103 |
+
<div style="margin-top:12px;">
|
| 104 |
+
${(trace.failure_modes || []).map(flag => _adminBadge(flag, 'danger')).join('')}
|
| 105 |
+
</div>
|
| 106 |
+
<div style="margin-top:10px;">${_adminSignalBadges(quality)}</div>
|
| 107 |
+
<div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Trace Signals</div>
|
| 108 |
+
<div style="margin-top:8px;padding:10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);color:#cbd5e1;">
|
| 109 |
+
<div>route ${esc(quality.route_class || 'factoid')} · ${esc(quality.route_reason || 'heuristic_default')}</div>
|
| 110 |
+
<div style="margin-top:4px;">identity store ${esc(quality.identity_store_hit ? 'hit' : 'miss')} · history ${esc(quality.history_injected ? 'yes' : 'no')} · memory ${esc(quality.memory_injected ? 'yes' : 'no')}</div>
|
| 111 |
+
<div style="margin-top:4px;">pages ${esc(_adminPages(quality.selected_page_numbers || []))} · opening candidates ${Number(quality.opening_page_candidate_count ?? 0)} · opening selected ${Number(quality.opening_page_selected_count ?? 0)}</div>
|
| 112 |
+
<div style="margin-top:4px;">page scope ${esc(quality.page_scope_required ? 'required' : 'not required')} · ${esc(quality.page_scope_supported ? 'supported' : 'violated')}</div>
|
| 113 |
+
<div style="margin-top:4px;">sanitizer ${esc(quality.sanitizer_triggered ? 'triggered' : 'clean')} · tokens removed ${Number(quality.sanitized_token_count ?? 0)}</div>
|
| 114 |
+
</div>
|
| 115 |
+
<div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Experts</div>
|
| 116 |
+
<pre style="white-space:pre-wrap;background:rgba(2,6,23,0.9);padding:10px;border-radius:8px;border:1px solid #1e293b;color:#cbd5e1;">${esc(JSON.stringify({
|
| 117 |
+
selected_experts: trace.selected_experts || [],
|
| 118 |
+
expert_weights: trace.expert_weights || {},
|
| 119 |
+
quality_metrics: quality,
|
| 120 |
+
selected_chunk_ids: trace.selected_chunk_ids || [],
|
| 121 |
+
}, null, 2))}</pre>
|
| 122 |
+
<div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Rerank Audit</div>
|
| 123 |
+
${_adminRerankAudit(quality)}
|
| 124 |
+
<div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Diagnostics</div>
|
| 125 |
+
${diagnostics || '<div class="confirm-zone">No diagnostics captured.</div>'}
|
| 126 |
+
<div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Answer Preview</div>
|
| 127 |
+
<div style="margin-top:8px;padding:10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);color:#cbd5e1;white-space:pre-wrap;">${esc(trace.answer_preview || '')}</div>
|
| 128 |
+
<div style="margin-top:14px;font-size:0.8rem;color:#7dd3fc;letter-spacing:0.12em;text-transform:uppercase;">Linked Feedback</div>
|
| 129 |
+
${feedbackHtml || '<div class="confirm-zone">No linked feedback yet.</div>'}
|
| 130 |
+
</div>
|
| 131 |
+
`;
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
function _renderFeedbackList(rows) {
|
| 135 |
+
if (!rows.length) return '<div class="confirm-zone">No feedback captured yet.</div>';
|
| 136 |
+
return rows.map(row => `
|
| 137 |
+
<div style="padding:10px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);margin-bottom:10px;">
|
| 138 |
+
<div style="font-size:0.76rem;color:#94a3b8;">Feedback #${row.id} · trace ${esc(row.trace_id || '')}</div>
|
| 139 |
+
<div style="color:#e2e8f0;margin-top:4px;">helpful=${String(row.helpful)} · accepted=${String(row.accepted)} · ${esc(row.reason_code || 'no reason')}</div>
|
| 140 |
+
${row.correction_text ? `<div style="margin-top:6px;color:#cbd5e1;">${esc(row.correction_text)}</div>` : ''}
|
| 141 |
+
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:10px;">
|
| 142 |
+
<button class="btn-secondary" onclick="openAdminFeedbackTrace('${esc(row.trace_id || '')}')">OPEN TRACE</button>
|
| 143 |
+
<button class="btn-secondary" onclick="reviewAdminFeedback(${row.id},'reviewed')">REVIEW</button>
|
| 144 |
+
<button class="btn-danger" onclick="reviewAdminFeedback(${row.id},'rejected')">REJECT</button>
|
| 145 |
+
<button class="btn-primary" onclick="promoteAdminFeedback(${row.id})">PROMOTE</button>
|
| 146 |
+
</div>
|
| 147 |
+
</div>
|
| 148 |
+
`).join('');
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
async function refreshAdminDashboard() {
|
| 152 |
+
if (!STATE.adminUnlocked || !STATE.adminKey) return;
|
| 153 |
+
const params = {
|
| 154 |
+
limit: 20,
|
| 155 |
+
failure_mode: document.getElementById('adminTraceFailure')?.value || '',
|
| 156 |
+
category: document.getElementById('adminTraceCategory')?.value || '',
|
| 157 |
+
route_mode: document.getElementById('adminTraceRoute')?.value || '',
|
| 158 |
+
};
|
| 159 |
+
const [traceRes, feedbackRes] = await Promise.all([
|
| 160 |
+
apiAdminListTraces(STATE.adminKey, params),
|
| 161 |
+
apiAdminListFeedback(STATE.adminKey, { limit: 20 }),
|
| 162 |
+
]);
|
| 163 |
+
STATE.adminTraces = traceRes.items || [];
|
| 164 |
+
STATE.adminFeedback = feedbackRes.items || [];
|
| 165 |
+
document.getElementById('adminSummary').textContent =
|
| 166 |
+
`${STATE.adminTraces.length} trace(s), ${STATE.adminFeedback.length} feedback row(s) loaded.`;
|
| 167 |
+
document.getElementById('adminTraceList').innerHTML = STATE.adminTraces.map(_renderTraceSummary).join('');
|
| 168 |
+
document.getElementById('adminFeedbackList').innerHTML = _renderFeedbackList(STATE.adminFeedback);
|
| 169 |
+
if (STATE.selectedTraceId) {
|
| 170 |
+
await selectAdminTrace(STATE.selectedTraceId);
|
| 171 |
+
} else {
|
| 172 |
+
document.getElementById('adminTraceDetail').innerHTML = '<div class="confirm-zone">Select a trace to inspect it.</div>';
|
| 173 |
+
}
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
async function selectAdminTrace(traceId) {
|
| 177 |
+
if (!STATE.adminUnlocked || !STATE.adminKey || !traceId) return;
|
| 178 |
+
STATE.selectedTraceId = traceId;
|
| 179 |
+
const detail = await apiAdminGetTrace(STATE.adminKey, traceId);
|
| 180 |
+
document.getElementById('adminTraceDetail').innerHTML = _renderTraceDetail(detail.trace, detail.feedback || []);
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
async function openAdminFeedbackTrace(traceId) {
|
| 184 |
+
if (!traceId) return;
|
| 185 |
+
await selectAdminTrace(traceId);
|
| 186 |
+
switchView('admin');
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
async function reviewAdminTrace(traceId, reviewState) {
|
| 190 |
+
if (!STATE.adminKey) return;
|
| 191 |
+
const reviewNotes = window.prompt(`Notes for ${reviewState}?`, '') || null;
|
| 192 |
+
await apiAdminReviewTrace(STATE.adminKey, traceId, {
|
| 193 |
+
review_state: reviewState,
|
| 194 |
+
review_notes: reviewNotes,
|
| 195 |
+
});
|
| 196 |
+
toast(`Trace marked ${reviewState}.`, 'success');
|
| 197 |
+
await refreshAdminDashboard();
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
async function reviewAdminFeedback(feedbackId, reviewState) {
|
| 201 |
+
if (!STATE.adminKey) return;
|
| 202 |
+
const reviewNotes = window.prompt(`Notes for ${reviewState}?`, '') || null;
|
| 203 |
+
await apiAdminReviewFeedback(STATE.adminKey, feedbackId, {
|
| 204 |
+
review_state: reviewState,
|
| 205 |
+
review_notes: reviewNotes,
|
| 206 |
+
});
|
| 207 |
+
toast(`Feedback marked ${reviewState}.`, 'success');
|
| 208 |
+
await refreshAdminDashboard();
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
async function promoteAdminFeedback(feedbackId) {
|
| 212 |
+
if (!STATE.adminKey) return;
|
| 213 |
+
await apiAdminPromoteFeedback(STATE.adminKey, feedbackId);
|
| 214 |
+
toast('Feedback promoted to evaluation_datasets.', 'success');
|
| 215 |
+
await refreshAdminDashboard();
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
function enableAdminReview(adminKey) {
|
| 219 |
+
STATE.adminKey = adminKey;
|
| 220 |
+
STATE.adminUnlocked = true;
|
| 221 |
+
const nav = document.getElementById('nav-admin');
|
| 222 |
+
if (nav) nav.style.display = '';
|
| 223 |
+
refreshAdminDashboard().catch(err => {
|
| 224 |
+
toast(`Admin dashboard failed: ${err.message}`, 'error');
|
| 225 |
+
});
|
| 226 |
+
}
|
| 227 |
+
|
| 228 |
+
window.refreshAdminDashboard = refreshAdminDashboard;
|
| 229 |
+
window.selectAdminTrace = selectAdminTrace;
|
| 230 |
+
window.openAdminFeedbackTrace = openAdminFeedbackTrace;
|
| 231 |
+
window.reviewAdminTrace = reviewAdminTrace;
|
| 232 |
+
window.reviewAdminFeedback = reviewAdminFeedback;
|
| 233 |
+
window.promoteAdminFeedback = promoteAdminFeedback;
|
| 234 |
+
window.enableAdminReview = enableAdminReview;
|
frontend/js/api.js
CHANGED
|
@@ -18,13 +18,38 @@
|
|
| 18 |
*/
|
| 19 |
async function getSupabaseToken() {
|
| 20 |
try {
|
| 21 |
-
const
|
|
|
|
|
|
|
| 22 |
return data.session?.access_token ?? null;
|
| 23 |
} catch {
|
| 24 |
return null;
|
| 25 |
}
|
| 26 |
}
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
// ── Core fetch wrapper ────────────────────────────────────────────────────────
|
| 29 |
async function apiFetch(path, opts = {}) {
|
| 30 |
// Always pull a fresh token — Supabase auto-refreshes silently.
|
|
@@ -41,7 +66,7 @@ async function apiFetch(path, opts = {}) {
|
|
| 41 |
|
| 42 |
if (!res.ok) {
|
| 43 |
let detail = `HTTP ${res.status}`;
|
| 44 |
-
try { detail = (await res.json()).detail || detail; } catch {}
|
| 45 |
throw new Error(detail);
|
| 46 |
}
|
| 47 |
|
|
@@ -55,7 +80,7 @@ async function apiVerifyPassword(password) {
|
|
| 55 |
// Token injection is handled by apiFetch — no sessionStorage involved.
|
| 56 |
const data = await apiFetch('/api/v1/auth/verify', {
|
| 57 |
method: 'POST',
|
| 58 |
-
body:
|
| 59 |
});
|
| 60 |
return data;
|
| 61 |
}
|
|
@@ -63,7 +88,68 @@ async function apiVerifyPassword(password) {
|
|
| 63 |
async function apiVerifyAdmin(key) {
|
| 64 |
return apiFetch('/api/v1/auth/admin', {
|
| 65 |
method: 'POST',
|
| 66 |
-
body:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
});
|
| 68 |
}
|
| 69 |
|
|
@@ -75,14 +161,14 @@ async function apiLoadFiles() {
|
|
| 75 |
async function apiOverrideCategory(fileHash, newCategory) {
|
| 76 |
return apiFetch('/api/v1/corpus/recategorise', {
|
| 77 |
method: 'POST',
|
| 78 |
-
body:
|
| 79 |
});
|
| 80 |
}
|
| 81 |
|
| 82 |
async function apiRenameDocument(fileHash, newName) {
|
| 83 |
return apiFetch('/api/v1/corpus/rename', {
|
| 84 |
method: 'POST',
|
| 85 |
-
body:
|
| 86 |
});
|
| 87 |
}
|
| 88 |
|
|
@@ -90,6 +176,13 @@ async function apiDeleteDocument(fileHash) {
|
|
| 90 |
return apiFetch(`/api/v1/corpus/${fileHash}`, { method: 'DELETE' });
|
| 91 |
}
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
// ── Ingest ────────────────────────────────────────────────────────────────────
|
| 94 |
async function apiIngestFile(file) {
|
| 95 |
// multipart/form-data — cannot go through apiFetch (no JSON body),
|
|
@@ -100,15 +193,15 @@ async function apiIngestFile(file) {
|
|
| 100 |
formData.append('file', file);
|
| 101 |
|
| 102 |
const res = await fetch(`${CONFIG.API_URL}/api/v1/ingest/upload`, {
|
| 103 |
-
method:
|
| 104 |
headers: token ? { 'X-Auth-Token': token } : {},
|
| 105 |
-
body:
|
| 106 |
});
|
| 107 |
|
| 108 |
if (res.status === 409) throw new Error('already_ingested');
|
| 109 |
if (!res.ok) {
|
| 110 |
let detail = `HTTP ${res.status}`;
|
| 111 |
-
try { detail = (await res.json()).detail || detail; } catch {}
|
| 112 |
throw new Error(detail);
|
| 113 |
}
|
| 114 |
return res.json();
|
|
@@ -119,41 +212,42 @@ async function apiIngestStatus(taskId) {
|
|
| 119 |
}
|
| 120 |
|
| 121 |
// ── Query ─────────────────────────────────────────────────────────────────────
|
| 122 |
-
async function apiQuery(query, category, history, sessionId, alpha, callbacks) {
|
| 123 |
/**
|
| 124 |
* SSE streaming query.
|
| 125 |
* callbacks = {
|
| 126 |
* onToken(text) — called for each streamed token
|
| 127 |
-
* onDone(sources, images) — called when stream ends
|
| 128 |
* onError(msg) — called on error
|
| 129 |
* }
|
| 130 |
*/
|
| 131 |
const token = await getSupabaseToken(); // ← Supabase JWT
|
| 132 |
|
| 133 |
const res = await fetch(`${CONFIG.API_URL}/api/v1/query`, {
|
| 134 |
-
method:
|
| 135 |
headers: {
|
| 136 |
'Content-Type': 'application/json',
|
| 137 |
...(token ? { 'X-Auth-Token': token } : {}),
|
| 138 |
},
|
| 139 |
body: JSON.stringify({
|
| 140 |
query,
|
| 141 |
-
category:
|
| 142 |
-
history:
|
| 143 |
-
session_id: sessionId
|
| 144 |
-
alpha:
|
|
|
|
| 145 |
}),
|
| 146 |
});
|
| 147 |
|
| 148 |
if (!res.ok) {
|
| 149 |
let detail = `HTTP ${res.status}`;
|
| 150 |
-
try { detail = (await res.json()).detail || detail; } catch {}
|
| 151 |
throw new Error(detail);
|
| 152 |
}
|
| 153 |
|
| 154 |
-
const reader
|
| 155 |
const decoder = new TextDecoder();
|
| 156 |
-
let
|
| 157 |
|
| 158 |
while (true) {
|
| 159 |
const { done, value } = await reader.read();
|
|
@@ -169,10 +263,18 @@ async function apiQuery(query, category, history, sessionId, alpha, callbacks) {
|
|
| 169 |
if (!raw) continue;
|
| 170 |
try {
|
| 171 |
const event = JSON.parse(raw);
|
| 172 |
-
if
|
| 173 |
-
else if (event.type === 'done'
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
}
|
| 177 |
}
|
| 178 |
-
}
|
|
|
|
| 18 |
*/
|
| 19 |
async function getSupabaseToken() {
|
| 20 |
try {
|
| 21 |
+
const client = await initSupabase();
|
| 22 |
+
if (!client?.auth) return null;
|
| 23 |
+
const { data } = await client.auth.getSession();
|
| 24 |
return data.session?.access_token ?? null;
|
| 25 |
} catch {
|
| 26 |
return null;
|
| 27 |
}
|
| 28 |
}
|
| 29 |
|
| 30 |
+
async function getSupabaseSession() {
|
| 31 |
+
try {
|
| 32 |
+
const client = await initSupabase();
|
| 33 |
+
if (!client?.auth) return null;
|
| 34 |
+
const { data } = await client.auth.getSession();
|
| 35 |
+
return data.session ?? null;
|
| 36 |
+
} catch {
|
| 37 |
+
return null;
|
| 38 |
+
}
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
async function isGuestSession() {
|
| 42 |
+
const session = await getSupabaseSession();
|
| 43 |
+
const appMeta = session?.user?.app_metadata || {};
|
| 44 |
+
const provider = String(appMeta.provider || '').toLowerCase();
|
| 45 |
+
return Boolean(
|
| 46 |
+
session?.user?.is_anonymous ||
|
| 47 |
+
appMeta.is_anonymous ||
|
| 48 |
+
provider === 'anonymous' ||
|
| 49 |
+
(Array.isArray(appMeta.providers) && appMeta.providers.includes('anonymous'))
|
| 50 |
+
);
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
// ── Core fetch wrapper ────────────────────────────────────────────────────────
|
| 54 |
async function apiFetch(path, opts = {}) {
|
| 55 |
// Always pull a fresh token — Supabase auto-refreshes silently.
|
|
|
|
| 66 |
|
| 67 |
if (!res.ok) {
|
| 68 |
let detail = `HTTP ${res.status}`;
|
| 69 |
+
try { detail = (await res.json()).detail || detail; } catch { }
|
| 70 |
throw new Error(detail);
|
| 71 |
}
|
| 72 |
|
|
|
|
| 80 |
// Token injection is handled by apiFetch — no sessionStorage involved.
|
| 81 |
const data = await apiFetch('/api/v1/auth/verify', {
|
| 82 |
method: 'POST',
|
| 83 |
+
body: JSON.stringify({ password }),
|
| 84 |
});
|
| 85 |
return data;
|
| 86 |
}
|
|
|
|
| 88 |
async function apiVerifyAdmin(key) {
|
| 89 |
return apiFetch('/api/v1/auth/admin', {
|
| 90 |
method: 'POST',
|
| 91 |
+
body: JSON.stringify({ password: key }),
|
| 92 |
+
});
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
async function apiCleanupGuestWorkspace() {
|
| 96 |
+
return apiFetch('/api/v1/auth/guest-workspace', {
|
| 97 |
+
method: 'DELETE',
|
| 98 |
+
});
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
async function apiAdminFetch(path, adminKey, opts = {}) {
|
| 102 |
+
if (!adminKey) throw new Error('Admin key required.');
|
| 103 |
+
return apiFetch(path, {
|
| 104 |
+
...opts,
|
| 105 |
+
headers: {
|
| 106 |
+
'X-Admin-Key': adminKey,
|
| 107 |
+
...(opts.headers || {}),
|
| 108 |
+
},
|
| 109 |
+
});
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
async function apiAdminListTraces(adminKey, params = {}) {
|
| 113 |
+
const qs = new URLSearchParams();
|
| 114 |
+
Object.entries(params).forEach(([key, value]) => {
|
| 115 |
+
if (value !== null && value !== undefined && value !== '') qs.set(key, String(value));
|
| 116 |
+
});
|
| 117 |
+
return apiAdminFetch(`/api/v1/admin/traces${qs.toString() ? `?${qs}` : ''}`, adminKey);
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
async function apiAdminGetTrace(adminKey, traceId) {
|
| 121 |
+
return apiAdminFetch(`/api/v1/admin/traces/${traceId}`, adminKey);
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
async function apiAdminReviewTrace(adminKey, traceId, payload) {
|
| 125 |
+
return apiAdminFetch(`/api/v1/admin/traces/${traceId}/review`, adminKey, {
|
| 126 |
+
method: 'POST',
|
| 127 |
+
body: JSON.stringify(payload),
|
| 128 |
+
});
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
async function apiAdminListFeedback(adminKey, params = {}) {
|
| 132 |
+
const qs = new URLSearchParams();
|
| 133 |
+
Object.entries(params).forEach(([key, value]) => {
|
| 134 |
+
if (value !== null && value !== undefined && value !== '') qs.set(key, String(value));
|
| 135 |
+
});
|
| 136 |
+
return apiAdminFetch(`/api/v1/admin/feedback${qs.toString() ? `?${qs}` : ''}`, adminKey);
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
async function apiAdminGetFeedback(adminKey, feedbackId) {
|
| 140 |
+
return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}`, adminKey);
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
async function apiAdminReviewFeedback(adminKey, feedbackId, payload) {
|
| 144 |
+
return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}/review`, adminKey, {
|
| 145 |
+
method: 'POST',
|
| 146 |
+
body: JSON.stringify(payload),
|
| 147 |
+
});
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
async function apiAdminPromoteFeedback(adminKey, feedbackId) {
|
| 151 |
+
return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}/promote`, adminKey, {
|
| 152 |
+
method: 'POST',
|
| 153 |
});
|
| 154 |
}
|
| 155 |
|
|
|
|
| 161 |
async function apiOverrideCategory(fileHash, newCategory) {
|
| 162 |
return apiFetch('/api/v1/corpus/recategorise', {
|
| 163 |
method: 'POST',
|
| 164 |
+
body: JSON.stringify({ file_hash: fileHash, new_category: newCategory }),
|
| 165 |
});
|
| 166 |
}
|
| 167 |
|
| 168 |
async function apiRenameDocument(fileHash, newName) {
|
| 169 |
return apiFetch('/api/v1/corpus/rename', {
|
| 170 |
method: 'POST',
|
| 171 |
+
body: JSON.stringify({ file_hash: fileHash, new_name: newName }),
|
| 172 |
});
|
| 173 |
}
|
| 174 |
|
|
|
|
| 176 |
return apiFetch(`/api/v1/corpus/${fileHash}`, { method: 'DELETE' });
|
| 177 |
}
|
| 178 |
|
| 179 |
+
async function apiSubmitAnswerFeedback(payload) {
|
| 180 |
+
return apiFetch('/api/v1/query/feedback', {
|
| 181 |
+
method: 'POST',
|
| 182 |
+
body: JSON.stringify(payload),
|
| 183 |
+
});
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
// ── Ingest ────────────────────────────────────────────────────────────────────
|
| 187 |
async function apiIngestFile(file) {
|
| 188 |
// multipart/form-data — cannot go through apiFetch (no JSON body),
|
|
|
|
| 193 |
formData.append('file', file);
|
| 194 |
|
| 195 |
const res = await fetch(`${CONFIG.API_URL}/api/v1/ingest/upload`, {
|
| 196 |
+
method: 'POST',
|
| 197 |
headers: token ? { 'X-Auth-Token': token } : {},
|
| 198 |
+
body: formData,
|
| 199 |
});
|
| 200 |
|
| 201 |
if (res.status === 409) throw new Error('already_ingested');
|
| 202 |
if (!res.ok) {
|
| 203 |
let detail = `HTTP ${res.status}`;
|
| 204 |
+
try { detail = (await res.json()).detail || detail; } catch { }
|
| 205 |
throw new Error(detail);
|
| 206 |
}
|
| 207 |
return res.json();
|
|
|
|
| 212 |
}
|
| 213 |
|
| 214 |
// ── Query ─────────────────────────────────────────────────────────────────────
|
| 215 |
+
async function apiQuery(query, category, history, sessionId, alpha, callbacks, pinnedFiles) {
|
| 216 |
/**
|
| 217 |
* SSE streaming query.
|
| 218 |
* callbacks = {
|
| 219 |
* onToken(text) — called for each streamed token
|
| 220 |
+
* onDone({ sources, images, traceId, docDiagnostics }) — called when stream ends
|
| 221 |
* onError(msg) — called on error
|
| 222 |
* }
|
| 223 |
*/
|
| 224 |
const token = await getSupabaseToken(); // ← Supabase JWT
|
| 225 |
|
| 226 |
const res = await fetch(`${CONFIG.API_URL}/api/v1/query`, {
|
| 227 |
+
method: 'POST',
|
| 228 |
headers: {
|
| 229 |
'Content-Type': 'application/json',
|
| 230 |
...(token ? { 'X-Auth-Token': token } : {}),
|
| 231 |
},
|
| 232 |
body: JSON.stringify({
|
| 233 |
query,
|
| 234 |
+
category: category || 'All',
|
| 235 |
+
history: history || [],
|
| 236 |
+
session_id: sessionId || 'default_session',
|
| 237 |
+
alpha: alpha ?? 0.5,
|
| 238 |
+
priority_file_hashes: pinnedFiles || [],
|
| 239 |
}),
|
| 240 |
});
|
| 241 |
|
| 242 |
if (!res.ok) {
|
| 243 |
let detail = `HTTP ${res.status}`;
|
| 244 |
+
try { detail = (await res.json()).detail || detail; } catch { }
|
| 245 |
throw new Error(detail);
|
| 246 |
}
|
| 247 |
|
| 248 |
+
const reader = res.body.getReader();
|
| 249 |
const decoder = new TextDecoder();
|
| 250 |
+
let buffer = '';
|
| 251 |
|
| 252 |
while (true) {
|
| 253 |
const { done, value } = await reader.read();
|
|
|
|
| 263 |
if (!raw) continue;
|
| 264 |
try {
|
| 265 |
const event = JSON.parse(raw);
|
| 266 |
+
if (event.type === 'token' && callbacks?.onToken) callbacks.onToken(event.content);
|
| 267 |
+
else if (event.type === 'done' && callbacks?.onDone) {
|
| 268 |
+
callbacks.onDone({
|
| 269 |
+
sources: event.sources || [],
|
| 270 |
+
images: event.images || [],
|
| 271 |
+
traceId: event.trace_id || null,
|
| 272 |
+
docDiagnostics: event.doc_diagnostics || [],
|
| 273 |
+
});
|
| 274 |
+
}
|
| 275 |
+
else if (event.type === 'error' && callbacks?.onError) callbacks.onError(event.content);
|
| 276 |
+
else if (event.type === 'clarification_options' && callbacks?.onOptions) callbacks.onOptions(event.options);
|
| 277 |
+
} catch { }
|
| 278 |
}
|
| 279 |
}
|
| 280 |
+
}
|
frontend/js/chat.js
CHANGED
|
@@ -10,7 +10,7 @@
|
|
| 10 |
lb.style.cssText = `display:none;position:fixed;inset:0;background:rgba(0,0,0,0.88);
|
| 11 |
z-index:9998;align-items:center;justify-content:center;cursor:zoom-out;
|
| 12 |
backdrop-filter:blur(4px);`;
|
| 13 |
-
|
| 14 |
<button id="img-lightbox-close"
|
| 15 |
onclick="event.stopPropagation(); document.getElementById('img-lightbox').style.display='none'">
|
| 16 |
✕
|
|
@@ -34,14 +34,14 @@ function renderMarkdown(text) {
|
|
| 34 |
let inUL = false;
|
| 35 |
let inOL = false;
|
| 36 |
|
| 37 |
-
const closeUL = () => { if (inUL)
|
| 38 |
-
const closeOL = () => { if (inOL)
|
| 39 |
|
| 40 |
const inline = (str) => str
|
| 41 |
.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
|
| 42 |
.replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
|
| 43 |
-
.replace(/\*(.+?)\*/g,
|
| 44 |
-
.replace(/`([^`]+)`/g,
|
| 45 |
.replace(/\[Source (\d+)\]/g,
|
| 46 |
'<span class="source-ref">[S$1]</span>');
|
| 47 |
|
|
@@ -100,6 +100,74 @@ function renderMarkdown(text) {
|
|
| 100 |
return html;
|
| 101 |
}
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
// ── Chat core ─────────────────────────────────────────────
|
| 104 |
|
| 105 |
// Debounce guard — prevents double-submit on rapid Enter + button click
|
|
@@ -111,7 +179,7 @@ async function sendChat() {
|
|
| 111 |
_lastSendTime = now;
|
| 112 |
|
| 113 |
const input = document.getElementById('chatInput');
|
| 114 |
-
const msg
|
| 115 |
if (!msg || STATE.isThinking) return;
|
| 116 |
input.value = '';
|
| 117 |
autoResize(input);
|
|
@@ -122,15 +190,15 @@ async function sendChat() {
|
|
| 122 |
document.getElementById('chatSend').disabled = true;
|
| 123 |
|
| 124 |
const category = document.getElementById('chatFilterSelect').value;
|
| 125 |
-
const history
|
| 126 |
|
| 127 |
// Create assistant bubble immediately — will be filled by stream
|
| 128 |
const assistantDiv = appendMsg('assistant', '', [], []);
|
| 129 |
-
const bubble
|
| 130 |
-
bubble.innerHTML
|
| 131 |
|
| 132 |
-
let
|
| 133 |
-
let
|
| 134 |
|
| 135 |
try {
|
| 136 |
await apiQuery(msg, category, history, STATE.sessionId, STATE.alpha, {
|
|
@@ -142,11 +210,11 @@ async function sendChat() {
|
|
| 142 |
fullText += token;
|
| 143 |
bubble.innerHTML = renderMarkdown(fullText);
|
| 144 |
// Auto scroll
|
| 145 |
-
document.getElementById('chatMessages').scrollTop =
|
| 146 |
-
|
| 147 |
-
|
| 148 |
},
|
| 149 |
-
onDone(sources, images) {
|
| 150 |
// Finalize markdown render
|
| 151 |
bubble.innerHTML = renderMarkdown(fullText);
|
| 152 |
STATE.chatHistory.push({ role: 'assistant', content: fullText });
|
|
@@ -162,11 +230,11 @@ async function sendChat() {
|
|
| 162 |
|
| 163 |
// Append sources
|
| 164 |
if (visibleSources.length > 0) {
|
| 165 |
-
const n
|
| 166 |
const chips = visibleSources.map(s => {
|
| 167 |
-
const score
|
| 168 |
const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
|
| 169 |
-
const cls
|
| 170 |
return `<div class="source-chip ${cls}">
|
| 171 |
<div class="source-chip-header">
|
| 172 |
<span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
|
|
@@ -180,12 +248,18 @@ async function sendChat() {
|
|
| 180 |
<button class="sources-toggle" onclick="
|
| 181 |
const p=this.nextElementSibling;
|
| 182 |
const open=p.classList.toggle('open');
|
| 183 |
-
this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n>1?'s':''}';
|
| 184 |
">▼ show ${n} source${n > 1 ? 's' : ''}</button>
|
| 185 |
<div class="sources-panel">${chips}</div>`;
|
| 186 |
assistantDiv.appendChild(srcEl);
|
| 187 |
}
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
// Append images
|
| 190 |
if (images.length > 0) {
|
| 191 |
const uniqueImages = [...new Set(images)];
|
|
@@ -199,6 +273,75 @@ async function sendChat() {
|
|
| 199 |
assistantDiv.appendChild(imgEl);
|
| 200 |
}
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
const el = document.getElementById('chatMessages');
|
| 203 |
el.scrollTop = el.scrollHeight;
|
| 204 |
},
|
|
@@ -215,7 +358,7 @@ async function sendChat() {
|
|
| 215 |
}
|
| 216 |
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
|
| 217 |
},
|
| 218 |
-
});
|
| 219 |
} catch (e) {
|
| 220 |
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">Request failed: ${esc(e.message)}</p>`;
|
| 221 |
} finally {
|
|
@@ -225,7 +368,7 @@ async function sendChat() {
|
|
| 225 |
}
|
| 226 |
|
| 227 |
function appendMsg(role, text, sources = [], images = []) {
|
| 228 |
-
const el
|
| 229 |
const div = document.createElement('div');
|
| 230 |
div.className = `msg ${role}`;
|
| 231 |
const n = sources.length;
|
|
@@ -237,11 +380,11 @@ function appendMsg(role, text, sources = [], images = []) {
|
|
| 237 |
imgHtml = `
|
| 238 |
<div style="display:flex; flex-direction:row; gap:10px; margin-top:12px; width:100%; overflow-x:auto; padding-bottom:8px;">
|
| 239 |
${uniqueImages.map(img => {
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
</div>`;
|
| 246 |
}
|
| 247 |
|
|
@@ -249,9 +392,9 @@ function appendMsg(role, text, sources = [], images = []) {
|
|
| 249 |
let srcHtml = '';
|
| 250 |
if (n > 0) {
|
| 251 |
const chips = sources.map(s => {
|
| 252 |
-
const score
|
| 253 |
const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
|
| 254 |
-
const cls
|
| 255 |
return `<div class="source-chip ${cls}">
|
| 256 |
<div class="source-chip-header">
|
| 257 |
<span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
|
|
@@ -264,7 +407,7 @@ function appendMsg(role, text, sources = [], images = []) {
|
|
| 264 |
<button class="sources-toggle" onclick="
|
| 265 |
const p=this.nextElementSibling;
|
| 266 |
const open=p.classList.toggle('open');
|
| 267 |
-
this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n>1?'s':''}';
|
| 268 |
">▼ show ${n} source${n > 1 ? 's' : ''}</button>
|
| 269 |
<div class="sources-panel">${chips}</div>`;
|
| 270 |
}
|
|
@@ -284,7 +427,7 @@ function appendMsg(role, text, sources = [], images = []) {
|
|
| 284 |
}
|
| 285 |
|
| 286 |
function appendThinking() {
|
| 287 |
-
const el
|
| 288 |
const div = document.createElement('div');
|
| 289 |
div.className = 'msg assistant';
|
| 290 |
div.innerHTML = `
|
|
|
|
| 10 |
lb.style.cssText = `display:none;position:fixed;inset:0;background:rgba(0,0,0,0.88);
|
| 11 |
z-index:9998;align-items:center;justify-content:center;cursor:zoom-out;
|
| 12 |
backdrop-filter:blur(4px);`;
|
| 13 |
+
lb.innerHTML = `
|
| 14 |
<button id="img-lightbox-close"
|
| 15 |
onclick="event.stopPropagation(); document.getElementById('img-lightbox').style.display='none'">
|
| 16 |
✕
|
|
|
|
| 34 |
let inUL = false;
|
| 35 |
let inOL = false;
|
| 36 |
|
| 37 |
+
const closeUL = () => { if (inUL) { html += '</ul>'; inUL = false; } };
|
| 38 |
+
const closeOL = () => { if (inOL) { html += '</ol>'; inOL = false; } };
|
| 39 |
|
| 40 |
const inline = (str) => str
|
| 41 |
.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
|
| 42 |
.replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
|
| 43 |
+
.replace(/\*(.+?)\*/g, '<em>$1</em>')
|
| 44 |
+
.replace(/`([^`]+)`/g, '<code class="inline-code">$1</code>')
|
| 45 |
.replace(/\[Source (\d+)\]/g,
|
| 46 |
'<span class="source-ref">[S$1]</span>');
|
| 47 |
|
|
|
|
| 100 |
return html;
|
| 101 |
}
|
| 102 |
|
| 103 |
+
function renderDocDiagnostics(docDiagnostics) {
|
| 104 |
+
if (!Array.isArray(docDiagnostics) || docDiagnostics.length === 0) return '';
|
| 105 |
+
const rows = docDiagnostics.map(diag => {
|
| 106 |
+
const score = diag.doc_score != null ? `${Math.round(diag.doc_score * 100)}%` : 'n/a';
|
| 107 |
+
const reason = diag.reason || 'unknown';
|
| 108 |
+
const status = diag.included ? 'included' : 'excluded';
|
| 109 |
+
return `
|
| 110 |
+
<div style="display:flex;justify-content:space-between;gap:12px;padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
|
| 111 |
+
<div>
|
| 112 |
+
<div style="font-weight:600;color:#dbeafe">${esc(diag.source || diag.file_hash || 'Unknown')}</div>
|
| 113 |
+
<div style="font-size:0.85em;color:#94a3b8">${esc(status)} · ${esc(reason)} · candidates ${Number(diag.candidate_count ?? 0)}</div>
|
| 114 |
+
</div>
|
| 115 |
+
<div style="font-size:0.85em;color:#cbd5e1;white-space:nowrap">${esc(diag.confidence_label || 'unknown')} · ${esc(score)}</div>
|
| 116 |
+
</div>
|
| 117 |
+
`;
|
| 118 |
+
}).join('');
|
| 119 |
+
return `
|
| 120 |
+
<div style="margin-top:12px;padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);">
|
| 121 |
+
<div style="font-size:0.8em;letter-spacing:0.14em;text-transform:uppercase;color:#7dd3fc;">Retrieval Diagnostics</div>
|
| 122 |
+
${rows}
|
| 123 |
+
</div>
|
| 124 |
+
`;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
function attachFeedbackControls(container, traceId) {
|
| 128 |
+
if (!traceId) return;
|
| 129 |
+
const bar = document.createElement('div');
|
| 130 |
+
bar.style.cssText = 'display:flex;flex-wrap:wrap;gap:8px;margin-top:12px;';
|
| 131 |
+
|
| 132 |
+
const disableAll = () => {
|
| 133 |
+
Array.from(bar.querySelectorAll('button')).forEach(btn => { btn.disabled = true; btn.style.opacity = '0.65'; });
|
| 134 |
+
};
|
| 135 |
+
|
| 136 |
+
const makeBtn = (label, handler) => {
|
| 137 |
+
const btn = document.createElement('button');
|
| 138 |
+
btn.textContent = label;
|
| 139 |
+
btn.style.cssText = 'background:rgba(255,255,255,0.05);border:1px solid #334155;color:var(--fg);padding:7px 12px;border-radius:8px;font-size:0.85em;cursor:pointer;';
|
| 140 |
+
btn.onclick = async () => {
|
| 141 |
+
try {
|
| 142 |
+
await handler();
|
| 143 |
+
disableAll();
|
| 144 |
+
toast('Feedback saved.', 'success');
|
| 145 |
+
} catch (err) {
|
| 146 |
+
toast(err?.message || 'Could not save feedback.', 'error');
|
| 147 |
+
}
|
| 148 |
+
};
|
| 149 |
+
return btn;
|
| 150 |
+
};
|
| 151 |
+
|
| 152 |
+
bar.appendChild(makeBtn('Helpful', async () => {
|
| 153 |
+
await apiSubmitAnswerFeedback({ trace_id: traceId, helpful: true });
|
| 154 |
+
}));
|
| 155 |
+
bar.appendChild(makeBtn('Not Helpful', async () => {
|
| 156 |
+
const note = window.prompt('What went wrong? You can add a short reason or a correction.', '') || '';
|
| 157 |
+
await apiSubmitAnswerFeedback({
|
| 158 |
+
trace_id: traceId,
|
| 159 |
+
helpful: false,
|
| 160 |
+
reason_code: note ? 'user_reported_issue' : 'needs_improvement',
|
| 161 |
+
correction_text: note || null,
|
| 162 |
+
});
|
| 163 |
+
}));
|
| 164 |
+
bar.appendChild(makeBtn('Save Answer', async () => {
|
| 165 |
+
await apiSubmitAnswerFeedback({ trace_id: traceId, helpful: true, accepted: true });
|
| 166 |
+
}));
|
| 167 |
+
|
| 168 |
+
container.appendChild(bar);
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
// ── Chat core ─────────────────────────────────────────────
|
| 172 |
|
| 173 |
// Debounce guard — prevents double-submit on rapid Enter + button click
|
|
|
|
| 179 |
_lastSendTime = now;
|
| 180 |
|
| 181 |
const input = document.getElementById('chatInput');
|
| 182 |
+
const msg = input.value.trim();
|
| 183 |
if (!msg || STATE.isThinking) return;
|
| 184 |
input.value = '';
|
| 185 |
autoResize(input);
|
|
|
|
| 190 |
document.getElementById('chatSend').disabled = true;
|
| 191 |
|
| 192 |
const category = document.getElementById('chatFilterSelect').value;
|
| 193 |
+
const history = STATE.chatHistory.slice(-CONFIG.CHAT_HISTORY_TURNS);
|
| 194 |
|
| 195 |
// Create assistant bubble immediately — will be filled by stream
|
| 196 |
const assistantDiv = appendMsg('assistant', '', [], []);
|
| 197 |
+
const bubble = assistantDiv.querySelector('.msg-bubble');
|
| 198 |
+
bubble.innerHTML = '<div class="thinking-dots"><span></span><span></span><span></span></div>';
|
| 199 |
|
| 200 |
+
let fullText = '';
|
| 201 |
+
let started = false;
|
| 202 |
|
| 203 |
try {
|
| 204 |
await apiQuery(msg, category, history, STATE.sessionId, STATE.alpha, {
|
|
|
|
| 210 |
fullText += token;
|
| 211 |
bubble.innerHTML = renderMarkdown(fullText);
|
| 212 |
// Auto scroll
|
| 213 |
+
document.getElementById('chatMessages').scrollTop =
|
| 214 |
+
document.getElementById('chatMessages').scrollHeight;
|
| 215 |
+
await new Promise(r => setTimeout(r, 0));
|
| 216 |
},
|
| 217 |
+
onDone({ sources, images, traceId, docDiagnostics }) {
|
| 218 |
// Finalize markdown render
|
| 219 |
bubble.innerHTML = renderMarkdown(fullText);
|
| 220 |
STATE.chatHistory.push({ role: 'assistant', content: fullText });
|
|
|
|
| 230 |
|
| 231 |
// Append sources
|
| 232 |
if (visibleSources.length > 0) {
|
| 233 |
+
const n = visibleSources.length;
|
| 234 |
const chips = visibleSources.map(s => {
|
| 235 |
+
const score = s.score != null ? Math.round(s.score * 100) : null;
|
| 236 |
const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
|
| 237 |
+
const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
|
| 238 |
return `<div class="source-chip ${cls}">
|
| 239 |
<div class="source-chip-header">
|
| 240 |
<span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
|
|
|
|
| 248 |
<button class="sources-toggle" onclick="
|
| 249 |
const p=this.nextElementSibling;
|
| 250 |
const open=p.classList.toggle('open');
|
| 251 |
+
this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n > 1 ? 's' : ''}';
|
| 252 |
">▼ show ${n} source${n > 1 ? 's' : ''}</button>
|
| 253 |
<div class="sources-panel">${chips}</div>`;
|
| 254 |
assistantDiv.appendChild(srcEl);
|
| 255 |
}
|
| 256 |
|
| 257 |
+
if (docDiagnostics && docDiagnostics.length > 0) {
|
| 258 |
+
const diagEl = document.createElement('div');
|
| 259 |
+
diagEl.innerHTML = renderDocDiagnostics(docDiagnostics);
|
| 260 |
+
assistantDiv.appendChild(diagEl);
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
// Append images
|
| 264 |
if (images.length > 0) {
|
| 265 |
const uniqueImages = [...new Set(images)];
|
|
|
|
| 273 |
assistantDiv.appendChild(imgEl);
|
| 274 |
}
|
| 275 |
|
| 276 |
+
attachFeedbackControls(assistantDiv, traceId);
|
| 277 |
+
|
| 278 |
+
const el = document.getElementById('chatMessages');
|
| 279 |
+
el.scrollTop = el.scrollHeight;
|
| 280 |
+
},
|
| 281 |
+
onOptions(options) {
|
| 282 |
+
// Render inline choice buttons
|
| 283 |
+
const btnContainer = document.createElement('div');
|
| 284 |
+
btnContainer.style.cssText = 'display:flex;flex-direction:row;flex-wrap:wrap;gap:8px;margin-top:12px;';
|
| 285 |
+
|
| 286 |
+
const syncGraphPinStyles = () => {
|
| 287 |
+
const d3 = window.d3;
|
| 288 |
+
if (!d3) return;
|
| 289 |
+
|
| 290 |
+
d3.selectAll('.node')
|
| 291 |
+
.filter(d => d && d.type === 'document')
|
| 292 |
+
.select('circle')
|
| 293 |
+
.attr('stroke', d => STATE.pinnedFiles.includes(d.file_hash) ? '#ffffff' : d.color)
|
| 294 |
+
.attr('stroke-width', d => STATE.pinnedFiles.includes(d.file_hash) ? 3 : 1.5)
|
| 295 |
+
.attr('filter', d => {
|
| 296 |
+
if (!STATE.pinnedFiles.includes(d.file_hash)) return null;
|
| 297 |
+
const idx = STATE.categories.indexOf(d.category);
|
| 298 |
+
return idx >= 0 ? `url(#glow-${idx})` : null;
|
| 299 |
+
});
|
| 300 |
+
};
|
| 301 |
+
|
| 302 |
+
options.forEach(opt => {
|
| 303 |
+
const btn = document.createElement('button');
|
| 304 |
+
btn.textContent = opt.label;
|
| 305 |
+
btn.style.cssText = `
|
| 306 |
+
background: rgba(255, 255, 255, 0.05);
|
| 307 |
+
border: 1px solid #334155;
|
| 308 |
+
color: var(--fg);
|
| 309 |
+
padding: 8px 16px;
|
| 310 |
+
border-radius: 6px;
|
| 311 |
+
font-size: 0.9em;
|
| 312 |
+
cursor: pointer;
|
| 313 |
+
transition: all 0.2s;
|
| 314 |
+
`;
|
| 315 |
+
btn.onmouseover = () => {
|
| 316 |
+
btn.style.background = 'rgba(255, 255, 255, 0.1)';
|
| 317 |
+
btn.style.borderColor = 'var(--text-glow)';
|
| 318 |
+
};
|
| 319 |
+
btn.onmouseout = () => {
|
| 320 |
+
btn.style.background = 'rgba(255, 255, 255, 0.05)';
|
| 321 |
+
btn.style.borderColor = '#334155';
|
| 322 |
+
};
|
| 323 |
+
|
| 324 |
+
btn.onclick = () => {
|
| 325 |
+
// 1) Apply selected routing scope (single-doc or multi-doc)
|
| 326 |
+
const selectedHashes = opt.mode === 'all'
|
| 327 |
+
? (Array.isArray(opt.file_hashes) ? opt.file_hashes.filter(Boolean) : [])
|
| 328 |
+
: (opt.file_hash ? [opt.file_hash] : []);
|
| 329 |
+
STATE.pinnedFiles = [...new Set(selectedHashes)];
|
| 330 |
+
syncGraphPinStyles();
|
| 331 |
+
|
| 332 |
+
// 2. Hide the buttons
|
| 333 |
+
btnContainer.style.display = 'none';
|
| 334 |
+
|
| 335 |
+
// 3. Resubmit the query now that it has a pin
|
| 336 |
+
const input = document.getElementById('chatInput');
|
| 337 |
+
input.value = msg; // original msg
|
| 338 |
+
document.getElementById('chatSend').click();
|
| 339 |
+
};
|
| 340 |
+
btnContainer.appendChild(btn);
|
| 341 |
+
});
|
| 342 |
+
|
| 343 |
+
assistantDiv.appendChild(btnContainer);
|
| 344 |
+
|
| 345 |
const el = document.getElementById('chatMessages');
|
| 346 |
el.scrollTop = el.scrollHeight;
|
| 347 |
},
|
|
|
|
| 358 |
}
|
| 359 |
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
|
| 360 |
},
|
| 361 |
+
}, STATE.pinnedFiles);
|
| 362 |
} catch (e) {
|
| 363 |
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">Request failed: ${esc(e.message)}</p>`;
|
| 364 |
} finally {
|
|
|
|
| 368 |
}
|
| 369 |
|
| 370 |
function appendMsg(role, text, sources = [], images = []) {
|
| 371 |
+
const el = document.getElementById('chatMessages');
|
| 372 |
const div = document.createElement('div');
|
| 373 |
div.className = `msg ${role}`;
|
| 374 |
const n = sources.length;
|
|
|
|
| 380 |
imgHtml = `
|
| 381 |
<div style="display:flex; flex-direction:row; gap:10px; margin-top:12px; width:100%; overflow-x:auto; padding-bottom:8px;">
|
| 382 |
${uniqueImages.map(img => {
|
| 383 |
+
const src = img.startsWith('data:') || img.startsWith('http')
|
| 384 |
+
? img
|
| 385 |
+
: `data:image/jpeg;base64,${img}`;
|
| 386 |
+
return `<img src="${src}" style="max-height: 220px; max-width: 100%; object-fit: contain; border-radius: 8px; background: white; border: 1px solid #334155; cursor: zoom-in;" onclick="openLightbox(this.src)">`;
|
| 387 |
+
}).join('')}
|
| 388 |
</div>`;
|
| 389 |
}
|
| 390 |
|
|
|
|
| 392 |
let srcHtml = '';
|
| 393 |
if (n > 0) {
|
| 394 |
const chips = sources.map(s => {
|
| 395 |
+
const score = s.score != null ? Math.round(s.score * 100) : null;
|
| 396 |
const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
|
| 397 |
+
const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
|
| 398 |
return `<div class="source-chip ${cls}">
|
| 399 |
<div class="source-chip-header">
|
| 400 |
<span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
|
|
|
|
| 407 |
<button class="sources-toggle" onclick="
|
| 408 |
const p=this.nextElementSibling;
|
| 409 |
const open=p.classList.toggle('open');
|
| 410 |
+
this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n > 1 ? 's' : ''}';
|
| 411 |
">▼ show ${n} source${n > 1 ? 's' : ''}</button>
|
| 412 |
<div class="sources-panel">${chips}</div>`;
|
| 413 |
}
|
|
|
|
| 427 |
}
|
| 428 |
|
| 429 |
function appendThinking() {
|
| 430 |
+
const el = document.getElementById('chatMessages');
|
| 431 |
const div = document.createElement('div');
|
| 432 |
div.className = 'msg assistant';
|
| 433 |
div.innerHTML = `
|
frontend/js/config.js
CHANGED
|
@@ -2,14 +2,48 @@ const CONFIG = {
|
|
| 2 |
API_URL: '',
|
| 3 |
CAT_PALETTE: ['#00ff88','#4a9eff','#f5a623','#ff6b9d','#a78bfa','#34d399','#fb923c','#60a5fa'],
|
| 4 |
CHAT_HISTORY_TURNS: 6,
|
|
|
|
| 5 |
};
|
| 6 |
|
| 7 |
// Supabase client — keys loaded from backend, never hardcoded here
|
| 8 |
let supabaseClient = null;
|
|
|
|
| 9 |
|
| 10 |
async function initSupabase() {
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
API_URL: '',
|
| 3 |
CAT_PALETTE: ['#00ff88','#4a9eff','#f5a623','#ff6b9d','#a78bfa','#34d399','#fb923c','#60a5fa'],
|
| 4 |
CHAT_HISTORY_TURNS: 6,
|
| 5 |
+
GUEST_ENABLED: true,
|
| 6 |
};
|
| 7 |
|
| 8 |
// Supabase client — keys loaded from backend, never hardcoded here
|
| 9 |
let supabaseClient = null;
|
| 10 |
+
let supabaseReady = null;
|
| 11 |
|
| 12 |
async function initSupabase() {
|
| 13 |
+
if (supabaseClient?.auth) return supabaseClient;
|
| 14 |
+
if (supabaseReady) return supabaseReady;
|
| 15 |
+
|
| 16 |
+
supabaseReady = (async () => {
|
| 17 |
+
try {
|
| 18 |
+
const res = await fetch('/api/v1/config', { cache: 'no-store' });
|
| 19 |
+
if (!res.ok) {
|
| 20 |
+
throw new Error(`Config endpoint failed (${res.status})`);
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
const cfg = await res.json();
|
| 24 |
+
const createClient = window.supabase?.createClient;
|
| 25 |
+
if (typeof createClient !== 'function') {
|
| 26 |
+
throw new Error('Supabase browser SDK failed to load.');
|
| 27 |
+
}
|
| 28 |
+
if (!cfg?.supabase_url || !cfg?.supabase_anon) {
|
| 29 |
+
throw new Error('Supabase frontend config is missing.');
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
CONFIG.GUEST_ENABLED = cfg?.guest_enabled !== false;
|
| 33 |
+
const client = createClient(cfg.supabase_url, cfg.supabase_anon);
|
| 34 |
+
if (!client?.auth) {
|
| 35 |
+
throw new Error('Supabase auth client failed to initialize.');
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
supabaseClient = client;
|
| 39 |
+
window.supabaseClient = client;
|
| 40 |
+
return client;
|
| 41 |
+
} catch (err) {
|
| 42 |
+
supabaseClient = null;
|
| 43 |
+
supabaseReady = null;
|
| 44 |
+
throw err;
|
| 45 |
+
}
|
| 46 |
+
})();
|
| 47 |
+
|
| 48 |
+
return supabaseReady;
|
| 49 |
+
}
|
frontend/js/corpus.js
CHANGED
|
@@ -3,6 +3,30 @@
|
|
| 3 |
* Document list, upload (real FastAPI call), category review.
|
| 4 |
*/
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
// ── Doc list ──────────────────────────────────────────────────────────────────
|
| 7 |
function renderDocList() {
|
| 8 |
const el = document.getElementById('docList');
|
|
@@ -81,6 +105,7 @@ async function processUpload(file) {
|
|
| 81 |
try {
|
| 82 |
const queued = await apiIngestFile(file);
|
| 83 |
// queued = {task_id, filename, message}
|
|
|
|
| 84 |
|
| 85 |
setProgress(20, 'Queued — processing in background…');
|
| 86 |
|
|
@@ -92,20 +117,71 @@ async function processUpload(file) {
|
|
| 92 |
|
| 93 |
setProgress(100, 'Complete!');
|
| 94 |
setTimeout(() => pc.classList.remove('visible'), 1500);
|
|
|
|
| 95 |
|
| 96 |
-
if (result && result.
|
|
|
|
|
|
|
| 97 |
showCategoryReview(result.file_hash, result.filename, result.document_type);
|
| 98 |
}
|
| 99 |
await refreshCorpus();
|
| 100 |
|
| 101 |
} catch (err) {
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
| 103 |
if (err.message === 'already_ingested') toast('Already ingested — skipped', 'error');
|
| 104 |
else toast('Ingestion failed: ' + err.message, 'error');
|
| 105 |
}
|
| 106 |
document.getElementById('fileInput').value = '';
|
| 107 |
}
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
async function pollIngestStatus(taskId, onProgress) {
|
| 110 |
// No hard timeout — poll until COMPLETED or FAILED.
|
| 111 |
// A large PDF with AI vision summaries can take 5-10 minutes on free-tier
|
|
@@ -218,4 +294,4 @@ function populateFilterDropdowns() {
|
|
| 218 |
const sel = document.getElementById('chatFilterSelect');
|
| 219 |
sel.innerHTML = '<option value="All">All Categories</option>' +
|
| 220 |
STATE.categories.map(c => `<option value="${c}">${c.replace(/_/g,' ')}</option>`).join('');
|
| 221 |
-
}
|
|
|
|
| 3 |
* Document list, upload (real FastAPI call), category review.
|
| 4 |
*/
|
| 5 |
|
| 6 |
+
const ACTIVE_INGEST_KEY = 'morpheus_active_ingest';
|
| 7 |
+
let ACTIVE_INGEST_PROMISE = null;
|
| 8 |
+
|
| 9 |
+
function saveActiveIngest(taskId, filename) {
|
| 10 |
+
localStorage.setItem(ACTIVE_INGEST_KEY, JSON.stringify({
|
| 11 |
+
taskId,
|
| 12 |
+
filename,
|
| 13 |
+
savedAt: Date.now(),
|
| 14 |
+
}));
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
function loadActiveIngest() {
|
| 18 |
+
try {
|
| 19 |
+
const raw = localStorage.getItem(ACTIVE_INGEST_KEY);
|
| 20 |
+
return raw ? JSON.parse(raw) : null;
|
| 21 |
+
} catch {
|
| 22 |
+
return null;
|
| 23 |
+
}
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
function clearActiveIngest() {
|
| 27 |
+
localStorage.removeItem(ACTIVE_INGEST_KEY);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
// ── Doc list ──────────────────────────────────────────────────────────────────
|
| 31 |
function renderDocList() {
|
| 32 |
const el = document.getElementById('docList');
|
|
|
|
| 105 |
try {
|
| 106 |
const queued = await apiIngestFile(file);
|
| 107 |
// queued = {task_id, filename, message}
|
| 108 |
+
saveActiveIngest(queued.task_id, queued.filename || file.name);
|
| 109 |
|
| 110 |
setProgress(20, 'Queued — processing in background…');
|
| 111 |
|
|
|
|
| 117 |
|
| 118 |
setProgress(100, 'Complete!');
|
| 119 |
setTimeout(() => pc.classList.remove('visible'), 1500);
|
| 120 |
+
clearActiveIngest();
|
| 121 |
|
| 122 |
+
if (result && result.recovered_existing) {
|
| 123 |
+
toast('Recovered previous upload without recomputing.', 'success');
|
| 124 |
+
} else if (result && result.file_hash) {
|
| 125 |
showCategoryReview(result.file_hash, result.filename, result.document_type);
|
| 126 |
}
|
| 127 |
await refreshCorpus();
|
| 128 |
|
| 129 |
} catch (err) {
|
| 130 |
+
if (err.message === 'already_ingested' || err.message === 'Ingestion failed') {
|
| 131 |
+
clearActiveIngest();
|
| 132 |
+
pc.classList.remove('visible');
|
| 133 |
+
}
|
| 134 |
if (err.message === 'already_ingested') toast('Already ingested — skipped', 'error');
|
| 135 |
else toast('Ingestion failed: ' + err.message, 'error');
|
| 136 |
}
|
| 137 |
document.getElementById('fileInput').value = '';
|
| 138 |
}
|
| 139 |
|
| 140 |
+
async function resumeActiveIngestionIfNeeded() {
|
| 141 |
+
if (ACTIVE_INGEST_PROMISE) return ACTIVE_INGEST_PROMISE;
|
| 142 |
+
const active = loadActiveIngest();
|
| 143 |
+
if (!active || !active.taskId) return null;
|
| 144 |
+
|
| 145 |
+
const pc = document.getElementById('progressCard');
|
| 146 |
+
pc.classList.add('visible');
|
| 147 |
+
document.getElementById('progressFilename').textContent = active.filename || 'Uploading PDF';
|
| 148 |
+
setProgress(25, 'Reconnecting to active ingestion…');
|
| 149 |
+
|
| 150 |
+
ACTIVE_INGEST_PROMISE = (async () => {
|
| 151 |
+
try {
|
| 152 |
+
const result = await pollIngestStatus(active.taskId, (step, total, msg) => {
|
| 153 |
+
const pct = Math.round((step / total) * 80) + 20;
|
| 154 |
+
setProgress(pct, msg);
|
| 155 |
+
});
|
| 156 |
+
|
| 157 |
+
clearActiveIngest();
|
| 158 |
+
setProgress(100, 'Complete!');
|
| 159 |
+
setTimeout(() => pc.classList.remove('visible'), 1500);
|
| 160 |
+
|
| 161 |
+
if (result && result.recovered_existing) {
|
| 162 |
+
toast('Recovered previous upload without recomputing.', 'success');
|
| 163 |
+
} else if (result && result.file_hash) {
|
| 164 |
+
showCategoryReview(result.file_hash, result.filename, result.document_type);
|
| 165 |
+
}
|
| 166 |
+
await refreshCorpus();
|
| 167 |
+
return result;
|
| 168 |
+
} catch (err) {
|
| 169 |
+
if (err.message === 'already_ingested' || err.message === 'Ingestion failed') {
|
| 170 |
+
clearActiveIngest();
|
| 171 |
+
pc.classList.remove('visible');
|
| 172 |
+
}
|
| 173 |
+
if (err.message === 'already_ingested') {
|
| 174 |
+
await refreshCorpus();
|
| 175 |
+
}
|
| 176 |
+
throw err;
|
| 177 |
+
} finally {
|
| 178 |
+
ACTIVE_INGEST_PROMISE = null;
|
| 179 |
+
}
|
| 180 |
+
})();
|
| 181 |
+
|
| 182 |
+
return ACTIVE_INGEST_PROMISE;
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
async function pollIngestStatus(taskId, onProgress) {
|
| 186 |
// No hard timeout — poll until COMPLETED or FAILED.
|
| 187 |
// A large PDF with AI vision summaries can take 5-10 minutes on free-tier
|
|
|
|
| 294 |
const sel = document.getElementById('chatFilterSelect');
|
| 295 |
sel.innerHTML = '<option value="All">All Categories</option>' +
|
| 296 |
STATE.categories.map(c => `<option value="${c}">${c.replace(/_/g,' ')}</option>`).join('');
|
| 297 |
+
}
|
frontend/js/graph.js
CHANGED
|
@@ -17,10 +17,10 @@
|
|
| 17 |
*/
|
| 18 |
|
| 19 |
function renderGraph() {
|
| 20 |
-
const svg
|
| 21 |
const panel = document.getElementById('graph-panel');
|
| 22 |
-
const W
|
| 23 |
-
const H
|
| 24 |
const empty = document.getElementById('graph-empty');
|
| 25 |
|
| 26 |
svg.selectAll('*').remove();
|
|
@@ -37,12 +37,12 @@ function renderGraph() {
|
|
| 37 |
|
| 38 |
STATE.categories.forEach(cat => {
|
| 39 |
nodes.push({
|
| 40 |
-
id:
|
| 41 |
-
type:
|
| 42 |
label: cat.replace(/_/g, ' '),
|
| 43 |
-
raw:
|
| 44 |
color: STATE.catColors[cat],
|
| 45 |
-
r:
|
| 46 |
pinned: false,
|
| 47 |
count: STATE.files.filter(f => (f.document_type || 'uncategorised') === cat).length,
|
| 48 |
});
|
|
@@ -51,26 +51,26 @@ function renderGraph() {
|
|
| 51 |
STATE.files.forEach(f => {
|
| 52 |
const cat = f.document_type || 'uncategorised';
|
| 53 |
nodes.push({
|
| 54 |
-
id:
|
| 55 |
-
type:
|
| 56 |
-
label:
|
| 57 |
file_hash: f.file_hash,
|
| 58 |
-
category:
|
| 59 |
-
color:
|
| 60 |
-
r:
|
| 61 |
-
pinned:
|
| 62 |
-
chunks:
|
| 63 |
-
ingested:
|
| 64 |
});
|
| 65 |
links.push({ source: `cat::${cat}`, target: `doc::${f.file_hash}` });
|
| 66 |
});
|
| 67 |
|
| 68 |
// ── Zoom + pan ─────────────────────────────────────────
|
| 69 |
-
const g
|
| 70 |
const zoom = d3.zoom()
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
svg.call(zoom).on('dblclick.zoom', null);
|
| 75 |
STATE.svgZoom = { zoom, svg };
|
| 76 |
|
|
@@ -102,29 +102,56 @@ function renderGraph() {
|
|
| 102 |
.style('cursor', 'pointer')
|
| 103 |
.call(d3.drag()
|
| 104 |
.on('start', (e, d) => {
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
})
|
| 109 |
.on('drag', (e, d) => {
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
})
|
| 115 |
.on('end', (e, d) => {
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
})
|
| 125 |
)
|
| 126 |
.on('click', (event, d) => {
|
| 127 |
event.stopPropagation();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
onNodeClick(d);
|
| 129 |
})
|
| 130 |
.on('contextmenu', (event, d) => {
|
|
@@ -149,7 +176,7 @@ function renderGraph() {
|
|
| 149 |
node.filter(d => d.type === 'category')
|
| 150 |
.append('circle')
|
| 151 |
.attr('r', 26)
|
| 152 |
-
.attr('fill',
|
| 153 |
.attr('stroke', d => d.color)
|
| 154 |
.attr('stroke-width', 2)
|
| 155 |
.attr('filter', d => {
|
|
@@ -170,9 +197,14 @@ function renderGraph() {
|
|
| 170 |
node.filter(d => d.type === 'document')
|
| 171 |
.append('circle')
|
| 172 |
.attr('r', 7)
|
| 173 |
-
.attr('fill',
|
| 174 |
-
.attr('stroke', d => d.color)
|
| 175 |
-
.attr('stroke-width', 1.5)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
// Labels
|
| 178 |
node.append('text')
|
|
@@ -180,14 +212,14 @@ function renderGraph() {
|
|
| 180 |
.attr('dy', d => d.type === 'category' ? -32 : -12)
|
| 181 |
.attr('text-anchor', 'middle')
|
| 182 |
.attr('fill', d => d.type === 'category' ? d.color : 'rgba(200,216,244,0.7)')
|
| 183 |
-
.attr('font-size',
|
| 184 |
.attr('font-family', 'Syne Mono, monospace')
|
| 185 |
.attr('font-weight', d => d.type === 'category' ? '600' : '400')
|
| 186 |
.text(d => trunc(d.label, d.type === 'category' ? 18 : 16))
|
| 187 |
.style('pointer-events', 'none')
|
| 188 |
.style('user-select', 'none');
|
| 189 |
|
| 190 |
-
svg.on('click', () => {});
|
| 191 |
|
| 192 |
// ── Simulation — Obsidian style ────────────────────────
|
| 193 |
STATE.simulation = d3.forceSimulation(nodes)
|
|
@@ -207,25 +239,25 @@ function renderGraph() {
|
|
| 207 |
.alphaDecay(0.02)
|
| 208 |
.velocityDecay(0.4)
|
| 209 |
.on('tick', () => {
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
});
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
});
|
| 229 |
|
| 230 |
setTimeout(() => STATE.simulation.alphaTarget(0.05), 3000);
|
| 231 |
}
|
|
@@ -352,7 +384,7 @@ function setupGraphObservers() {
|
|
| 352 |
}
|
| 353 |
});
|
| 354 |
mo.observe(panel, {
|
| 355 |
-
attributes:
|
| 356 |
attributeFilter: ['style', 'class'],
|
| 357 |
});
|
| 358 |
|
|
@@ -366,7 +398,7 @@ function setupGraphObservers() {
|
|
| 366 |
if (W && H) graphReheat();
|
| 367 |
});
|
| 368 |
moParent.observe(panel.parentElement, {
|
| 369 |
-
attributes:
|
| 370 |
attributeFilter: ['style', 'class'],
|
| 371 |
});
|
| 372 |
}
|
|
@@ -377,4 +409,4 @@ function setupGraphObservers() {
|
|
| 377 |
window.addEventListener('resize', () => graphReheat());
|
| 378 |
}
|
| 379 |
|
| 380 |
-
setupGraphObservers();
|
|
|
|
| 17 |
*/
|
| 18 |
|
| 19 |
function renderGraph() {
|
| 20 |
+
const svg = d3.select('#graph-svg');
|
| 21 |
const panel = document.getElementById('graph-panel');
|
| 22 |
+
const W = panel.clientWidth;
|
| 23 |
+
const H = panel.clientHeight;
|
| 24 |
const empty = document.getElementById('graph-empty');
|
| 25 |
|
| 26 |
svg.selectAll('*').remove();
|
|
|
|
| 37 |
|
| 38 |
STATE.categories.forEach(cat => {
|
| 39 |
nodes.push({
|
| 40 |
+
id: `cat::${cat}`,
|
| 41 |
+
type: 'category',
|
| 42 |
label: cat.replace(/_/g, ' '),
|
| 43 |
+
raw: cat,
|
| 44 |
color: STATE.catColors[cat],
|
| 45 |
+
r: 26,
|
| 46 |
pinned: false,
|
| 47 |
count: STATE.files.filter(f => (f.document_type || 'uncategorised') === cat).length,
|
| 48 |
});
|
|
|
|
| 51 |
STATE.files.forEach(f => {
|
| 52 |
const cat = f.document_type || 'uncategorised';
|
| 53 |
nodes.push({
|
| 54 |
+
id: `doc::${f.file_hash}`,
|
| 55 |
+
type: 'document',
|
| 56 |
+
label: f.filename,
|
| 57 |
file_hash: f.file_hash,
|
| 58 |
+
category: cat,
|
| 59 |
+
color: STATE.catColors[cat] || '#4a9eff',
|
| 60 |
+
r: 7,
|
| 61 |
+
pinned: false,
|
| 62 |
+
chunks: f.chunk_count,
|
| 63 |
+
ingested: (f.ingested_at || '').slice(0, 10),
|
| 64 |
});
|
| 65 |
links.push({ source: `cat::${cat}`, target: `doc::${f.file_hash}` });
|
| 66 |
});
|
| 67 |
|
| 68 |
// ── Zoom + pan ─────────────────────────────────────────
|
| 69 |
+
const g = svg.append('g');
|
| 70 |
const zoom = d3.zoom()
|
| 71 |
+
.scaleExtent([0.3, 3])
|
| 72 |
+
// scroll to zoom only, no drag-to-pan
|
| 73 |
+
.on('zoom', e => g.attr('transform', e.transform));
|
| 74 |
svg.call(zoom).on('dblclick.zoom', null);
|
| 75 |
STATE.svgZoom = { zoom, svg };
|
| 76 |
|
|
|
|
| 102 |
.style('cursor', 'pointer')
|
| 103 |
.call(d3.drag()
|
| 104 |
.on('start', (e, d) => {
|
| 105 |
+
if (!e.active) STATE.simulation.alphaTarget(0.3).restart();
|
| 106 |
+
d.fx = d.x; d.fy = d.y;
|
| 107 |
+
d._lastX = d.x; d._lastY = d.y;
|
| 108 |
+
})
|
| 109 |
.on('drag', (e, d) => {
|
| 110 |
+
d._vx = e.x - (d._lastX || e.x);
|
| 111 |
+
d._vy = e.y - (d._lastY || e.y);
|
| 112 |
+
d._lastX = e.x; d._lastY = e.y;
|
| 113 |
+
d.fx = e.x; d.fy = e.y;
|
| 114 |
+
})
|
| 115 |
.on('end', (e, d) => {
|
| 116 |
+
if (!e.active) STATE.simulation.alphaTarget(0.05);
|
| 117 |
+
if (!d.pinned) {
|
| 118 |
+
d.fx = null; d.fy = null;
|
| 119 |
+
d.vx = (d._vx || 0) * 3;
|
| 120 |
+
d.vy = (d._vy || 0) * 3;
|
| 121 |
+
STATE.simulation.alphaTarget(0.3).restart();
|
| 122 |
+
setTimeout(() => STATE.simulation.alphaTarget(0.05), 2000);
|
| 123 |
+
}
|
| 124 |
+
})
|
| 125 |
)
|
| 126 |
.on('click', (event, d) => {
|
| 127 |
event.stopPropagation();
|
| 128 |
+
|
| 129 |
+
if (d.type === 'document') {
|
| 130 |
+
// Toggle this document's file_hash in the pinned set
|
| 131 |
+
const idx = STATE.pinnedFiles.indexOf(d.file_hash);
|
| 132 |
+
if (idx >= 0) {
|
| 133 |
+
STATE.pinnedFiles.splice(idx, 1);
|
| 134 |
+
} else {
|
| 135 |
+
STATE.pinnedFiles.push(d.file_hash);
|
| 136 |
+
}
|
| 137 |
+
// Visual: bright white stroke when pinned, original colour when not
|
| 138 |
+
node.filter(n => n && n.type === 'document').select('circle')
|
| 139 |
+
.attr('stroke', n => STATE.pinnedFiles.includes(n.file_hash) ? '#ffffff' : n.color)
|
| 140 |
+
.attr('stroke-width', n => STATE.pinnedFiles.includes(n.file_hash) ? 3 : 1.5)
|
| 141 |
+
.attr('filter', n => {
|
| 142 |
+
if (!STATE.pinnedFiles.includes(n.file_hash)) return null;
|
| 143 |
+
const glowIdx = STATE.categories.indexOf(n.category);
|
| 144 |
+
return glowIdx >= 0 ? `url(#glow-${glowIdx})` : null;
|
| 145 |
+
});
|
| 146 |
+
} else if (d.type === 'category') {
|
| 147 |
+
// Clicking a category node clears ALL pins
|
| 148 |
+
STATE.pinnedFiles = [];
|
| 149 |
+
node.filter(n => n && n.type === 'document').select('circle')
|
| 150 |
+
.attr('stroke', n => n.color)
|
| 151 |
+
.attr('stroke-width', 1.5)
|
| 152 |
+
.attr('filter', null);
|
| 153 |
+
}
|
| 154 |
+
|
| 155 |
onNodeClick(d);
|
| 156 |
})
|
| 157 |
.on('contextmenu', (event, d) => {
|
|
|
|
| 176 |
node.filter(d => d.type === 'category')
|
| 177 |
.append('circle')
|
| 178 |
.attr('r', 26)
|
| 179 |
+
.attr('fill', d => d.color + '18')
|
| 180 |
.attr('stroke', d => d.color)
|
| 181 |
.attr('stroke-width', 2)
|
| 182 |
.attr('filter', d => {
|
|
|
|
| 197 |
node.filter(d => d.type === 'document')
|
| 198 |
.append('circle')
|
| 199 |
.attr('r', 7)
|
| 200 |
+
.attr('fill', d => d.color + '55')
|
| 201 |
+
.attr('stroke', d => STATE.pinnedFiles.includes(d.file_hash) ? '#ffffff' : d.color)
|
| 202 |
+
.attr('stroke-width', d => STATE.pinnedFiles.includes(d.file_hash) ? 3 : 1.5)
|
| 203 |
+
.attr('filter', d => {
|
| 204 |
+
if (!STATE.pinnedFiles.includes(d.file_hash)) return null;
|
| 205 |
+
const glowIdx = STATE.categories.indexOf(d.category);
|
| 206 |
+
return glowIdx >= 0 ? `url(#glow-${glowIdx})` : null;
|
| 207 |
+
});
|
| 208 |
|
| 209 |
// Labels
|
| 210 |
node.append('text')
|
|
|
|
| 212 |
.attr('dy', d => d.type === 'category' ? -32 : -12)
|
| 213 |
.attr('text-anchor', 'middle')
|
| 214 |
.attr('fill', d => d.type === 'category' ? d.color : 'rgba(200,216,244,0.7)')
|
| 215 |
+
.attr('font-size', d => d.type === 'category' ? '10px' : '8px')
|
| 216 |
.attr('font-family', 'Syne Mono, monospace')
|
| 217 |
.attr('font-weight', d => d.type === 'category' ? '600' : '400')
|
| 218 |
.text(d => trunc(d.label, d.type === 'category' ? 18 : 16))
|
| 219 |
.style('pointer-events', 'none')
|
| 220 |
.style('user-select', 'none');
|
| 221 |
|
| 222 |
+
svg.on('click', () => { });
|
| 223 |
|
| 224 |
// ── Simulation — Obsidian style ────────────────────────
|
| 225 |
STATE.simulation = d3.forceSimulation(nodes)
|
|
|
|
| 239 |
.alphaDecay(0.02)
|
| 240 |
.velocityDecay(0.4)
|
| 241 |
.on('tick', () => {
|
| 242 |
+
const liveW = document.getElementById('graph-panel').clientWidth;
|
| 243 |
+
const liveH = document.getElementById('graph-panel').clientHeight;
|
| 244 |
+
nodes.forEach(d => {
|
| 245 |
+
if (d.fx == null) {
|
| 246 |
+
const pad = 40;
|
| 247 |
+
if (d.x < pad) { d.x = pad; d.vx = Math.abs(d.vx) * 0.7; }
|
| 248 |
+
if (d.x > liveW - pad) { d.x = liveW - pad; d.vx = -Math.abs(d.vx) * 0.7; }
|
| 249 |
+
if (d.y < pad) { d.y = pad; d.vy = Math.abs(d.vy) * 0.7; }
|
| 250 |
+
if (d.y > liveH - pad) { d.y = liveH - pad; d.vy = -Math.abs(d.vy) * 0.7; }
|
| 251 |
+
}
|
| 252 |
+
});
|
| 253 |
+
link
|
| 254 |
+
.attr('x1', d => d.source.x).attr('y1', d => d.source.y)
|
| 255 |
+
.attr('x2', d => d.target.x).attr('y2', d => d.target.y);
|
| 256 |
+
node.attr('transform', d => `translate(${d.x},${d.y})`);
|
| 257 |
+
|
| 258 |
+
const maxV = Math.max(...nodes.map(d => Math.abs(d.vx || 0) + Math.abs(d.vy || 0)));
|
| 259 |
+
if (maxV > 0.5) STATE.simulation.alphaTarget(0.1).restart();
|
| 260 |
+
});
|
| 261 |
|
| 262 |
setTimeout(() => STATE.simulation.alphaTarget(0.05), 3000);
|
| 263 |
}
|
|
|
|
| 384 |
}
|
| 385 |
});
|
| 386 |
mo.observe(panel, {
|
| 387 |
+
attributes: true,
|
| 388 |
attributeFilter: ['style', 'class'],
|
| 389 |
});
|
| 390 |
|
|
|
|
| 398 |
if (W && H) graphReheat();
|
| 399 |
});
|
| 400 |
moParent.observe(panel.parentElement, {
|
| 401 |
+
attributes: true,
|
| 402 |
attributeFilter: ['style', 'class'],
|
| 403 |
});
|
| 404 |
}
|
|
|
|
| 409 |
window.addEventListener('resize', () => graphReheat());
|
| 410 |
}
|
| 411 |
|
| 412 |
+
setupGraphObservers();
|
frontend/js/main.js
CHANGED
|
@@ -6,17 +6,126 @@
|
|
| 6 |
* On success, supabase-js stores the session in localStorage automatically.
|
| 7 |
* getSupabaseToken() in api.js reads it on every request.
|
| 8 |
*
|
| 9 |
-
*
|
| 10 |
-
*
|
| 11 |
*
|
| 12 |
-
*
|
|
|
|
| 13 |
*/
|
| 14 |
|
| 15 |
-
const AUTH_DISABLED = false; //
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
window.addEventListener('DOMContentLoaded', async () => {
|
| 18 |
try {
|
| 19 |
-
await initSupabase();
|
|
|
|
| 20 |
|
| 21 |
if (AUTH_DISABLED) {
|
| 22 |
showApp();
|
|
@@ -34,27 +143,71 @@ window.addEventListener('DOMContentLoaded', async () => {
|
|
| 34 |
// once with INITIAL_SESSION (with or without a session), then again on
|
| 35 |
// SIGNED_IN / SIGNED_OUT. No polling, no timeouts.
|
| 36 |
let booted = false;
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
if (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
booted = true;
|
| 41 |
showApp();
|
| 42 |
bootApp();
|
| 43 |
-
} else {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
showLogin();
|
| 45 |
}
|
| 46 |
-
}
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
booted = false;
|
| 52 |
showLogin();
|
| 53 |
-
}
|
| 54 |
});
|
| 55 |
|
| 56 |
} catch (err) {
|
| 57 |
console.error("Boot failed:", err);
|
|
|
|
|
|
|
| 58 |
showLogin();
|
| 59 |
}
|
| 60 |
});
|
|
@@ -86,7 +239,12 @@ async function submitLogin() {
|
|
| 86 |
err.textContent = '';
|
| 87 |
|
| 88 |
try {
|
| 89 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
email,
|
| 91 |
password: pw,
|
| 92 |
});
|
|
@@ -94,12 +252,14 @@ async function submitLogin() {
|
|
| 94 |
if (error) {
|
| 95 |
err.textContent = error.message || 'Invalid credentials.';
|
| 96 |
btn.disabled = false;
|
| 97 |
-
btn.textContent = '
|
| 98 |
return;
|
| 99 |
}
|
| 100 |
// EXPLICIT UI TAKEOVER:
|
| 101 |
// Wait 500ms to guarantee local storage has the token, then force the system online.
|
| 102 |
STATE.authenticated = true;
|
|
|
|
|
|
|
| 103 |
showApp();
|
| 104 |
|
| 105 |
setTimeout(() => {
|
|
@@ -111,7 +271,60 @@ async function submitLogin() {
|
|
| 111 |
} catch (e) {
|
| 112 |
err.textContent = 'Server unreachable: ' + e.message;
|
| 113 |
btn.disabled = false;
|
| 114 |
-
btn.textContent = '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
}
|
| 116 |
}
|
| 117 |
|
|
@@ -163,7 +376,12 @@ async function submitSignup() {
|
|
| 163 |
btn.textContent = 'CREATING ACCOUNT…';
|
| 164 |
|
| 165 |
try {
|
| 166 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
if (error) {
|
| 169 |
err.textContent = error.message || 'Sign-up failed.';
|
|
@@ -187,55 +405,43 @@ async function submitSignup() {
|
|
| 187 |
}
|
| 188 |
}
|
| 189 |
|
| 190 |
-
// ──
|
| 191 |
-
async function submitAdmin() {
|
| 192 |
-
const key =
|
| 193 |
-
if (!key) return;
|
| 194 |
try {
|
| 195 |
const res = await apiVerifyAdmin(key);
|
| 196 |
if (res.valid) {
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
} else {
|
| 203 |
-
|
| 204 |
}
|
| 205 |
} catch (e) {
|
| 206 |
-
|
| 207 |
}
|
|
|
|
| 208 |
}
|
| 209 |
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
const
|
| 214 |
-
if (
|
| 215 |
-
btn.textContent = 'DISABLE AUTH';
|
| 216 |
-
btn.style.background = 'rgba(255,71,87,0.15)';
|
| 217 |
-
btn.style.borderColor = 'var(--red)';
|
| 218 |
-
btn.style.color = 'var(--red)';
|
| 219 |
-
label.textContent = 'Auth is ON — users must sign in';
|
| 220 |
-
} else {
|
| 221 |
-
btn.textContent = 'ENABLE AUTH';
|
| 222 |
-
btn.style.background = 'rgba(0,255,136,0.08)';
|
| 223 |
-
btn.style.borderColor = 'var(--phosphor)';
|
| 224 |
-
btn.style.color = 'var(--phosphor)';
|
| 225 |
-
label.textContent = 'Auth is OFF — anyone can access';
|
| 226 |
-
}
|
| 227 |
-
}
|
| 228 |
-
|
| 229 |
-
function toggleAuth() {
|
| 230 |
-
const current = localStorage.getItem('nexus_auth_locked') !== 'false';
|
| 231 |
-
const next = !current;
|
| 232 |
-
localStorage.setItem('nexus_auth_locked', next ? 'true' : 'false');
|
| 233 |
-
updateToggleUI(next);
|
| 234 |
-
toast(
|
| 235 |
-
next ? 'Auth enabled — sign-in required on next visit'
|
| 236 |
-
: 'Auth disabled — open access',
|
| 237 |
-
next ? 'error' : 'success',
|
| 238 |
-
);
|
| 239 |
}
|
| 240 |
|
| 241 |
function handleLoginKey(e) {
|
|
@@ -247,10 +453,38 @@ function handleLoginKey(e) {
|
|
| 247 |
|
| 248 |
// ── Sign out ──────────────────────────────────────────────────────────────────
|
| 249 |
async function signOut() {
|
| 250 |
-
await
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
STATE.authenticated = false;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
STATE.files = [];
|
| 253 |
STATE.categories = [];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
showLogin();
|
| 255 |
authTab('signin');
|
| 256 |
}
|
|
@@ -262,7 +496,22 @@ async function bootApp() {
|
|
| 262 |
setOnline(true);
|
| 263 |
try {
|
| 264 |
await refreshCorpus();
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
} catch (e) {
|
| 267 |
setOnline(false);
|
| 268 |
toast('Could not reach backend: ' + e.message, 'error');
|
|
@@ -292,4 +541,4 @@ async function refreshCorpus() {
|
|
| 292 |
}
|
| 293 |
};
|
| 294 |
}, 50);
|
| 295 |
-
})();
|
|
|
|
| 6 |
* On success, supabase-js stores the session in localStorage automatically.
|
| 7 |
* getSupabaseToken() in api.js reads it on every request.
|
| 8 |
*
|
| 9 |
+
* Legacy daily-password UI has been removed. Supabase JWT gates the main app,
|
| 10 |
+
* while the admin key only unlocks operator review tools.
|
| 11 |
*
|
| 12 |
+
* AUTH_DISABLED is a local-dev escape hatch only.
|
| 13 |
+
* Product guest access should use Supabase anonymous sessions instead.
|
| 14 |
*/
|
| 15 |
|
| 16 |
+
const AUTH_DISABLED = false; // local dev only — keep false in real use
|
| 17 |
+
const GUEST_PERSIST_KEY = 'morpheus_guest_persist';
|
| 18 |
+
const GUEST_TAB_KEY = 'morpheus_guest_tab_alive';
|
| 19 |
+
const GUEST_LAST_SEEN_KEY = 'morpheus_guest_last_seen_at';
|
| 20 |
+
const GUEST_ACTIVITY_WINDOW_MS = 45000;
|
| 21 |
+
let guestHeartbeatTimer = null;
|
| 22 |
+
|
| 23 |
+
function shouldPersistGuestWorkspace() {
|
| 24 |
+
return localStorage.getItem(GUEST_PERSIST_KEY) === '1';
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
function setGuestPersistPreference(keep) {
|
| 28 |
+
localStorage.setItem(GUEST_PERSIST_KEY, keep ? '1' : '0');
|
| 29 |
+
STATE.guestPersist = Boolean(keep);
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
function markGuestTabAlive() {
|
| 33 |
+
sessionStorage.setItem(GUEST_TAB_KEY, '1');
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
function clearGuestSessionMarkers() {
|
| 37 |
+
sessionStorage.removeItem(GUEST_TAB_KEY);
|
| 38 |
+
localStorage.removeItem(GUEST_LAST_SEEN_KEY);
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
function hasGuestTabMarker() {
|
| 42 |
+
return sessionStorage.getItem(GUEST_TAB_KEY) === '1';
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
function touchGuestHeartbeat() {
|
| 46 |
+
localStorage.setItem(GUEST_LAST_SEEN_KEY, String(Date.now()));
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
function hasRecentGuestHeartbeat() {
|
| 50 |
+
const raw = Number(localStorage.getItem(GUEST_LAST_SEEN_KEY) || 0);
|
| 51 |
+
return Number.isFinite(raw) && raw > 0 && (Date.now() - raw) < GUEST_ACTIVITY_WINDOW_MS;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
function startGuestHeartbeat() {
|
| 55 |
+
stopGuestHeartbeat();
|
| 56 |
+
touchGuestHeartbeat();
|
| 57 |
+
guestHeartbeatTimer = window.setInterval(() => {
|
| 58 |
+
if (!STATE.isGuest) return;
|
| 59 |
+
touchGuestHeartbeat();
|
| 60 |
+
}, 15000);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
function stopGuestHeartbeat() {
|
| 64 |
+
if (!guestHeartbeatTimer) return;
|
| 65 |
+
clearInterval(guestHeartbeatTimer);
|
| 66 |
+
guestHeartbeatTimer = null;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
function setGuestControlsVisibility() {
|
| 70 |
+
const guestBtn = document.getElementById('guestBtn');
|
| 71 |
+
const guestInfo = document.getElementById('guestInfo');
|
| 72 |
+
const guestPersistWrap = document.getElementById('guestPersistWrap');
|
| 73 |
+
const visible = Boolean(CONFIG.GUEST_ENABLED);
|
| 74 |
+
if (guestBtn) guestBtn.style.display = visible ? '' : 'none';
|
| 75 |
+
if (guestInfo) guestInfo.style.display = visible ? 'block' : 'none';
|
| 76 |
+
if (guestPersistWrap) guestPersistWrap.style.display = visible ? 'block' : 'none';
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
function setSessionMode(session) {
|
| 80 |
+
const appMeta = session?.user?.app_metadata || {};
|
| 81 |
+
const provider = String(appMeta.provider || '').toLowerCase();
|
| 82 |
+
STATE.isGuest = Boolean(
|
| 83 |
+
session?.user?.is_anonymous ||
|
| 84 |
+
appMeta.is_anonymous ||
|
| 85 |
+
provider === 'anonymous' ||
|
| 86 |
+
(Array.isArray(appMeta.providers) && appMeta.providers.includes('anonymous'))
|
| 87 |
+
);
|
| 88 |
+
STATE.guestPersist = STATE.isGuest ? shouldPersistGuestWorkspace() : false;
|
| 89 |
+
|
| 90 |
+
const pill = document.getElementById('session-mode-pill');
|
| 91 |
+
const label = document.getElementById('session-mode-label');
|
| 92 |
+
if (pill) pill.style.display = STATE.isGuest ? '' : 'none';
|
| 93 |
+
if (label) label.textContent = STATE.isGuest ? 'GUEST' : 'ACCOUNT';
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
function isTemporaryGuestResume(session) {
|
| 97 |
+
if (!session || !STATE.isGuest || STATE.guestPersist) return false;
|
| 98 |
+
return !hasGuestTabMarker() && !hasRecentGuestHeartbeat();
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
async function expireTemporaryGuestSession(client) {
|
| 102 |
+
try {
|
| 103 |
+
await apiCleanupGuestWorkspace();
|
| 104 |
+
} catch {
|
| 105 |
+
// best effort only
|
| 106 |
+
}
|
| 107 |
+
try {
|
| 108 |
+
await client.auth.signOut();
|
| 109 |
+
} catch {
|
| 110 |
+
// best effort only
|
| 111 |
+
}
|
| 112 |
+
localStorage.removeItem(GUEST_PERSIST_KEY);
|
| 113 |
+
clearGuestSessionMarkers();
|
| 114 |
+
STATE.isGuest = false;
|
| 115 |
+
STATE.guestPersist = false;
|
| 116 |
+
setSessionMode(null);
|
| 117 |
+
showLogin();
|
| 118 |
+
const info = document.getElementById('loginInfo');
|
| 119 |
+
if (info) {
|
| 120 |
+
info.textContent = 'Temporary guest workspace expired after the previous guest session ended.';
|
| 121 |
+
info.style.display = 'block';
|
| 122 |
+
}
|
| 123 |
+
}
|
| 124 |
|
| 125 |
window.addEventListener('DOMContentLoaded', async () => {
|
| 126 |
try {
|
| 127 |
+
const client = await initSupabase();
|
| 128 |
+
setGuestControlsVisibility();
|
| 129 |
|
| 130 |
if (AUTH_DISABLED) {
|
| 131 |
showApp();
|
|
|
|
| 143 |
// once with INITIAL_SESSION (with or without a session), then again on
|
| 144 |
// SIGNED_IN / SIGNED_OUT. No polling, no timeouts.
|
| 145 |
let booted = false;
|
| 146 |
+
client.auth.onAuthStateChange((event, session) => {
|
| 147 |
+
const handle = async () => {
|
| 148 |
+
if (event === 'INITIAL_SESSION') {
|
| 149 |
+
if (session) {
|
| 150 |
+
setSessionMode(session);
|
| 151 |
+
if (isTemporaryGuestResume(session)) {
|
| 152 |
+
booted = false;
|
| 153 |
+
await expireTemporaryGuestSession(client);
|
| 154 |
+
return;
|
| 155 |
+
}
|
| 156 |
+
if (STATE.isGuest) {
|
| 157 |
+
markGuestTabAlive();
|
| 158 |
+
startGuestHeartbeat();
|
| 159 |
+
} else {
|
| 160 |
+
stopGuestHeartbeat();
|
| 161 |
+
}
|
| 162 |
+
booted = true;
|
| 163 |
+
showApp();
|
| 164 |
+
bootApp();
|
| 165 |
+
} else {
|
| 166 |
+
stopGuestHeartbeat();
|
| 167 |
+
STATE.isGuest = false;
|
| 168 |
+
STATE.guestPersist = false;
|
| 169 |
+
showLogin();
|
| 170 |
+
}
|
| 171 |
+
} else if (event === 'SIGNED_IN' && !booted) {
|
| 172 |
+
setSessionMode(session);
|
| 173 |
+
if (STATE.isGuest) {
|
| 174 |
+
markGuestTabAlive();
|
| 175 |
+
startGuestHeartbeat();
|
| 176 |
+
} else {
|
| 177 |
+
stopGuestHeartbeat();
|
| 178 |
+
}
|
| 179 |
booted = true;
|
| 180 |
showApp();
|
| 181 |
bootApp();
|
| 182 |
+
} else if (event === 'SIGNED_IN') {
|
| 183 |
+
setSessionMode(session);
|
| 184 |
+
if (STATE.isGuest) {
|
| 185 |
+
markGuestTabAlive();
|
| 186 |
+
startGuestHeartbeat();
|
| 187 |
+
} else {
|
| 188 |
+
stopGuestHeartbeat();
|
| 189 |
+
}
|
| 190 |
+
} else if (event === 'SIGNED_OUT') {
|
| 191 |
+
booted = false;
|
| 192 |
+
stopGuestHeartbeat();
|
| 193 |
+
STATE.isGuest = false;
|
| 194 |
+
STATE.guestPersist = false;
|
| 195 |
+
setSessionMode(null);
|
| 196 |
showLogin();
|
| 197 |
}
|
| 198 |
+
};
|
| 199 |
+
|
| 200 |
+
handle().catch(err => {
|
| 201 |
+
console.error('Auth transition failed:', err);
|
| 202 |
+
stopGuestHeartbeat();
|
|
|
|
| 203 |
showLogin();
|
| 204 |
+
});
|
| 205 |
});
|
| 206 |
|
| 207 |
} catch (err) {
|
| 208 |
console.error("Boot failed:", err);
|
| 209 |
+
const errEl = document.getElementById('loginError');
|
| 210 |
+
if (errEl) errEl.textContent = 'Auth init failed: ' + err.message;
|
| 211 |
showLogin();
|
| 212 |
}
|
| 213 |
});
|
|
|
|
| 239 |
err.textContent = '';
|
| 240 |
|
| 241 |
try {
|
| 242 |
+
const client = await initSupabase();
|
| 243 |
+
if (!client?.auth) {
|
| 244 |
+
throw new Error('Supabase auth client is unavailable.');
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
const {error } = await client.auth.signInWithPassword({
|
| 248 |
email,
|
| 249 |
password: pw,
|
| 250 |
});
|
|
|
|
| 252 |
if (error) {
|
| 253 |
err.textContent = error.message || 'Invalid credentials.';
|
| 254 |
btn.disabled = false;
|
| 255 |
+
btn.textContent = 'SIGN IN →';
|
| 256 |
return;
|
| 257 |
}
|
| 258 |
// EXPLICIT UI TAKEOVER:
|
| 259 |
// Wait 500ms to guarantee local storage has the token, then force the system online.
|
| 260 |
STATE.authenticated = true;
|
| 261 |
+
const session = await getSupabaseSession();
|
| 262 |
+
setSessionMode(session);
|
| 263 |
showApp();
|
| 264 |
|
| 265 |
setTimeout(() => {
|
|
|
|
| 271 |
} catch (e) {
|
| 272 |
err.textContent = 'Server unreachable: ' + e.message;
|
| 273 |
btn.disabled = false;
|
| 274 |
+
btn.textContent = 'SIGN IN →';
|
| 275 |
+
}
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
// Starts an anonymous (guest) Supabase session and boots the app.
// Reads the "keep workspace" checkbox to decide between a persistent and a
// temporary guest workspace; shows errors inline in the login form.
async function submitGuest() {
  const btn = document.getElementById('guestBtn');
  const err = document.getElementById('loginError');
  const info = document.getElementById('loginInfo');
  const persistCheckbox = document.getElementById('guestPersist');
  const keepWorkspace = Boolean(persistCheckbox?.checked);

  // Clear any stale error/info messaging before attempting sign-in.
  err.textContent = '';
  if (info) {
    info.style.display = 'none';
    info.textContent = '';
  }

  btn.disabled = true;
  btn.textContent = 'STARTING GUEST WORKSPACE…';

  try {
    const client = await initSupabase();
    if (!client?.auth) {
      throw new Error('Supabase auth client is unavailable.');
    }

    const { error } = await client.auth.signInAnonymously();
    if (error) {
      throw error;
    }

    // Persist the user's choice BEFORE marking the session as guest so a
    // reload classifies the session correctly.
    setGuestPersistPreference(keepWorkspace);
    const session = await getSupabaseSession();
    setSessionMode(session);
    markGuestTabAlive();
    startGuestHeartbeat();
    STATE.authenticated = true;
    showApp();
    // Small delay gives the SDK time to settle local storage before boot.
    setTimeout(() => {
      setOnline(true);
      bootApp();
      const msg = keepWorkspace
        ? 'Guest workspace ready. It will stay on this device until you end it.'
        : 'Temporary guest workspace ready. It will expire after the guest session truly ends.';
      toast(msg, 'success');
    }, 300);
  } catch (e) {
    err.textContent = e?.message || 'Could not start guest workspace.';
    // Supabase reports a distinctive "anonymous" error when the provider is
    // disabled in project settings; translate it for operators.
    if (/anonymous/i.test(err.textContent)) {
      err.textContent = 'Guest mode is disabled in Supabase Auth settings.';
    }
  } finally {
    btn.disabled = false;
    btn.textContent = 'CONTINUE AS GUEST';
  }
}
|
| 330 |
|
|
|
|
| 376 |
btn.textContent = 'CREATING ACCOUNT…';
|
| 377 |
|
| 378 |
try {
|
| 379 |
+
const client = await initSupabase();
|
| 380 |
+
if (!client?.auth) {
|
| 381 |
+
throw new Error('Supabase auth client is unavailable.');
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
const { data, error } = await client.auth.signUp({ email, password: pw });
|
| 385 |
|
| 386 |
if (error) {
|
| 387 |
err.textContent = error.message || 'Sign-up failed.';
|
|
|
|
| 405 |
}
|
| 406 |
}
|
| 407 |
|
| 408 |
+
// ── Operator tools unlock ──────────────────────────────────────────────────────
|
| 409 |
+
// Verifies an operator/admin key against the backend and, when valid,
// unlocks the admin review dashboard.
// Returns true when the key was accepted (even if the dashboard assets are
// stale), false for an invalid key or a transport failure.
async function submitAdmin(adminKey) {
  const key = String(adminKey || '').trim();
  if (!key) return false;
  try {
    const res = await apiVerifyAdmin(key);
    if (res.valid) {
      // enableAdminReview is provided by admin.js; if it is missing the
      // deployed bundle is out of date.
      if (typeof window.enableAdminReview === 'function') {
        window.enableAdminReview(key);
        STATE.adminPendingView = true;
        // If the app shell is already visible, jump straight to the admin
        // view; otherwise queue it behind sign-in and tell the user.
        if (document.getElementById('app')?.style.display !== 'none') {
          switchView('admin');
        } else {
          const info = document.getElementById('loginInfo');
          if (info) {
            info.textContent = 'Admin dashboard unlocked. Sign in to open it.';
            info.style.display = 'block';
          }
        }
      } else {
        toast('Admin dashboard assets are stale. Hard refresh with Ctrl+Shift+R.', 'error');
      }
      return true;
    } else {
      toast('Invalid operator key.', 'error');
    }
  } catch (e) {
    toast('Operator unlock failed: ' + e.message, 'error');
  }
  return false;
}
|
| 439 |
|
| 440 |
+
// Prompts for an operator key and unlocks the review tools when it verifies.
async function unlockOperatorTools() {
  const entered = window.prompt('Enter operator key to open review tools:', '') || '';
  if (entered.trim() === '') {
    return; // user cancelled or typed only whitespace
  }
  const unlocked = await submitAdmin(entered);
  if (unlocked) {
    toast('Operator tools unlocked.', 'success');
  }
}
|
| 446 |
|
| 447 |
function handleLoginKey(e) {
|
|
|
|
| 453 |
|
| 454 |
// ── Sign out ──────────────────────────────────────────────────────────────────
|
| 455 |
// Signs the current user out and resets all client state.
// Guest sessions get extra handling: persistent guests are asked to confirm
// deletion, and the backend guest workspace is cleaned up best-effort
// BEFORE the auth session is destroyed (cleanup needs the token).
async function signOut() {
  const client = await initSupabase();
  if (!client?.auth) {
    throw new Error('Supabase auth client is unavailable.');
  }
  if (STATE.isGuest) {
    if (STATE.guestPersist) {
      const shouldEnd = window.confirm(
        'This guest workspace is set to stay on this device. Click OK to end and delete it now, or Cancel to keep it and just close the tab later.'
      );
      if (!shouldEnd) return; // keep the workspace; abort sign-out entirely
    }
    try {
      await apiCleanupGuestWorkspace();
    } catch (err) {
      // Best-effort: a failed cleanup should not block signing out.
      toast('Guest workspace cleanup failed: ' + err.message, 'error');
    }
  }
  await client.auth.signOut();
  // Reset session/guest flags and local markers.
  STATE.authenticated = false;
  STATE.isGuest = false;
  STATE.guestPersist = false;
  stopGuestHeartbeat();
  clearGuestSessionMarkers();
  localStorage.removeItem(GUEST_PERSIST_KEY);
  setSessionMode(null);
  // Clear cached corpus data and admin privileges.
  STATE.files = [];
  STATE.categories = [];
  STATE.adminUnlocked = false;
  STATE.adminKey = '';
  STATE.adminPendingView = false;
  const navAdmin = document.getElementById('nav-admin');
  if (navAdmin) navAdmin.style.display = 'none';
  showLogin();
  authTab('signin');
}
|
|
|
|
| 496 |
setOnline(true);
|
| 497 |
try {
|
| 498 |
await refreshCorpus();
|
| 499 |
+
if (typeof resumeActiveIngestionIfNeeded === 'function') {
|
| 500 |
+
resumeActiveIngestionIfNeeded().catch(err => {
|
| 501 |
+
console.warn('Ingestion resume failed:', err?.message || err);
|
| 502 |
+
});
|
| 503 |
+
}
|
| 504 |
+
if (STATE.adminUnlocked && STATE.adminPendingView) {
|
| 505 |
+
switchView('admin');
|
| 506 |
+
STATE.adminPendingView = false;
|
| 507 |
+
if (typeof refreshAdminDashboard === 'function') {
|
| 508 |
+
refreshAdminDashboard().catch(err => {
|
| 509 |
+
toast('Admin dashboard failed: ' + err.message, 'error');
|
| 510 |
+
});
|
| 511 |
+
}
|
| 512 |
+
} else {
|
| 513 |
+
switchView('corpus');
|
| 514 |
+
}
|
| 515 |
} catch (e) {
|
| 516 |
setOnline(false);
|
| 517 |
toast('Could not reach backend: ' + e.message, 'error');
|
|
|
|
| 541 |
}
|
| 542 |
};
|
| 543 |
}, 50);
|
| 544 |
+
})();
|
frontend/js/state.js
CHANGED
|
@@ -3,19 +3,28 @@
|
|
| 3 |
* Single source of truth. All data flows through api.js, never direct Supabase.
|
| 4 |
*/
|
| 5 |
const STATE = {
|
| 6 |
-
authenticated:
|
| 7 |
-
files:
|
| 8 |
-
categories:
|
| 9 |
-
catColors:
|
| 10 |
-
simulation:
|
| 11 |
-
svgZoom:
|
| 12 |
-
selectedNode:
|
| 13 |
deleteConfirmed: false,
|
| 14 |
-
pendingReview:
|
| 15 |
-
chatHistory:
|
| 16 |
-
isThinking:
|
| 17 |
-
sessionId:
|
| 18 |
alpha: 0.5,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
};
|
| 20 |
|
| 21 |
function stateRefreshCategories() {
|
|
@@ -29,7 +38,7 @@ function stateRefreshCategories() {
|
|
| 29 |
}
|
| 30 |
|
| 31 |
async function stateLoadCorpus() {
|
| 32 |
-
const data
|
| 33 |
STATE.files = data.files || [];
|
| 34 |
stateRefreshCategories();
|
| 35 |
document.getElementById('stat-docs').textContent = STATE.files.length;
|
|
|
|
| 3 |
* Single source of truth. All data flows through api.js, never direct Supabase.
|
| 4 |
*/
|
| 5 |
const STATE = {
  // ── Auth / session ──
  authenticated: false,
  sessionId: crypto.randomUUID(),   // fresh chat session id per page load
  isGuest: false,                   // current session is an anonymous guest
  guestPersist: false,              // guest chose to keep workspace on device

  // ── Corpus data ──
  files: [],
  categories: [],
  catColors: {},

  // ── Graph view (D3) ──
  simulation: null,                 // NOTE(review): presumably a d3 force simulation — confirm in graph.js
  svgZoom: null,
  selectedNode: null,
  deleteConfirmed: false,           // two-step delete confirmation flag
  pinnedFiles: [], // file_hashes of graph-pinned documents

  // ── Chat ──
  pendingReview: null,
  chatHistory: [],
  isThinking: false,
  alpha: 0.5,                       // semantic/keyword retrieval blend weight

  // ── Admin / operator tools ──
  adminKey: '',
  adminUnlocked: false,
  adminTraces: [],
  adminFeedback: [],
  selectedTraceId: null,
  adminPendingView: false,          // open admin view right after boot
};
|
| 29 |
|
| 30 |
function stateRefreshCategories() {
|
|
|
|
| 38 |
}
|
| 39 |
|
| 40 |
async function stateLoadCorpus() {
|
| 41 |
+
const data = await apiLoadFiles();
|
| 42 |
STATE.files = data.files || [];
|
| 43 |
stateRefreshCategories();
|
| 44 |
document.getElementById('stat-docs').textContent = STATE.files.length;
|
recent_changes.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
CHANGED
|
@@ -22,4 +22,5 @@ celery[redis]
|
|
| 22 |
scikit-learn
|
| 23 |
joblib
|
| 24 |
sentence-transformers
|
| 25 |
-
python-magic
|
|
|
|
|
|
| 22 |
scikit-learn
|
| 23 |
joblib
|
| 24 |
sentence-transformers
|
| 25 |
+
python-magic
|
| 26 |
+
pytest
|
scripts/rebuild_pageindex.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rebuild the PageIndex (document_trees) for an already-ingested PDF.
|
| 3 |
+
|
| 4 |
+
Why this exists:
|
| 5 |
+
- Ingestion deletes the uploaded temp PDF after processing.
|
| 6 |
+
- PageIndex behavior evolves (better TOC handling, page_numbers, etc.).
|
| 7 |
+
- You may want to refresh only the structural index without re-embedding/re-uploading chunks.
|
| 8 |
+
|
| 9 |
+
Usage (PowerShell):
|
| 10 |
+
conda activate rag_env
|
| 11 |
+
python scripts/rebuild_pageindex.py --pdf "C:\path\to\file.pdf" --access-token "<JWT>"
|
| 12 |
+
|
| 13 |
+
Notes:
|
| 14 |
+
- This only rewrites `document_trees` (and optionally `identity_json` if you choose to extend it).
|
| 15 |
+
- It does NOT touch the vector store, RAPTOR summaries, or ingested_files registry.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import argparse
|
| 21 |
+
import os
|
| 22 |
+
import sys
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
# Ensure repo root is on sys.path so `import backend...` works when executed as a script.
|
| 26 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 27 |
+
if str(REPO_ROOT) not in sys.path:
|
| 28 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 29 |
+
|
| 30 |
+
from backend.core.pipeline import (
|
| 31 |
+
_build_document_tree,
|
| 32 |
+
_build_service_supabase_client,
|
| 33 |
+
get_file_fingerprint,
|
| 34 |
+
partition_document,
|
| 35 |
+
)
|
| 36 |
+
from backend.core.auth_utils import extract_jwt_sub
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def main() -> int:
    """Rebuild and upsert the PageIndex tree for one already-ingested PDF.

    Resolves the owning user either from an explicit ``--user-id`` or by
    decoding the ``sub`` claim of ``--access-token``, re-partitions the PDF,
    rebuilds the structural tree, and upserts it into ``document_trees``.
    Only the structural index is touched — chunks, embeddings, and the
    ingested_files registry are left alone.

    Returns:
        Process exit code (0 on success). Invalid arguments terminate via
        ``SystemExit`` with a diagnostic message.
    """
    parser = argparse.ArgumentParser(description="Rebuild PageIndex tree for a PDF.")
    parser.add_argument("--pdf", required=True, help="Path to local PDF file.")
    parser.add_argument(
        "--access-token",
        required=False,
        default=None,
        help="User JWT (same X-Auth-Token used by the API). Optional if --user-id is provided.",
    )
    parser.add_argument(
        "--user-id",
        required=False,
        default=None,
        help="Supabase auth user_id (sub). Use this if you don't want to paste a JWT.",
    )
    args = parser.parse_args()

    pdf_path = os.path.abspath(args.pdf)
    if not os.path.exists(pdf_path):
        raise SystemExit(f"PDF not found: {pdf_path}")

    # An explicit user id wins over decoding the JWT.
    if args.user_id:
        user_id = str(args.user_id).strip()
    elif args.access_token:
        user_id = extract_jwt_sub(args.access_token)
    else:
        raise SystemExit("Provide either --user-id or --access-token.")
    if not user_id:
        # Guard against a malformed/expired token (or blank --user-id):
        # never upsert a tree row without a real owner.
        raise SystemExit("Could not resolve a user_id from the provided credentials.")

    file_hash = get_file_fingerprint(pdf_path)

    # Re-run only the structural indexing stage.
    elements = partition_document(pdf_path)
    doc_tree = _build_document_tree(elements)

    # Service-role client: bypasses RLS, so the explicit user_id scoping
    # above is what keeps this write tenant-safe.
    sb = _build_service_supabase_client()
    sb.table("document_trees").upsert(
        {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
        on_conflict="user_id,file_hash",
    ).execute()

    print(f"Rebuilt PageIndex tree for file_hash={file_hash} user_id={user_id}")
    return 0
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
|
| 82 |
+
raise SystemExit(main())
|
| 83 |
+
|
shared/types.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List, Optional
|
| 2 |
from pydantic import BaseModel, Field
|
| 3 |
|
| 4 |
class IngestResponse(BaseModel):
|
|
@@ -24,16 +24,53 @@ class ChatMessage(BaseModel):
|
|
| 24 |
role: str; content: str
|
| 25 |
|
| 26 |
class QueryRequest(BaseModel):
|
| 27 |
-
query: str
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
session_id: str = "default_session"
|
| 30 |
-
alpha: float = 0.5
|
|
|
|
| 31 |
|
| 32 |
class SourceChunk(BaseModel):
|
| 33 |
source: str; score: Optional[float]=None; chunk: Optional[int | str] = None
|
| 34 |
snippet: Optional[str]=None; doc_type: Optional[str]=None
|
| 35 |
pages: Optional[List[int]]=None
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
class QueryResponse(BaseModel):
|
| 38 |
answer: str; sources: List[SourceChunk] = Field(default_factory=list)
|
| 39 |
images: List[str] = []
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional
|
| 2 |
from pydantic import BaseModel, Field
|
| 3 |
|
| 4 |
class IngestResponse(BaseModel):
|
|
|
|
| 24 |
role: str; content: str
|
| 25 |
|
| 26 |
class QueryRequest(BaseModel):
    """Payload for the query endpoint: the question plus retrieval controls."""
    query: str
    category: str = "All"  # "All" means no category filter — TODO confirm against backend routing
    history: List[ChatMessage] = Field(default_factory=list)  # prior turns for conversational context
    k: int = 3  # number of source chunks requested
    session_id: str = "default_session"
    alpha: float = 0.5  # retrieval blend weight (semantic vs keyword) — mirrors frontend STATE.alpha
    priority_file_hashes: List[str] = Field(default_factory=list)  # graph-pinned documents to prioritize
|
| 34 |
|
| 35 |
class SourceChunk(BaseModel):
    """One retrieved chunk cited as a source for an answer."""

    source: str
    score: Optional[float] = None
    chunk: Optional[int | str] = None
    snippet: Optional[str] = None
    doc_type: Optional[str] = None
    pages: Optional[List[int]] = None
|
| 39 |
|
| 40 |
+
class DocDiagnostic(BaseModel):
    """Per-document retrieval diagnostics attached to a query trace."""
    file_hash: str
    source: str  # human-readable document name
    included: bool = True  # whether the document contributed to the answer
    candidate_count: int = 0  # candidate chunks considered from this document
    doc_score: Optional[float] = None
    confidence_label: Optional[str] = None
    reason: Optional[str] = None  # why the document was included/excluded
    support_label: Optional[str] = None
    thin_doc: Optional[bool] = None  # presumably flags documents with very little content — verify in pipeline
|
| 50 |
+
|
| 51 |
+
class QueryTrace(BaseModel):
    """Observability record for one query (mirrors the query_traces table)."""
    trace_id: str
    query: str
    session_id: str
    route_mode: str  # routing strategy used for this query
    selected_experts: List[str] = Field(default_factory=list)
    expert_weights: Dict[str, float] = Field(default_factory=dict)
    pinned_file_hashes: List[str] = Field(default_factory=list)
    candidate_counts: Dict[str, int] = Field(default_factory=dict)
    selected_chunk_ids: List[str] = Field(default_factory=list)
    doc_diagnostics: List[DocDiagnostic] = Field(default_factory=list)
    failure_modes: List[str] = Field(default_factory=list)
    quality_metrics: Dict[str, Any] = Field(default_factory=dict)
    latency_ms: Optional[int] = None
    answer_hash: Optional[str] = None  # hash of the answer text, for dedup/audit
|
| 66 |
+
|
| 67 |
+
class AnswerFeedback(BaseModel):
    """User feedback on one answer, keyed by its query trace."""
    trace_id: str
    helpful: Optional[bool] = None
    accepted: Optional[bool] = None
    reason_code: Optional[str] = None  # structured reason when not helpful
    correction_text: Optional[str] = None  # free-text correction from the user
|
| 73 |
+
|
| 74 |
class QueryResponse(BaseModel):
    """Answer payload returned to the frontend for one query."""

    answer: str
    sources: List[SourceChunk] = Field(default_factory=list)
    # default_factory for consistency with `sources` and the rest of this
    # module; avoids a mutable class-level default literal.
    images: List[str] = Field(default_factory=list)
|
supabase/migrations/0010_query_traces_feedback_graph.sql
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
create table if not exists public.query_traces (
|
| 2 |
+
trace_id uuid primary key default gen_random_uuid(),
|
| 3 |
+
user_id uuid null,
|
| 4 |
+
session_id text not null default 'default_session',
|
| 5 |
+
question text not null,
|
| 6 |
+
route_mode text not null default 'default',
|
| 7 |
+
selected_experts jsonb not null default '[]'::jsonb,
|
| 8 |
+
expert_weights jsonb not null default '{}'::jsonb,
|
| 9 |
+
pinned_file_hashes jsonb not null default '[]'::jsonb,
|
| 10 |
+
candidate_counts jsonb not null default '{}'::jsonb,
|
| 11 |
+
selected_chunk_ids jsonb not null default '[]'::jsonb,
|
| 12 |
+
doc_diagnostics jsonb not null default '[]'::jsonb,
|
| 13 |
+
failure_modes jsonb not null default '[]'::jsonb,
|
| 14 |
+
quality_metrics jsonb not null default '{}'::jsonb,
|
| 15 |
+
answer_hash text null,
|
| 16 |
+
answer_preview text null,
|
| 17 |
+
latency_ms integer null,
|
| 18 |
+
created_at timestamptz not null default timezone('utc', now())
|
| 19 |
+
);
|
| 20 |
+
|
| 21 |
+
create index if not exists idx_query_traces_user_created
|
| 22 |
+
on public.query_traces (user_id, created_at desc);
|
| 23 |
+
|
| 24 |
+
create index if not exists idx_query_traces_session_created
|
| 25 |
+
on public.query_traces (session_id, created_at desc);
|
| 26 |
+
|
| 27 |
+
alter table public.query_traces enable row level security;
|
| 28 |
+
|
| 29 |
+
drop policy if exists query_traces_select_own on public.query_traces;
|
| 30 |
+
create policy query_traces_select_own
|
| 31 |
+
on public.query_traces
|
| 32 |
+
for select
|
| 33 |
+
to authenticated
|
| 34 |
+
using (auth.uid() = user_id);
|
| 35 |
+
|
| 36 |
+
drop policy if exists query_traces_insert_own on public.query_traces;
|
| 37 |
+
create policy query_traces_insert_own
|
| 38 |
+
on public.query_traces
|
| 39 |
+
for insert
|
| 40 |
+
to authenticated
|
| 41 |
+
with check (auth.uid() = user_id);
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
create table if not exists public.answer_feedback (
|
| 45 |
+
id bigint generated by default as identity primary key,
|
| 46 |
+
trace_id uuid not null references public.query_traces(trace_id) on delete cascade,
|
| 47 |
+
user_id uuid null,
|
| 48 |
+
helpful boolean null,
|
| 49 |
+
accepted boolean null,
|
| 50 |
+
reason_code text null,
|
| 51 |
+
correction_text text null,
|
| 52 |
+
promote_to_eval boolean not null default false,
|
| 53 |
+
created_at timestamptz not null default timezone('utc', now())
|
| 54 |
+
);
|
| 55 |
+
|
| 56 |
+
create index if not exists idx_answer_feedback_trace_created
|
| 57 |
+
on public.answer_feedback (trace_id, created_at desc);
|
| 58 |
+
|
| 59 |
+
create index if not exists idx_answer_feedback_user_created
|
| 60 |
+
on public.answer_feedback (user_id, created_at desc);
|
| 61 |
+
|
| 62 |
+
alter table public.answer_feedback enable row level security;
|
| 63 |
+
|
| 64 |
+
drop policy if exists answer_feedback_select_own on public.answer_feedback;
|
| 65 |
+
create policy answer_feedback_select_own
|
| 66 |
+
on public.answer_feedback
|
| 67 |
+
for select
|
| 68 |
+
to authenticated
|
| 69 |
+
using (auth.uid() = user_id);
|
| 70 |
+
|
| 71 |
+
drop policy if exists answer_feedback_insert_own on public.answer_feedback;
|
| 72 |
+
create policy answer_feedback_insert_own
|
| 73 |
+
on public.answer_feedback
|
| 74 |
+
for insert
|
| 75 |
+
to authenticated
|
| 76 |
+
with check (auth.uid() = user_id);
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
create table if not exists public.graph_nodes (
|
| 80 |
+
id bigint generated by default as identity primary key,
|
| 81 |
+
user_id uuid null,
|
| 82 |
+
node_key text not null,
|
| 83 |
+
node_type text not null,
|
| 84 |
+
label text not null,
|
| 85 |
+
payload jsonb not null default '{}'::jsonb,
|
| 86 |
+
created_at timestamptz not null default timezone('utc', now()),
|
| 87 |
+
unique (user_id, node_key)
|
| 88 |
+
);
|
| 89 |
+
|
| 90 |
+
create index if not exists idx_graph_nodes_user_type
|
| 91 |
+
on public.graph_nodes (user_id, node_type);
|
| 92 |
+
|
| 93 |
+
create index if not exists idx_graph_nodes_user_label
|
| 94 |
+
on public.graph_nodes (user_id, label);
|
| 95 |
+
|
| 96 |
+
alter table public.graph_nodes enable row level security;
|
| 97 |
+
|
| 98 |
+
drop policy if exists graph_nodes_select_own on public.graph_nodes;
|
| 99 |
+
create policy graph_nodes_select_own
|
| 100 |
+
on public.graph_nodes
|
| 101 |
+
for select
|
| 102 |
+
to authenticated
|
| 103 |
+
using (auth.uid() = user_id);
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
create table if not exists public.graph_edges (
|
| 107 |
+
id bigint generated by default as identity primary key,
|
| 108 |
+
user_id uuid null,
|
| 109 |
+
source_node_key text not null,
|
| 110 |
+
target_node_key text not null,
|
| 111 |
+
edge_type text not null,
|
| 112 |
+
weight double precision not null default 1.0,
|
| 113 |
+
payload jsonb not null default '{}'::jsonb,
|
| 114 |
+
created_at timestamptz not null default timezone('utc', now()),
|
| 115 |
+
unique (user_id, source_node_key, target_node_key, edge_type)
|
| 116 |
+
);
|
| 117 |
+
|
| 118 |
+
create index if not exists idx_graph_edges_user_source
|
| 119 |
+
on public.graph_edges (user_id, source_node_key);
|
| 120 |
+
|
| 121 |
+
create index if not exists idx_graph_edges_user_target
|
| 122 |
+
on public.graph_edges (user_id, target_node_key);
|
| 123 |
+
|
| 124 |
+
alter table public.graph_edges enable row level security;
|
| 125 |
+
|
| 126 |
+
drop policy if exists graph_edges_select_own on public.graph_edges;
|
| 127 |
+
create policy graph_edges_select_own
|
| 128 |
+
on public.graph_edges
|
| 129 |
+
for select
|
| 130 |
+
to authenticated
|
| 131 |
+
using (auth.uid() = user_id);
|
supabase/migrations/0011_admin_review_eval_workflow.sql
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
alter table public.query_traces
|
| 2 |
+
add column if not exists review_state text not null default 'pending',
|
| 3 |
+
add column if not exists review_notes text null,
|
| 4 |
+
add column if not exists reviewed_at timestamptz null,
|
| 5 |
+
add column if not exists reviewed_by text null,
|
| 6 |
+
add column if not exists promoted_to_eval boolean not null default false,
|
| 7 |
+
add column if not exists document_types jsonb not null default '[]'::jsonb;
|
| 8 |
+
|
| 9 |
+
create index if not exists idx_query_traces_review_state_created
|
| 10 |
+
on public.query_traces (review_state, created_at desc);
|
| 11 |
+
|
| 12 |
+
alter table public.answer_feedback
|
| 13 |
+
add column if not exists review_state text not null default 'pending',
|
| 14 |
+
add column if not exists review_notes text null,
|
| 15 |
+
add column if not exists reviewed_at timestamptz null,
|
| 16 |
+
add column if not exists reviewed_by text null,
|
| 17 |
+
add column if not exists promoted_at timestamptz null;
|
| 18 |
+
|
| 19 |
+
create index if not exists idx_answer_feedback_review_state_created
|
| 20 |
+
on public.answer_feedback (review_state, created_at desc);
|
| 21 |
+
|
| 22 |
+
create table if not exists public.evaluation_datasets (
|
| 23 |
+
id bigint generated by default as identity primary key,
|
| 24 |
+
trace_id uuid unique null references public.query_traces(trace_id) on delete set null,
|
| 25 |
+
source text not null default 'feedback_trace',
|
| 26 |
+
question text not null,
|
| 27 |
+
gold_context_refs jsonb not null default '[]'::jsonb,
|
| 28 |
+
gold_evidence_text text null,
|
| 29 |
+
is_answerable boolean not null default true,
|
| 30 |
+
failure_modes jsonb not null default '[]'::jsonb,
|
| 31 |
+
doc_diagnostics jsonb not null default '[]'::jsonb,
|
| 32 |
+
reason_code text null,
|
| 33 |
+
is_active boolean not null default false,
|
| 34 |
+
created_at timestamptz not null default timezone('utc', now())
|
| 35 |
+
);
|
| 36 |
+
|
| 37 |
+
create index if not exists idx_evaluation_datasets_active_created
|
| 38 |
+
on public.evaluation_datasets (is_active, created_at desc);
|
supabase/migrations/0012_lock_down_evaluation_datasets.sql
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Migration 0012: restrict evaluation_datasets to backend-only access.
-- Enabling RLS with no policies, plus revoking grants, closes PostgREST access
-- for both anon and authenticated roles; only service-role paths remain.
alter table public.evaluation_datasets
  enable row level security;

revoke all on public.evaluation_datasets from anon, authenticated;

-- Drop any per-user policies left over from earlier migrations.
drop policy if exists evaluation_datasets_select_own on public.evaluation_datasets;
drop policy if exists evaluation_datasets_insert_own on public.evaluation_datasets;
drop policy if exists evaluation_datasets_update_own on public.evaluation_datasets;
drop policy if exists evaluation_datasets_delete_own on public.evaluation_datasets;

-- evaluation_datasets is an internal curation/evaluation table.
-- The app reads/writes it via service-role admin/eval paths only.
-- With RLS enabled and no anon/authenticated policies, normal clients cannot
-- access it through PostgREST even though it lives in the public schema.
|
supabase/migrations/0013_backend_owned_retrieval_hardening.sql
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Migration 0013: backend-owned retrieval hardening
|
| 2 |
+
--
|
| 3 |
+
-- Goals:
|
| 4 |
+
-- 1. Add a bulk chunk insert RPC for ingestion throughput.
|
| 5 |
+
-- 2. Move retrieval/memory RPCs to explicit user_id scoping so the backend can
|
| 6 |
+
-- call them with the service role instead of relying on browser RLS.
|
| 7 |
+
-- 3. Lock internal telemetry/eval tables down to backend-only access.
|
| 8 |
+
|
| 9 |
+
-- Bulk upsert of document chunks from a JSONB array.
-- Each array element must carry: id, content, metadata, embedding, user_id,
-- node_type, parent_node_id, node_level. SECURITY DEFINER with an empty
-- search_path: callers cannot hijack unqualified names, and the backend
-- (service role) is expected to have validated user_id ownership upstream.
CREATE OR REPLACE FUNCTION public.insert_document_chunks_batch(
  p_rows jsonb
) RETURNS void
LANGUAGE plpgsql
SECURITY DEFINER
SET search_path = ''
AS $$
BEGIN
  -- Silently no-op on NULL or non-array payloads rather than erroring.
  IF p_rows IS NULL OR jsonb_typeof(p_rows) <> 'array' THEN
    RETURN;
  END IF;

  INSERT INTO public.documents (
    id,
    content,
    metadata,
    embedding,
    user_id,
    node_type,
    parent_node_id,
    node_level
  )
  SELECT
    (row->>'id')::uuid,
    row->>'content',
    COALESCE(row->'metadata', '{}'::jsonb),
    -- Embedding arrives as a JSON array; the text cast into the pgvector
    -- type relies on vector's text input format accepting '[...]'.
    (row->'embedding')::text::extensions.vector,
    (row->>'user_id')::uuid,
    COALESCE(NULLIF(row->>'node_type', ''), 'leaf'),
    NULLIF(row->>'parent_node_id', '')::uuid,
    COALESCE(NULLIF(row->>'node_level', '')::integer, 0)
  FROM jsonb_array_elements(p_rows) AS row
  -- Idempotent re-ingestion: re-running a batch overwrites existing rows.
  ON CONFLICT (id) DO UPDATE
  SET content = EXCLUDED.content,
      metadata = EXCLUDED.metadata,
      embedding = EXCLUDED.embedding,
      user_id = EXCLUDED.user_id,
      node_type = EXCLUDED.node_type,
      parent_node_id = EXCLUDED.parent_node_id,
      node_level = EXCLUDED.node_level;
END;
$$;
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
-- Hybrid retrieval: blends cosine-similarity vector search with normalized
-- full-text (ts_rank) keyword search over public.documents.
-- p_user_id scopes rows to one tenant; NULL disables scoping (backend calls
-- this with the service role, so scoping here replaces browser-side RLS).
-- The halfvec(2048) casts assume 2048-dimensional embeddings stored in
-- documents.embedding.
CREATE OR REPLACE FUNCTION public.hybrid_search(
  query_text text,
  query_embedding extensions.vector,
  match_count integer DEFAULT 10,
  filter jsonb DEFAULT '{}'::jsonb,
  semantic_weight double precision DEFAULT 0.7,
  keyword_weight double precision DEFAULT 0.3,
  p_user_id uuid DEFAULT NULL::uuid
) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
LANGUAGE plpgsql
SET search_path = ''
AS $$
BEGIN
  RETURN QUERY
  WITH
  -- Top vector matches by cosine distance; over-fetch 3x so the final
  -- blended ranking has enough candidates from each arm.
  semantic AS (
    SELECT
      d.id,
      d.content,
      d.metadata,
      (
        1 - (
          d.embedding::extensions.halfvec(2048)
          OPERATOR(extensions.<=>)
          query_embedding::extensions.halfvec(2048)
        )
      )::float AS score
    FROM public.documents AS d
    WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
      AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
    ORDER BY d.embedding::extensions.halfvec(2048)
      OPERATOR(extensions.<=>)
      query_embedding::extensions.halfvec(2048)
    LIMIT match_count * 3
  ),
  -- Top keyword matches by raw ts_rank (English config).
  keyword AS (
    SELECT
      d.id,
      d.content,
      d.metadata,
      pg_catalog.ts_rank(
        pg_catalog.to_tsvector('english', d.content),
        pg_catalog.plainto_tsquery('english', query_text)
      )::float AS raw_score
    FROM public.documents AS d
    WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
      AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
      AND pg_catalog.to_tsvector('english', d.content)
        @@ pg_catalog.plainto_tsquery('english', query_text)
    ORDER BY raw_score DESC
    LIMIT match_count * 3
  ),
  -- Normalize keyword scores to [0, 1] so they blend on the same scale as
  -- cosine similarity; guard against a zero max to avoid division by zero.
  keyword_norm AS (
    SELECT
      k.id,
      k.content,
      k.metadata,
      CASE
        WHEN max(k.raw_score) OVER () = 0 THEN 0::float
        ELSE (k.raw_score / max(k.raw_score) OVER ())::float
      END AS score
    FROM keyword AS k
  ),
  -- FULL OUTER JOIN keeps rows found by only one arm; the missing arm
  -- contributes 0 to the weighted blend.
  blended AS (
    SELECT
      COALESCE(s.id, kn.id) AS id,
      COALESCE(s.content, kn.content) AS content,
      COALESCE(s.metadata, kn.metadata) AS metadata,
      (
        COALESCE(s.score, 0::float) * semantic_weight +
        COALESCE(kn.score, 0::float) * keyword_weight
      ) AS combined_score
    FROM semantic AS s
    FULL OUTER JOIN keyword_norm AS kn ON s.id = kn.id
  )
  SELECT
    b.id,
    b.content,
    b.metadata,
    b.combined_score
  FROM blended AS b
  ORDER BY b.combined_score DESC
  LIMIT match_count;
END;
$$;
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
-- Pure vector search over public.documents using cosine distance on a
-- halfvec(2048) cast of the stored embeddings. Optionally scoped to a
-- single user (p_user_id) and to rows whose metadata jsonb contains the
-- supplied filter object. Returns the match_count nearest rows with their
-- cosine similarity (1 - distance).
CREATE OR REPLACE FUNCTION public.match_documents(
    query_embedding extensions.vector,
    match_count integer DEFAULT 5,
    filter jsonb DEFAULT '{}'::jsonb,
    p_user_id uuid DEFAULT NULL::uuid
) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
    LANGUAGE plpgsql
    SET search_path = ''
    AS $$
BEGIN
    RETURN QUERY
    SELECT
        doc.id,
        doc.content,
        doc.metadata,
        -- cosine similarity = 1 - cosine distance
        (
            1 - (
                doc.embedding::extensions.halfvec(2048)
                OPERATOR(extensions.<=>)
                query_embedding::extensions.halfvec(2048)
            )
        )::float AS similarity
    FROM public.documents AS doc
    WHERE (p_user_id IS NULL OR doc.user_id = p_user_id)
      AND (filter = '{}'::jsonb OR doc.metadata @> filter::jsonb)
    ORDER BY doc.embedding::extensions.halfvec(2048)
        OPERATOR(extensions.<=>)
        query_embedding::extensions.halfvec(2048)
    LIMIT match_count;
END;
$$;
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
-- Semantic recall over per-session chat history (public.chat_memory).
-- Returns the match_count messages of match_session_id closest to
-- query_embedding by cosine distance, optionally restricted to rows owned
-- by p_user_id when it is non-NULL.
CREATE OR REPLACE FUNCTION public.match_memory(
    query_embedding extensions.vector,
    match_session_id text,
    match_count integer DEFAULT 4,
    p_user_id uuid DEFAULT NULL::uuid
) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
    LANGUAGE plpgsql
    SET search_path = ''
    AS $$
BEGIN
    RETURN QUERY
    SELECT
        mem.id,
        mem.role,
        mem.content,
        -- cosine similarity = 1 - cosine distance
        1 - (mem.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
    FROM public.chat_memory AS mem
    WHERE mem.session_id = match_session_id
      AND (p_user_id IS NULL OR mem.user_id = p_user_id)
    ORDER BY mem.embedding OPERATOR(extensions.<=>) query_embedding
    LIMIT match_count;
END;
$$;
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
-- Lock down observability / feedback / graph tables: enable RLS, revoke all
-- direct access from the anon and authenticated roles, and drop the legacy
-- per-user policies. With no policies left and grants revoked, only
-- privileged roles (e.g. the backend's service role, which bypasses RLS)
-- can reach these tables.
-- Each table is wrapped in a to_regclass guard so the migration is a no-op
-- on databases where an optional table was never created.
DO $$
BEGIN
    IF to_regclass('public.query_traces') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.query_traces ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.query_traces FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS query_traces_select_own ON public.query_traces';
        EXECUTE 'DROP POLICY IF EXISTS query_traces_insert_own ON public.query_traces';
    END IF;

    IF to_regclass('public.answer_feedback') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.answer_feedback ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.answer_feedback FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS answer_feedback_select_own ON public.answer_feedback';
        EXECUTE 'DROP POLICY IF EXISTS answer_feedback_insert_own ON public.answer_feedback';
    END IF;

    IF to_regclass('public.evaluation_logs') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.evaluation_logs FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS evaluation_logs_insert_own ON public.evaluation_logs';
        EXECUTE 'DROP POLICY IF EXISTS evaluation_logs_select_own ON public.evaluation_logs';
    END IF;

    IF to_regclass('public.intent_feedback') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.intent_feedback FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS intent_feedback_select_own ON public.intent_feedback';
        EXECUTE 'DROP POLICY IF EXISTS intent_feedback_insert_own ON public.intent_feedback';
    END IF;

    -- rerank_feedback only ever had a select policy; nothing else to drop.
    IF to_regclass('public.rerank_feedback') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.rerank_feedback FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS rerank_feedback_select_own ON public.rerank_feedback';
    END IF;

    IF to_regclass('public.graph_nodes') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.graph_nodes ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.graph_nodes FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS graph_nodes_select_own ON public.graph_nodes';
    END IF;

    IF to_regclass('public.graph_edges') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.graph_edges ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.graph_edges FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS graph_edges_select_own ON public.graph_edges';
    END IF;

    -- NOTE(review): legacy centroids_* policies are NOT dropped here;
    -- migration 0014 handles that cleanup.
    IF to_regclass('public.category_centroids') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.category_centroids FROM anon, authenticated';
    END IF;

    IF to_regclass('public.ingestion_retry_logs') IS NOT NULL THEN
        EXECUTE 'ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.ingestion_retry_logs FROM anon, authenticated';
        EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_select_own ON public.ingestion_retry_logs';
        EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_insert_own ON public.ingestion_retry_logs';
        EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_update_own ON public.ingestion_retry_logs';
        EXECUTE 'DROP POLICY IF EXISTS ingestion_retry_logs_delete_own ON public.ingestion_retry_logs';
    END IF;
END;
$$;
|
supabase/migrations/0014_drop_legacy_category_centroid_policies.sql
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Migration 0014: drop legacy category_centroids user-facing RLS policies
--
-- 0013 moved centroid access to backend-owned service-role calls with
-- explicit user_id filtering, but it did not remove the older auth.uid()
-- policies. Those stale policies keep Security Advisor warning about
-- anonymous access on public.category_centroids and also keep schema dumps
-- out of sync with the intended access model.

DO $$
BEGIN
    -- Guarded so this is a no-op on databases without the table.
    IF to_regclass('public.category_centroids') IS NOT NULL THEN
        -- Re-asserting RLS + revoke is idempotent; kept so the migration is
        -- complete on its own even if 0013 was skipped.
        EXECUTE 'ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY';
        EXECUTE 'REVOKE ALL ON TABLE public.category_centroids FROM anon, authenticated';
        -- Drop all four legacy per-user CRUD policies.
        EXECUTE 'DROP POLICY IF EXISTS centroids_select_own ON public.category_centroids';
        EXECUTE 'DROP POLICY IF EXISTS centroids_insert_own ON public.category_centroids';
        EXECUTE 'DROP POLICY IF EXISTS centroids_update_own ON public.category_centroids';
        EXECUTE 'DROP POLICY IF EXISTS centroids_delete_own ON public.category_centroids';
    END IF;
END
$$;
|
supabase/migrations/0015_ingested_file_identity_json.sql
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Migration 0015: per-file identity metadata on ingested_files.
-- Presumably stores document identity info captured during ingestion
-- (TODO: confirm against the backend ingestion pipeline). Defaults to an
-- empty object so existing rows satisfy the NOT NULL constraint.
alter table public.ingested_files
    add column if not exists identity_json jsonb not null default '{}'::jsonb;
|
supabase/migrations/0016_ingestion_file_hash_checkpoints.sql
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Migration 0016: associate ingestion retry-log events with a source file
-- (per the migration name, used for file-hash checkpointing). Nullable so
-- pre-existing rows need no backfill.
ALTER TABLE public.ingestion_retry_logs
    ADD COLUMN IF NOT EXISTS file_hash text;

-- Supports newest-first lookups of a given event type for one user + file.
CREATE INDEX IF NOT EXISTS ingestion_retry_logs_user_file_event_idx
    ON public.ingestion_retry_logs (user_id, file_hash, event_type, created_at DESC);
|
supabase/schema_backup.before_0013.sql
ADDED
|
File without changes
|
supabase/schema_backup.sql
CHANGED
|
@@ -1,74 +1,60 @@
|
|
| 1 |
--
|
| 2 |
-- PostgreSQL database dump
|
| 3 |
--
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
SET
|
| 11 |
-
SET
|
| 12 |
-
SET
|
| 13 |
-
SET
|
| 14 |
-
|
| 15 |
-
SET
|
| 16 |
-
|
| 17 |
-
SET
|
| 18 |
-
SET
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
--
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
--
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
--
|
| 37 |
-
|
| 38 |
-
--
|
| 39 |
-
|
| 40 |
-
-- AS $$
|
| 41 |
-
-- begin
|
| 42 |
-
-- -- Fire-and-forget: refresh in background via pg_notify
|
| 43 |
-
-- -- (avoids blocking the INSERT transaction itself)
|
| 44 |
-
-- perform pg_notify('refresh_mv', 'document_types');
|
| 45 |
-
-- return new;
|
| 46 |
-
-- end;
|
| 47 |
-
-- $$;
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
--
|
| 51 |
-
-- Name: _trg_set_updated_at(); Type: FUNCTION; Schema: public; Owner: -
|
| 52 |
-
--
|
| 53 |
-
|
| 54 |
CREATE FUNCTION public._trg_set_updated_at() RETURNS trigger
|
| 55 |
LANGUAGE plpgsql
|
| 56 |
-
SET search_path
|
| 57 |
-
AS $$
|
| 58 |
-
begin
|
| 59 |
new.updated_at = pg_catalog.now();
|
| 60 |
-
return new;
|
| 61 |
-
end;
|
| 62 |
-
$$;
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
--
|
| 66 |
-
-- Name: get_document_types(); Type: FUNCTION; Schema: public; Owner: -
|
| 67 |
-
--
|
| 68 |
-
|
| 69 |
CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
|
| 70 |
LANGUAGE sql STABLE
|
| 71 |
-
SET search_path
|
| 72 |
AS $$
|
| 73 |
select distinct f.document_type
|
| 74 |
from public.ingested_files as f
|
|
@@ -76,23 +62,25 @@ CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
|
|
| 76 |
and f.document_type is not null
|
| 77 |
and f.document_type <> 'unknown'
|
| 78 |
order by f.document_type;
|
| 79 |
-
$$;
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
--
|
| 83 |
-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
|
| 84 |
-
--
|
| 85 |
-
|
| 86 |
CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
|
| 87 |
LANGUAGE plpgsql
|
| 88 |
-
SET search_path
|
| 89 |
AS $$
|
| 90 |
-
begin
|
| 91 |
-
return query
|
| 92 |
-
with
|
| 93 |
-
semantic as (
|
| 94 |
-
select
|
| 95 |
-
d.id,
|
|
|
|
|
|
|
| 96 |
(
|
| 97 |
1 - (
|
| 98 |
d.embedding::extensions.halfvec(2048)
|
|
@@ -101,937 +89,1390 @@ begin
|
|
| 101 |
)
|
| 102 |
)::float as score
|
| 103 |
from public.documents d
|
| 104 |
-
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 105 |
order by d.embedding::extensions.halfvec(2048)
|
| 106 |
OPERATOR(extensions.<=>)
|
| 107 |
query_embedding::extensions.halfvec(2048)
|
| 108 |
-
limit match_count * 3
|
| 109 |
-
),
|
| 110 |
-
keyword as (
|
| 111 |
-
select
|
| 112 |
-
d.id,
|
|
|
|
|
|
|
| 113 |
pg_catalog.ts_rank(
|
| 114 |
pg_catalog.to_tsvector('english', d.content),
|
| 115 |
pg_catalog.plainto_tsquery('english', query_text)
|
| 116 |
-
)::float as raw_score
|
| 117 |
from public.documents d
|
| 118 |
-
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 119 |
and pg_catalog.to_tsvector('english', d.content) @@ pg_catalog.plainto_tsquery('english', query_text)
|
| 120 |
-
order by raw_score desc
|
| 121 |
-
limit match_count * 3
|
| 122 |
-
),
|
| 123 |
-
keyword_norm as (
|
| 124 |
-
select
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
(
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
|
| 153 |
-
--
|
| 154 |
-
|
| 155 |
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
|
| 156 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 157 |
-
SET search_path
|
| 158 |
AS $$
|
| 159 |
-
BEGIN
|
| 160 |
INSERT INTO public.documents (id, content, metadata, embedding, user_id)
|
| 161 |
-
VALUES (p_id, p_content, p_metadata, p_embedding, p_user_id)
|
| 162 |
-
ON CONFLICT (id) DO UPDATE
|
| 163 |
-
SET content = EXCLUDED.content,
|
| 164 |
-
metadata = EXCLUDED.metadata,
|
| 165 |
-
embedding = EXCLUDED.embedding;
|
| 166 |
-
END;
|
| 167 |
-
$$;
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
--
|
| 171 |
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
|
| 172 |
-
--
|
| 173 |
-
|
| 174 |
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
|
| 175 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 176 |
-
SET search_path
|
| 177 |
AS $$
|
| 178 |
-
BEGIN
|
| 179 |
INSERT INTO public.documents (
|
| 180 |
-
id, content, metadata, embedding, user_id,
|
| 181 |
-
node_type, parent_node_id, node_level
|
| 182 |
-
)
|
| 183 |
-
VALUES (
|
| 184 |
-
p_id, p_content, p_metadata, p_embedding, p_user_id,
|
| 185 |
-
p_node_type, p_parent_node_id, p_node_level
|
| 186 |
-
)
|
| 187 |
-
ON CONFLICT (id) DO UPDATE
|
| 188 |
-
SET content = EXCLUDED.content,
|
| 189 |
-
metadata = EXCLUDED.metadata,
|
| 190 |
-
embedding = EXCLUDED.embedding,
|
| 191 |
-
node_type = EXCLUDED.node_type,
|
| 192 |
-
parent_node_id = EXCLUDED.parent_node_id,
|
| 193 |
-
node_level = EXCLUDED.node_level;
|
| 194 |
-
END;
|
| 195 |
-
$$;
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
-- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
|
| 200 |
-
--
|
| 201 |
-
|
| 202 |
CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
|
| 203 |
LANGUAGE plpgsql
|
| 204 |
-
SET search_path
|
| 205 |
AS $$
|
| 206 |
-
begin
|
| 207 |
-
return query
|
| 208 |
-
select
|
| 209 |
-
d.id,
|
| 210 |
-
d.content,
|
| 211 |
-
d.metadata,
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
from public.documents d
|
| 220 |
-
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
limit match_count;
|
| 225 |
-
end;
|
| 226 |
-
$$;
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
-- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
|
| 231 |
-
--
|
| 232 |
-
|
| 233 |
CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
|
| 234 |
LANGUAGE plpgsql
|
| 235 |
-
SET search_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
AS $$
|
| 237 |
-
BEGIN
|
| 238 |
-
RETURN QUERY
|
| 239 |
-
SELECT
|
| 240 |
cm.id,
|
| 241 |
cm.role,
|
| 242 |
cm.content,
|
| 243 |
1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
|
| 244 |
FROM public.chat_memory AS cm
|
| 245 |
WHERE cm.session_id = match_session_id
|
|
|
|
| 246 |
ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
|
| 247 |
-
LIMIT match_count;
|
| 248 |
-
END;
|
| 249 |
-
$$;
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
--
|
| 253 |
-
-- Name:
|
| 254 |
-
--
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
embedding extensions.vector(2048),
|
| 294 |
-
created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
|
| 295 |
-
user_id uuid DEFAULT auth.uid()
|
| 296 |
-
);
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
--
|
| 300 |
-
-- Name: document_trees; Type: TABLE; Schema: public; Owner: -
|
| 301 |
-
--
|
| 302 |
-
|
| 303 |
-
CREATE TABLE public.document_trees (
|
| 304 |
-
file_hash text NOT NULL,
|
| 305 |
-
user_id uuid NOT NULL,
|
| 306 |
-
tree_json jsonb NOT NULL,
|
| 307 |
-
created_at timestamp with time zone DEFAULT timezone('utc'::text, now())
|
| 308 |
-
);
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
--
|
| 312 |
-
-- Name: documents; Type: TABLE; Schema: public; Owner: -
|
| 313 |
-
--
|
| 314 |
-
|
| 315 |
-
CREATE TABLE public.documents (
|
| 316 |
-
id uuid DEFAULT gen_random_uuid() NOT NULL,
|
| 317 |
-
content text,
|
| 318 |
-
metadata jsonb,
|
| 319 |
embedding extensions.vector(2048),
|
| 320 |
-
user_id uuid DEFAULT auth.uid(),
|
| 321 |
-
node_type text DEFAULT 'leaf'::text,
|
| 322 |
-
parent_node_id uuid,
|
| 323 |
-
node_level integer DEFAULT 0
|
| 324 |
-
);
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
--
|
| 328 |
-
-- Name:
|
| 329 |
-
--
|
| 330 |
-
|
| 331 |
-
CREATE TABLE public.evaluation_logs (
|
| 332 |
-
id uuid DEFAULT gen_random_uuid() NOT NULL,
|
| 333 |
-
run_label text,
|
| 334 |
-
evaluated_at timestamp with time zone,
|
| 335 |
-
alpha double precision,
|
| 336 |
-
k integer,
|
| 337 |
-
question text,
|
| 338 |
-
is_answerable boolean,
|
| 339 |
-
precision_at_k double precision,
|
| 340 |
-
faithfulness_proxy double precision,
|
| 341 |
-
relevance_proxy double precision,
|
| 342 |
-
local_reward double precision,
|
| 343 |
-
llm_judge_score double precision,
|
| 344 |
-
judge_a_verdict boolean,
|
| 345 |
-
judge_b_verdict boolean,
|
| 346 |
-
judge_a_model text,
|
| 347 |
-
judge_b_model text,
|
| 348 |
-
calibration_score double precision,
|
| 349 |
-
final_score double precision,
|
| 350 |
-
requires_manual_review boolean DEFAULT false,
|
| 351 |
-
disagreement_note text DEFAULT ''::text,
|
| 352 |
-
user_id uuid
|
| 353 |
-
);
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
--
|
| 357 |
-
-- Name: ingested_files; Type: TABLE; Schema: public; Owner: -
|
| 358 |
-
--
|
| 359 |
-
|
| 360 |
-
CREATE TABLE public.ingested_files (
|
| 361 |
-
id uuid DEFAULT gen_random_uuid() NOT NULL,
|
| 362 |
-
file_hash text NOT NULL,
|
| 363 |
-
filename text NOT NULL,
|
| 364 |
-
document_type text,
|
| 365 |
-
chunk_count integer DEFAULT 0,
|
| 366 |
-
ingested_at timestamp with time zone DEFAULT now(),
|
| 367 |
-
user_id uuid DEFAULT auth.uid(),
|
| 368 |
-
user_overridden boolean DEFAULT false
|
| 369 |
-
);
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
--
|
| 373 |
-
-- Name: ingestion_retry_logs; Type: TABLE; Schema: public; Owner: -
|
| 374 |
-
--
|
| 375 |
-
|
| 376 |
-
CREATE TABLE public.ingestion_retry_logs (
|
| 377 |
-
id bigint NOT NULL,
|
| 378 |
-
created_at timestamp with time zone DEFAULT now() NOT NULL,
|
| 379 |
-
user_id uuid,
|
| 380 |
-
batch_num integer NOT NULL,
|
| 381 |
-
total_batches integer NOT NULL,
|
| 382 |
-
attempt integer NOT NULL,
|
| 383 |
-
event_type text NOT NULL,
|
| 384 |
-
message text,
|
| 385 |
-
sleep_s double precision DEFAULT 0
|
| 386 |
-
);
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
--
|
| 390 |
-
-- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE; Schema: public; Owner: -
|
| 391 |
-
--
|
| 392 |
-
|
| 393 |
-
CREATE SEQUENCE public.ingestion_retry_logs_id_seq
|
| 394 |
-
START WITH 1
|
| 395 |
-
INCREMENT BY 1
|
| 396 |
-
NO MINVALUE
|
| 397 |
-
NO MAXVALUE
|
| 398 |
-
CACHE 1;
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
--
|
| 402 |
-
-- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
|
| 403 |
-
--
|
| 404 |
-
|
| 405 |
-
ALTER SEQUENCE public.ingestion_retry_logs_id_seq OWNED BY public.ingestion_retry_logs.id;
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
--
|
| 409 |
-
-- Name: intent_feedback; Type: TABLE; Schema: public; Owner: -
|
| 410 |
-
--
|
| 411 |
-
|
| 412 |
-
CREATE TABLE public.intent_feedback (
|
| 413 |
-
id bigint NOT NULL,
|
| 414 |
-
user_id uuid,
|
| 415 |
-
query text NOT NULL,
|
| 416 |
-
has_category boolean DEFAULT false NOT NULL,
|
| 417 |
-
has_history boolean DEFAULT false NOT NULL,
|
| 418 |
-
label integer NOT NULL,
|
| 419 |
-
created_at timestamp with time zone DEFAULT now() NOT NULL,
|
| 420 |
-
CONSTRAINT intent_feedback_label_check CHECK ((label = ANY (ARRAY[0, 1])))
|
| 421 |
-
);
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
--
|
| 425 |
-
-- Name: intent_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
|
| 426 |
-
--
|
| 427 |
-
|
| 428 |
-
CREATE SEQUENCE public.intent_feedback_id_seq
|
| 429 |
-
START WITH 1
|
| 430 |
-
INCREMENT BY 1
|
| 431 |
-
NO MINVALUE
|
| 432 |
-
NO MAXVALUE
|
| 433 |
-
CACHE 1;
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
--
|
| 437 |
-
-- Name: intent_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
|
| 438 |
-
--
|
| 439 |
-
|
| 440 |
-
ALTER SEQUENCE public.intent_feedback_id_seq OWNED BY public.intent_feedback.id;
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
--
|
| 444 |
-
-- Name: mv_document_types; Type: MATERIALIZED VIEW; Schema: public; Owner: -
|
| 445 |
-
--
|
| 446 |
-
|
| 447 |
-
-- CREATE MATERIALIZED VIEW public.mv_document_types AS
|
| 448 |
-
-- SELECT DISTINCT (metadata ->> 'document_type'::text) AS document_type
|
| 449 |
-
-- FROM public.documents
|
| 450 |
-
-- WHERE (((metadata ->> 'document_type'::text) IS NOT NULL) AND ((metadata ->> 'document_type'::text) <> 'unknown'::text))
|
| 451 |
-
-- ORDER BY (metadata ->> 'document_type'::text)
|
| 452 |
-
-- WITH NO DATA;
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
--
|
| 456 |
-
-- Name: rerank_feedback; Type: TABLE; Schema: public; Owner: -
|
| 457 |
-
--
|
| 458 |
-
|
| 459 |
-
CREATE TABLE public.rerank_feedback (
|
| 460 |
-
id bigint NOT NULL,
|
| 461 |
-
user_id uuid,
|
| 462 |
-
query_hash text NOT NULL,
|
| 463 |
-
chunk_id uuid,
|
| 464 |
-
chunk_hash text NOT NULL,
|
| 465 |
-
document_type text,
|
| 466 |
-
cohere_score real NOT NULL,
|
| 467 |
-
was_selected boolean NOT NULL,
|
| 468 |
-
created_at timestamp with time zone DEFAULT now() NOT NULL,
|
| 469 |
-
query_text text,
|
| 470 |
-
chunk_text text
|
| 471 |
-
);
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
--
|
| 475 |
-
-- Name: rerank_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
|
| 476 |
-
--
|
| 477 |
-
|
| 478 |
-
CREATE SEQUENCE public.rerank_feedback_id_seq
|
| 479 |
-
START WITH 1
|
| 480 |
-
INCREMENT BY 1
|
| 481 |
-
NO MINVALUE
|
| 482 |
-
NO MAXVALUE
|
| 483 |
-
CACHE 1;
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
--
|
| 487 |
-
-- Name: rerank_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
|
| 488 |
-
--
|
| 489 |
-
|
| 490 |
-
ALTER SEQUENCE public.rerank_feedback_id_seq OWNED BY public.rerank_feedback.id;
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
--
|
| 494 |
-
-- Name: ingestion_retry_logs id; Type: DEFAULT; Schema: public; Owner: -
|
| 495 |
-
--
|
| 496 |
-
|
| 497 |
-
ALTER TABLE ONLY public.ingestion_retry_logs ALTER COLUMN id SET DEFAULT nextval('public.ingestion_retry_logs_id_seq'::regclass);
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
--
|
| 501 |
-
-- Name: intent_feedback id; Type: DEFAULT; Schema: public; Owner: -
|
| 502 |
-
--
|
| 503 |
-
|
| 504 |
-
ALTER TABLE ONLY public.intent_feedback ALTER COLUMN id SET DEFAULT nextval('public.intent_feedback_id_seq'::regclass);
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
--
|
| 508 |
-
-- Name: rerank_feedback id; Type: DEFAULT; Schema: public; Owner: -
|
| 509 |
-
--
|
| 510 |
-
|
| 511 |
-
ALTER TABLE ONLY public.rerank_feedback ALTER COLUMN id SET DEFAULT nextval('public.rerank_feedback_id_seq'::regclass);
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
--
|
| 515 |
-
-- Name: category_centroids category_centroids_document_type_key; Type: CONSTRAINT; Schema: public; Owner: -
|
| 516 |
-
--
|
| 517 |
-
|
| 518 |
-
ALTER TABLE ONLY public.category_centroids
|
| 519 |
-
ADD CONSTRAINT category_centroids_document_type_key UNIQUE (document_type);
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
--
|
| 523 |
-
-- Name: category_centroids category_centroids_pkey; Type: CONSTRAINT; Schema: public; Owner: -
|
| 524 |
-
--
|
| 525 |
-
|
| 526 |
-
ALTER TABLE ONLY public.category_centroids
|
| 527 |
-
ADD CONSTRAINT category_centroids_pkey PRIMARY KEY (id);
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
--
|
| 531 |
-
-- Name: chat_memory chat_memory_pkey; Type: CONSTRAINT; Schema: public; Owner: -
|
| 532 |
-
--
|
| 533 |
-
|
| 534 |
-
ALTER TABLE ONLY public.chat_memory
|
| 535 |
-
ADD CONSTRAINT chat_memory_pkey PRIMARY KEY (id);
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
--
|
| 539 |
-
-- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
|
| 540 |
--
|
| 541 |
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
--
|
| 563 |
-
-- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
|
| 564 |
--
|
| 565 |
|
| 566 |
-
ALTER TABLE
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
ADD CONSTRAINT ingested_files_pkey PRIMARY KEY (id);
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
--
|
| 579 |
-
-- Name: ingestion_retry_logs ingestion_retry_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
|
| 580 |
-
--
|
| 581 |
-
|
| 582 |
-
ALTER TABLE ONLY public.ingestion_retry_logs
|
| 583 |
-
ADD CONSTRAINT ingestion_retry_logs_pkey PRIMARY KEY (id);
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
--
|
| 587 |
-
-- Name: intent_feedback intent_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
|
| 588 |
-
--
|
| 589 |
-
|
| 590 |
-
ALTER TABLE ONLY public.intent_feedback
|
| 591 |
-
ADD CONSTRAINT intent_feedback_pkey PRIMARY KEY (id);
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
--
|
| 595 |
-
-- Name: rerank_feedback rerank_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
|
| 596 |
-
--
|
| 597 |
-
|
| 598 |
-
ALTER TABLE ONLY public.rerank_feedback
|
| 599 |
-
ADD CONSTRAINT rerank_feedback_pkey PRIMARY KEY (id);
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
--
|
| 603 |
-
-- Name: category_centroids_type_idx; Type: INDEX; Schema: public; Owner: -
|
| 604 |
-
--
|
| 605 |
-
|
| 606 |
-
CREATE INDEX category_centroids_type_idx ON public.category_centroids USING btree (document_type);
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
--
|
| 610 |
-
-- Name: category_centroids_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 611 |
-
--
|
| 612 |
-
|
| 613 |
-
CREATE INDEX category_centroids_user_id_idx ON public.category_centroids USING btree (user_id);
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
--
|
| 617 |
-
-- Name: category_centroids_user_type_uidx; Type: INDEX; Schema: public; Owner: -
|
| 618 |
-
--
|
| 619 |
-
|
| 620 |
-
CREATE UNIQUE INDEX category_centroids_user_type_uidx ON public.category_centroids USING btree (user_id, document_type);
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
--
|
| 624 |
-
-- Name: chat_memory_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 625 |
-
--
|
| 626 |
-
|
| 627 |
-
CREATE INDEX chat_memory_user_id_idx ON public.chat_memory USING btree (user_id);
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
--
|
| 631 |
-
-- Name: doc_node_type_idx; Type: INDEX; Schema: public; Owner: -
|
| 632 |
-
--
|
| 633 |
-
|
| 634 |
-
CREATE INDEX doc_node_type_idx ON public.documents USING btree (node_type);
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
--
|
| 638 |
-
-- Name: documents_content_fts_idx; Type: INDEX; Schema: public; Owner: -
|
| 639 |
-
--
|
| 640 |
-
|
| 641 |
-
CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvector('english'::regconfig, content));
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
--
|
| 645 |
-
-- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
|
| 646 |
-
--
|
| 647 |
-
|
| 648 |
-
CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
--
|
| 652 |
-
-- Name: documents_metadata_filehash_idx; Type: INDEX; Schema: public; Owner: -
|
| 653 |
-
--
|
| 654 |
-
|
| 655 |
-
CREATE INDEX documents_metadata_filehash_idx ON public.documents USING btree (((metadata ->> 'file_hash'::text)));
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
--
|
| 659 |
-
-- Name: documents_metadata_idx; Type: INDEX; Schema: public; Owner: -
|
| 660 |
-
--
|
| 661 |
-
|
| 662 |
-
CREATE INDEX documents_metadata_idx ON public.documents USING gin (metadata);
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
--
|
| 666 |
-
-- Name: documents_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 667 |
-
--
|
| 668 |
-
|
| 669 |
-
CREATE INDEX documents_user_id_idx ON public.documents USING btree (user_id);
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
--
|
| 673 |
-
-- Name: evaluation_logs_evaluated_at_idx; Type: INDEX; Schema: public; Owner: -
|
| 674 |
-
--
|
| 675 |
-
|
| 676 |
-
CREATE INDEX evaluation_logs_evaluated_at_idx ON public.evaluation_logs USING btree (evaluated_at DESC);
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
--
|
| 680 |
-
-- Name: evaluation_logs_run_label_idx; Type: INDEX; Schema: public; Owner: -
|
| 681 |
-
--
|
| 682 |
-
|
| 683 |
-
CREATE INDEX evaluation_logs_run_label_idx ON public.evaluation_logs USING btree (run_label);
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
--
|
| 687 |
-
-- Name: idx_chat_memory_session; Type: INDEX; Schema: public; Owner: -
|
| 688 |
-
--
|
| 689 |
-
|
| 690 |
-
CREATE INDEX idx_chat_memory_session ON public.chat_memory USING btree (session_id);
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
--
|
| 694 |
-
-- Name: idx_document_trees_json; Type: INDEX; Schema: public; Owner: -
|
| 695 |
-
--
|
| 696 |
-
|
| 697 |
-
CREATE INDEX idx_document_trees_json ON public.document_trees USING gin (tree_json);
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
--
|
| 701 |
-
-- Name: ingested_files_hash_idx; Type: INDEX; Schema: public; Owner: -
|
| 702 |
-
--
|
| 703 |
-
|
| 704 |
-
CREATE INDEX ingested_files_hash_idx ON public.ingested_files USING btree (file_hash);
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
--
|
| 708 |
-
-- Name: ingested_files_user_file_hash_uidx; Type: INDEX; Schema: public; Owner: -
|
| 709 |
-
--
|
| 710 |
-
|
| 711 |
-
CREATE UNIQUE INDEX ingested_files_user_file_hash_uidx ON public.ingested_files USING btree (user_id, file_hash);
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
--
|
| 715 |
-
-- Name: ingested_files_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 716 |
-
--
|
| 717 |
-
|
| 718 |
-
CREATE INDEX ingested_files_user_id_idx ON public.ingested_files USING btree (user_id);
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
--
|
| 722 |
-
-- Name: ingestion_retry_logs_created_at_idx; Type: INDEX; Schema: public; Owner: -
|
| 723 |
-
--
|
| 724 |
-
|
| 725 |
-
CREATE INDEX ingestion_retry_logs_created_at_idx ON public.ingestion_retry_logs USING btree (created_at DESC);
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
--
|
| 729 |
-
-- Name: ingestion_retry_logs_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 730 |
-
--
|
| 731 |
-
|
| 732 |
-
CREATE INDEX ingestion_retry_logs_user_id_idx ON public.ingestion_retry_logs USING btree (user_id);
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
--
|
| 736 |
-
-- Name: intent_feedback_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 737 |
-
--
|
| 738 |
-
|
| 739 |
-
CREATE INDEX intent_feedback_user_id_idx ON public.intent_feedback USING btree (user_id);
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
--
|
| 743 |
-
-- Name: mv_document_types_idx; Type: INDEX; Schema: public; Owner: -
|
| 744 |
-
--
|
| 745 |
-
|
| 746 |
-
-- CREATE UNIQUE INDEX mv_document_types_idx ON public.mv_document_types USING btree (document_type);
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
--
|
| 750 |
-
-- Name: rerank_feedback_doc_type_idx; Type: INDEX; Schema: public; Owner: -
|
| 751 |
-
--
|
| 752 |
-
|
| 753 |
-
CREATE INDEX rerank_feedback_doc_type_idx ON public.rerank_feedback USING btree (document_type);
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
--
|
| 757 |
-
-- Name: rerank_feedback_user_created_idx; Type: INDEX; Schema: public; Owner: -
|
| 758 |
-
--
|
| 759 |
-
|
| 760 |
-
CREATE INDEX rerank_feedback_user_created_idx ON public.rerank_feedback USING btree (user_id, created_at DESC);
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
--
|
| 764 |
-
-- Name: category_centroids trg_centroids_updated_at; Type: TRIGGER; Schema: public; Owner: -
|
| 765 |
-
--
|
| 766 |
-
|
| 767 |
-
CREATE TRIGGER trg_centroids_updated_at BEFORE UPDATE ON public.category_centroids FOR EACH ROW EXECUTE FUNCTION public._trg_set_updated_at();
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
--
|
| 771 |
-
-- Name: documents trg_refresh_mv_document_types; Type: TRIGGER; Schema: public; Owner: -
|
| 772 |
-
--
|
| 773 |
-
|
| 774 |
-
-- CREATE TRIGGER trg_refresh_mv_document_types AFTER INSERT ON public.documents FOR EACH STATEMENT EXECUTE FUNCTION public._trg_refresh_mv_document_types();
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
--
|
| 778 |
-
-- Name: category_centroids; Type: ROW SECURITY; Schema: public; Owner: -
|
| 779 |
-
--
|
| 780 |
-
|
| 781 |
-
ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY;
|
| 782 |
-
|
| 783 |
-
--
|
| 784 |
-
-- Name: category_centroids centroids_delete_own; Type: POLICY; Schema: public; Owner: -
|
| 785 |
-
--
|
| 786 |
-
|
| 787 |
-
CREATE POLICY centroids_delete_own ON public.category_centroids FOR DELETE USING ((user_id = auth.uid()));
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
--
|
| 791 |
-
-- Name: category_centroids centroids_insert_own; Type: POLICY; Schema: public; Owner: -
|
| 792 |
-
--
|
| 793 |
-
|
| 794 |
-
CREATE POLICY centroids_insert_own ON public.category_centroids FOR INSERT WITH CHECK ((user_id = auth.uid()));
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
--
|
| 798 |
-
-- Name: category_centroids centroids_select_own; Type: POLICY; Schema: public; Owner: -
|
| 799 |
-
--
|
| 800 |
-
|
| 801 |
-
CREATE POLICY centroids_select_own ON public.category_centroids FOR SELECT USING ((user_id = auth.uid()));
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
--
|
| 805 |
-
-- Name: category_centroids centroids_update_own; Type: POLICY; Schema: public; Owner: -
|
| 806 |
-
--
|
| 807 |
-
|
| 808 |
-
CREATE POLICY centroids_update_own ON public.category_centroids FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
--
|
| 812 |
-
-- Name: chat_memory; Type: ROW SECURITY; Schema: public; Owner: -
|
| 813 |
-
--
|
| 814 |
-
|
| 815 |
-
ALTER TABLE public.chat_memory ENABLE ROW LEVEL SECURITY;
|
| 816 |
-
|
| 817 |
-
--
|
| 818 |
-
-- Name: chat_memory chat_memory_delete_own; Type: POLICY; Schema: public; Owner: -
|
| 819 |
-
--
|
| 820 |
-
|
| 821 |
-
CREATE POLICY chat_memory_delete_own ON public.chat_memory FOR DELETE USING ((user_id = auth.uid()));
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
--
|
| 825 |
-
-- Name: chat_memory chat_memory_insert_own; Type: POLICY; Schema: public; Owner: -
|
| 826 |
-
--
|
| 827 |
-
|
| 828 |
-
CREATE POLICY chat_memory_insert_own ON public.chat_memory FOR INSERT WITH CHECK ((user_id = auth.uid()));
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
--
|
| 832 |
-
-- Name: chat_memory chat_memory_select_own; Type: POLICY; Schema: public; Owner: -
|
| 833 |
-
--
|
| 834 |
-
|
| 835 |
-
CREATE POLICY chat_memory_select_own ON public.chat_memory FOR SELECT USING ((user_id = auth.uid()));
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
--
|
| 839 |
-
-- Name: chat_memory chat_memory_update_own; Type: POLICY; Schema: public; Owner: -
|
| 840 |
-
--
|
| 841 |
-
|
| 842 |
-
CREATE POLICY chat_memory_update_own ON public.chat_memory FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
--
|
| 846 |
-
-- Name: documents; Type: ROW SECURITY; Schema: public; Owner: -
|
| 847 |
-
--
|
| 848 |
-
|
| 849 |
-
ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
|
| 850 |
|
| 851 |
--
|
| 852 |
-
-- Name:
|
| 853 |
-
--
|
| 854 |
-
|
| 855 |
-
CREATE
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
|
|
|
| 877 |
|
| 878 |
|
| 879 |
--
|
| 880 |
-
-- Name:
|
| 881 |
--
|
| 882 |
|
| 883 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
|
| 885 |
--
|
| 886 |
-
-- Name:
|
| 887 |
--
|
| 888 |
|
| 889 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 890 |
|
| 891 |
|
| 892 |
--
|
| 893 |
-
-- Name:
|
| 894 |
--
|
| 895 |
|
| 896 |
-
CREATE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 897 |
|
| 898 |
|
| 899 |
--
|
| 900 |
-
-- Name:
|
| 901 |
--
|
| 902 |
|
| 903 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 904 |
|
| 905 |
|
| 906 |
--
|
| 907 |
-
-- Name:
|
| 908 |
--
|
| 909 |
|
| 910 |
-
CREATE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 911 |
|
| 912 |
|
| 913 |
--
|
| 914 |
-
-- Name:
|
| 915 |
--
|
| 916 |
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
CREATE POLICY evaluation_logs_select_own ON public.evaluation_logs FOR SELECT USING ((user_id = auth.uid()));
|
| 931 |
|
| 932 |
|
| 933 |
--
|
| 934 |
-
-- Name:
|
| 935 |
--
|
| 936 |
|
| 937 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 938 |
|
| 939 |
--
|
| 940 |
-
-- Name:
|
| 941 |
--
|
| 942 |
|
| 943 |
-
|
| 944 |
|
| 945 |
|
| 946 |
--
|
| 947 |
-
-- Name:
|
| 948 |
--
|
| 949 |
|
| 950 |
-
CREATE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 951 |
|
| 952 |
|
| 953 |
--
|
| 954 |
-
-- Name:
|
| 955 |
--
|
| 956 |
|
| 957 |
-
CREATE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
|
| 959 |
|
| 960 |
--
|
| 961 |
-
-- Name:
|
| 962 |
--
|
| 963 |
|
| 964 |
-
|
| 965 |
|
| 966 |
|
| 967 |
--
|
| 968 |
-
-- Name:
|
| 969 |
--
|
| 970 |
|
| 971 |
-
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
|
| 994 |
-
|
| 995 |
-
|
| 996 |
-
|
| 997 |
-
|
| 998 |
-
CREATE POLICY ingested_files_update_own ON public.ingested_files FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
--
|
| 1002 |
-
-- Name: intent_feedback; Type: ROW SECURITY; Schema: public; Owner: -
|
| 1003 |
-
--
|
| 1004 |
-
|
| 1005 |
-
ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY;
|
| 1006 |
-
|
| 1007 |
-
--
|
| 1008 |
-
-- Name: intent_feedback intent_feedback_insert_own; Type: POLICY; Schema: public; Owner: -
|
| 1009 |
-
--
|
| 1010 |
-
|
| 1011 |
-
CREATE POLICY intent_feedback_insert_own ON public.intent_feedback FOR INSERT WITH CHECK ((user_id = auth.uid()));
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
--
|
| 1015 |
-
-- Name: intent_feedback intent_feedback_select_own; Type: POLICY; Schema: public; Owner: -
|
| 1016 |
-
--
|
| 1017 |
-
|
| 1018 |
-
CREATE POLICY intent_feedback_select_own ON public.intent_feedback FOR SELECT USING ((user_id = auth.uid()));
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
--
|
| 1022 |
-
-- Name: rerank_feedback; Type: ROW SECURITY; Schema: public; Owner: -
|
| 1023 |
-
--
|
| 1024 |
-
|
| 1025 |
-
ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
|
| 1026 |
-
|
| 1027 |
-
--
|
| 1028 |
-
-- Name: rerank_feedback rerank_feedback_select_own; Type: POLICY; Schema: public; Owner: -
|
| 1029 |
-
--
|
| 1030 |
-
|
| 1031 |
-
CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
--
|
| 1035 |
-
--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1036 |
--
|
| 1037 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
--
|
| 2 |
-- PostgreSQL database dump
|
| 3 |
--
|
| 4 |
+
|
| 5 |
+
\restrict 32urOXpOnsQS0zoo7jGTkIs0BeRgGPyJVLWPDJ6IexS9GSsM4lpkxJaAg6FM0Ua
|
| 6 |
+
|
| 7 |
+
-- Dumped from database version 17.6
|
| 8 |
+
-- Dumped by pg_dump version 18.3
|
| 9 |
+
|
| 10 |
+
SET statement_timeout = 0;
|
| 11 |
+
SET lock_timeout = 0;
|
| 12 |
+
SET idle_in_transaction_session_timeout = 0;
|
| 13 |
+
SET transaction_timeout = 0;
|
| 14 |
+
SET client_encoding = 'UTF8';
|
| 15 |
+
SET standard_conforming_strings = on;
|
| 16 |
+
SELECT pg_catalog.set_config('search_path', '', false);
|
| 17 |
+
SET check_function_bodies = false;
|
| 18 |
+
SET xmloption = content;
|
| 19 |
+
SET client_min_messages = warning;
|
| 20 |
+
SET row_security = off;
|
| 21 |
+
|
| 22 |
+
--
|
| 23 |
+
-- Name: public; Type: SCHEMA; Schema: -; Owner: -
|
| 24 |
+
--
|
| 25 |
+
|
| 26 |
+
CREATE SCHEMA public;
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
--
|
| 30 |
+
-- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: -
|
| 31 |
+
--
|
| 32 |
+
|
| 33 |
+
COMMENT ON SCHEMA public IS 'standard public schema';
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
--
|
| 37 |
+
-- Name: _trg_set_updated_at(); Type: FUNCTION; Schema: public; Owner: -
|
| 38 |
+
--
|
| 39 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
CREATE FUNCTION public._trg_set_updated_at() RETURNS trigger
|
| 41 |
LANGUAGE plpgsql
|
| 42 |
+
SET search_path TO ''
|
| 43 |
+
AS $$
|
| 44 |
+
begin
|
| 45 |
new.updated_at = pg_catalog.now();
|
| 46 |
+
return new;
|
| 47 |
+
end;
|
| 48 |
+
$$;
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
--
|
| 52 |
+
-- Name: get_document_types(); Type: FUNCTION; Schema: public; Owner: -
|
| 53 |
+
--
|
| 54 |
+
|
| 55 |
CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
|
| 56 |
LANGUAGE sql STABLE
|
| 57 |
+
SET search_path TO ''
|
| 58 |
AS $$
|
| 59 |
select distinct f.document_type
|
| 60 |
from public.ingested_files as f
|
|
|
|
| 62 |
and f.document_type is not null
|
| 63 |
and f.document_type <> 'unknown'
|
| 64 |
order by f.document_type;
|
| 65 |
+
$$;
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
--
|
| 69 |
-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
|
| 70 |
+
--
|
| 71 |
+
|
| 72 |
CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
|
| 73 |
LANGUAGE plpgsql
|
| 74 |
+
SET search_path TO ''
|
| 75 |
AS $$
|
| 76 |
+
begin
|
| 77 |
+
return query
|
| 78 |
+
with
|
| 79 |
+
semantic as (
|
| 80 |
+
select
|
| 81 |
+
d.id,
|
| 82 |
+
d.content,
|
| 83 |
+
d.metadata,
|
| 84 |
(
|
| 85 |
1 - (
|
| 86 |
d.embedding::extensions.halfvec(2048)
|
|
|
|
| 89 |
)
|
| 90 |
)::float as score
|
| 91 |
from public.documents d
|
| 92 |
+
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 93 |
order by d.embedding::extensions.halfvec(2048)
|
| 94 |
OPERATOR(extensions.<=>)
|
| 95 |
query_embedding::extensions.halfvec(2048)
|
| 96 |
+
limit match_count * 3
|
| 97 |
+
),
|
| 98 |
+
keyword as (
|
| 99 |
+
select
|
| 100 |
+
d.id,
|
| 101 |
+
d.content,
|
| 102 |
+
d.metadata,
|
| 103 |
pg_catalog.ts_rank(
|
| 104 |
pg_catalog.to_tsvector('english', d.content),
|
| 105 |
pg_catalog.plainto_tsquery('english', query_text)
|
| 106 |
+
)::float as raw_score
|
| 107 |
from public.documents d
|
| 108 |
+
where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
|
| 109 |
and pg_catalog.to_tsvector('english', d.content) @@ pg_catalog.plainto_tsquery('english', query_text)
|
| 110 |
+
order by raw_score desc
|
| 111 |
+
limit match_count * 3
|
| 112 |
+
),
|
| 113 |
+
keyword_norm as (
|
| 114 |
+
select
|
| 115 |
+
k.id,
|
| 116 |
+
k.content,
|
| 117 |
+
k.metadata,
|
| 118 |
+
case
|
| 119 |
+
when max(k.raw_score) over () = 0 then 0::float
|
| 120 |
+
else (k.raw_score / max(k.raw_score) over ())::float
|
| 121 |
+
end as score
|
| 122 |
+
from keyword k
|
| 123 |
+
),
|
| 124 |
+
blended as (
|
| 125 |
+
select
|
| 126 |
+
coalesce(s.id, kn.id) as id,
|
| 127 |
+
coalesce(s.content, kn.content) as content,
|
| 128 |
+
coalesce(s.metadata, kn.metadata) as metadata,
|
| 129 |
+
(
|
| 130 |
+
coalesce(s.score, 0::float) * semantic_weight +
|
| 131 |
+
coalesce(kn.score, 0::float) * keyword_weight
|
| 132 |
+
) as combined_score
|
| 133 |
+
from semantic s
|
| 134 |
+
full outer join keyword_norm kn on s.id = kn.id
|
| 135 |
+
)
|
| 136 |
+
select
|
| 137 |
+
b.id,
|
| 138 |
+
b.content,
|
| 139 |
+
b.metadata,
|
| 140 |
+
b.combined_score
|
| 141 |
+
from blended b
|
| 142 |
+
order by b.combined_score desc
|
| 143 |
+
limit match_count;
|
| 144 |
+
end;
|
| 145 |
+
$$;
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
--
|
| 149 |
+
-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision, uuid); Type: FUNCTION; Schema: public; Owner: -
|
| 150 |
+
--
|
| 151 |
+
|
| 152 |
+
CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
|
| 153 |
+
LANGUAGE plpgsql
|
| 154 |
+
SET search_path TO ''
|
| 155 |
+
AS $$
|
| 156 |
+
BEGIN
|
| 157 |
+
RETURN QUERY
|
| 158 |
+
WITH
|
| 159 |
+
semantic AS (
|
| 160 |
+
SELECT
|
| 161 |
+
d.id,
|
| 162 |
+
d.content,
|
| 163 |
+
d.metadata,
|
| 164 |
+
(
|
| 165 |
+
1 - (
|
| 166 |
+
d.embedding::extensions.halfvec(2048)
|
| 167 |
+
OPERATOR(extensions.<=>)
|
| 168 |
+
query_embedding::extensions.halfvec(2048)
|
| 169 |
+
)
|
| 170 |
+
)::float AS score
|
| 171 |
+
FROM public.documents AS d
|
| 172 |
+
WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
|
| 173 |
+
AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
|
| 174 |
+
ORDER BY d.embedding::extensions.halfvec(2048)
|
| 175 |
+
OPERATOR(extensions.<=>)
|
| 176 |
+
query_embedding::extensions.halfvec(2048)
|
| 177 |
+
LIMIT match_count * 3
|
| 178 |
+
),
|
| 179 |
+
keyword AS (
|
| 180 |
+
SELECT
|
| 181 |
+
d.id,
|
| 182 |
+
d.content,
|
| 183 |
+
d.metadata,
|
| 184 |
+
pg_catalog.ts_rank(
|
| 185 |
+
pg_catalog.to_tsvector('english', d.content),
|
| 186 |
+
pg_catalog.plainto_tsquery('english', query_text)
|
| 187 |
+
)::float AS raw_score
|
| 188 |
+
FROM public.documents AS d
|
| 189 |
+
WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
|
| 190 |
+
AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
|
| 191 |
+
AND pg_catalog.to_tsvector('english', d.content)
|
| 192 |
+
@@ pg_catalog.plainto_tsquery('english', query_text)
|
| 193 |
+
ORDER BY raw_score DESC
|
| 194 |
+
LIMIT match_count * 3
|
| 195 |
+
),
|
| 196 |
+
keyword_norm AS (
|
| 197 |
+
SELECT
|
| 198 |
+
k.id,
|
| 199 |
+
k.content,
|
| 200 |
+
k.metadata,
|
| 201 |
+
CASE
|
| 202 |
+
WHEN max(k.raw_score) OVER () = 0 THEN 0::float
|
| 203 |
+
ELSE (k.raw_score / max(k.raw_score) OVER ())::float
|
| 204 |
+
END AS score
|
| 205 |
+
FROM keyword AS k
|
| 206 |
+
),
|
| 207 |
+
blended AS (
|
| 208 |
+
SELECT
|
| 209 |
+
COALESCE(s.id, kn.id) AS id,
|
| 210 |
+
COALESCE(s.content, kn.content) AS content,
|
| 211 |
+
COALESCE(s.metadata, kn.metadata) AS metadata,
|
| 212 |
+
(
|
| 213 |
+
COALESCE(s.score, 0::float) * semantic_weight +
|
| 214 |
+
COALESCE(kn.score, 0::float) * keyword_weight
|
| 215 |
+
) AS combined_score
|
| 216 |
+
FROM semantic AS s
|
| 217 |
+
FULL OUTER JOIN keyword_norm AS kn ON s.id = kn.id
|
| 218 |
+
)
|
| 219 |
+
SELECT
|
| 220 |
+
b.id,
|
| 221 |
+
b.content,
|
| 222 |
+
b.metadata,
|
| 223 |
+
b.combined_score
|
| 224 |
+
FROM blended AS b
|
| 225 |
+
ORDER BY b.combined_score DESC
|
| 226 |
+
LIMIT match_count;
|
| 227 |
+
END;
|
| 228 |
+
$$;
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
--
|
| 232 |
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
|
| 233 |
+
--
|
| 234 |
+
|
| 235 |
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
|
| 236 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 237 |
+
SET search_path TO ''
|
| 238 |
AS $$
|
| 239 |
+
BEGIN
|
| 240 |
INSERT INTO public.documents (id, content, metadata, embedding, user_id)
|
| 241 |
+
VALUES (p_id, p_content, p_metadata, p_embedding, p_user_id)
|
| 242 |
+
ON CONFLICT (id) DO UPDATE
|
| 243 |
+
SET content = EXCLUDED.content,
|
| 244 |
+
metadata = EXCLUDED.metadata,
|
| 245 |
+
embedding = EXCLUDED.embedding;
|
| 246 |
+
END;
|
| 247 |
+
$$;
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
--
|
| 251 |
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
|
| 252 |
+
--
|
| 253 |
+
|
| 254 |
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
|
| 255 |
LANGUAGE plpgsql SECURITY DEFINER
|
| 256 |
+
SET search_path TO ''
|
| 257 |
AS $$
|
| 258 |
+
BEGIN
|
| 259 |
INSERT INTO public.documents (
|
| 260 |
+
id, content, metadata, embedding, user_id,
|
| 261 |
+
node_type, parent_node_id, node_level
|
| 262 |
+
)
|
| 263 |
+
VALUES (
|
| 264 |
+
p_id, p_content, p_metadata, p_embedding, p_user_id,
|
| 265 |
+
p_node_type, p_parent_node_id, p_node_level
|
| 266 |
+
)
|
| 267 |
+
ON CONFLICT (id) DO UPDATE
|
| 268 |
+
SET content = EXCLUDED.content,
|
| 269 |
+
metadata = EXCLUDED.metadata,
|
| 270 |
+
embedding = EXCLUDED.embedding,
|
| 271 |
+
node_type = EXCLUDED.node_type,
|
| 272 |
+
parent_node_id = EXCLUDED.parent_node_id,
|
| 273 |
+
node_level = EXCLUDED.node_level;
|
| 274 |
+
END;
|
| 275 |
+
$$;
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
--
|
| 279 |
+
-- Name: insert_document_chunks_batch(jsonb); Type: FUNCTION; Schema: public; Owner: -
|
| 280 |
+
--
|
| 281 |
+
|
| 282 |
+
CREATE FUNCTION public.insert_document_chunks_batch(p_rows jsonb) RETURNS void
|
| 283 |
+
LANGUAGE plpgsql SECURITY DEFINER
|
| 284 |
+
SET search_path TO ''
|
| 285 |
+
AS $$
|
| 286 |
+
BEGIN
|
| 287 |
+
IF p_rows IS NULL OR jsonb_typeof(p_rows) <> 'array' THEN
|
| 288 |
+
RETURN;
|
| 289 |
+
END IF;
|
| 290 |
+
|
| 291 |
+
INSERT INTO public.documents (
|
| 292 |
+
id,
|
| 293 |
+
content,
|
| 294 |
+
metadata,
|
| 295 |
+
embedding,
|
| 296 |
+
user_id,
|
| 297 |
+
node_type,
|
| 298 |
+
parent_node_id,
|
| 299 |
+
node_level
|
| 300 |
+
)
|
| 301 |
+
SELECT
|
| 302 |
+
(row->>'id')::uuid,
|
| 303 |
+
row->>'content',
|
| 304 |
+
COALESCE(row->'metadata', '{}'::jsonb),
|
| 305 |
+
(row->'embedding')::text::extensions.vector,
|
| 306 |
+
(row->>'user_id')::uuid,
|
| 307 |
+
COALESCE(NULLIF(row->>'node_type', ''), 'leaf'),
|
| 308 |
+
NULLIF(row->>'parent_node_id', '')::uuid,
|
| 309 |
+
COALESCE(NULLIF(row->>'node_level', '')::integer, 0)
|
| 310 |
+
FROM jsonb_array_elements(p_rows) AS row
|
| 311 |
+
ON CONFLICT (id) DO UPDATE
|
| 312 |
+
SET content = EXCLUDED.content,
|
| 313 |
+
metadata = EXCLUDED.metadata,
|
| 314 |
+
embedding = EXCLUDED.embedding,
|
| 315 |
+
user_id = EXCLUDED.user_id,
|
| 316 |
+
node_type = EXCLUDED.node_type,
|
| 317 |
+
parent_node_id = EXCLUDED.parent_node_id,
|
| 318 |
+
node_level = EXCLUDED.node_level;
|
| 319 |
+
END;
|
| 320 |
+
$$;
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
--
|
| 324 |
-- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
|
| 325 |
+
--
|
| 326 |
+
|
| 327 |
--
-- match_documents (legacy 3-argument overload): returns the match_count
-- nearest document chunks to query_embedding, similarity = 1 - cosine
-- distance (pgvector <=>). No user scoping — the 4-argument overload
-- below adds p_user_id filtering.
--
CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
    LANGUAGE plpgsql
    SET search_path TO ''
    AS $$
begin
  return query
  select
    d.id,
    d.content,
    d.metadata,
    -- Both sides are cast to halfvec(2048), the same expression used by
    -- documents_embedding_hnsw_idx, so ordering can use that index.
    (
      1 - (
        d.embedding::extensions.halfvec(2048)
        OPERATOR(extensions.<=>)
        query_embedding::extensions.halfvec(2048)
      )
    )::float as similarity
  from public.documents d
  -- Empty filter matches everything; otherwise require metadata to
  -- contain the filter object (jsonb containment @>).
  where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
  order by d.embedding::extensions.halfvec(2048)
    OPERATOR(extensions.<=>)
    query_embedding::extensions.halfvec(2048)
  limit match_count;
end;
$$;
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
--
|
| 355 |
+
-- Name: match_documents(extensions.vector, integer, jsonb, uuid); Type: FUNCTION; Schema: public; Owner: -
|
| 356 |
+
--
|
| 357 |
+
|
| 358 |
+
--
-- match_documents (4-argument overload): user-scoped variant of the
-- vector search above. p_user_id = NULL keeps the legacy unscoped
-- behavior; otherwise only that user's rows are searched.
--
CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
    LANGUAGE plpgsql
    SET search_path TO ''
    AS $$
BEGIN
  RETURN QUERY
  SELECT
    d.id,
    d.content,
    d.metadata,
    -- similarity = 1 - cosine distance on the halfvec(2048) cast used by
    -- documents_embedding_hnsw_idx.
    (
      1 - (
        d.embedding::extensions.halfvec(2048)
        OPERATOR(extensions.<=>)
        query_embedding::extensions.halfvec(2048)
      )
    )::float AS similarity
  FROM public.documents AS d
  -- NULL p_user_id disables the per-user filter (backward compatible).
  WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
    AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
  ORDER BY d.embedding::extensions.halfvec(2048)
    OPERATOR(extensions.<=>)
    query_embedding::extensions.halfvec(2048)
  LIMIT match_count;
END;
$$;
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
--
|
| 387 |
-- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
|
| 388 |
+
--
|
| 389 |
+
|
| 390 |
--
-- match_memory (legacy 3-argument overload): returns the match_count chat
-- messages in the given session nearest to query_embedding
-- (similarity = 1 - cosine distance, full-precision vector — no halfvec
-- cast here, unlike match_documents).
--
CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
    LANGUAGE plpgsql
    SET search_path TO ''
    AS $$
BEGIN
  RETURN QUERY
  SELECT
    cm.id,
    cm.role,
    cm.content,
    1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
  FROM public.chat_memory AS cm
  WHERE cm.session_id = match_session_id
  ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
  LIMIT match_count;
END;
$$;
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
--
|
| 410 |
+
-- Name: match_memory(extensions.vector, text, integer, uuid); Type: FUNCTION; Schema: public; Owner: -
|
| 411 |
+
--
|
| 412 |
+
|
| 413 |
+
--
-- match_memory (4-argument overload): same semantic recall over
-- chat_memory, additionally scoped to one user. p_user_id = NULL keeps
-- the legacy unscoped behavior.
--
CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
    LANGUAGE plpgsql
    SET search_path TO ''
    AS $$
BEGIN
  RETURN QUERY
  SELECT
    cm.id,
    cm.role,
    cm.content,
    1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
  FROM public.chat_memory AS cm
  WHERE cm.session_id = match_session_id
    -- NULL p_user_id disables the per-user filter (backward compatible).
    AND (p_user_id IS NULL OR cm.user_id = p_user_id)
  ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
  LIMIT match_count;
END;
$$;
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
--
|
| 434 |
+
-- Name: rls_auto_enable(); Type: FUNCTION; Schema: public; Owner: -
|
| 435 |
+
--
|
| 436 |
+
|
| 437 |
+
--
-- rls_auto_enable: event-trigger function that enables row level security
-- on every newly created table in the public schema. Failures are logged
-- (RAISE LOG) but never propagated, so DDL is never blocked.
--
CREATE FUNCTION public.rls_auto_enable() RETURNS event_trigger
    LANGUAGE plpgsql SECURITY DEFINER
    SET search_path TO 'pg_catalog'
    AS $$
DECLARE
  cmd record;
BEGIN
  FOR cmd IN
    SELECT *
    FROM pg_event_trigger_ddl_commands()
    WHERE command_tag IN ('CREATE TABLE', 'CREATE TABLE AS', 'SELECT INTO')
      AND object_type IN ('table','partitioned table')
  LOOP
    -- NOTE(review): the IN ('public') test already excludes system
    -- schemas, so the NOT IN / NOT LIKE clauses below are redundant
    -- belt-and-braces — harmless, but confirm before simplifying.
    IF cmd.schema_name IS NOT NULL AND cmd.schema_name IN ('public') AND cmd.schema_name NOT IN ('pg_catalog','information_schema') AND cmd.schema_name NOT LIKE 'pg_toast%' AND cmd.schema_name NOT LIKE 'pg_temp%' THEN
      BEGIN
        EXECUTE format('alter table if exists %s enable row level security', cmd.object_identity);
        RAISE LOG 'rls_auto_enable: enabled RLS on %', cmd.object_identity;
      EXCEPTION
        WHEN OTHERS THEN
          -- Best-effort: log and continue; never abort the user's DDL.
          RAISE LOG 'rls_auto_enable: failed to enable RLS on %', cmd.object_identity;
      END;
    ELSE
      RAISE LOG 'rls_auto_enable: skip % (either system schema or not in enforced list: %.)', cmd.object_identity, cmd.schema_name;
    END IF;
  END LOOP;
END;
$$;
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
SET default_tablespace = '';

SET default_table_access_method = heap;


--
-- Name: answer_feedback; Type: TABLE; Schema: public; Owner: -
--

-- End-user feedback on a generated answer, keyed by the query trace it
-- rates; review_state / reviewed_* / promoted_at drive the admin review
-- and eval-promotion workflow.
CREATE TABLE public.answer_feedback (
    id bigint NOT NULL,
    trace_id uuid NOT NULL,
    user_id uuid,
    helpful boolean,
    accepted boolean,
    reason_code text,
    correction_text text,
    promote_to_eval boolean DEFAULT false NOT NULL,
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
    review_state text DEFAULT 'pending'::text NOT NULL,
    review_notes text,
    reviewed_at timestamp with time zone,
    reviewed_by text,
    promoted_at timestamp with time zone
);


--
-- Name: answer_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

ALTER TABLE public.answer_feedback ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
    SEQUENCE NAME public.answer_feedback_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1
);


--
-- Name: category_centroids; Type: TABLE; Schema: public; Owner: -
--

-- Per-user centroid embedding for each document_type, stored as a plain
-- float8 array (not a pgvector column); document_count supports
-- incremental centroid updates.
CREATE TABLE public.category_centroids (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    document_type text NOT NULL,
    centroid_vector double precision[] NOT NULL,
    document_count integer DEFAULT 1,
    created_at timestamp with time zone DEFAULT now(),
    updated_at timestamp with time zone DEFAULT now(),
    user_id uuid DEFAULT auth.uid()
);


--
-- Name: chat_memory; Type: TABLE; Schema: public; Owner: -
--

-- Conversation memory: one row per chat message, with a 2048-dim
-- embedding searched by match_memory().
CREATE TABLE public.chat_memory (
    id uuid DEFAULT extensions.uuid_generate_v4() NOT NULL,
    session_id text NOT NULL,
    role text NOT NULL,
    content text NOT NULL,
    embedding extensions.vector(2048),
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
    user_id uuid DEFAULT auth.uid()
);


--
-- Name: document_trees; Type: TABLE; Schema: public; Owner: -
--

-- Cached document tree JSON per user + file, addressed by content hash.
CREATE TABLE public.document_trees (
    file_hash text NOT NULL,
    user_id uuid NOT NULL,
    tree_json jsonb NOT NULL,
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now())
);


--
-- Name: documents; Type: TABLE; Schema: public; Owner: -
--

-- Main chunk store for retrieval. node_type (default 'leaf'),
-- parent_node_id and node_level link chunks into a per-document
-- hierarchy; embedding is a 2048-dim vector searched by
-- match_documents().
CREATE TABLE public.documents (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    content text,
    metadata jsonb,
    embedding extensions.vector(2048),
    user_id uuid DEFAULT auth.uid(),
    node_type text DEFAULT 'leaf'::text,
    parent_node_id uuid,
    node_level integer DEFAULT 0
);
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
--
|
| 565 |
+
-- Name: evaluation_datasets; Type: TABLE; Schema: public; Owner: -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
--
|
| 567 |
|
| 568 |
+
-- Curated evaluation set: questions (optionally promoted from a query
-- trace) with gold context references and diagnostics; is_active
-- (default false) gates which rows are in the live eval set.
CREATE TABLE public.evaluation_datasets (
    id bigint NOT NULL,
    trace_id uuid,
    source text DEFAULT 'feedback_trace'::text NOT NULL,
    question text NOT NULL,
    gold_context_refs jsonb DEFAULT '[]'::jsonb NOT NULL,
    gold_evidence_text text,
    is_answerable boolean DEFAULT true NOT NULL,
    failure_modes jsonb DEFAULT '[]'::jsonb NOT NULL,
    doc_diagnostics jsonb DEFAULT '[]'::jsonb NOT NULL,
    reason_code text,
    is_active boolean DEFAULT false NOT NULL,
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
);


--
-- Name: evaluation_datasets_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

ALTER TABLE public.evaluation_datasets ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
    SEQUENCE NAME public.evaluation_datasets_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1
);
|
| 596 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
|
| 598 |
--
-- Name: evaluation_logs; Type: TABLE; Schema: public; Owner: -
--

-- One row per evaluated question in an eval run (run_label groups a
-- run): retrieval metrics (precision_at_k, faithfulness/relevance
-- proxies), two LLM-judge verdicts with disagreement flagging, and the
-- combined final_score.
CREATE TABLE public.evaluation_logs (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    run_label text,
    evaluated_at timestamp with time zone,
    alpha double precision,
    k integer,
    question text,
    is_answerable boolean,
    precision_at_k double precision,
    faithfulness_proxy double precision,
    relevance_proxy double precision,
    local_reward double precision,
    llm_judge_score double precision,
    judge_a_verdict boolean,
    judge_b_verdict boolean,
    judge_a_model text,
    judge_b_model text,
    calibration_score double precision,
    final_score double precision,
    requires_manual_review boolean DEFAULT false,
    disagreement_note text DEFAULT ''::text,
    user_id uuid
);
|
| 625 |
|
| 626 |
|
| 627 |
--
-- Name: graph_edges; Type: TABLE; Schema: public; Owner: -
--

-- Per-user graph edges. Endpoints are referenced by the nodes' text
-- node_key (not FK ids); weight and payload carry edge attributes.
CREATE TABLE public.graph_edges (
    id bigint NOT NULL,
    user_id uuid,
    source_node_key text NOT NULL,
    target_node_key text NOT NULL,
    edge_type text NOT NULL,
    weight double precision DEFAULT 1.0 NOT NULL,
    payload jsonb DEFAULT '{}'::jsonb NOT NULL,
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
);


--
-- Name: graph_edges_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

ALTER TABLE public.graph_edges ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
    SEQUENCE NAME public.graph_edges_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1
);


--
-- Name: graph_nodes; Type: TABLE; Schema: public; Owner: -
--

-- Per-user graph nodes; node_key is the stable text identifier that
-- graph_edges reference, label is the display name.
CREATE TABLE public.graph_nodes (
    id bigint NOT NULL,
    user_id uuid,
    node_key text NOT NULL,
    node_type text NOT NULL,
    label text NOT NULL,
    payload jsonb DEFAULT '{}'::jsonb NOT NULL,
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
);


--
-- Name: graph_nodes_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

ALTER TABLE public.graph_nodes ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
    SEQUENCE NAME public.graph_nodes_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1
);
|
| 684 |
|
| 685 |
|
| 686 |
--
-- Name: ingested_files; Type: TABLE; Schema: public; Owner: -
--

-- One row per ingested upload, identified per user by content hash.
-- user_overridden records a manual document_type override;
-- identity_json holds per-file identity metadata (JSON).
CREATE TABLE public.ingested_files (
    id uuid DEFAULT gen_random_uuid() NOT NULL,
    file_hash text NOT NULL,
    filename text NOT NULL,
    document_type text,
    chunk_count integer DEFAULT 0,
    ingested_at timestamp with time zone DEFAULT now(),
    user_id uuid DEFAULT auth.uid(),
    user_overridden boolean DEFAULT false,
    identity_json jsonb DEFAULT '{}'::jsonb NOT NULL
);


--
-- Name: ingestion_retry_logs; Type: TABLE; Schema: public; Owner: -
--

-- Audit log of per-batch retry events during ingestion (batch_num of
-- total_batches, attempt number, backoff duration sleep_s).
CREATE TABLE public.ingestion_retry_logs (
    id bigint NOT NULL,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    user_id uuid,
    file_hash text,
    batch_num integer NOT NULL,
    total_batches integer NOT NULL,
    attempt integer NOT NULL,
    event_type text NOT NULL,
    message text,
    sleep_s double precision DEFAULT 0
);
|
|
|
|
|
|
|
| 719 |
|
| 720 |
|
| 721 |
--
-- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

-- Plain sequence (not an identity column) backing ingestion_retry_logs.id;
-- it is attached as the column default in a later ALTER statement.
CREATE SEQUENCE public.ingestion_retry_logs_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;


--
-- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.ingestion_retry_logs_id_seq OWNED BY public.ingestion_retry_logs.id;
|
| 738 |
|
| 739 |
|
| 740 |
--
-- Name: intent_feedback; Type: TABLE; Schema: public; Owner: -
--

-- Labelled intent examples: the query, two boolean context features, and
-- a binary label (CHECK constrains it to 0 or 1).
CREATE TABLE public.intent_feedback (
    id bigint NOT NULL,
    user_id uuid,
    query text NOT NULL,
    has_category boolean DEFAULT false NOT NULL,
    has_history boolean DEFAULT false NOT NULL,
    label integer NOT NULL,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    CONSTRAINT intent_feedback_label_check CHECK ((label = ANY (ARRAY[0, 1])))
);


--
-- Name: intent_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.intent_feedback_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;


--
-- Name: intent_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.intent_feedback_id_seq OWNED BY public.intent_feedback.id;
|
| 773 |
|
| 774 |
|
| 775 |
--
-- Name: query_traces; Type: TABLE; Schema: public; Owner: -
--

-- One row per answered query: routing decisions (route_mode, selected
-- experts/weights, pinned files), retrieval diagnostics (candidate
-- counts, selected chunk ids, failure modes, quality metrics), the
-- answer digest/preview and latency, plus the admin review /
-- eval-promotion workflow columns.
CREATE TABLE public.query_traces (
    trace_id uuid DEFAULT gen_random_uuid() NOT NULL,
    user_id uuid,
    session_id text DEFAULT 'default_session'::text NOT NULL,
    question text NOT NULL,
    route_mode text DEFAULT 'default'::text NOT NULL,
    selected_experts jsonb DEFAULT '[]'::jsonb NOT NULL,
    expert_weights jsonb DEFAULT '{}'::jsonb NOT NULL,
    pinned_file_hashes jsonb DEFAULT '[]'::jsonb NOT NULL,
    candidate_counts jsonb DEFAULT '{}'::jsonb NOT NULL,
    selected_chunk_ids jsonb DEFAULT '[]'::jsonb NOT NULL,
    doc_diagnostics jsonb DEFAULT '[]'::jsonb NOT NULL,
    failure_modes jsonb DEFAULT '[]'::jsonb NOT NULL,
    quality_metrics jsonb DEFAULT '{}'::jsonb NOT NULL,
    answer_hash text,
    answer_preview text,
    latency_ms integer,
    created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
    review_state text DEFAULT 'pending'::text NOT NULL,
    review_notes text,
    reviewed_at timestamp with time zone,
    reviewed_by text,
    promoted_to_eval boolean DEFAULT false NOT NULL,
    document_types jsonb DEFAULT '[]'::jsonb NOT NULL
);
|
| 804 |
+
|
| 805 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
--
-- Name: rerank_feedback; Type: TABLE; Schema: public; Owner: -
--

-- Reranker feedback signal: per query/chunk pair, the Cohere rerank
-- score and whether the chunk made the final context (was_selected).
CREATE TABLE public.rerank_feedback (
    id bigint NOT NULL,
    user_id uuid,
    query_hash text NOT NULL,
    chunk_id uuid,
    chunk_hash text NOT NULL,
    document_type text,
    cohere_score real NOT NULL,
    was_selected boolean NOT NULL,
    created_at timestamp with time zone DEFAULT now() NOT NULL,
    query_text text,
    chunk_text text
);


--
-- Name: rerank_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--

CREATE SEQUENCE public.rerank_feedback_id_seq
    START WITH 1
    INCREMENT BY 1
    NO MINVALUE
    NO MAXVALUE
    CACHE 1;


--
-- Name: rerank_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--

ALTER SEQUENCE public.rerank_feedback_id_seq OWNED BY public.rerank_feedback.id;


--
-- Name: ingestion_retry_logs id; Type: DEFAULT; Schema: public; Owner: -
--

-- Attach the plain sequences created above as the id column defaults.
ALTER TABLE ONLY public.ingestion_retry_logs ALTER COLUMN id SET DEFAULT nextval('public.ingestion_retry_logs_id_seq'::regclass);


--
-- Name: intent_feedback id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.intent_feedback ALTER COLUMN id SET DEFAULT nextval('public.intent_feedback_id_seq'::regclass);


--
-- Name: rerank_feedback id; Type: DEFAULT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.rerank_feedback ALTER COLUMN id SET DEFAULT nextval('public.rerank_feedback_id_seq'::regclass);
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
--
-- Constraints. Per-user uniqueness rules: document_trees and
-- ingested_files dedupe by (user_id, file_hash); graph_nodes by
-- (user_id, node_key); graph_edges by (user_id, source, target, type).
-- NOTE(review): category_centroids keeps a GLOBAL UNIQUE (document_type)
-- alongside the per-user unique index defined later — this looks like it
-- would block two different users from sharing a document_type; confirm
-- whether it is a legacy leftover.
--

--
-- Name: answer_feedback answer_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.answer_feedback
    ADD CONSTRAINT answer_feedback_pkey PRIMARY KEY (id);


--
-- Name: category_centroids category_centroids_document_type_key; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.category_centroids
    ADD CONSTRAINT category_centroids_document_type_key UNIQUE (document_type);


--
-- Name: category_centroids category_centroids_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.category_centroids
    ADD CONSTRAINT category_centroids_pkey PRIMARY KEY (id);


--
-- Name: chat_memory chat_memory_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.chat_memory
    ADD CONSTRAINT chat_memory_pkey PRIMARY KEY (id);


--
-- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.document_trees
    ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);


--
-- Name: documents documents_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.documents
    ADD CONSTRAINT documents_pkey PRIMARY KEY (id);


--
-- Name: evaluation_datasets evaluation_datasets_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evaluation_datasets
    ADD CONSTRAINT evaluation_datasets_pkey PRIMARY KEY (id);


--
-- Name: evaluation_datasets evaluation_datasets_trace_id_key; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evaluation_datasets
    ADD CONSTRAINT evaluation_datasets_trace_id_key UNIQUE (trace_id);


--
-- Name: evaluation_logs evaluation_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.evaluation_logs
    ADD CONSTRAINT evaluation_logs_pkey PRIMARY KEY (id);


--
-- Name: graph_edges graph_edges_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.graph_edges
    ADD CONSTRAINT graph_edges_pkey PRIMARY KEY (id);


--
-- Name: graph_edges graph_edges_user_id_source_node_key_target_node_key_edge_ty_key; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.graph_edges
    ADD CONSTRAINT graph_edges_user_id_source_node_key_target_node_key_edge_ty_key UNIQUE (user_id, source_node_key, target_node_key, edge_type);


--
-- Name: graph_nodes graph_nodes_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.graph_nodes
    ADD CONSTRAINT graph_nodes_pkey PRIMARY KEY (id);


--
-- Name: graph_nodes graph_nodes_user_id_node_key_key; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.graph_nodes
    ADD CONSTRAINT graph_nodes_user_id_node_key_key UNIQUE (user_id, node_key);


--
-- Name: ingested_files ingested_files_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.ingested_files
    ADD CONSTRAINT ingested_files_pkey PRIMARY KEY (id);


--
-- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.ingested_files
    ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);


--
-- Name: ingestion_retry_logs ingestion_retry_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.ingestion_retry_logs
    ADD CONSTRAINT ingestion_retry_logs_pkey PRIMARY KEY (id);


--
-- Name: intent_feedback intent_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.intent_feedback
    ADD CONSTRAINT intent_feedback_pkey PRIMARY KEY (id);


--
-- Name: query_traces query_traces_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.query_traces
    ADD CONSTRAINT query_traces_pkey PRIMARY KEY (trace_id);


--
-- Name: rerank_feedback rerank_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
--

ALTER TABLE ONLY public.rerank_feedback
    ADD CONSTRAINT rerank_feedback_pkey PRIMARY KEY (id);
|
| 1015 |
+
|
| 1016 |
+
|
| 1017 |
+
--
-- Indexes. Retrieval paths: documents gets HNSW (vector), GIN full-text
-- (to_tsvector 'english' over content), GIN on metadata, plus a btree on
-- metadata->>'file_hash'; most other tables get btree lookups scoped by
-- user_id.
--

--
-- Name: category_centroids_type_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX category_centroids_type_idx ON public.category_centroids USING btree (document_type);


--
-- Name: category_centroids_user_id_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX category_centroids_user_id_idx ON public.category_centroids USING btree (user_id);


--
-- Name: category_centroids_user_type_uidx; Type: INDEX; Schema: public; Owner: -
--

CREATE UNIQUE INDEX category_centroids_user_type_uidx ON public.category_centroids USING btree (user_id, document_type);


--
-- Name: chat_memory_user_id_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX chat_memory_user_id_idx ON public.chat_memory USING btree (user_id);


--
-- Name: doc_node_type_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX doc_node_type_idx ON public.documents USING btree (node_type);


--
-- Name: documents_content_fts_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvector('english'::regconfig, content));


--
-- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
--

-- HNSW over the halfvec(2048) cast — must match the expression used in
-- match_documents() for the planner to use it.
CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');


--
-- Name: documents_metadata_filehash_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX documents_metadata_filehash_idx ON public.documents USING btree (((metadata ->> 'file_hash'::text)));


--
-- Name: documents_metadata_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX documents_metadata_idx ON public.documents USING gin (metadata);


--
-- Name: documents_user_id_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX documents_user_id_idx ON public.documents USING btree (user_id);


--
-- Name: evaluation_logs_evaluated_at_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX evaluation_logs_evaluated_at_idx ON public.evaluation_logs USING btree (evaluated_at DESC);


--
-- Name: evaluation_logs_run_label_idx; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX evaluation_logs_run_label_idx ON public.evaluation_logs USING btree (run_label);


--
-- Name: idx_answer_feedback_review_state_created; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_answer_feedback_review_state_created ON public.answer_feedback USING btree (review_state, created_at DESC);


--
-- Name: idx_answer_feedback_trace_created; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_answer_feedback_trace_created ON public.answer_feedback USING btree (trace_id, created_at DESC);


--
-- Name: idx_answer_feedback_user_created; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_answer_feedback_user_created ON public.answer_feedback USING btree (user_id, created_at DESC);


--
-- Name: idx_chat_memory_session; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_chat_memory_session ON public.chat_memory USING btree (session_id);


--
-- Name: idx_document_trees_json; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_document_trees_json ON public.document_trees USING gin (tree_json);


--
-- Name: idx_evaluation_datasets_active_created; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_evaluation_datasets_active_created ON public.evaluation_datasets USING btree (is_active, created_at DESC);


--
-- Name: idx_graph_edges_user_source; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_graph_edges_user_source ON public.graph_edges USING btree (user_id, source_node_key);


--
-- Name: idx_graph_edges_user_target; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_graph_edges_user_target ON public.graph_edges USING btree (user_id, target_node_key);


--
-- Name: idx_graph_nodes_user_label; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_graph_nodes_user_label ON public.graph_nodes USING btree (user_id, label);


--
-- Name: idx_graph_nodes_user_type; Type: INDEX; Schema: public; Owner: -
--

CREATE INDEX idx_graph_nodes_user_type ON public.graph_nodes USING btree (user_id, node_type);
|
| 1169 |
+
|
| 1170 |
+
|
| 1171 |
+
--
|
| 1172 |
+
-- Name: idx_query_traces_review_state_created; Type: INDEX; Schema: public; Owner: -
|
| 1173 |
+
--
|
| 1174 |
+
|
| 1175 |
+
CREATE INDEX idx_query_traces_review_state_created ON public.query_traces USING btree (review_state, created_at DESC);
|
| 1176 |
+
|
| 1177 |
+
|
| 1178 |
+
--
|
| 1179 |
+
-- Name: idx_query_traces_session_created; Type: INDEX; Schema: public; Owner: -
|
| 1180 |
+
--
|
| 1181 |
+
|
| 1182 |
+
CREATE INDEX idx_query_traces_session_created ON public.query_traces USING btree (session_id, created_at DESC);
|
| 1183 |
+
|
| 1184 |
+
|
| 1185 |
+
--
|
| 1186 |
+
-- Name: idx_query_traces_user_created; Type: INDEX; Schema: public; Owner: -
|
| 1187 |
+
--
|
| 1188 |
+
|
| 1189 |
+
CREATE INDEX idx_query_traces_user_created ON public.query_traces USING btree (user_id, created_at DESC);
|
| 1190 |
+
|
| 1191 |
+
|
| 1192 |
+
--
|
| 1193 |
+
-- Name: ingested_files_hash_idx; Type: INDEX; Schema: public; Owner: -
|
| 1194 |
+
--
|
| 1195 |
+
|
| 1196 |
+
CREATE INDEX ingested_files_hash_idx ON public.ingested_files USING btree (file_hash);
|
| 1197 |
+
|
| 1198 |
+
|
| 1199 |
+
--
|
| 1200 |
+
-- Name: ingested_files_user_file_hash_uidx; Type: INDEX; Schema: public; Owner: -
|
| 1201 |
+
--
|
| 1202 |
+
|
| 1203 |
+
CREATE UNIQUE INDEX ingested_files_user_file_hash_uidx ON public.ingested_files USING btree (user_id, file_hash);
|
| 1204 |
+
|
| 1205 |
+
|
| 1206 |
+
--
|
| 1207 |
+
-- Name: ingested_files_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 1208 |
+
--
|
| 1209 |
+
|
| 1210 |
+
CREATE INDEX ingested_files_user_id_idx ON public.ingested_files USING btree (user_id);
|
| 1211 |
+
|
| 1212 |
+
|
| 1213 |
+
--
|
| 1214 |
+
-- Name: ingestion_retry_logs_created_at_idx; Type: INDEX; Schema: public; Owner: -
|
| 1215 |
+
--
|
| 1216 |
+
|
| 1217 |
+
CREATE INDEX ingestion_retry_logs_created_at_idx ON public.ingestion_retry_logs USING btree (created_at DESC);
|
| 1218 |
+
|
| 1219 |
+
|
| 1220 |
+
--
|
| 1221 |
+
-- Name: ingestion_retry_logs_user_file_event_idx; Type: INDEX; Schema: public; Owner: -
|
| 1222 |
+
--
|
| 1223 |
+
|
| 1224 |
+
CREATE INDEX ingestion_retry_logs_user_file_event_idx ON public.ingestion_retry_logs USING btree (user_id, file_hash, event_type, created_at DESC);
|
| 1225 |
+
|
| 1226 |
+
|
| 1227 |
+
--
|
| 1228 |
+
-- Name: ingestion_retry_logs_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 1229 |
+
--
|
| 1230 |
+
|
| 1231 |
+
CREATE INDEX ingestion_retry_logs_user_id_idx ON public.ingestion_retry_logs USING btree (user_id);
|
| 1232 |
+
|
| 1233 |
+
|
| 1234 |
+
--
|
| 1235 |
+
-- Name: intent_feedback_user_id_idx; Type: INDEX; Schema: public; Owner: -
|
| 1236 |
+
--
|
| 1237 |
+
|
| 1238 |
+
CREATE INDEX intent_feedback_user_id_idx ON public.intent_feedback USING btree (user_id);
|
| 1239 |
+
|
| 1240 |
+
|
| 1241 |
+
--
|
| 1242 |
+
-- Name: rerank_feedback_doc_type_idx; Type: INDEX; Schema: public; Owner: -
|
| 1243 |
+
--
|
| 1244 |
+
|
| 1245 |
+
CREATE INDEX rerank_feedback_doc_type_idx ON public.rerank_feedback USING btree (document_type);
|
| 1246 |
+
|
| 1247 |
+
|
| 1248 |
+
--
|
| 1249 |
+
-- Name: rerank_feedback_user_created_idx; Type: INDEX; Schema: public; Owner: -
|
| 1250 |
+
--
|
| 1251 |
+
|
| 1252 |
+
CREATE INDEX rerank_feedback_user_created_idx ON public.rerank_feedback USING btree (user_id, created_at DESC);
|
| 1253 |
+
|
| 1254 |
+
|
| 1255 |
+
--
-- Trigger: keep category_centroids.updated_at fresh on every UPDATE.
--

CREATE TRIGGER trg_centroids_updated_at BEFORE UPDATE ON public.category_centroids FOR EACH ROW EXECUTE FUNCTION public._trg_set_updated_at();


--
-- Foreign keys into query_traces. Feedback rows die with their trace;
-- evaluation dataset rows merely lose the back-reference.
--

ALTER TABLE ONLY public.answer_feedback
    ADD CONSTRAINT answer_feedback_trace_id_fkey FOREIGN KEY (trace_id) REFERENCES public.query_traces(trace_id) ON DELETE CASCADE;

ALTER TABLE ONLY public.evaluation_datasets
    ADD CONSTRAINT evaluation_datasets_trace_id_fkey FOREIGN KEY (trace_id) REFERENCES public.query_traces(trace_id) ON DELETE SET NULL;
--
-- Row-level security. Every table enables RLS; owner-scoped CRUD policies
-- (user_id = auth.uid()) exist only for tables the client touches directly.
-- Tables with RLS enabled but no policy are backend-only (service role bypasses RLS).
--

ALTER TABLE public.answer_feedback ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY;

-- chat_memory: owner-scoped CRUD.
ALTER TABLE public.chat_memory ENABLE ROW LEVEL SECURITY;
CREATE POLICY chat_memory_delete_own ON public.chat_memory FOR DELETE USING ((user_id = auth.uid()));
CREATE POLICY chat_memory_insert_own ON public.chat_memory FOR INSERT WITH CHECK ((user_id = auth.uid()));
CREATE POLICY chat_memory_select_own ON public.chat_memory FOR SELECT USING ((user_id = auth.uid()));
CREATE POLICY chat_memory_update_own ON public.chat_memory FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));

-- document_trees: owner-scoped CRUD.
ALTER TABLE public.document_trees ENABLE ROW LEVEL SECURITY;
CREATE POLICY document_trees_delete_own ON public.document_trees FOR DELETE USING ((user_id = auth.uid()));
CREATE POLICY document_trees_insert_own ON public.document_trees FOR INSERT WITH CHECK ((user_id = auth.uid()));
CREATE POLICY document_trees_select_own ON public.document_trees FOR SELECT USING ((user_id = auth.uid()));
CREATE POLICY document_trees_update_own ON public.document_trees FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));

-- documents: owner-scoped CRUD.
ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
CREATE POLICY documents_delete_own ON public.documents FOR DELETE USING ((user_id = auth.uid()));
CREATE POLICY documents_insert_own ON public.documents FOR INSERT WITH CHECK ((user_id = auth.uid()));
CREATE POLICY documents_select_own ON public.documents FOR SELECT USING ((user_id = auth.uid()));
CREATE POLICY documents_update_own ON public.documents FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));

ALTER TABLE public.evaluation_datasets ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.graph_edges ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.graph_nodes ENABLE ROW LEVEL SECURITY;

-- ingested_files: owner-scoped CRUD.
ALTER TABLE public.ingested_files ENABLE ROW LEVEL SECURITY;
CREATE POLICY ingested_files_delete_own ON public.ingested_files FOR DELETE USING ((user_id = auth.uid()));
CREATE POLICY ingested_files_insert_own ON public.ingested_files FOR INSERT WITH CHECK ((user_id = auth.uid()));
CREATE POLICY ingested_files_select_own ON public.ingested_files FOR SELECT USING ((user_id = auth.uid()));
CREATE POLICY ingested_files_update_own ON public.ingested_files FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));

ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.query_traces ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;

--
-- PostgreSQL database dump complete
--

\unrestrict 32urOXpOnsQS0zoo7jGTkIs0BeRgGPyJVLWPDJ6IexS9GSsM4lpkxJaAg6FM0Ua
|
tests/test_guest_mode.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import jwt
|
| 2 |
+
from starlette.requests import Request
|
| 3 |
+
|
| 4 |
+
from backend.core.auth_utils import is_guest_token
|
| 5 |
+
from backend.main import _rate_limit_key
|
| 6 |
+
|
| 7 |
+
# Symmetric HS256 signing keys used only within this test module.
# Kept deliberately long so PyJWT does not warn about weak keys.
_TEST_GUEST_KEY = "guest-secret-key-that-is-long-enough"
_TEST_USER_KEY = "user-secret-key-that-is-long-enough"
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _make_request(headers: dict[str, str], client_ip: str = "127.0.0.1") -> Request:
    """Build a minimal Starlette HTTP ``Request`` carrying *headers* from *client_ip*."""
    # ASGI requires header names/values as latin-1 byte pairs, names lowercased.
    encoded_headers = []
    for name, value in headers.items():
        encoded_headers.append((name.lower().encode("latin-1"), value.encode("latin-1")))
    return Request(
        {
            "type": "http",
            "method": "GET",
            "path": "/",
            "scheme": "http",
            "client": (client_ip, 4321),
            "server": ("testserver", 80),
            "headers": encoded_headers,
        }
    )
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_is_guest_token_detects_anonymous_provider():
    """A JWT whose app_metadata provider is 'anonymous' is classified as a guest."""
    claims = {
        "sub": "11111111-1111-1111-1111-111111111111",
        "app_metadata": {"provider": "anonymous", "providers": ["anonymous"]},
    }
    token = jwt.encode(claims, _TEST_GUEST_KEY, algorithm="HS256")
    assert is_guest_token(token) is True
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def test_is_guest_token_ignores_regular_authenticated_user():
    """A JWT from a regular provider (email) must not be treated as a guest."""
    claims = {
        "sub": "22222222-2222-2222-2222-222222222222",
        "app_metadata": {"provider": "email", "providers": ["email"]},
    }
    token = jwt.encode(claims, _TEST_USER_KEY, algorithm="HS256")
    assert is_guest_token(token) is False
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_rate_limit_key_uses_ip_for_guest_tokens():
    """Guest traffic is rate-limited by client IP, not by the (ephemeral) token."""
    claims = {
        "sub": "33333333-3333-3333-3333-333333333333",
        "app_metadata": {"provider": "anonymous"},
    }
    guest_token = jwt.encode(claims, _TEST_GUEST_KEY, algorithm="HS256")
    # Guests send the token via the X-Auth-Token header.
    request = _make_request({"X-Auth-Token": guest_token}, client_ip="10.0.0.8")
    assert _rate_limit_key(request) == "10.0.0.8"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_rate_limit_key_uses_token_for_regular_users():
    """Authenticated traffic is rate-limited per bearer token."""
    claims = {
        "sub": "44444444-4444-4444-4444-444444444444",
        "app_metadata": {"provider": "email"},
    }
    user_token = jwt.encode(claims, _TEST_USER_KEY, algorithm="HS256")
    request = _make_request(
        {"Authorization": f"Bearer {user_token}"}, client_ip="10.0.0.8"
    )
    assert _rate_limit_key(request) == user_token
|
tests/test_ingest_api.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import tempfile
|
| 5 |
+
from types import SimpleNamespace
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
from fastapi import HTTPException
|
| 9 |
+
from starlette.requests import Request
|
| 10 |
+
|
| 11 |
+
from backend.api import ingest as ingest_api
|
| 12 |
+
from backend.core import pipeline, tasks
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class FakeUploadFile:
    """In-memory stand-in for ``fastapi.UploadFile`` with async read/seek."""

    def __init__(self, filename: str, content: bytes):
        self.filename = filename
        self._content = content
        self._cursor = 0

    async def read(self, size: int = -1) -> bytes:
        """Return up to *size* bytes from the cursor; ``None``/negative reads the rest."""
        if size is None or size < 0:
            size = len(self._content) - self._cursor
        begin = self._cursor
        self._cursor = min(len(self._content), begin + size)
        return self._content[begin:self._cursor]

    async def seek(self, offset: int) -> None:
        """Move the read cursor to *offset*, clamped at zero."""
        self._cursor = max(0, offset)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class FakeCountQuery:
    """Chainable stub mimicking a supabase table query that resolves to a count."""

    def __init__(self, count: int):
        self.count = count

    def select(self, *_args, **_kwargs):
        # Chaining no-op: the real client returns a query builder here.
        return self

    def eq(self, *_args, **_kwargs):
        # Chaining no-op for equality filters.
        return self

    def execute(self):
        """Resolve the query with an object exposing only ``count``."""
        return SimpleNamespace(count=self.count)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class FakeCountSupabase:
    """Minimal supabase-client stub whose every table resolves to a fixed count."""

    def __init__(self, count: int = 0):
        self.count = count

    def table(self, _name: str):
        # The table name is irrelevant for these tests; always hand back a count query.
        return FakeCountQuery(self.count)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _install_fake_magic(monkeypatch):
|
| 56 |
+
monkeypatch.setitem(
|
| 57 |
+
sys.modules,
|
| 58 |
+
"magic",
|
| 59 |
+
SimpleNamespace(from_buffer=lambda *_args, **_kwargs: "application/pdf"),
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _fake_request() -> Request:
    """Bare POST request to the upload endpoint for handler-level invocation."""
    scope = {
        "type": "http",
        "method": "POST",
        "path": "/api/v1/ingest/upload",
        "headers": [],
        "client": ("127.0.0.1", 12345),
    }
    return Request(scope)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_upload_rejects_large_pdf_with_original_http_status(monkeypatch):
    """Oversized uploads surface the handler's own 413 and never reach the queue."""
    _install_fake_magic(monkeypatch)
    monkeypatch.setattr(
        pipeline,
        "_build_supabase_client",
        lambda **_kwargs: FakeCountSupabase(count=0),
    )
    # A worker appears available, but queuing must never happen for a rejected file.
    monkeypatch.setattr(ingest_api, "celery_app", SimpleNamespace())

    def _refuse_queue(*_args, **_kwargs):
        raise AssertionError("should not queue")

    monkeypatch.setattr(
        ingest_api, "process_pdf_task", SimpleNamespace(delay=_refuse_queue)
    )
    # Shrink both limits to 1 MB so a 2 MB payload trips the size check.
    monkeypatch.setattr(ingest_api.config, "MAX_UPLOAD_MB", 1, raising=False)
    monkeypatch.setattr(ingest_api.config, "GUEST_MAX_UPLOAD_MB", 1, raising=False)

    oversized = FakeUploadFile("guide.pdf", b"%PDF-1.4\n" + (b"x" * (2 * 1024 * 1024)))

    with pytest.raises(HTTPException) as exc_info:
        asyncio.run(
            ingest_api.upload(
                request=_fake_request(),
                file=oversized,
                user_id="user-1",
                x_auth_token="token",
            )
        )

    assert exc_info.value.status_code == 413
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_upload_returns_503_when_worker_is_unavailable(monkeypatch):
    """Uploads fail fast with 503 when no Celery worker is wired up."""
    _install_fake_magic(monkeypatch)
    monkeypatch.setattr(
        pipeline,
        "_build_supabase_client",
        lambda **_kwargs: FakeCountSupabase(count=0),
    )
    # No celery app at all -> the endpoint must refuse instead of queueing.
    monkeypatch.setattr(ingest_api, "celery_app", None)
    monkeypatch.setattr(ingest_api, "process_pdf_task", SimpleNamespace())

    small_pdf = FakeUploadFile("guide.pdf", b"%PDF-1.4\nsmall")

    with pytest.raises(HTTPException) as exc_info:
        asyncio.run(
            ingest_api.upload(
                request=_fake_request(),
                file=small_pdf,
                user_id="user-1",
                x_auth_token="token",
            )
        )

    assert exc_info.value.status_code == 503
    assert "worker is unavailable" in exc_info.value.detail.lower()
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def test_get_ingest_status_requires_available_worker(monkeypatch):
    """Status polling without a Celery app must report service unavailability."""
    monkeypatch.setattr(ingest_api, "celery_app", None)

    with pytest.raises(HTTPException) as exc_info:
        ingest_api.get_ingest_status("task-1")

    assert exc_info.value.status_code == 503
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def test_process_pdf_task_impl_preserves_original_exception_and_cleans_temp_file(monkeypatch):
    """Ingestion failures propagate unchanged and the temp upload is always removed."""
    fd, tmp_path = tempfile.mkstemp(suffix="_guide.pdf")
    os.close(fd)

    def _explode(**_kwargs):
        raise ValueError("boom")

    monkeypatch.setattr(tasks, "run_ingestion", _explode)

    # Celery task shim: the impl only calls update_state on it.
    fake_task = SimpleNamespace(update_state=lambda **_kwargs: None)

    with pytest.raises(ValueError, match="boom"):
        tasks._process_pdf_task_impl(fake_task, tmp_path, "guide.pdf", "token")

    # Cleanup must run even on the failure path.
    assert not os.path.exists(tmp_path)
|
tests/test_pipeline_regressions.py
ADDED
|
@@ -0,0 +1,1831 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from types import SimpleNamespace
|
| 5 |
+
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
|
| 8 |
+
from backend.api import admin
|
| 9 |
+
from backend.api import query as query_api
|
| 10 |
+
from backend.core import auth_utils, pipeline, providers
|
| 11 |
+
from backend.eval import run_eval
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class FakeElement:
    """Minimal stand-in for an `unstructured` partition element.

    Exposes the attributes the pipeline reads: ``text``, ``category`` and a
    ``metadata`` namespace carrying only ``page_number``.
    """

    def __init__(self, text: str, category: str = "Text", page_number: int = 1):
        self.category = category
        self.text = text
        # The real library hangs metadata off an object; a namespace suffices.
        self.metadata = SimpleNamespace(page_number=page_number)

    def __str__(self) -> str:
        # The pipeline stringifies elements to get their body text.
        return self.text
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class FakeIngestionTable:
    """Chainable fake for supabase table calls made during ingestion.

    Every ``execute`` records ``(table, action, filters)`` on the parent fake
    client; inserts additionally record their payload. A ``select`` against
    ``ingested_files`` replays one canned user-overridden row so the
    override-preservation path can be exercised without a database.
    """

    # Row returned for any ``ingested_files`` select.
    _CANNED_INGESTED_ROW = {"document_type": "short_story", "user_overridden": True}

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.action = None
        self.filters = {}
        self.payload = None

    def _mark(self, action):
        # Internal helper: remember the pending action and keep the chain going.
        self.action = action
        return self

    def select(self, *_args):
        return self._mark("select")

    def delete(self):
        return self._mark("delete")

    def upsert(self, payload, on_conflict=None):
        self.payload = payload
        self.on_conflict = on_conflict
        return self._mark("upsert")

    def insert(self, payload):
        self.payload = payload
        return self._mark("insert")

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def contains(self, key, value):
        self.filters[key] = value
        return self

    def limit(self, value):
        # Limits are recorded as a pseudo-filter for assertion purposes.
        self.filters["limit"] = value
        return self

    def execute(self):
        client = self.supabase
        client.ops.append((self.name, self.action, dict(self.filters)))
        if self.action == "insert":
            client.inserts.append((self.name, self.payload))
        if self.action == "select" and self.name == "ingested_files":
            return SimpleNamespace(data=[dict(self._CANNED_INGESTED_ROW)])
        return SimpleNamespace(data=[])
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class FakeIngestionSupabase:
    """Fake supabase client for ingestion tests.

    Hands out :class:`FakeIngestionTable` instances that all append into the
    shared ``ops`` / ``inserts`` lists, so tests can assert on operation order.
    """

    def __init__(self):
        self.ops = []       # (table, action, filters) per executed call
        self.inserts = []   # (table, payload) per executed insert

    def table(self, name: str):
        return FakeIngestionTable(self, name)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class FakeRecoveryTable:
    """Chainable fake table used by the orphaned-upload recovery tests.

    ``select`` queries against ``documents`` and ``ingestion_retry_logs`` are
    answered from the parent fake's in-memory rows; upserts and inserts are
    recorded on the parent so tests can inspect them.
    """

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.action = None
        self.filters = {}
        self.payload = None
        self.limit_value = None

    def select(self, *_args):
        self.action = "select"
        return self

    def upsert(self, payload, on_conflict=None):
        self.action = "upsert"
        self.payload = payload
        self.on_conflict = on_conflict
        self.supabase.upserts.append((self.name, payload, on_conflict))
        return self

    def insert(self, payload):
        self.action = "insert"
        self.payload = payload
        self.supabase.inserts.append((self.name, payload))
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def contains(self, key, value):
        # JSON containment is modelled as a plain filter entry.
        self.filters[key] = value
        return self

    def limit(self, value):
        self.limit_value = value
        return self

    def _truncate(self, rows):
        # Apply the recorded limit, if one was requested.
        return rows if self.limit_value is None else rows[: self.limit_value]

    def execute(self):
        if self.action == "select" and self.name == "documents":
            wanted_hash = (self.filters.get("metadata") or {}).get("file_hash")
            owner = self.filters.get("user_id")
            matches = [
                row
                for row in self.supabase.documents
                if (not owner or row.get("user_id") == owner)
                and (row.get("metadata") or {}).get("file_hash") == wanted_hash
            ]
            return SimpleNamespace(data=self._truncate(matches))
        if self.action == "select" and self.name == "ingestion_retry_logs":
            owner = self.filters.get("user_id")
            wanted_hash = self.filters.get("file_hash")
            event = self.filters.get("event_type")
            matches = [
                row
                for row in self.supabase.ingestion_logs
                if (not owner or row.get("user_id") == owner)
                and (not wanted_hash or row.get("file_hash") == wanted_hash)
                and (not event or row.get("event_type") == event)
            ]
            return SimpleNamespace(data=self._truncate(matches))
        return SimpleNamespace(data=[])
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class FakeRecoverySupabase:
    """Fake supabase client backed by in-memory rows for recovery tests.

    Holds ``documents`` and ``ingestion_retry_logs`` rows to answer reads and
    accumulates every upsert/insert issued through its tables.
    """

    def __init__(self, *, documents=None, ingestion_logs=None):
        # Copy the seed rows so callers cannot be mutated by accident.
        self.documents = list(documents or [])
        self.ingestion_logs = list(ingestion_logs or [])
        self.upserts = []
        self.inserts = []

    def table(self, name: str):
        return FakeRecoveryTable(self, name)
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class FakeRetrieveTable:
    """Read-only fake for the ``ingested_files`` lookup used in retrieval tests."""

    # File hashes this fake knows about and their display filenames.
    _KNOWN_FILES = {
        "A": "About Love Anton Chekhov",
        "B": "BEYOND BOUNDS",
    }

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.filters = {}

    def select(self, *_args):
        return self

    def in_(self, key, values):
        self.filters[key] = tuple(values)
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def execute(self):
        if self.name != "ingested_files":
            return SimpleNamespace(data=[])
        requested = self.filters.get("file_hash", ())
        rows = [
            {"file_hash": item, "filename": self._KNOWN_FILES[item]}
            for item in requested
            if item in self._KNOWN_FILES
        ]
        return SimpleNamespace(data=rows)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class FakeRetrieveRpc:
    """Fake match-documents RPC returning canned chunk rows per file hash.

    Hash ``"A"`` yields two long chunks; any other hash yields a single
    shorter chunk, mirroring the shapes the retrieval code must handle.
    """

    def __init__(self, supabase, params):
        self.supabase = supabase
        self.params = params

    @staticmethod
    def _chunk(chunk_id, body, file_hash, source, index, pages):
        # One row as the real RPC would return it.
        return {
            "id": chunk_id,
            "content": body,
            "metadata": {
                "file_hash": file_hash,
                "source": source,
                "chunk_index": index,
                "document_type": "short_story",
                "page_numbers": pages,
            },
        }

    def execute(self):
        file_hash = self.params["filter"]["file_hash"]
        if file_hash == "A":
            rows = [
                self._chunk("A-1", "A" * 400, "A", "About Love Anton Chekhov", 1, [1]),
                self._chunk("A-2", "B" * 400, "A", "About Love Anton Chekhov", 2, [2]),
            ]
        else:
            rows = [self._chunk("B-1", "C" * 200, "B", "BEYOND BOUNDS", 1, [1])]
        return SimpleNamespace(data=rows)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
class FakeRetrieveSupabase:
    """Fake supabase client wiring retrieval tests to the canned table/RPC fakes."""

    def table(self, name: str):
        return FakeRetrieveTable(self, name)

    def rpc(self, _name: str, params):
        # RPC name is irrelevant; the fake dispatches purely on params.
        return FakeRetrieveRpc(self, params)
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
class FakeServiceTable:
    """Chainable fake for service-role supabase tables.

    Covers ``query_traces``, ``answer_feedback`` and ``evaluation_datasets``:
    reads are answered from in-memory row lists on the parent fake, writes
    mutate those lists so tests can assert on the resulting state.
    """

    def __init__(self, supabase, name: str):
        self.supabase = supabase
        self.name = name
        self.filters = {}
        self.action = None
        self.payload = None

    def insert(self, payload):
        self.action = "insert"
        self.payload = payload
        self.supabase.inserts.append((self.name, payload))
        return self

    def update(self, payload):
        self.action = "update"
        self.payload = payload
        return self

    def upsert(self, payload, on_conflict=None):
        self.action = "upsert"
        self.payload = payload
        self.on_conflict = on_conflict
        self.supabase.upserts.append((self.name, payload, on_conflict))
        return self

    def select(self, *_args):
        self.action = "select"
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def in_(self, key, values):
        self.filters[key] = tuple(values)
        return self

    def limit(self, value):
        self.filters["limit"] = value
        return self

    def _capped(self, rows):
        # Honour a recorded limit filter; default is "everything".
        return rows[: self.filters.get("limit", len(rows))]

    def _select_traces(self):
        trace_ids = self.filters.get("trace_id")

        def matches(row):
            # ``trace_id`` may be absent (no filter), a tuple from ``in_``,
            # or a scalar from ``eq``.
            if trace_ids is None:
                return True
            if isinstance(trace_ids, tuple) and row.get("trace_id") in trace_ids:
                return True
            return row.get("trace_id") == trace_ids

        rows = [row for row in self.supabase.trace_rows if matches(row)]
        for key in ("user_id", "session_id"):
            if key in self.filters:
                rows = [row for row in rows if row.get(key) == self.filters[key]]
        return SimpleNamespace(data=self._capped(rows))

    def _select_feedback(self):
        rows = list(self.supabase.feedback_rows)
        if "promote_to_eval" in self.filters:
            wanted = self.filters["promote_to_eval"]
            # Identity comparison mirrors the original fake's boolean-flag check.
            rows = [row for row in rows if row.get("promote_to_eval") is wanted]
        for key in ("user_id", "trace_id", "id"):
            if key in self.filters:
                rows = [row for row in rows if row.get(key) == self.filters[key]]
        return SimpleNamespace(data=self._capped(rows))

    def execute(self):
        if self.name == "query_traces" and self.action == "select":
            return self._select_traces()
        if self.name == "answer_feedback" and self.action == "select":
            return self._select_feedback()
        if self.name == "evaluation_datasets" and self.action == "select":
            return SimpleNamespace(data=self._capped(list(self.supabase.eval_rows)))
        if self.name == "query_traces" and self.action == "insert":
            self.supabase.trace_rows.append(self.payload)
        if self.name == "answer_feedback" and self.action == "insert":
            self.supabase.feedback_rows.append(self.payload)
        if self.name == "query_traces" and self.action == "update":
            for row in self.supabase.trace_rows:
                if all(row.get(k) == v for k, v in self.filters.items()):
                    row.update(self.payload)
        if self.name == "answer_feedback" and self.action == "update":
            for row in self.supabase.feedback_rows:
                if all(row.get(k) == v for k, v in self.filters.items()):
                    row.update(self.payload)
        if self.name == "evaluation_datasets" and self.action == "upsert":
            trace_id = self.payload.get("trace_id")
            existing = next(
                (row for row in self.supabase.eval_rows if row.get("trace_id") == trace_id),
                None,
            )
            if existing:
                existing.update(self.payload)
            else:
                self.supabase.eval_rows.append(self.payload)
        return SimpleNamespace(data=[])
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
class FakeServiceSupabase:
    """Fake service-role client holding in-memory trace/feedback/eval rows."""

    def __init__(self):
        self.inserts = []
        self.upserts = []
        self.trace_rows = []
        self.feedback_rows = []
        self.eval_rows = []

    def table(self, name: str):
        return FakeServiceTable(self, name)

    def rpc(self, _name: str, _params):
        # RPCs are irrelevant to these tests; always answer with no rows.
        return SimpleNamespace(execute=lambda: SimpleNamespace(data=[]))
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
class FakeGraphServiceTable(FakeServiceTable):
    """Extends the service-table fake with read-only graph node/edge lookups."""

    def execute(self):
        if self.action == "select" and self.name in ("graph_nodes", "graph_edges"):
            source = (
                self.supabase.graph_nodes
                if self.name == "graph_nodes"
                else self.supabase.graph_edges
            )
            rows = list(source)
            if "user_id" in self.filters:
                wanted = self.filters["user_id"]
                rows = [row for row in rows if row.get("user_id") == wanted]
            return SimpleNamespace(data=rows)
        # Everything else behaves exactly like the base service table.
        return super().execute()
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
class FakeGraphServiceSupabase(FakeServiceSupabase):
    """Service fake that also stores graph nodes/edges and serves them via
    :class:`FakeGraphServiceTable`."""

    def __init__(self):
        super().__init__()
        self.graph_nodes = []
        self.graph_edges = []

    def table(self, name: str):
        return FakeGraphServiceTable(self, name)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
class FakeGraphVectorTable:
    """Read-only fake over a fixed list of vector-store rows.

    Supports ``eq`` and JSON ``contains`` filters; ``contains`` on ``metadata``
    is interpreted as "every given key/value pair matches".
    """

    def __init__(self, rows):
        self.rows = rows
        self.filters = {}

    def select(self, *_args):
        return self

    def eq(self, key, value):
        self.filters[key] = value
        return self

    def contains(self, key, value):
        self.filters[key] = value
        return self

    def execute(self):
        selected = list(self.rows)
        if "user_id" in self.filters:
            owner = self.filters["user_id"]
            selected = [row for row in selected if row.get("user_id") == owner]
        wanted_metadata = self.filters.get("metadata")
        if wanted_metadata:
            selected = [
                row
                for row in selected
                if all(
                    (row.get("metadata", {}) or {}).get(key) == value
                    for key, value in wanted_metadata.items()
                )
            ]
        return SimpleNamespace(data=selected)
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
class FakeGraphVectorSupabase:
    """Fake client whose every table view shares one fixed set of rows."""

    def __init__(self, rows):
        self.rows = rows

    def table(self, _name: str):
        # Table name is ignored on purpose: all reads hit the same rows.
        return FakeGraphVectorTable(self.rows)
|
| 425 |
+
|
| 426 |
+
|
| 427 |
+
class FakeRerankResult:
    """Single rerank hit: the candidate's original index plus its score."""

    def __init__(self, index: int, relevance_score: float):
        self.relevance_score = relevance_score
        self.index = index
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
class FakeCohereClient:
    """Fake Cohere rerank client with deterministic scores.

    Exactly two candidates score ``[0.9, 0.8]``; any other candidate count
    scores ``[0.2]``, truncated to the number of documents supplied.
    """

    def __init__(self, *_args, **_kwargs):
        # Accept and ignore whatever the production constructor is given.
        pass

    def rerank(self, model, query, documents, top_n):
        del model, query, top_n
        scores = [0.9, 0.8] if len(documents) == 2 else [0.2]
        hits = [
            FakeRerankResult(index=position, relevance_score=score)
            for position, score in enumerate(scores[: len(documents)])
        ]
        return SimpleNamespace(results=hits)
|
| 449 |
+
|
| 450 |
+
|
| 451 |
+
def test_create_chunks_uses_short_document_settings(monkeypatch):
    """Short documents (<= ~5k chars) must get the tighter chunking knobs."""
    captured = {}

    def record_chunk_kwargs(elements, **kwargs):
        # Capture the knobs create_chunks forwards to chunk_by_title.
        captured["kwargs"] = kwargs
        return list(elements)

    monkeypatch.setattr(pipeline, "chunk_by_title", record_chunk_kwargs)

    chunks = pipeline.create_chunks([FakeElement("short text")], text_chars=5_000)

    assert len(chunks) == 1
    assert captured["kwargs"] == {
        "max_characters": 3000,
        "new_after_n_chars": 2500,
        "combine_text_under_n_chars": 300,
    }
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def test_create_chunks_keeps_large_document_settings(monkeypatch):
    """Large documents keep the default, wider chunking configuration."""
    captured = {}

    def record_chunk_kwargs(elements, **kwargs):
        # Capture the knobs create_chunks forwards to chunk_by_title.
        captured["kwargs"] = kwargs
        return list(elements)

    monkeypatch.setattr(pipeline, "chunk_by_title", record_chunk_kwargs)

    chunks = pipeline.create_chunks([FakeElement("large text")], text_chars=40_000)

    assert len(chunks) == 1
    assert captured["kwargs"] == {
        "max_characters": 8000,
        "new_after_n_chars": 7000,
        "combine_text_under_n_chars": 500,
    }
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def test_predict_and_prefetch_uses_rewriter_provider(monkeypatch):
    """Follow-up prediction must build its LLM with the "rewriter" purpose and
    pre-warm retrieval for each predicted follow-up query."""
    purposes = []
    prefetched = []

    class StubLLM:
        def invoke(self, _messages):
            # A JSON array of predicted follow-up questions, as the real model returns.
            return SimpleNamespace(content='["follow-up question"]')

    def record_build(*, purpose="text", **_kwargs):
        purposes.append(purpose)
        return StubLLM()

    monkeypatch.setattr(
        providers.ProviderFactory, "build_chat_llm", staticmethod(record_build)
    )

    def record_prefetch(**kwargs):
        prefetched.append(kwargs["query"])
        return []

    monkeypatch.setattr(pipeline, "retrieve_chunks", record_prefetch)

    pipeline._predict_and_prefetch(
        original_query="original",
        answer="answer",
        category="short_story",
        session_id="session-1",
        access_token="token",
    )

    assert purposes == ["rewriter"]
    assert prefetched == ["follow-up question"]
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
def test_generate_answer_stream_marks_summary_nodes(monkeypatch):
    """RAPTOR summary chunks must be labelled as synthesized summaries in the
    LLM prompt, while leaf chunks fall back to their ``original_content`` raw
    text — both verified by inspecting the prompt the fake LLM receives."""
    captured = {}

    class FakeLLM:
        async def astream(self, messages):
            # Record the rendered prompt text so the test can assert on it.
            captured["prompt"] = messages[0].content[0]["text"]
            yield SimpleNamespace(content="ok")

    # Stub out the LLM and all memory/telemetry side channels.
    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(pipeline, "_get_episodic_memory", lambda *args, **kwargs: "")
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)

    # A level-2 RAPTOR summary node...
    summary_chunk = Document(
        page_content="Summary body",
        metadata={
            "source": "About Love Anton Chekhov",
            "node_type": "summary",
            "node_level": 2,
            "chunk_index": "summary-1",
            "document_type": "short_story",
            "relevance_score": 0.8,
        },
    )
    # ...and a plain leaf node carrying recoverable raw text.
    leaf_chunk = Document(
        page_content="Leaf fallback",
        metadata={
            "source": "About Love Anton Chekhov",
            "chunk_index": 1,
            "document_type": "short_story",
            "relevance_score": 0.6,
            "original_content": {"raw_text": "Leaf raw text", "tables_html": []},
        },
    )

    async def collect():
        # Drain the async event stream into a list for assertions.
        events = []
        async for event in pipeline.generate_answer_stream(
            chunks=[summary_chunk, leaf_chunk],
            query="summarise this",
            access_token=None,
            category="short_story",
            priority_file_hashes=None,
        ):
            events.append(event)
        return events

    events = asyncio.run(collect())

    assert any(event["type"] == "done" for event in events)
    # Summary node is explicitly marked with its synthesis level in the prompt.
    assert "[SYNTHESIZED CHAPTER SUMMARY - LEVEL 2]" in captured["prompt"]
    assert "TEXT:\nSummary body" in captured["prompt"]
    # Leaf node contributes its original raw text, not the fallback page_content.
    assert "TEXT:\nLeaf raw text" in captured["prompt"]
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
def test_run_ingestion_preserves_user_override_before_cleanup(monkeypatch):
    """A forced re-ingest must read any user-overridden document type BEFORE
    deleting the old ``ingested_files`` row, and feed it back into
    classification as ``forced_category``."""
    fake_supabase = FakeIngestionSupabase()
    captured = {}

    # Identity / fingerprinting stubs: one user, one known file hash,
    # and the file is reported as already ingested (so cleanup runs).
    monkeypatch.setattr(auth_utils, "extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda _path: "file-hash")
    monkeypatch.setattr(
        pipeline, "is_file_already_ingested", lambda *_args, **_kwargs: True
    )
    monkeypatch.setattr(
        pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake_supabase
    )
    monkeypatch.setattr(
        pipeline,
        "_build_service_supabase_client",
        lambda *_args, **_kwargs: fake_supabase,
    )
    monkeypatch.setattr(
        pipeline, "partition_document", lambda _path: [FakeElement("x" * 100)]
    )
    monkeypatch.setattr(pipeline, "extract_images_from_pdf", lambda _path: {})

    def fake_extract_document_entities(
        elements, access_token=None, forced_category=None
    ):
        del elements, access_token
        # Capture what run_ingestion forces, and echo it back as the type.
        captured["forced_category"] = forced_category
        return SimpleNamespace(is_allowed=True, document_type=forced_category)

    monkeypatch.setattr(
        pipeline, "extract_document_entities", fake_extract_document_entities
    )
    # Stub the rest of the pipeline to trivial pass-throughs.
    monkeypatch.setattr(
        pipeline, "create_chunks", lambda elements, text_chars=None: ["chunk"]
    )
    monkeypatch.setattr(
        pipeline,
        "process_chunks",
        lambda *args, **kwargs: (
            [Document(page_content="body", metadata={"source": "Test Doc"})],
            ["doc-1"],
        ),
    )
    monkeypatch.setattr(
        pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids)
    )
    monkeypatch.setattr(pipeline, "upload_to_supabase", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        pipeline, "invalidate_user_cache", lambda *args, **kwargs: None
    )

    result = pipeline.run_ingestion(
        pdf_path="file.pdf",
        force=True,
        original_filename="file.pdf",
        access_token="token",
    )

    # FakeIngestionTable's canned ingested_files row says the user overrode the
    # type to "short_story"; that override must survive the forced re-ingest.
    assert result["document_type"] == "short_story"
    assert captured["forced_category"] == "short_story"

    # The override-lookup select must happen strictly before the cleanup delete.
    select_idx = fake_supabase.ops.index(
        ("ingested_files", "select", {"user_id": "user-1", "file_hash": "file-hash", "limit": 1})
    )
    delete_idx = fake_supabase.ops.index(
        ("ingested_files", "delete", {"user_id": "user-1", "file_hash": "file-hash"})
    )
    assert select_idx < delete_idx
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
def test_upload_to_supabase_uses_batch_rpc_and_skips_success_sleep(monkeypatch):
    """Uploads must go through the ``insert_document_chunks_batch`` RPC with
    all rows in one call, and must not sleep (back off) on the success path."""
    calls = []
    sleeps = []

    class FakeRpc:
        def __init__(self, name, params):
            self.name = name
            self.params = params

        def execute(self):
            # Record each RPC invocation for the assertions below.
            calls.append((self.name, self.params))
            return SimpleNamespace(data=[])

    class FakeBatchSupabase:
        def rpc(self, name, params):
            return FakeRpc(name, params)

    class FakeEmbedder:
        def embed_documents(self, texts):
            # Deterministic tiny embeddings: [position, text length].
            return [[float(i), float(len(text))] for i, text in enumerate(texts, 1)]

    monkeypatch.setattr(auth_utils, "safe_extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "_build_embeddings", lambda: FakeEmbedder())
    monkeypatch.setattr(
        pipeline,
        "_build_service_supabase_client",
        lambda *_args, **_kwargs: FakeBatchSupabase(),
    )
    # Intercept retry/backoff sleeps so a success-path sleep would be visible.
    monkeypatch.setattr(pipeline.time, "sleep", lambda seconds: sleeps.append(seconds))

    docs = [
        Document(page_content="alpha", metadata={"source": "A", "node_type": "leaf"}),
        Document(page_content="beta", metadata={"source": "B", "node_type": "summary"}),
    ]

    pipeline.upload_to_supabase(
        docs,
        ["doc-1", "doc-2"],
        access_token="token",
    )

    assert calls
    # Single batched RPC carrying both rows, and no backoff sleeps at all.
    assert calls[0][0] == "insert_document_chunks_batch"
    assert len(calls[0][1]["p_rows"]) == 2
    assert sleeps == []
|
| 692 |
+
|
| 693 |
+
|
| 694 |
+
def test_run_ingestion_records_stage_timing_events(monkeypatch):
    """Each major ingestion stage (partition, classify, chunk_process, raptor,
    upload) must emit a ``stage_timing`` event into ``ingestion_retry_logs``."""
    fake_supabase = FakeIngestionSupabase()

    # Identity / fingerprinting stubs; the file is NOT previously ingested.
    monkeypatch.setattr(auth_utils, "extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda _path: "file-hash")
    monkeypatch.setattr(
        pipeline, "is_file_already_ingested", lambda *_args, **_kwargs: False
    )
    monkeypatch.setattr(
        pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake_supabase
    )
    monkeypatch.setattr(
        pipeline,
        "_build_service_supabase_client",
        lambda *_args, **_kwargs: fake_supabase,
    )
    # Stub every heavyweight stage to a trivial pass-through; only the
    # surrounding timing instrumentation is under test here.
    monkeypatch.setattr(
        pipeline, "partition_document", lambda _path: [FakeElement("x" * 120)]
    )
    monkeypatch.setattr(pipeline, "extract_images_from_pdf", lambda _path: {})
    monkeypatch.setattr(
        pipeline,
        "extract_document_entities",
        lambda *args, **kwargs: SimpleNamespace(
            is_allowed=True,
            document_type="short_story",
            primary_topics=[],
            brief_summary="Short story",
            key_entities=[],
        ),
    )
    monkeypatch.setattr(pipeline, "create_chunks", lambda elements, text_chars=None: ["chunk"])
    monkeypatch.setattr(
        pipeline,
        "process_chunks",
        lambda *args, **kwargs: (
            [Document(page_content="body", metadata={"source": "Test Doc"})],
            ["doc-1"],
        ),
    )
    monkeypatch.setattr(
        pipeline, "build_raptor_tree", lambda docs, ids, user_id: (docs, ids)
    )
    monkeypatch.setattr(pipeline, "upload_to_supabase", lambda *args, **kwargs: None)
    monkeypatch.setattr(
        pipeline, "invalidate_user_cache", lambda *args, **kwargs: None
    )

    pipeline.run_ingestion(
        pdf_path="file.pdf",
        force=False,
        original_filename="file.pdf",
        access_token="token",
    )

    # Collect the stage_timing payloads written to ingestion_retry_logs.
    stage_rows = [
        payload
        for name, payload in fake_supabase.inserts
        if name == "ingestion_retry_logs" and payload.get("event_type") == "stage_timing"
    ]
    stages = {payload["message"] for payload in stage_rows}
    # The stage name is embedded as JSON inside the message text.
    assert any('"stage": "partition"' in stage for stage in stages)
    assert any('"stage": "classify"' in stage for stage in stages)
    assert any('"stage": "chunk_process"' in stage for stage in stages)
    assert any('"stage": "raptor"' in stage for stage in stages)
    assert any('"stage": "upload"' in stage for stage in stages)
|
| 760 |
+
|
| 761 |
+
|
| 762 |
+
def test_recover_or_prepare_orphaned_upload_repairs_completed_upload(monkeypatch):
    """An upload that finished writing chunks but lost its registry row is repaired.

    Seeds a fake service client with two stored chunks plus an
    ``upload_complete`` ingestion log, then verifies the recovery helper
    re-registers the file in ``ingested_files`` instead of re-ingesting.
    """
    fake_service = FakeRecoverySupabase(
        documents=[
            {
                "user_id": "user-1",
                "content": "Abdul Manan — Deep Foundations Guide",
                "metadata": {
                    "file_hash": "file-hash",
                    "source": "Recovered Guide",
                    "document_type": "technical_guide",
                },
            },
            {
                "user_id": "user-1",
                "content": 'The "Why Before What" Bible for ML/DL/AI Engineering',
                "metadata": {
                    "file_hash": "file-hash",
                    "source": "Recovered Guide",
                    "document_type": "technical_guide",
                },
            },
        ],
        ingestion_logs=[
            {
                "user_id": "user-1",
                "file_hash": "file-hash",
                "event_type": "upload_complete",
            }
        ],
    )
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_service
    )

    result = pipeline._recover_or_prepare_orphaned_upload(
        "file-hash",
        user_id="user-1",
        filename_hint="fallback.pdf",
    )

    assert result["recovered_existing"] is True
    # The repaired registry row must reflect what was already stored.
    upsert = next(item for item in fake_service.upserts if item[0] == "ingested_files")
    assert upsert[1]["file_hash"] == "file-hash"
    assert upsert[1]["document_type"] == "technical_guide"
    assert upsert[1]["chunk_count"] == 2
|
| 807 |
+
|
| 808 |
+
|
| 809 |
+
def test_run_ingestion_short_circuits_on_recovered_existing_upload(monkeypatch):
    """run_ingestion must return early when recovery finds an existing upload.

    The recovery stub reports ``recovered_existing=True``; partitioning is
    stubbed to raise so any attempt to recompute the document fails the test.
    """
    monkeypatch.setattr(auth_utils, "extract_jwt_sub", lambda _token: "user-1")
    monkeypatch.setattr(pipeline, "get_file_fingerprint", lambda _path: "file-hash")
    monkeypatch.setattr(
        pipeline, "is_file_already_ingested", lambda *_args, **_kwargs: False
    )
    monkeypatch.setattr(
        pipeline,
        "_recover_or_prepare_orphaned_upload",
        lambda *_args, **_kwargs: {
            "pending_review": False,
            "document_type": "technical_guide",
            "filename": "Recovered Guide",
            "file_hash": "file-hash",
            "recovered_existing": True,
        },
    )
    # Generator trick raises immediately if partitioning is ever invoked.
    monkeypatch.setattr(
        pipeline,
        "partition_document",
        lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("should not recompute")),
    )

    result = pipeline.run_ingestion(
        pdf_path="file.pdf",
        force=False,
        original_filename="file.pdf",
        access_token="token",
    )

    assert result["recovered_existing"] is True
    assert result["file_hash"] == "file-hash"
|
| 841 |
+
|
| 842 |
+
|
| 843 |
+
def test_identity_json_extracts_cover_metadata():
    """First-page elements are split into title, subtitle, and named owner."""
    identity = pipeline._identity_json_from_elements(
        [
            FakeElement("Abdul Manan — Deep Foundations Guide", page_number=1),
            FakeElement('The "Why Before What" Bible for ML/DL/AI Engineering', page_number=1),
            FakeElement(
                "This guide exists because knowing definitions is not enough. Most people learn ML backwards.",
                page_number=1,
            ),
        ],
        fallback_title="Fallback Guide",
    )

    # Display title should be the actual title line; the personalized cover-owner line is stored separately.
    assert identity["display_title"] == 'The "Why Before What" Bible for ML/DL/AI Engineering'
    assert identity["subtitle"] == "This guide exists because knowing definitions is not enough. Most people learn ML backwards."
    assert identity["named_owner"] == "Abdul Manan"
    assert "knowing definitions is not enough" in identity["opening_page_summary"].lower()
    assert identity["field_presence"]["publisher"] is False
|
| 862 |
+
|
| 863 |
+
|
| 864 |
+
def test_identity_json_strips_null_bytes_from_opening_page_fields():
    """NUL bytes in extracted text must never survive into the identity JSON."""
    identity = pipeline._identity_json_from_elements(
        [
            FakeElement("Abdul\x00 Manan — Deep Foundations Guide", page_number=1),
            FakeElement('The "Why Before What"\x00 Bible for ML/DL/AI Engineering', page_number=1),
            FakeElement("Publisher:\x00 Not stated", page_number=1),
        ],
        fallback_title="Fallback Guide",
    )

    # Postgres JSON columns reject \u0000, so the serialized form must be clean too.
    serialized = json.dumps(identity)
    assert "\u0000" not in serialized
    assert "\x00" not in identity["display_title"]
    assert "\x00" not in identity["subtitle"]
    assert "\x00" not in identity["cover_text"]
|
| 879 |
+
|
| 880 |
+
|
| 881 |
+
def test_identity_json_from_docs_dedupes_repeated_opening_page_content():
    """Identical opening-page rows should contribute their text only once."""
    repeated_row = {
        "content": (
            "Abdul Manan — Deep Foundations Guide\n"
            'The "Why Before What" Bible for ML/DL/AI Engineering\n'
            "This guide exists because knowing definitions is not enough."
        ),
        "metadata": {
            "page_numbers": [1],
            "original_content": {
                "raw_text": (
                    "Abdul Manan — Deep Foundations Guide\n"
                    'The "Why Before What" Bible for ML/DL/AI Engineering\n'
                    "This guide exists because knowing definitions is not enough."
                )
            },
        },
    }

    # Same row passed twice simulates duplicated stored chunks for page 1.
    identity = pipeline._identity_json_from_docs(
        [repeated_row, repeated_row],
        fallback_title="Fallback Guide",
    )

    assert identity["cover_text"].count("Abdul Manan — Deep Foundations Guide") == 1
    assert (
        identity["opening_page_summary"].count(
            'The "Why Before What" Bible for ML/DL/AI Engineering'
        )
        == 1
    )
|
| 912 |
+
|
| 913 |
+
|
| 914 |
+
def test_classify_query_route_decision_marks_exact_fact_query():
    """An identity-style question is routed as an exact-fact owner lookup."""
    question = "Whose guide is this? Answer using the exact name written in the document."
    result = pipeline._classify_query_route_decision(question)

    assert result.route_class == "exact_fact"
    assert result.exact_field == "owner"
    assert result.preserve_query is True
    assert result.disable_memory is True
|
| 923 |
+
|
| 924 |
+
|
| 925 |
+
def test_classify_query_route_decision_marks_page_scoped_query():
    """A request limited to the first page is routed as page-scoped."""
    question = "Summarize only the first page, not the whole guide."
    result = pipeline._classify_query_route_decision(question)

    assert result.route_class == "page_scoped"
    assert result.page_scope == "first_page"
    assert result.preserve_query is True
    assert result.disable_memory is True
|
| 934 |
+
|
| 935 |
+
|
| 936 |
+
class FakeAmbiguityTable:
    """Chainable stand-in for a Supabase table query builder.

    Records the last action and any equality filters so tests can inspect
    what the code under test queried, and hands back the seeded rows.
    """

    def __init__(self, rows):
        self.rows = rows
        self.filters = {}
        self.action = None

    def select(self, *_args):
        """Remember that a select was issued; returns self for chaining."""
        self.action = "select"
        return self

    def eq(self, key, value):
        """Record one equality filter; returns self for chaining."""
        self.filters.update({key: value})
        return self

    def execute(self):
        """Return the seeded rows wrapped like a Supabase response object."""
        snapshot = [*self.rows]
        return SimpleNamespace(data=snapshot)
|
| 952 |
+
|
| 953 |
+
|
| 954 |
+
class FakeAmbiguityRpc:
    """Stand-in for a Supabase RPC handle that returns canned match scores."""

    def __init__(self, supabase, params):
        self.supabase = supabase
        self.params = params

    def execute(self):
        """Log this call on the parent fake and return a scored row.

        File hash "A" scores higher (0.22) than anything else (0.11), which
        lets tests exercise ranking between pinned documents.
        """
        self.supabase.rpc_calls.append(self.params)
        filter_payload = self.params.get("filter") or {}
        if filter_payload.get("file_hash") == "A":
            score = 0.22
        else:
            score = 0.11
        return SimpleNamespace(data=[{"combined_score": score}])
|
| 964 |
+
|
| 965 |
+
|
| 966 |
+
class FakeAmbiguitySupabase:
    """Top-level fake Supabase client wiring together the ambiguity fakes."""

    def __init__(self, rows):
        self.rows = rows
        self.rpc_calls = []

    def table(self, _name: str):
        """Return a fresh chainable table fake seeded with the test rows."""
        seeded = FakeAmbiguityTable(self.rows)
        return seeded

    def rpc(self, _name: str, params):
        """Return an RPC fake that records its invocation on this client."""
        handle = FakeAmbiguityRpc(self, params)
        return handle
|
| 976 |
+
|
| 977 |
+
|
| 978 |
+
def test_check_query_ambiguity_forces_clarification_for_identity_queries_in_multi_doc_scope(monkeypatch):
    """Identity questions over several docs must ask for clarification.

    With two docs in scope, no scoring RPC should even be attempted — the
    clarification path is taken before ranking.
    """
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "A", "filename": "Guide A.pdf"},
            {"file_hash": "B", "filename": "Guide B.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity("Whose guide is this?", access_token=None, category="All")

    assert res["is_ambiguous"] is True
    assert res["top_file_hash"] is None
    assert res["clarification_options"]
    # Ranking must be skipped entirely for identity queries in multi-doc scope.
    assert fake.rpc_calls == []
|
| 993 |
+
|
| 994 |
+
|
| 995 |
+
def test_check_query_ambiguity_rpc_includes_p_user_id_to_avoid_overload(monkeypatch):
    """Every ambiguity-scoring RPC call must pass p_user_id explicitly.

    Omitting it would hit an overloaded Postgres function signature.
    """
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "A", "filename": "Doc A.pdf"},
            {"file_hash": "B", "filename": "Doc B.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity("summarize the document", access_token=None, category="All")

    assert res["is_ambiguous"] is True
    assert fake.rpc_calls, "Expected ambiguity scoring RPC calls"
    assert all("p_user_id" in call for call in fake.rpc_calls)
|
| 1009 |
+
|
| 1010 |
+
|
| 1011 |
+
def test_check_query_ambiguity_autopins_single_doc_in_category_even_for_identity_query(monkeypatch):
    """A category with exactly one doc needs no clarification — auto-pin it."""
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "ONLY", "filename": "Only Doc.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity(
        "Whose guide is this?",
        access_token=None,
        category="technical_guide",
    )

    assert res["is_ambiguous"] is False
    assert res["top_file_hash"] == "ONLY"
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
+
def test_check_query_ambiguity_lists_only_three_options_when_many_docs(monkeypatch):
    """Clarification prompts are capped at three candidate documents."""
    fake = FakeAmbiguitySupabase(
        rows=[
            {"file_hash": "A", "filename": "A.pdf"},
            {"file_hash": "B", "filename": "B.pdf"},
            {"file_hash": "C", "filename": "C.pdf"},
            {"file_hash": "D", "filename": "D.pdf"},
            {"file_hash": "E", "filename": "E.pdf"},
        ]
    )
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    res = pipeline.check_query_ambiguity("What is the exact full title?", access_token=None, category="All")

    assert res["is_ambiguous"] is True
    assert len(res.get("clarification_options") or []) == 3
|
| 1045 |
+
|
| 1046 |
+
|
| 1047 |
+
def test_query_followup_guard_detects_ordinal_without_enumeration():
    """An ordinal follow-up is flagged even when no prior list exists."""
    followup_detected = query_api._contains_ordinal_followup("What about the second one?")
    history = [{"role": "assistant", "content": "No list here."}]
    has_enumeration = query_api._history_has_explicit_enumeration(history)

    assert followup_detected is True
    assert has_enumeration is False
|
| 1052 |
+
|
| 1053 |
+
|
| 1054 |
+
def test_query_followup_guard_allows_ordinal_when_prior_answer_lists_items():
    """An ordinal follow-up is valid when the prior answer enumerated items."""
    prior_turns = [{"role": "assistant", "content": "1. Alice\n2. Bob\n"}]

    assert query_api._contains_ordinal_followup("What about the second one?") is True
    assert query_api._history_has_explicit_enumeration(prior_turns) is True
|
| 1060 |
+
|
| 1061 |
+
|
| 1062 |
+
def test_generate_sub_queries_skips_rewrite_for_exact_fact(monkeypatch):
    """Exact-fact routes must bypass the LLM query rewriter entirely.

    The rewriter factory is stubbed to raise, so any call fails the test.
    """
    monkeypatch.setattr(
        providers.ProviderFactory,
        "build_chat_llm",
        staticmethod(lambda **_kwargs: (_ for _ in ()).throw(AssertionError("rewriter should not be called"))),
    )

    queries = pipeline.generate_sub_queries(
        "What is the exact full title of this guide?",
        route_class="exact_fact",
    )

    # The query passes through verbatim as the only sub-query.
    assert queries == ["What is the exact full title of this guide?"]
|
| 1075 |
+
|
| 1076 |
+
|
| 1077 |
+
def test_identity_documents_for_query_answers_not_stated_publisher():
    """A publisher query with field_presence False yields a 'not stated' doc."""
    row = {
        "filename": "Guide.pdf",
        "identity_json": {
            "display_title": "Abdul Manan — Deep Foundations Guide",
            "field_presence": {"publisher": False},
            "source_pages": [1],
        },
    }
    route_decision = pipeline.RouteDecision(
        route_class="exact_fact",
        route_reason="identity_field:publisher",
        exact_field="publisher",
    )

    docs = pipeline._identity_documents_for_query(
        row,
        query="Does this guide explicitly name a publisher on the opening pages? If not, say not stated.",
        route_decision=route_decision,
    )

    assert len(docs) == 1
    assert "not stated on the opening pages" in docs[0].page_content.lower()
    assert docs[0].metadata["retrieval_branch"] == "identity_store"
|
| 1101 |
+
|
| 1102 |
+
|
| 1103 |
+
def test_build_history_block_returns_structured_state_without_role_labels():
    """History is summarized as structured state, never raw role-labeled turns.

    Raw "USER:"/"ASSISTANT:" labels leaking into the prompt would invite the
    model to echo them back in its output.
    """
    block = pipeline._build_history_block(
        [
            {"role": "user", "content": "Whose guide is this?"},
            {"role": "assistant", "content": "ASSISTANT: Abdul Manan — Deep Foundations Guide."},
        ],
        route_class="factoid",
        eval_mode=False,
    )

    assert "CONVERSATION STATE:" in block
    assert "previous_user_intent:" in block
    assert "previous_answer_summary:" in block
    assert "ASSISTANT:" not in block
    assert "USER:" not in block
|
| 1118 |
+
|
| 1119 |
+
|
| 1120 |
+
def test_save_to_memory_writes_structured_payloads(monkeypatch):
    """Memory writes are structured JSON rows, with source footers stripped."""
    fake_service = FakeServiceSupabase()
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_service
    )
    monkeypatch.setattr(pipeline, "_stable_user_id", lambda *_args, **_kwargs: "user-1")
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _text: [0.1, 0.2])

    chunks = [
        Document(
            page_content="body",
            metadata={"file_hash": "file-1", "document_type": "machine_learning_guide"},
        )
    ]

    # Answer includes a Sources footer that must not be memorized.
    pipeline._save_to_memory(
        "session-1",
        "Whose guide is this?",
        "Abdul Manan — Deep Foundations Guide\n\n---\n**Sources:**\n[Source 1]",
        access_token=None,
        route_class="factoid",
        chunks=chunks,
    )

    # One row for the user query, one for the assistant fact.
    assert len(fake_service.inserts) == 2
    user_payload = json.loads(fake_service.inserts[0][1]["content"])
    answer_payload = json.loads(fake_service.inserts[1][1]["content"])
    assert user_payload["kind"] == "user_query"
    assert answer_payload["kind"] == "assistant_fact"
    assert answer_payload["file_hashes"] == ["file-1"]
    assert "Sources" not in answer_payload["summary"]
|
| 1151 |
+
|
| 1152 |
+
|
| 1153 |
+
def test_generate_answer_stream_eval_mode_skips_history_and_memory_injection(monkeypatch):
    """eval_mode must keep conversation state and episodic memory out of the prompt.

    The fake LLM captures the prompt text it receives; the episodic-memory
    stub only returns session facts when eval_mode is off, so a leak of
    either block into the captured prompt fails the test.
    """
    captured = {}

    class FakeLLM:
        async def astream(self, messages):
            # Capture the rendered prompt so assertions can inspect it.
            captured["prompt"] = messages[0].content[0]["text"]
            yield SimpleNamespace(content="clean answer")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(
        pipeline,
        "_get_episodic_memory",
        lambda *args, **kwargs: "" if kwargs.get("eval_mode") else "SESSION FACTS:\n- prior answer: x\n",
    )
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_persist_query_trace", lambda **_kwargs: "trace-1")

    chunk = Document(
        page_content="Body text",
        metadata={
            "source": "Guide",
            "chunk_index": 1,
            "document_type": "machine_learning_guide",
            "relevance_score": 0.9,
            "route_class": "factoid",
            "original_content": {"raw_text": "Body text", "tables_html": []},
        },
    )

    async def collect():
        events = []
        async for event in pipeline.generate_answer_stream(
            chunks=[chunk],
            query="Tell me more",
            chat_history=[
                {"role": "user", "content": "Who is this guide for?"},
                {"role": "assistant", "content": "It is personalized."},
            ],
            session_id="session-1",
            eval_mode=True,
        ):
            events.append(event)
        return events

    events = asyncio.run(collect())

    assert any(event["type"] == "done" for event in events)
    assert "CONVERSATION STATE:" not in captured["prompt"]
    assert "SESSION FACTS:" not in captured["prompt"]
|
| 1203 |
+
|
| 1204 |
+
|
| 1205 |
+
def test_persist_query_trace_marks_output_echo_and_contamination(monkeypatch):
    """Traces flag role-label echo plus history/memory contamination together.

    The answer starts with "ASSISTANT:" (an output echo) and the chunk's
    trace_quality reports both history and memory injection, so all three
    failure modes must be recorded on the persisted trace row.
    """
    fake_service = FakeServiceSupabase()
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_service
    )
    monkeypatch.setattr(pipeline, "_persist_trace_graph_enrichment", lambda *args, **kwargs: None)

    chunks = [
        Document(
            page_content="body",
            metadata={
                "trace_id": "trace-echo",
                "route_class": "factoid",
                "route_mode": "default",
                "source": "Guide",
                "document_type": "machine_learning_guide",
                "trace_quality": {
                    "retrieval_relevance_proxy": 0.8,
                    "history_injected": True,
                    "memory_injected": True,
                },
            },
        )
    ]

    pipeline._persist_query_trace(
        query="Why does this guide say it exists?",
        session_id="session-1",
        chunks=chunks,
        answer="ASSISTANT: This guide exists because knowing definitions is not enough.",
        access_token=None,
    )

    upsert = next(item for item in fake_service.upserts if item[0] == "query_traces")
    failure_modes = set(upsert[1]["failure_modes"])
    assert {"output_echo", "history_contamination", "memory_contamination"} <= failure_modes
|
| 1241 |
+
|
| 1242 |
+
|
| 1243 |
+
def test_retrieve_chunks_exact_fact_prefers_identity_store(monkeypatch):
    """Exact-fact queries should be answered from the identity store alone.

    With an identity row available, retrieval must return the single
    identity document rather than dense chunk candidates.
    """
    monkeypatch.setattr(pipeline, "_stable_user_id", lambda *_args, **_kwargs: "user-1")
    monkeypatch.setattr(pipeline, "_route_query_experts", lambda *args, **kwargs: {
        "selected_experts": ["dense_chunk"],
        "expert_weights": {"dense_chunk": 1.0},
        "confidence": 0.9,
    })
    monkeypatch.setattr(
        pipeline,
        "_load_or_backfill_identity_row",
        lambda *args, **kwargs: {
            "filename": "Guide.pdf",
            "identity_json": {
                "display_title": "Abdul Manan — Deep Foundations Guide",
                "subtitle": 'The "Why Before What" Bible for ML/DL/AI Engineering',
                "named_owner": "Abdul Manan",
                "field_presence": {"owner": True},
                "source_pages": [1],
            },
        },
    )

    docs = pipeline.retrieve_chunks(
        query="Whose guide is this? Answer using the exact name written in the document.",
        original_query="Whose guide is this? Answer using the exact name written in the document.",
        user_id="user-1",
        priority_file_hashes=["file-1"],
    )

    assert len(docs) == 1
    assert docs[0].metadata["retrieval_branch"] == "identity_store"
    assert "Abdul Manan" in docs[0].page_content
|
| 1275 |
+
|
| 1276 |
+
|
| 1277 |
+
def test_multi_doc_context_budget_preserves_one_chunk_per_pinned_doc(monkeypatch):
    """A tight context budget must still keep at least one chunk per pinned doc.

    MAX_CONTEXT_CHARS is forced down to 700 so the budget cannot fit
    everything; both pinned file hashes must nonetheless survive.
    """
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: FakeRetrieveSupabase()
    )
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _query: [0.1, 0.2])
    monkeypatch.setattr(pipeline.cohere, "Client", FakeCohereClient)
    monkeypatch.setattr(pipeline, "_log_rerank_feedback", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline.config, "MAX_CONTEXT_CHARS", 700, raising=False)

    docs = pipeline.retrieve_chunks(
        query="compare the themes of both stories",
        category="short_story",
        access_token="token",
        priority_file_hashes=["A", "B"],
    )

    assert len(docs) == 2
    assert {doc.metadata["file_hash"] for doc in docs} == {"A", "B"}
|
| 1295 |
+
|
| 1296 |
+
|
| 1297 |
+
def test_build_pinned_query_plan_scopes_title_queries_to_own_doc():
    """Title-bearing sub-queries must target only their own pinned document."""
    pinned_docs = [
        {"file_hash": "A", "filename": "About Love Anton Chekhov"},
        {"file_hash": "B", "filename": "BEYOND BOUNDS"},
    ]
    plan = pipeline._build_pinned_query_plan(
        "summarise the story short story",
        pinned_docs,
        "generic_pinned",
    )

    about_entries = []
    beyond_entries = []
    for entry in plan:
        if "About Love Anton Chekhov" in entry["query_text"]:
            about_entries.append(entry)
        if "BEYOND BOUNDS" in entry["query_text"]:
            beyond_entries.append(entry)

    assert about_entries
    assert beyond_entries
    assert all(entry["target_file_hashes"] == ["A"] for entry in about_entries)
    assert all(entry["target_file_hashes"] == ["B"] for entry in beyond_entries)
|
| 1314 |
+
|
| 1315 |
+
|
| 1316 |
+
def test_partition_document_retries_with_hi_res_when_fast_is_suspiciously_thin(monkeypatch):
    """A thin 'fast' partition result triggers a 'hi_res' retry.

    The fake partitioner returns 50 chars for the fast strategy and three
    500-char elements for hi_res; the strategy order is asserted exactly.
    """
    calls = []

    def fake_partition_pdf(*, filename, strategy, **_kwargs):
        del filename
        calls.append(strategy)
        if strategy == "fast":
            # Suspiciously little text for a real document.
            return [FakeElement("x" * 50, page_number=1)]
        return [
            FakeElement("x" * 500, page_number=1),
            FakeElement("y" * 500, page_number=1),
            FakeElement("z" * 500, page_number=1),
        ]

    monkeypatch.setattr(pipeline, "_has_text_layer", lambda _path: True)
    monkeypatch.setattr(pipeline, "partition_pdf", fake_partition_pdf)

    elements = pipeline.partition_document("file.pdf")

    assert calls == ["fast", "hi_res"]
    assert len(elements) == 3
|
| 1337 |
+
|
| 1338 |
+
|
| 1339 |
+
def test_create_chunks_splits_single_thin_narrative(monkeypatch):
    """One oversized title-chunk of narrative prose must be split further."""
    long_text = (
        '"Every single night..." Lee said softly. '
        "The same demons kept returning, and the weight of them was unbearable. "
        "She kept remembering the dream, the corridor, the whispering, and the crushing fear. "
        "Classes were slipping away from her, and every conversation with the doctor felt more urgent. "
        "Still, she tried to describe what she saw, heard, and felt in careful detail."
    ) * 3

    def fake_chunk_by_title(elements, **_kwargs):
        # Simulate chunk_by_title collapsing everything into one giant chunk.
        del elements
        return [
            SimpleNamespace(
                text=long_text,
                metadata=SimpleNamespace(orig_elements=[FakeElement(long_text)]),
            )
        ]

    monkeypatch.setattr(pipeline, "chunk_by_title", fake_chunk_by_title)

    chunks = pipeline.create_chunks([FakeElement(long_text)], text_chars=len(long_text))

    assert len(chunks) >= 2
    assert all(getattr(chunk, "text", "") for chunk in chunks)
|
| 1363 |
+
|
| 1364 |
+
|
| 1365 |
+
def test_build_raptor_tree_synthesizes_root_for_single_leaf(monkeypatch):
    """Even a one-leaf document gets a synthesized summary root node."""
    class FakeLLM:
        def invoke(self, _messages):
            return SimpleNamespace(content="Root summary")

    monkeypatch.setattr(pipeline, "_build_llm", lambda **_kwargs: FakeLLM())
    leaf = Document(
        page_content="Leaf content",
        metadata={
            "source": "BEYOND BOUNDS",
            "file_hash": "B",
            "document_type": "short_story",
            "summary": "Leaf summary",
            "chunk_index": 1,
            "page_numbers": [1],
        },
    )

    docs, ids = pipeline.build_raptor_tree([leaf], ["leaf-1"], "user-1")

    # Leaf plus one synthesized summary node, with matching ids.
    assert len(docs) == 2
    assert len(ids) == 2
    assert any(doc.metadata.get("node_type") == "summary" for doc in docs)
|
| 1388 |
+
|
| 1389 |
+
|
| 1390 |
+
def test_generic_multi_doc_mode_keeps_weak_doc_with_candidates(monkeypatch):
    """Generic pinned mode keeps a weakly-scoring doc as long as it has candidates."""
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: FakeRetrieveSupabase()
    )
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _query: [0.1, 0.2])
    monkeypatch.setattr(pipeline.cohere, "Client", FakeCohereClient)
    monkeypatch.setattr(pipeline, "_log_rerank_feedback", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline.config, "MAX_CONTEXT_CHARS", 2_000, raising=False)

    docs = pipeline.retrieve_chunks(
        query="summarise the story short story",
        category="short_story",
        access_token="token",
        priority_file_hashes=["A", "B"],
        original_query="summarise the story",
    )

    assert {doc.metadata["file_hash"] for doc in docs} == {"A", "B"}
    assert docs[0].metadata["route_mode"] == "generic_pinned"
    # Diagnostics cover both pinned documents.
    assert len(docs[0].metadata["doc_diagnostics"]) == 2
|
| 1410 |
+
|
| 1411 |
+
|
| 1412 |
+
def test_weighted_doc_prior_fusion_does_not_saturate_scores():
    """Fusing a strong local score with a full doc prior must stay below 1.0."""
    combined = pipeline._combine_local_and_doc_score(0.95, 1.0, 0.2)

    assert combined < 1.0
    assert combined == 0.96
|
| 1416 |
+
|
| 1417 |
+
|
| 1418 |
+
def test_generate_answer_stream_done_event_includes_trace_metadata(monkeypatch):
    """The terminal 'done' event surfaces trace_id and per-doc diagnostics."""
    class FakeLLM:
        async def astream(self, _messages):
            yield SimpleNamespace(content="ok")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(pipeline, "_get_episodic_memory", lambda *args, **kwargs: "")
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_persist_query_trace", lambda **_kwargs: "trace-123")

    chunk = Document(
        page_content="Leaf fallback",
        metadata={
            "source": "About Love Anton Chekhov",
            "chunk_index": 1,
            "document_type": "short_story",
            "relevance_score": 0.6,
            "original_content": {"raw_text": "Leaf raw text", "tables_html": []},
            "trace_id": "trace-123",
            "route_mode": "explicit_compare",
            "doc_diagnostics": [{"file_hash": "A", "source": "About Love Anton Chekhov", "included": True, "candidate_count": 2, "doc_score": 0.6, "confidence_label": "high", "reason": "supported"}],
        },
    )

    async def collect():
        events = []
        async for event in pipeline.generate_answer_stream(
            chunks=[chunk],
            query="compare the themes",
            access_token=None,
            category="short_story",
            priority_file_hashes=["A", "B"],
        ):
            events.append(event)
        return events

    events = asyncio.run(collect())
    done_event = next(event for event in events if event["type"] == "done")

    assert done_event["trace_id"] == "trace-123"
    assert done_event["doc_diagnostics"][0]["source"] == "About Love Anton Chekhov"
|
| 1460 |
+
|
| 1461 |
+
|
| 1462 |
+
def test_generate_answer_stream_sanitizes_template_tokens_and_records_metrics(monkeypatch):
    """Chat-template control tokens are stripped from streamed output.

    The fake LLM emits `<|header_end|>` / `<|eot_id|>` markers split across
    two stream chunks; the sanitizer must remove them (even across chunk
    boundaries) and report sanitizer metrics to the persisted trace.
    """
    captured = {}

    def fake_persist_query_trace(**kwargs):
        # Capture the trace payload so sanitizer metrics can be asserted.
        captured["kwargs"] = kwargs
        return "trace-xyz"

    class FakeLLM:
        async def astream(self, _messages):
            yield SimpleNamespace(content="assistant<|header_end|>Hello")
            yield SimpleNamespace(content=" there<|eot_id|>")

    monkeypatch.setattr(pipeline, "_build_llm", lambda needs_vision=False: FakeLLM())
    monkeypatch.setattr(pipeline, "_get_episodic_memory", lambda *args, **kwargs: "")
    monkeypatch.setattr(pipeline, "_log_retrieval_reward", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_save_to_memory", lambda *args, **kwargs: None)
    monkeypatch.setattr(pipeline, "_persist_query_trace", fake_persist_query_trace)

    chunk = Document(
        page_content="Leaf fallback",
        metadata={
            "source": "About Love Anton Chekhov",
            "chunk_index": 1,
            "document_type": "short_story",
            "relevance_score": 0.6,
            "original_content": {"raw_text": "Leaf raw text", "tables_html": []},
            "trace_id": "trace-xyz",
            "route_mode": "default",
        },
    )

    async def collect():
        events = []
        async for event in pipeline.generate_answer_stream(
            chunks=[chunk],
            query="hello",
            access_token=None,
            category="short_story",
        ):
            events.append(event)
        return events

    events = asyncio.run(collect())
    tokens = "".join(event["content"] for event in events if event["type"] == "token")

    assert "<|" not in tokens
    assert "Hello there" in tokens
    assert captured["kwargs"]["sanitizer_metrics"]["sanitizer_triggered"] is True
    assert captured["kwargs"]["sanitizer_metrics"]["sanitized_token_count"] > 0
|
| 1511 |
+
|
| 1512 |
+
|
| 1513 |
+
def test_duplicate_chunk_collapse_removes_overlap():
    """Near-duplicate candidates within one document collapse to a single survivor."""
    shared_meta = {"file_hash": "doc-a", "source": "Doc A"}
    contents = {
        "a": "Alpha beta gamma delta epsilon zeta",
        "b": "Alpha beta gamma delta epsilon zeta eta",
        "c": "Completely different content",
    }
    candidates = [
        {"id": cid, "content": text, "metadata": dict(shared_meta)}
        for cid, text in contents.items()
    ]

    kept, collapsed = pipeline._collapse_near_duplicate_candidates(candidates)

    # "b" is an overlapping superset of "a", so exactly one candidate is dropped
    # and the distinct "c" survives alongside the first occurrence.
    assert collapsed == 1
    assert [row["id"] for row in kept] == ["a", "c"]
|
| 1536 |
+
|
| 1537 |
+
|
| 1538 |
+
def test_analyse_intent_rewrites_follow_up_query(monkeypatch):
    """A terse follow-up question is enriched with the cached previous query."""
    # Force a confident "no clarification needed" prediction so analyse_intent
    # takes the follow-up routing branch rather than asking for clarification.
    monkeypatch.setattr(
        pipeline.intent_classifier,
        "predict",
        lambda *_args, **_kwargs: {
            "needs_clarification": False,
            "confidence": 0.95,
        },
    )
    monkeypatch.setattr(
        pipeline.intent_classifier,
        "record_feedback",
        lambda *args, **kwargs: None,
    )

    # Seed the module-level per-session caches that analyse_intent consults
    # when deciding whether a query continues the previous conversation turn.
    session_key = pipeline._session_cache_key("sess-1", user_id="user-1")
    pipeline._last_query_context[session_key] = {
        "query": "Compare About Love and BEYOND BOUNDS",
        "updated_at": pipeline.time.time(),
    }
    pipeline._last_chunks[session_key] = [Document(page_content="cached", metadata={})]

    result = pipeline.analyse_intent(
        query="What about the second one?",
        category="All",
        chat_history=[
            {"role": "user", "content": "Compare About Love and BEYOND BOUNDS"},
            {"role": "assistant", "content": "Here is the comparison."},
        ],
        session_id="sess-1",
        user_id="user-1",
    )

    # The route is classified as a follow-up and the enriched query embeds
    # the cached prior question for downstream retrieval.
    assert result["route_class"] == "follow_up"
    assert "follow-up about: Compare About Love and BEYOND BOUNDS" in result["enriched_query"]

    # Clean up module-level caches so other tests start from a fresh state.
    pipeline._last_query_context.pop(session_key, None)
    pipeline._last_chunks.pop(session_key, None)
|
| 1576 |
+
|
| 1577 |
+
|
| 1578 |
+
def test_record_answer_feedback_persists_feedback_and_promotes(monkeypatch):
    """Negative feedback is persisted, flagged for eval promotion, and graphed."""
    fake_service = FakeServiceSupabase()
    # A matching trace must exist for the feedback to attach to.
    fake_service.trace_rows.append(
        {"trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111", "session_id": "sess-1", "question": "What is common?"}
    )
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_service
    )

    ok = pipeline.record_answer_feedback(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "helpful": False,
            "reason_code": "needs_improvement",
            "correction_text": "The two stories should not be merged.",
        },
        access_token=None,
    )

    assert ok is True
    # The feedback insert carries the eval-promotion flag...
    feedback_insert = next(item for item in fake_service.inserts if item[0] == "answer_feedback")
    assert feedback_insert[1]["promote_to_eval"] is True
    # ...and the knowledge graph receives node/edge upserts for the trace.
    assert any(item[0] == "graph_nodes" for item in fake_service.upserts)
    assert any(item[0] == "graph_edges" for item in fake_service.upserts)
|
| 1602 |
+
|
| 1603 |
+
|
| 1604 |
+
def test_load_feedback_dataset_candidates_promotes_feedback_traces(monkeypatch):
    """A promoted negative-feedback row joined with its trace yields one eval candidate."""
    fake_service = FakeServiceSupabase()
    # Feedback row flagged promote_to_eval=True for the target user.
    fake_service.feedback_rows.append(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "helpful": False,
            "accepted": False,
            "reason_code": "unsupported_commonality",
            "correction_text": "Insufficient evidence for commonality.",
            "promote_to_eval": True,
            "user_id": "user-1",
        }
    )
    # Matching trace supplying the question and diagnostics for the eval row.
    fake_service.trace_rows.append(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "question": "What is common between these two documents?",
            "doc_diagnostics": [{"source": "BEYOND BOUNDS", "reason": "low_scoped_confidence"}],
            "failure_modes": ["unsupported_commonality"],
            "answer_preview": "The documents both explore emotion.",
        }
    )
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_service
    )

    rows = run_eval.load_feedback_dataset_candidates(None, "user-1", limit=10)

    assert len(rows) == 1
    assert rows[0]["trace_id"] == "8f8c1f3f-bcb6-43a8-b10d-85f31a917111"
    # The user's correction text becomes the gold evidence for the eval row.
    assert rows[0]["gold_evidence_text"] == "Insufficient evidence for commonality."
|
| 1635 |
+
|
| 1636 |
+
|
| 1637 |
+
def test_router_weights_trigger_summary_branch_filters(monkeypatch):
    """When the router weights the summary expert, retrieval issues a summary-filtered RPC."""

    class TrackingRpc:
        """Fake RPC that records the filter used and serves node-type-specific rows."""

        def __init__(self, supabase):
            self.supabase = supabase

        def execute(self):
            # Record every filter so the test can assert a summary-scoped call happened.
            self.supabase.rpc_filters.append(self.supabase.params["filter"])
            node_type = self.supabase.params["filter"].get("node_type")
            if node_type == "summary":
                return SimpleNamespace(
                    data=[
                        {
                            "id": "sum-1",
                            "content": "Synthetic summary content",
                            "metadata": {
                                "file_hash": "A",
                                "source": "About Love Anton Chekhov",
                                "chunk_index": "1-4",
                                "document_type": "short_story",
                                "node_type": "summary",
                                "node_level": 1,
                            },
                        }
                    ]
                )
            return SimpleNamespace(
                data=[
                    {
                        "id": "leaf-1",
                        "content": "Leaf content",
                        "metadata": {
                            "file_hash": "A",
                            "source": "About Love Anton Chekhov",
                            "chunk_index": 1,
                            "document_type": "short_story",
                            "node_type": "leaf",
                        },
                    }
                ]
            )

    class TrackingSupabase:
        """Fake client that captures RPC params and serves the files table fake."""

        def __init__(self):
            self.rpc_filters = []
            self.params = {}

        def table(self, _name: str):
            return FakeRetrieveTable(self, "ingested_files")

        def rpc(self, _name: str, params):
            self.params = params
            return TrackingRpc(self)

    tracking = TrackingSupabase()
    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: tracking
    )
    monkeypatch.setattr(pipeline, "get_cached_embedding", lambda _query: [0.1, 0.2])
    # Router favors the RAPTOR summary expert, which should trigger the
    # summary-filtered retrieval branch.
    monkeypatch.setattr(pipeline, "_route_query_experts", lambda *args, **kwargs: {
        "expert_weights": {
            "dense_chunk": 0.3,
            "raptor_summary": 0.4,
            "graph_traversal": 0.1,
            "episodic_memory": 0.1,
            "hybrid_compare": 0.1,
        },
        "selected_experts": ["dense_chunk", "raptor_summary"],
        "confidence": 0.4,
    })
    monkeypatch.setattr(pipeline.cohere, "Client", FakeCohereClient)
    monkeypatch.setattr(pipeline, "_log_rerank_feedback", lambda *args, **kwargs: None)

    docs = pipeline.retrieve_chunks(
        query="tell me more",
        access_token="token",
        original_query="tell me more",
    )

    assert docs
    # At least one RPC was scoped to node_type == "summary".
    assert any(f.get("node_type") == "summary" for f in tracking.rpc_filters)
|
| 1717 |
+
|
| 1718 |
+
|
| 1719 |
+
def test_thin_doc_overview_prefers_synthetic_root_summary():
    """For a thin document, the synthetic root summary is ordered first."""
    base_meta = {"file_hash": "B", "source": "BEYOND BOUNDS"}
    leaf = Document(
        page_content="Leaf content",
        metadata={**base_meta, "node_type": "leaf", "relevance_score": 0.9},
    )
    root = Document(
        page_content="Synthetic root summary",
        metadata={
            **base_meta,
            "node_type": "summary",
            "synthetic_root_summary": True,
            "relevance_score": 0.4,
        },
    )

    ordered, buckets, policy = pipeline._materialize_evidence_buckets(
        [leaf, root],
        query="summarise the story",
        route_mode="single",
        doc_title_map={"B": "BEYOND BOUNDS"},
    )

    # Despite its lower relevance score, the synthetic root summary leads for
    # an overview-style query on a thin document.
    assert ordered[0].metadata["synthetic_root_summary"] is True
    assert buckets[0]["thin_doc"] is True
    assert policy["summary_like"] is True
|
| 1750 |
+
|
| 1751 |
+
|
| 1752 |
+
def test_graph_candidates_return_two_hop_related_chunks(monkeypatch):
    """Entity -> summary -> document traversal surfaces graph-branch candidates."""
    fake_graph = FakeGraphServiceSupabase()
    # Graph layout: an entity node mentions a summary node, which is part of a
    # document node — a two-hop path from the query entity to the document.
    fake_graph.graph_nodes = [
        {"user_id": "user-1", "node_key": "entity:alehin", "node_type": "entity", "label": "Alehin", "payload": {"file_hash": "A"}},
        {"user_id": "user-1", "node_key": "summary:root-a", "node_type": "summary", "label": "About Love Anton Chekhov :: 1-4", "payload": {"file_hash": "A", "chunk_index": "1-4"}},
        {"user_id": "user-1", "node_key": "document:a", "node_type": "document", "label": "About Love Anton Chekhov", "payload": {"file_hash": "A"}},
    ]
    fake_graph.graph_edges = [
        {"user_id": "user-1", "source_node_key": "entity:alehin", "target_node_key": "summary:root-a", "edge_type": "mentions", "weight": 1.0, "payload": {}},
        {"user_id": "user-1", "source_node_key": "summary:root-a", "target_node_key": "document:a", "edge_type": "part_of", "weight": 1.0, "payload": {}},
    ]
    # Vector-store row the graph hit should resolve to (same file_hash/chunk_index).
    vector_rows = [
        {
            "id": "sum-1",
            "user_id": "user-1",
            "content": "Alehin appears in About Love.",
            "metadata": {
                "file_hash": "A",
                "source": "About Love Anton Chekhov",
                "node_type": "summary",
                "chunk_index": "1-4",
            },
        }
    ]

    monkeypatch.setattr(
        pipeline, "_build_service_supabase_client", lambda *_args, **_kwargs: fake_graph
    )
    monkeypatch.setattr(
        pipeline, "_build_supabase_client", lambda *_args, **_kwargs: FakeGraphVectorSupabase(vector_rows)
    )

    rows = pipeline._retrieve_graph_candidates(
        "which one talks about Alehin",
        route_mode="explicit_compare",
        access_token="token",
        user_id="user-1",
        priority_file_hashes=["A"],
    )

    # Exactly one candidate comes back, labeled as a graph-traversal hit with
    # a recorded traversal depth.
    assert len(rows) == 1
    assert rows[0]["metadata"]["retrieval_branch"] == "graph_traversal"
    assert rows[0]["metadata"]["graph_hit_depth"] >= 0
|
| 1795 |
+
|
| 1796 |
+
|
| 1797 |
+
def test_admin_promote_feedback_creates_eval_dataset(monkeypatch):
    """Admin promotion copies a flagged feedback/trace pair into the eval dataset."""
    fake_service = FakeServiceSupabase()
    # Pending trace that the feedback row references.
    fake_service.trace_rows.append(
        {
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "question": "What is common between these two documents?",
            "doc_diagnostics": [{"source": "BEYOND BOUNDS", "reason": "insufficient_coverage"}],
            "failure_modes": ["unsupported_commonality"],
            "answer_preview": "The documents both explore emotion.",
            "review_state": "pending",
        }
    )
    # Pending feedback row (id=7) flagged for eval promotion.
    fake_service.feedback_rows.append(
        {
            "id": 7,
            "trace_id": "8f8c1f3f-bcb6-43a8-b10d-85f31a917111",
            "helpful": False,
            "accepted": False,
            "reason_code": "unsupported_commonality",
            "correction_text": "Insufficient evidence for commonality.",
            "promote_to_eval": True,
            "review_state": "pending",
        }
    )

    monkeypatch.setattr(admin, "_admin_client", lambda: fake_service)
    # The endpoint authenticates against this env-provided admin key.
    monkeypatch.setenv("MASTER_ADMIN_KEY", "secret")

    result = admin.promote_feedback_to_eval(7, x_admin_key="secret")

    assert result["ok"] is True
    # One eval dataset row is created for the promoted trace...
    assert len(fake_service.eval_rows) == 1
    assert fake_service.eval_rows[0]["trace_id"] == "8f8c1f3f-bcb6-43a8-b10d-85f31a917111"
    # ...and both the trace and the feedback row transition to "promoted".
    assert fake_service.trace_rows[0]["review_state"] == "promoted"
    assert fake_service.feedback_rows[0]["review_state"] == "promoted"
|
tests/test_routing_stress_matrix.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from types import SimpleNamespace
|
| 2 |
+
|
| 3 |
+
from backend.core import pipeline
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class FakeFilesTable:
    """Chainable stand-in for a Supabase table query over ingested files."""

    def __init__(self, rows):
        self.rows = rows
        # Records eq() filters so tests can inspect what was requested.
        self.filters = {}

    def select(self, *_args):
        # Column selection is irrelevant to the fake; just keep the chain going.
        return self

    def eq(self, key, value):
        self.filters.update({key: value})
        return self

    def execute(self):
        # Hand back a shallow copy so callers cannot mutate the fixture rows.
        return SimpleNamespace(data=[*self.rows])
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FakeRpc:
    """Stand-in for a Supabase RPC handle that records its call params."""

    def __init__(self, supabase, params):
        self.supabase = supabase
        self.params = params

    def execute(self):
        # Track the invocation on the parent fake for later assertions.
        parent = self.supabase
        parent.rpc_calls.append(self.params)
        # Always return a row so ambiguity code can compute file_scores
        return SimpleNamespace(data=[{"combined_score": 0.2}])
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class FakeSupabase:
    """Minimal Supabase client fake wiring the table and RPC fakes together."""

    def __init__(self, rows):
        self.rows = rows
        # Every RPC invocation records its params here via FakeRpc.execute().
        self.rpc_calls = []

    def table(self, _name: str):
        # Whatever table is requested, serve the configured ingested-file rows.
        return FakeFilesTable(self.rows)

    def rpc(self, _name: str, params):
        return FakeRpc(self, params)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_stress_matrix_identity_queries_never_guess_in_multi_doc_all_scope(monkeypatch):
    """
    Invariant: if multiple docs exist and the user hasn't pinned a doc (category=All),
    identity/page-scoped queries must force clarification instead of falling through.
    """
    rows = [
        {"file_hash": file_hash, "filename": filename}
        for file_hash, filename in (
            ("A", "Guide A.pdf"),
            ("B", "Guide B.pdf"),
            ("C", "Guide C.pdf"),
        )
    ]
    fake = FakeSupabase(rows=rows)
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    identity_like_queries = (
        "Whose guide is this?",
        "What is the exact full title of this guide?",
        "What exact wording on the cover shows this guide is personalized?",
        "Summarize only the first page, not the whole guide.",
        "Does this guide explicitly name a publisher on the opening pages? If not, say not stated.",
        "Publisher on the opening pages?",
        "Cover wording?",
        "Page 1 summary only.",
    )
    for q in identity_like_queries:
        res = pipeline.check_query_ambiguity(q, access_token=None, category="All")
        # Each query must be flagged ambiguous with no guessed top document.
        assert res["is_ambiguous"] is True, q
        assert res["top_file_hash"] is None, q

    # For identity/page-scoped safety, we should not do per-file scoring RPC calls.
    assert fake.rpc_calls == []
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_stress_matrix_generic_queries_may_use_scoring_and_include_p_user_id(monkeypatch):
    """Generic multi-doc queries may score per file; every RPC call carries p_user_id."""
    rows = [
        {"file_hash": file_hash, "filename": filename}
        for file_hash, filename in (("A", "Doc A.pdf"), ("B", "Doc B.pdf"))
    ]
    fake = FakeSupabase(rows=rows)
    monkeypatch.setattr(pipeline, "_build_supabase_client", lambda *_args, **_kwargs: fake)

    generic_queries = (
        "summarize the document",
        "give me an overview",
        "explain what this is about",
    )
    for q in generic_queries:
        res = pipeline.check_query_ambiguity(q, access_token=None, category="All")
        # Either verdict is acceptable for generic queries; the call must succeed.
        assert res["is_ambiguous"] in {True, False}

    assert fake.rpc_calls, "Expected scoring calls for generic multi-doc queries"
    assert all("p_user_id" in call for call in fake.rpc_calls)
|
| 98 |
+
|