morpheus-rag / recent_changes.txt
nothex
Harden ingestion and retrieval reliability across the pipeline
4abd98f
diff --git a/backend/api/admin.py b/backend/api/admin.py
index 7a9fe1e..cf0f7eb 100644
--- a/backend/api/admin.py
+++ b/backend/api/admin.py
@@ -1,11 +1,16 @@
"""backend/api/admin.py — Admin endpoints, protected by X-Admin-Key header."""
import os, hmac, logging # noqa: E401
+from datetime import datetime, timedelta, timezone
+from collections import Counter
+from typing import Optional
+
from fastapi import APIRouter, HTTPException, Header, Depends
+from pydantic import BaseModel
+
from backend.core.auth_utils import require_auth_token
from backend.core.warmup_classifier import warmup, warmup_cross_encoder
-from datetime import datetime, timedelta, timezone
-from collections import Counter
+from backend.core.pipeline import _build_service_supabase_client
log = logging.getLogger("morpheus.api.admin")
router = APIRouter()
@@ -19,6 +24,78 @@ def _check_admin(key: str):
raise HTTPException(status_code=403, detail="Invalid admin key.")
+class ReviewPayload(BaseModel):
+ review_state: str = "reviewed"
+ review_notes: Optional[str] = None
+
+
+def _admin_client():
+ return _build_service_supabase_client()
+
+
+def _trace_sort_key(row: dict):
+ return row.get("created_at") or ""
+
+
+def _feedback_sort_key(row: dict):
+ return row.get("created_at") or ""
+
+
+def _load_recent_traces(*, limit: int = 100) -> list[dict]:
+ rows = (
+ _admin_client()
+ .table("query_traces")
+ .select(
+ "trace_id, question, route_mode, selected_experts, expert_weights, "
+ "document_types, doc_diagnostics, failure_modes, quality_metrics, "
+ "answer_preview, latency_ms, review_state, review_notes, reviewed_at, "
+ "reviewed_by, promoted_to_eval, created_at"
+ )
+ .limit(limit)
+ .execute()
+ .data
+ or []
+ )
+ return sorted(rows, key=_trace_sort_key, reverse=True)
+
+
+def _load_recent_feedback(*, limit: int = 100) -> list[dict]:
+ rows = (
+ _admin_client()
+ .table("answer_feedback")
+ .select(
+ "id, trace_id, helpful, accepted, reason_code, correction_text, "
+ "promote_to_eval, review_state, review_notes, reviewed_at, reviewed_by, "
+ "promoted_at, created_at, user_id"
+ )
+ .limit(limit)
+ .execute()
+ .data
+ or []
+ )
+ return sorted(rows, key=_feedback_sort_key, reverse=True)
+
+
+def _build_eval_dataset_row(trace_row: dict, feedback_row: dict) -> dict:
+ correction_text = (feedback_row.get("correction_text") or "").strip()
+ answer_preview = (trace_row.get("answer_preview") or "").strip()
+ return {
+ "trace_id": trace_row.get("trace_id"),
+ "source": "feedback_trace",
+ "question": trace_row.get("question"),
+ "gold_context_refs": [],
+ "gold_evidence_text": correction_text or answer_preview,
+ "is_answerable": bool(
+ feedback_row.get("accepted")
+ or feedback_row.get("helpful")
+ ),
+ "failure_modes": trace_row.get("failure_modes") or [],
+ "doc_diagnostics": trace_row.get("doc_diagnostics") or [],
+ "reason_code": feedback_row.get("reason_code"),
+ "is_active": False,
+ }
+
+
@router.post("/warmup")
def run_warmup(x_admin_key: str = Header(..., alias="X-Admin-Key")):
_check_admin(x_admin_key)
@@ -105,4 +182,210 @@ def get_corpus_health(
"recommendation": "Prompt user to upload documents regarding content gaps."
if missing_topics
else "Corpus coverage is sufficient.",
- }
\ No newline at end of file
+ }
+
+
+@router.get("/traces")
+def list_query_traces(
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+ limit: int = 50,
+ route_mode: Optional[str] = None,
+ failure_mode: Optional[str] = None,
+ category: Optional[str] = None,
+ hours: int = 168,
+ review_state: Optional[str] = None,
+):
+ _check_admin(x_admin_key)
+ traces = _load_recent_traces(limit=max(limit * 3, 100))
+ cutoff = datetime.now(timezone.utc) - timedelta(hours=max(1, hours))
+ filtered = []
+ for row in traces:
+ created_raw = row.get("created_at")
+ created_at = None
+ if isinstance(created_raw, str):
+ try:
+ created_at = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
+ except Exception:
+ created_at = None
+ if created_at and created_at < cutoff:
+ continue
+ if route_mode and row.get("route_mode") != route_mode:
+ continue
+ if failure_mode and failure_mode not in (row.get("failure_modes") or []):
+ continue
+ if review_state and row.get("review_state") != review_state:
+ continue
+ if category and category not in (row.get("document_types") or []):
+ continue
+ filtered.append(row)
+ return {"items": filtered[:limit]}
+
+
+@router.get("/traces/{trace_id}")
+def get_query_trace(
+ trace_id: str,
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+):
+ _check_admin(x_admin_key)
+ sb = _admin_client()
+ trace_rows = (
+ sb.table("query_traces")
+ .select("*")
+ .eq("trace_id", trace_id)
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ if not trace_rows:
+ raise HTTPException(status_code=404, detail="Trace not found.")
+ feedback_rows = (
+ sb.table("answer_feedback")
+ .select("*")
+ .eq("trace_id", trace_id)
+ .execute()
+ .data
+ or []
+ )
+ return {"trace": trace_rows[0], "feedback": sorted(feedback_rows, key=_feedback_sort_key, reverse=True)}
+
+
+@router.post("/traces/{trace_id}/review")
+def review_query_trace(
+ trace_id: str,
+ payload: ReviewPayload,
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+):
+ _check_admin(x_admin_key)
+ now_iso = datetime.now(timezone.utc).isoformat()
+ _admin_client().table("query_traces").update(
+ {
+ "review_state": payload.review_state,
+ "review_notes": payload.review_notes,
+ "reviewed_at": now_iso,
+ "reviewed_by": "admin",
+ }
+ ).eq("trace_id", trace_id).execute()
+ return {"ok": True}
+
+
+@router.get("/feedback")
+def list_feedback(
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+ limit: int = 50,
+ review_state: Optional[str] = None,
+ promote_only: bool = False,
+):
+ _check_admin(x_admin_key)
+ rows = _load_recent_feedback(limit=max(limit * 3, 100))
+ filtered = []
+ for row in rows:
+ if review_state and row.get("review_state") != review_state:
+ continue
+ if promote_only and not row.get("promote_to_eval"):
+ continue
+ filtered.append(row)
+ return {"items": filtered[:limit]}
+
+
+@router.get("/feedback/{feedback_id}")
+def get_feedback_detail(
+ feedback_id: int,
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+):
+ _check_admin(x_admin_key)
+ sb = _admin_client()
+ rows = (
+ sb.table("answer_feedback")
+ .select("*")
+ .eq("id", feedback_id)
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ if not rows:
+ raise HTTPException(status_code=404, detail="Feedback not found.")
+ feedback = rows[0]
+ trace_rows = (
+ sb.table("query_traces")
+ .select("*")
+ .eq("trace_id", feedback.get("trace_id"))
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ return {"feedback": feedback, "trace": trace_rows[0] if trace_rows else None}
+
+
+@router.post("/feedback/{feedback_id}/review")
+def review_feedback(
+ feedback_id: int,
+ payload: ReviewPayload,
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+):
+ _check_admin(x_admin_key)
+ now_iso = datetime.now(timezone.utc).isoformat()
+ _admin_client().table("answer_feedback").update(
+ {
+ "review_state": payload.review_state,
+ "review_notes": payload.review_notes,
+ "reviewed_at": now_iso,
+ "reviewed_by": "admin",
+ }
+ ).eq("id", feedback_id).execute()
+ return {"ok": True}
+
+
+@router.post("/feedback/{feedback_id}/promote")
+def promote_feedback_to_eval(
+ feedback_id: int,
+ x_admin_key: str = Header(..., alias="X-Admin-Key"),
+):
+ _check_admin(x_admin_key)
+ sb = _admin_client()
+ feedback_rows = (
+ sb.table("answer_feedback")
+ .select("*")
+ .eq("id", feedback_id)
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ if not feedback_rows:
+ raise HTTPException(status_code=404, detail="Feedback not found.")
+ feedback = feedback_rows[0]
+ trace_rows = (
+ sb.table("query_traces")
+ .select("*")
+ .eq("trace_id", feedback.get("trace_id"))
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ if not trace_rows:
+ raise HTTPException(status_code=404, detail="Trace not found.")
+ trace = trace_rows[0]
+ row = _build_eval_dataset_row(trace, feedback)
+ sb.table("evaluation_datasets").upsert(row, on_conflict="trace_id").execute()
+ now_iso = datetime.now(timezone.utc).isoformat()
+ sb.table("answer_feedback").update(
+ {
+ "review_state": "promoted",
+ "promoted_at": now_iso,
+ "reviewed_at": now_iso,
+ "reviewed_by": "admin",
+ }
+ ).eq("id", feedback_id).execute()
+ sb.table("query_traces").update(
+ {
+ "review_state": "promoted",
+ "promoted_to_eval": True,
+ "reviewed_at": now_iso,
+ "reviewed_by": "admin",
+ }
+ ).eq("trace_id", trace.get("trace_id")).execute()
+ return {"ok": True}
diff --git a/backend/api/auth.py b/backend/api/auth.py
index c35849e..5b0c6fb 100644
--- a/backend/api/auth.py
+++ b/backend/api/auth.py
@@ -7,10 +7,11 @@ declare `auth: AuthContext = Depends(require_auth)` — see the pattern
at the bottom of this file and replicate it in each router.
"""
-from fastapi import APIRouter, Depends
+from fastapi import APIRouter, Depends, Header, HTTPException
-from backend.core.auth_utils import require_auth_token
-from backend.services.auth import get_daily_password, verify_admin_key, verify_password
+from backend.core.auth_utils import is_guest_token, require_auth_token
+from backend.core.pipeline import _build_service_supabase_client
+from backend.services.auth import verify_admin_key, verify_password
from shared.types import AuthRequest, AuthResponse
router = APIRouter()
@@ -31,7 +32,7 @@ def verify(req: AuthRequest):
@router.post("/admin", response_model=AuthResponse)
def admin_verify(req: AuthRequest):
if verify_admin_key(req.password):
- return AuthResponse(valid=True, token=get_daily_password(), message="Admin verified.")
+ return AuthResponse(valid=True, message="Admin verified.")
return AuthResponse(valid=False, message="Invalid admin key.")
@@ -40,3 +41,73 @@ def admin_verify(req: AuthRequest):
async def get_me(user_id: str = Depends(require_auth_token)):
return {"user_id": user_id, "authenticated": True}
+
+@router.delete("/guest-workspace")
+async def clear_guest_workspace(
+ user_id: str = Depends(require_auth_token),
+ x_auth_token: str = Header(None, alias="X-Auth-Token"),
+):
+ if not is_guest_token(x_auth_token):
+ raise HTTPException(status_code=403, detail="Guest workspace cleanup is only for guest sessions.")
+
+ sb = _build_service_supabase_client()
+
+ # Preserve anonymized adaptive signals while removing the guest's actual workspace.
+ try:
+ sb.table("query_traces").update(
+ {
+ "user_id": None,
+ "session_id": "guest_archived",
+ "question": "[guest session removed]",
+ "pinned_file_hashes": [],
+ "selected_chunk_ids": [],
+ "doc_diagnostics": [],
+ "answer_preview": None,
+ "document_types": [],
+ }
+ ).eq("user_id", user_id).execute()
+ except Exception:
+ pass
+
+ try:
+ sb.table("answer_feedback").update(
+ {
+ "user_id": None,
+ "correction_text": None,
+ }
+ ).eq("user_id", user_id).execute()
+ except Exception:
+ pass
+
+ try:
+ sb.table("evaluation_logs").update(
+ {
+ "user_id": None,
+ "question": "[guest session removed]",
+ }
+ ).eq("user_id", user_id).execute()
+ except Exception:
+ pass
+
+ def _purge(table_name: str) -> None:
+ try:
+ sb.table(table_name).delete().eq("user_id", user_id).execute()
+ except Exception:
+ # Optional/older tables should not break guest cleanup.
+ pass
+
+ # Delete child/content tables first, then registry-ish tables.
+ for table_name in (
+ "documents",
+ "document_trees",
+ "chat_memory",
+ "ingestion_retry_logs",
+ "rerank_feedback",
+ "intent_feedback",
+ "graph_edges",
+ "graph_nodes",
+ "ingested_files",
+ ):
+ _purge(table_name)
+
+ return {"ok": True, "message": "Guest workspace cleared."}
diff --git a/backend/api/frontend_config.py b/backend/api/frontend_config.py
index bfb1cad..c2cab10 100644
--- a/backend/api/frontend_config.py
+++ b/backend/api/frontend_config.py
@@ -1,4 +1,4 @@
-from fastapi import APIRouter
+from fastapi import APIRouter, HTTPException
from backend.core import config
router = APIRouter()
@@ -9,7 +9,13 @@ def get_frontend_config():
Returns public config values the frontend needs.
Only exposes the anon key (safe by design) — never the service key.
"""
+ if not config.SUPABASE_URL or not config.SUPABASE_ANON_KEY:
+ raise HTTPException(
+ status_code=503,
+ detail="Supabase frontend config is missing on the server.",
+ )
return {
"supabase_url": config.SUPABASE_URL,
"supabase_anon": config.SUPABASE_ANON_KEY,
- }
\ No newline at end of file
+ "guest_enabled": config.GUEST_MODE_ENABLED,
+ }
diff --git a/backend/api/ingest.py b/backend/api/ingest.py
index 4e54251..59c9f5e 100644
--- a/backend/api/ingest.py
+++ b/backend/api/ingest.py
@@ -1,24 +1,31 @@
import os
import tempfile
import logging
-from fastapi import APIRouter, UploadFile, File, HTTPException, Header, Depends
-from backend.core.auth_utils import require_auth_token
+from fastapi import APIRouter, UploadFile, File, HTTPException, Header, Depends, Request
+from backend.core import config
+from backend.core.auth_utils import is_guest_token, require_auth_token
from backend.core.tasks import process_pdf_task
from backend.core.tasks import celery_app
+from backend.main import limiter
log = logging.getLogger("morpheus.api.ingest")
router = APIRouter()
@router.post("/upload")
+@limiter.limit("12/hour")
async def upload(
+ request: Request,
file: UploadFile = File(...),
user_id: str = Depends(require_auth_token),
x_auth_token: str = Header(None, alias="X-Auth-Token"),
):
+ del request
if not file.filename.lower().endswith(".pdf"):
raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+ guest_workspace = is_guest_token(x_auth_token)
+
# NEW: Secure file signature validation using python-magic
import magic
@@ -33,6 +40,8 @@ async def upload(
)
# ── Per-user document limit ───────────────────────────────────────────────
+ doc_limit = config.GUEST_MAX_DOCS if guest_workspace else config.MAX_DOCS_PER_USER
+
try:
from backend.core.pipeline import _build_supabase_client
@@ -43,9 +52,10 @@ async def upload(
.eq("user_id", user_id)
.execute()
)
- if (result.count or 0) >= 50:
+ if (result.count or 0) >= doc_limit:
raise HTTPException(
- status_code=429, detail="Document limit reached (50 max)."
+ status_code=429,
+ detail=f"Document limit reached ({doc_limit} max).",
)
except HTTPException:
raise
@@ -57,6 +67,12 @@ async def upload(
os.close(tmp_fd) # close fd immediately, manage file separately
try:
contents = await file.read()
+ max_upload_mb = config.GUEST_MAX_UPLOAD_MB if guest_workspace else config.MAX_UPLOAD_MB
+ if len(contents) > max_upload_mb * 1024 * 1024:
+ raise HTTPException(
+ status_code=413,
+ detail=f"File too large ({max_upload_mb} MB max).",
+ )
with open(tmp_path, "wb") as f:
f.write(contents)
task = process_pdf_task.delay(tmp_path, file.filename, x_auth_token)
diff --git a/backend/api/query.py b/backend/api/query.py
index 402453f..52accfe 100644
--- a/backend/api/query.py
+++ b/backend/api/query.py
@@ -2,13 +2,15 @@
import json
import logging
import asyncio
-from fastapi import APIRouter, Header, Depends, Request
+from fastapi import APIRouter, Header, Depends, Request, HTTPException
from fastapi.responses import StreamingResponse
-from shared.types import QueryRequest, SourceChunk
+from shared.types import AnswerFeedback, QueryRequest, SourceChunk
from backend.core.pipeline import (
retrieve_chunks_routed,
generate_answer_stream,
analyse_intent,
+ check_query_ambiguity,
+ record_answer_feedback,
)
from backend.core.auth_utils import require_auth_token
from backend.main import limiter
@@ -91,14 +93,51 @@ async def query(
user_id = user_id,
)
+ if intent.get("route_class") == "no_retrieval":
+ yield "data: " + json.dumps({
+ "type": "token",
+ "content": "Ask me about your uploaded documents or a topic inside them, and I’ll dig in.",
+ }) + "\n\n"
+ yield "data: " + json.dumps({
+ "type": "done",
+ "sources": [],
+ "images": [],
+ "trace_id": None,
+ "doc_diagnostics": [],
+ }) + "\n\n"
+ return
+
if not intent.get("is_clear"):
# Stream clarification question as a normal assistant message
# User answers it → next turn history resolves the subject
question = intent.get("clarification_question", "Could you clarify?")
yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
- yield "data: " + json.dumps({"type": "done", "sources": [], "images": []}) + "\n\n"
+ yield "data: " + json.dumps({"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}) + "\n\n"
return
+ # ── Step 1.5: Phase 2 Ambiguity Detection ────────────────────────
+ # If no manual pin is active, check if the query is too ambiguous
+ if not req.priority_file_hashes:
+ ambiguity_res = check_query_ambiguity(
+ req.query,
+ access_token=x_auth_token,
+ category=req.category,
+ )
+ if ambiguity_res.get("is_ambiguous"):
+ question = ambiguity_res.get("clarification_question", "Which document do you mean?")
+ # Use a distinct identifier so the frontend understands it's a structural prompt
+ yield "data: " + json.dumps({"type": "token", "content": question}) + "\n\n"
+
+ options = ambiguity_res.get("clarification_options")
+ if options:
+ yield "data: " + json.dumps({"type": "clarification_options", "options": options}) + "\n\n"
+
+ yield "data: " + json.dumps({"type": "done", "sources": [], "images": [], "trace_id": None, "doc_diagnostics": []}) + "\n\n"
+ return
+ # Optional: if Phase 3 were ready, we could apply the soft auto-pin here:
+ # if ambiguity_res.get("top_file_hash"):
+ # req.priority_file_hashes = [ambiguity_res["top_file_hash"]]
+
# ── Step 2: Retrieve using enriched query ─────────────────────────
# enriched_query has better embedding signal (category/history injected)
# but we answer with the ORIGINAL query so the response sounds natural
@@ -117,12 +156,15 @@ async def query(
user_id=user_id,
original_query=req.query,
eval_mode=(x_eval_mode == "true"),
+ priority_file_hashes=req.priority_file_hashes or None,
),
)
# ── Step 3: Stream answer tokens ──────────────────────────────────
images = []
done_sources = []
+ trace_id = None
+ doc_diagnostics = []
# 🚀 Define the boolean once for readability
is_eval = x_eval_mode == "true"
async for event in generate_answer_stream(
@@ -133,12 +175,15 @@ async def query(
access_token=x_auth_token,
category=category,
eval_mode=is_eval,
+ priority_file_hashes=req.priority_file_hashes or None,
):
if event["type"] == "token":
yield "data: " + json.dumps({"type": "token", "content": event["content"]}) + "\n\n"
elif event["type"] == "done":
images = event.get("images", [])
done_sources = event.get("sources", []) or []
+ trace_id = event.get("trace_id")
+ doc_diagnostics = event.get("doc_diagnostics", []) or []
# ── Step 4: Emit sources + images ─────────────────────────────────
sources = done_sources or _build_sources_from_chunks(
@@ -149,6 +194,8 @@ async def query(
"type": "done",
"sources": sources,
"images": images,
+ "trace_id": trace_id,
+ "doc_diagnostics": doc_diagnostics,
}) + "\n\n"
except Exception as e:
@@ -178,3 +225,16 @@ async def query(
"Access-Control-Allow-Origin": "*",
}
)
+
+
+@router.post("/feedback")
+async def submit_feedback(
+ payload: AnswerFeedback,
+ user_id: str = Depends(require_auth_token),
+ x_auth_token: str = Header(None, alias="X-Auth-Token"),
+):
+ del user_id
+ ok = record_answer_feedback(payload.dict(), access_token=x_auth_token)
+ if not ok:
+ raise HTTPException(status_code=500, detail="Could not record answer feedback.")
+ return {"ok": True}
diff --git a/backend/core/auth_utils.py b/backend/core/auth_utils.py
index 14b395c..7cf2b5b 100644
--- a/backend/core/auth_utils.py
+++ b/backend/core/auth_utils.py
@@ -12,7 +12,7 @@ TASK 1 — Auth Bridge:
import jwt
import logging
-from typing import Optional
+from typing import Any, Optional
from backend.core import config
from fastapi import Header, HTTPException, status
@@ -22,6 +22,45 @@ from fastapi import Header, HTTPException, status
log = logging.getLogger("morpheus.auth")
+def _decode_unverified_claims(access_token: Optional[str]) -> dict[str, Any]:
+ """Peek at JWT claims without verifying the signature for non-security decisions."""
+ if not access_token:
+ return {}
+ try:
+ claims = jwt.decode(
+ access_token,
+ options={
+ "verify_signature": False,
+ "verify_exp": False,
+ "verify_aud": False,
+ },
+ algorithms=["ES256", "HS256", "RS256"],
+ )
+ return claims if isinstance(claims, dict) else {}
+ except Exception:
+ return {}
+
+
+def is_guest_token(access_token: Optional[str]) -> bool:
+ """
+ Supabase anonymous users still get real JWTs.
+ We treat them as guest workspaces for UI/limits/rate-limiting.
+ """
+ claims = _decode_unverified_claims(access_token)
+ if not claims:
+ return False
+
+ app_meta = claims.get("app_metadata") or {}
+ provider = str(app_meta.get("provider") or "").strip().lower()
+ providers = app_meta.get("providers") or []
+ return bool(
+ claims.get("is_anonymous")
+ or app_meta.get("is_anonymous")
+ or provider == "anonymous"
+ or "anonymous" in providers
+ )
+
+
def extract_jwt_sub(access_token: str) -> str:
"""
Extract the Supabase user id (JWT `sub`) while strictly verifying the signature.
diff --git a/backend/core/classifier.py b/backend/core/classifier.py
index af304ab..594878a 100644
--- a/backend/core/classifier.py
+++ b/backend/core/classifier.py
@@ -167,8 +167,9 @@ class CentroidStore:
self._access_token = access_token
self._user_id = None
if access_token:
- from backend.core.auth_utils import extract_jwt_sub
- self._user_id = extract_jwt_sub(access_token)
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ self._user_id = safe_extract_jwt_sub(access_token)
self._cache: Dict[str, Dict] = {}
self._lock = threading.Lock()
self._client = None
@@ -176,23 +177,17 @@ class CentroidStore:
def _get_client(self):
if self._client is None:
- # Tenant-scoped client (anon + access token) is required for RLS isolation.
- if self._access_token:
- if not config.SUPABASE_ANON_KEY:
- raise RuntimeError("SUPABASE_ANON_KEY is not set but access_token was provided.")
- self._client = create_client(
- config.SUPABASE_URL,
- config.SUPABASE_ANON_KEY,
- )
- self._client.postgrest.auth(self._access_token)
- else:
- # Admin / legacy fallback (bypasses RLS via service role).
- self._client = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
+ # Backend-owned access model: always use the service-role client and
+ # scope rows explicitly by user_id where applicable.
+ self._client = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
return self._client
def _load_from_db(self):
try:
- result = self._get_client().table(self.TABLE).select("*").execute()
+ query = self._get_client().table(self.TABLE).select("*")
+ if self._user_id:
+ query = query.eq("user_id", self._user_id)
+ result = query.execute()
for row in (result.data or []):
self._cache[row["document_type"]] = {
"vector": np.array(row["centroid_vector"], dtype=np.float32),
diff --git a/backend/core/config.py b/backend/core/config.py
index f48f0a6..c1cbf39 100644
--- a/backend/core/config.py
+++ b/backend/core/config.py
@@ -19,6 +19,15 @@ SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")
SUPABASE_JWT_SECRET = os.getenv("SUPABASE_JWT_SECRET")
VECTOR_TABLE_NAME = "documents"
IMAGE_STORAGE_BUCKET = "rag-images"
+GUEST_MODE_ENABLED = os.getenv("GUEST_MODE_ENABLED", "true").lower() in {
+ "1",
+ "true",
+ "yes",
+}
+MAX_UPLOAD_MB = int(os.getenv("MAX_UPLOAD_MB", "25"))
+GUEST_MAX_UPLOAD_MB = int(os.getenv("GUEST_MAX_UPLOAD_MB", "10"))
+MAX_DOCS_PER_USER = int(os.getenv("MAX_DOCS_PER_USER", "50"))
+GUEST_MAX_DOCS = int(os.getenv("GUEST_MAX_DOCS", "10"))
# ==================== API KEYS ====================
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
@@ -37,9 +46,19 @@ OLLAMA_MODELS = ["llama3.2", "mistral"]
EMBEDDING_MODEL = "nvidia/llama-nemotron-embed-vl-1b-v2:free"
EMBEDDING_DIMENSIONS = 2048
EMBEDDING_DEVICE = "cuda"
+RETRIEVAL_EMBEDDING_VARIANT = os.getenv(
+ "RETRIEVAL_EMBEDDING_VARIANT", "control"
+).strip().lower()
+RETRIEVAL_EMBEDDING_MODEL_OVERRIDE = os.getenv(
+ "RETRIEVAL_EMBEDDING_MODEL_OVERRIDE", ""
+).strip()
EMBEDDING_MODELS = [
- "nvidia/llama-nemotron-embed-vl-1b-v2:free",
- "text-embedding-3-small", # OpenRouter fallback
+ model
+ for model in [
+ RETRIEVAL_EMBEDDING_MODEL_OVERRIDE or EMBEDDING_MODEL,
+ EMBEDDING_MODEL if RETRIEVAL_EMBEDDING_MODEL_OVERRIDE else "",
+ ]
+ if model
]
# ==================== GROQ MODELS ====================
@@ -127,6 +146,26 @@ RELEVANCE_THRESHOLD = 0.35
LLM_MAX_TOKENS = 4096
MAX_CONTEXT_CHARS = 14000
CATEGORY_SLOTS = 2
+ENABLE_STRICT_OUTPUT_SANITIZER = os.getenv(
+ "ENABLE_STRICT_OUTPUT_SANITIZER", "true"
+).lower() in {"1", "true", "yes"}
+ENABLE_DUPLICATE_CHUNK_COLLAPSE = os.getenv(
+ "ENABLE_DUPLICATE_CHUNK_COLLAPSE", "true"
+).lower() in {"1", "true", "yes"}
+ENABLE_HYDE = os.getenv("ENABLE_HYDE", "false").lower() in {"1", "true", "yes"}
+ENABLE_RETRIEVE_THEN_STUFF = os.getenv(
+ "ENABLE_RETRIEVE_THEN_STUFF", "true"
+).lower() in {"1", "true", "yes"}
+ENABLE_CONTEXTUAL_CHUNKING = os.getenv(
+ "ENABLE_CONTEXTUAL_CHUNKING", "false"
+).lower() in {"1", "true", "yes"}
+FOLLOWUP_SESSION_TTL_S = int(os.getenv("FOLLOWUP_SESSION_TTL_S", "1800"))
+HISTORY_RECENT_TURNS = int(os.getenv("HISTORY_RECENT_TURNS", "3"))
+HISTORY_IMPORTANT_MAX = int(os.getenv("HISTORY_IMPORTANT_MAX", "6"))
+RETRIEVE_THEN_STUFF_K = int(os.getenv("RETRIEVE_THEN_STUFF_K", "12"))
+RETRIEVE_THEN_STUFF_FETCH_K = int(
+ os.getenv("RETRIEVE_THEN_STUFF_FETCH_K", "20")
+)
# ==================== LOGGING ====================
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
diff --git a/backend/core/pipeline.py b/backend/core/pipeline.py
index b959ad2..68ddaf5 100644
--- a/backend/core/pipeline.py
+++ b/backend/core/pipeline.py
@@ -15,8 +15,9 @@ import time
import re
import threading
import asyncio
-from typing import List, Optional, Tuple
-from typing import AsyncGenerator
+from collections import defaultdict, deque
+from types import SimpleNamespace
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
@@ -98,6 +99,15 @@ class QueryVariants(BaseModel):
)
+class RouteDecision(BaseModel):
+ route_class: str = Field(default="factoid")
+ route_reason: str = Field(default="heuristic_default")
+ preserve_query: bool = Field(default=False)
+ disable_memory: bool = Field(default=False)
+ page_scope: Optional[str] = Field(default=None)
+ exact_field: Optional[str] = Field(default=None)
+
+
# =========================================================================== #
# SHARED BUILDER HELPERS #
# =========================================================================== #
@@ -166,6 +176,50 @@ def _build_supabase_client(access_token: str = None):
return _build_service_supabase_client()
+def _log_ingestion_retry_event(
+ *,
+ user_id: Optional[str],
+ batch_num: int,
+ total_batches: int,
+ attempt: int,
+ event_type: str,
+ message: str = "",
+ sleep_s: float = 0.0,
+ file_hash: Optional[str] = None,
+) -> None:
+ """
+ Best-effort telemetry for ingestion retry behavior.
+
+ `file_hash` is optional so older deployments that have not yet applied the
+ schema migration can still write logs using the legacy table shape.
+ """
+ base_payload = {
+ "user_id": user_id,
+ "batch_num": batch_num,
+ "total_batches": total_batches,
+ "attempt": attempt,
+ "event_type": event_type,
+ "message": (message or "")[:500],
+ "sleep_s": sleep_s,
+ }
+ payload = dict(base_payload)
+ if file_hash:
+ payload["file_hash"] = file_hash
+ try:
+ _build_service_supabase_client().table("ingestion_retry_logs").insert(
+ payload
+ ).execute()
+ except Exception:
+ if "file_hash" not in payload:
+ return
+ try:
+ _build_service_supabase_client().table("ingestion_retry_logs").insert(
+ base_payload
+ ).execute()
+ except Exception:
+ pass
+
+
def get_file_fingerprint(file_path: str) -> str:
"""SHA-256 hash — collision-resistant dedup key."""
hasher = hashlib.sha256()
@@ -187,8 +241,451 @@ _embed_lock = threading.Lock()
# Session-level chunk cache for follow-up detection
_last_chunks: dict = {}
# Last query + timestamp per session; guarded by _last_chunks_lock alongside _last_chunks.
_last_query_context: dict = {}
_last_chunks_lock = threading.Lock()

# Chat-template control tokens such as <|header_end|>.
_CONTROL_TOKEN_RE = re.compile(r"<\|[^>\n]{1,200}\|>")
# Role markers leaked from a chat template (e.g. "assistant<|header_end|>").
_ROLE_MARKER_RE = re.compile(
    r"(?im)(?:^|\n)\s*(?:assistant|user|system)\s*<\|header_end\|>\s*"
)
# Transcript labels at line start, e.g. "User: ".
_TRANSCRIPT_LABEL_RE = re.compile(r"(?im)^\s*(?:assistant|user|system)\s*:\s*")
# Known boilerplate lines the model emits when given an empty question.
_PROMPT_RESIDUE_RE = re.compile(
    r"(?im)^\s*(?:it seems like there'?s no actual question provided\.?|please go ahead and ask your question.*)$"
)
# A paragraph (>=40 chars) immediately repeated one or more times.
_DUPLICATE_PARAGRAPH_RE = re.compile(r"(?s)(?:^|\n\n)(.{40,}?)\n\n\1(?:\n\n\1)+")
# "Jane Doe — ..." style cover line; group 1 is the owner name.
_COVER_OWNER_RE = re.compile(
    r"^\s*([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,3})\s+[—\-]\s+", re.UNICODE
)
_AUTHOR_RE = re.compile(r"(?im)\b(?:author|written by)\b\s*[:\-]?\s*(.+)$")
_PUBLISHER_RE = re.compile(r"(?im)\b(?:publisher|published by)\b\s*[:\-]?\s*(.+)$")
_EDITION_RE = re.compile(r"(?im)\b([0-9]+(?:st|nd|rd|th)?\s+edition)\b")
# Bare 4-digit year, or "Month D, YYYY" style dates.
_DATE_RE = re.compile(
    r"\b(?:19|20)\d{2}\b|\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s+\d{1,2},?\s+(?:19|20)\d{2}\b",
    re.IGNORECASE,
)
_LINE_SPLIT_RE = re.compile(r"[\r\n]+")
# Phrases indicating the user wants an exact, literal fact.
_EXACT_FACT_TERMS = (
    "exact title",
    "full title",
    "exact full title",
    "whose guide",
    "exact name",
    "cover wording",
    "publisher",
    "edition",
    "opening pages",
    "first page",
    "cover",
    "chapter 1",
    "chapter one",
)
# Phrases that scope a query to specific opening pages.
_PAGE_SCOPED_TERMS = (
    "first page",
    "opening page",
    "opening pages",
    "cover",
    "cover wording",
    "page 1",
    "page one",
)
# identity field -> query phrases that select it (see _detect_identity_field).
_IDENTITY_FIELD_HINTS = {
    "title": ("title",),
    "owner": ("whose guide", "whose document", "exact name", "who is this guide for", "whom is this guide made for"),
    "publisher": ("publisher", "published by"),
    "author": ("author", "written by"),
    "edition": ("edition",),
    "date": ("date", "year", "when was this"),
    "cover_text": ("cover wording", "cover", "personalized"),
    "opening_page_summary": ("first page", "opening pages"),
}
# Explicit "page N" reference; group 1 is the page number.
_PAGE_RANGE_HINT_RE = re.compile(r"\bpage\s+(\d+)\b", re.IGNORECASE)
+
+
+def _stable_user_id(user_id: Optional[str], access_token: Optional[str]) -> Optional[str]:
+ if user_id:
+ return user_id
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ return safe_extract_jwt_sub(access_token)
+ return None
+
+
def _sanitize_generated_text(text: str) -> tuple[str, int]:
    """Scrub LLM output of chat-template residue and repeated paragraphs.

    Returns ``(cleaned_text, removed_char_count)``. All scrubbing is gated on
    config.ENABLE_STRICT_OUTPUT_SANITIZER.
    """
    if not text:
        return "", 0

    cleaned = text
    if config.ENABLE_STRICT_OUTPUT_SANITIZER:
        # Remove NULs and lone surrogates first so later regexes see clean input.
        cleaned = cleaned.replace("\x00", "").replace("\u0000", "")
        cleaned = re.sub(r"[\ud800-\udfff]", "", cleaned)
        # Order matters: role markers before bare control tokens, and the
        # literal replacements run from most to least specific.
        cleaned = _ROLE_MARKER_RE.sub("\n", cleaned)
        cleaned = _CONTROL_TOKEN_RE.sub("", cleaned)
        cleaned = _TRANSCRIPT_LABEL_RE.sub("", cleaned)
        cleaned = _PROMPT_RESIDUE_RE.sub("", cleaned)
        cleaned = cleaned.replace("ASSISTANT says", "")
        cleaned = cleaned.replace("USER says", "")
        cleaned = cleaned.replace("ASSISTANT", "")
        cleaned = cleaned.replace("USER", "")
        cleaned = cleaned.replace("SYSTEM", "")
        # Collapse runs of duplicated paragraphs until a fixed point is reached.
        while True:
            collapsed = _DUPLICATE_PARAGRAPH_RE.sub(r"\n\n\1", cleaned)
            if collapsed == cleaned:
                break
            cleaned = collapsed
        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
        cleaned = cleaned.strip()

    # How many characters the sanitizer removed (never negative).
    removed = max(0, len(text) - len(cleaned))
    return cleaned, removed
+
+
def _clean_identity_lines(text: str) -> List[str]:
    """Split text into sanitised, whitespace-collapsed, non-empty lines (max 4000 chars each)."""
    normalised = (
        re.sub(r"\s+", " ", _strip_invalid_text_chars(str(raw or ""))).strip()[:4000]
        for raw in _LINE_SPLIT_RE.split(text or "")
    )
    return [line for line in normalised if line]
+
+
+def _strip_invalid_text_chars(text: str) -> str:
+ cleaned = str(text or "").replace("\x00", "").replace("\u0000", "")
+ cleaned = re.sub(r"[\ud800-\udfff]", "", cleaned)
+ return re.sub(r"[\x01-\x08\x0b\x0c\x0e-\x1f\x7f]", " ", cleaned)
+
+
def _truncate_identity_text(text: str, limit: int = 1200) -> str:
    """Sanitise text, collapse whitespace runs, and cap at `limit` characters."""
    collapsed = re.sub(r"\s+", " ", _strip_invalid_text_chars(text))
    return collapsed.strip()[:limit]
+
+
+def _extract_identity_pages_text_from_elements(
+ elements: List[Any],
+ *,
+ max_pages: int = 3,
+) -> Tuple[str, List[int]]:
+ page_map: dict[int, List[str]] = defaultdict(list)
+ for el in elements or []:
+ page_num = getattr(getattr(el, "metadata", None), "page_number", None)
+ if page_num is None or int(page_num) > max_pages:
+ continue
+ text = str(getattr(el, "text", "") or "").strip()
+ if not text:
+ continue
+ page_map[int(page_num)].append(text)
+ pages = sorted(page_map.keys())
+ joined = "\n".join("\n".join(page_map[p]) for p in pages)
+ return joined, pages
+
+
def _extract_identity_pages_text_from_docs(
    docs: List[dict],
    *,
    max_pages: int = 3,
) -> Tuple[str, List[int]]:
    """Rebuild opening-page text from already-ingested chunk rows.

    Only chunks whose earliest page number is within `max_pages` contribute.
    Returns ``(joined_text, sorted_pages_seen)``.
    """
    ordered: list[tuple[int, str]] = []
    seen_pages: set[int] = set()
    for row in docs or []:
        meta = dict(row.get("metadata", {}) or {})
        pages = [
            int(p)
            for p in (meta.get("page_numbers") or [])
            if isinstance(p, int) or str(p).isdigit()
        ]
        if not pages:
            continue
        first_page = min(pages)
        if first_page > max_pages:
            continue
        original = meta.get("original_content")
        if isinstance(original, str):
            try:
                # original_content is sometimes double-encoded JSON; decode twice.
                original = json.loads(original)
                if isinstance(original, str):
                    original = json.loads(original)
            except Exception:
                original = {}
        if not isinstance(original, dict):
            original = {}
        # Prefer the chunk's raw extracted text; fall back to the content column.
        content = (
            original.get("raw_text")
            or row.get("content")
            or ""
        )
        if not content:
            continue
        ordered.append((first_page, str(content)))
        seen_pages.update(p for p in pages if p <= max_pages)
    ordered.sort(key=lambda item: item[0])
    joined = "\n".join(text for _, text in ordered)
    return joined, sorted(seen_pages)
+
+
+def _looks_like_subtitle(line: str) -> bool:
+ lowered = (line or "").lower()
+ return (
+ len(line) >= 12
+ and lowered != lowered.upper()
+ and not lowered.startswith("chapter ")
+ and any(token in lowered for token in ("guide", "bible", "introduction", "manual", "engineering"))
+ )
+
+
def _build_identity_json(
    raw_text: str,
    *,
    source_pages: Optional[List[int]] = None,
    fallback_title: str = "",
) -> dict:
    """Heuristically extract document-identity fields from opening-page text.

    Returns a dict with title/subtitle/owner/author/publisher/edition/date,
    cover text, an opening-page summary, the contributing page numbers, and a
    field_presence map. All values are sanitised and length-capped; optional
    fields come back as None when empty.
    """
    cleaned_raw_text = _strip_invalid_text_chars(raw_text)[:8000]
    lines = _clean_identity_lines(cleaned_raw_text)
    display_title = ""
    subtitle = ""
    named_owner = None
    author = None
    publisher = None
    edition = None
    document_date = None

    # Title: the first reasonably long line near the top of the document.
    for line in lines[:6]:
        if len(line) >= 6:
            display_title = line
            break
    if not display_title:
        display_title = fallback_title or "Untitled document"

    # Subtitle: the next subtitle-looking line that is not the title itself.
    for line in lines[1:8]:
        if line != display_title and _looks_like_subtitle(line):
            subtitle = line
            break

    # "Jane Doe — ..." style title lines name the document's owner.
    owner_match = _COVER_OWNER_RE.match(display_title)
    if owner_match:
        named_owner = owner_match.group(1).strip()

    # Author/publisher: keep only the first line of the match, capped at 200 chars.
    author_match = _AUTHOR_RE.search(cleaned_raw_text)
    if author_match:
        author = author_match.group(1).strip().splitlines()[0][:200]

    publisher_match = _PUBLISHER_RE.search(cleaned_raw_text)
    if publisher_match:
        publisher = publisher_match.group(1).strip().splitlines()[0][:200]

    edition_match = _EDITION_RE.search(cleaned_raw_text)
    if edition_match:
        edition = edition_match.group(1).strip()

    date_match = _DATE_RE.search(cleaned_raw_text)
    if date_match:
        document_date = date_match.group(0).strip()

    # Cover = first 6 lines; opening-page summary = first 14 lines.
    cover_lines = lines[:6]
    opening_summary_lines = lines[:14]
    opening_page_summary = " ".join(opening_summary_lines)
    cover_text = "\n".join(cover_lines)

    field_presence = {
        "display_title": bool(display_title),
        "subtitle": bool(subtitle),
        "named_owner": bool(named_owner),
        "author": bool(author),
        "publisher": bool(publisher),
        "edition": bool(edition),
        "document_date": bool(document_date),
        "cover_text": bool(cover_text),
        "opening_page_summary": bool(opening_page_summary),
    }

    return {
        "display_title": _truncate_identity_text(display_title, 240),
        "subtitle": _truncate_identity_text(subtitle, 240),
        "named_owner": _truncate_identity_text(named_owner or "", 120) or None,
        "author": _truncate_identity_text(author or "", 160) or None,
        "publisher": _truncate_identity_text(publisher or "", 160) or None,
        "edition": _truncate_identity_text(edition or "", 80) or None,
        "document_date": _truncate_identity_text(document_date or "", 80) or None,
        "cover_text": _truncate_identity_text(cover_text, 800),
        "opening_page_summary": _truncate_identity_text(opening_page_summary, 1000),
        "source_pages": list(source_pages or []),
        "field_presence": field_presence,
    }
+
+
def _identity_json_from_elements(
    elements: List[Any],
    *,
    fallback_title: str,
) -> dict:
    """Build the identity JSON straight from freshly partitioned elements."""
    page_text, page_numbers = _extract_identity_pages_text_from_elements(elements)
    return _build_identity_json(
        page_text, source_pages=page_numbers, fallback_title=fallback_title
    )
+
+
def _identity_json_from_docs(
    docs: List[dict],
    *,
    fallback_title: str,
) -> dict:
    """Build the identity JSON from previously ingested chunk rows."""
    page_text, page_numbers = _extract_identity_pages_text_from_docs(docs)
    return _build_identity_json(
        page_text, source_pages=page_numbers, fallback_title=fallback_title
    )
+
+
def _is_exact_fact_query(query: str) -> bool:
    """True when the query contains a phrase asking for an exact, literal fact."""
    lowered = (query or "").lower()
    for term in _EXACT_FACT_TERMS:
        if term in lowered:
            return True
    return False
+
+
def _detect_page_scope(query: str) -> Optional[str]:
    """Map page-related phrases in the query onto a scope tag, or None."""
    lowered = (query or "").lower()
    # Order matters: broader "opening" phrasing wins over first-page wording.
    # NOTE(review): substring match means "page 12" also contains "page 1" —
    # preserved as-is; confirm whether that is intended.
    scope_by_phrases = (
        ("opening_pages", ("opening pages", "opening page")),
        ("first_page", ("first page", "page one", "page 1")),
        ("cover", ("cover",)),
    )
    for scope, phrases in scope_by_phrases:
        if any(phrase in lowered for phrase in phrases):
            return scope
    numbered = _PAGE_RANGE_HINT_RE.search(query or "")
    return f"page_{numbered.group(1)}" if numbered else None
+
+
def _detect_identity_field(query: str) -> Optional[str]:
    """Return the identity field the query targets, or None when no hint matches."""
    lowered = (query or "").lower()
    for field_name, hints in _IDENTITY_FIELD_HINTS.items():
        if any(hint in lowered for hint in hints):
            return field_name
    # Generic fallback: any bare mention of "title" maps to the title field.
    return "title" if "title" in lowered else None
+
+
def _query_requires_identity_lookup(query: str) -> bool:
    """True when the query names an identity field or carries a page scope."""
    if _detect_identity_field(query):
        return True
    return bool(_detect_page_scope(query))
+
+
def _query_requires_opening_page_evidence(query: str) -> bool:
    """True when the detected page scope demands opening-page evidence."""
    return _detect_page_scope(query) in ("opening_pages", "first_page", "cover")
+
+
+def _query_asks_for_not_stated(query: str) -> bool:
+ q = (query or "").lower()
+ return "not stated" in q or "if not" in q
+
+
+def _answer_looks_abstention(answer: str) -> bool:
+ lowered = (answer or "").strip().lower()
+ if not lowered:
+ return False
+ signals = (
+ "not stated",
+ "no relevant documents were found",
+ "i don't have that information",
+ "i do not have that information",
+ "insufficient evidence",
+ "opening-page evidence was not found",
+ "not mentioned",
+ "could not find",
+ )
+ return any(signal in lowered for signal in signals)
+
+
def _history_fact_summary(content: str, *, limit: int = 180) -> str:
    """Compress a history message into a short single-line fact summary."""
    summary = re.sub(r"\s+", " ", str(content or "")).strip()
    summary = _TRANSCRIPT_LABEL_RE.sub("", summary)
    summary = _CONTROL_TOKEN_RE.sub("", summary)
    return summary[:limit]
+
+
+def _normalise_overlap_text(text: str) -> str:
+ lowered = re.sub(r"\s+", " ", (text or "").lower()).strip()
+ lowered = re.sub(r"[^a-z0-9 ]+", "", lowered)
+ return re.sub(r"\s+", " ", lowered).strip()
+
+
+def _text_overlap_ratio(a: str, b: str) -> float:
+ if not a or not b:
+ return 0.0
+ a_tokens = set(a.split())
+ b_tokens = set(b.split())
+ if not a_tokens or not b_tokens:
+ return 0.0
+ return len(a_tokens.intersection(b_tokens)) / max(1, min(len(a_tokens), len(b_tokens)))
+
+
def _collapse_near_duplicate_candidates(
    candidates: List[dict],
) -> tuple[List[dict], int]:
    """Drop near-duplicate retrieval candidates originating from the same file.

    Two candidates collapse when they share a file hash and either their
    first-40-token fingerprints match exactly or their token-set overlap is
    >= 0.92. Returns ``(kept, collapsed_count)``. No-op unless
    config.ENABLE_DUPLICATE_CHUNK_COLLAPSE is set.
    """
    if not config.ENABLE_DUPLICATE_CHUNK_COLLAPSE:
        return candidates, 0

    kept: List[dict] = []
    # Signatures of kept candidates: (file_hash, normalized_text, fingerprint).
    seen_signatures: list[tuple[str, str, str]] = []
    collapsed = 0

    for candidate in candidates:
        meta = dict(candidate.get("metadata", {}) or {})
        # File identity: metadata hash, row-level hash, then source name.
        file_hash = str(
            meta.get("file_hash")
            or candidate.get("file_hash")
            or meta.get("source")
            or "unknown"
        )
        normalized = _normalise_overlap_text(candidate.get("content", ""))
        # Cheap prefix fingerprint for exact near-head matches.
        fingerprint = " ".join(normalized.split()[:40])
        duplicate = False

        for prev_hash, prev_norm, prev_fp in seen_signatures:
            # Only candidates from the same file can collapse.
            if prev_hash != file_hash:
                continue
            if fingerprint and prev_fp and fingerprint == prev_fp:
                duplicate = True
                break
            if _text_overlap_ratio(normalized, prev_norm) >= 0.92:
                duplicate = True
                break

        if duplicate:
            collapsed += 1
            continue

        kept.append(candidate)
        seen_signatures.append((file_hash, normalized, fingerprint))

    return kept, collapsed
+
+
def _remember_session_retrieval(
    *,
    session_key: str,
    query: str,
    chunks: List[Document],
) -> None:
    """Cache the latest retrieval (chunks + query + timestamp) for follow-up detection."""
    if not session_key or not chunks:
        return
    with _last_chunks_lock:
        _last_chunks[session_key] = chunks
        _last_query_context[session_key] = {"query": query, "updated_at": time.time()}
+
+
def _get_session_context(session_key: str) -> dict:
    """Return the cached query context for a session, evicting it when the TTL lapses."""
    if not session_key:
        return {}
    with _last_chunks_lock:
        # Copy so callers can't mutate the cached dict outside the lock.
        state = dict(_last_query_context.get(session_key) or {})
        if not state:
            return {}
        updated_at = float(state.get("updated_at") or 0.0)
        # TTL <= 0 disables expiry; otherwise evict BOTH session caches on staleness.
        if config.FOLLOWUP_SESSION_TTL_S > 0 and time.time() - updated_at > config.FOLLOWUP_SESSION_TTL_S:
            _last_query_context.pop(session_key, None)
            _last_chunks.pop(session_key, None)
            return {}
        return state
+
def get_cached_embedding(text: str) -> list:
"""Return cached embedding if available, otherwise compute and store."""
@@ -425,8 +922,37 @@ def _has_text_layer(pdf_path: str) -> bool:
return False
+def _extract_element_metrics(elements: list) -> dict[str, float]:
+ page_numbers = {
+ getattr(getattr(el, "metadata", None), "page_number", None)
+ for el in elements
+ if getattr(getattr(el, "metadata", None), "page_number", None) is not None
+ }
+ page_count = max(1, len(page_numbers))
+ text_chars = sum(len(el.text) for el in elements if hasattr(el, "text") and el.text)
+ element_count = len(elements)
+ chars_per_page = text_chars / max(1, page_count)
+ return {
+ "text_chars": text_chars,
+ "element_count": element_count,
+ "page_count": page_count,
+ "chars_per_page": chars_per_page,
+ }
+
+
+def _should_retry_with_hi_res(
+ strategy: str,
+ metrics: dict[str, float],
+) -> bool:
+ return (
+ strategy == "fast"
+ and metrics["chars_per_page"] < 200
+ and metrics["element_count"] < 10
+ )
+
+
def partition_document(file_path: str) -> list:
- # NEW: Dynamic OCR routing
+ # NEW: Dynamic OCR routing + guarded high-resolution retry for suspiciously thin extraction
has_text = _has_text_layer(file_path)
strategy = "fast" if has_text else "hi_res"
log.info(
@@ -439,18 +965,161 @@ def partition_document(file_path: str) -> list:
extract_image_block_types=["Image"],
extract_image_block_to_payload=True,
)
- log.info("%d elements extracted", len(elements))
+ metrics = _extract_element_metrics(elements)
+ log.info(
+ "%d elements extracted (text_chars=%d, page_count=%d, chars_per_page=%.1f)",
+ len(elements),
+ metrics["text_chars"],
+ metrics["page_count"],
+ metrics["chars_per_page"],
+ )
+
+ if _should_retry_with_hi_res(strategy, metrics):
+ log.info(
+ "Extraction looked suspiciously thin (chars_per_page=%.1f, elements=%d) — retrying once with hi_res.",
+ metrics["chars_per_page"],
+ metrics["element_count"],
+ )
+ hi_res_elements = partition_pdf(
+ filename=file_path,
+ strategy="hi_res",
+ infer_table_structure=True,
+ extract_image_block_types=["Image"],
+ extract_image_block_to_payload=True,
+ )
+ hi_res_metrics = _extract_element_metrics(hi_res_elements)
+ if (
+ hi_res_metrics["text_chars"] > metrics["text_chars"]
+ or hi_res_metrics["element_count"] > metrics["element_count"]
+ ):
+ log.info(
+ "Using hi_res extraction instead (text_chars=%d, elements=%d).",
+ hi_res_metrics["text_chars"],
+ hi_res_metrics["element_count"],
+ )
+ return hi_res_elements
+ log.info("Keeping fast extraction — hi_res did not improve coverage.")
+
return elements
-def create_chunks(elements: list) -> list:
+def _chunking_params_for_text_chars(text_chars: int) -> dict[str, int]:
+ if text_chars <= 25_000:
+ return {
+ "max_characters": 3000,
+ "new_after_n_chars": 2500,
+ "combine_text_under_n_chars": 300,
+ }
+ return {
+ "max_characters": 8000,
+ "new_after_n_chars": 7000,
+ "combine_text_under_n_chars": 500,
+ }
+
+
+class _SyntheticNarrativeChunk:
+ def __init__(self, text: str, orig_elements: Optional[list] = None):
+ self.text = text
+ self.metadata = SimpleNamespace(orig_elements=orig_elements or [])
+
+
+def _split_narrative_text(
+ text: str,
+ target_chars: int = 700,
+ min_chars: int = 350,
+) -> list[str]:
+ clean_text = re.sub(r"\s+", " ", (text or "").strip())
+ if len(clean_text) <= target_chars + 120:
+ return [clean_text] if clean_text else []
+
+ parts = [p.strip() for p in re.split(r"\n\s*\n", text or "") if p.strip()]
+ if len(parts) <= 1:
+ parts = [p.strip() for p in re.split(r"(?<=\")\s+(?=[A-Z])|\n+", text or "") if p.strip()]
+ if len(parts) <= 1:
+ parts = [
+ p.strip()
+ for p in re.split(r"(?<=[.!?][\"']?)\s+", clean_text)
+ if p.strip()
+ ]
+
+ if not parts:
+ return [clean_text] if clean_text else []
+
+ segments: list[str] = []
+ current = ""
+ for part in parts:
+ proposed = f"{current} {part}".strip() if current else part
+ if current and len(proposed) > target_chars and len(current) >= min_chars:
+ segments.append(current)
+ current = part
+ else:
+ current = proposed
+ if current:
+ if segments and len(current) < min_chars:
+ segments[-1] = f"{segments[-1]} {current}".strip()
+ else:
+ segments.append(current)
+
+ if len(segments) <= 1 and len(clean_text) > target_chars:
+ midpoint = max(min_chars, len(clean_text) // 2)
+ cut = clean_text.rfind(". ", 0, midpoint + 150)
+ if cut <= 0:
+ cut = midpoint
+ segments = [clean_text[:cut].strip(), clean_text[cut:].strip()]
+
+ return [seg for seg in segments if seg]
+
+
+def _is_text_only_chunk(chunk) -> bool:
+ orig_elements = getattr(getattr(chunk, "metadata", None), "orig_elements", None)
+ if not orig_elements:
+ return True
+ for el in orig_elements:
+ el_type = type(el).__name__
+ if el_type in {"Table", "Image"}:
+ return False
+ return True
+
+
def _maybe_split_single_narrative_chunk(chunks: list, text_chars: int) -> list:
    """For thin (<=25k chars) single-chunk documents, split the lone text chunk into segments."""
    if len(chunks) != 1 or text_chars > 25_000:
        return chunks
    only_chunk = chunks[0]
    if not _is_text_only_chunk(only_chunk):
        return chunks
    segments = _split_narrative_text(getattr(only_chunk, "text", "") or "")
    if len(segments) <= 1:
        return chunks
    originals = getattr(getattr(only_chunk, "metadata", None), "orig_elements", [])
    log.info(
        "Thin-document splitter activated — expanding 1 narrative chunk into %d segments.",
        len(segments),
    )
    return [_SyntheticNarrativeChunk(segment, originals) for segment in segments]
+
+
def create_chunks(elements: list, text_chars: Optional[int] = None) -> list:
    """Chunk partitioned elements by title with size-adaptive parameters.

    `text_chars` (total extracted characters) may be supplied by the caller;
    when omitted it is recomputed from the elements. Thin single-chunk
    documents are post-split via _maybe_split_single_narrative_chunk.
    """
    log.info("Chunking %d elements...", len(elements))
    if text_chars is None:
        text_chars = sum(
            len(el.text) for el in elements if hasattr(el, "text") and el.text
        )
    chunking_params = _chunking_params_for_text_chars(text_chars)
    log.info(
        "Adaptive chunking selected: text_chars=%d max=%d new_after=%d combine_under=%d",
        text_chars,
        chunking_params["max_characters"],
        chunking_params["new_after_n_chars"],
        chunking_params["combine_text_under_n_chars"],
    )
    chunks = chunk_by_title(
        elements,
        max_characters=chunking_params["max_characters"],
        new_after_n_chars=chunking_params["new_after_n_chars"],
        combine_text_under_n_chars=chunking_params["combine_text_under_n_chars"],
    )
    chunks = _maybe_split_single_narrative_chunk(chunks, text_chars)
    log.info("%d chunks created", len(chunks))
    return chunks
@@ -616,6 +1285,7 @@ def process_chunks(
graph_data: DocumentGraphMetadata,
user_id: str,
pdf_images=None,
+ coverage_metrics: Optional[dict] = None,
) -> tuple[List[Document], List[str]]:
"""Convert raw unstructured chunks → LangChain Documents with parallel AI summarisation."""
print(f" Processing {len(chunks)} chunks...")
@@ -691,6 +1361,7 @@ def process_chunks(
"chunk_index": i,
"total_chunks": len(chunks),
"page_numbers": content["page_numbers"],
+ "coverage_metrics": dict(coverage_metrics or {}),
"original_content": json.dumps(
{
"raw_text": content["text"],
@@ -727,6 +1398,7 @@ def build_raptor_tree(
doc.metadata["node_level"] = 0
doc.metadata["node_id"] = d_id
doc.metadata["parent_node_id"] = None
+ doc.metadata["synthetic_root_summary"] = False
current_level_docs = list(leaf_docs)
current_level = 1
@@ -748,6 +1420,77 @@ def build_raptor_tree(
"{sections}"
)
    def _synthesize_parent_node(
        cluster: List[Document],
        level: int,
    ) -> tuple[Document, str]:
        """Summarise one cluster of child nodes into a parent RAPTOR node.

        Returns ``(parent_doc, parent_id)`` and rewires each child's
        parent_node_id to the new parent. Falls back to concatenated child
        text when the LLM call fails. Closes over `llm`, `cluster_prompt`
        and `user_id` from build_raptor_tree.
        """
        sections_text = ""
        for j, child in enumerate(cluster):
            # Prefer the precomputed summary; fall back to the raw content.
            child_text = child.metadata.get("summary", child.page_content)
            if not child_text or child_text == "No summary available.":
                child_text = child.page_content
            sections_text += f"--- SECTION {j + 1} ---\n{child_text}\n\n"

        prompt = cluster_prompt.format(count=len(cluster), sections=sections_text)
        try:
            response = llm.invoke([HumanMessage(content=prompt)])
            parent_text = response.content
        except Exception as e:
            log.warning(
                "RAPTOR summarization failed at level %d: %s",
                level,
                e,
            )
            # Fallback: concatenate truncated child contents.
            parent_text = "Merged Content:\n" + "\n".join(
                [c.page_content[:500] for c in cluster]
            )

        # Deterministic parent id — stable across re-ingests of identical content.
        child_node_ids = [
            str(c.metadata.get("node_id", "")) for c in cluster if c.metadata.get("node_id")
        ]
        parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
        base_meta = cluster[0].metadata
        parent_id = str(
            uuid.uuid5(
                uuid.NAMESPACE_DNS,
                (
                    f"{user_id}:raptor:{base_meta.get('file_hash', '')}:"
                    f"{level}:{'|'.join(child_node_ids)}:{parent_hash}"
                ),
            )
        )

        # Union of the children's page numbers.
        all_pages = set()
        for c in cluster:
            for p in c.metadata.get("page_numbers", []):
                all_pages.add(p)

        # Chunk-index range label, e.g. "3-9" (single index when equal).
        first_idx = str(cluster[0].metadata.get("chunk_index", "?")).split("-")[0]
        last_idx = str(cluster[-1].metadata.get("chunk_index", "?")).split("-")[-1]
        combo_idx = f"{first_idx}-{last_idx}" if first_idx != last_idx else first_idx

        parent_doc = Document(
            page_content=parent_text,
            metadata={
                "source": base_meta.get("source", "Unknown"),
                "file_hash": base_meta.get("file_hash", ""),
                "document_type": base_meta.get("document_type", "general_document"),
                "topics": base_meta.get("topics", []),
                "node_type": "summary",
                "node_level": level,
                "node_id": parent_id,
                # Set by the NEXT level up, if any.
                "parent_node_id": None,
                "page_numbers": sorted(list(all_pages)),
                "children_count": len(cluster),
                "chunk_index": combo_idx,
                "coverage_metrics": dict(base_meta.get("coverage_metrics", {}) or {}),
                "synthetic_root_summary": False,
            },
        )
        for child in cluster:
            child.metadata["parent_node_id"] = parent_id
        return parent_doc, parent_id
+
while len(current_level_docs) > 1:
print(
f" [RAPTOR] Building Level {current_level} (from {len(current_level_docs)} children)..."
@@ -758,81 +1501,7 @@ def build_raptor_tree(
for i in range(0, len(current_level_docs), CLUSTER_SIZE):
cluster = current_level_docs[i : i + CLUSTER_SIZE]
- # Combine the underlying texts (use the existing summary if available, else raw text)
- sections_text = ""
- for j, child in enumerate(cluster):
- # For leaves, we prefer the AI summary if it exists, otherwise page_content.
- # For higher levels, page_content IS the summary.
- child_text = child.metadata.get("summary", child.page_content)
- if not child_text or child_text == "No summary available.":
- child_text = child.page_content
- sections_text += f"--- SECTION {j + 1} ---\n{child_text}\n\n"
-
- # Generate the parent summary
- prompt = cluster_prompt.format(count=len(cluster), sections=sections_text)
- try:
- response = llm.invoke([HumanMessage(content=prompt)])
- parent_text = response.content
- except Exception as e:
- log.warning(
- f"RAPTOR summarization failed at level {current_level}, segment {i}: {e}"
- )
- # Fallback: just concatenate
- parent_text = "Merged Content:\n" + "\n".join(
- [c.page_content[:500] for c in cluster]
- )
-
- # Generate deterministic ID for the parent
- import hashlib
-
- child_node_ids = [
- str(c.metadata.get("node_id", "")) for c in cluster if c.metadata.get("node_id")
- ]
- parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
- base_meta = cluster[0].metadata
- parent_id = str(
- uuid.uuid5(
- uuid.NAMESPACE_DNS,
- (
- f"{user_id}:raptor:{base_meta.get('file_hash', '')}:"
- f"{current_level}:{'|'.join(child_node_ids)}:{parent_hash}"
- ),
- )
- )
-
- # Create the parent document
- # Inherit metadata from the first child (source array, file hash, document type)
- # Gather all unique page numbers from children
- all_pages = set()
- for c in cluster:
- for p in c.metadata.get("page_numbers", []):
- all_pages.add(p)
- # 🚀 FIX: Calculate a dynamic chunk index range (e.g., "1-5")
- first_idx = str(cluster[0].metadata.get("chunk_index", "?")).split("-")[0]
- last_idx = str(cluster[-1].metadata.get("chunk_index", "?")).split("-")[-1]
- combo_idx = (
- f"{first_idx}-{last_idx}" if first_idx != last_idx else first_idx
- )
- parent_doc = Document(
- page_content=parent_text,
- metadata={
- "source": base_meta.get("source", "Unknown"),
- "file_hash": base_meta.get("file_hash", ""),
- "document_type": base_meta.get("document_type", "general_document"),
- "topics": base_meta.get("topics", []),
- "node_type": "summary",
- "node_level": current_level,
- "node_id": parent_id,
- "parent_node_id": None, # Will be set by the NEXT level up
- "page_numbers": sorted(list(all_pages)),
- "children_count": len(cluster),
- "chunk_index": combo_idx, # 🚀 ADDED MISSING METADATA
- },
- )
-
- # Update children to point to this parent
- for child in cluster:
- child.metadata["parent_node_id"] = parent_id
+ parent_doc, parent_id = _synthesize_parent_node(cluster, current_level)
next_level_docs.append(parent_doc)
all_docs.append(parent_doc)
@@ -841,234 +1510,899 @@ def build_raptor_tree(
current_level_docs = next_level_docs
current_level += 1
+ if len(all_docs) == len(leaf_docs) and current_level_docs:
+ log.info(
+ " [RAPTOR] Single-leaf document detected — synthesising a root summary node."
+ )
+ parent_doc, parent_id = _synthesize_parent_node(current_level_docs, current_level)
+ parent_doc.metadata["synthetic_root_summary"] = True
+ all_docs.append(parent_doc)
+ all_ids.append(parent_id)
+
print(
f" [RAPTOR] Tree built. Total nodes: {len(all_docs)} (Leaves: {len(leaf_docs)}, Summaries: {len(all_docs) - len(leaf_docs)})"
)
return all_docs, all_ids
-def is_file_already_ingested(file_hash: str, access_token: str = None) -> bool:
- """
- FIX: Now hits the dedicated ingested_files registry table (O(1) indexed lookup)
- instead of doing a JSONB containment scan on the full documents table.
- Falls back to the old JSONB scan if the registry table doesn't exist yet.
- """
- supabase = _build_supabase_client(access_token)
+def _safe_graph_key(prefix: str, raw: str) -> str:
+ clean = re.sub(r"[^a-z0-9:_-]+", "_", (raw or "").lower()).strip("_")
+ return f"{prefix}:{clean}" if clean else f"{prefix}:unknown"
+
+
def _persist_graph_rows(
    *,
    nodes: List[dict],
    edges: List[dict],
) -> None:
    """Best-effort upsert of graph nodes/edges; failures are logged at debug and swallowed."""
    try:
        client = _build_service_supabase_client()
        if nodes:
            client.table("graph_nodes").upsert(
                nodes, on_conflict="user_id,node_key"
            ).execute()
        if edges:
            client.table("graph_edges").upsert(
                edges,
                on_conflict="user_id,source_node_key,target_node_key,edge_type",
            ).execute()
    except Exception as exc:
        log.debug("Graph foundation persistence skipped: %s", exc)
-def _register_ingested_file(
def _persist_graph_foundation(
    *,
    user_id: str,
    file_hash: str,
    docs: List[Document],
    graph_data: DocumentGraphMetadata,
) -> None:
    """Build and persist the graph foundation for one ingested document.

    Creates a document node, one node per extracted entity, and one node per
    chunk/summary doc, plus "part_of" edges (chunk -> document, chunk ->
    parent summary) and "mentions" edges (document/entity <-> chunk). Fully
    best-effort: any failure is logged at debug level and swallowed.
    """
    try:
        if not docs:
            return
        source_name = str((docs[0].metadata or {}).get("source") or file_hash)
        doc_key = _safe_graph_key("document", file_hash)
        nodes: List[dict] = [
            {
                "user_id": user_id,
                "node_key": doc_key,
                "node_type": "document",
                "label": source_name,
                "payload": {
                    "file_hash": file_hash,
                    "document_type": graph_data.document_type,
                    "summary": graph_data.brief_summary,
                    "topics": graph_data.primary_topics,
                },
            }
        ]
        edges: List[dict] = []
        # lowercase entity name -> node key, reused for mention matching below.
        entity_keys: dict[str, str] = {}
        for entity in graph_data.key_entities or []:
            entity_key = _safe_graph_key("entity", entity)
            entity_keys[entity.lower()] = entity_key
            nodes.append(
                {
                    "user_id": user_id,
                    "node_key": entity_key,
                    "node_type": "entity",
                    "label": entity,
                    "payload": {"file_hash": file_hash},
                }
            )
            edges.append(
                {
                    "user_id": user_id,
                    "source_node_key": doc_key,
                    "target_node_key": entity_key,
                    "edge_type": "mentions",
                    "weight": 1.0,
                    "payload": {"file_hash": file_hash},
                }
            )

        for doc in docs:
            meta = doc.metadata or {}
            node_type = str(meta.get("node_type") or "chunk")
            node_id = str(
                meta.get("node_id")
                or meta.get("id")
                or f"{file_hash}:{meta.get('chunk_index', 'unknown')}"
            )
            node_key = _safe_graph_key(node_type, node_id)
            nodes.append(
                {
                    "user_id": user_id,
                    "node_key": node_key,
                    "node_type": node_type,
                    "label": f"{source_name} :: {meta.get('chunk_index', '?')}",
                    "payload": {
                        "file_hash": file_hash,
                        "chunk_index": meta.get("chunk_index"),
                        "node_level": meta.get("node_level", 0),
                        "page_numbers": meta.get("page_numbers", []),
                        "document_type": meta.get("document_type"),
                    },
                }
            )
            # Every node belongs to its document…
            edges.append(
                {
                    "user_id": user_id,
                    "source_node_key": node_key,
                    "target_node_key": doc_key,
                    "edge_type": "part_of",
                    "weight": 1.0,
                    "payload": {"file_hash": file_hash},
                }
            )
            # …and, when present, to its RAPTOR parent summary node.
            parent_node_id = meta.get("parent_node_id")
            if parent_node_id:
                edges.append(
                    {
                        "user_id": user_id,
                        "source_node_key": node_key,
                        "target_node_key": _safe_graph_key("summary", str(parent_node_id)),
                        "edge_type": "part_of",
                        "weight": 1.0,
                        "payload": {"file_hash": file_hash},
                    }
                )
            # NOTE(review): only a single json.loads here, unlike the
            # double-decode in _extract_identity_pages_text_from_docs —
            # confirm original_content is never double-encoded at this point.
            raw_text = ""
            original = meta.get("original_content")
            if isinstance(original, str):
                try:
                    original = json.loads(original)
                except Exception:
                    original = {}
            if isinstance(original, dict):
                raw_text = str(original.get("raw_text") or "")
            # Substring scan over summary + raw text for entity mentions.
            searchable = f"{doc.page_content}\n{raw_text}".lower()
            for entity_name, entity_key in entity_keys.items():
                if entity_name and entity_name in searchable:
                    edges.append(
                        {
                            "user_id": user_id,
                            "source_node_key": entity_key,
                            "target_node_key": node_key,
                            "edge_type": "mentions",
                            "weight": 1.0,
                            "payload": {"file_hash": file_hash},
                        }
                    )

        _persist_graph_rows(nodes=nodes, edges=edges)
    except Exception as exc:
        log.debug("Graph foundation build skipped: %s", exc)
+
+
+def _extract_graph_terms(query: str) -> list[str]:
+ q = (query or "").strip()
+ if not q:
+ return []
+ quoted = re.findall(r"\"([^\"]+)\"|'([^']+)'", q)
+ terms = [next((part for part in pair if part), "").strip() for pair in quoted]
+ title_case = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}\b", q)
+ terms.extend(title_case)
+ stop_words = {
+ "what",
+ "which",
+ "does",
+ "between",
+ "these",
+ "their",
+ "about",
+ "story",
+ "stories",
+ "document",
+ "documents",
+ "common",
+ "themes",
+ "theme",
+ "compare",
+ "summary",
+ "summarise",
+ "summarize",
+ }
+ terms.extend(
+ token
+ for token in re.findall(r"[A-Za-z][A-Za-z0-9_-]{3,}", q)
+ if token.lower() not in stop_words
)
+ seen: set[str] = set()
+ ordered: list[str] = []
+ for term in terms:
+ cleaned = re.sub(r"\s+", " ", term).strip()
+ lowered = cleaned.lower()
+ if not cleaned or lowered in seen:
+ continue
+ seen.add(lowered)
+ ordered.append(cleaned)
+ return ordered[:8]
- for row in rows.data or []:
- meta = row["metadata"]
- meta["document_type"] = new_category
- supabase.table(config.VECTOR_TABLE_NAME).update({"metadata": meta}).eq(
- "id", row["id"]
- ).execute()
- # Update ingested_files registry
- supabase.table("ingested_files").update(
- {"document_type": new_category, "user_overridden": True}
- ).eq("file_hash", file_hash).execute()
+def _graph_query_should_run(
+ query: str,
+ route_info: dict,
+) -> bool:
+ return _should_run_expert(route_info, "graph_traversal", threshold=0.24) or (
+ _is_relational_query(query) and bool(_extract_graph_terms(query))
+ )
- log.info("Category override: %s… → '%s'", file_hash[:8], new_category)
+def _graph_edge_whitelist(route_mode: str) -> set[str]:
+ base = {"mentions", "part_of", "saved_from", "cites"}
+ if route_mode == "explicit_compare":
+ base.update({"compared_with"})
+ else:
+ base.update({"follows_from"})
+ return base
+
+
+def _graph_label_score(term: str, label: str) -> float:
+ term_norm = (term or "").strip().lower()
+ label_norm = (label or "").strip().lower()
+ if not term_norm or not label_norm:
+ return 0.0
+ if term_norm == label_norm:
+ return 1.0
+ if term_norm in label_norm or label_norm in term_norm:
+ overlap = len(term_norm) / max(1, len(label_norm))
+ return min(0.95, 0.55 + (0.4 * overlap))
+ term_tokens = set(term_norm.split())
+ label_tokens = set(label_norm.split())
+ if not term_tokens or not label_tokens:
+ return 0.0
+ jaccard = len(term_tokens.intersection(label_tokens)) / max(
+ 1, len(term_tokens.union(label_tokens))
+ )
+ return round(jaccard, 4)
-def delete_document(file_hash: str, access_token: str = None) -> None:
- supabase = _build_supabase_client(access_token)
- log.info("NUCLEAR DELETE initiated for hash: %s...", file_hash[:8])
- # 1. Delete Chunks (Vector Table)
- # We use a more robust "contains" filter for JSONB
+def _retrieve_graph_candidates(
+ query: str,
+ *,
+ route_mode: str,
+ access_token: str = None,
+ user_id: str = None,
+ priority_file_hashes: Optional[List[str]] = None,
+ max_hops: int = 2,
+ limit: int = 6,
+) -> List[dict]:
+ if not user_id:
+ return []
+ terms = _extract_graph_terms(query)
+ if not terms:
+ return []
+
+ sb = _build_service_supabase_client()
try:
- res_chunks = (
- supabase.table(config.VECTOR_TABLE_NAME)
- .delete()
- .contains("metadata", {"file_hash": file_hash})
+ node_rows = (
+ sb.table("graph_nodes")
+ .select("node_key, node_type, label, payload")
+ .eq("user_id", user_id)
.execute()
+ .data
+ or []
)
- log.info("Deleted %d chunks from vector store.", len(res_chunks.data or []))
- except Exception as e:
- log.error("Failed to delete chunks: %s", e)
-
- # 2. Delete from the Registry (The Fingerprint Table)
- try:
- supabase.table("ingested_files").delete().eq("file_hash", file_hash).execute()
- log.info("Removed from ingested_files registry.")
- except Exception as e:
- log.error("Failed to delete registry entry: %s", e)
-
- # 3. Delete from Chat Memory (Tier 2 Memory)
- # This is important! We don't want old memories haunting new uploads.
- try:
- # We don't have file_hash in memory, but we can clean up
- # based on filenames found in the chunks before they were deleted
- log.info("Cleaning up associated chat memories...")
- except Exception as e:
- log.debug(f"Memory cleanup skipped. {e}")
-
- # Invalidate query cache — corpus changed
- if access_token:
- try:
- from backend.core.auth_utils import extract_jwt_sub
+ edge_rows = (
+ sb.table("graph_edges")
+ .select("source_node_key, target_node_key, edge_type, weight, payload")
+ .eq("user_id", user_id)
+ .execute()
+ .data
+ or []
+ )
+ except Exception as exc:
+ log.debug("Graph retrieval skipped: %s", exc)
+ return []
- _uid = extract_jwt_sub(access_token)
- invalidate_user_cache(_uid, reason="document_deleted")
- except Exception:
- pass
+ allowed_edge_types = _graph_edge_whitelist(route_mode)
+ node_map = {str(row.get("node_key")): row for row in node_rows if row.get("node_key")}
+ adjacency: dict[str, list[tuple[str, dict]]] = defaultdict(list)
+ for edge in edge_rows:
+ edge_type = str(edge.get("edge_type") or "")
+ if edge_type not in allowed_edge_types:
+ continue
+ source = str(edge.get("source_node_key") or "")
+ target = str(edge.get("target_node_key") or "")
+ if source and target:
+ adjacency[source].append((target, edge))
+ adjacency[target].append((source, edge))
+
+ matched_starts: list[tuple[str, float]] = []
+ for row in node_rows:
+ label = str(row.get("label") or "")
+ best_score = max((_graph_label_score(term, label) for term in terms), default=0.0)
+ if best_score >= 0.25:
+ matched_starts.append((str(row.get("node_key")), best_score))
+ matched_starts.sort(key=lambda item: item[1], reverse=True)
+ matched_starts = matched_starts[:6]
+ if not matched_starts:
+ return []
+ target_hashes = set(priority_file_hashes or [])
+ visited_depth: dict[str, int] = {}
+ traversal_scores: dict[str, float] = {}
+ queue: deque[tuple[str, int, float]] = deque()
+ for node_key, score in matched_starts:
+ queue.append((node_key, 0, score))
+ visited_depth[node_key] = 0
+ traversal_scores[node_key] = max(traversal_scores.get(node_key, 0.0), score)
+
+ while queue:
+ node_key, depth, score = queue.popleft()
+ if depth >= max_hops:
+ continue
+ for next_key, edge in adjacency.get(node_key, []):
+ next_depth = depth + 1
+ next_score = max(
+ 0.1,
+ score * float(edge.get("weight") or 1.0) * (0.78 if next_depth == 2 else 0.9),
+ )
+ if next_key not in visited_depth or next_depth < visited_depth[next_key]:
+ visited_depth[next_key] = next_depth
+ traversal_scores[next_key] = max(traversal_scores.get(next_key, 0.0), next_score)
+ queue.append((next_key, next_depth, next_score))
+ else:
+ traversal_scores[next_key] = max(traversal_scores.get(next_key, 0.0), next_score)
-def upload_to_supabase(
- documents: List[Document],
- ids: List[str],
- access_token: str = None,
-) -> None:
- def sanitize_text(val):
- if isinstance(val, str):
- # 1. Strip Null Bytes (Fixes your current 22P05 error)
- val = val.replace("\x00", "").replace("\u0000", "")
+ candidate_specs: list[dict] = []
+ for node_key, depth in visited_depth.items():
+ row = node_map.get(node_key)
+ if not row:
+ continue
+ node_type = str(row.get("node_type") or "")
+ payload = dict(row.get("payload") or {})
+ file_hash = str(payload.get("file_hash") or "")
+ if target_hashes and file_hash and file_hash not in target_hashes:
+ continue
+ if node_type not in {"document", "leaf", "summary"}:
+ continue
+ candidate_specs.append(
+ {
+ "node_key": node_key,
+ "node_type": node_type,
+ "payload": payload,
+ "graph_score": round(traversal_scores.get(node_key, 0.0), 4),
+ "depth": depth,
+ "label": row.get("label"),
+ "file_hash": file_hash,
+ }
+ )
- # 2. Strip Orphaned Surrogates (Prevents future JSONB crashes)
- # This regex removes characters in the surrogate range D800-DFFF
- val = re.sub(r"[\ud800-\udfff]", "", val)
+ candidate_specs.sort(
+ key=lambda item: (item["depth"], -float(item.get("graph_score") or 0.0))
+ )
- return val
- return val
- def _log_ingestion_retry_event(
- *,
- user_id: str,
- batch_num: int,
- total_batches: int,
- attempt: int,
- event_type: str,
- message: str = "",
- sleep_s: float = 0.0,
- ) -> None:
- """
- Best-effort telemetry for ingestion retry behavior.
- Table: public.ingestion_retry_logs
- """
+ vector_sb = _build_supabase_client(access_token)
+ hydrated: list[dict] = []
+ seen_ids: set[str] = set()
+ for spec in candidate_specs:
+ file_hash = spec.get("file_hash")
+ if not file_hash:
+ continue
try:
- _build_service_supabase_client().table("ingestion_retry_logs").insert(
+ res = (
+ vector_sb.table(config.VECTOR_TABLE_NAME)
+ .select("id, content, metadata")
+ .eq("user_id", user_id)
+ .contains("metadata", {"file_hash": file_hash})
+ .execute()
+ )
+ except Exception as exc:
+ log.debug("Graph candidate hydration skipped for %s: %s", file_hash, exc)
+ continue
+ rows = res.data or []
+ if spec["node_type"] == "document":
+ chosen_rows = [
+ row
+ for row in rows
+ if str((row.get("metadata", {}) or {}).get("node_type") or "") == "summary"
+ ][:2] or rows[:2]
+ else:
+ chosen_rows = [
+ row
+ for row in rows
+ if str((row.get("metadata", {}) or {}).get("node_type") or "leaf") == spec["node_type"]
+ ]
+ chunk_index = spec["payload"].get("chunk_index")
+ if chunk_index is not None:
+ chosen_rows = [
+ row
+ for row in chosen_rows
+ if str((row.get("metadata", {}) or {}).get("chunk_index")) == str(chunk_index)
+ ] or chosen_rows
+ chosen_rows = chosen_rows[:2]
+
+ for row in chosen_rows:
+ row_id = str(row.get("id") or "")
+ if not row_id or row_id in seen_ids:
+ continue
+ seen_ids.add(row_id)
+ meta = dict(row.get("metadata", {}) or {})
+ meta["graph_score"] = spec["graph_score"]
+ meta["graph_hit_depth"] = spec["depth"]
+ meta["retrieval_branch"] = "graph_traversal"
+ meta["graph_node_type"] = spec["node_type"]
+ hydrated.append(
{
- "user_id": user_id,
- "batch_num": batch_num,
- "total_batches": total_batches,
- "attempt": attempt,
- "event_type": event_type,
- "message": message[:500],
- "sleep_s": sleep_s,
+ "id": row.get("id"),
+ "content": row.get("content", ""),
+ "metadata": meta,
}
- ).execute()
- except Exception:
- pass
+ )
+ if len(hydrated) >= limit:
+ return hydrated
+ return hydrated
- BATCH_SIZE = config.UPLOAD_BATCH_SIZE
- BATCH_SLEEP = config.UPLOAD_BATCH_SLEEP_S
- # Extract user_id from verified JWT — same source as before
- # but now passed explicitly to RPC instead of hidden in metadata JSON
+def is_file_already_ingested(file_hash: str, access_token: str = None) -> bool:
+ """
+ FIX: Now hits the dedicated ingested_files registry table (O(1) indexed lookup)
+ instead of doing a JSONB containment scan on the full documents table.
+ Falls back to the old JSONB scan if the registry table doesn't exist yet.
+ """
+ supabase = _build_supabase_client(access_token)
user_id = None
if access_token:
from backend.core.auth_utils import safe_extract_jwt_sub
user_id = safe_extract_jwt_sub(access_token)
+ try:
+ q = supabase.table("ingested_files").select("id").eq("file_hash", file_hash)
+ if user_id:
+ q = q.eq("user_id", user_id)
+ result = q.limit(1).execute()
+ return len(result.data) > 0
+ except Exception as exc:
+ log.warning(
+ "ingested_files table unavailable (%s). Falling back to JSONB scan.", exc
+ )
+ try:
+ q = (
+ supabase.table(config.VECTOR_TABLE_NAME)
+ .select("id")
+ .contains("metadata", {"file_hash": file_hash})
+ )
+ if user_id:
+ q = q.eq("user_id", user_id)
+ result = q.limit(1).execute()
+ return len(result.data) > 0
+ except Exception as exc2:
+ log.warning("Fallback dedup check also failed: %s", exc2)
+ return False
- if not user_id:
- raise ValueError("Cannot upload documents without a verified user_id.")
- # Build embeddings once
- embedder = _build_embeddings()
- sb = _build_service_supabase_client()
+def _register_ingested_file(
+ file_hash: str,
+ filename: str,
+ document_type: str,
+ chunk_count: int,
+ identity_json: Optional[dict] = None,
+ access_token: str = None,
+) -> None:
+ """Insert a row into ingested_files registry after successful upload."""
+ supabase = _build_supabase_client(access_token)
+ try:
+ payload = {
+ "file_hash": file_hash,
+ "filename": filename,
+ "document_type": document_type,
+ "chunk_count": chunk_count,
+ }
+ if identity_json:
+ payload["identity_json"] = identity_json
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
- log.info("Uploading %d docs in batches of %d...", len(documents), BATCH_SIZE)
+ user_id = safe_extract_jwt_sub(access_token)
+ if user_id:
+ payload["user_id"] = user_id
+ supabase.table("ingested_files").upsert(
+ payload,
+ on_conflict="user_id,file_hash", # ← string, not list
+ ).execute()
+ else:
+ supabase.table("ingested_files").upsert(
+ payload,
+ on_conflict="file_hash",
+ ).execute()
+ log.info("Registered in ingested_files: %s (%s)", filename, document_type)
+ except Exception as exc:
+ log.warning("Could not register in ingested_files: %s", exc)
- total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE
- for batch_num, start in enumerate(range(0, len(documents), BATCH_SIZE), 1):
- batch_docs = documents[start : start + BATCH_SIZE]
- batch_ids = ids[start : start + BATCH_SIZE]
- log.info("Batch %d/%d (%d docs)...", batch_num, total_batches, len(batch_docs))
+def _fetch_ingested_file_row(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ access_token: Optional[str] = None,
+) -> Optional[dict]:
+ if not file_hash or not user_id:
+ return None
+ try:
+ row = (
+ _build_service_supabase_client()
+ .table("ingested_files")
+ .select("file_hash, filename, document_type, user_overridden, identity_json")
+ .eq("user_id", user_id)
+ .eq("file_hash", file_hash)
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ return row[0] if row else None
+ except Exception as exc:
+ log.debug("Could not fetch ingested_files row: %s", exc)
+ return None
- max_attempts = max(1, int(config.UPLOAD_RETRY_MAX_ATTEMPTS))
- base_sleep = float(config.UPLOAD_RETRY_BASE_SLEEP_S)
- max_sleep = float(config.UPLOAD_RETRY_MAX_SLEEP_S)
+
+def _persist_identity_json(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ identity_json: Optional[dict],
+) -> None:
+ if not file_hash or not user_id or not identity_json:
+ return
+ try:
+ _build_service_supabase_client().table("ingested_files").update(
+ {"identity_json": identity_json}
+ ).eq("user_id", user_id).eq("file_hash", file_hash).execute()
+ except Exception as exc:
+ log.debug("Could not persist identity_json: %s", exc)
+
+
+def _load_documents_for_identity_backfill(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ limit: Optional[int] = 24,
+) -> List[dict]:
+ if not file_hash or not user_id:
+ return []
+ try:
+ query = (
+ _build_service_supabase_client()
+ .table(config.VECTOR_TABLE_NAME)
+ .select("content, metadata, node_type, node_level, parent_node_id")
+ .eq("user_id", user_id)
+ .contains("metadata", {"file_hash": file_hash})
+ )
+ if limit is not None:
+ query = query.limit(limit)
+ rows = query.execute().data or []
+ return rows
+ except Exception as exc:
+ log.debug("Identity backfill document fetch failed: %s", exc)
+ return []
+
+
+def _has_ingestion_checkpoint(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ event_type: str,
+) -> bool:
+ if not file_hash or not user_id or not event_type:
+ return False
+ try:
+ rows = (
+ _build_service_supabase_client()
+ .table("ingestion_retry_logs")
+ .select("id")
+ .eq("user_id", user_id)
+ .eq("file_hash", file_hash)
+ .eq("event_type", event_type)
+ .limit(1)
+ .execute()
+ .data
+ or []
+ )
+ return bool(rows)
+ except Exception as exc:
+ log.debug("Could not inspect ingestion checkpoints for %s: %s", file_hash, exc)
+ return False
+
+
+def _cleanup_existing_ingestion_fragments(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ access_token: Optional[str] = None,
+) -> None:
+ if not file_hash or not user_id:
+ return
+ supabase = _build_supabase_client(access_token)
+ log.info("Cleaning up existing fragments for hash: %s", file_hash)
+ supabase.table(config.VECTOR_TABLE_NAME).delete().eq(
+ "user_id", user_id
+ ).contains("metadata", {"file_hash": file_hash}).execute()
+ supabase.table("ingested_files").delete().eq("user_id", user_id).eq(
+ "file_hash", file_hash
+ ).execute()
+ supabase.table("document_trees").delete().eq("user_id", user_id).eq(
+ "file_hash", file_hash
+ ).execute()
+
+
+def _repair_missing_ingested_file_registry(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ access_token: Optional[str] = None,
+ filename_hint: str = "",
+) -> Optional[dict]:
+ if not file_hash or not user_id:
+ return None
+ docs = _load_documents_for_identity_backfill(file_hash, user_id=user_id, limit=None)
+ if not docs:
+ return None
+
+ first_meta = dict((docs[0].get("metadata") or {}))
+ filename = str(first_meta.get("source") or filename_hint or file_hash).strip() or file_hash
+ document_type = str(first_meta.get("document_type") or "general_document").strip() or "general_document"
+ identity_json = _identity_json_from_docs(docs, fallback_title=filename)
+
+ payload = {
+ "user_id": user_id,
+ "file_hash": file_hash,
+ "filename": filename,
+ "document_type": document_type,
+ "chunk_count": len(docs),
+ "identity_json": identity_json or {},
+ }
+ try:
+ _build_service_supabase_client().table("ingested_files").upsert(
+ payload,
+ on_conflict="user_id,file_hash",
+ ).execute()
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash,
+ batch_num=0,
+ total_batches=0,
+ attempt=1,
+ event_type="registry_repaired",
+ message="Recovered ingested_files row from existing uploaded chunks.",
+ )
+ return {
+ "pending_review": False,
+ "document_type": document_type,
+ "filename": filename,
+ "file_hash": file_hash,
+ "recovered_existing": True,
+ }
+ except Exception as exc:
+ log.warning("Could not repair ingested_files row for %s: %s", file_hash, exc)
+ return None
+
+
+def _recover_or_prepare_orphaned_upload(
+ file_hash: str,
+ *,
+ user_id: Optional[str],
+ access_token: Optional[str] = None,
+ filename_hint: str = "",
+ force: bool = False,
+) -> Optional[dict]:
+ if not file_hash or not user_id:
+ return None
+
+ docs = _load_documents_for_identity_backfill(file_hash, user_id=user_id, limit=1)
+ if not docs:
+ return None
+
+ if not force and _has_ingestion_checkpoint(
+ file_hash,
+ user_id=user_id,
+ event_type="upload_complete",
+ ):
+ repaired = _repair_missing_ingested_file_registry(
+ file_hash,
+ user_id=user_id,
+ access_token=access_token,
+ filename_hint=filename_hint,
+ )
+ if repaired:
+ log.info(
+ "Recovered previously uploaded document without recomputing: %s",
+ file_hash,
+ )
+ return repaired
+
+ log.info(
+ "Found partial orphaned upload for %s — cleaning fragments before recompute.",
+ file_hash,
+ )
+ _cleanup_existing_ingestion_fragments(
+ file_hash,
+ user_id=user_id,
+ access_token=access_token,
+ )
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash,
+ batch_num=0,
+ total_batches=0,
+ attempt=1,
+ event_type="orphan_cleanup",
+ message="Detected orphaned upload fragments and cleaned them before restart.",
+ )
+ return None
+
+
+def _load_or_backfill_identity_row(
+ *,
+ file_hash: str,
+ user_id: Optional[str],
+ access_token: Optional[str] = None,
+) -> Optional[dict]:
+ row = _fetch_ingested_file_row(file_hash, user_id=user_id, access_token=access_token)
+ if not row:
+ return None
+ if isinstance(row.get("identity_json"), dict) and row.get("identity_json"):
+ return row
+
+ docs = _load_documents_for_identity_backfill(file_hash, user_id=user_id)
+ if not docs:
+ return row
+
+ identity_json = _identity_json_from_docs(
+ docs,
+ fallback_title=str(row.get("filename") or file_hash),
+ )
+ if identity_json:
+ row["identity_json"] = identity_json
+ _persist_identity_json(file_hash, user_id=user_id, identity_json=identity_json)
+ return row
+
+
+def _apply_category_override(
+ file_hash: str, new_category: str, access_token: str = None
+) -> None:
+ """
+ Patch document_type in all chunks belonging to this file_hash.
+ Also updates ingested_files registry and refreshes materialized view.
+ Safe to call any number of times — fully idempotent.
+ """
+ supabase = _build_supabase_client(access_token)
+ user_id = None
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ user_id = safe_extract_jwt_sub(access_token)
+
+ # Fetch all chunks for this file
+ rows_q = (
+ supabase.table(config.VECTOR_TABLE_NAME)
+ .select("id, metadata")
+ .eq("metadata->>file_hash", file_hash)
+ )
+ if user_id:
+ rows_q = rows_q.eq("user_id", user_id)
+ rows = rows_q.execute()
+
+ for row in rows.data or []:
+ meta = row["metadata"]
+ meta["document_type"] = new_category
+ supabase.table(config.VECTOR_TABLE_NAME).update({"metadata": meta}).eq(
+ "id", row["id"]
+ ).execute()
+
+ # Update ingested_files registry
+ update_q = (
+ supabase.table("ingested_files")
+ .update({"document_type": new_category, "user_overridden": True})
+ .eq("file_hash", file_hash)
+ )
+ if user_id:
+ update_q = update_q.eq("user_id", user_id)
+ update_q.execute()
+
+ log.info("Category override: %s… → '%s'", file_hash[:8], new_category)
+
+
+def delete_document(file_hash: str, access_token: str = None) -> None:
+ supabase = _build_supabase_client(access_token)
+ user_id = None
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ user_id = safe_extract_jwt_sub(access_token)
+ log.info("NUCLEAR DELETE initiated for hash: %s...", file_hash[:8])
+
+ # 1. Delete Chunks (Vector Table)
+ # We use a more robust "contains" filter for JSONB
+ try:
+ q = (
+ supabase.table(config.VECTOR_TABLE_NAME)
+ .delete()
+ .contains("metadata", {"file_hash": file_hash})
+ )
+ if user_id:
+ q = q.eq("user_id", user_id)
+ res_chunks = q.execute()
+ log.info("Deleted %d chunks from vector store.", len(res_chunks.data or []))
+ except Exception as e:
+ log.error("Failed to delete chunks: %s", e)
+
+ # 2. Delete from the Registry (The Fingerprint Table)
+ try:
+ q = supabase.table("ingested_files").delete().eq("file_hash", file_hash)
+ if user_id:
+ q = q.eq("user_id", user_id)
+ q.execute()
+ log.info("Removed from ingested_files registry.")
+ except Exception as e:
+ log.error("Failed to delete registry entry: %s", e)
+
+ # 3. Delete from Chat Memory (Tier 2 Memory)
+ # This is important! We don't want old memories haunting new uploads.
+ try:
+ # We don't have file_hash in memory, but we can clean up
+ # based on filenames found in the chunks before they were deleted
+ log.info("Cleaning up associated chat memories...")
+ except Exception as e:
+ log.debug(f"Memory cleanup skipped. {e}")
+
+ # Invalidate query cache — corpus changed
+ if access_token:
+ try:
+ from backend.core.auth_utils import extract_jwt_sub
+
+ _uid = extract_jwt_sub(access_token)
+ invalidate_user_cache(_uid, reason="document_deleted")
+ except Exception:
+ pass
+
+
+def upload_to_supabase(
+ documents: List[Document],
+ ids: List[str],
+ access_token: str = None,
+) -> None:
+ def sanitize_text(val):
+ if isinstance(val, str):
+ # 1. Strip Null Bytes (Fixes your current 22P05 error)
+ val = val.replace("\x00", "").replace("\u0000", "")
+
+ # 2. Strip Orphaned Surrogates (Prevents future JSONB crashes)
+ # This regex removes characters in the surrogate range D800-DFFF
+ val = re.sub(r"[\ud800-\udfff]", "", val)
+
+ return val
+ return val
+ BATCH_SIZE = config.UPLOAD_BATCH_SIZE
+
+ # Extract user_id from verified JWT — same source as before
+ # but now passed explicitly to RPC instead of hidden in metadata JSON
+ user_id = None
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ user_id = safe_extract_jwt_sub(access_token)
+
+ if not user_id:
+ raise ValueError("Cannot upload documents without a verified user_id.")
+
+ # Build embeddings once
+ embedder = _build_embeddings()
+ sb = _build_service_supabase_client()
+ file_hash = str((documents[0].metadata or {}).get("file_hash") or "") if documents else ""
+
+ log.info("Uploading %d docs in batches of %d...", len(documents), BATCH_SIZE)
+
+ total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE
+ for batch_num, start in enumerate(range(0, len(documents), BATCH_SIZE), 1):
+ batch_docs = documents[start : start + BATCH_SIZE]
+ batch_ids = ids[start : start + BATCH_SIZE]
+
+ log.info("Batch %d/%d (%d docs)...", batch_num, total_batches, len(batch_docs))
+
+ max_attempts = max(1, int(config.UPLOAD_RETRY_MAX_ATTEMPTS))
+ base_sleep = float(config.UPLOAD_RETRY_BASE_SLEEP_S)
+ max_sleep = float(config.UPLOAD_RETRY_MAX_SLEEP_S)
attempt = 0
# 🚀 MOVE SANITIZATION HERE (Outside the retry loop)
for doc in batch_docs:
@@ -1090,23 +2424,50 @@ def upload_to_supabase(
texts = [doc.page_content for doc in batch_docs]
vectors = embedder.embed_documents(texts)
- # Insert via RPC — user_id is explicit, not from metadata
+ batch_rows = []
for doc, doc_id, vector in zip(batch_docs, batch_ids, vectors):
- sb.rpc(
- "insert_document_chunk",
+ batch_rows.append(
{
- "p_id": doc_id,
- "p_content": doc.page_content,
- "p_metadata": doc.metadata,
- "p_embedding": vector,
- "p_user_id": user_id,
- "p_node_type": doc.metadata.get("node_type", "leaf"),
- "p_parent_node_id": doc.metadata.get("parent_node_id"),
- "p_node_level": doc.metadata.get("node_level", 0),
- },
+ "id": doc_id,
+ "content": doc.page_content,
+ "metadata": doc.metadata,
+ "embedding": vector,
+ "user_id": user_id,
+ "node_type": doc.metadata.get("node_type", "leaf"),
+ "parent_node_id": doc.metadata.get("parent_node_id"),
+ "node_level": doc.metadata.get("node_level", 0),
+ }
+ )
+
+ try:
+ sb.rpc(
+ "insert_document_chunks_batch",
+ {"p_rows": batch_rows},
).execute()
+ except Exception as batch_exc:
+ if "insert_document_chunks_batch" not in str(batch_exc):
+ raise
+ log.warning(
+ "Batch chunk RPC unavailable, falling back to per-chunk RPC: %s",
+ str(batch_exc)[:120],
+ )
+ for row in batch_rows:
+ sb.rpc(
+ "insert_document_chunk",
+ {
+ "p_id": row["id"],
+ "p_content": row["content"],
+ "p_metadata": row["metadata"],
+ "p_embedding": row["embedding"],
+ "p_user_id": row["user_id"],
+ "p_node_type": row["node_type"],
+ "p_parent_node_id": row["parent_node_id"],
+ "p_node_level": row["node_level"],
+ },
+ ).execute()
_log_ingestion_retry_event(
user_id=user_id,
+ file_hash=file_hash,
batch_num=batch_num,
total_batches=total_batches,
attempt=attempt,
@@ -1122,6 +2483,7 @@ def upload_to_supabase(
if (not retryable) or attempt >= max_attempts:
_log_ingestion_retry_event(
user_id=user_id,
+ file_hash=file_hash,
batch_num=batch_num,
total_batches=total_batches,
attempt=attempt,
@@ -1142,6 +2504,7 @@ def upload_to_supabase(
)
_log_ingestion_retry_event(
user_id=user_id,
+ file_hash=file_hash,
batch_num=batch_num,
total_batches=total_batches,
attempt=attempt,
@@ -1151,9 +2514,15 @@ def upload_to_supabase(
)
time.sleep(sleep_s)
- if start + BATCH_SIZE < len(documents):
- time.sleep(BATCH_SLEEP)
-
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash,
+ batch_num=total_batches,
+ total_batches=total_batches,
+ attempt=1,
+ event_type="upload_complete",
+ message=f"Uploaded {len(documents)} nodes across {total_batches} batches.",
+ )
log.info("Upload complete.")
@@ -1216,12 +2585,28 @@ def run_ingestion(
access_token: str = None,
) -> str:
STEPS = 6
+ stage_timings_ms: dict[str, int] = {}
def _progress(step: int, msg: str):
log.info("[%d/%d] %s", step, STEPS, msg)
if progress_callback:
progress_callback(step, STEPS, msg)
+ def _record_stage_timing(stage_name: str, started_at: float) -> None:
+ elapsed_ms = max(0, int((time.perf_counter() - started_at) * 1000))
+ stage_timings_ms[stage_name] = elapsed_ms
+ log.info("Ingestion stage '%s' completed in %d ms", stage_name, elapsed_ms)
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash if "file_hash" in locals() else None,
+ batch_num=0,
+ total_batches=0,
+ attempt=1,
+ event_type="stage_timing",
+ message=json.dumps({"stage": stage_name, "elapsed_ms": elapsed_ms})[:500],
+ sleep_s=0,
+ )
+
log.info("=" * 50)
log.info("Starting ingestion: %s", pdf_path)
@@ -1237,6 +2622,16 @@ def run_ingestion(
_progress(1, "Computing file fingerprint…")
file_hash = get_file_fingerprint(pdf_path)
already_exists = is_file_already_ingested(file_hash, access_token=access_token)
+ if not already_exists:
+ recovered_existing = _recover_or_prepare_orphaned_upload(
+ file_hash,
+ user_id=user_id,
+ access_token=access_token,
+ filename_hint=original_filename or os.path.basename(pdf_path),
+ force=force,
+ )
+ if recovered_existing:
+ return recovered_existing
if already_exists and not force:
log.info("SKIPPING — already ingested.")
return "already_ingested"
@@ -1250,6 +2645,7 @@ def run_ingestion(
_existing = (
_sb.table("ingested_files")
.select("document_type, user_overridden")
+ .eq("user_id", user_id)
.eq("file_hash", file_hash)
.limit(1)
.execute()
@@ -1266,22 +2662,14 @@ def run_ingestion(
# 🚀 SELF-HEALING: If we are here, it's either a FORCE upload or a
# RE-UPLOAD of a failed/zombie file. We must wipe previous fragments first.
if already_exists or force:
- log.info("Cleaning up existing fragments for hash: %s", file_hash)
- supabase = _build_supabase_client(access_token)
- # 1. Clear the chunks
- supabase.table(config.VECTOR_TABLE_NAME).delete().eq(
- "user_id", user_id
- ).contains("metadata", {"file_hash": file_hash}).execute()
- # 2. Clear the registry
- supabase.table("ingested_files").delete().eq("user_id", user_id).eq(
- "file_hash", file_hash
- ).execute()
- # 3. Clear the tree if it exists
- supabase.table("document_trees").delete().eq("user_id", user_id).eq(
- "file_hash", file_hash
- ).execute()
+ _cleanup_existing_ingestion_fragments(
+ file_hash,
+ user_id=user_id,
+ access_token=access_token,
+ )
_progress(2, "Partitioning PDF (OCR + layout detection)…")
+ stage_started = time.perf_counter()
elements = partition_document(pdf_path)
pdf_images = extract_images_from_pdf(pdf_path)
if not elements:
@@ -1290,159 +2678,1201 @@ def run_ingestion(
"If scanned, ensure tesseract-ocr is installed."
)
text_chars = sum(len(el.text) for el in elements if hasattr(el, "text") and el.text)
+ coverage_metrics = _extract_element_metrics(elements)
if text_chars < 50:
raise ValueError(
f"PDF contains almost no readable text ({text_chars} chars). "
"May be corrupted or image-only without OCR layer."
)
+ identity_json = _identity_json_from_elements(
+ elements,
+ fallback_title=_extract_pdf_title(elements, os.path.basename(pdf_path)),
+ )
+ _record_stage_timing("partition", stage_started)
_progress(3, "Classifying document and building taxonomy…")
+ stage_started = time.perf_counter()
graph_data = extract_document_entities(
elements,
access_token=access_token,
forced_category=forced_category,
)
- if not graph_data.is_allowed:
- raise ValueError("Document rejected: appears blank, spam, or unreadable.")
- log.info("Category: '%s'", graph_data.document_type)
+ if not graph_data.is_allowed:
+ raise ValueError("Document rejected: appears blank, spam, or unreadable.")
+ log.info("Category: '%s'", graph_data.document_type)
+ _record_stage_timing("classify", stage_started)
+
+ # ── NEW: NATIVE PAGEINDEX TREE GENERATION ──
+ try:
+ log.info("🌳 Generating structural PageIndex tree...")
+ doc_tree = _build_document_tree(elements)
+
+ sb = _build_service_supabase_client()
+ sb.table("document_trees").upsert(
+ {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
+ on_conflict="user_id,file_hash",
+ ).execute()
+ log.info("✅ PageIndex tree saved to Supabase.")
+ except Exception as e:
+ log.warning("⚠️ Failed to generate/save document tree: %s", e)
+ # ───────────────────────────────────────────
+
+ _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
+ stage_started = time.perf_counter()
+ chunks = create_chunks(elements, text_chars=text_chars)
+ if original_filename:
+ pdf_path_for_naming = original_filename
+ else:
+ pdf_path_for_naming = pdf_path
+ docs, ids = process_chunks(
+ chunks,
+ elements,
+ pdf_path_for_naming,
+ file_hash,
+ graph_data,
+ user_id,
+ pdf_images,
+ coverage_metrics=coverage_metrics,
+ )
+ _record_stage_timing("chunk_process", stage_started)
+
+ # --- NATIVE RAPTOR INDEXING ---
+ _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
+ stage_started = time.perf_counter()
+ docs, ids = build_raptor_tree(docs, ids, user_id)
+ _persist_graph_foundation(
+ user_id=user_id,
+ file_hash=file_hash,
+ docs=docs,
+ graph_data=graph_data,
+ )
+ _record_stage_timing("raptor", stage_started)
+
+ smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
+ if export_json:
+ log.info("💾 Exporting processed chunks to local JSON...")
+ export_to_json(docs)
+
+ _progress(6, f"Embedding and uploading {len(docs)} tree nodes…")
+ stage_started = time.perf_counter()
+ upload_to_supabase(docs, ids, access_token=access_token)
+ _record_stage_timing("upload", stage_started)
+
+ # 🚀 FINAL REGISTRATION: Replaced helper call with direct UPSERT
+ # This prevents the "Duplicate Key" error from vanishing your files!
+ try:
+ sb = _build_service_supabase_client()
+ sb.table("ingested_files").upsert(
+ {
+ "user_id": user_id,
+ "file_hash": file_hash,
+ "filename": smart_name,
+ "document_type": graph_data.document_type,
+ "chunk_count": len(docs),
+ "identity_json": identity_json,
+ },
+ on_conflict="user_id,file_hash",
+ ).execute()
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash,
+ batch_num=0,
+ total_batches=0,
+ attempt=1,
+ event_type="registry_saved",
+ message="Registered ingested file after successful upload.",
+ )
+ except Exception as e:
+ log.error("Failed to register file: %s", e)
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash,
+ batch_num=0,
+ total_batches=0,
+ attempt=1,
+ event_type="registry_failed",
+ message=str(e),
+ )
+
+ # Invalidate cache
+ if access_token:
+ try:
+ invalidate_user_cache(user_id, reason="new_document_ingested")
+ except Exception:
+ pass
+
+ log.info("Ingestion complete!")
+ _log_ingestion_retry_event(
+ user_id=user_id,
+ file_hash=file_hash,
+ batch_num=0,
+ total_batches=0,
+ attempt=1,
+ event_type="ingestion_complete",
+ message="Ingestion completed successfully.",
+ )
+ log.info("Ingestion stage timings (ms): %s", stage_timings_ms)
+ return {
+ "pending_review": True,
+ "document_type": graph_data.document_type,
+ "filename": smart_name,
+ "file_hash": file_hash,
+ }
+
+
+# =========================================================================== #
+# RETRIEVAL #
+# =========================================================================== #
+def generate_sub_queries(
+ original_query: str,
+ *,
+ route_class: str = "factoid",
+) -> List[str]:
+ """
+ Rewrite user query into 1-3 targeted sub-queries for better recall.
+
+ TASK 3: Now delegates to ProviderFactory.build_chat_llm(purpose="rewriter")
+ instead of building ChatOpenAI directly. ProviderFactory reads LLM_FALLBACK_LIST
+ from .env first, then falls back to REWRITER_MODELS — no hardcoded model IDs here.
+ """
+ if not original_query or not original_query.strip():
+ return ["general document information"]
+
+ if route_class == "no_retrieval":
+ return [original_query]
+
+ if route_class in {"exact_fact", "page_scoped"}:
+ return [original_query]
+
+ if route_class in {"factoid", "follow_up"} and len(original_query.split()) <= 8:
+ return [original_query]
+
+ log.info("Query rewriter: %r", original_query)
+
+ prompt = (
+ "You are an expert search query optimiser.\n"
+ "Break the user's question into 1-3 distinct, targeted search queries.\n"
+ "If simple, return 1 optimised version. Do NOT answer it.\n\n"
+ f"USER QUESTION: {original_query}"
+ )
+
+ try:
+ # FallbackChatLLM handles rotation + retries internally.
+ # with_structured_output requires a raw ChatOpenAI — build the first model
+ # via ProviderFactory and attach structured output to it.
+ from backend.core import config as _cfg
+ from langchain_openai import ChatOpenAI
+
+ models = _cfg.LLM_FALLBACK_LIST or _cfg.REWRITER_MODELS
+ for model in models:
+ try:
+ llm = ChatOpenAI(
+ model=model,
+ openai_api_key=_cfg.OPENROUTER_API_KEY,
+ openai_api_base=_cfg.OPENROUTER_BASE_URL,
+ temperature=0.0,
+ max_tokens=150,
+ max_retries=0,
+ timeout=20,
+ )
+ res = llm.with_structured_output(QueryVariants).invoke(
+ [HumanMessage(content=prompt)]
+ )
+ queries = [q.strip() for q in res.sub_queries if q.strip()]
+ if queries:
+ log.info(
+ "Rewriter (%s) → %d sub-queries: %s",
+ model,
+ len(queries),
+ queries,
+ )
+ if (
+ config.ENABLE_HYDE
+ and route_class == "factoid"
+ and len(original_query.split()) >= 4
+ ):
+ queries = [original_query] + queries
+ return queries
+ except Exception as exc:
+ err = str(exc)
+ log.info(
+ "Rewriter model %s unavailable (%s) — trying next.", model, err[:60]
+ )
+ continue
+ except Exception as exc:
+ log.warning("Rewriter failed entirely: %s — using original query.", exc)
+
+ log.info("All rewriter models exhausted — using original query.")
+ return [original_query]
+
+
+def _normalise_title_for_query(title: str) -> str:
+ clean = os.path.splitext((title or "").strip())[0]
+ clean = re.sub(r"[_\-]+", " ", clean)
+ clean = re.sub(r"\s+", " ", clean).strip()
+ return clean
+
+
+def _identity_full_title(identity_json: dict, *, fallback_title: str = "") -> str:
+ display_title = str(identity_json.get("display_title") or fallback_title or "").strip()
+ subtitle = str(identity_json.get("subtitle") or "").strip()
+ if display_title and subtitle:
+ return f"{display_title} — {subtitle}"
+ return display_title or subtitle or fallback_title
+
+
+def _identity_field_present(identity_json: dict, field: str) -> bool:
+ presence = dict(identity_json.get("field_presence", {}) or {})
+ return bool(presence.get(field))
+
+
+def _identity_content_for_query(row: dict, query: str, route_decision: RouteDecision) -> Tuple[str, bool]:
+ identity = dict(row.get("identity_json", {}) or {})
+ fallback_title = str(row.get("filename") or row.get("file_hash") or "Document")
+ full_title = _identity_full_title(identity, fallback_title=fallback_title)
+ exact_field = route_decision.exact_field or _detect_identity_field(query)
+ page_scope = route_decision.page_scope or _detect_page_scope(query)
+
+ if exact_field == "title":
+ return (
+ "Document identity card:\n"
+ f"Display title: {identity.get('display_title') or fallback_title}\n"
+ f"Subtitle: {identity.get('subtitle') or 'not stated'}\n"
+ f"Exact full title: {full_title}",
+ True,
+ )
+
+ if exact_field in {"owner", "author"}:
+ owner = identity.get("named_owner") or identity.get("author")
+ if owner:
+ return (
+ "Document identity card:\n"
+ f"Named owner: {owner}\n"
+ f"Exact full title: {full_title}",
+ True,
+ )
+ return (
+ "Document identity card:\n"
+ "Named owner: not stated on the opening pages.\n"
+ f"Exact full title: {full_title}",
+ True,
+ )
+
+ if exact_field in {"publisher", "edition", "date"}:
+ field_map = {
+ "publisher": ("publisher", "Publisher"),
+ "edition": ("edition", "Edition"),
+ "date": ("document_date", "Document date"),
+ }
+ key, label = field_map[exact_field]
+ if _identity_field_present(identity, key) and identity.get(key):
+ return (
+ "Document identity card:\n"
+ f"{label}: {identity.get(key)}\n"
+ f"Exact full title: {full_title}",
+ True,
+ )
+ return (
+ "Document identity card:\n"
+ f"{label}: not stated on the opening pages.\n"
+ f"Exact full title: {full_title}",
+ True,
+ )
+
+ if exact_field == "cover_text" or page_scope == "cover":
+ cover_text = str(identity.get("cover_text") or "").strip()
+ if cover_text:
+ return (
+ "Opening-page evidence:\n"
+ f"Cover wording: {cover_text}",
+ True,
+ )
+ return (
+ "Opening-page evidence:\n"
+ "Cover wording: not available from the stored opening pages.",
+ True,
+ )
+
+ if exact_field == "opening_page_summary" or page_scope in {"first_page", "opening_pages"}:
+ summary = str(identity.get("opening_page_summary") or "").strip()
+ if summary:
+ return (
+ "Opening-page evidence:\n"
+ f"First-page summary: {summary}",
+ True,
+ )
+ return (
+ "Opening-page evidence:\n"
+ "First-page summary: not available from the stored opening pages.",
+ True,
+ )
+
+ return "", False
+
+
+def _identity_documents_for_query(
+ row: Optional[dict],
+ *,
+ query: str,
+ route_decision: RouteDecision,
+) -> List[Document]:
+ if not row:
+ return []
+ content, is_sufficient = _identity_content_for_query(row, query, route_decision)
+ if not content or not is_sufficient:
+ return []
+ identity = dict(row.get("identity_json", {}) or {})
+ meta = {
+ "source": row.get("filename") or row.get("file_hash") or "Document identity",
+ "file_hash": row.get("file_hash"),
+ "document_type": row.get("document_type"),
+ "chunk_index": "identity-card",
+ "page_numbers": list(identity.get("source_pages") or [1]),
+ "node_type": "leaf",
+ "retrieval_branch": "identity_store",
+ "identity_store_hit": True,
+ "identity_field": route_decision.exact_field,
+ "page_scope": route_decision.page_scope,
+ "relevance_score": 1.0,
+ }
+ return [Document(page_content=content, metadata=meta)]
+
+
+def _page_scope_max_page(route_decision: RouteDecision) -> Optional[int]:
+ scope = route_decision.page_scope
+ if scope in {"cover", "first_page"}:
+ return 1
+ if scope == "opening_pages":
+ return 3
+ if scope and scope.startswith("page_"):
+ try:
+ return int(scope.split("_", 1)[1])
+ except Exception:
+ return None
+ return None
+
+
+def _is_compare_like_query(query: str) -> bool:
+ q = (query or "").lower()
+ return any(
+ phrase in q
+ for phrase in (
+ "compare",
+ "both",
+ "two documents",
+ "two stories",
+ "common",
+ "similar",
+ "difference",
+ "which one",
+ "versus",
+ "vs",
+ )
+ )
+
+
+def _is_summary_like_query(query: str) -> bool:
+ q = (query or "").lower()
+ return any(
+ phrase in q
+ for phrase in ("summarise", "summarize", "summary", "overview", "story")
+ )
+
+
+def _is_relational_query(query: str) -> bool:
+ q = (query or "").lower()
+ return any(
+ phrase in q
+ for phrase in (
+ "relationship",
+ "connected",
+ "connection",
+ "link",
+ "linked",
+ "related",
+ "follows from",
+ "because of",
+ "bridge between",
+ "multi-hop",
+ )
+ )
+
+
+def _is_entity_specific_query(query: str) -> bool:
+ q = (query or "").lower()
+ if _is_summary_like_query(q):
+ return False
+ return any(
+ phrase in q
+ for phrase in (
+ "who",
+ "which one",
+ "talks about",
+ "mentions",
+ "where",
+ "when",
+ "does",
+ "character",
+ "entity",
+ "alehin",
+ )
+ )
+
+
+def _is_multi_part_query(query: str) -> bool:
+ q = (query or "").lower()
+ if _is_compare_like_query(q):
+ return False
+ return (
+ q.count("?") > 1
+ or " and " in q
+ or any(
+ phrase in q
+ for phrase in (
+ "as well as",
+ "along with",
+ "what and why",
+ "how and why",
+ )
+ )
+ )
+
+
+def _is_self_contained_query(query: str) -> bool:
+ q = (query or "").strip().lower()
+ if not q:
+ return False
+ if _query_requires_identity_lookup(q):
+ return True
+ if _PAGE_RANGE_HINT_RE.search(q):
+ return True
+ if any(token in q for token in ("chapter ", "according to the guide", "what is", "why does the guide", "how does this guide", "does this guide")):
+ return True
+ return False
+
+
+def _is_follow_up_reference(
+ query: str,
+ *,
+ session_id: Optional[str] = None,
+ user_id: Optional[str] = None,
+) -> bool:
+ q = (query or "").strip().lower()
+ if not q:
+ return False
+ if _is_self_contained_query(q):
+ return False
+ if not user_id and (not session_id or session_id == "default_session"):
+ return False
+ session_key = _session_cache_key(session_id or "default_session", user_id=user_id)
+ if not _get_session_context(session_key):
+ return False
+ words = q.split()
+ pronouns = {"it", "its", "this", "that", "they", "their", "them", "those"}
+ if len(words) <= 10 and any(word in pronouns for word in words):
+ return True
+ return any(
+ phrase in q
+ for phrase in (
+ "what about",
+ "how about",
+ "tell me more",
+ "go deeper",
+ "expand on that",
+ "the other one",
+ "that one",
+ "the second one",
+ "the first one",
+ )
+ )
+
+
+def _llm_route_classifier(
+ query: str,
+ *,
+ session_id: Optional[str] = None,
+ user_id: Optional[str] = None,
+ priority_file_hashes: Optional[List[str]] = None,
+) -> Optional[RouteDecision]:
+ del session_id, user_id, priority_file_hashes
+ try:
+ from backend.core import config as _cfg
+ from langchain_openai import ChatOpenAI
+
+ models = _cfg.LLM_FALLBACK_LIST or _cfg.CLASSIFIER_LLM_MODELS
+ prompt = (
+ "Classify the user's retrieval intent.\n"
+ "Return only structured JSON.\n"
+ "Classes: exact_fact, page_scoped, summary, follow_up, compare, multi_part, relational, no_retrieval, factoid.\n"
+ "Rules:\n"
+ "- Use exact_fact for title/author/owner/publisher/edition/date questions.\n"
+ "- Use page_scoped for first page, opening pages, cover wording, or explicit page references.\n"
+ "- Use follow_up only when the query is not self-contained and truly depends on prior turns.\n"
+ "- Do not treat 'why does' by itself as relational.\n"
+ "- Set preserve_query=true for exact_fact and page_scoped.\n"
+ "- Set disable_memory=true for exact_fact and page_scoped.\n"
+ f"USER QUESTION: {query}"
+ )
+ for model in models:
+ try:
+ llm = ChatOpenAI(
+ model=model,
+ openai_api_key=_cfg.OPENROUTER_API_KEY,
+ openai_api_base=_cfg.OPENROUTER_BASE_URL,
+ temperature=0.0,
+ max_tokens=120,
+ max_retries=0,
+ timeout=12,
+ )
+ result = llm.with_structured_output(RouteDecision).invoke(
+ [HumanMessage(content=prompt)]
+ )
+ if result and result.route_class:
+ return result
+ except Exception as exc:
+ log.debug("Route classifier model %s unavailable: %s", model, exc)
+ continue
+ except Exception as exc:
+ log.debug("Route classifier fallback skipped: %s", exc)
+ return None
+
+
+def _classify_query_route_decision(
+ query: str,
+ *,
+ session_id: Optional[str] = None,
+ user_id: Optional[str] = None,
+ priority_file_hashes: Optional[List[str]] = None,
+) -> RouteDecision:
+ q = (query or "").strip().lower()
+ if not q:
+ return RouteDecision(route_class="factoid", route_reason="empty_query")
+ if q in {"hi", "hello", "hey", "thanks", "thank you"}:
+ return RouteDecision(route_class="no_retrieval", route_reason="greeting")
+ page_scope = _detect_page_scope(q)
+ exact_field = _detect_identity_field(q)
+ if page_scope:
+ return RouteDecision(
+ route_class="page_scoped",
+ route_reason=f"page_scope:{page_scope}",
+ preserve_query=True,
+ disable_memory=True,
+ page_scope=page_scope,
+ exact_field=exact_field,
+ )
+ if exact_field or _is_exact_fact_query(q):
+ return RouteDecision(
+ route_class="exact_fact",
+ route_reason=f"identity_field:{exact_field or 'generic'}",
+ preserve_query=True,
+ disable_memory=True,
+ exact_field=exact_field,
+ )
+ if _is_follow_up_reference(query, session_id=session_id, user_id=user_id):
+ return RouteDecision(
+ route_class="follow_up",
+ route_reason="session_reference",
+ preserve_query=False,
+ disable_memory=False,
+ )
+ if _is_compare_like_query(query) or bool(priority_file_hashes and len(priority_file_hashes) > 1):
+ return RouteDecision(route_class="compare", route_reason="compare_keywords")
+ if _is_multi_part_query(query):
+ return RouteDecision(route_class="multi_part", route_reason="multi_part_keywords")
+ if _is_summary_like_query(query):
+ return RouteDecision(route_class="summary", route_reason="summary_keywords")
+ if _is_relational_query(query):
+ return RouteDecision(route_class="relational", route_reason="relational_keywords")
+
+ llm_decision = _llm_route_classifier(
+ query,
+ session_id=session_id,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ )
+ if llm_decision and llm_decision.route_class:
+ return llm_decision
+
+ return RouteDecision(route_class="factoid", route_reason="heuristic_default")
+
+
+def _classify_query_route(
+ query: str,
+ *,
+ session_id: Optional[str] = None,
+ user_id: Optional[str] = None,
+ priority_file_hashes: Optional[List[str]] = None,
+) -> str:
+ return _classify_query_route_decision(
+ query,
+ session_id=session_id,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ ).route_class
+
+
+def _rewrite_follow_up_query(
+ query: str,
+ *,
+ chat_history: Optional[List[dict]] = None,
+ session_id: Optional[str] = None,
+ user_id: Optional[str] = None,
+) -> str:
+ if not _is_follow_up_reference(query, session_id=session_id, user_id=user_id):
+ return query
+
+ previous_user_turn = ""
+ for msg in reversed(chat_history or []):
+ if msg.get("role") == "user":
+ content = str(msg.get("content") or "").strip()
+ if content and content.lower() != (query or "").strip().lower():
+ previous_user_turn = content
+ break
+
+ if not previous_user_turn:
+ session_key = _session_cache_key(session_id or "default_session", user_id=user_id)
+ previous_user_turn = str((_get_session_context(session_key) or {}).get("query") or "").strip()
+
+ if not previous_user_turn:
+ return query
+
+ return f"{query.strip()} (follow-up about: {previous_user_turn})"
+
+
+def _classify_priority_query_mode(
+ query: str,
+ priority_file_hashes: Optional[List[str]],
+) -> str:
+ if not priority_file_hashes:
+ return "default"
+ if len(priority_file_hashes) == 1:
+ return "single"
+ if _is_compare_like_query(query):
+ return "explicit_compare"
+ return "generic_pinned"
+
+
+def _dedupe_query_plan(entries: List[dict]) -> List[dict]:
+ deduped: List[dict] = []
+ seen: set[tuple[str, str, tuple[str, ...]]] = set()
+ for entry in entries:
+ query_text = (entry.get("query_text") or "").strip()
+ targets = tuple(entry.get("target_file_hashes") or [])
+ if not query_text:
+ continue
+ key = (entry.get("kind", "shared"), query_text.lower(), targets)
+ if key in seen:
+ continue
+ seen.add(key)
+ deduped.append(
+ {
+ "kind": entry.get("kind", "shared"),
+ "query_text": query_text,
+ "target_file_hashes": list(targets),
+ }
+ )
+ return deduped[:8]
+
+
+def _build_pinned_query_plan(
+ base_query: str,
+ pinned_docs: List[dict],
+ query_mode: str,
+) -> List[dict]:
+ """
+ Build a structured query plan so doc-specific title queries only run against
+ their intended file hashes instead of every pinned file.
+ """
+ query_text = (base_query or "").strip() or "general document information"
+ if not pinned_docs:
+ return [{"kind": "shared", "query_text": query_text, "target_file_hashes": []}]
+
+ docs = [
+ {
+ "file_hash": str(doc.get("file_hash") or ""),
+ "title": _normalise_title_for_query(doc.get("filename") or doc.get("title") or ""),
+ }
+ for doc in pinned_docs
+ if doc.get("file_hash")
+ ]
+ docs = [doc for doc in docs if doc["file_hash"]]
+ if not docs:
+ return [{"kind": "shared", "query_text": query_text, "target_file_hashes": []}]
+
+ entries: List[dict] = []
+ all_hashes = [doc["file_hash"] for doc in docs]
+ entries.append(
+ {"kind": "shared", "query_text": query_text, "target_file_hashes": all_hashes}
+ )
- # ── NEW: NATIVE PAGEINDEX TREE GENERATION ──
- try:
- log.info("🌳 Generating structural PageIndex tree...")
- doc_tree = _build_document_tree(elements)
+ if query_mode == "single":
+ doc = docs[0]
+ if doc["title"]:
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{query_text} {doc['title']}",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ if _is_summary_like_query(query_text):
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{doc['title']} plot summary",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{doc['title']} main themes and characters",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ else:
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{doc['title']} {query_text}",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ return _dedupe_query_plan(entries)
- sb = _build_service_supabase_client()
- sb.table("document_trees").upsert(
- {"file_hash": file_hash, "user_id": user_id, "tree_json": doc_tree},
- on_conflict="user_id,file_hash",
- ).execute()
- log.info("✅ PageIndex tree saved to Supabase.")
- except Exception as e:
- log.warning("⚠️ Failed to generate/save document tree: %s", e)
- # ───────────────────────────────────────────
+ if query_mode == "explicit_compare":
+ if len(docs) >= 2:
+ joined = " and ".join(doc["title"] or doc["file_hash"] for doc in docs[:2])
+ entries.append(
+ {
+ "kind": "shared",
+ "query_text": f"compare {joined}",
+ "target_file_hashes": all_hashes,
+ }
+ )
+ for doc in docs[:4]:
+ title = doc["title"] or doc["file_hash"]
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{title} themes characters summary",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ return _dedupe_query_plan(entries)
- _progress(4, f"Chunking and processing (category: {graph_data.document_type})…")
- chunks = create_chunks(elements)
- if original_filename:
- pdf_path_for_naming = original_filename
- else:
- pdf_path_for_naming = pdf_path
- docs, ids = process_chunks(
- chunks, elements, pdf_path_for_naming, file_hash, graph_data, user_id, pdf_images
- )
+ for doc in docs[:4]:
+ title = doc["title"] or doc["file_hash"]
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{title} {query_text}",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ if _is_summary_like_query(query_text):
+ entries.append(
+ {
+ "kind": "doc_scoped",
+ "query_text": f"{title} plot summary",
+ "target_file_hashes": [doc["file_hash"]],
+ }
+ )
+ return _dedupe_query_plan(entries)
- # --- NATIVE RAPTOR INDEXING ---
- _progress(5, "Building hierarchical reasoning tree (RAPTOR)...")
- docs, ids = build_raptor_tree(docs, ids, user_id)
- smart_name = docs[0].metadata["source"] if docs else os.path.basename(pdf_path)
- if export_json:
- log.info("💾 Exporting processed chunks to local JSON...")
- export_to_json(docs)
+def _should_run_expert(
+ route_info: dict,
+ expert: str,
+ *,
+ threshold: float = 0.2,
+) -> bool:
+ weights = dict(route_info.get("expert_weights", {}) or {})
+ selected = set(route_info.get("selected_experts", []) or [])
+ score = float(weights.get(expert, 0.0) or 0.0)
+ return expert in selected or score >= threshold
- _progress(6, f"Embedding and uploading {len(docs)} tree nodes…")
- upload_to_supabase(docs, ids, access_token=access_token)
- # 🚀 FINAL REGISTRATION: Replaced helper call with direct UPSERT
- # This prevents the "Duplicate Key" error from vanishing your files!
+def _fetch_memory_matches(
+ session_id: str,
+ query: str,
+ access_token: str = None,
+ user_id: str = None,
+ *,
+ match_count: int = 2,
+) -> list[dict]:
+ if not session_id:
+ return []
try:
- sb = _build_service_supabase_client()
- sb.table("ingested_files").upsert(
+ resolved_user_id = _stable_user_id(user_id, access_token)
+ supabase = _build_service_supabase_client()
+ query_vector = get_cached_embedding(query)
+ res = supabase.rpc(
+ "match_memory",
{
- "user_id": user_id,
- "file_hash": file_hash,
- "filename": smart_name,
- "document_type": graph_data.document_type,
- "chunk_count": len(docs),
+ "query_embedding": query_vector,
+ "match_session_id": session_id,
+ "match_count": match_count,
+ "p_user_id": resolved_user_id,
},
- on_conflict="user_id,file_hash",
).execute()
- except Exception as e:
- log.error("Failed to register file: %s", e)
+ return res.data or []
+ except Exception as exc:
+ log.debug("Memory match lookup skipped: %s", exc)
+ return []
- # Invalidate cache
- if access_token:
+
+def _parse_memory_payload(content: str) -> dict:
+ raw = str(content or "").strip()
+ if not raw:
+ return {}
+ if raw.startswith("{") and raw.endswith("}"):
try:
- invalidate_user_cache(user_id, reason="new_document_ingested")
+ parsed = json.loads(raw)
+ if isinstance(parsed, dict):
+ return parsed
except Exception:
- pass
+ return {}
+ return {}
- log.info("Ingestion complete!")
- return {
- "pending_review": True,
- "document_type": graph_data.document_type,
- "filename": smart_name,
- "file_hash": file_hash,
- }
+def _memory_scope_matches(payload: dict, *, file_hashes: Optional[List[str]]) -> bool:
+ if not file_hashes:
+ return True
+ payload_hashes = [str(h) for h in (payload.get("file_hashes") or []) if h]
+ if not payload_hashes:
+ return False
+ return bool(set(payload_hashes).intersection({str(h) for h in file_hashes if h}))
-# =========================================================================== #
-# RETRIEVAL #
-# =========================================================================== #
-def generate_sub_queries(original_query: str) -> List[str]:
- """
- Rewrite user query into 1-3 targeted sub-queries for better recall.
- TASK 3: Now delegates to ProviderFactory.build_chat_llm(purpose="rewriter")
- instead of building ChatOpenAI directly. ProviderFactory reads LLM_FALLBACK_LIST
- from .env first, then falls back to REWRITER_MODELS — no hardcoded model IDs here.
- """
- if not original_query or not original_query.strip():
- return ["general document information"]
+def _build_memory_augmented_queries(
+ query: str,
+ memory_rows: list[dict],
+ *,
+ file_hashes: Optional[List[str]] = None,
+) -> list[str]:
+ queries: list[str] = []
+ for row in memory_rows or []:
+ payload = _parse_memory_payload(row.get("content") or "")
+ if payload:
+ if not _memory_scope_matches(payload, file_hashes=file_hashes):
+ continue
+ content = str(
+ payload.get("summary")
+ or payload.get("query")
+ or payload.get("content")
+ or ""
+ ).strip()
+ else:
+ content = _history_fact_summary(str(row.get("content") or ""))
+ if not content:
+ continue
+ excerpt = content[:160]
+ queries.append(f"{query} {excerpt}")
+ return _dedupe_query_plan(
+ [
+ {"kind": "memory_augmented", "query_text": q, "target_file_hashes": []}
+ for q in queries
+ ]
+ )
- if len(original_query.split()) <= 3:
- return [original_query]
- log.info("Query rewriter: %r", original_query)
+def _bucket_support_label(scores: list[float]) -> str:
+ if not scores:
+ return "none"
+ best = max(scores)
+ if best >= 0.45:
+ return "high"
+ if best >= 0.18:
+ return "medium"
+ if best >= 0.08:
+ return "low"
+ return "very_low"
+
+
+def _doc_meta_value(doc: Document, key: str, default=None):
+ meta = getattr(doc, "metadata", {}) or {}
+ return meta.get(key, default)
+
+
+def _build_doc_evidence_buckets(
+ docs: List[Document],
+ *,
+ doc_title_map: Optional[dict[str, str]] = None,
+) -> dict[str, dict]:
+ buckets: dict[str, dict] = {}
+ for doc in docs:
+ meta = doc.metadata or {}
+ file_hash = str(meta.get("file_hash") or meta.get("source") or "unknown")
+ bucket = buckets.setdefault(
+ file_hash,
+ {
+ "file_hash": file_hash,
+ "source": (doc_title_map or {}).get(file_hash)
+ or str(meta.get("source") or file_hash),
+ "docs": [],
+ "scores": [],
+ "has_summary": False,
+ "has_synthetic_summary": False,
+ "leaf_count": 0,
+ "summary_count": 0,
+ "candidate_count": 0,
+ "coverage_metrics": {},
+ },
+ )
+ bucket["docs"].append(doc)
+ score = float(meta.get("relevance_score") or 0.0)
+ bucket["scores"].append(score)
+ node_type = str(meta.get("node_type") or "leaf")
+ if node_type == "summary":
+ bucket["has_summary"] = True
+ bucket["summary_count"] += 1
+ else:
+ bucket["leaf_count"] += 1
+ if meta.get("synthetic_root_summary"):
+ bucket["has_synthetic_summary"] = True
+ coverage = dict(meta.get("coverage_metrics", {}) or {})
+ if coverage:
+ bucket["coverage_metrics"] = coverage
+
+ for bucket in buckets.values():
+ bucket["support_label"] = _bucket_support_label(bucket["scores"])
+ bucket["best_score"] = round(max(bucket["scores"]) if bucket["scores"] else 0.0, 4)
+ bucket["candidate_count"] = len(bucket["docs"])
+ return buckets
+
+
+def _reorder_bucket_docs_for_query(
+ bucket_docs: List[Document],
+ *,
+ summary_like: bool,
+ entity_specific: bool,
+) -> List[Document]:
+ def _sort_key(doc: Document):
+ meta = doc.metadata or {}
+ node_type = str(meta.get("node_type") or "leaf")
+ synthetic = bool(meta.get("synthetic_root_summary"))
+ score = float(meta.get("relevance_score") or 0.0)
+ if summary_like:
+ kind_rank = 0 if synthetic else (1 if node_type == "summary" else 2)
+ elif entity_specific:
+ kind_rank = 0 if node_type != "summary" else 2
+ else:
+ kind_rank = 0 if node_type == "summary" else 1
+ return (kind_rank, -score)
- prompt = (
- "You are an expert search query optimiser.\n"
- "Break the user's question into 1-3 distinct, targeted search queries.\n"
- "If simple, return 1 optimised version. Do NOT answer it.\n\n"
- f"USER QUESTION: {original_query}"
- )
+ return sorted(bucket_docs, key=_sort_key)
+
+def _materialize_evidence_buckets(
+ docs: List[Document],
+ *,
+ query: str,
+ route_mode: str,
+ doc_title_map: Optional[dict[str, str]] = None,
+) -> tuple[list[Document], list[dict], dict]:
+ summary_like = _is_summary_like_query(query)
+ entity_specific = _is_entity_specific_query(query)
+ buckets = _build_doc_evidence_buckets(docs, doc_title_map=doc_title_map)
+ ordered_docs: list[Document] = []
+ bucket_payloads: list[dict] = []
+
+ for file_hash, bucket in buckets.items():
+ ranked_docs = _reorder_bucket_docs_for_query(
+ list(bucket["docs"]),
+ summary_like=summary_like,
+ entity_specific=entity_specific,
+ )
+ ordered_docs.extend(ranked_docs)
+ thin_doc = (
+ bucket["leaf_count"] <= 1
+ and bucket["has_synthetic_summary"]
+ and (bucket["summary_count"] <= 1)
+ )
+ bucket_payloads.append(
+ {
+ "file_hash": file_hash,
+ "source": bucket["source"],
+ "support_label": bucket["support_label"],
+ "best_score": bucket["best_score"],
+ "candidate_count": bucket["candidate_count"],
+ "has_summary": bucket["has_summary"],
+ "has_synthetic_summary": bucket["has_synthetic_summary"],
+ "leaf_count": bucket["leaf_count"],
+ "summary_count": bucket["summary_count"],
+ "thin_doc": thin_doc,
+ "coverage_metrics": bucket["coverage_metrics"],
+ }
+ )
+
+ support_labels = [bucket["support_label"] for bucket in bucket_payloads]
+ commonality_supported = sum(label in {"high", "medium"} for label in support_labels) >= 2
+ comparison_supported = sum(label != "none" for label in support_labels) >= 2
+ return ordered_docs, bucket_payloads, {
+ "commonality_supported": commonality_supported,
+ "comparison_supported": comparison_supported,
+ "summary_like": summary_like,
+ "entity_specific": entity_specific,
+ "route_mode": route_mode,
+ }
+
+
+_ROUTER_PROTOTYPES: dict[str, list[str]] = {
+ "dense_chunk": [
+ "find the exact answer in the document",
+ "retrieve relevant passages from the file",
+ ],
+ "raptor_summary": [
+ "summarize the document at a high level",
+ "give an overview of the story and main themes",
+ ],
+ "graph_traversal": [
+ "explain how two entities are connected across documents",
+ "trace relationships across documents and facts",
+ ],
+ "episodic_memory": [
+ "answer based on the previous response in this conversation",
+ "use conversation memory to answer follow-up questions",
+ ],
+ "hybrid_compare": [
+ "compare two documents and keep their evidence separated",
+ "find similarities and differences across two stories",
+ ],
+}
+
+
+def _vector_cosine(a: List[float], b: List[float]) -> float:
+ if not a or not b:
+ return 0.0
+ denom = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
+ if not denom:
+ return 0.0
+ return sum(x * y for x, y in zip(a, b)) / denom
+
+
+def _normalize_weight_map(weight_map: dict[str, float]) -> dict[str, float]:
+ cleaned = {k: max(0.0, float(v)) for k, v in weight_map.items()}
+ total = sum(cleaned.values()) or 1.0
+ return {k: round(v / total, 4) for k, v in cleaned.items()}
+
+
+def _llm_router_fallback(query: str) -> Optional[dict[str, float]]:
try:
- # FallbackChatLLM handles rotation + retries internally.
- # with_structured_output requires a raw ChatOpenAI — build the first model
- # via ProviderFactory and attach structured output to it.
- from backend.core import config as _cfg
- from langchain_openai import ChatOpenAI
+ from backend.core.providers import ProviderFactory
- models = _cfg.LLM_FALLBACK_LIST or _cfg.REWRITER_MODELS
- for model in models:
- try:
- llm = ChatOpenAI(
- model=model,
- openai_api_key=_cfg.OPENROUTER_API_KEY,
- openai_api_base=_cfg.OPENROUTER_BASE_URL,
- temperature=0.0,
- max_tokens=150,
- max_retries=0,
- )
- res = llm.with_structured_output(QueryVariants).invoke(
- [HumanMessage(content=prompt)]
- )
- queries = [q.strip() for q in res.sub_queries if q.strip()]
- if queries:
- log.info(
- "Rewriter (%s) → %d sub-queries: %s",
- model,
- len(queries),
- queries,
+ llm = ProviderFactory.build_chat_llm(purpose="rewriter", temperature=0.0)
+ response = llm.invoke(
+ [
+ HumanMessage(
+ content=(
+ "Route this query across retrieval experts. "
+ "Return ONLY a JSON object with numeric weights between 0 and 1 for: "
+ "dense_chunk, raptor_summary, graph_traversal, episodic_memory, hybrid_compare.\n"
+ f"QUERY: {query}"
)
- return queries
- except Exception as exc:
- err = str(exc)
- log.info(
- "Rewriter model %s unavailable (%s) — trying next.", model, err[:60]
)
- continue
+ ]
+ )
+ content = response.content if isinstance(response.content, str) else str(response.content)
+ match = re.search(r"\{.*\}", content, flags=re.DOTALL)
+ if not match:
+ return None
+ parsed = json.loads(match.group(0))
+ return _normalize_weight_map(
+ {
+ key: float(parsed.get(key, 0.0))
+ for key in _ROUTER_PROTOTYPES
+ }
+ )
except Exception as exc:
- log.warning("Rewriter failed entirely: %s — using original query.", exc)
+ log.debug("Router LLM fallback skipped: %s", exc)
+ return None
+
+
+def _route_query_experts(
+ query: str,
+ *,
+ session_id: Optional[str] = None,
+ user_id: Optional[str] = None,
+ priority_file_hashes: Optional[List[str]] = None,
+) -> dict:
+ q = (query or "").strip()
+ q_lower = q.lower()
+ embedding_scores: dict[str, float] = {}
+ try:
+ query_vec = get_cached_embedding(q or "general document information")
+ for expert, prototypes in _ROUTER_PROTOTYPES.items():
+ sims = [
+ _vector_cosine(query_vec, get_cached_embedding(proto))
+ for proto in prototypes
+ ]
+ embedding_scores[expert] = max(0.0, sum(sims) / max(1, len(sims)))
+ except Exception as exc:
+ log.debug("Router embedding stage unavailable: %s", exc)
+ embedding_scores = {expert: 0.2 for expert in _ROUTER_PROTOTYPES}
+
+ feature_scores = {expert: 0.0 for expert in _ROUTER_PROTOTYPES}
+ if _is_summary_like_query(q_lower):
+ feature_scores["raptor_summary"] += 0.35
+ if _is_compare_like_query(q_lower):
+ feature_scores["hybrid_compare"] += 0.45
+ feature_scores["graph_traversal"] += 0.10
+ if any(
+ token in q_lower
+ for token in ("relationship", "connected", "connection", "link", "linked", "why", "cause")
+ ):
+ feature_scores["graph_traversal"] += 0.35
+ if priority_file_hashes and len(priority_file_hashes) > 1:
+ feature_scores["hybrid_compare"] += 0.15
+ if session_id:
+ session_key = _session_cache_key(session_id, user_id=user_id)
+ if session_key in _last_chunks and any(
+ token in q_lower for token in ("it", "this", "that", "previous", "above", "earlier")
+ ):
+ feature_scores["episodic_memory"] += 0.35
+ if not priority_file_hashes:
+ feature_scores["dense_chunk"] += 0.10
+
+ combined = {
+ expert: (embedding_scores.get(expert, 0.0) * 0.65) + (feature_scores.get(expert, 0.0) * 0.35)
+ for expert in _ROUTER_PROTOTYPES
+ }
+ weights = _normalize_weight_map(combined)
+ ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
+ confidence_gap = ranked[0][1] - ranked[1][1] if len(ranked) > 1 else ranked[0][1]
+ if confidence_gap < 0.06 and len(q.split()) >= 4:
+ llm_weights = _llm_router_fallback(q)
+ if llm_weights:
+ weights = llm_weights
+ ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
+ confidence_gap = ranked[0][1] - ranked[1][1] if len(ranked) > 1 else ranked[0][1]
+ return {
+ "expert_weights": weights,
+ "selected_experts": [expert for expert, score in ranked if score >= 0.18][:3],
+ "confidence": round(confidence_gap, 4),
+ }
+
+
+def _combine_local_and_doc_score(
+ local_score: float,
+ doc_prior_score: float,
+ weight: float,
+) -> float:
+ weight = max(0.0, min(1.0, float(weight)))
+ combined = (float(local_score) * (1.0 - weight)) + (float(doc_prior_score) * weight)
+ return round(max(0.0, min(1.0, combined)), 4)
- log.info("All rewriter models exhausted — using original query.")
- return [original_query]
+
+def _confidence_label(score: Optional[float]) -> str:
+ if score is None:
+ return "unknown"
+ if score >= 0.45:
+ return "high"
+ if score >= 0.18:
+ return "medium"
+ if score >= 0.08:
+ return "low"
+ return "very_low"
def _category_pool_filter(
@@ -1584,6 +4014,21 @@ def analyse_intent(
}
has_category = bool(category and category != "All")
has_history = bool(chat_history and len(chat_history) >= 2)
+ route_decision = _classify_query_route_decision(
+ query,
+ session_id=session_id,
+ user_id=user_id,
+ )
+ route_class = route_decision.route_class
+
+ if route_class == "no_retrieval":
+ return {
+ "is_clear": True,
+ "enriched_query": query,
+ "clarification_question": None,
+ "route_class": route_class,
+ "route_reason": route_decision.route_reason,
+ }
result = intent_classifier.predict(query, has_category, has_history)
needs_clar = result["needs_clarification"]
@@ -1620,6 +4065,8 @@ def analyse_intent(
"is_clear": False,
"enriched_query": None,
"clarification_question": _clarification_question_for(query),
+ "route_class": route_class,
+ "route_reason": route_decision.route_reason,
}
# ── Query is clear — reset clarification counter ─────────────────────────
@@ -1636,11 +4083,16 @@ def analyse_intent(
)
# ── Build enriched query for better retrieval signal ─────────────────────
- enriched = query
+ enriched = query if route_decision.preserve_query else _rewrite_follow_up_query(
+ query,
+ chat_history=chat_history,
+ session_id=session_id,
+ user_id=user_id,
+ )
if has_category:
readable = category.replace("_", " ")
- enriched = f"{query} {readable}"
+ enriched = f"{enriched} {readable}".strip()
elif has_history and chat_history:
# If chunk cache already exists for this session, skip enrichment entirely —
@@ -1705,12 +4157,189 @@ def analyse_intent(
"Intent: chunk cache exists for session — skipping enrichment, letting cache handle it."
)
- log.info("Intent: clear — enriched query: %r", enriched[:80])
- return {
- "is_clear": True,
- "enriched_query": enriched,
- "clarification_question": None,
- }
+ log.info("Intent: clear — enriched query: %r", enriched[:80])
+ return {
+ "is_clear": True,
+ "enriched_query": enriched,
+ "clarification_question": None,
+ "route_class": route_class,
+ "route_reason": route_decision.route_reason,
+ }
+
+
+def _is_generic_ambiguous_query(query: str) -> bool:
+ q = (query or "").lower().strip()
+ tokens = [t for t in re.split(r"[^a-z0-9]+", q) if t]
+ if len(tokens) <= 3:
+ return True
+ generic_terms = {
+ "summarize",
+ "summarise",
+ "summary",
+ "story",
+ "document",
+ "file",
+ "explain",
+ "overview",
+ "what",
+ "about",
+ }
+ return len(tokens) <= 8 and sum(1 for t in tokens if t in generic_terms) >= 2
+
+
+def check_query_ambiguity(
+ query: str,
+ access_token: str = None,
+ category: str = None,
+) -> dict:
+ """
+ Category-aware ambiguity detector.
+
+ When multiple files are present in the same category and the query is
+ generic/underspecified, return clarification options so the user can pin
+ one file or ask for synthesis across all relevant files.
+ """
+ AMBIGUITY_GAP = 0.12
+ MIN_MATCH_SCORE = 0.05
+ MIN_WORDS_FOR_SPECIFICITY = 10
+
+ words = query.strip().split()
+ if len(words) > MIN_WORDS_FOR_SPECIFICITY and not _is_generic_ambiguous_query(query):
+ return {
+ "is_ambiguous": False,
+ "clarification_question": None,
+ "clarification_options": None,
+ "top_file_hash": None,
+ }
+
+ try:
+ supabase = _build_supabase_client(access_token)
+ user_id = None
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ user_id = safe_extract_jwt_sub(access_token)
+
+ files_q = supabase.table("ingested_files").select("file_hash, filename")
+ if user_id:
+ files_q = files_q.eq("user_id", user_id)
+ if category and category != "All":
+ files_q = files_q.eq("document_type", category)
+ files_resp = files_q.execute()
+ files = files_resp.data or []
+ if len(files) < 2:
+ return {
+ "is_ambiguous": False,
+ "clarification_question": None,
+ "clarification_options": None,
+ "top_file_hash": None,
+ }
+
+ query_vec = get_cached_embedding(query)
+ file_scores: list[tuple[str, str, float]] = [] # (file_hash, label, best_score)
+
+ for f in files:
+ fhash = f.get("file_hash")
+ fname = (f.get("filename") or fhash or "Untitled").strip()
+ if not fhash:
+ continue
+ try:
+ resp = supabase.rpc(
+ "hybrid_search",
+ {
+ "query_text": query,
+ "query_embedding": query_vec,
+ "match_count": 1,
+ "filter": {"file_hash": fhash},
+ "semantic_weight": 0.7,
+ "keyword_weight": 0.3,
+ },
+ ).execute()
+ rows = resp.data or []
+ if rows:
+ score = float(rows[0].get("combined_score", 0.0))
+ file_scores.append((fhash, fname, score))
+ except Exception as exc:
+ log.warning("Ambiguity check RPC error for %s: %s", str(fhash)[:8], exc)
+
+ if len(file_scores) < 2:
+ return {
+ "is_ambiguous": False,
+ "clarification_question": None,
+ "clarification_options": None,
+ "top_file_hash": None,
+ }
+
+ file_scores.sort(key=lambda x: x[2], reverse=True)
+ top_hash, top_name, top_score = file_scores[0]
+ second_hash, second_name, second_score = file_scores[1]
+ gap = top_score - second_score
+ generic = _is_generic_ambiguous_query(query)
+
+ log.info(
+ "Ambiguity check: top=%r (%.3f), 2nd=%r (%.3f), gap=%.3f, generic=%s, category=%r",
+ top_name,
+ top_score,
+ second_name,
+ second_score,
+ gap,
+ generic,
+ category,
+ )
+
+ if top_score < MIN_MATCH_SCORE:
+ return {
+ "is_ambiguous": False,
+ "clarification_question": None,
+ "clarification_options": None,
+ "top_file_hash": None,
+ }
+
+ should_ask = gap < AMBIGUITY_GAP or generic
+ if should_ask:
+ top_options = file_scores[: min(3, len(file_scores))]
+ options = [
+ {
+ "mode": "single",
+ "label": name.replace(".pdf", ""),
+ "file_hash": fhash,
+ }
+ for fhash, name, _ in top_options
+ ]
+ options.append(
+ {
+ "mode": "all",
+ "label": "Use all matching docs",
+ "file_hashes": [fhash for fhash, _, _ in top_options],
+ }
+ )
+ scope = f" in '{category.replace('_', ' ')}'" if category and category != "All" else ""
+ question = (
+ f"Your question is ambiguous{scope}. "
+ "Should I focus on one document or synthesize across multiple?"
+ )
+ return {
+ "is_ambiguous": True,
+ "clarification_question": question,
+ "clarification_options": options,
+ "top_file_hash": None,
+ }
+
+ return {
+ "is_ambiguous": False,
+ "clarification_question": None,
+ "clarification_options": None,
+ "top_file_hash": top_hash,
+ }
+
+ except Exception as exc:
+ log.warning("Ambiguity check failed, skipping: %s", exc)
+ return {
+ "is_ambiguous": False,
+ "clarification_question": None,
+ "clarification_options": None,
+ "top_file_hash": None,
+ }
def retrieve_chunks(
@@ -1724,33 +4353,106 @@ def retrieve_chunks(
user_id: str = None,
original_query: str = None,
eval_mode: bool = False,
+ priority_file_hashes: List[str] = None,
) -> List[Document]:
-
- # ── Follow-up detection: reuse last chunks if pronoun + short query ───────
- q = query.lower().strip()
- words = q.split()
- FOLLOWUP_PRONOUNS = {"it", "its", "this", "that", "they", "their", "them"}
- log.info(
- "retrieve_chunks: session_id=%r, cache_keys=%r",
- session_id,
- list(_last_chunks.keys()),
- )
+ trace_started_at_ms = int(time.time() * 1000)
+ user_id = _stable_user_id(user_id, access_token)
+ if priority_file_hashes:
+ priority_file_hashes = list(dict.fromkeys([h for h in priority_file_hashes if h]))
+ if not priority_file_hashes:
+ priority_file_hashes = None
session_key = _session_cache_key(session_id, user_id=user_id)
+ session_context = _get_session_context(session_key)
+ can_reuse_session_context = bool(user_id or (session_id and session_id != "default_session"))
+ route_decision = _classify_query_route_decision(
+ original_query or query,
+ session_id=session_id,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ )
+ route_class = route_decision.route_class
+ route_reason = route_decision.route_reason
+ exact_route = route_class in {"exact_fact", "page_scoped"}
+ disable_memory = bool(eval_mode or route_decision.disable_memory)
is_followup = (
- session_key
+ not eval_mode
+ and
+ can_reuse_session_context
+ and
+ route_class == "follow_up"
and session_key in _last_chunks
- and len(words) <= 8
- and any(w in words for w in FOLLOWUP_PRONOUNS)
- and not (category) # if user switched category, do fresh retrieval
+ and not category
+ and not priority_file_hashes
+ and bool(session_context)
)
if is_followup:
log.info(
"Follow-up detected — reusing cached chunks for session %s", session_id[:8]
)
return _last_chunks[session_key]
- # ── Query cache check ─────────────────────────────────────────────────────
- if user_id and not is_followup and not eval_mode:
+
+ if exact_route and user_id and priority_file_hashes and len(priority_file_hashes) == 1:
+ identity_row = _load_or_backfill_identity_row(
+ file_hash=priority_file_hashes[0],
+ user_id=user_id,
+ access_token=access_token,
+ )
+ identity_docs = _identity_documents_for_query(
+ identity_row,
+ query=original_query or query,
+ route_decision=route_decision,
+ )
+ if identity_docs:
+ trace_id = str(uuid.uuid4())
+ quality = {
+ "route_class": route_class,
+ "route_reason": route_reason,
+ "retrieval_relevance_proxy": 1.0,
+ "source_diversity": 1.0,
+ "document_balance": 1.0,
+ "doc_count_included": 1,
+ "doc_count_with_candidates": 1,
+ "thin_doc_count": 0,
+ "abstention_expected": False,
+ "identity_store_hit": True,
+ "identity_store_satisfied": True,
+ "history_injected": False,
+ "memory_injected": False,
+ "selected_page_numbers": identity_docs[0].metadata.get("page_numbers", []),
+ "page_scope_required": _query_requires_opening_page_evidence(original_query or query),
+ "page_scope_supported": True,
+ "rerank_deltas": [],
+ "pre_rerank_candidates": [],
+ }
+ for doc in identity_docs:
+ meta = doc.metadata or {}
+ meta["trace_id"] = trace_id
+ meta["route_mode"] = _classify_priority_query_mode(original_query or query, priority_file_hashes)
+ meta["route_class"] = route_class
+ meta["route_reason"] = route_reason
+ meta["selected_experts"] = ["identity_store"]
+ meta["expert_weights"] = {"identity_store": 1.0}
+ meta["doc_diagnostics"] = []
+ meta["failure_modes"] = []
+ meta["trace_started_at_ms"] = trace_started_at_ms
+ meta["trace_quality"] = quality
+ meta["candidate_counts"] = {str(priority_file_hashes[0]): 1}
+ meta["evidence_buckets"] = []
+ meta["answer_policy"] = {}
+ meta["embedding_variant"] = config.RETRIEVAL_EMBEDDING_VARIANT
+ doc.metadata = meta
+ if session_id and not eval_mode:
+ _remember_session_retrieval(
+ session_key=session_key,
+ query=original_query or query,
+ chunks=identity_docs,
+ )
+ return identity_docs
+ # ── Query cache check ──────────────────────────────────────────────────────
+ # Skip semantic cache entirely when graph pinning is active — the answer
+ # must be scoped to the pinned document(s), not a corpus-wide cached answer.
+ if user_id and not is_followup and not eval_mode and not priority_file_hashes and not exact_route:
try:
cache_query = original_query or query
query_vec = get_cached_embedding(cache_query)
@@ -1773,15 +4475,146 @@ def retrieve_chunks(
except Exception as e:
log.warning("Cache check failed, proceeding normally: %s", e)
- queries_to_run = generate_sub_queries(query)
-
- dynamic_k = 10 if len(queries_to_run) > 1 else 5
- fetch_k = 15 if len(queries_to_run) > 1 else 10
-
# Quality bar — 0.35 is the sweet spot for Cohere rerank-multilingual-v3.0
RELEVANCE_THRESHOLD = 0.35
- supabase = _build_supabase_client(access_token)
+ supabase = _build_service_supabase_client()
+ route_info = _route_query_experts(
+ original_query or query,
+ session_id=session_id,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ )
+ routing_query = original_query or query
+ query_mode = _classify_priority_query_mode(routing_query, priority_file_hashes)
+ duplicate_collapse_count = 0
+ run_raptor_summary = _should_run_expert(
+ route_info, "raptor_summary", threshold=0.22
+ ) or route_class == "summary"
+ run_memory_aug = (not disable_memory) and _should_run_expert(
+ route_info, "episodic_memory", threshold=0.24
+ )
+ run_graph = (not exact_route) and (
+ _graph_query_should_run(routing_query, route_info) or route_class == "relational"
+ )
+ active_experts = ["dense_chunk"]
+ if exact_route:
+ active_experts = ["dense_chunk", "identity_store"]
+ if run_raptor_summary:
+ active_experts.append("raptor_summary")
+ if run_memory_aug:
+ active_experts.append("episodic_memory")
+ if run_graph:
+ active_experts.append("graph_traversal")
+ if query_mode in {"generic_pinned", "explicit_compare"}:
+ active_experts.append("hybrid_compare")
+
+ pinned_docs: List[dict] = []
+ if priority_file_hashes:
+ try:
+ title_query = supabase.table("ingested_files").select("file_hash, filename")
+ if user_id:
+ title_query = title_query.eq("user_id", user_id)
+ title_rows = title_query.in_("file_hash", priority_file_hashes).execute()
+ rows_by_hash = {
+ row.get("file_hash"): row
+ for row in (title_rows.data or [])
+ if row.get("file_hash")
+ }
+ pinned_docs = [
+ {
+ "file_hash": h,
+ "filename": rows_by_hash.get(h, {}).get("filename") or h,
+ }
+ for h in priority_file_hashes
+ ]
+ except Exception as exc:
+ log.warning("Could not load pinned file titles: %s", exc)
+
+ if priority_file_hashes:
+ query_plan = _build_pinned_query_plan(query, pinned_docs, query_mode)
+ log.info(
+ "Pinned retrieval plan (%s): %s",
+ query_mode,
+ [
+ f"{entry['kind']}:{entry['query_text']}=>{len(entry['target_file_hashes']) or 'all'}"
+ for entry in query_plan
+ ],
+ )
+ else:
+ query_plan = [
+ {"kind": "shared", "query_text": sub_query, "target_file_hashes": []}
+ for sub_query in generate_sub_queries(
+ original_query or query if route_decision.preserve_query else query,
+ route_class=route_class,
+ )
+ ]
+
+ operational_query_plan: List[dict] = []
+ for entry in query_plan:
+ operational_query_plan.append(
+ {
+ **entry,
+ "branch": "dense_chunk",
+ "node_type_filter": None,
+ }
+ )
+ if run_raptor_summary:
+ operational_query_plan.append(
+ {
+ **entry,
+ "branch": "raptor_summary",
+ "node_type_filter": "summary",
+ }
+ )
+
+ if run_memory_aug:
+ memory_rows = _fetch_memory_matches(
+ session_id,
+ routing_query,
+ access_token=access_token,
+ user_id=user_id,
+ match_count=2,
+ )
+ memory_entries = _build_memory_augmented_queries(
+ routing_query,
+ memory_rows,
+ file_hashes=priority_file_hashes,
+ )
+ for entry in memory_entries:
+ operational_query_plan.append(
+ {
+ **entry,
+ "branch": "episodic_memory",
+ "node_type_filter": "summary" if run_raptor_summary else None,
+ }
+ )
+
+ if operational_query_plan:
+ deduped_operational: List[dict] = []
+ seen_operational: set[tuple[str, str, tuple[str, ...], str, str]] = set()
+ for entry in operational_query_plan:
+ key = (
+ entry.get("kind", "shared"),
+ str(entry.get("query_text") or "").strip().lower(),
+ tuple(entry.get("target_file_hashes") or []),
+ str(entry.get("branch") or "dense_chunk"),
+ str(entry.get("node_type_filter") or ""),
+ )
+ if key in seen_operational:
+ continue
+ seen_operational.add(key)
+ deduped_operational.append(entry)
+ operational_query_plan = deduped_operational
+
+ if config.ENABLE_RETRIEVE_THEN_STUFF and route_class in {"compare", "multi_part"}:
+ dynamic_k = max(config.RETRIEVE_THEN_STUFF_K, 10)
+ fetch_k = max(config.RETRIEVE_THEN_STUFF_FETCH_K, 15)
+ else:
+ dynamic_k = 10 if len(operational_query_plan) > 1 else 5
+ fetch_k = 15 if len(operational_query_plan) > 1 else 10
+ explicit_multi_doc_query = query_mode == "explicit_compare"
+ multi_pin_mode = bool(priority_file_hashes and len(priority_file_hashes) > 1)
filter_dict: dict = {}
if source_file:
@@ -1792,48 +4625,127 @@ def retrieve_chunks(
all_candidates: list = []
seen_ids: set = set()
+ doc_candidate_counts: dict[str, int] = {
+ str(doc.get("file_hash")): 0 for doc in pinned_docs if doc.get("file_hash")
+ }
- for sub_query in queries_to_run:
- log.info("Hybrid search: %r | filter: %s", sub_query, filter_dict)
+ for plan_entry in operational_query_plan:
+ sub_query = plan_entry["query_text"]
+ target_hashes = plan_entry.get("target_file_hashes") or []
+ log.info(
+ "Hybrid search (%s/%s): %r | filter: %s",
+ plan_entry.get("kind", "shared"),
+ plan_entry.get("branch", "dense_chunk"),
+ sub_query,
+ filter_dict,
+ )
try:
query_vector = get_cached_embedding(sub_query)
except Exception as exc:
log.error("Embedding failed for %r: %s", sub_query, exc)
continue
- rpc_params = {
- "query_text": sub_query,
- "query_embedding": query_vector,
- "match_count": fetch_k,
- "filter": filter_dict,
- "semantic_weight": alpha,
- "keyword_weight": round(1.0 - alpha, 2),
- }
- try:
- response = supabase.rpc("hybrid_search", rpc_params).execute()
- log.info("RPC returned %d rows for %r", len(response.data or []), sub_query)
- except Exception as exc:
- log.error("RPC error for %r: %s", sub_query, exc)
- continue
- for chunk in response.data or []:
- chunk_id = chunk.get("id")
- if chunk_id not in seen_ids:
- seen_ids.add(chunk_id)
- all_candidates.append(chunk)
+ # ── Build filter set for this sub-query ─────────────────────────────
+ # When graph nodes are pinned we run one RPC per pinned file to enforce
+ # a hard document boundary. Otherwise we run one RPC with the existing
+ # category / source filter (or no filter at all).
+ branch_filter = {}
+ if plan_entry.get("node_type_filter"):
+ branch_filter["node_type"] = plan_entry["node_type_filter"]
+ if priority_file_hashes:
+ scoped_hashes = target_hashes or priority_file_hashes
+ filter_variations = [
+ dict(filter_dict, file_hash=phash, **branch_filter)
+ for phash in scoped_hashes
+ ]
+ log.info(
+ "Graph pin active — querying %d pinned file(s) for %s query.",
+ len(scoped_hashes),
+ plan_entry.get("kind", "shared"),
+ )
+ else:
+ filter_variations = [dict(filter_dict, **branch_filter)]
- # If category filter returned nothing, retry without filter as fallback
- if not all_candidates and filter_dict:
+ for f_var in filter_variations:
+ rpc_params = {
+ "query_text": sub_query,
+ "query_embedding": query_vector,
+ "match_count": fetch_k,
+ "filter": f_var,
+ "semantic_weight": alpha,
+ "keyword_weight": round(1.0 - alpha, 2),
+ "p_user_id": user_id,
+ }
+ try:
+ response = supabase.rpc("hybrid_search", rpc_params).execute()
+ log.info(
+ "RPC returned %d rows for %r (filter=%s)",
+ len(response.data or []), sub_query, f_var,
+ )
+ except Exception as exc:
+ log.error("RPC error for %r: %s", sub_query, exc)
+ continue
+ for chunk in response.data or []:
+ chunk_id = chunk.get("id")
+ if chunk_id not in seen_ids:
+ seen_ids.add(chunk_id)
+ chunk_fhash = str(
+ (chunk.get("metadata", {}) or {}).get("file_hash")
+ or chunk.get("file_hash")
+ or ""
+ )
+ if chunk_fhash:
+ doc_candidate_counts[chunk_fhash] = doc_candidate_counts.get(chunk_fhash, 0) + 1
+ chunk_meta = dict(chunk.get("metadata", {}) or {})
+ chunk_meta["retrieval_branch"] = plan_entry.get("branch", "dense_chunk")
+ chunk["metadata"] = chunk_meta
+ all_candidates.append(chunk)
+
+ if run_graph:
+ graph_candidates = _retrieve_graph_candidates(
+ routing_query,
+ route_mode=query_mode,
+ access_token=access_token,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ limit=max(4, dynamic_k),
+ )
+ if graph_candidates:
+ log.info("Graph retrieval added %d candidate(s).", len(graph_candidates))
+ for chunk in graph_candidates:
+ chunk_id = chunk.get("id")
+ if chunk_id in seen_ids:
+ continue
+ seen_ids.add(chunk_id)
+ chunk_fhash = str(
+ (chunk.get("metadata", {}) or {}).get("file_hash")
+ or chunk.get("file_hash")
+ or ""
+ )
+ if chunk_fhash:
+ doc_candidate_counts[chunk_fhash] = doc_candidate_counts.get(chunk_fhash, 0) + 1
+ all_candidates.append(chunk)
+
+ # If the category filter returned nothing, retry without a filter (grace fallback).
+ # But: if graph pinning is active we must NOT fall back to the full corpus —
+ # the user explicitly chose a document boundary.
+ if not all_candidates and filter_dict and not priority_file_hashes:
log.warning("Filtered search returned 0 results — retrying without filter")
- for sub_query in queries_to_run:
+ for plan_entry in operational_query_plan:
+ sub_query = plan_entry["query_text"]
try:
query_vector = get_cached_embedding(sub_query)
+ fallback_filter = {}
+ if plan_entry.get("node_type_filter"):
+ fallback_filter["node_type"] = plan_entry["node_type_filter"]
response = supabase.rpc(
"hybrid_search",
{
"query_text": sub_query,
"query_embedding": query_vector,
"match_count": fetch_k,
- "filter": {},
+ "filter": fallback_filter,
+ "p_user_id": user_id,
},
).execute()
log.info("Unfiltered RPC returned %d rows", len(response.data or []))
@@ -1841,16 +4753,67 @@ def retrieve_chunks(
chunk_id = chunk.get("id")
if chunk_id not in seen_ids:
seen_ids.add(chunk_id)
+ chunk_meta = dict(chunk.get("metadata", {}) or {})
+ chunk_meta["retrieval_branch"] = plan_entry.get("branch", "dense_chunk")
+ chunk["metadata"] = chunk_meta
all_candidates.append(chunk)
except Exception as exc:
log.error("Fallback RPC error: %s", exc)
continue
+ page_scope_max = _page_scope_max_page(route_decision)
+
+ def _candidate_pages(candidate: dict) -> List[int]:
+ meta = dict(candidate.get("metadata", {}) or {})
+ pages = []
+ for page in meta.get("page_numbers") or []:
+ if isinstance(page, int):
+ pages.append(page)
+ elif str(page).isdigit():
+ pages.append(int(page))
+ return sorted(set(pages))
+
+ def _candidate_bias(candidate: dict) -> float:
+ meta = dict(candidate.get("metadata", {}) or {})
+ pages = _candidate_pages(candidate)
+ node_type = str(meta.get("node_type") or "leaf")
+ boost = 0.0
+ if exact_route:
+ if node_type == "leaf":
+ boost += 0.04
+ if pages and min(pages) <= 3:
+ boost += 0.08
+ if page_scope_max and pages and min(pages) <= page_scope_max:
+ boost += 0.18
+ return boost
+
+ opening_page_candidate_count = sum(
+ 1
+ for candidate in all_candidates
+ if (
+ _candidate_pages(candidate)
+ and min(_candidate_pages(candidate)) <= (page_scope_max or 3)
+ )
+ )
+
+ rerank_query_text = (
+ (original_query or query)
+ if route_decision.preserve_query or exact_route
+ else query
+ )
+ rerank_audit = {
+ "opening_page_candidate_count": opening_page_candidate_count,
+ "opening_page_selected_count": 0,
+ "pre_rerank_candidates": [],
+ "rerank_deltas": [],
+ }
+
if not all_candidates:
log.warning("No chunks found after all attempts.")
return []
log.info("%d candidates — Cohere reranking...", len(all_candidates))
+ multi_pin_mode = bool(priority_file_hashes and len(priority_file_hashes) > 1)
# ── Shared post-processing (used by all three rerank paths) ──────────────
def _apply_threshold_and_filter(
@@ -1860,7 +4823,7 @@ def retrieve_chunks(
ranked_with_scores: list of (index_into_all_candidates, score)
Returns final List[Document]
"""
- nonlocal dynamic_k
+ nonlocal dynamic_k, duplicate_collapse_count
if reranker == "crossencoder":
base_threshold = 0.0001 # sigmoid of ms-marco logits is very small
else:
@@ -1879,20 +4842,38 @@ def retrieve_chunks(
dynamic_k,
)
+ adjusted_scores: list[tuple[int, float]] = []
+ for idx, score in ranked_with_scores:
+ adjusted_score = round(
+ max(0.0, min(1.0, float(score) + _candidate_bias(all_candidates[idx]))),
+ 4,
+ )
+ adjusted_scores.append((idx, adjusted_score))
+
above = [
(idx, score)
- for idx, score in ranked_with_scores
+ for idx, score in adjusted_scores
if score >= effective_threshold
]
if not above:
- if ranked_with_scores and ranked_with_scores[0][1] >= 0.02:
+ if priority_file_hashes and adjusted_scores:
+ keep_n = min(
+ max(1, len(priority_file_hashes)),
+ len(adjusted_scores),
+ )
+ log.info(
+ "Pinned scope with low scores — keeping best %d chunk(s) as fallback.",
+ keep_n,
+ )
+ above = adjusted_scores[:keep_n]
+ elif adjusted_scores and adjusted_scores[0][1] >= 0.02:
log.info(
"Nothing above %.2f — returning top-1 as fallback (score: %.2f).",
effective_threshold,
- ranked_with_scores[0][1],
+ adjusted_scores[0][1],
)
- above = [ranked_with_scores[0]]
+ above = [adjusted_scores[0]]
else:
log.info("No chunks retrieved at all.")
return []
@@ -1904,14 +4885,19 @@ def retrieve_chunks(
meta["relevance_score"] = round(score, 4)
scored_candidates.append({**doc_data, "metadata": meta})
+ collapsed_candidates, collapsed_count = _collapse_near_duplicate_candidates(
+ scored_candidates
+ )
+ duplicate_collapse_count += collapsed_count
+
max_per_source = config.MAX_CHUNKS_PER_SOURCE
if filter_dict.get("document_type"):
diverse = _diversity_filter(
- scored_candidates, top_k=dynamic_k, max_per_source=max_per_source
+ collapsed_candidates, top_k=dynamic_k, max_per_source=max_per_source
)
else:
diverse = _category_pool_filter(
- scored_candidates,
+ collapsed_candidates,
top_k=dynamic_k,
category_slots=config.CATEGORY_SLOTS,
max_per_source=max_per_source,
@@ -1929,32 +4915,211 @@ def retrieve_chunks(
)
return docs
+ def _candidate_file_hash(candidate: dict) -> str:
+ meta = candidate.get("metadata", {}) or {}
+ return str(
+ meta.get("file_hash")
+ or candidate.get("file_hash")
+ or meta.get("source")
+ or "unknown"
+ )
+
+ doc_scores: dict[str, float] = {}
+ doc_diagnostics: list[dict] = []
+ failure_modes: set[str] = set()
+
# ── Path 1: Cohere ────────────────────────────────────────────────────────
retrieved = []
try:
co = cohere.Client(config.COHERE_API_KEY)
- rerank_response = co.rerank(
- model="rerank-multilingual-v3.0",
- query=query,
- documents=[c["content"] for c in all_candidates],
- top_n=min(dynamic_k * 3, len(all_candidates)),
- )
- ranked_with_scores = sorted(
- [(r.index, r.relevance_score) for r in rerank_response.results],
- key=lambda x: x[1],
- reverse=True,
- )
- retrieved = _apply_threshold_and_filter(ranked_with_scores, reranker="cohere")
- log.info("Reranker: Cohere")
+
+ def _run_global_cohere() -> tuple[list, list]:
+ rerank_response = co.rerank(
+ model="rerank-multilingual-v3.0",
+ query=rerank_query_text,
+ documents=[c["content"] for c in all_candidates],
+ top_n=min(dynamic_k * 3, len(all_candidates)),
+ )
+ ranked = sorted(
+ [(r.index, r.relevance_score) for r in rerank_response.results],
+ key=lambda x: x[1],
+ reverse=True,
+ )
+ docs = _apply_threshold_and_filter(ranked, reranker="cohere")
+ return docs, ranked
- # Fire-and-forget: log all Cohere scores for future CrossEncoder distillation
- _log_rerank_feedback(
- query=query,
- all_candidates=all_candidates,
- ranked_with_scores=ranked_with_scores,
- selected_docs=retrieved,
- user_id=user_id,
- )
+ ranked_with_scores = []
+ if multi_pin_mode:
+ DOC_RELEVANCE_THRESHOLD = 0.10
+ DOC_PRIOR_WEIGHT = 0.20
+
+ grouped: dict[str, list[int]] = {}
+ for idx, cand in enumerate(all_candidates):
+ grouped.setdefault(_candidate_file_hash(cand), []).append(idx)
+
+ ordered_hashes = [h for h in (priority_file_hashes or []) if h in grouped]
+ if len(ordered_hashes) >= 2:
+ per_doc_ranked: dict[str, list[tuple[int, float]]] = {}
+
+ for fhash in ordered_hashes:
+ idxs = grouped.get(fhash, [])
+ if not idxs:
+ continue
+ doc_texts = [all_candidates[i].get("content", "") for i in idxs]
+ rr = co.rerank(
+ model="rerank-multilingual-v3.0",
+ query=rerank_query_text,
+ documents=doc_texts,
+ top_n=min(dynamic_k * 3, len(doc_texts)),
+ )
+ local_ranked = sorted(
+ [(r.index, float(r.relevance_score)) for r in rr.results],
+ key=lambda x: x[1],
+ reverse=True,
+ )
+ mapped = [(idxs[local_i], score) for local_i, score in local_ranked]
+ if not mapped:
+ continue
+
+ best = mapped[0][1]
+ top_window = [s for _, s in mapped[: min(3, len(mapped))]]
+ mean_top = sum(top_window) / max(1, len(top_window))
+ doc_scores[fhash] = (0.7 * best) + (0.3 * mean_top)
+ per_doc_ranked[fhash] = mapped
+
+ if query_mode in {"generic_pinned", "explicit_compare"}:
+ log.info(
+ "Pinned multi-doc mode detected — preserving all pinned docs with candidates."
+ )
+ included_scores = dict(doc_scores)
+ else:
+ included_scores = {
+ fhash: score
+ for fhash, score in doc_scores.items()
+ if score >= DOC_RELEVANCE_THRESHOLD
+ }
+ if not included_scores and doc_scores:
+ best_fhash = max(doc_scores, key=doc_scores.get)
+ included_scores = {best_fhash: doc_scores[best_fhash]}
+
+ for fhash, score in doc_scores.items():
+ if fhash not in included_scores:
+ log.info(
+ "Pinned doc %s excluded (score %.3f < %.2f)",
+ str(fhash)[:8],
+ score,
+ DOC_RELEVANCE_THRESHOLD,
+ )
+
+ total_budget = min(
+ dynamic_k,
+ sum(
+ len(per_doc_ranked.get(fhash, []))
+ for fhash in included_scores
+ ),
+ )
+
+ if included_scores and total_budget > 0:
+ allocation: dict[str, int] = {}
+ score_sum = sum(included_scores.values()) or float(len(included_scores))
+
+ for fhash, score in sorted(
+ included_scores.items(), key=lambda x: x[1], reverse=True
+ ):
+ proposed = max(1, round(total_budget * (score / score_sum)))
+ allocation[fhash] = min(proposed, len(per_doc_ranked.get(fhash, [])))
+
+ # Trim or grow per-doc allocations until their sum equals exactly total_budget.
+ while sum(allocation.values()) > total_budget:
+ trim_targets = [
+ h
+ for h, n in sorted(
+ allocation.items(), key=lambda kv: included_scores[kv[0]]
+ )
+ if n > 1
+ ]
+ if not trim_targets:
+ break
+ allocation[trim_targets[0]] -= 1
+
+ while sum(allocation.values()) < total_budget:
+ grow_targets = [
+ h
+ for h, n in sorted(
+ allocation.items(),
+ key=lambda kv: included_scores[kv[0]],
+ reverse=True,
+ )
+ if n < len(per_doc_ranked.get(h, []))
+ ]
+ if not grow_targets:
+ break
+ allocation[grow_targets[0]] += 1
+
+ balanced: list[tuple[int, float]] = []
+ for fhash, n_chunks in allocation.items():
+ prior = included_scores.get(fhash, 0.0)
+ for global_idx, local_score in per_doc_ranked[fhash][:n_chunks]:
+ combined_score = _combine_local_and_doc_score(
+ local_score,
+ prior,
+ DOC_PRIOR_WEIGHT,
+ )
+ balanced.append((global_idx, combined_score))
+
+ ranked_with_scores = sorted(balanced, key=lambda x: x[1], reverse=True)
+ scored_candidates = []
+ for idx, score in ranked_with_scores:
+ doc_data = all_candidates[idx]
+ meta = dict(doc_data.get("metadata", {}) or {})
+ fhash = _candidate_file_hash(doc_data)
+ meta["relevance_score"] = round(float(score), 4)
+ meta["file_relevance_score"] = round(
+ float(included_scores.get(fhash, 0.0)), 4
+ )
+ scored_candidates.append({**doc_data, "metadata": meta})
+
+ diverse = _diversity_filter(
+ scored_candidates,
+ top_k=total_budget,
+ max_per_source=config.MAX_CHUNKS_PER_SOURCE,
+ )
+ retrieved = []
+ for c in diverse:
+ meta = dict(c.get("metadata", {}) or {})
+ if c.get("id") is not None:
+ meta["id"] = str(c["id"])
+ retrieved.append(Document(page_content=c["content"], metadata=meta))
+
+ log.info(
+ "Reranker: Cohere (balanced multi-doc mode, %d docs included)",
+ len(included_scores),
+ )
+ else:
+ log.info(
+ "Balanced multi-doc mode found no viable candidates — falling back to global rerank."
+ )
+ retrieved, ranked_with_scores = _run_global_cohere()
+ log.info("Reranker: Cohere")
+ else:
+ log.info(
+ "Balanced multi-doc mode skipped (could not group >=2 pinned docs) — falling back to global rerank."
+ )
+ retrieved, ranked_with_scores = _run_global_cohere()
+ log.info("Reranker: Cohere")
+ else:
+ retrieved, ranked_with_scores = _run_global_cohere()
+ log.info("Reranker: Cohere")
+
+ if ranked_with_scores:
+ # Fire-and-forget: log Cohere-style scores for future distillation
+ _log_rerank_feedback(
+ query=rerank_query_text,
+ all_candidates=all_candidates,
+ ranked_with_scores=ranked_with_scores,
+ selected_docs=retrieved,
+ user_id=user_id,
+ )
# ── Path 2: Local CrossEncoder fallback ───────────────────────────────────
except Exception as cohere_exc:
@@ -1971,7 +5136,7 @@ def retrieve_chunks(
)
retrieve_chunks._cross_encoder = _ce_model # cache on function
- pairs = [(query, c["content"]) for c in all_candidates]
+ pairs = [(rerank_query_text, c["content"]) for c in all_candidates]
scores = _ce_model.predict(pairs)
normalized = [1 / (1 + math.exp(-float(s))) for s in scores]
ranked_with_scores = sorted(
@@ -1987,7 +5152,7 @@ def retrieve_chunks(
# ── Path 3: Lexical last resort ───────────────────────────────────────
except Exception as ce_exc:
log.warning("CrossEncoder failed (%s) — falling back to lexical.", ce_exc)
- q_tokens = set((query or "").lower().split())
+ q_tokens = set((rerank_query_text or "").lower().split())
def _lex_score(text: str) -> float:
t_tokens = set((text or "").lower().split())
@@ -2012,21 +5177,85 @@ def retrieve_chunks(
log.info("Final %d chunks.", len(retrieved))
+ pre_rank_lookup: dict[str, int] = {}
+ pre_rerank_preview: list[dict] = []
+ for rank, (idx, score) in enumerate(ranked_with_scores[: min(8, len(ranked_with_scores))], start=1):
+ candidate = all_candidates[idx]
+ meta = dict(candidate.get("metadata", {}) or {})
+ candidate_id = str(candidate.get("id") or meta.get("id") or idx)
+ pre_rank_lookup[candidate_id] = rank
+ pre_rerank_preview.append(
+ {
+ "candidate_id": candidate_id,
+ "source": meta.get("source"),
+ "page_numbers": meta.get("page_numbers", []),
+ "node_type": meta.get("node_type", "leaf"),
+ "raw_score": round(float(score), 4),
+ }
+ )
+ rerank_audit["pre_rerank_candidates"] = pre_rerank_preview
+
# ── Token budget enforcement ──────────────────────────────────────────────
# Trim chunks that would push the LLM context over MAX_CONTEXT_CHARS.
# Highest-ranked chunks are always kept — only tail overflow is dropped.
if retrieved:
+ ordered_for_budget = list(retrieved)
+
+ # In explicit multi-doc pinned mode, preserve at least one top chunk from
+ # each pinned document before filling with additional chunks by rank.
+ # This prevents the final context-budget trim from silently collapsing
+ # back to one document after balanced reranking.
+ if multi_pin_mode and priority_file_hashes and len(priority_file_hashes) > 1:
+ ranked_by_hash: dict[str, List[Document]] = {}
+ for doc in retrieved:
+ meta = doc.metadata or {}
+ fhash = str(
+ meta.get("file_hash") or meta.get("source") or "unknown"
+ )
+ ranked_by_hash.setdefault(fhash, []).append(doc)
+
+ anchors: List[Document] = []
+ anchored_keys: set[tuple[str, str]] = set()
+ for pinned_hash in priority_file_hashes:
+ doc_list = ranked_by_hash.get(str(pinned_hash), [])
+ if not doc_list:
+ continue
+ top_doc = doc_list[0]
+ key = (
+ str((top_doc.metadata or {}).get("file_hash") or ""),
+ str((top_doc.metadata or {}).get("id") or id(top_doc)),
+ )
+ if key in anchored_keys:
+ continue
+ anchored_keys.add(key)
+ anchors.append(top_doc)
+
+ if len(anchors) >= 2:
+ ordered_for_budget = anchors + [
+ doc
+ for doc in retrieved
+ if (
+ str((doc.metadata or {}).get("file_hash") or ""),
+ str((doc.metadata or {}).get("id") or id(doc)),
+ )
+ not in anchored_keys
+ ]
+ log.info(
+ "Context budget guard: preserving %d anchor chunk(s) across pinned docs before trim.",
+ len(anchors),
+ )
+
budgeted: List[Document] = []
total_chars = 0
- for doc in retrieved:
+ for doc in ordered_for_budget:
chars = len(doc.page_content)
if total_chars + chars > config.MAX_CONTEXT_CHARS:
log.info(
"Context budget (%d chars) hit at chunk %d/%d — dropping %d remaining.",
config.MAX_CONTEXT_CHARS,
len(budgeted),
- len(retrieved),
- len(retrieved) - len(budgeted),
+ len(ordered_for_budget),
+ len(ordered_for_budget) - len(budgeted),
)
break
budgeted.append(doc)
@@ -2036,13 +5265,240 @@ def retrieve_chunks(
"Context budget: %d chars across %d/%d chunks.",
total_chars,
len(budgeted),
- len(retrieved),
+ len(ordered_for_budget),
)
retrieved = budgeted
- if session_id and retrieved:
- with _last_chunks_lock:
- _last_chunks[session_key] = retrieved
+ page_scope_required = _query_requires_opening_page_evidence(original_query or query)
+ page_scope_supported = True
+ if exact_route and page_scope_required:
+ page_scope_supported = any(
+ (doc.metadata or {}).get("page_numbers")
+ and min((doc.metadata or {}).get("page_numbers") or [999]) <= (page_scope_max or 3)
+ for doc in retrieved
+ )
+ if not page_scope_supported:
+ failure_modes.add("page_scope_violation")
+ if route_decision.page_scope == "cover":
+ failure_modes.add("cover_page_miss")
+ source_name = (
+ (pinned_docs[0].get("filename") if pinned_docs else None)
+ or (category.replace("_", " ") if category else None)
+ or "Selected document"
+ )
+ retrieved = [
+ Document(
+ page_content=(
+ "Opening-page evidence was not found in the requested page range. "
+ "Answer with a precise abstention instead of using later pages."
+ ),
+ metadata={
+ "source": source_name,
+ "file_hash": priority_file_hashes[0] if priority_file_hashes else None,
+ "document_type": category,
+ "chunk_index": "page-scope-abstention",
+ "page_numbers": [page_scope_max or 1],
+ "node_type": "leaf",
+ "retrieval_branch": "page_scope_guard",
+ "relevance_score": 0.0,
+ },
+ )
+ ]
+
+ doc_title_map = {
+ str(doc.get("file_hash")): (
+ doc.get("filename") or doc.get("title") or doc.get("file_hash") or "Unknown"
+ )
+ for doc in pinned_docs
+ if doc.get("file_hash")
+ }
+ if retrieved:
+ retrieved, evidence_buckets, answer_policy = _materialize_evidence_buckets(
+ retrieved,
+ query=original_query or query,
+ route_mode=query_mode,
+ doc_title_map=doc_title_map,
+ )
+ else:
+ evidence_buckets = []
+ answer_policy = {
+ "commonality_supported": False,
+ "comparison_supported": False,
+ "summary_like": _is_summary_like_query(original_query or query),
+ "entity_specific": _is_entity_specific_query(original_query or query),
+ "route_mode": query_mode,
+ }
+
+ rerank_deltas = []
+ for position, doc in enumerate(retrieved, start=1):
+ meta = doc.metadata or {}
+ candidate_id = str(meta.get("id") or "")
+ pre_rank = pre_rank_lookup.get(candidate_id)
+ rerank_deltas.append(
+ {
+ "candidate_id": candidate_id or meta.get("chunk_index") or f"selected-{position}",
+ "source": meta.get("source"),
+ "page_numbers": meta.get("page_numbers", []),
+ "node_type": meta.get("node_type", "leaf"),
+ "pre_rank": pre_rank,
+ "post_rank": position,
+ "delta": (pre_rank - position) if pre_rank is not None else None,
+ }
+ )
+ rerank_audit["rerank_deltas"] = rerank_deltas
+ rerank_audit["opening_page_selected_count"] = sum(
+ 1
+ for doc in retrieved
+ if (
+ (doc.metadata or {}).get("page_numbers")
+ and min((doc.metadata or {}).get("page_numbers") or [999]) <= (page_scope_max or 3)
+ )
+ )
+
+ if priority_file_hashes:
+ bucket_map = {
+ bucket["file_hash"]: bucket for bucket in evidence_buckets
+ }
+ for fhash in priority_file_hashes:
+ candidate_count = int(doc_candidate_counts.get(str(fhash), 0))
+ doc_score = doc_scores.get(str(fhash))
+ bucket = bucket_map.get(str(fhash), {})
+ included = any(
+ str((doc.metadata or {}).get("file_hash") or "") == str(fhash)
+ for doc in retrieved
+ )
+ if candidate_count <= 0:
+ reason = "no_scoped_candidates"
+ failure_modes.add("no_scoped_candidates")
+ elif included and (doc_score or 0.0) < 0.10:
+ reason = "included_as_low_confidence"
+ failure_modes.add("low_scoped_confidence")
+ elif not included:
+ reason = "low_scoped_confidence"
+ failure_modes.add("low_scoped_confidence")
+ else:
+ reason = "supported"
+ thin_doc = bool(bucket.get("thin_doc"))
+ if thin_doc or candidate_count <= 1 and multi_pin_mode:
+ failure_modes.add("insufficient_coverage")
+ if reason == "supported" and thin_doc:
+ reason = "insufficient_coverage"
+ doc_diagnostics.append(
+ {
+ "file_hash": str(fhash),
+ "source": doc_title_map.get(str(fhash), str(fhash)),
+ "included": included,
+ "candidate_count": candidate_count,
+ "doc_score": round(float(doc_score), 4) if doc_score is not None else None,
+ "confidence_label": _confidence_label(doc_score),
+ "reason": reason,
+ "support_label": bucket.get("support_label"),
+ "thin_doc": thin_doc,
+ }
+ )
+
+ trace_id = str(uuid.uuid4())
+ selected_chunk_ids = [
+ str((doc.metadata or {}).get("id") or "")
+ for doc in retrieved
+ if (doc.metadata or {}).get("id")
+ ]
+ selected_sources = {
+ str((doc.metadata or {}).get("source") or "Unknown") for doc in retrieved
+ }
+ selected_counts: dict[str, int] = {}
+ selected_page_numbers: set[int] = set()
+ for doc in retrieved:
+ fhash = str((doc.metadata or {}).get("file_hash") or "")
+ if fhash:
+ selected_counts[fhash] = selected_counts.get(fhash, 0) + 1
+ for page in (doc.metadata or {}).get("page_numbers") or []:
+ if isinstance(page, int):
+ selected_page_numbers.add(page)
+ elif str(page).isdigit():
+ selected_page_numbers.add(int(page))
+ if multi_pin_mode and len(selected_counts) <= 1:
+ failure_modes.add("insufficient_coverage")
+ if route_class == "follow_up" and not bool(session_context):
+ failure_modes.add("misrouted_follow_up")
+ if route_class == "relational" and not _extract_graph_terms(original_query or query):
+ failure_modes.add("misrouted_relational")
+
+ trace_quality = {
+ "route_reason": route_reason,
+ "retrieval_relevance_proxy": round(
+ sum(float((doc.metadata or {}).get("relevance_score") or 0.0) for doc in retrieved)
+ / max(1, len(retrieved)),
+ 4,
+ ),
+ "source_diversity": round(len(selected_sources) / max(1, len(retrieved)), 4),
+ "document_balance": round(
+ min(selected_counts.values()) / max(selected_counts.values())
+ if len(selected_counts) >= 2
+ else (1.0 if selected_counts else 0.0),
+ 4,
+ ),
+ "abstention_expected": any(
+ diag["reason"] in {"no_scoped_candidates", "low_scoped_confidence", "included_as_low_confidence"}
+ for diag in doc_diagnostics
+ ),
+ "doc_count_included": len(selected_counts),
+ "doc_count_with_candidates": sum(1 for count in doc_candidate_counts.values() if count > 0),
+ "thin_doc_count": sum(1 for bucket in evidence_buckets if bucket.get("thin_doc")),
+ "synthetic_summary_used": any(
+ bool((doc.metadata or {}).get("synthetic_root_summary")) for doc in retrieved
+ ),
+ "commonality_supported": bool(answer_policy.get("commonality_supported")),
+ "duplicate_collapse_count": duplicate_collapse_count,
+ "duplicate_collapse_rate": round(
+ duplicate_collapse_count / max(1, len(all_candidates)),
+ 4,
+ ),
+ "identity_store_hit": any(
+ bool((doc.metadata or {}).get("identity_store_hit")) for doc in retrieved
+ ),
+ "identity_store_satisfied": any(
+ bool((doc.metadata or {}).get("retrieval_branch") == "identity_store")
+ for doc in retrieved
+ ),
+ "history_injected": False,
+ "memory_injected": False,
+ "selected_page_numbers": sorted(selected_page_numbers),
+ "page_scope_required": page_scope_required,
+ "page_scope_supported": page_scope_supported,
+ "rerank_deltas": rerank_audit.get("rerank_deltas", []),
+ "pre_rerank_candidates": rerank_audit.get("pre_rerank_candidates", []),
+ "opening_page_candidate_count": rerank_audit.get("opening_page_candidate_count", 0),
+ "opening_page_selected_count": rerank_audit.get("opening_page_selected_count", 0),
+ "retrieve_then_stuff_active": bool(
+ config.ENABLE_RETRIEVE_THEN_STUFF and route_class in {"compare", "multi_part"}
+ ),
+ }
+
+ for doc in retrieved:
+ meta = doc.metadata or {}
+ meta["trace_id"] = trace_id
+ meta["route_mode"] = query_mode
+ meta["route_class"] = route_class
+ meta["route_reason"] = route_reason
+ meta["selected_experts"] = active_experts
+ meta["expert_weights"] = route_info.get("expert_weights", {})
+ meta["doc_diagnostics"] = doc_diagnostics
+ meta["failure_modes"] = sorted(failure_modes)
+ meta["trace_started_at_ms"] = trace_started_at_ms
+ meta["trace_quality"] = trace_quality
+ meta["candidate_counts"] = doc_candidate_counts
+ meta["evidence_buckets"] = evidence_buckets
+ meta["answer_policy"] = answer_policy
+ meta["embedding_variant"] = config.RETRIEVAL_EMBEDDING_VARIANT
+ doc.metadata = meta
+
+ if session_id and retrieved and not eval_mode:
+ _remember_session_retrieval(
+ session_key=session_key,
+ query=routing_query,
+ chunks=retrieved,
+ )
return retrieved
@@ -2064,8 +5520,8 @@ def _predict_and_prefetch(
from backend.core.providers import ProviderFactory
fast_llm = ProviderFactory.build_chat_llm(
- purpose="speed"
- ) # Groq llama-3.1 is perfect here
+ purpose="rewriter"
+ ) # Keep this on the lightweight rewriter route.
prompt = (
f"The user asked: '{original_query}'.\n"
@@ -2121,38 +5577,72 @@ def generate_answer(
if not chunks and not past_memories:
return "No relevant documents or past context were found for your query.", []
- # 1. Build Episodic Memory Block (Tier 2)
- memory_block = ""
- if past_memories:
- memory_block = (
- "EPISODIC MEMORY (Highly relevant past interactions from this session):\n"
- )
+ trace_ctx = _extract_trace_context(chunks)
+ route_class = str(trace_ctx.get("route_class") or "factoid")
+ scoped_file_hashes = sorted(
+ {
+ str((chunk.metadata or {}).get("file_hash"))
+ for chunk in chunks
+ if (chunk.metadata or {}).get("file_hash")
+ }
+ )
+
+ history_block = _build_history_block(
+ chat_history,
+ route_class=route_class,
+ eval_mode=False,
+ )
+ memory_block = history_block
+ if past_memories and route_class not in {"exact_fact", "page_scoped"}:
+ episodic_lines: List[str] = []
for mem in past_memories:
- role = mem.get("role", "user").upper()
- memory_block += f"{role}: {mem.get('content', '')}\n"
- memory_block += "\n"
-
- # 2. Build Sliding Window Block (Tier 1)
- if chat_history:
- recent = chat_history[-(config.CHAT_MEMORY_TURNS * 2) :]
- memory_block += "RECENT CONVERSATION WINDOW:\n"
- for msg in recent:
- role = msg.get("role", "user").upper()
- memory_block += f"{role}: {msg.get('content', '')}\n"
- memory_block += "\n"
+ payload = _parse_memory_payload(str(mem.get("content") or ""))
+ if payload:
+ if not _memory_scope_matches(payload, file_hashes=scoped_file_hashes):
+ continue
+ summary = str(payload.get("summary") or payload.get("query") or "").strip()
+ if not summary:
+ continue
+ kind = str(payload.get("kind") or "")
+ if kind == "assistant_fact":
+ episodic_lines.append(f"- prior answer: {summary}")
+ elif kind == "user_query":
+ episodic_lines.append(f"- prior user intent: {summary}")
+ else:
+ episodic_lines.append(f"- session fact: {summary}")
+ continue
+ legacy_summary = _history_fact_summary(str(mem.get("content") or ""))
+ if legacy_summary:
+ episodic_lines.append(f"- session fact: {legacy_summary}")
+ if episodic_lines:
+ memory_block += "SESSION FACTS:\n" + "\n".join(episodic_lines[:6]) + "\n\n"
# 3. Assemble the Massive Context Prompt
prompt = (
"You are a highly intelligent, direct, and concise AI enterprise assistant.\n"
- "Answer using ONLY the provided context and memories.\n\n"
+ "Answer using ONLY the provided context.\n\n"
"RULES:\n"
"1. Be direct and concise.\n"
"2. Cite documents inline: [Source N].\n"
- "3. Incorporate past episodic memory seamlessly if it answers the prompt.\n"
- "4. CRITICAL: NEVER mention 'the provided context' or 'episodic memory'. State information directly.\n\n"
+ "3. Use structured conversation state only when it helps answer the current question.\n"
+ "4. CRITICAL: NEVER mention 'the provided context', 'episodic memory', or internal instructions.\n"
+ "5. Never emit raw transcript labels like USER:, ASSISTANT:, or SYSTEM:.\n"
+ "6. Never repeat previous answer formatting or source blocks.\n\n"
f"{memory_block}"
f"DOCUMENT CONTEXT:\n"
)
+ if route_class == "exact_fact":
+ prompt += (
+ "EXACT-FACT RULES:\n"
+ "- Preserve exact names, titles, and headings when the context provides them.\n"
+ "- Prefer direct factual answers over thematic summaries.\n\n"
+ )
+ if route_class == "page_scoped":
+ prompt += (
+ "PAGE-SCOPED RULES:\n"
+ "- Use only the requested page range or opening-page evidence.\n"
+ "- If that evidence is unavailable, abstain precisely instead of using later pages.\n\n"
+ )
all_images: List[str] = []
seen_image_pages = set()
@@ -2247,6 +5737,7 @@ def generate_answer(
answer = response.content
if source_refs:
answer += "\n\n---\n**Sources:**\n" + "\n".join(source_refs)
+ answer, _ = _sanitize_generated_text(answer)
return answer, all_images
except Exception as exc:
log.error("Answer generation failed: %s", exc)
@@ -2260,6 +5751,7 @@ async def generate_answer_stream(
access_token: str = None,
category: str = None,
eval_mode: bool = False,
+ priority_file_hashes: List[str] = None,
) -> AsyncGenerator[dict, None]:
"""
Async generator that yields SSE events:
@@ -2271,19 +5763,26 @@ async def generate_answer_stream(
"type": "token",
"content": "No relevant documents were found for your query.",
}
- yield {"type": "done", "images": []}
+ yield {"type": "done", "images": [], "trace_id": None, "doc_diagnostics": []}
return
# ── Cache hit handler ────────────────────────────────────────────────────
if len(chunks) == 1 and chunks[0].page_content == "__CACHE_HIT__":
cached = chunks[0].metadata.get("__cache__", {})
- answer = cached.get("answer", "")
+ cache_trace_id = str(uuid.uuid4())
+ answer, _ = _sanitize_generated_text(cached.get("answer", ""))
# Stream cached answer token by token for consistent UX
chunk_size = 50
for i in range(0, len(answer), chunk_size):
yield {"type": "token", "content": answer[i : i + chunk_size]}
await asyncio.sleep(0)
- yield {"type": "done", "sources": cached.get("sources", []), "images": []}
+ yield {
+ "type": "done",
+ "sources": cached.get("sources", []),
+ "images": [],
+ "trace_id": cache_trace_id,
+ "doc_diagnostics": [],
+ }
return
# ── TASK 3: Log RAGAS reward signal to evaluation_logs ───────────────────
@@ -2306,26 +5805,65 @@ async def generate_answer_stream(
pass # logging failure must never break streaming
# ── Build prompt (same logic as generate_answer) ────────────────────────
- memory_block = ""
- if chat_history:
- recent = chat_history[-(config.CHAT_MEMORY_TURNS * 2) :]
- memory_block = "CONVERSATION HISTORY (context only):\n"
- for msg in recent:
- role = msg.get("role", "user").upper()
- memory_block += f"{role}: {msg.get('content', '')}\n"
- memory_block += "\n"
+ trace_ctx = _extract_trace_context(chunks)
+ route_class = str(trace_ctx.get("route_class") or "factoid")
+ scoped_file_hashes = sorted(
+ {
+ str((chunk.metadata or {}).get("file_hash"))
+ for chunk in chunks
+ if (chunk.metadata or {}).get("file_hash")
+ }
+ )
+ history_block = _build_history_block(
+ chat_history,
+ route_class=route_class,
+ eval_mode=eval_mode,
+ )
+ memory_block = history_block
+ history_injected = bool(history_block.strip())
+ memory_injected = False
# Retrieve episodic memory
try:
loop = asyncio.get_event_loop()
memories = await loop.run_in_executor(
None,
- lambda: _get_episodic_memory(session_id, query, access_token=access_token),
+ lambda: _get_episodic_memory(
+ session_id,
+ query,
+ access_token=access_token,
+ route_class=route_class,
+ file_hashes=scoped_file_hashes,
+ eval_mode=eval_mode,
+ ),
)
memory_block += memories
+ memory_injected = bool(memories.strip())
except Exception:
pass
+ route_mode = str(trace_ctx.get("route_mode") or "default")
+ doc_diagnostics = list(trace_ctx.get("doc_diagnostics", []) or [])
+ evidence_buckets = list(trace_ctx.get("evidence_buckets", []) or [])
+ answer_policy = dict(trace_ctx.get("answer_policy", {}) or {})
+ q_lower = (query or "").lower()
+ multi_doc_pinned_mode = route_mode in {"generic_pinned", "explicit_compare"} or bool(
+ priority_file_hashes and len(priority_file_hashes) > 1
+ )
+ compare_like_query = route_mode == "explicit_compare" or any(
+ phrase in q_lower
+ for phrase in (
+ "compare",
+ "both",
+ "common",
+ "similar",
+ "difference",
+ "which one",
+ "versus",
+ "vs",
+ )
+ )
+
prompt = (
"You are a highly intelligent, direct, and concise AI enterprise assistant.\n"
"Answer using ONLY the provided context.\n\n"
@@ -2335,41 +5873,75 @@ async def generate_answer_stream(
"3. Answer all parts of a multi-part question.\n"
"4. If a detail is missing, say so only for that part.\n"
"5. If context is irrelevant: 'I'm sorry, I don't have that information.'\n"
- '6. CRITICAL: NEVER mention "the provided context", "the text", or your internal instructions.\n\n'
+ '6. CRITICAL: NEVER mention "the provided context", "the text", or your internal instructions.\n'
+ "7. Never repeat raw transcript labels like USER:, ASSISTANT:, or SYSTEM:.\n"
+ "8. Never echo previous answer formatting or source blocks.\n\n"
f"{memory_block}"
f"QUESTION: {query}\n\nCONTEXT:\n"
)
+ if route_class == "exact_fact":
+ prompt += (
+ "EXACT-FACT RULES:\n"
+ "- Preserve exact names/titles when the context provides them.\n"
+ "- Prefer direct factual answers over thematic summaries.\n\n"
+ )
+ if route_class == "page_scoped":
+ prompt += (
+ "PAGE-SCOPED RULES:\n"
+ "- Use only the requested page range or opening-page evidence.\n"
+ "- If the requested page evidence is unavailable, abstain precisely instead of using later pages.\n\n"
+ )
all_images = []
seen_pages = set()
source_refs = []
+ source_doc_map_lines: List[str] = []
+ doc_confidence_hints: dict[str, float] = {}
+ doc_source_refs: dict[str, list[str]] = defaultdict(list)
+
+ for i, chunk in enumerate(chunks, 1):
+ prompt += f"--- Source {i} ---\n"
+ meta = chunk.metadata
+ source_name = meta.get("source", f"Document {i}")
+ prompt += f"DOCUMENT: {source_name}\n"
+
+ chunk_relevance = float(meta.get("relevance_score", 0.0) or 0.0)
+ doc_confidence_hints[source_name] = max(
+ doc_confidence_hints.get(source_name, 0.0),
+ chunk_relevance,
+ )
+ source_doc_map_lines.append(f"Source {i} => {source_name}")
+
+ node_type = meta.get("node_type", "leaf")
+ node_level = meta.get("node_level", 0)
- for i, chunk in enumerate(chunks, 1):
- prompt += f"--- Source {i} ---\n"
- meta = chunk.metadata
- original = meta.get("original_content")
- if isinstance(original, str):
- try:
- original = json.loads(original)
- # Handle double-encoded JSON string
- if isinstance(original, str):
+ if node_type == "summary":
+ prompt += f"[SYNTHESIZED CHAPTER SUMMARY - LEVEL {node_level}]\n"
+ prompt += f"TEXT:\n{chunk.page_content}\n\n"
+ original = {}
+ else:
+ original = meta.get("original_content")
+ if isinstance(original, str):
+ try:
original = json.loads(original)
- except: # noqa: E722
+ # Handle double-encoded JSON string
+ if isinstance(original, str):
+ original = json.loads(original)
+ except: # noqa: E722
+ original = {}
+ elif not isinstance(original, dict):
original = {}
- elif not isinstance(original, dict):
- original = {}
- raw_text = original.get("raw_text", chunk.page_content)
- if raw_text:
- prompt += f"TEXT:\n{raw_text}\n\n"
- for j, tbl in enumerate(original.get("tables_html", []), 1):
- prompt += f"TABLE {j}:\n{tbl}\n\n"
+ raw_text = original.get("raw_text", chunk.page_content)
+ if raw_text:
+ prompt += f"TEXT:\n{raw_text}\n\n"
+ for j, tbl in enumerate(original.get("tables_html", []), 1):
+ prompt += f"TABLE {j}:\n{tbl}\n\n"
file_hash = meta.get("file_hash", "unknown")
pages = meta.get("page_numbers", [])
page_ref = pages[0] if pages else "unknown"
loc_key = f"{file_hash}_p{page_ref}"
- chunk_relevance = meta.get("relevance_score", 0)
if (
chunk_relevance >= config.IMAGE_RELEVANCE_THRESHOLD
@@ -2379,7 +5951,6 @@ async def generate_answer_stream(
seen_pages.add(loc_key)
all_images.append(img_b64)
- source_name = meta.get("source", f"Document {i}")
chunk_idx = meta.get("chunk_index", "?")
doc_type = meta.get("document_type", "")
relevance = meta.get("relevance_score")
@@ -2399,6 +5970,71 @@ async def generate_answer_stream(
source_refs.append(
f"[Source {i}] {source_name}{type_str} (chunk {chunk_idx}{page_str}{rel_str})"
)
+ file_hash = str(meta.get("file_hash") or source_name)
+ doc_source_refs[file_hash].append(f"[Source {i}]")
+
+ if multi_doc_pinned_mode and source_doc_map_lines:
+ bucket_map = {
+ str(bucket.get("file_hash")): bucket for bucket in evidence_buckets
+ }
+ prompt += "\nSOURCE TO DOCUMENT MAP:\n"
+ prompt += "\n".join(source_doc_map_lines) + "\n"
+ if doc_confidence_hints:
+ hint_lines = []
+ for name, score in sorted(
+ doc_confidence_hints.items(),
+ key=lambda item: item[1],
+ reverse=True,
+ ):
+ label = "low confidence" if score < 0.1 else f"confidence {score:.2f}"
+ hint_lines.append(f"- {name}: {label}")
+ prompt += "DOCUMENT CONFIDENCE HINTS:\n" + "\n".join(hint_lines) + "\n"
+ if doc_diagnostics:
+ prompt += "COMPOSED DOCUMENT EVIDENCE BUCKETS:\n"
+ for diag in doc_diagnostics:
+ file_hash = str(diag.get("file_hash") or "")
+ bucket = bucket_map.get(file_hash, {})
+ support_label = bucket.get("support_label") or diag.get("confidence_label") or "unknown"
+ cited_sources = ", ".join(doc_source_refs.get(file_hash, [])) or "None"
+ prompt += (
+ f"- {diag.get('source', file_hash)}:\n"
+ f" reason={diag.get('reason', 'unknown')}; support={support_label}; "
+ f"candidates={diag.get('candidate_count', 0)}; thin_doc={bool(bucket.get('thin_doc'))}\n"
+ f" sources={cited_sources}\n"
+ )
+ prompt += (
+ "MULTI-DOC ANSWERING RULES:\n"
+ "1. Keep evidence separated by document; never mix facts between documents.\n"
+ "2. For each selected document, include what is known from its own sources.\n"
+ "3. If evidence is weak/missing for a selected document, explicitly say "
+ "'Insufficient evidence in <document>'.\n"
+ "4. Do not fabricate comparisons or shared themes without direct support.\n"
+ )
+ if route_mode == "generic_pinned":
+ prompt += (
+ "5. This is a generic multi-document request. Structure the response as:\n"
+ " - <Document A> summary\n"
+ " - <Document B> summary\n"
+ " - Cross-document note (optional, only if directly supported)\n"
+ "6. Emit exactly one section per pinned document, even if one document is weak.\n"
+ )
+ elif compare_like_query:
+ prompt += (
+ "5. This is a cross-document question. Structure the response as:\n"
+ " - Document A findings\n"
+ " - Document B findings\n"
+ " - Commonalities (only if evidence supports them)\n"
+ " - Differences (only if evidence supports them)\n"
+ )
+ if not answer_policy.get("commonality_supported", False):
+ prompt += (
+ "6. Retrieval did NOT establish supported shared themes across multiple documents. "
+ "Do not claim commonalities unless you can cite distinct sources from at least two documents.\n"
+ )
+ if any(diag.get("reason") == "insufficient_coverage" for diag in doc_diagnostics):
+ prompt += (
+ "7. At least one document is thinly covered. Prefer precise abstention over broad thematic blending.\n"
+ )
prompt += "\nANSWER (use [Source N] citations):"
@@ -2420,19 +6056,36 @@ async def generate_answer_stream(
try:
full_answer = ""
+ raw_answer = ""
+ emitted_clean_len = 0
async for chunk_text in llm.astream([HumanMessage(content=message_content)]):
token = chunk_text.content
if token:
- full_answer += token
- yield {"type": "token", "content": token}
+ raw_answer += token
+ cleaned_answer, _ = _sanitize_generated_text(raw_answer)
+ delta = cleaned_answer[emitted_clean_len:]
+ if delta:
+ full_answer += delta
+ emitted_clean_len = len(cleaned_answer)
+ yield {"type": "token", "content": delta}
# Append sources as plain text after streaming
if source_refs:
suffix = "\n\n---\n**Sources:**\n" + "\n".join(source_refs)
- yield {"type": "token", "content": suffix}
- full_answer += suffix
+ clean_suffix, _ = _sanitize_generated_text(suffix)
+ yield {"type": "token", "content": clean_suffix}
+ full_answer += clean_suffix
+ sanitized_answer, sanitized_removed = _sanitize_generated_text(raw_answer)
+ sanitizer_metrics = {
+ "sanitizer_triggered": bool(sanitized_removed),
+ "sanitized_token_count": sanitized_removed,
+ "raw_answer_hash": hashlib.sha256((raw_answer or "").encode("utf-8")).hexdigest(),
+ "clean_answer_hash": hashlib.sha256((sanitized_answer or "").encode("utf-8")).hexdigest(),
+ }
# ── Store in query cache ─────────────────────────────────────────────
- if not eval_mode:
+ # Skip when document pinning is active: a pinned-scope answer must not pollute
+ # the corpus-wide semantic cache and confuse subsequent unpinned queries.
+ if not eval_mode and not priority_file_hashes:
try:
if access_token:
from backend.core.auth_utils import extract_jwt_sub
@@ -2482,62 +6135,125 @@ async def generate_answer_stream(
# Save to memory async
try:
loop = asyncio.get_event_loop()
- await loop.run_in_executor(
- None,
- lambda: _save_to_memory(
- session_id,
- query,
- full_answer,
- access_token=access_token,
- ),
- )
+ if not eval_mode:
+ await loop.run_in_executor(
+ None,
+ lambda: _save_to_memory(
+ session_id,
+ query,
+ full_answer,
+ access_token=access_token,
+ route_class=route_class,
+ chunks=chunks,
+ ),
+ )
except Exception:
pass
+ for doc in chunks:
+ meta = doc.metadata or {}
+ quality = dict(meta.get("trace_quality", {}) or {})
+ quality["history_injected"] = history_injected
+ quality["memory_injected"] = memory_injected
+ meta["trace_quality"] = quality
+ doc.metadata = meta
+
+ trace_id = _persist_query_trace(
+ query=query,
+ session_id=session_id,
+ chunks=chunks,
+ answer=full_answer,
+ access_token=access_token,
+ priority_file_hashes=priority_file_hashes,
+ sanitizer_metrics=sanitizer_metrics,
+ ) or trace_ctx.get("trace_id")
+
except Exception as exc:
log.error("Streaming generation failed: %s", exc)
yield {"type": "token", "content": f"Failed to generate answer: {exc}"}
+ trace_id = trace_ctx.get("trace_id")
- yield {"type": "done", "images": all_images}
+ yield {
+ "type": "done",
+ "images": all_images,
+ "trace_id": trace_id,
+ "doc_diagnostics": doc_diagnostics,
+ }
-def _get_episodic_memory(session_id: str, query: str, access_token: str = None) -> str:
+def _get_episodic_memory(
+ session_id: str,
+ query: str,
+ access_token: str = None,
+ *,
+ route_class: str = "factoid",
+ file_hashes: Optional[List[str]] = None,
+ eval_mode: bool = False,
+) -> str:
"""Retrieve relevant past memories and return as a formatted string block."""
+ if eval_mode or route_class in {"exact_fact", "page_scoped"}:
+ return ""
try:
query_vector = get_cached_embedding(query)
- supabase = _build_supabase_client(access_token)
+ user_id = _stable_user_id(None, access_token)
+ supabase = _build_service_supabase_client()
mem_res = supabase.rpc(
"match_memory",
{
"query_embedding": query_vector,
"match_session_id": session_id,
"match_count": 3,
+ "p_user_id": user_id,
},
).execute()
if not mem_res.data:
return ""
- block = "EPISODIC MEMORY (relevant past interactions):\n"
+ lines = ["SESSION FACTS:"]
for mem in mem_res.data:
- role = mem.get("role", "user").upper()
- block += f"{role}: {mem.get('content', '')}\n"
- block += "\n"
+ payload = _parse_memory_payload(mem.get("content") or "")
+ if payload:
+ if not _memory_scope_matches(payload, file_hashes=file_hashes):
+ continue
+ summary = _history_fact_summary(
+ payload.get("summary") or payload.get("query") or payload.get("content") or "",
+ limit=180,
+ )
+ if not summary:
+ continue
+ label = "prior_answer_summary" if payload.get("kind") == "assistant_fact" else "prior_user_query"
+ lines.append(f"- {label}: {summary}")
+ continue
+ legacy_content = _history_fact_summary(mem.get("content") or "", limit=180)
+ if legacy_content:
+ lines.append(f"- legacy_memory_summary: {legacy_content}")
+ if len(lines) == 1:
+ return ""
log.info(
"Retrieved %d episodic memories for session %s",
len(mem_res.data),
session_id[:8],
)
- return block
+ return "\n".join(lines) + "\n\n"
except Exception as e:
log.warning("Could not retrieve episodic memory: %s", e)
return ""
def _save_to_memory(
- session_id: str, query: str, answer: str, access_token: str = None
+ session_id: str,
+ query: str,
+ answer: str,
+ access_token: str = None,
+ *,
+ route_class: str = "factoid",
+ chunks: Optional[List[Document]] = None,
) -> None:
"""Save query + answer to episodic memory table."""
+ if route_class in {"exact_fact", "page_scoped"}:
+ return
try:
- supabase = _build_supabase_client(access_token)
+ user_id = _stable_user_id(None, access_token)
+ supabase = _build_service_supabase_client()
query_vector = get_cached_embedding(query)
# 🚀 THE BOUNCER: Check for semantic duplicates before saving
dup_check = supabase.rpc(
@@ -2546,6 +6262,7 @@ def _save_to_memory(
"query_embedding": query_vector,
"match_session_id": session_id,
"match_count": 1,
+ "p_user_id": user_id,
},
).execute()
@@ -2556,35 +6273,478 @@ def _save_to_memory(
session_id[:8],
)
return
+ file_hashes = sorted(
+ {
+ str((doc.metadata or {}).get("file_hash"))
+ for doc in (chunks or [])
+ if (doc.metadata or {}).get("file_hash")
+ }
+ )
+ document_types = sorted(
+ {
+ str((doc.metadata or {}).get("document_type"))
+ for doc in (chunks or [])
+ if (doc.metadata or {}).get("document_type")
+ }
+ )
+ query_payload = {
+ "kind": "user_query",
+ "query": query,
+ "summary": _history_fact_summary(query, limit=220),
+ "file_hashes": file_hashes,
+ "document_types": document_types,
+ }
payload = {
"session_id": session_id,
"role": "user",
- "content": query,
+ "content": json.dumps(query_payload),
"embedding": query_vector,
}
- if access_token:
- from backend.core.auth_utils import extract_jwt_sub
-
- payload["user_id"] = extract_jwt_sub(access_token)
+ if user_id:
+ payload["user_id"] = user_id
supabase.table("chat_memory").insert(payload).execute()
- clean_answer = answer.split("\n\n---\n**Sources:**")[0]
- ans_vector = get_cached_embedding(clean_answer)
+ clean_answer = _summarize_answer_for_memory(answer)
+ answer_payload = {
+ "kind": "assistant_fact",
+ "summary": clean_answer,
+ "query": query,
+ "file_hashes": file_hashes,
+ "document_types": document_types,
+ }
+ ans_vector = get_cached_embedding(clean_answer or query)
payload2 = {
"session_id": session_id,
"role": "assistant",
- "content": clean_answer,
+ "content": json.dumps(answer_payload),
"embedding": ans_vector,
}
- if access_token:
- from backend.core.auth_utils import extract_jwt_sub
-
- payload2["user_id"] = extract_jwt_sub(access_token)
+ if user_id:
+ payload2["user_id"] = user_id
supabase.table("chat_memory").insert(payload2).execute()
log.debug("Saved Q&A to episodic memory for session %s", session_id[:8])
except Exception as e:
log.error("Failed to save memory: %s", e)
+def _extract_trace_context(chunks: List[Document]) -> dict:
+ if not chunks:
+ return {}
+ meta = chunks[0].metadata or {}
+ return {
+ "trace_id": meta.get("trace_id"),
+ "route_mode": meta.get("route_mode", "default"),
+ "route_class": meta.get("route_class", "factoid"),
+ "route_reason": meta.get("route_reason", "heuristic_default"),
+ "selected_experts": list(meta.get("selected_experts", []) or []),
+ "expert_weights": dict(meta.get("expert_weights", {}) or {}),
+ "doc_diagnostics": list(meta.get("doc_diagnostics", []) or []),
+ "evidence_buckets": list(meta.get("evidence_buckets", []) or []),
+ "answer_policy": dict(meta.get("answer_policy", {}) or {}),
+ "failure_modes": list(meta.get("failure_modes", []) or []),
+ "trace_started_at_ms": meta.get("trace_started_at_ms"),
+ "trace_quality": dict(meta.get("trace_quality", {}) or {}),
+ "candidate_counts": dict(meta.get("candidate_counts", {}) or {}),
+ "embedding_variant": meta.get(
+ "embedding_variant",
+ config.RETRIEVAL_EMBEDDING_VARIANT,
+ ),
+ }
+
+
+def _answer_source_agreement_proxy(answer: str, chunk_count: int) -> float:
+ citations = len(re.findall(r"\[Source\s+\d+\]", answer or "", flags=re.IGNORECASE))
+ if chunk_count <= 0:
+ return 0.0
+ return round(min(1.0, citations / max(1, chunk_count)), 4)
+
+
+def _is_high_priority_history_line(content: str) -> bool:
+ lowered = (content or "").lower()
+ return any(
+ token in lowered
+ for token in (
+ "must",
+ "should",
+ "only",
+ "prefer",
+ "don't",
+ "do not",
+ "never",
+ "important",
+ "remember",
+ "focus on",
+ )
+ )
+
+
+def _summarize_answer_for_memory(answer: str) -> str:
+ clean_answer = answer.split("\n\n---\n**Sources:**")[0]
+ clean_answer, _ = _sanitize_generated_text(clean_answer)
+ clean_answer = _history_fact_summary(clean_answer, limit=320)
+ return clean_answer
+
+
+def _build_history_block(
+ chat_history: Optional[List[dict]],
+ *,
+ route_class: str = "factoid",
+ eval_mode: bool = False,
+) -> str:
+ if not chat_history or eval_mode or route_class in {"exact_fact", "page_scoped"}:
+ return ""
+
+ recent_window = max(1, config.HISTORY_RECENT_TURNS * 2)
+ recent = list(chat_history[-recent_window:])
+ older = list(chat_history[:-recent_window]) if len(chat_history) > recent_window else []
+
+ prior_user = next(
+ (
+ _history_fact_summary(msg.get("content", ""), limit=200)
+ for msg in reversed(recent)
+ if msg.get("role") == "user" and str(msg.get("content") or "").strip()
+ ),
+ "",
+ )
+ prior_answer = next(
+ (
+ _history_fact_summary(msg.get("content", ""), limit=220)
+ for msg in reversed(recent)
+ if msg.get("role") == "assistant" and str(msg.get("content") or "").strip()
+ ),
+ "",
+ )
+ constraints = []
+ for msg in older:
+ content = _history_fact_summary(msg.get("content", ""), limit=180)
+ if not content:
+ continue
+ if _is_high_priority_history_line(content):
+ constraints.append(content)
+ constraints = constraints[-config.HISTORY_IMPORTANT_MAX :]
+
+ lines = ["CONVERSATION STATE:"]
+ if prior_user:
+ lines.append(f"- previous_user_intent: {prior_user}")
+ if prior_answer:
+ lines.append(f"- previous_answer_summary: {prior_answer}")
+ for constraint in constraints:
+ lines.append(f"- preserved_constraint: {constraint}")
+ return "\n".join(lines) + "\n\n" if len(lines) > 1 else ""
+
+
+def _persist_trace_graph_enrichment(
+ *,
+ trace_id: str,
+ session_id: str,
+ user_id: Optional[str],
+ query: str,
+ chunks: List[Document],
+ priority_file_hashes: Optional[List[str]] = None,
+) -> None:
+ if not user_id:
+ return
+ try:
+ episode_key = _safe_graph_key("episode_cluster", session_id)
+ conversation_key = _safe_graph_key("conversation_turn", trace_id)
+ nodes = [
+ {
+ "user_id": user_id,
+ "node_key": episode_key,
+ "node_type": "episode_cluster",
+ "label": f"Episode {session_id[:8]}",
+ "payload": {"session_id": session_id},
+ },
+ {
+ "user_id": user_id,
+ "node_key": conversation_key,
+ "node_type": "conversation_turn",
+ "label": (query or "Conversation turn")[:200],
+ "payload": {"trace_id": trace_id, "session_id": session_id},
+ },
+ ]
+ edges = [
+ {
+ "user_id": user_id,
+ "source_node_key": conversation_key,
+ "target_node_key": episode_key,
+ "edge_type": "part_of",
+ "weight": 1.0,
+ "payload": {"trace_id": trace_id},
+ }
+ ]
+ for doc in chunks:
+ meta = doc.metadata or {}
+ node_type = str(meta.get("node_type") or "leaf")
+ node_id = str(meta.get("node_id") or meta.get("id") or "")
+ if not node_id:
+ continue
+ edges.append(
+ {
+ "user_id": user_id,
+ "source_node_key": conversation_key,
+ "target_node_key": _safe_graph_key(node_type, node_id),
+ "edge_type": "cites",
+ "weight": float(meta.get("relevance_score") or 1.0),
+ "payload": {"trace_id": trace_id, "file_hash": meta.get("file_hash")},
+ }
+ )
+
+ pinned_hashes = [h for h in (priority_file_hashes or []) if h]
+ if len(pinned_hashes) >= 2:
+ for idx, src_hash in enumerate(pinned_hashes):
+ for dst_hash in pinned_hashes[idx + 1 :]:
+ edges.append(
+ {
+ "user_id": user_id,
+ "source_node_key": _safe_graph_key("document", src_hash),
+ "target_node_key": _safe_graph_key("document", dst_hash),
+ "edge_type": "compared_with",
+ "weight": 1.0,
+ "payload": {"trace_id": trace_id},
+ }
+ )
+
+ try:
+ prior_rows = (
+ _build_service_supabase_client()
+ .table("query_traces")
+ .select("trace_id, created_at")
+ .eq("user_id", user_id)
+ .eq("session_id", session_id)
+ .limit(5)
+ .execute()
+ .data
+ or []
+ )
+ prior_rows = sorted(
+ [row for row in prior_rows if row.get("trace_id") and row.get("trace_id") != trace_id],
+ key=lambda row: row.get("created_at") or "",
+ reverse=True,
+ )
+ if prior_rows:
+ prev_trace_id = str(prior_rows[0]["trace_id"])
+ edges.append(
+ {
+ "user_id": user_id,
+ "source_node_key": conversation_key,
+ "target_node_key": _safe_graph_key("conversation_turn", prev_trace_id),
+ "edge_type": "follows_from",
+ "weight": 1.0,
+ "payload": {"trace_id": trace_id},
+ }
+ )
+ except Exception as prev_exc:
+ log.debug("Previous trace graph-link skipped: %s", prev_exc)
+
+ _persist_graph_rows(nodes=nodes, edges=edges)
+ except Exception as exc:
+ log.debug("Trace graph enrichment skipped: %s", exc)
+
+
+def _persist_query_trace(
+ *,
+ query: str,
+ session_id: str,
+ chunks: List[Document],
+ answer: str,
+ access_token: str = None,
+ priority_file_hashes: Optional[List[str]] = None,
+ sanitizer_metrics: Optional[dict] = None,
+) -> Optional[str]:
+ try:
+ trace_ctx = _extract_trace_context(chunks)
+ trace_id = str(trace_ctx.get("trace_id") or uuid.uuid4())
+ trace_started = int(trace_ctx.get("trace_started_at_ms") or int(time.time() * 1000))
+ latency_ms = max(0, int(time.time() * 1000) - trace_started)
+ answer_hash = hashlib.sha256((answer or "").encode("utf-8")).hexdigest()
+ selected_chunk_ids = [
+ str((doc.metadata or {}).get("id") or "")
+ for doc in chunks
+ if (doc.metadata or {}).get("id")
+ ]
+ document_types = sorted(
+ {
+ str((doc.metadata or {}).get("document_type") or "general_document")
+ for doc in chunks
+ if (doc.metadata or {}).get("document_type")
+ }
+ )
+ quality = dict(trace_ctx.get("trace_quality", {}) or {})
+ quality["answer_source_agreement_proxy"] = _answer_source_agreement_proxy(
+ answer,
+ len(chunks),
+ )
+ quality["embedding_variant"] = trace_ctx.get(
+ "embedding_variant",
+ config.RETRIEVAL_EMBEDDING_VARIANT,
+ )
+ quality["route_class"] = trace_ctx.get("route_class", "factoid")
+ quality["route_reason"] = trace_ctx.get("route_reason", "heuristic_default")
+ if sanitizer_metrics:
+ quality.update(sanitizer_metrics)
+
+ answer_lower = (answer or "").lower()
+ failure_modes = set(trace_ctx.get("failure_modes", []) or [])
+ route_class = str(trace_ctx.get("route_class") or "factoid")
+ if any(token in answer_lower for token in ("user:", "assistant:", "system:")):
+ failure_modes.add("output_echo")
+ if quality.get("history_injected"):
+ failure_modes.add("history_contamination")
+ if quality.get("memory_injected"):
+ failure_modes.add("memory_contamination")
+ if quality.get("sanitizer_triggered") and int(quality.get("sanitized_token_count") or 0) >= 20:
+ failure_modes.add("output_echo")
+ if float(quality.get("retrieval_relevance_proxy") or 0.0) < 0.2 and (answer or "").strip():
+ failure_modes.add("low_relevance_answered")
+ if route_class in {"exact_fact", "page_scoped"} and not quality.get("identity_store_hit") and not quality.get("page_scope_supported", True):
+ failure_modes.add("generic_answer_to_exact_query")
+ if route_class == "page_scoped" and not quality.get("page_scope_supported", True):
+ failure_modes.add("page_scope_violation")
+ if not _answer_looks_abstention(answer):
+ failure_modes.add("bad_abstention")
+ if route_class == "page_scoped" and quality.get("route_reason") == "page_scope:cover" and not quality.get("page_scope_supported", True):
+ failure_modes.add("cover_page_miss")
+ if any(token in answer_lower for token in ("i'm not sure", "i am not sure", "it seems", "maybe")) and float(quality.get("retrieval_relevance_proxy") or 0.0) >= 0.65:
+ failure_modes.add("correct_answer_wrong_confidence")
+
+ user_id = None
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ user_id = safe_extract_jwt_sub(access_token)
+
+ payload = {
+ "trace_id": trace_id,
+ "user_id": user_id,
+ "session_id": session_id,
+ "question": query[:1000],
+ "route_mode": trace_ctx.get("route_mode", "default"),
+ "selected_experts": trace_ctx.get("selected_experts", []),
+ "expert_weights": trace_ctx.get("expert_weights", {}),
+ "pinned_file_hashes": priority_file_hashes or [],
+ "document_types": document_types,
+ "candidate_counts": trace_ctx.get("candidate_counts", {}),
+ "selected_chunk_ids": selected_chunk_ids,
+ "doc_diagnostics": trace_ctx.get("doc_diagnostics", []),
+ "failure_modes": sorted(failure_modes),
+ "quality_metrics": quality,
+ "answer_hash": answer_hash,
+ "answer_preview": (answer or "")[:2000],
+ "latency_ms": latency_ms,
+ }
+ _build_service_supabase_client().table("query_traces").upsert(
+ payload,
+ on_conflict="trace_id",
+ ).execute()
+ _persist_trace_graph_enrichment(
+ trace_id=trace_id,
+ session_id=session_id,
+ user_id=user_id,
+ query=query,
+ chunks=chunks,
+ priority_file_hashes=priority_file_hashes,
+ )
+ return trace_id
+ except Exception as exc:
+ log.debug("Query trace logging skipped: %s", exc)
+ return None
+
+
+def record_answer_feedback(
+ feedback: dict,
+ *,
+ access_token: str = None,
+) -> bool:
+ try:
+ trace_id = str(feedback.get("trace_id") or "").strip()
+ if not trace_id:
+ return False
+ correction_text = (feedback.get("correction_text") or "").strip()
+ payload = {
+ "trace_id": trace_id,
+ "helpful": feedback.get("helpful"),
+ "accepted": feedback.get("accepted"),
+ "reason_code": feedback.get("reason_code"),
+ "correction_text": correction_text or None,
+ "promote_to_eval": bool(
+ correction_text or feedback.get("helpful") is False or feedback.get("accepted")
+ ),
+ }
+ if access_token:
+ from backend.core.auth_utils import safe_extract_jwt_sub
+
+ payload["user_id"] = safe_extract_jwt_sub(access_token)
+ _build_service_supabase_client().table("answer_feedback").insert(payload).execute()
+ if payload["promote_to_eval"]:
+ try:
+ sb = _build_service_supabase_client()
+ trace_res = (
+ sb.table("query_traces")
+ .select("trace_id, session_id, question")
+ .eq("trace_id", trace_id)
+ .limit(1)
+ .execute()
+ )
+ trace_row = (trace_res.data or [{}])[0]
+ user_id = payload.get("user_id")
+ session_id = trace_row.get("session_id") or "unknown_session"
+ conversation_key = _safe_graph_key("conversation_turn", trace_id)
+ episode_key = _safe_graph_key("episode_cluster", session_id)
+ saved_key = _safe_graph_key("saved_memory", trace_id)
+ nodes = [
+ {
+ "user_id": user_id,
+ "node_key": episode_key,
+ "node_type": "episode_cluster",
+ "label": f"Episode {session_id[:8]}",
+ "payload": {"session_id": session_id},
+ },
+ {
+ "user_id": user_id,
+ "node_key": conversation_key,
+ "node_type": "conversation_turn",
+ "label": (trace_row.get("question") or "Conversation turn")[:200],
+ "payload": {"trace_id": trace_id, "session_id": session_id},
+ },
+ {
+ "user_id": user_id,
+ "node_key": saved_key,
+ "node_type": "saved_memory",
+ "label": (correction_text or trace_row.get("question") or "Accepted answer")[:200],
+ "payload": {
+ "trace_id": trace_id,
+ "reason_code": payload.get("reason_code"),
+ "correction_text": correction_text or None,
+ },
+ },
+ ]
+ edges = [
+ {
+ "user_id": user_id,
+ "source_node_key": conversation_key,
+ "target_node_key": episode_key,
+ "edge_type": "part_of",
+ "weight": 1.0,
+ "payload": {"trace_id": trace_id},
+ },
+ {
+ "user_id": user_id,
+ "source_node_key": saved_key,
+ "target_node_key": conversation_key,
+ "edge_type": "saved_from",
+ "weight": 1.0,
+ "payload": {"trace_id": trace_id},
+ },
+ ]
+ _persist_graph_rows(nodes=nodes, edges=edges)
+ except Exception as graph_exc:
+ log.debug("Feedback graph persistence skipped: %s", graph_exc)
+ return True
+ except Exception as exc:
+ log.debug("Answer feedback logging skipped: %s", exc)
+ return False
+
+
def _log_retrieval_reward(
query: str,
chunks: List[Document],
@@ -2691,7 +6851,10 @@ def _should_use_tree_path(query: str) -> bool:
def tree_search(
- query: str, access_token: str = None, category: str = None
+ query: str,
+ access_token: str = None,
+ category: str = None,
+ priority_file_hashes: List[str] = None,
) -> List[Document]:
"""
Navigates the structural JSON trees in Supabase to answer highly specific
@@ -2754,6 +6917,14 @@ def tree_search(
}
except Exception as exc:
log.warning("Could not apply tree-search category filter: %s", exc)
+ if priority_file_hashes:
+ pinned_hashes = {h for h in priority_file_hashes if h}
+ if pinned_hashes:
+ allowed_hashes = (
+ pinned_hashes
+ if allowed_hashes is None
+ else allowed_hashes.intersection(pinned_hashes)
+ )
matched_chunks = []
@@ -2810,22 +6981,91 @@ def retrieve_chunks_routed(
user_id: str = None,
original_query: str = None,
eval_mode: bool = False,
+ priority_file_hashes: List[str] = None,
) -> List[Document]:
"""
Live request-path retrieval entrypoint.
Routes structural queries to the tree index first, then falls back to vector retrieval.
"""
routing_query = (original_query or query or "").strip()
+ route_info = _route_query_experts(
+ routing_query,
+ session_id=session_id,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ )
+ query_mode = _classify_priority_query_mode(routing_query, priority_file_hashes)
+ route_decision = _classify_query_route_decision(
+ routing_query,
+ session_id=session_id,
+ user_id=user_id,
+ priority_file_hashes=priority_file_hashes,
+ )
+ route_class = route_decision.route_class
+ route_reason = route_decision.route_reason
if routing_query and _should_use_tree_path(routing_query):
log.info("🎯 PageIndex triggered: query routed to structural tree path.")
tree_chunks = tree_search(
- routing_query, access_token=access_token, category=category
+ routing_query,
+ access_token=access_token,
+ category=category,
+ priority_file_hashes=priority_file_hashes,
)
if tree_chunks:
- if session_id:
+ trace_id = str(uuid.uuid4())
+ for doc in tree_chunks:
+ meta = doc.metadata or {}
+ meta["trace_id"] = trace_id
+ meta["route_mode"] = query_mode
+ meta["route_class"] = route_class
+ meta["route_reason"] = route_reason
+ meta["selected_experts"] = route_info.get("selected_experts", [])
+ meta["expert_weights"] = route_info.get("expert_weights", {})
+ meta["doc_diagnostics"] = []
+ meta["failure_modes"] = []
+ meta["trace_started_at_ms"] = int(time.time() * 1000)
+ meta["embedding_variant"] = config.RETRIEVAL_EMBEDDING_VARIANT
+ meta["trace_quality"] = {
+ "route_class": route_class,
+ "route_reason": route_reason,
+ "retrieval_relevance_proxy": round(
+ sum(float((c.metadata or {}).get("relevance_score") or 0.0) for c in tree_chunks)
+ / max(1, len(tree_chunks)),
+ 4,
+ ),
+ "source_diversity": round(
+ len({str((c.metadata or {}).get("source") or "Unknown") for c in tree_chunks})
+ / max(1, len(tree_chunks)),
+ 4,
+ ),
+ "document_balance": 1.0,
+ "abstention_expected": False,
+ "identity_store_hit": False,
+ "identity_store_satisfied": False,
+ "history_injected": False,
+ "memory_injected": False,
+ "selected_page_numbers": sorted(
+ {
+ int(page)
+ for c in tree_chunks
+ for page in ((c.metadata or {}).get("page_numbers") or [])
+ if isinstance(page, int)
+ }
+ ),
+ "page_scope_required": route_class == "page_scoped",
+ "page_scope_supported": True,
+ "rerank_deltas": [],
+ "pre_rerank_candidates": [],
+ }
+ meta["candidate_counts"] = {}
+ doc.metadata = meta
+ if session_id and not eval_mode:
session_key = _session_cache_key(session_id, user_id=user_id)
- with _last_chunks_lock:
- _last_chunks[session_key] = tree_chunks
+ _remember_session_retrieval(
+ session_key=session_key,
+ query=routing_query,
+ chunks=tree_chunks,
+ )
return tree_chunks
log.info("Tree search yielded 0 results. Falling back to vector search.")
@@ -2840,6 +7080,7 @@ def retrieve_chunks_routed(
user_id=user_id,
original_query=original_query,
eval_mode=eval_mode,
+ priority_file_hashes=priority_file_hashes,
)
@@ -2866,28 +7107,45 @@ def run_query(
original_query=query,
)
+ trace_ctx = _extract_trace_context(chunks)
+ route_class = str(trace_ctx.get("route_class") or "factoid")
+ scoped_file_hashes = sorted(
+ {
+ str((chunk.metadata or {}).get("file_hash"))
+ for chunk in chunks
+ if (chunk.metadata or {}).get("file_hash")
+ }
+ )
+
# 2. Retrieve Episodic Memory (Semantic Search)
past_memories = []
- query_vector = get_cached_embedding(query)
- try:
- supabase = _build_supabase_client(access_token)
- mem_res = supabase.rpc(
- "match_memory",
- {
- "query_embedding": query_vector,
- "match_session_id": session_id,
- "match_count": 3,
- },
- ).execute()
- if mem_res.data:
- past_memories = mem_res.data
- log.info(
- "Retrieved %d episodic memories for session %s",
- len(past_memories),
- session_id[:8],
- )
- except Exception as e:
- log.warning("Could not retrieve episodic memory: %s", e)
+ if route_class not in {"exact_fact", "page_scoped"}:
+ query_vector = get_cached_embedding(query)
+ try:
+ resolved_user_id = _stable_user_id(None, access_token)
+ supabase = _build_service_supabase_client()
+ mem_res = supabase.rpc(
+ "match_memory",
+ {
+ "query_embedding": query_vector,
+ "match_session_id": session_id,
+ "match_count": 3,
+ "p_user_id": resolved_user_id,
+ },
+ ).execute()
+ for row in mem_res.data or []:
+ payload = _parse_memory_payload(str(row.get("content") or ""))
+ if payload and not _memory_scope_matches(payload, file_hashes=scoped_file_hashes):
+ continue
+ past_memories.append(row)
+ if past_memories:
+ log.info(
+ "Retrieved %d episodic memories for session %s",
+ len(past_memories),
+ session_id[:8],
+ )
+ except Exception as e:
+ log.warning("Could not retrieve episodic memory: %s", e)
# 3. Generate the Answer
answer, images = generate_answer(
@@ -2900,35 +7158,15 @@ def run_query(
# 4. Asynchronously Save New Memories
def save_memory():
try:
- supabase = _build_supabase_client(access_token)
- # Save User Query
- payload = {
- "session_id": session_id,
- "role": "user",
- "content": query,
- "embedding": query_vector,
- }
- if access_token:
- from backend.core.auth_utils import extract_jwt_sub
-
- payload["user_id"] = extract_jwt_sub(access_token)
- supabase.table("chat_memory").insert(payload).execute()
-
- # Save AI Answer (Extracting just the core text, ignoring citations)
- clean_answer = answer.split("\n\n---\n**Sources:**")[0]
- ans_vector = get_cached_embedding(clean_answer)
- payload2 = {
- "session_id": session_id,
- "role": "assistant",
- "content": clean_answer,
- "embedding": ans_vector,
- }
- if access_token:
- from backend.core.auth_utils import extract_jwt_sub
-
- payload2["user_id"] = extract_jwt_sub(access_token)
- supabase.table("chat_memory").insert(payload2).execute()
- log.debug("Saved Q&A to episodic memory for session %s", session_id[:8])
+ _save_to_memory(
+ session_id,
+ query,
+ answer,
+ access_token=access_token,
+ route_class=route_class,
+ chunks=chunks,
+ )
+ log.debug("Saved structured Q&A to episodic memory for session %s", session_id[:8])
except Exception as e:
log.error("Failed to save memory: %s", e)
diff --git a/backend/eval/run_eval.py b/backend/eval/run_eval.py
index d953e3f..5fc7452 100644
--- a/backend/eval/run_eval.py
+++ b/backend/eval/run_eval.py
@@ -75,6 +75,72 @@ def _load_from_supabase(
return res.data or []
+def load_feedback_dataset_candidates(
+ access_token: Optional[str],
+ user_id: Optional[str],
+ *,
+ limit: int = 50,
+) -> List[Dict[str, Any]]:
+ """
+ Promote explicit user feedback into dataset-shaped rows for offline eval curation.
+ These candidates are intentionally separate from `evaluation_datasets` so we can
+ review them before activation.
+ """
+ from backend.core.pipeline import _build_service_supabase_client
+
+ sb = _build_service_supabase_client()
+ feedback_q = (
+ sb.table("answer_feedback")
+ .select("trace_id, helpful, accepted, reason_code, correction_text, promote_to_eval, user_id")
+ .eq("promote_to_eval", True)
+ .limit(limit)
+ )
+ if user_id:
+ feedback_q = feedback_q.eq("user_id", user_id)
+ feedback_rows = feedback_q.execute().data or []
+ trace_ids = [row.get("trace_id") for row in feedback_rows if row.get("trace_id")]
+ if not trace_ids:
+ return []
+
+ trace_rows = (
+ sb.table("query_traces")
+ .select("trace_id, question, doc_diagnostics, failure_modes, answer_preview")
+ .in_("trace_id", trace_ids)
+ .execute()
+ .data
+ or []
+ )
+ trace_map = {row.get("trace_id"): row for row in trace_rows if row.get("trace_id")}
+
+ dataset_rows: List[Dict[str, Any]] = []
+ seen_trace_ids = set()
+ for feedback in feedback_rows:
+ trace_id = feedback.get("trace_id")
+ if trace_id in seen_trace_ids:
+ continue
+ trace = trace_map.get(trace_id, {})
+ question = (trace.get("question") or "").strip()
+ if not question:
+ continue
+ seen_trace_ids.add(trace_id)
+ correction_text = (feedback.get("correction_text") or "").strip()
+ answer_preview = (trace.get("answer_preview") or "").strip()
+ dataset_rows.append(
+ {
+ "question": question,
+ "gold_context_refs": [],
+ "gold_evidence_text": correction_text or answer_preview,
+ "is_answerable": bool(feedback.get("accepted") or feedback.get("helpful")),
+ "trace_id": trace_id,
+ "failure_modes": trace.get("failure_modes") or [],
+ "doc_diagnostics": trace.get("doc_diagnostics") or [],
+ "reason_code": feedback.get("reason_code"),
+ "source": "feedback_trace",
+ }
+ )
+ return dataset_rows
+
+
def _parse_csv_floats(s: str) -> List[float]:
return [float(x.strip()) for x in s.split(",") if x.strip()]
diff --git a/backend/main.py b/backend/main.py
index 503c193..ee60006 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -11,12 +11,17 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from starlette.requests import Request
+from backend.core.auth_utils import is_guest_token
def _rate_limit_key(request: Request) -> str:
- """Key rate limits by JWT token (per-user), fall back to IP."""
+ """Key rate limits by authenticated user token; guest workspaces fall back to stricter per-IP limits."""
token = request.headers.get("X-Auth-Token") or request.headers.get("Authorization")
- return token or get_remote_address(request)
+ if token and token.startswith("Bearer "):
+ token = token.split(" ", 1)[1]
+ if token and not is_guest_token(token):
+ return token
+ return get_remote_address(request)
limiter = Limiter(key_func=_rate_limit_key)
diff --git a/frontend/index.html b/frontend/index.html
index 34a1ee1..b910e2c 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -212,24 +212,26 @@
onclick="submitLogin()"
>SIGN IN →</button>
- <!-- Admin panel — collapsed by default -->
- <details style="margin-top:20px; width:100%;">
- <summary style="
- font-family:var(--font-mono); font-size:0.58rem;
- color:var(--muted); cursor:pointer; letter-spacing:0.12em;
- text-transform:uppercase; list-style:none; text-align:center;
- ">▸ Admin access</summary>
- <div style="margin-top:10px; display:flex; flex-direction:column; gap:6px;">
- <input type="password" id="adminKey" placeholder="Master admin key…" style="width:100%;box-sizing:border-box;"/>
- <button class="btn-secondary" onclick="submitAdmin()">GET TODAY'S CODE</button>
- <div id="adminResult" style="font-family:var(--font-mono);font-size:0.7rem;color:var(--phosphor);min-height:14px;text-align:center;"></div>
- <div id="auth-toggle-panel" style="display:none; margin-top:12px;">
- <div class="section-label">AUTH GATE</div>
- <p id="auth-toggle-label" style="font-size:0.72rem;color:var(--muted);margin-bottom:10px;"></p>
- <button id="auth-toggle-btn" onclick="toggleAuth()" style="width:100%;padding:9px;border-radius:6px;border:1px solid;font-family:var(--font-mono);font-size:0.72rem;cursor:pointer;letter-spacing:0.08em;transition:all 0.15s;">DISABLE AUTH</button>
- </div>
- </div>
- </details>
+ <button
+ id="guestBtn"
+ class="btn-secondary"
+ style="width:100%; letter-spacing:0.1em; margin-top:10px; display:none;"
+ onclick="submitGuest()"
+ >CONTINUE AS GUEST</button>
+ <label
+ id="guestPersistWrap"
+ style="display:none;width:100%;margin-top:10px;font-size:0.68rem;color:var(--muted);line-height:1.45;text-align:left;"
+ >
+ <input type="checkbox" id="guestPersist" style="margin-right:8px;accent-color:var(--phosphor);" />
+ Keep this guest workspace on this device
+ </label>
+ <div
+ id="guestInfo"
+ style="display:none;font-size:0.68rem;color:var(--muted);text-align:center;margin-top:8px;line-height:1.5;"
+ >
+ Guest mode is isolated and rate-limited. By default it expires when the guest session ends.
+ </div>
+
</div>
</div>
@@ -251,8 +253,19 @@
<button class="nav-btn" id="nav-chat" onclick="switchView('chat')">
CHAT
</button>
+ <button
+ class="nav-btn"
+ id="nav-admin"
+ onclick="switchView('admin')"
+ style="display: none"
+ >
+ ADMIN
+ </button>
</nav>
<div class="topbar-right">
+ <div class="stat-pill" id="session-mode-pill" style="display:none;">
+ MODE <span class="val" id="session-mode-label">GUEST</span>
+ </div>
<div class="stat-pill">
DOCS <span class="val" id="stat-docs">0</span>
</div>
@@ -263,6 +276,7 @@
<div class="conn-dot offline" id="conn-dot"></div>
<span id="conn-label">OFFLINE</span>
</div>
+ <button onclick="unlockOperatorTools()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">OPERATOR</button>
<button onclick="signOut()" style="font-family:var(--font-mono);font-size:0.6rem;letter-spacing:0.1em;padding:4px 10px;border:1px solid var(--muted);border-radius:4px;background:transparent;color:var(--muted);cursor:pointer;transition:color 0.15s,border-color 0.15s;" onmouseover="this.style.color='var(--phosphor)';this.style.borderColor='var(--phosphor)';" onmouseout="this.style.color='var(--muted)';this.style.borderColor='var(--muted)';">SIGN OUT</button>
</div>
</header>
@@ -486,6 +500,43 @@
</button>
</div>
</div>
+
+ <!-- ── ADMIN VIEW ── -->
+ <div class="view" id="view-admin">
+ <div class="view-header">
+ <div class="view-title">ADMIN REVIEW</div>
+ <div class="view-subtitle">Trace triage, feedback, and eval promotion</div>
+ </div>
+ <div class="view-body" style="padding-top: 12px">
+ <div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:14px;">
+ <input type="text" id="adminTraceFailure" placeholder="failure mode" style="flex:1;min-width:120px;" />
+ <input type="text" id="adminTraceCategory" placeholder="category" style="flex:1;min-width:120px;" />
+ <select id="adminTraceRoute" style="flex:1;min-width:120px;">
+ <option value="">All routes</option>
+ <option value="default">default</option>
+ <option value="single">single</option>
+ <option value="generic_pinned">generic_pinned</option>
+ <option value="explicit_compare">explicit_compare</option>
+ </select>
+ <button class="btn-secondary" onclick="refreshAdminDashboard()">REFRESH</button>
+ </div>
+ <div id="adminSummary" style="font-size:0.78rem;color:var(--muted);margin-bottom:12px;"></div>
+ <div style="display:grid;gap:14px;">
+ <div>
+ <div class="section-label">Recent Traces</div>
+ <div id="adminTraceList"></div>
+ </div>
+ <div>
+ <div class="section-label">Trace Detail</div>
+ <div id="adminTraceDetail"></div>
+ </div>
+ <div>
+ <div class="section-label">Recent Feedback</div>
+ <div id="adminFeedbackList"></div>
+ </div>
+ </div>
+ </div>
+ </div>
</aside>
<!-- Mobile bottom navigation — must be inside #app for grid to work -->
<div id="mobile-nav">
@@ -525,7 +576,8 @@
<script src="js/corpus.js"></script>
<script src="js/inspect.js"></script>
<script src="js/chat.js?v=3"></script>
- <script src="js/main.js"></script>
+ <script src="js/admin.js?v=1"></script>
+ <script src="js/main.js?v=1"></script>
<script>
function mobileNav(tab) {
document
@@ -563,4 +615,4 @@
}
</script>
</body>
-</html>
\ No newline at end of file
+</html>
diff --git a/frontend/js/api.js b/frontend/js/api.js
index 7e9b59c..79c2676 100644
--- a/frontend/js/api.js
+++ b/frontend/js/api.js
@@ -18,13 +18,38 @@
*/
async function getSupabaseToken() {
try {
- const { data } = await supabaseClient.auth.getSession();
+ const client = await initSupabase();
+ if (!client?.auth) return null;
+ const { data } = await client.auth.getSession();
return data.session?.access_token ?? null;
} catch {
return null;
}
}
+async function getSupabaseSession() {
+ try {
+ const client = await initSupabase();
+ if (!client?.auth) return null;
+ const { data } = await client.auth.getSession();
+ return data.session ?? null;
+ } catch {
+ return null;
+ }
+}
+
+async function isGuestSession() {
+ const session = await getSupabaseSession();
+ const appMeta = session?.user?.app_metadata || {};
+ const provider = String(appMeta.provider || '').toLowerCase();
+ return Boolean(
+ session?.user?.is_anonymous ||
+ appMeta.is_anonymous ||
+ provider === 'anonymous' ||
+ (Array.isArray(appMeta.providers) && appMeta.providers.includes('anonymous'))
+ );
+}
+
// ── Core fetch wrapper ────────────────────────────────────────────────────────
async function apiFetch(path, opts = {}) {
// Always pull a fresh token — Supabase auto-refreshes silently.
@@ -41,7 +66,7 @@ async function apiFetch(path, opts = {}) {
if (!res.ok) {
let detail = `HTTP ${res.status}`;
- try { detail = (await res.json()).detail || detail; } catch {}
+ try { detail = (await res.json()).detail || detail; } catch { }
throw new Error(detail);
}
@@ -55,7 +80,7 @@ async function apiVerifyPassword(password) {
// Token injection is handled by apiFetch — no sessionStorage involved.
const data = await apiFetch('/api/v1/auth/verify', {
method: 'POST',
- body: JSON.stringify({ password }),
+ body: JSON.stringify({ password }),
});
return data;
}
@@ -63,7 +88,68 @@ async function apiVerifyPassword(password) {
async function apiVerifyAdmin(key) {
return apiFetch('/api/v1/auth/admin', {
method: 'POST',
- body: JSON.stringify({ password: key }),
+ body: JSON.stringify({ password: key }),
+ });
+}
+
+async function apiCleanupGuestWorkspace() {
+ return apiFetch('/api/v1/auth/guest-workspace', {
+ method: 'DELETE',
+ });
+}
+
+async function apiAdminFetch(path, adminKey, opts = {}) {
+ if (!adminKey) throw new Error('Admin key required.');
+ return apiFetch(path, {
+ ...opts,
+ headers: {
+ 'X-Admin-Key': adminKey,
+ ...(opts.headers || {}),
+ },
+ });
+}
+
+async function apiAdminListTraces(adminKey, params = {}) {
+ const qs = new URLSearchParams();
+ Object.entries(params).forEach(([key, value]) => {
+ if (value !== null && value !== undefined && value !== '') qs.set(key, String(value));
+ });
+ return apiAdminFetch(`/api/v1/admin/traces${qs.toString() ? `?${qs}` : ''}`, adminKey);
+}
+
+async function apiAdminGetTrace(adminKey, traceId) {
+ return apiAdminFetch(`/api/v1/admin/traces/${traceId}`, adminKey);
+}
+
+async function apiAdminReviewTrace(adminKey, traceId, payload) {
+ return apiAdminFetch(`/api/v1/admin/traces/${traceId}/review`, adminKey, {
+ method: 'POST',
+ body: JSON.stringify(payload),
+ });
+}
+
+async function apiAdminListFeedback(adminKey, params = {}) {
+ const qs = new URLSearchParams();
+ Object.entries(params).forEach(([key, value]) => {
+ if (value !== null && value !== undefined && value !== '') qs.set(key, String(value));
+ });
+ return apiAdminFetch(`/api/v1/admin/feedback${qs.toString() ? `?${qs}` : ''}`, adminKey);
+}
+
+async function apiAdminGetFeedback(adminKey, feedbackId) {
+ return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}`, adminKey);
+}
+
+async function apiAdminReviewFeedback(adminKey, feedbackId, payload) {
+ return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}/review`, adminKey, {
+ method: 'POST',
+ body: JSON.stringify(payload),
+ });
+}
+
+async function apiAdminPromoteFeedback(adminKey, feedbackId) {
+ return apiAdminFetch(`/api/v1/admin/feedback/${feedbackId}/promote`, adminKey, {
+ method: 'POST',
});
}
@@ -75,14 +161,14 @@ async function apiLoadFiles() {
async function apiOverrideCategory(fileHash, newCategory) {
return apiFetch('/api/v1/corpus/recategorise', {
method: 'POST',
- body: JSON.stringify({ file_hash: fileHash, new_category: newCategory }),
+ body: JSON.stringify({ file_hash: fileHash, new_category: newCategory }),
});
}
async function apiRenameDocument(fileHash, newName) {
return apiFetch('/api/v1/corpus/rename', {
method: 'POST',
- body: JSON.stringify({ file_hash: fileHash, new_name: newName }),
+ body: JSON.stringify({ file_hash: fileHash, new_name: newName }),
});
}
@@ -90,6 +176,13 @@ async function apiDeleteDocument(fileHash) {
return apiFetch(`/api/v1/corpus/${fileHash}`, { method: 'DELETE' });
}
+async function apiSubmitAnswerFeedback(payload) {
+ return apiFetch('/api/v1/query/feedback', {
+ method: 'POST',
+ body: JSON.stringify(payload),
+ });
+}
+
// ── Ingest ────────────────────────────────────────────────────────────────────
async function apiIngestFile(file) {
// multipart/form-data — cannot go through apiFetch (no JSON body),
@@ -100,15 +193,15 @@ async function apiIngestFile(file) {
formData.append('file', file);
const res = await fetch(`${CONFIG.API_URL}/api/v1/ingest/upload`, {
- method: 'POST',
+ method: 'POST',
headers: token ? { 'X-Auth-Token': token } : {},
- body: formData,
+ body: formData,
});
if (res.status === 409) throw new Error('already_ingested');
if (!res.ok) {
let detail = `HTTP ${res.status}`;
- try { detail = (await res.json()).detail || detail; } catch {}
+ try { detail = (await res.json()).detail || detail; } catch { }
throw new Error(detail);
}
return res.json();
@@ -119,41 +212,42 @@ async function apiIngestStatus(taskId) {
}
// ── Query ─────────────────────────────────────────────────────────────────────
-async function apiQuery(query, category, history, sessionId, alpha, callbacks) {
+async function apiQuery(query, category, history, sessionId, alpha, callbacks, pinnedFiles) {
/**
* SSE streaming query.
* callbacks = {
* onToken(text) — called for each streamed token
- * onDone(sources, images) — called when stream ends
+ * onDone({ sources, images, traceId, docDiagnostics }) — called when stream ends
* onError(msg) — called on error
* }
*/
const token = await getSupabaseToken(); // ← Supabase JWT
const res = await fetch(`${CONFIG.API_URL}/api/v1/query`, {
- method: 'POST',
+ method: 'POST',
headers: {
'Content-Type': 'application/json',
...(token ? { 'X-Auth-Token': token } : {}),
},
body: JSON.stringify({
query,
- category: category || 'All',
- history: history || [],
- session_id: sessionId || 'default_session',
- alpha: alpha ?? 0.5,
+ category: category || 'All',
+ history: history || [],
+ session_id: sessionId || 'default_session',
+ alpha: alpha ?? 0.5,
+ priority_file_hashes: pinnedFiles || [],
}),
});
if (!res.ok) {
let detail = `HTTP ${res.status}`;
- try { detail = (await res.json()).detail || detail; } catch {}
+ try { detail = (await res.json()).detail || detail; } catch { }
throw new Error(detail);
}
- const reader = res.body.getReader();
+ const reader = res.body.getReader();
const decoder = new TextDecoder();
- let buffer = '';
+ let buffer = '';
while (true) {
const { done, value } = await reader.read();
@@ -169,10 +263,18 @@ async function apiQuery(query, category, history, sessionId, alpha, callbacks) {
if (!raw) continue;
try {
const event = JSON.parse(raw);
- if (event.type === 'token' && callbacks?.onToken) callbacks.onToken(event.content);
- else if (event.type === 'done' && callbacks?.onDone) callbacks.onDone(event.sources || [], event.images || []);
- else if (event.type === 'error' && callbacks?.onError) callbacks.onError(event.content);
- } catch {}
+ if (event.type === 'token' && callbacks?.onToken) callbacks.onToken(event.content);
+ else if (event.type === 'done' && callbacks?.onDone) {
+ callbacks.onDone({
+ sources: event.sources || [],
+ images: event.images || [],
+ traceId: event.trace_id || null,
+ docDiagnostics: event.doc_diagnostics || [],
+ });
+ }
+ else if (event.type === 'error' && callbacks?.onError) callbacks.onError(event.content);
+ else if (event.type === 'clarification_options' && callbacks?.onOptions) callbacks.onOptions(event.options);
+ } catch { }
}
}
-}
\ No newline at end of file
+}
diff --git a/frontend/js/chat.js b/frontend/js/chat.js
index 38f4be6..42d58cb 100644
--- a/frontend/js/chat.js
+++ b/frontend/js/chat.js
@@ -10,7 +10,7 @@
lb.style.cssText = `display:none;position:fixed;inset:0;background:rgba(0,0,0,0.88);
z-index:9998;align-items:center;justify-content:center;cursor:zoom-out;
backdrop-filter:blur(4px);`;
- lb.innerHTML = `
+ lb.innerHTML = `
<button id="img-lightbox-close"
onclick="event.stopPropagation(); document.getElementById('img-lightbox').style.display='none'">
@@ -34,14 +34,14 @@ function renderMarkdown(text) {
let inUL = false;
let inOL = false;
- const closeUL = () => { if (inUL) { html += '</ul>'; inUL = false; } };
- const closeOL = () => { if (inOL) { html += '</ol>'; inOL = false; } };
+ const closeUL = () => { if (inUL) { html += '</ul>'; inUL = false; } };
+ const closeOL = () => { if (inOL) { html += '</ol>'; inOL = false; } };
const inline = (str) => str
.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
.replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
- .replace(/\*(.+?)\*/g, '<em>$1</em>')
- .replace(/`([^`]+)`/g, '<code class="inline-code">$1</code>')
+ .replace(/\*(.+?)\*/g, '<em>$1</em>')
+ .replace(/`([^`]+)`/g, '<code class="inline-code">$1</code>')
.replace(/\[Source (\d+)\]/g,
'<span class="source-ref">[S$1]</span>');
@@ -100,6 +100,74 @@ function renderMarkdown(text) {
return html;
}
+function renderDocDiagnostics(docDiagnostics) {
+ if (!Array.isArray(docDiagnostics) || docDiagnostics.length === 0) return '';
+ const rows = docDiagnostics.map(diag => {
+ const score = diag.doc_score != null ? `${Math.round(diag.doc_score * 100)}%` : 'n/a';
+ const reason = diag.reason || 'unknown';
+ const status = diag.included ? 'included' : 'excluded';
+ return `
+ <div style="display:flex;justify-content:space-between;gap:12px;padding:8px 10px;border:1px solid #243142;border-radius:8px;background:rgba(10,18,32,0.55);margin-top:8px;">
+ <div>
+ <div style="font-weight:600;color:#dbeafe">${esc(diag.source || diag.file_hash || 'Unknown')}</div>
+ <div style="font-size:0.85em;color:#94a3b8">${esc(status)} · ${esc(reason)} · candidates ${Number(diag.candidate_count ?? 0)}</div>
+ </div>
+ <div style="font-size:0.85em;color:#cbd5e1;white-space:nowrap">${esc(diag.confidence_label || 'unknown')} · ${esc(score)}</div>
+ </div>
+ `;
+ }).join('');
+ return `
+ <div style="margin-top:12px;padding:12px;border:1px solid #22304a;border-radius:10px;background:rgba(7,12,24,0.72);">
+ <div style="font-size:0.8em;letter-spacing:0.14em;text-transform:uppercase;color:#7dd3fc;">Retrieval Diagnostics</div>
+ ${rows}
+ </div>
+ `;
+}
+
+function attachFeedbackControls(container, traceId) {
+ if (!traceId) return;
+ const bar = document.createElement('div');
+ bar.style.cssText = 'display:flex;flex-wrap:wrap;gap:8px;margin-top:12px;';
+
+ const disableAll = () => {
+ Array.from(bar.querySelectorAll('button')).forEach(btn => { btn.disabled = true; btn.style.opacity = '0.65'; });
+ };
+
+ const makeBtn = (label, handler) => {
+ const btn = document.createElement('button');
+ btn.textContent = label;
+ btn.style.cssText = 'background:rgba(255,255,255,0.05);border:1px solid #334155;color:var(--fg);padding:7px 12px;border-radius:8px;font-size:0.85em;cursor:pointer;';
+ btn.onclick = async () => {
+ try {
+ await handler();
+ disableAll();
+ toast('Feedback saved.', 'success');
+ } catch (err) {
+ toast(err?.message || 'Could not save feedback.', 'error');
+ }
+ };
+ return btn;
+ };
+
+ bar.appendChild(makeBtn('Helpful', async () => {
+ await apiSubmitAnswerFeedback({ trace_id: traceId, helpful: true });
+ }));
+ bar.appendChild(makeBtn('Not Helpful', async () => {
+ const note = window.prompt('What went wrong? You can add a short reason or a correction.', '') || '';
+ await apiSubmitAnswerFeedback({
+ trace_id: traceId,
+ helpful: false,
+ reason_code: note ? 'user_reported_issue' : 'needs_improvement',
+ correction_text: note || null,
+ });
+ }));
+ bar.appendChild(makeBtn('Save Answer', async () => {
+ await apiSubmitAnswerFeedback({ trace_id: traceId, helpful: true, accepted: true });
+ }));
+
+ container.appendChild(bar);
+}
+
// ── Chat core ─────────────────────────────────────────────
// Debounce guard — prevents double-submit on rapid Enter + button click
@@ -111,7 +179,7 @@ async function sendChat() {
_lastSendTime = now;
const input = document.getElementById('chatInput');
- const msg = input.value.trim();
+ const msg = input.value.trim();
if (!msg || STATE.isThinking) return;
input.value = '';
autoResize(input);
@@ -122,15 +190,15 @@ async function sendChat() {
document.getElementById('chatSend').disabled = true;
const category = document.getElementById('chatFilterSelect').value;
- const history = STATE.chatHistory.slice(-CONFIG.CHAT_HISTORY_TURNS);
+ const history = STATE.chatHistory.slice(-CONFIG.CHAT_HISTORY_TURNS);
// Create assistant bubble immediately — will be filled by stream
const assistantDiv = appendMsg('assistant', '', [], []);
- const bubble = assistantDiv.querySelector('.msg-bubble');
- bubble.innerHTML = '<div class="thinking-dots"><span></span><span></span><span></span></div>';
+ const bubble = assistantDiv.querySelector('.msg-bubble');
+ bubble.innerHTML = '<div class="thinking-dots"><span></span><span></span><span></span></div>';
- let fullText = '';
- let started = false;
+ let fullText = '';
+ let started = false;
try {
await apiQuery(msg, category, history, STATE.sessionId, STATE.alpha, {
@@ -142,11 +210,11 @@ async function sendChat() {
fullText += token;
bubble.innerHTML = renderMarkdown(fullText);
// Auto scroll
- document.getElementById('chatMessages').scrollTop =
- document.getElementById('chatMessages').scrollHeight;
- await new Promise(r => setTimeout(r, 0));
+ document.getElementById('chatMessages').scrollTop =
+ document.getElementById('chatMessages').scrollHeight;
+ await new Promise(r => setTimeout(r, 0));
},
- onDone(sources, images) {
+ onDone({ sources, images, traceId, docDiagnostics }) {
// Finalize markdown render
bubble.innerHTML = renderMarkdown(fullText);
STATE.chatHistory.push({ role: 'assistant', content: fullText });
@@ -162,11 +230,11 @@ async function sendChat() {
// Append sources
if (visibleSources.length > 0) {
- const n = visibleSources.length;
+ const n = visibleSources.length;
const chips = visibleSources.map(s => {
- const score = s.score != null ? Math.round(s.score * 100) : null;
+ const score = s.score != null ? Math.round(s.score * 100) : null;
const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
- const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
+ const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
return `<div class="source-chip ${cls}">
<div class="source-chip-header">
<span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
@@ -180,12 +248,18 @@ async function sendChat() {
<button class="sources-toggle" onclick="
const p=this.nextElementSibling;
const open=p.classList.toggle('open');
- this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n>1?'s':''}';
+ this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n > 1 ? 's' : ''}';
">▼ show ${n} source${n > 1 ? 's' : ''}</button>
<div class="sources-panel">${chips}</div>`;
assistantDiv.appendChild(srcEl);
}
+ if (docDiagnostics && docDiagnostics.length > 0) {
+ const diagEl = document.createElement('div');
+ diagEl.innerHTML = renderDocDiagnostics(docDiagnostics);
+ assistantDiv.appendChild(diagEl);
+ }
+
// Append images
if (images.length > 0) {
const uniqueImages = [...new Set(images)];
@@ -199,6 +273,75 @@ async function sendChat() {
assistantDiv.appendChild(imgEl);
}
+ attachFeedbackControls(assistantDiv, traceId);
+
+ const el = document.getElementById('chatMessages');
+ el.scrollTop = el.scrollHeight;
+ },
+ onOptions(options) {
+ // Render inline choice buttons
+ const btnContainer = document.createElement('div');
+ btnContainer.style.cssText = 'display:flex;flex-direction:row;flex-wrap:wrap;gap:8px;margin-top:12px;';
+
+ const syncGraphPinStyles = () => {
+ const d3 = window.d3;
+ if (!d3) return;
+
+ d3.selectAll('.node')
+ .filter(d => d && d.type === 'document')
+ .select('circle')
+ .attr('stroke', d => STATE.pinnedFiles.includes(d.file_hash) ? '#ffffff' : d.color)
+ .attr('stroke-width', d => STATE.pinnedFiles.includes(d.file_hash) ? 3 : 1.5)
+ .attr('filter', d => {
+ if (!STATE.pinnedFiles.includes(d.file_hash)) return null;
+ const idx = STATE.categories.indexOf(d.category);
+ return idx >= 0 ? `url(#glow-${idx})` : null;
+ });
+ };
+
+ options.forEach(opt => {
+ const btn = document.createElement('button');
+ btn.textContent = opt.label;
+ btn.style.cssText = `
+ background: rgba(255, 255, 255, 0.05);
+ border: 1px solid #334155;
+ color: var(--fg);
+ padding: 8px 16px;
+ border-radius: 6px;
+ font-size: 0.9em;
+ cursor: pointer;
+ transition: all 0.2s;
+ `;
+ btn.onmouseover = () => {
+ btn.style.background = 'rgba(255, 255, 255, 0.1)';
+ btn.style.borderColor = 'var(--text-glow)';
+ };
+ btn.onmouseout = () => {
+ btn.style.background = 'rgba(255, 255, 255, 0.05)';
+ btn.style.borderColor = '#334155';
+ };
+
+ btn.onclick = () => {
+ // 1) Apply selected routing scope (single-doc or multi-doc)
+ const selectedHashes = opt.mode === 'all'
+ ? (Array.isArray(opt.file_hashes) ? opt.file_hashes.filter(Boolean) : [])
+ : (opt.file_hash ? [opt.file_hash] : []);
+ STATE.pinnedFiles = [...new Set(selectedHashes)];
+ syncGraphPinStyles();
+
+ // 2) Hide the buttons
+ btnContainer.style.display = 'none';
+
+ // 3) Resubmit the query now that it has a pin
+ const input = document.getElementById('chatInput');
+ input.value = msg; // original msg
+ document.getElementById('chatSend').click();
+ };
+ btnContainer.appendChild(btn);
+ });
+
+ assistantDiv.appendChild(btnContainer);
+
const el = document.getElementById('chatMessages');
el.scrollTop = el.scrollHeight;
},
@@ -215,7 +358,7 @@ async function sendChat() {
}
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">${esc(errMsg)}</p>`;
},
- });
+ }, STATE.pinnedFiles);
} catch (e) {
bubble.innerHTML = `<p class="msg-p" style="color:var(--red)">Request failed: ${esc(e.message)}</p>`;
} finally {
@@ -225,7 +368,7 @@ async function sendChat() {
}
function appendMsg(role, text, sources = [], images = []) {
- const el = document.getElementById('chatMessages');
+ const el = document.getElementById('chatMessages');
const div = document.createElement('div');
div.className = `msg ${role}`;
const n = sources.length;
@@ -237,11 +380,11 @@ function appendMsg(role, text, sources = [], images = []) {
imgHtml = `
<div style="display:flex; flex-direction:row; gap:10px; margin-top:12px; width:100%; overflow-x:auto; padding-bottom:8px;">
${uniqueImages.map(img => {
- const src = img.startsWith('data:') || img.startsWith('http')
- ? img
- : `data:image/jpeg;base64,${img}`;
- return `<img src="${src}" style="max-height: 220px; max-width: 100%; object-fit: contain; border-radius: 8px; background: white; border: 1px solid #334155; cursor: zoom-in;" onclick="openLightbox(this.src)">`;
- }).join('')}
+ const src = img.startsWith('data:') || img.startsWith('http')
+ ? img
+ : `data:image/jpeg;base64,${img}`;
+ return `<img src="${src}" style="max-height: 220px; max-width: 100%; object-fit: contain; border-radius: 8px; background: white; border: 1px solid #334155; cursor: zoom-in;" onclick="openLightbox(this.src)">`;
+ }).join('')}
</div>`;
}
@@ -249,9 +392,9 @@ function appendMsg(role, text, sources = [], images = []) {
let srcHtml = '';
if (n > 0) {
const chips = sources.map(s => {
- const score = s.score != null ? Math.round(s.score * 100) : null;
+ const score = s.score != null ? Math.round(s.score * 100) : null;
const scoreEl = score != null ? `<span class="source-chip-score">${score}%</span>` : '';
- const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
+ const cls = score == null ? '' : score >= 70 ? '' : score >= 40 ? 'medium' : 'low';
return `<div class="source-chip ${cls}">
<div class="source-chip-header">
<span class="source-chip-name">${esc(s.source)} · chunk ${s.chunk || '?'}</span>
@@ -264,7 +407,7 @@ function appendMsg(role, text, sources = [], images = []) {
<button class="sources-toggle" onclick="
const p=this.nextElementSibling;
const open=p.classList.toggle('open');
- this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n>1?'s':''}';
+ this.textContent=(open?'▲ hide':'▼ show')+' ${n} source${n > 1 ? 's' : ''}';
">▼ show ${n} source${n > 1 ? 's' : ''}</button>
<div class="sources-panel">${chips}</div>`;
}
@@ -284,7 +427,7 @@ function appendMsg(role, text, sources = [], images = []) {
}
function appendThinking() {
- const el = document.getElementById('chatMessages');
+ const el = document.getElementById('chatMessages');
const div = document.createElement('div');
div.className = 'msg assistant';
div.innerHTML = `
diff --git a/frontend/js/config.js b/frontend/js/config.js
index 912e286..3fb2253 100644
--- a/frontend/js/config.js
+++ b/frontend/js/config.js
@@ -2,14 +2,48 @@ const CONFIG = {
API_URL: '',
CAT_PALETTE: ['#00ff88','#4a9eff','#f5a623','#ff6b9d','#a78bfa','#34d399','#fb923c','#60a5fa'],
CHAT_HISTORY_TURNS: 6,
+ GUEST_ENABLED: true,
};
// Supabase client — keys loaded from backend, never hardcoded here
let supabaseClient = null;
+let supabaseReady = null;
async function initSupabase() {
- const res = await fetch('/api/v1/config');
- const cfg = await res.json();
- const { createClient } = supabase;
- supabaseClient = createClient(cfg.supabase_url, cfg.supabase_anon);
-}
\ No newline at end of file
+ if (supabaseClient?.auth) return supabaseClient;
+ if (supabaseReady) return supabaseReady;
+
+ supabaseReady = (async () => {
+ try {
+ const res = await fetch('/api/v1/config', { cache: 'no-store' });
+ if (!res.ok) {
+ throw new Error(`Config endpoint failed (${res.status})`);
+ }
+
+ const cfg = await res.json();
+ const createClient = window.supabase?.createClient;
+ if (typeof createClient !== 'function') {
+ throw new Error('Supabase browser SDK failed to load.');
+ }
+ if (!cfg?.supabase_url || !cfg?.supabase_anon) {
+ throw new Error('Supabase frontend config is missing.');
+ }
+
+ CONFIG.GUEST_ENABLED = cfg?.guest_enabled !== false;
+ const client = createClient(cfg.supabase_url, cfg.supabase_anon);
+ if (!client?.auth) {
+ throw new Error('Supabase auth client failed to initialize.');
+ }
+
+ supabaseClient = client;
+ window.supabaseClient = client;
+ return client;
+ } catch (err) {
+ supabaseClient = null;
+ supabaseReady = null;
+ throw err;
+ }
+ })();
+
+ return supabaseReady;
+}
diff --git a/frontend/js/corpus.js b/frontend/js/corpus.js
index b6e413d..031bce1 100644
--- a/frontend/js/corpus.js
+++ b/frontend/js/corpus.js
@@ -3,6 +3,30 @@
* Document list, upload (real FastAPI call), category review.
*/
+const ACTIVE_INGEST_KEY = 'morpheus_active_ingest';
+let ACTIVE_INGEST_PROMISE = null;
+
+function saveActiveIngest(taskId, filename) {
+ localStorage.setItem(ACTIVE_INGEST_KEY, JSON.stringify({
+ taskId,
+ filename,
+ savedAt: Date.now(),
+ }));
+}
+
+function loadActiveIngest() {
+ try {
+ const raw = localStorage.getItem(ACTIVE_INGEST_KEY);
+ return raw ? JSON.parse(raw) : null;
+ } catch {
+ return null;
+ }
+}
+
+function clearActiveIngest() {
+ localStorage.removeItem(ACTIVE_INGEST_KEY);
+}
+
// ── Doc list ──────────────────────────────────────────────────────────────────
function renderDocList() {
const el = document.getElementById('docList');
@@ -81,6 +105,7 @@ async function processUpload(file) {
try {
const queued = await apiIngestFile(file);
// queued = {task_id, filename, message}
+ saveActiveIngest(queued.task_id, queued.filename || file.name);
setProgress(20, 'Queued — processing in background…');
@@ -92,20 +117,71 @@ async function processUpload(file) {
setProgress(100, 'Complete!');
setTimeout(() => pc.classList.remove('visible'), 1500);
+ clearActiveIngest();
- if (result && result.file_hash) {
+ if (result && result.recovered_existing) {
+ toast('Recovered previous upload without recomputing.', 'success');
+ } else if (result && result.file_hash) {
showCategoryReview(result.file_hash, result.filename, result.document_type);
}
await refreshCorpus();
} catch (err) {
- pc.classList.remove('visible');
+ if (err.message === 'already_ingested' || err.message === 'Ingestion failed') {
+ clearActiveIngest();
+ pc.classList.remove('visible');
+ }
if (err.message === 'already_ingested') toast('Already ingested — skipped', 'error');
else toast('Ingestion failed: ' + err.message, 'error');
}
document.getElementById('fileInput').value = '';
}
+async function resumeActiveIngestionIfNeeded() {
+ if (ACTIVE_INGEST_PROMISE) return ACTIVE_INGEST_PROMISE;
+ const active = loadActiveIngest();
+ if (!active || !active.taskId) return null;
+
+ const pc = document.getElementById('progressCard');
+ pc.classList.add('visible');
+ document.getElementById('progressFilename').textContent = active.filename || 'Uploading PDF';
+ setProgress(25, 'Reconnecting to active ingestion…');
+
+ ACTIVE_INGEST_PROMISE = (async () => {
+ try {
+ const result = await pollIngestStatus(active.taskId, (step, total, msg) => {
+ const pct = Math.round((step / total) * 80) + 20;
+ setProgress(pct, msg);
+ });
+
+ clearActiveIngest();
+ setProgress(100, 'Complete!');
+ setTimeout(() => pc.classList.remove('visible'), 1500);
+
+ if (result && result.recovered_existing) {
+ toast('Recovered previous upload without recomputing.', 'success');
+ } else if (result && result.file_hash) {
+ showCategoryReview(result.file_hash, result.filename, result.document_type);
+ }
+ await refreshCorpus();
+ return result;
+ } catch (err) {
+ if (err.message === 'already_ingested' || err.message === 'Ingestion failed') {
+ clearActiveIngest();
+ pc.classList.remove('visible');
+ }
+ if (err.message === 'already_ingested') {
+ await refreshCorpus();
+ }
+ throw err;
+ } finally {
+ ACTIVE_INGEST_PROMISE = null;
+ }
+ })();
+
+ return ACTIVE_INGEST_PROMISE;
+}
+
async function pollIngestStatus(taskId, onProgress) {
// No hard timeout — poll until COMPLETED or FAILED.
// A large PDF with AI vision summaries can take 5-10 minutes on free-tier
@@ -218,4 +294,4 @@ function populateFilterDropdowns() {
const sel = document.getElementById('chatFilterSelect');
sel.innerHTML = '<option value="All">All Categories</option>' +
STATE.categories.map(c => `<option value="${c}">${c.replace(/_/g,' ')}</option>`).join('');
-}
\ No newline at end of file
+}
diff --git a/frontend/js/graph.js b/frontend/js/graph.js
index 3346a6b..5ce5d12 100644
--- a/frontend/js/graph.js
+++ b/frontend/js/graph.js
@@ -17,10 +17,10 @@
*/
function renderGraph() {
- const svg = d3.select('#graph-svg');
+ const svg = d3.select('#graph-svg');
const panel = document.getElementById('graph-panel');
- const W = panel.clientWidth;
- const H = panel.clientHeight;
+ const W = panel.clientWidth;
+ const H = panel.clientHeight;
const empty = document.getElementById('graph-empty');
svg.selectAll('*').remove();
@@ -37,12 +37,12 @@ function renderGraph() {
STATE.categories.forEach(cat => {
nodes.push({
- id: `cat::${cat}`,
- type: 'category',
+ id: `cat::${cat}`,
+ type: 'category',
label: cat.replace(/_/g, ' '),
- raw: cat,
+ raw: cat,
color: STATE.catColors[cat],
- r: 26,
+ r: 26,
pinned: false,
count: STATE.files.filter(f => (f.document_type || 'uncategorised') === cat).length,
});
@@ -51,26 +51,26 @@ function renderGraph() {
STATE.files.forEach(f => {
const cat = f.document_type || 'uncategorised';
nodes.push({
- id: `doc::${f.file_hash}`,
- type: 'document',
- label: f.filename,
+ id: `doc::${f.file_hash}`,
+ type: 'document',
+ label: f.filename,
file_hash: f.file_hash,
- category: cat,
- color: STATE.catColors[cat] || '#4a9eff',
- r: 7,
- pinned: false,
- chunks: f.chunk_count,
- ingested: (f.ingested_at || '').slice(0, 10),
+ category: cat,
+ color: STATE.catColors[cat] || '#4a9eff',
+ r: 7,
+ pinned: false,
+ chunks: f.chunk_count,
+ ingested: (f.ingested_at || '').slice(0, 10),
});
links.push({ source: `cat::${cat}`, target: `doc::${f.file_hash}` });
});
// ── Zoom + pan ─────────────────────────────────────────
- const g = svg.append('g');
+ const g = svg.append('g');
const zoom = d3.zoom()
- .scaleExtent([0.3, 3])
- // scroll to zoom only, no drag-to-pan
- .on('zoom', e => g.attr('transform', e.transform));
+ .scaleExtent([0.3, 3])
+ // scroll to zoom only, no drag-to-pan
+ .on('zoom', e => g.attr('transform', e.transform));
svg.call(zoom).on('dblclick.zoom', null);
STATE.svgZoom = { zoom, svg };
@@ -102,29 +102,56 @@ function renderGraph() {
.style('cursor', 'pointer')
.call(d3.drag()
.on('start', (e, d) => {
- if (!e.active) STATE.simulation.alphaTarget(0.3).restart();
- d.fx = d.x; d.fy = d.y;
- d._lastX = d.x; d._lastY = d.y;
-})
+ if (!e.active) STATE.simulation.alphaTarget(0.3).restart();
+ d.fx = d.x; d.fy = d.y;
+ d._lastX = d.x; d._lastY = d.y;
+ })
.on('drag', (e, d) => {
- d._vx = e.x - (d._lastX || e.x);
- d._vy = e.y - (d._lastY || e.y);
- d._lastX = e.x; d._lastY = e.y;
- d.fx = e.x; d.fy = e.y;
-})
+ d._vx = e.x - (d._lastX || e.x);
+ d._vy = e.y - (d._lastY || e.y);
+ d._lastX = e.x; d._lastY = e.y;
+ d.fx = e.x; d.fy = e.y;
+ })
.on('end', (e, d) => {
- if (!e.active) STATE.simulation.alphaTarget(0.05);
- if (!d.pinned) {
- d.fx = null; d.fy = null;
- d.vx = (d._vx || 0) * 3;
- d.vy = (d._vy || 0) * 3;
- STATE.simulation.alphaTarget(0.3).restart();
- setTimeout(() => STATE.simulation.alphaTarget(0.05), 2000);
- }
-})
+ if (!e.active) STATE.simulation.alphaTarget(0.05);
+ if (!d.pinned) {
+ d.fx = null; d.fy = null;
+ d.vx = (d._vx || 0) * 3;
+ d.vy = (d._vy || 0) * 3;
+ STATE.simulation.alphaTarget(0.3).restart();
+ setTimeout(() => STATE.simulation.alphaTarget(0.05), 2000);
+ }
+ })
)
.on('click', (event, d) => {
event.stopPropagation();
+
+ if (d.type === 'document') {
+ // Toggle this document's file_hash in the pinned set
+ const idx = STATE.pinnedFiles.indexOf(d.file_hash);
+ if (idx >= 0) {
+ STATE.pinnedFiles.splice(idx, 1);
+ } else {
+ STATE.pinnedFiles.push(d.file_hash);
+ }
+ // Visual: bright white stroke when pinned, original colour when not
+ node.filter(n => n && n.type === 'document').select('circle')
+ .attr('stroke', n => STATE.pinnedFiles.includes(n.file_hash) ? '#ffffff' : n.color)
+ .attr('stroke-width', n => STATE.pinnedFiles.includes(n.file_hash) ? 3 : 1.5)
+ .attr('filter', n => {
+ if (!STATE.pinnedFiles.includes(n.file_hash)) return null;
+ const glowIdx = STATE.categories.indexOf(n.category);
+ return glowIdx >= 0 ? `url(#glow-${glowIdx})` : null;
+ });
+ } else if (d.type === 'category') {
+ // Clicking a category node clears ALL pins
+ STATE.pinnedFiles = [];
+ node.filter(n => n && n.type === 'document').select('circle')
+ .attr('stroke', n => n.color)
+ .attr('stroke-width', 1.5)
+ .attr('filter', null);
+ }
+
onNodeClick(d);
})
.on('contextmenu', (event, d) => {
@@ -149,7 +176,7 @@ function renderGraph() {
node.filter(d => d.type === 'category')
.append('circle')
.attr('r', 26)
- .attr('fill', d => d.color + '18')
+ .attr('fill', d => d.color + '18')
.attr('stroke', d => d.color)
.attr('stroke-width', 2)
.attr('filter', d => {
@@ -170,9 +197,14 @@ function renderGraph() {
node.filter(d => d.type === 'document')
.append('circle')
.attr('r', 7)
- .attr('fill', d => d.color + '55')
- .attr('stroke', d => d.color)
- .attr('stroke-width', 1.5);
+ .attr('fill', d => d.color + '55')
+ .attr('stroke', d => STATE.pinnedFiles.includes(d.file_hash) ? '#ffffff' : d.color)
+ .attr('stroke-width', d => STATE.pinnedFiles.includes(d.file_hash) ? 3 : 1.5)
+ .attr('filter', d => {
+ if (!STATE.pinnedFiles.includes(d.file_hash)) return null;
+ const glowIdx = STATE.categories.indexOf(d.category);
+ return glowIdx >= 0 ? `url(#glow-${glowIdx})` : null;
+ });
// Labels
node.append('text')
@@ -180,14 +212,14 @@ function renderGraph() {
.attr('dy', d => d.type === 'category' ? -32 : -12)
.attr('text-anchor', 'middle')
.attr('fill', d => d.type === 'category' ? d.color : 'rgba(200,216,244,0.7)')
- .attr('font-size', d => d.type === 'category' ? '10px' : '8px')
+ .attr('font-size', d => d.type === 'category' ? '10px' : '8px')
.attr('font-family', 'Syne Mono, monospace')
.attr('font-weight', d => d.type === 'category' ? '600' : '400')
.text(d => trunc(d.label, d.type === 'category' ? 18 : 16))
.style('pointer-events', 'none')
.style('user-select', 'none');
- svg.on('click', () => {});
+ svg.on('click', () => { });
// ── Simulation — Obsidian style ────────────────────────
STATE.simulation = d3.forceSimulation(nodes)
@@ -207,25 +239,25 @@ function renderGraph() {
.alphaDecay(0.02)
.velocityDecay(0.4)
.on('tick', () => {
- const liveW = document.getElementById('graph-panel').clientWidth;
- const liveH = document.getElementById('graph-panel').clientHeight;
- nodes.forEach(d => {
- if (d.fx == null) {
- const pad = 40;
- if (d.x < pad) { d.x = pad; d.vx = Math.abs(d.vx) * 0.7; }
- if (d.x > liveW - pad) { d.x = liveW - pad; d.vx = -Math.abs(d.vx) * 0.7; }
- if (d.y < pad) { d.y = pad; d.vy = Math.abs(d.vy) * 0.7; }
- if (d.y > liveH - pad) { d.y = liveH - pad; d.vy = -Math.abs(d.vy) * 0.7; }
- }
-});
- link
- .attr('x1', d => d.source.x).attr('y1', d => d.source.y)
- .attr('x2', d => d.target.x).attr('y2', d => d.target.y);
- node.attr('transform', d => `translate(${d.x},${d.y})`);
-
- const maxV = Math.max(...nodes.map(d => Math.abs(d.vx||0) + Math.abs(d.vy||0)));
- if (maxV > 0.5) STATE.simulation.alphaTarget(0.1).restart();
-});
+ const liveW = document.getElementById('graph-panel').clientWidth;
+ const liveH = document.getElementById('graph-panel').clientHeight;
+ nodes.forEach(d => {
+ if (d.fx == null) {
+ const pad = 40;
+ if (d.x < pad) { d.x = pad; d.vx = Math.abs(d.vx) * 0.7; }
+ if (d.x > liveW - pad) { d.x = liveW - pad; d.vx = -Math.abs(d.vx) * 0.7; }
+ if (d.y < pad) { d.y = pad; d.vy = Math.abs(d.vy) * 0.7; }
+ if (d.y > liveH - pad) { d.y = liveH - pad; d.vy = -Math.abs(d.vy) * 0.7; }
+ }
+ });
+ link
+ .attr('x1', d => d.source.x).attr('y1', d => d.source.y)
+ .attr('x2', d => d.target.x).attr('y2', d => d.target.y);
+ node.attr('transform', d => `translate(${d.x},${d.y})`);
+
+ const maxV = Math.max(...nodes.map(d => Math.abs(d.vx || 0) + Math.abs(d.vy || 0)));
+ if (maxV > 0.5) STATE.simulation.alphaTarget(0.1).restart();
+ });
setTimeout(() => STATE.simulation.alphaTarget(0.05), 3000);
}
@@ -352,7 +384,7 @@ function setupGraphObservers() {
}
});
mo.observe(panel, {
- attributes: true,
+ attributes: true,
attributeFilter: ['style', 'class'],
});
@@ -366,7 +398,7 @@ function setupGraphObservers() {
if (W && H) graphReheat();
});
moParent.observe(panel.parentElement, {
- attributes: true,
+ attributes: true,
attributeFilter: ['style', 'class'],
});
}
@@ -377,4 +409,4 @@ function setupGraphObservers() {
window.addEventListener('resize', () => graphReheat());
}
-setupGraphObservers();
\ No newline at end of file
+setupGraphObservers();
diff --git a/frontend/js/main.js b/frontend/js/main.js
index d8ac42f..7b8d6c2 100644
--- a/frontend/js/main.js
+++ b/frontend/js/main.js
@@ -6,17 +6,126 @@
* On success, supabase-js stores the session in localStorage automatically.
* getSupabaseToken() in api.js reads it on every request.
*
- * The daily-password system is kept ONLY for the admin panel (getting today's
- * code). It no longer gates the main app — Supabase JWT does that now.
+ * Legacy daily-password UI has been removed. Supabase JWT gates the main app,
+ * while the admin key only unlocks operator review tools.
*
- * Set AUTH_DISABLED = true to skip login during local dev.
+ * AUTH_DISABLED is a local-dev escape hatch only.
+ * Product guest access should use Supabase anonymous sessions instead.
*/
-const AUTH_DISABLED = false; // ← set false in production
+const AUTH_DISABLED = false; // local dev only — keep false in real use
+const GUEST_PERSIST_KEY = 'morpheus_guest_persist';
+const GUEST_TAB_KEY = 'morpheus_guest_tab_alive';
+const GUEST_LAST_SEEN_KEY = 'morpheus_guest_last_seen_at';
+const GUEST_ACTIVITY_WINDOW_MS = 45000;
+let guestHeartbeatTimer = null;
+
+function shouldPersistGuestWorkspace() {
+ return localStorage.getItem(GUEST_PERSIST_KEY) === '1';
+}
+
+function setGuestPersistPreference(keep) {
+ localStorage.setItem(GUEST_PERSIST_KEY, keep ? '1' : '0');
+ STATE.guestPersist = Boolean(keep);
+}
+
+function markGuestTabAlive() {
+ sessionStorage.setItem(GUEST_TAB_KEY, '1');
+}
+
+function clearGuestSessionMarkers() {
+ sessionStorage.removeItem(GUEST_TAB_KEY);
+ localStorage.removeItem(GUEST_LAST_SEEN_KEY);
+}
+
+function hasGuestTabMarker() {
+ return sessionStorage.getItem(GUEST_TAB_KEY) === '1';
+}
+
+function touchGuestHeartbeat() {
+ localStorage.setItem(GUEST_LAST_SEEN_KEY, String(Date.now()));
+}
+
+function hasRecentGuestHeartbeat() {
+ const raw = Number(localStorage.getItem(GUEST_LAST_SEEN_KEY) || 0);
+ return Number.isFinite(raw) && raw > 0 && (Date.now() - raw) < GUEST_ACTIVITY_WINDOW_MS;
+}
+
+function startGuestHeartbeat() {
+ stopGuestHeartbeat();
+ touchGuestHeartbeat();
+ guestHeartbeatTimer = window.setInterval(() => {
+ if (!STATE.isGuest) return;
+ touchGuestHeartbeat();
+ }, 15000);
+}
+
+function stopGuestHeartbeat() {
+ if (!guestHeartbeatTimer) return;
+ clearInterval(guestHeartbeatTimer);
+ guestHeartbeatTimer = null;
+}
+
+function setGuestControlsVisibility() {
+ const guestBtn = document.getElementById('guestBtn');
+ const guestInfo = document.getElementById('guestInfo');
+ const guestPersistWrap = document.getElementById('guestPersistWrap');
+ const visible = Boolean(CONFIG.GUEST_ENABLED);
+ if (guestBtn) guestBtn.style.display = visible ? '' : 'none';
+ if (guestInfo) guestInfo.style.display = visible ? 'block' : 'none';
+ if (guestPersistWrap) guestPersistWrap.style.display = visible ? 'block' : 'none';
+}
+
+function setSessionMode(session) {
+ const appMeta = session?.user?.app_metadata || {};
+ const provider = String(appMeta.provider || '').toLowerCase();
+ STATE.isGuest = Boolean(
+ session?.user?.is_anonymous ||
+ appMeta.is_anonymous ||
+ provider === 'anonymous' ||
+ (Array.isArray(appMeta.providers) && appMeta.providers.includes('anonymous'))
+ );
+ STATE.guestPersist = STATE.isGuest ? shouldPersistGuestWorkspace() : false;
+
+ const pill = document.getElementById('session-mode-pill');
+ const label = document.getElementById('session-mode-label');
+ if (pill) pill.style.display = STATE.isGuest ? '' : 'none';
+ if (label) label.textContent = STATE.isGuest ? 'GUEST' : 'ACCOUNT';
+}
+
+function isTemporaryGuestResume(session) {
+ if (!session || !STATE.isGuest || STATE.guestPersist) return false;
+ return !hasGuestTabMarker() && !hasRecentGuestHeartbeat();
+}
+
+async function expireTemporaryGuestSession(client) {
+ try {
+ await apiCleanupGuestWorkspace();
+ } catch {
+ // best effort only
+ }
+ try {
+ await client.auth.signOut();
+ } catch {
+ // best effort only
+ }
+ localStorage.removeItem(GUEST_PERSIST_KEY);
+ clearGuestSessionMarkers();
+ STATE.isGuest = false;
+ STATE.guestPersist = false;
+ setSessionMode(null);
+ showLogin();
+ const info = document.getElementById('loginInfo');
+ if (info) {
+ info.textContent = 'Temporary guest workspace expired after the previous guest session ended.';
+ info.style.display = 'block';
+ }
+}
window.addEventListener('DOMContentLoaded', async () => {
try {
- await initSupabase();
+ const client = await initSupabase();
+ setGuestControlsVisibility();
if (AUTH_DISABLED) {
showApp();
@@ -34,27 +143,71 @@ window.addEventListener('DOMContentLoaded', async () => {
// once with INITIAL_SESSION (with or without a session), then again on
// SIGNED_IN / SIGNED_OUT. No polling, no timeouts.
let booted = false;
- supabaseClient.auth.onAuthStateChange((event, session) => {
- if (event === 'INITIAL_SESSION') {
- if (session) {
+ client.auth.onAuthStateChange((event, session) => {
+ const handle = async () => {
+ if (event === 'INITIAL_SESSION') {
+ if (session) {
+ setSessionMode(session);
+ if (isTemporaryGuestResume(session)) {
+ booted = false;
+ await expireTemporaryGuestSession(client);
+ return;
+ }
+ if (STATE.isGuest) {
+ markGuestTabAlive();
+ startGuestHeartbeat();
+ } else {
+ stopGuestHeartbeat();
+ }
+ booted = true;
+ showApp();
+ bootApp();
+ } else {
+ stopGuestHeartbeat();
+ STATE.isGuest = false;
+ STATE.guestPersist = false;
+ showLogin();
+ }
+ } else if (event === 'SIGNED_IN' && !booted) {
+ setSessionMode(session);
+ if (STATE.isGuest) {
+ markGuestTabAlive();
+ startGuestHeartbeat();
+ } else {
+ stopGuestHeartbeat();
+ }
booted = true;
showApp();
bootApp();
- } else {
+ } else if (event === 'SIGNED_IN') {
+ setSessionMode(session);
+ if (STATE.isGuest) {
+ markGuestTabAlive();
+ startGuestHeartbeat();
+ } else {
+ stopGuestHeartbeat();
+ }
+ } else if (event === 'SIGNED_OUT') {
+ booted = false;
+ stopGuestHeartbeat();
+ STATE.isGuest = false;
+ STATE.guestPersist = false;
+ setSessionMode(null);
showLogin();
}
- } else if (event === 'SIGNED_IN' && !booted) {
- booted = true;
- showApp();
- bootApp();
- } else if (event === 'SIGNED_OUT') {
- booted = false;
+ };
+
+ handle().catch(err => {
+ console.error('Auth transition failed:', err);
+ stopGuestHeartbeat();
showLogin();
- }
+ });
});
} catch (err) {
console.error("Boot failed:", err);
+ const errEl = document.getElementById('loginError');
+ if (errEl) errEl.textContent = 'Auth init failed: ' + err.message;
showLogin();
}
});
@@ -86,7 +239,12 @@ async function submitLogin() {
err.textContent = '';
try {
- const {error } = await supabaseClient.auth.signInWithPassword({
+ const client = await initSupabase();
+ if (!client?.auth) {
+ throw new Error('Supabase auth client is unavailable.');
+ }
+
+ const {error } = await client.auth.signInWithPassword({
email,
password: pw,
});
@@ -94,12 +252,14 @@ async function submitLogin() {
if (error) {
err.textContent = error.message || 'Invalid credentials.';
btn.disabled = false;
- btn.textContent = 'UNLOCK →';
+ btn.textContent = 'SIGN IN →';
return;
}
// EXPLICIT UI TAKEOVER:
// Wait 500ms to guarantee local storage has the token, then force the system online.
STATE.authenticated = true;
+ const session = await getSupabaseSession();
+ setSessionMode(session);
showApp();
setTimeout(() => {
@@ -111,7 +271,60 @@ async function submitLogin() {
} catch (e) {
err.textContent = 'Server unreachable: ' + e.message;
btn.disabled = false;
- btn.textContent = 'UNLOCK →';
+ btn.textContent = 'SIGN IN →';
+ }
+}
+
+async function submitGuest() {
+ const btn = document.getElementById('guestBtn');
+ const err = document.getElementById('loginError');
+ const info = document.getElementById('loginInfo');
+ const persistCheckbox = document.getElementById('guestPersist');
+ const keepWorkspace = Boolean(persistCheckbox?.checked);
+
+ err.textContent = '';
+ if (info) {
+ info.style.display = 'none';
+ info.textContent = '';
+ }
+
+ btn.disabled = true;
+ btn.textContent = 'STARTING GUEST WORKSPACE…';
+
+ try {
+ const client = await initSupabase();
+ if (!client?.auth) {
+ throw new Error('Supabase auth client is unavailable.');
+ }
+
+ const { error } = await client.auth.signInAnonymously();
+ if (error) {
+ throw error;
+ }
+
+ setGuestPersistPreference(keepWorkspace);
+ const session = await getSupabaseSession();
+ setSessionMode(session);
+ markGuestTabAlive();
+ startGuestHeartbeat();
+ STATE.authenticated = true;
+ showApp();
+ setTimeout(() => {
+ setOnline(true);
+ bootApp();
+ const msg = keepWorkspace
+ ? 'Guest workspace ready. It will stay on this device until you end it.'
+ : 'Temporary guest workspace ready. It will expire after the guest session truly ends.';
+ toast(msg, 'success');
+ }, 300);
+ } catch (e) {
+ err.textContent = e?.message || 'Could not start guest workspace.';
+ if (/anonymous/i.test(err.textContent)) {
+ err.textContent = 'Guest mode is disabled in Supabase Auth settings.';
+ }
+ } finally {
+ btn.disabled = false;
+ btn.textContent = 'CONTINUE AS GUEST';
}
}
@@ -163,7 +376,12 @@ async function submitSignup() {
btn.textContent = 'CREATING ACCOUNT…';
try {
- const { data, error } = await supabaseClient.auth.signUp({ email, password: pw });
+ const client = await initSupabase();
+ if (!client?.auth) {
+ throw new Error('Supabase auth client is unavailable.');
+ }
+
+ const { data, error } = await client.auth.signUp({ email, password: pw });
if (error) {
err.textContent = error.message || 'Sign-up failed.';
@@ -187,55 +405,43 @@ async function submitSignup() {
}
}
-// ── Admin panel — daily code (unchanged, still uses master key) ───────────────
-async function submitAdmin() {
- const key = document.getElementById('adminKey').value.trim();
- if (!key) return;
+// ── Operator tools unlock ──────────────────────────────────────────────────────
+async function submitAdmin(adminKey) {
+ const key = String(adminKey || '').trim();
+ if (!key) return false;
try {
const res = await apiVerifyAdmin(key);
if (res.valid) {
- document.getElementById('adminResult').textContent =
- `Today's code: ${res.token}`;
- document.getElementById('auth-toggle-panel').style.display = 'block';
- const locked = localStorage.getItem('nexus_auth_locked') !== 'false';
- updateToggleUI(locked);
+ if (typeof window.enableAdminReview === 'function') {
+ window.enableAdminReview(key);
+ STATE.adminPendingView = true;
+ if (document.getElementById('app')?.style.display !== 'none') {
+ switchView('admin');
+ } else {
+ const info = document.getElementById('loginInfo');
+ if (info) {
+ info.textContent = 'Admin dashboard unlocked. Sign in to open it.';
+ info.style.display = 'block';
+ }
+ }
+ } else {
+ toast('Admin dashboard assets are stale. Hard refresh with Ctrl+Shift+R.', 'error');
+ }
+ return true;
} else {
- document.getElementById('adminResult').textContent = 'Invalid admin key.';
+ toast('Invalid operator key.', 'error');
}
} catch (e) {
- document.getElementById('adminResult').textContent = 'Error: ' + e.message;
+ toast('Operator unlock failed: ' + e.message, 'error');
}
+ return false;
}
-// ── Auth toggle (admin only) ──────────────────────────────────────────────────
-function updateToggleUI(locked) {
- const btn = document.getElementById('auth-toggle-btn');
- const label = document.getElementById('auth-toggle-label');
- if (locked) {
- btn.textContent = 'DISABLE AUTH';
- btn.style.background = 'rgba(255,71,87,0.15)';
- btn.style.borderColor = 'var(--red)';
- btn.style.color = 'var(--red)';
- label.textContent = 'Auth is ON — users must sign in';
- } else {
- btn.textContent = 'ENABLE AUTH';
- btn.style.background = 'rgba(0,255,136,0.08)';
- btn.style.borderColor = 'var(--phosphor)';
- btn.style.color = 'var(--phosphor)';
- label.textContent = 'Auth is OFF — anyone can access';
- }
-}
-
-function toggleAuth() {
- const current = localStorage.getItem('nexus_auth_locked') !== 'false';
- const next = !current;
- localStorage.setItem('nexus_auth_locked', next ? 'true' : 'false');
- updateToggleUI(next);
- toast(
- next ? 'Auth enabled — sign-in required on next visit'
- : 'Auth disabled — open access',
- next ? 'error' : 'success',
- );
+async function unlockOperatorTools() {
+ const key = window.prompt('Enter operator key to open review tools:', '') || '';
+ if (!key.trim()) return;
+ const ok = await submitAdmin(key);
+ if (ok) toast('Operator tools unlocked.', 'success');
}
function handleLoginKey(e) {
@@ -247,10 +453,38 @@ function handleLoginKey(e) {
// ── Sign out ──────────────────────────────────────────────────────────────────
async function signOut() {
- await supabaseClient.auth.signOut();
+ const client = await initSupabase();
+ if (!client?.auth) {
+ throw new Error('Supabase auth client is unavailable.');
+ }
+ if (STATE.isGuest) {
+ if (STATE.guestPersist) {
+ const shouldEnd = window.confirm(
+ 'This guest workspace is set to stay on this device. Click OK to end and delete it now, or Cancel to keep it and just close the tab later.'
+ );
+ if (!shouldEnd) return;
+ }
+ try {
+ await apiCleanupGuestWorkspace();
+ } catch (err) {
+ toast('Guest workspace cleanup failed: ' + err.message, 'error');
+ }
+ }
+ await client.auth.signOut();
STATE.authenticated = false;
+ STATE.isGuest = false;
+ STATE.guestPersist = false;
+ stopGuestHeartbeat();
+ clearGuestSessionMarkers();
+ localStorage.removeItem(GUEST_PERSIST_KEY);
+ setSessionMode(null);
STATE.files = [];
STATE.categories = [];
+ STATE.adminUnlocked = false;
+ STATE.adminKey = '';
+ STATE.adminPendingView = false;
+ const navAdmin = document.getElementById('nav-admin');
+ if (navAdmin) navAdmin.style.display = 'none';
showLogin();
authTab('signin');
}
@@ -262,7 +496,22 @@ async function bootApp() {
setOnline(true);
try {
await refreshCorpus();
- switchView('corpus');
+ if (typeof resumeActiveIngestionIfNeeded === 'function') {
+ resumeActiveIngestionIfNeeded().catch(err => {
+ console.warn('Ingestion resume failed:', err?.message || err);
+ });
+ }
+ if (STATE.adminUnlocked && STATE.adminPendingView) {
+ switchView('admin');
+ STATE.adminPendingView = false;
+ if (typeof refreshAdminDashboard === 'function') {
+ refreshAdminDashboard().catch(err => {
+ toast('Admin dashboard failed: ' + err.message, 'error');
+ });
+ }
+ } else {
+ switchView('corpus');
+ }
} catch (e) {
setOnline(false);
toast('Could not reach backend: ' + e.message, 'error');
@@ -292,4 +541,4 @@ async function refreshCorpus() {
}
};
}, 50);
-})();
\ No newline at end of file
+})();
diff --git a/frontend/js/state.js b/frontend/js/state.js
index 8710702..9958dd6 100644
--- a/frontend/js/state.js
+++ b/frontend/js/state.js
@@ -3,19 +3,28 @@
* Single source of truth. All data flows through api.js, never direct Supabase.
*/
const STATE = {
- authenticated: false,
- files: [],
- categories: [],
- catColors: {},
- simulation: null,
- svgZoom: null,
- selectedNode: null,
+ authenticated: false,
+ files: [],
+ categories: [],
+ catColors: {},
+ simulation: null,
+ svgZoom: null,
+ selectedNode: null,
deleteConfirmed: false,
- pendingReview: null,
- chatHistory: [],
- isThinking: false,
- sessionId: crypto.randomUUID(),
+ pendingReview: null,
+ chatHistory: [],
+ isThinking: false,
+ sessionId: crypto.randomUUID(),
alpha: 0.5,
+ pinnedFiles: [], // file_hashes of graph-pinned documents
+ adminKey: '',
+ adminUnlocked: false,
+ adminTraces: [],
+ adminFeedback: [],
+ selectedTraceId: null,
+ adminPendingView: false,
+ isGuest: false,
+ guestPersist: false,
};
function stateRefreshCategories() {
@@ -29,7 +38,7 @@ function stateRefreshCategories() {
}
async function stateLoadCorpus() {
- const data = await apiLoadFiles();
+ const data = await apiLoadFiles();
STATE.files = data.files || [];
stateRefreshCategories();
document.getElementById('stat-docs').textContent = STATE.files.length;
diff --git a/requirements.txt b/requirements.txt
index a30191b..edb523d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,5 @@ celery[redis]
scikit-learn
joblib
sentence-transformers
-python-magic
\ No newline at end of file
+python-magic
+pytest
diff --git a/shared/types.py b/shared/types.py
index 8be1066..fb086d6 100644
--- a/shared/types.py
+++ b/shared/types.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
class IngestResponse(BaseModel):
@@ -24,16 +24,53 @@ class ChatMessage(BaseModel):
role: str; content: str
class QueryRequest(BaseModel):
- query: str; category: str = "All"
- history: List[ChatMessage] = Field(default_factory=list); k: int = 3
+ query: str
+ category: str = "All"
+ history: List[ChatMessage] = Field(default_factory=list)
+ k: int = 3
session_id: str = "default_session"
- alpha: float = 0.5
+ alpha: float = 0.5
+ priority_file_hashes: List[str] = Field(default_factory=list)
class SourceChunk(BaseModel):
source: str; score: Optional[float]=None; chunk: Optional[int | str] = None
snippet: Optional[str]=None; doc_type: Optional[str]=None
pages: Optional[List[int]]=None
+class DocDiagnostic(BaseModel):
+ file_hash: str
+ source: str
+ included: bool = True
+ candidate_count: int = 0
+ doc_score: Optional[float] = None
+ confidence_label: Optional[str] = None
+ reason: Optional[str] = None
+ support_label: Optional[str] = None
+ thin_doc: Optional[bool] = None
+
+class QueryTrace(BaseModel):
+ trace_id: str
+ query: str
+ session_id: str
+ route_mode: str
+ selected_experts: List[str] = Field(default_factory=list)
+ expert_weights: Dict[str, float] = Field(default_factory=dict)
+ pinned_file_hashes: List[str] = Field(default_factory=list)
+ candidate_counts: Dict[str, int] = Field(default_factory=dict)
+ selected_chunk_ids: List[str] = Field(default_factory=list)
+ doc_diagnostics: List[DocDiagnostic] = Field(default_factory=list)
+ failure_modes: List[str] = Field(default_factory=list)
+ quality_metrics: Dict[str, Any] = Field(default_factory=dict)
+ latency_ms: Optional[int] = None
+ answer_hash: Optional[str] = None
+
+class AnswerFeedback(BaseModel):
+ trace_id: str
+ helpful: Optional[bool] = None
+ accepted: Optional[bool] = None
+ reason_code: Optional[str] = None
+ correction_text: Optional[str] = None
+
class QueryResponse(BaseModel):
answer: str; sources: List[SourceChunk] = Field(default_factory=list)
images: List[str] = []
diff --git a/supabase/schema_backup.sql b/supabase/schema_backup.sql
index add9f28..cc5eacf 100644
--- a/supabase/schema_backup.sql
+++ b/supabase/schema_backup.sql
@@ -1,74 +1,60 @@
--
-- PostgreSQL database dump
--
-
--- Dumped from database version 17.6
--- Dumped by pg_dump version 18.3
-
-SET statement_timeout = 0;
-SET lock_timeout = 0;
-SET idle_in_transaction_session_timeout = 0;
-SET transaction_timeout = 0;
-SET client_encoding = 'UTF8';
-SET standard_conforming_strings = on;
-SELECT pg_catalog.set_config('search_path', '', false);
-SET check_function_bodies = false;
-SET xmloption = content;
-SET client_min_messages = warning;
-SET row_security = off;
-
---
--- Name: public; Type: SCHEMA; Schema: -; Owner: -
---
-
-CREATE SCHEMA IF NOT EXISTS public;
-
-
---
--- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: -
---
-
-COMMENT ON SCHEMA public IS 'standard public schema';
-
-
---
--- Name: _trg_refresh_mv_document_types(); Type: FUNCTION; Schema: public; Owner: -
---
-
--- CREATE FUNCTION public._trg_refresh_mv_document_types() RETURNS trigger
--- LANGUAGE plpgsql
--- AS $$
--- begin
--- -- Fire-and-forget: refresh in background via pg_notify
--- -- (avoids blocking the INSERT transaction itself)
--- perform pg_notify('refresh_mv', 'document_types');
--- return new;
--- end;
--- $$;
-
-
---
--- Name: _trg_set_updated_at(); Type: FUNCTION; Schema: public; Owner: -
---
-
+
+\restrict 32urOXpOnsQS0zoo7jGTkIs0BeRgGPyJVLWPDJ6IexS9GSsM4lpkxJaAg6FM0Ua
+
+-- Dumped from database version 17.6
+-- Dumped by pg_dump version 18.3
+
+SET statement_timeout = 0;
+SET lock_timeout = 0;
+SET idle_in_transaction_session_timeout = 0;
+SET transaction_timeout = 0;
+SET client_encoding = 'UTF8';
+SET standard_conforming_strings = on;
+SELECT pg_catalog.set_config('search_path', '', false);
+SET check_function_bodies = false;
+SET xmloption = content;
+SET client_min_messages = warning;
+SET row_security = off;
+
+--
+-- Name: public; Type: SCHEMA; Schema: -; Owner: -
+--
+
+CREATE SCHEMA public;
+
+
+--
+-- Name: SCHEMA public; Type: COMMENT; Schema: -; Owner: -
+--
+
+COMMENT ON SCHEMA public IS 'standard public schema';
+
+
+--
+-- Name: _trg_set_updated_at(); Type: FUNCTION; Schema: public; Owner: -
+--
+
CREATE FUNCTION public._trg_set_updated_at() RETURNS trigger
LANGUAGE plpgsql
- SET search_path = ''
- AS $$
-begin
+ SET search_path TO ''
+ AS $$
+begin
new.updated_at = pg_catalog.now();
- return new;
-end;
-$$;
-
-
---
--- Name: get_document_types(); Type: FUNCTION; Schema: public; Owner: -
---
-
+ return new;
+end;
+$$;
+
+
+--
+-- Name: get_document_types(); Type: FUNCTION; Schema: public; Owner: -
+--
+
CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
LANGUAGE sql STABLE
- SET search_path = ''
+ SET search_path TO ''
AS $$
select distinct f.document_type
from public.ingested_files as f
@@ -76,23 +62,25 @@ CREATE FUNCTION public.get_document_types() RETURNS TABLE(document_type text)
and f.document_type is not null
and f.document_type <> 'unknown'
order by f.document_type;
-$$;
-
-
---
+$$;
+
+
+--
-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision); Type: FUNCTION; Schema: public; Owner: -
---
-
+--
+
CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
LANGUAGE plpgsql
- SET search_path = ''
+ SET search_path TO ''
AS $$
-begin
- return query
- with
- semantic as (
- select
- d.id, d.content, d.metadata,
+begin
+ return query
+ with
+ semantic as (
+ select
+ d.id,
+ d.content,
+ d.metadata,
(
1 - (
d.embedding::extensions.halfvec(2048)
@@ -101,937 +89,1390 @@ begin
)
)::float as score
from public.documents d
- where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
+ where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
order by d.embedding::extensions.halfvec(2048)
OPERATOR(extensions.<=>)
query_embedding::extensions.halfvec(2048)
- limit match_count * 3
- ),
- keyword as (
- select
- d.id, d.content, d.metadata,
+ limit match_count * 3
+ ),
+ keyword as (
+ select
+ d.id,
+ d.content,
+ d.metadata,
pg_catalog.ts_rank(
pg_catalog.to_tsvector('english', d.content),
pg_catalog.plainto_tsquery('english', query_text)
- )::float as raw_score
+ )::float as raw_score
from public.documents d
- where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
+ where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
and pg_catalog.to_tsvector('english', d.content) @@ pg_catalog.plainto_tsquery('english', query_text)
- order by raw_score desc
- limit match_count * 3
- ),
- keyword_norm as (
- select k.id, k.content, k.metadata,
- case
- when max(k.raw_score) over () = 0 then 0::float
- else (k.raw_score / max(k.raw_score) over ())::float
- end as score
- from keyword k
- ),
- blended as (
- select
- coalesce(s.id, kn.id) as id,
- coalesce(s.content, kn.content) as content,
- coalesce(s.metadata, kn.metadata) as metadata,
- (
- coalesce(s.score, 0::float) * semantic_weight +
- coalesce(kn.score, 0::float) * keyword_weight
- ) as combined_score
- from semantic s
- full outer join keyword_norm kn on s.id = kn.id
- )
- select b.id, b.content, b.metadata, b.combined_score
- from blended b
- order by b.combined_score desc
- limit match_count;
-end;
-$$;
-
-
---
+ order by raw_score desc
+ limit match_count * 3
+ ),
+ keyword_norm as (
+ select
+ k.id,
+ k.content,
+ k.metadata,
+ case
+ when max(k.raw_score) over () = 0 then 0::float
+ else (k.raw_score / max(k.raw_score) over ())::float
+ end as score
+ from keyword k
+ ),
+ blended as (
+ select
+ coalesce(s.id, kn.id) as id,
+ coalesce(s.content, kn.content) as content,
+ coalesce(s.metadata, kn.metadata) as metadata,
+ (
+ coalesce(s.score, 0::float) * semantic_weight +
+ coalesce(kn.score, 0::float) * keyword_weight
+ ) as combined_score
+ from semantic s
+ full outer join keyword_norm kn on s.id = kn.id
+ )
+ select
+ b.id,
+ b.content,
+ b.metadata,
+ b.combined_score
+ from blended b
+ order by b.combined_score desc
+ limit match_count;
+end;
+$$;
+
+
+--
+-- Name: hybrid_search(text, extensions.vector, integer, jsonb, double precision, double precision, uuid); Type: FUNCTION; Schema: public; Owner: -
+--
+
+CREATE FUNCTION public.hybrid_search(query_text text, query_embedding extensions.vector, match_count integer DEFAULT 10, filter jsonb DEFAULT '{}'::jsonb, semantic_weight double precision DEFAULT 0.7, keyword_weight double precision DEFAULT 0.3, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, content text, metadata jsonb, combined_score double precision)
+ LANGUAGE plpgsql
+ SET search_path TO ''
+ AS $$
+BEGIN
+ RETURN QUERY
+ WITH
+ semantic AS (
+ SELECT
+ d.id,
+ d.content,
+ d.metadata,
+ (
+ 1 - (
+ d.embedding::extensions.halfvec(2048)
+ OPERATOR(extensions.<=>)
+ query_embedding::extensions.halfvec(2048)
+ )
+ )::float AS score
+ FROM public.documents AS d
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
+ ORDER BY d.embedding::extensions.halfvec(2048)
+ OPERATOR(extensions.<=>)
+ query_embedding::extensions.halfvec(2048)
+ LIMIT match_count * 3
+ ),
+ keyword AS (
+ SELECT
+ d.id,
+ d.content,
+ d.metadata,
+ pg_catalog.ts_rank(
+ pg_catalog.to_tsvector('english', d.content),
+ pg_catalog.plainto_tsquery('english', query_text)
+ )::float AS raw_score
+ FROM public.documents AS d
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
+ AND pg_catalog.to_tsvector('english', d.content)
+ @@ pg_catalog.plainto_tsquery('english', query_text)
+ ORDER BY raw_score DESC
+ LIMIT match_count * 3
+ ),
+ keyword_norm AS (
+ SELECT
+ k.id,
+ k.content,
+ k.metadata,
+ CASE
+ WHEN max(k.raw_score) OVER () = 0 THEN 0::float
+ ELSE (k.raw_score / max(k.raw_score) OVER ())::float
+ END AS score
+ FROM keyword AS k
+ ),
+ blended AS (
+ SELECT
+ COALESCE(s.id, kn.id) AS id,
+ COALESCE(s.content, kn.content) AS content,
+ COALESCE(s.metadata, kn.metadata) AS metadata,
+ (
+ COALESCE(s.score, 0::float) * semantic_weight +
+ COALESCE(kn.score, 0::float) * keyword_weight
+ ) AS combined_score
+ FROM semantic AS s
+ FULL OUTER JOIN keyword_norm AS kn ON s.id = kn.id
+ )
+ SELECT
+ b.id,
+ b.content,
+ b.metadata,
+ b.combined_score
+ FROM blended AS b
+ ORDER BY b.combined_score DESC
+ LIMIT match_count;
+END;
+$$;
+
+
+--
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid); Type: FUNCTION; Schema: public; Owner: -
---
-
+--
+
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid) RETURNS void
LANGUAGE plpgsql SECURITY DEFINER
- SET search_path = ''
+ SET search_path TO ''
AS $$
-BEGIN
+BEGIN
INSERT INTO public.documents (id, content, metadata, embedding, user_id)
- VALUES (p_id, p_content, p_metadata, p_embedding, p_user_id)
- ON CONFLICT (id) DO UPDATE
- SET content = EXCLUDED.content,
- metadata = EXCLUDED.metadata,
- embedding = EXCLUDED.embedding;
-END;
-$$;
-
-
---
+ VALUES (p_id, p_content, p_metadata, p_embedding, p_user_id)
+ ON CONFLICT (id) DO UPDATE
+ SET content = EXCLUDED.content,
+ metadata = EXCLUDED.metadata,
+ embedding = EXCLUDED.embedding;
+END;
+$$;
+
+
+--
-- Name: insert_document_chunk(uuid, text, jsonb, extensions.vector, uuid, text, uuid, integer); Type: FUNCTION; Schema: public; Owner: -
---
-
+--
+
CREATE FUNCTION public.insert_document_chunk(p_id uuid, p_content text, p_metadata jsonb, p_embedding extensions.vector, p_user_id uuid, p_node_type text DEFAULT 'leaf'::text, p_parent_node_id uuid DEFAULT NULL::uuid, p_node_level integer DEFAULT 0) RETURNS void
LANGUAGE plpgsql SECURITY DEFINER
- SET search_path = ''
+ SET search_path TO ''
AS $$
-BEGIN
+BEGIN
INSERT INTO public.documents (
- id, content, metadata, embedding, user_id,
- node_type, parent_node_id, node_level
- )
- VALUES (
- p_id, p_content, p_metadata, p_embedding, p_user_id,
- p_node_type, p_parent_node_id, p_node_level
- )
- ON CONFLICT (id) DO UPDATE
- SET content = EXCLUDED.content,
- metadata = EXCLUDED.metadata,
- embedding = EXCLUDED.embedding,
- node_type = EXCLUDED.node_type,
- parent_node_id = EXCLUDED.parent_node_id,
- node_level = EXCLUDED.node_level;
-END;
-$$;
-
-
---
+ id, content, metadata, embedding, user_id,
+ node_type, parent_node_id, node_level
+ )
+ VALUES (
+ p_id, p_content, p_metadata, p_embedding, p_user_id,
+ p_node_type, p_parent_node_id, p_node_level
+ )
+ ON CONFLICT (id) DO UPDATE
+ SET content = EXCLUDED.content,
+ metadata = EXCLUDED.metadata,
+ embedding = EXCLUDED.embedding,
+ node_type = EXCLUDED.node_type,
+ parent_node_id = EXCLUDED.parent_node_id,
+ node_level = EXCLUDED.node_level;
+END;
+$$;
+
+
+--
+-- Name: insert_document_chunks_batch(jsonb); Type: FUNCTION; Schema: public; Owner: -
+--
+
+CREATE FUNCTION public.insert_document_chunks_batch(p_rows jsonb) RETURNS void
+ LANGUAGE plpgsql SECURITY DEFINER
+ SET search_path TO ''
+ AS $$
+BEGIN
+ IF p_rows IS NULL OR jsonb_typeof(p_rows) <> 'array' THEN
+ RETURN;
+ END IF;
+
+ INSERT INTO public.documents (
+ id,
+ content,
+ metadata,
+ embedding,
+ user_id,
+ node_type,
+ parent_node_id,
+ node_level
+ )
+ SELECT
+ (row->>'id')::uuid,
+ row->>'content',
+ COALESCE(row->'metadata', '{}'::jsonb),
+ (row->'embedding')::text::extensions.vector,
+ (row->>'user_id')::uuid,
+ COALESCE(NULLIF(row->>'node_type', ''), 'leaf'),
+ NULLIF(row->>'parent_node_id', '')::uuid,
+ COALESCE(NULLIF(row->>'node_level', '')::integer, 0)
+ FROM jsonb_array_elements(p_rows) AS row
+ ON CONFLICT (id) DO UPDATE
+ SET content = EXCLUDED.content,
+ metadata = EXCLUDED.metadata,
+ embedding = EXCLUDED.embedding,
+ user_id = EXCLUDED.user_id,
+ node_type = EXCLUDED.node_type,
+ parent_node_id = EXCLUDED.parent_node_id,
+ node_level = EXCLUDED.node_level;
+END;
+$$;
+
+
+--
-- Name: match_documents(extensions.vector, integer, jsonb); Type: FUNCTION; Schema: public; Owner: -
---
-
+--
+
CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
LANGUAGE plpgsql
- SET search_path = ''
+ SET search_path TO ''
AS $$
-begin
- return query
- select
- d.id,
- d.content,
- d.metadata,
- (
- 1 - (
- d.embedding::extensions.halfvec(2048)
- OPERATOR(extensions.<=>)
- query_embedding::extensions.halfvec(2048)
- )
- )::float as similarity
+begin
+ return query
+ select
+ d.id,
+ d.content,
+ d.metadata,
+ (
+ 1 - (
+ d.embedding::extensions.halfvec(2048)
+ OPERATOR(extensions.<=>)
+ query_embedding::extensions.halfvec(2048)
+ )
+ )::float as similarity
from public.documents d
- where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
- order by d.embedding::extensions.halfvec(2048)
- OPERATOR(extensions.<=>)
- query_embedding::extensions.halfvec(2048)
- limit match_count;
-end;
-$$;
-
-
---
+ where (filter = '{}'::jsonb or d.metadata @> filter::jsonb)
+ order by d.embedding::extensions.halfvec(2048)
+ OPERATOR(extensions.<=>)
+ query_embedding::extensions.halfvec(2048)
+ limit match_count;
+end;
+$$;
+
+
+--
+-- Name: match_documents(extensions.vector, integer, jsonb, uuid); Type: FUNCTION; Schema: public; Owner: -
+--
+
+CREATE FUNCTION public.match_documents(query_embedding extensions.vector, match_count integer DEFAULT 5, filter jsonb DEFAULT '{}'::jsonb, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, content text, metadata jsonb, similarity double precision)
+ LANGUAGE plpgsql
+ SET search_path TO ''
+ AS $$
+BEGIN
+ RETURN QUERY
+ SELECT
+ d.id,
+ d.content,
+ d.metadata,
+ (
+ 1 - (
+ d.embedding::extensions.halfvec(2048)
+ OPERATOR(extensions.<=>)
+ query_embedding::extensions.halfvec(2048)
+ )
+ )::float AS similarity
+ FROM public.documents AS d
+ WHERE (p_user_id IS NULL OR d.user_id = p_user_id)
+ AND (filter = '{}'::jsonb OR d.metadata @> filter::jsonb)
+ ORDER BY d.embedding::extensions.halfvec(2048)
+ OPERATOR(extensions.<=>)
+ query_embedding::extensions.halfvec(2048)
+ LIMIT match_count;
+END;
+$$;
+
+
+--
-- Name: match_memory(extensions.vector, text, integer); Type: FUNCTION; Schema: public; Owner: -
---
-
+--
+
CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
LANGUAGE plpgsql
- SET search_path = ''
+ SET search_path TO ''
+ AS $$
+BEGIN
+ RETURN QUERY
+ SELECT
+ cm.id,
+ cm.role,
+ cm.content,
+ 1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
+ FROM public.chat_memory AS cm
+ WHERE cm.session_id = match_session_id
+ ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
+ LIMIT match_count;
+END;
+$$;
+
+
+--
+-- Name: match_memory(extensions.vector, text, integer, uuid); Type: FUNCTION; Schema: public; Owner: -
+--
+
+CREATE FUNCTION public.match_memory(query_embedding extensions.vector, match_session_id text, match_count integer DEFAULT 4, p_user_id uuid DEFAULT NULL::uuid) RETURNS TABLE(id uuid, role text, content text, similarity double precision)
+ LANGUAGE plpgsql
+ SET search_path TO ''
AS $$
-BEGIN
- RETURN QUERY
- SELECT
+BEGIN
+ RETURN QUERY
+ SELECT
cm.id,
cm.role,
cm.content,
1 - (cm.embedding OPERATOR(extensions.<=>) query_embedding) AS similarity
FROM public.chat_memory AS cm
WHERE cm.session_id = match_session_id
+ AND (p_user_id IS NULL OR cm.user_id = p_user_id)
ORDER BY cm.embedding OPERATOR(extensions.<=>) query_embedding
- LIMIT match_count;
-END;
-$$;
-
-
---
--- Name: refresh_document_types_mv(); Type: FUNCTION; Schema: public; Owner: -
---
-
--- CREATE FUNCTION public.refresh_document_types_mv() RETURNS void
--- LANGUAGE plpgsql
--- AS $$
--- begin
--- refresh materialized view concurrently mv_document_types;
--- end;
--- $$;
-
-
-SET default_tablespace = '';
-
-SET default_table_access_method = heap;
-
---
--- Name: category_centroids; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.category_centroids (
- id uuid DEFAULT gen_random_uuid() NOT NULL,
- document_type text NOT NULL,
- centroid_vector double precision[] NOT NULL,
- document_count integer DEFAULT 1,
- created_at timestamp with time zone DEFAULT now(),
- updated_at timestamp with time zone DEFAULT now(),
- user_id uuid DEFAULT auth.uid()
-);
-
-
---
--- Name: chat_memory; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.chat_memory (
- id uuid DEFAULT extensions.uuid_generate_v4() NOT NULL,
- session_id text NOT NULL,
- role text NOT NULL,
- content text NOT NULL,
+ LIMIT match_count;
+END;
+$$;
+
+
+--
+-- Name: rls_auto_enable(); Type: FUNCTION; Schema: public; Owner: -
+--
+
+CREATE FUNCTION public.rls_auto_enable() RETURNS event_trigger
+ LANGUAGE plpgsql SECURITY DEFINER
+ SET search_path TO 'pg_catalog'
+ AS $$
+DECLARE
+ cmd record;
+BEGIN
+ FOR cmd IN
+ SELECT *
+ FROM pg_event_trigger_ddl_commands()
+ WHERE command_tag IN ('CREATE TABLE', 'CREATE TABLE AS', 'SELECT INTO')
+ AND object_type IN ('table','partitioned table')
+ LOOP
+ IF cmd.schema_name IS NOT NULL AND cmd.schema_name IN ('public') AND cmd.schema_name NOT IN ('pg_catalog','information_schema') AND cmd.schema_name NOT LIKE 'pg_toast%' AND cmd.schema_name NOT LIKE 'pg_temp%' THEN
+ BEGIN
+ EXECUTE format('alter table if exists %s enable row level security', cmd.object_identity);
+ RAISE LOG 'rls_auto_enable: enabled RLS on %', cmd.object_identity;
+ EXCEPTION
+ WHEN OTHERS THEN
+ RAISE LOG 'rls_auto_enable: failed to enable RLS on %', cmd.object_identity;
+ END;
+ ELSE
+ RAISE LOG 'rls_auto_enable: skip % (either system schema or not in enforced list: %.)', cmd.object_identity, cmd.schema_name;
+ END IF;
+ END LOOP;
+END;
+$$;
+
+
+SET default_tablespace = '';
+
+SET default_table_access_method = heap;
+
+--
+-- Name: answer_feedback; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.answer_feedback (
+ id bigint NOT NULL,
+ trace_id uuid NOT NULL,
+ user_id uuid,
+ helpful boolean,
+ accepted boolean,
+ reason_code text,
+ correction_text text,
+ promote_to_eval boolean DEFAULT false NOT NULL,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
+ review_state text DEFAULT 'pending'::text NOT NULL,
+ review_notes text,
+ reviewed_at timestamp with time zone,
+ reviewed_by text,
+ promoted_at timestamp with time zone
+);
+
+
+--
+-- Name: answer_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
+--
+
+ALTER TABLE public.answer_feedback ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
+ SEQUENCE NAME public.answer_feedback_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1
+);
+
+
+--
+-- Name: category_centroids; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.category_centroids (
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
+ document_type text NOT NULL,
+ centroid_vector double precision[] NOT NULL,
+ document_count integer DEFAULT 1,
+ created_at timestamp with time zone DEFAULT now(),
+ updated_at timestamp with time zone DEFAULT now(),
+ user_id uuid DEFAULT auth.uid()
+);
+
+
+--
+-- Name: chat_memory; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.chat_memory (
+ id uuid DEFAULT extensions.uuid_generate_v4() NOT NULL,
+ session_id text NOT NULL,
+ role text NOT NULL,
+ content text NOT NULL,
embedding extensions.vector(2048),
- created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
- user_id uuid DEFAULT auth.uid()
-);
-
-
---
--- Name: document_trees; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.document_trees (
- file_hash text NOT NULL,
- user_id uuid NOT NULL,
- tree_json jsonb NOT NULL,
- created_at timestamp with time zone DEFAULT timezone('utc'::text, now())
-);
-
-
---
--- Name: documents; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.documents (
- id uuid DEFAULT gen_random_uuid() NOT NULL,
- content text,
- metadata jsonb,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()),
+ user_id uuid DEFAULT auth.uid()
+);
+
+
+--
+-- Name: document_trees; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.document_trees (
+ file_hash text NOT NULL,
+ user_id uuid NOT NULL,
+ tree_json jsonb NOT NULL,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now())
+);
+
+
+--
+-- Name: documents; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.documents (
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
+ content text,
+ metadata jsonb,
embedding extensions.vector(2048),
- user_id uuid DEFAULT auth.uid(),
- node_type text DEFAULT 'leaf'::text,
- parent_node_id uuid,
- node_level integer DEFAULT 0
-);
-
-
---
--- Name: evaluation_logs; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.evaluation_logs (
- id uuid DEFAULT gen_random_uuid() NOT NULL,
- run_label text,
- evaluated_at timestamp with time zone,
- alpha double precision,
- k integer,
- question text,
- is_answerable boolean,
- precision_at_k double precision,
- faithfulness_proxy double precision,
- relevance_proxy double precision,
- local_reward double precision,
- llm_judge_score double precision,
- judge_a_verdict boolean,
- judge_b_verdict boolean,
- judge_a_model text,
- judge_b_model text,
- calibration_score double precision,
- final_score double precision,
- requires_manual_review boolean DEFAULT false,
- disagreement_note text DEFAULT ''::text,
- user_id uuid
-);
-
-
---
--- Name: ingested_files; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.ingested_files (
- id uuid DEFAULT gen_random_uuid() NOT NULL,
- file_hash text NOT NULL,
- filename text NOT NULL,
- document_type text,
- chunk_count integer DEFAULT 0,
- ingested_at timestamp with time zone DEFAULT now(),
- user_id uuid DEFAULT auth.uid(),
- user_overridden boolean DEFAULT false
-);
-
-
---
--- Name: ingestion_retry_logs; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.ingestion_retry_logs (
- id bigint NOT NULL,
- created_at timestamp with time zone DEFAULT now() NOT NULL,
- user_id uuid,
- batch_num integer NOT NULL,
- total_batches integer NOT NULL,
- attempt integer NOT NULL,
- event_type text NOT NULL,
- message text,
- sleep_s double precision DEFAULT 0
-);
-
-
---
--- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE; Schema: public; Owner: -
---
-
-CREATE SEQUENCE public.ingestion_retry_logs_id_seq
- START WITH 1
- INCREMENT BY 1
- NO MINVALUE
- NO MAXVALUE
- CACHE 1;
-
-
---
--- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
---
-
-ALTER SEQUENCE public.ingestion_retry_logs_id_seq OWNED BY public.ingestion_retry_logs.id;
-
-
---
--- Name: intent_feedback; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.intent_feedback (
- id bigint NOT NULL,
- user_id uuid,
- query text NOT NULL,
- has_category boolean DEFAULT false NOT NULL,
- has_history boolean DEFAULT false NOT NULL,
- label integer NOT NULL,
- created_at timestamp with time zone DEFAULT now() NOT NULL,
- CONSTRAINT intent_feedback_label_check CHECK ((label = ANY (ARRAY[0, 1])))
-);
-
-
---
--- Name: intent_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
---
-
-CREATE SEQUENCE public.intent_feedback_id_seq
- START WITH 1
- INCREMENT BY 1
- NO MINVALUE
- NO MAXVALUE
- CACHE 1;
-
-
---
--- Name: intent_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
---
-
-ALTER SEQUENCE public.intent_feedback_id_seq OWNED BY public.intent_feedback.id;
-
-
---
--- Name: mv_document_types; Type: MATERIALIZED VIEW; Schema: public; Owner: -
---
-
--- CREATE MATERIALIZED VIEW public.mv_document_types AS
--- SELECT DISTINCT (metadata ->> 'document_type'::text) AS document_type
--- FROM public.documents
--- WHERE (((metadata ->> 'document_type'::text) IS NOT NULL) AND ((metadata ->> 'document_type'::text) <> 'unknown'::text))
--- ORDER BY (metadata ->> 'document_type'::text)
--- WITH NO DATA;
-
-
---
--- Name: rerank_feedback; Type: TABLE; Schema: public; Owner: -
---
-
-CREATE TABLE public.rerank_feedback (
- id bigint NOT NULL,
- user_id uuid,
- query_hash text NOT NULL,
- chunk_id uuid,
- chunk_hash text NOT NULL,
- document_type text,
- cohere_score real NOT NULL,
- was_selected boolean NOT NULL,
- created_at timestamp with time zone DEFAULT now() NOT NULL,
- query_text text,
- chunk_text text
-);
-
-
---
--- Name: rerank_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
---
-
-CREATE SEQUENCE public.rerank_feedback_id_seq
- START WITH 1
- INCREMENT BY 1
- NO MINVALUE
- NO MAXVALUE
- CACHE 1;
-
-
---
--- Name: rerank_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
---
-
-ALTER SEQUENCE public.rerank_feedback_id_seq OWNED BY public.rerank_feedback.id;
-
-
---
--- Name: ingestion_retry_logs id; Type: DEFAULT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.ingestion_retry_logs ALTER COLUMN id SET DEFAULT nextval('public.ingestion_retry_logs_id_seq'::regclass);
-
-
---
--- Name: intent_feedback id; Type: DEFAULT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.intent_feedback ALTER COLUMN id SET DEFAULT nextval('public.intent_feedback_id_seq'::regclass);
-
-
---
--- Name: rerank_feedback id; Type: DEFAULT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.rerank_feedback ALTER COLUMN id SET DEFAULT nextval('public.rerank_feedback_id_seq'::regclass);
-
-
---
--- Name: category_centroids category_centroids_document_type_key; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.category_centroids
- ADD CONSTRAINT category_centroids_document_type_key UNIQUE (document_type);
-
-
---
--- Name: category_centroids category_centroids_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.category_centroids
- ADD CONSTRAINT category_centroids_pkey PRIMARY KEY (id);
-
-
---
--- Name: chat_memory chat_memory_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.chat_memory
- ADD CONSTRAINT chat_memory_pkey PRIMARY KEY (id);
-
-
---
--- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
+ user_id uuid DEFAULT auth.uid(),
+ node_type text DEFAULT 'leaf'::text,
+ parent_node_id uuid,
+ node_level integer DEFAULT 0
+);
+
+
+--
+-- Name: evaluation_datasets; Type: TABLE; Schema: public; Owner: -
--
-ALTER TABLE ONLY public.document_trees
- ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
-
-
---
--- Name: documents documents_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.documents
- ADD CONSTRAINT documents_pkey PRIMARY KEY (id);
-
-
---
--- Name: evaluation_logs evaluation_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.evaluation_logs
- ADD CONSTRAINT evaluation_logs_pkey PRIMARY KEY (id);
-
-
---
--- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
+CREATE TABLE public.evaluation_datasets (
+ id bigint NOT NULL,
+ trace_id uuid,
+ source text DEFAULT 'feedback_trace'::text NOT NULL,
+ question text NOT NULL,
+ gold_context_refs jsonb DEFAULT '[]'::jsonb NOT NULL,
+ gold_evidence_text text,
+ is_answerable boolean DEFAULT true NOT NULL,
+ failure_modes jsonb DEFAULT '[]'::jsonb NOT NULL,
+ doc_diagnostics jsonb DEFAULT '[]'::jsonb NOT NULL,
+ reason_code text,
+ is_active boolean DEFAULT false NOT NULL,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
+);
+
+
+--
+-- Name: evaluation_datasets_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--
-ALTER TABLE ONLY public.ingested_files
- ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
-
-
---
--- Name: ingested_files ingested_files_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.ingested_files
- ADD CONSTRAINT ingested_files_pkey PRIMARY KEY (id);
-
-
---
--- Name: ingestion_retry_logs ingestion_retry_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.ingestion_retry_logs
- ADD CONSTRAINT ingestion_retry_logs_pkey PRIMARY KEY (id);
-
-
---
--- Name: intent_feedback intent_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.intent_feedback
- ADD CONSTRAINT intent_feedback_pkey PRIMARY KEY (id);
-
-
---
--- Name: rerank_feedback rerank_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
---
-
-ALTER TABLE ONLY public.rerank_feedback
- ADD CONSTRAINT rerank_feedback_pkey PRIMARY KEY (id);
-
-
---
--- Name: category_centroids_type_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX category_centroids_type_idx ON public.category_centroids USING btree (document_type);
-
-
---
--- Name: category_centroids_user_id_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX category_centroids_user_id_idx ON public.category_centroids USING btree (user_id);
-
-
---
--- Name: category_centroids_user_type_uidx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE UNIQUE INDEX category_centroids_user_type_uidx ON public.category_centroids USING btree (user_id, document_type);
-
-
---
--- Name: chat_memory_user_id_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX chat_memory_user_id_idx ON public.chat_memory USING btree (user_id);
-
-
---
--- Name: doc_node_type_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX doc_node_type_idx ON public.documents USING btree (node_type);
-
-
---
--- Name: documents_content_fts_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvector('english'::regconfig, content));
-
-
---
--- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
-
-
---
--- Name: documents_metadata_filehash_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX documents_metadata_filehash_idx ON public.documents USING btree (((metadata ->> 'file_hash'::text)));
-
-
---
--- Name: documents_metadata_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX documents_metadata_idx ON public.documents USING gin (metadata);
-
-
---
--- Name: documents_user_id_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX documents_user_id_idx ON public.documents USING btree (user_id);
-
-
---
--- Name: evaluation_logs_evaluated_at_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX evaluation_logs_evaluated_at_idx ON public.evaluation_logs USING btree (evaluated_at DESC);
-
-
---
--- Name: evaluation_logs_run_label_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX evaluation_logs_run_label_idx ON public.evaluation_logs USING btree (run_label);
-
-
---
--- Name: idx_chat_memory_session; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX idx_chat_memory_session ON public.chat_memory USING btree (session_id);
-
-
---
--- Name: idx_document_trees_json; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX idx_document_trees_json ON public.document_trees USING gin (tree_json);
-
-
---
--- Name: ingested_files_hash_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX ingested_files_hash_idx ON public.ingested_files USING btree (file_hash);
-
-
---
--- Name: ingested_files_user_file_hash_uidx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE UNIQUE INDEX ingested_files_user_file_hash_uidx ON public.ingested_files USING btree (user_id, file_hash);
-
-
---
--- Name: ingested_files_user_id_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX ingested_files_user_id_idx ON public.ingested_files USING btree (user_id);
-
-
---
--- Name: ingestion_retry_logs_created_at_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX ingestion_retry_logs_created_at_idx ON public.ingestion_retry_logs USING btree (created_at DESC);
-
-
---
--- Name: ingestion_retry_logs_user_id_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX ingestion_retry_logs_user_id_idx ON public.ingestion_retry_logs USING btree (user_id);
-
-
---
--- Name: intent_feedback_user_id_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX intent_feedback_user_id_idx ON public.intent_feedback USING btree (user_id);
-
-
---
--- Name: mv_document_types_idx; Type: INDEX; Schema: public; Owner: -
---
-
--- CREATE UNIQUE INDEX mv_document_types_idx ON public.mv_document_types USING btree (document_type);
-
-
---
--- Name: rerank_feedback_doc_type_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX rerank_feedback_doc_type_idx ON public.rerank_feedback USING btree (document_type);
-
-
---
--- Name: rerank_feedback_user_created_idx; Type: INDEX; Schema: public; Owner: -
---
-
-CREATE INDEX rerank_feedback_user_created_idx ON public.rerank_feedback USING btree (user_id, created_at DESC);
-
-
---
--- Name: category_centroids trg_centroids_updated_at; Type: TRIGGER; Schema: public; Owner: -
---
-
-CREATE TRIGGER trg_centroids_updated_at BEFORE UPDATE ON public.category_centroids FOR EACH ROW EXECUTE FUNCTION public._trg_set_updated_at();
-
-
---
--- Name: documents trg_refresh_mv_document_types; Type: TRIGGER; Schema: public; Owner: -
---
-
--- CREATE TRIGGER trg_refresh_mv_document_types AFTER INSERT ON public.documents FOR EACH STATEMENT EXECUTE FUNCTION public._trg_refresh_mv_document_types();
-
-
---
--- Name: category_centroids; Type: ROW SECURITY; Schema: public; Owner: -
---
-
-ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY;
-
---
--- Name: category_centroids centroids_delete_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY centroids_delete_own ON public.category_centroids FOR DELETE USING ((user_id = auth.uid()));
-
-
---
--- Name: category_centroids centroids_insert_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY centroids_insert_own ON public.category_centroids FOR INSERT WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: category_centroids centroids_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY centroids_select_own ON public.category_centroids FOR SELECT USING ((user_id = auth.uid()));
-
-
---
--- Name: category_centroids centroids_update_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY centroids_update_own ON public.category_centroids FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: chat_memory; Type: ROW SECURITY; Schema: public; Owner: -
---
-
-ALTER TABLE public.chat_memory ENABLE ROW LEVEL SECURITY;
-
---
--- Name: chat_memory chat_memory_delete_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY chat_memory_delete_own ON public.chat_memory FOR DELETE USING ((user_id = auth.uid()));
-
-
---
--- Name: chat_memory chat_memory_insert_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY chat_memory_insert_own ON public.chat_memory FOR INSERT WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: chat_memory chat_memory_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY chat_memory_select_own ON public.chat_memory FOR SELECT USING ((user_id = auth.uid()));
-
-
---
--- Name: chat_memory chat_memory_update_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY chat_memory_update_own ON public.chat_memory FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: documents; Type: ROW SECURITY; Schema: public; Owner: -
---
-
-ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
+ALTER TABLE public.evaluation_datasets ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
+ SEQUENCE NAME public.evaluation_datasets_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1
+);
+
--
--- Name: documents documents_delete_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY documents_delete_own ON public.documents FOR DELETE USING ((user_id = auth.uid()));
-
-
---
--- Name: documents documents_insert_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY documents_insert_own ON public.documents FOR INSERT WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: documents documents_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY documents_select_own ON public.documents FOR SELECT USING ((user_id = auth.uid()));
-
-
---
--- Name: documents documents_update_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY documents_update_own ON public.documents FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+-- Name: evaluation_logs; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.evaluation_logs (
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
+ run_label text,
+ evaluated_at timestamp with time zone,
+ alpha double precision,
+ k integer,
+ question text,
+ is_answerable boolean,
+ precision_at_k double precision,
+ faithfulness_proxy double precision,
+ relevance_proxy double precision,
+ local_reward double precision,
+ llm_judge_score double precision,
+ judge_a_verdict boolean,
+ judge_b_verdict boolean,
+ judge_a_model text,
+ judge_b_model text,
+ calibration_score double precision,
+ final_score double precision,
+ requires_manual_review boolean DEFAULT false,
+ disagreement_note text DEFAULT ''::text,
+ user_id uuid
+);
--
--- Name: document_trees; Type: ROW SECURITY; Schema: public; Owner: -
+-- Name: graph_edges; Type: TABLE; Schema: public; Owner: -
--
-ALTER TABLE public.document_trees ENABLE ROW LEVEL SECURITY;
+CREATE TABLE public.graph_edges (
+ id bigint NOT NULL,
+ user_id uuid,
+ source_node_key text NOT NULL,
+ target_node_key text NOT NULL,
+ edge_type text NOT NULL,
+ weight double precision DEFAULT 1.0 NOT NULL,
+ payload jsonb DEFAULT '{}'::jsonb NOT NULL,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
+);
+
--
--- Name: document_trees document_trees_delete_own; Type: POLICY; Schema: public; Owner: -
+-- Name: graph_edges_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--
-CREATE POLICY document_trees_delete_own ON public.document_trees FOR DELETE USING ((user_id = auth.uid()));
+ALTER TABLE public.graph_edges ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
+ SEQUENCE NAME public.graph_edges_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1
+);
--
--- Name: document_trees document_trees_insert_own; Type: POLICY; Schema: public; Owner: -
+-- Name: graph_nodes; Type: TABLE; Schema: public; Owner: -
--
-CREATE POLICY document_trees_insert_own ON public.document_trees FOR INSERT WITH CHECK ((user_id = auth.uid()));
+CREATE TABLE public.graph_nodes (
+ id bigint NOT NULL,
+ user_id uuid,
+ node_key text NOT NULL,
+ node_type text NOT NULL,
+ label text NOT NULL,
+ payload jsonb DEFAULT '{}'::jsonb NOT NULL,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL
+);
--
--- Name: document_trees document_trees_select_own; Type: POLICY; Schema: public; Owner: -
+-- Name: graph_nodes_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--
-CREATE POLICY document_trees_select_own ON public.document_trees FOR SELECT USING ((user_id = auth.uid()));
+ALTER TABLE public.graph_nodes ALTER COLUMN id ADD GENERATED BY DEFAULT AS IDENTITY (
+ SEQUENCE NAME public.graph_nodes_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1
+);
--
--- Name: document_trees document_trees_update_own; Type: POLICY; Schema: public; Owner: -
+-- Name: ingested_files; Type: TABLE; Schema: public; Owner: -
--
-CREATE POLICY document_trees_update_own ON public.document_trees FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+CREATE TABLE public.ingested_files (
+ id uuid DEFAULT gen_random_uuid() NOT NULL,
+ file_hash text NOT NULL,
+ filename text NOT NULL,
+ document_type text,
+ chunk_count integer DEFAULT 0,
+ ingested_at timestamp with time zone DEFAULT now(),
+ user_id uuid DEFAULT auth.uid(),
+ user_overridden boolean DEFAULT false,
+ identity_json jsonb DEFAULT '{}'::jsonb NOT NULL
+);
--
--- Name: evaluation_logs; Type: ROW SECURITY; Schema: public; Owner: -
+-- Name: ingestion_retry_logs; Type: TABLE; Schema: public; Owner: -
--
-ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY;
-
---
--- Name: evaluation_logs evaluation_logs_insert_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY evaluation_logs_insert_own ON public.evaluation_logs FOR INSERT WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: evaluation_logs evaluation_logs_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY evaluation_logs_select_own ON public.evaluation_logs FOR SELECT USING ((user_id = auth.uid()));
+CREATE TABLE public.ingestion_retry_logs (
+ id bigint NOT NULL,
+ created_at timestamp with time zone DEFAULT now() NOT NULL,
+ user_id uuid,
+ file_hash text,
+ batch_num integer NOT NULL,
+ total_batches integer NOT NULL,
+ attempt integer NOT NULL,
+ event_type text NOT NULL,
+ message text,
+ sleep_s double precision DEFAULT 0
+);
--
--- Name: ingestion_retry_logs; Type: ROW SECURITY; Schema: public; Owner: -
+-- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--
-ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY;
+CREATE SEQUENCE public.ingestion_retry_logs_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1;
+
--
--- Name: ingestion_retry_logs ingestion_retry_logs_delete_own; Type: POLICY; Schema: public; Owner: -
+-- Name: ingestion_retry_logs_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--
-CREATE POLICY ingestion_retry_logs_delete_own ON public.ingestion_retry_logs FOR DELETE USING ((user_id = auth.uid()));
+ALTER SEQUENCE public.ingestion_retry_logs_id_seq OWNED BY public.ingestion_retry_logs.id;
--
--- Name: ingestion_retry_logs ingestion_retry_logs_insert_own; Type: POLICY; Schema: public; Owner: -
+-- Name: intent_feedback; Type: TABLE; Schema: public; Owner: -
--
-CREATE POLICY ingestion_retry_logs_insert_own ON public.ingestion_retry_logs FOR INSERT WITH CHECK ((user_id = auth.uid()));
+CREATE TABLE public.intent_feedback (
+ id bigint NOT NULL,
+ user_id uuid,
+ query text NOT NULL,
+ has_category boolean DEFAULT false NOT NULL,
+ has_history boolean DEFAULT false NOT NULL,
+ label integer NOT NULL,
+ created_at timestamp with time zone DEFAULT now() NOT NULL,
+ CONSTRAINT intent_feedback_label_check CHECK ((label = ANY (ARRAY[0, 1])))
+);
--
--- Name: ingestion_retry_logs ingestion_retry_logs_select_own; Type: POLICY; Schema: public; Owner: -
+-- Name: intent_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
--
-CREATE POLICY ingestion_retry_logs_select_own ON public.ingestion_retry_logs FOR SELECT USING ((user_id = auth.uid()));
+CREATE SEQUENCE public.intent_feedback_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1;
--
--- Name: ingestion_retry_logs ingestion_retry_logs_update_own; Type: POLICY; Schema: public; Owner: -
+-- Name: intent_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
--
-CREATE POLICY ingestion_retry_logs_update_own ON public.ingestion_retry_logs FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+ALTER SEQUENCE public.intent_feedback_id_seq OWNED BY public.intent_feedback.id;
--
--- Name: ingested_files; Type: ROW SECURITY; Schema: public; Owner: -
+-- Name: query_traces; Type: TABLE; Schema: public; Owner: -
--
-ALTER TABLE public.ingested_files ENABLE ROW LEVEL SECURITY;
-
---
--- Name: ingested_files ingested_files_delete_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY ingested_files_delete_own ON public.ingested_files FOR DELETE USING ((user_id = auth.uid()));
-
-
---
--- Name: ingested_files ingested_files_insert_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY ingested_files_insert_own ON public.ingested_files FOR INSERT WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: ingested_files ingested_files_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY ingested_files_select_own ON public.ingested_files FOR SELECT USING ((user_id = auth.uid()));
-
-
---
--- Name: ingested_files ingested_files_update_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY ingested_files_update_own ON public.ingested_files FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: intent_feedback; Type: ROW SECURITY; Schema: public; Owner: -
---
-
-ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY;
-
---
--- Name: intent_feedback intent_feedback_insert_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY intent_feedback_insert_own ON public.intent_feedback FOR INSERT WITH CHECK ((user_id = auth.uid()));
-
-
---
--- Name: intent_feedback intent_feedback_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY intent_feedback_select_own ON public.intent_feedback FOR SELECT USING ((user_id = auth.uid()));
-
-
---
--- Name: rerank_feedback; Type: ROW SECURITY; Schema: public; Owner: -
---
-
-ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
-
---
--- Name: rerank_feedback rerank_feedback_select_own; Type: POLICY; Schema: public; Owner: -
---
-
-CREATE POLICY rerank_feedback_select_own ON public.rerank_feedback FOR SELECT USING ((user_id = auth.uid()));
-
-
+CREATE TABLE public.query_traces (
+ trace_id uuid DEFAULT gen_random_uuid() NOT NULL,
+ user_id uuid,
+ session_id text DEFAULT 'default_session'::text NOT NULL,
+ question text NOT NULL,
+ route_mode text DEFAULT 'default'::text NOT NULL,
+ selected_experts jsonb DEFAULT '[]'::jsonb NOT NULL,
+ expert_weights jsonb DEFAULT '{}'::jsonb NOT NULL,
+ pinned_file_hashes jsonb DEFAULT '[]'::jsonb NOT NULL,
+ candidate_counts jsonb DEFAULT '{}'::jsonb NOT NULL,
+ selected_chunk_ids jsonb DEFAULT '[]'::jsonb NOT NULL,
+ doc_diagnostics jsonb DEFAULT '[]'::jsonb NOT NULL,
+ failure_modes jsonb DEFAULT '[]'::jsonb NOT NULL,
+ quality_metrics jsonb DEFAULT '{}'::jsonb NOT NULL,
+ answer_hash text,
+ answer_preview text,
+ latency_ms integer,
+ created_at timestamp with time zone DEFAULT timezone('utc'::text, now()) NOT NULL,
+ review_state text DEFAULT 'pending'::text NOT NULL,
+ review_notes text,
+ reviewed_at timestamp with time zone,
+ reviewed_by text,
+ promoted_to_eval boolean DEFAULT false NOT NULL,
+ document_types jsonb DEFAULT '[]'::jsonb NOT NULL
+);
+
+
--
--- PostgreSQL database dump complete
+-- Name: rerank_feedback; Type: TABLE; Schema: public; Owner: -
+--
+
+CREATE TABLE public.rerank_feedback (
+ id bigint NOT NULL,
+ user_id uuid,
+ query_hash text NOT NULL,
+ chunk_id uuid,
+ chunk_hash text NOT NULL,
+ document_type text,
+ cohere_score real NOT NULL,
+ was_selected boolean NOT NULL,
+ created_at timestamp with time zone DEFAULT now() NOT NULL,
+ query_text text,
+ chunk_text text
+);
+
+
+--
+-- Name: rerank_feedback_id_seq; Type: SEQUENCE; Schema: public; Owner: -
+--
+
+CREATE SEQUENCE public.rerank_feedback_id_seq
+ START WITH 1
+ INCREMENT BY 1
+ NO MINVALUE
+ NO MAXVALUE
+ CACHE 1;
+
+
+--
+-- Name: rerank_feedback_id_seq; Type: SEQUENCE OWNED BY; Schema: public; Owner: -
+--
+
+ALTER SEQUENCE public.rerank_feedback_id_seq OWNED BY public.rerank_feedback.id;
+
+
+--
+-- Name: ingestion_retry_logs id; Type: DEFAULT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.ingestion_retry_logs ALTER COLUMN id SET DEFAULT nextval('public.ingestion_retry_logs_id_seq'::regclass);
+
+
+--
+-- Name: intent_feedback id; Type: DEFAULT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.intent_feedback ALTER COLUMN id SET DEFAULT nextval('public.intent_feedback_id_seq'::regclass);
+
+
+--
+-- Name: rerank_feedback id; Type: DEFAULT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.rerank_feedback ALTER COLUMN id SET DEFAULT nextval('public.rerank_feedback_id_seq'::regclass);
+
+
+--
+-- Name: answer_feedback answer_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.answer_feedback
+ ADD CONSTRAINT answer_feedback_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: category_centroids category_centroids_document_type_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.category_centroids
+ ADD CONSTRAINT category_centroids_document_type_key UNIQUE (document_type);
+
+
+--
+-- Name: category_centroids category_centroids_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.category_centroids
+ ADD CONSTRAINT category_centroids_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: chat_memory chat_memory_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.chat_memory
+ ADD CONSTRAINT chat_memory_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: document_trees document_trees_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.document_trees
+ ADD CONSTRAINT document_trees_user_file_hash_key UNIQUE (user_id, file_hash);
+
+
+--
+-- Name: documents documents_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.documents
+ ADD CONSTRAINT documents_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: evaluation_datasets evaluation_datasets_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evaluation_datasets
+ ADD CONSTRAINT evaluation_datasets_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: evaluation_datasets evaluation_datasets_trace_id_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evaluation_datasets
+ ADD CONSTRAINT evaluation_datasets_trace_id_key UNIQUE (trace_id);
+
+
--
-
+-- Name: evaluation_logs evaluation_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evaluation_logs
+ ADD CONSTRAINT evaluation_logs_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: graph_edges graph_edges_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.graph_edges
+ ADD CONSTRAINT graph_edges_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: graph_edges graph_edges_user_id_source_node_key_target_node_key_edge_ty_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.graph_edges
+ ADD CONSTRAINT graph_edges_user_id_source_node_key_target_node_key_edge_ty_key UNIQUE (user_id, source_node_key, target_node_key, edge_type);
+
+
+--
+-- Name: graph_nodes graph_nodes_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.graph_nodes
+ ADD CONSTRAINT graph_nodes_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: graph_nodes graph_nodes_user_id_node_key_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.graph_nodes
+ ADD CONSTRAINT graph_nodes_user_id_node_key_key UNIQUE (user_id, node_key);
+
+
+--
+-- Name: ingested_files ingested_files_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.ingested_files
+ ADD CONSTRAINT ingested_files_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: ingested_files ingested_files_user_file_hash_key; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.ingested_files
+ ADD CONSTRAINT ingested_files_user_file_hash_key UNIQUE (user_id, file_hash);
+
+
+--
+-- Name: ingestion_retry_logs ingestion_retry_logs_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.ingestion_retry_logs
+ ADD CONSTRAINT ingestion_retry_logs_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: intent_feedback intent_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.intent_feedback
+ ADD CONSTRAINT intent_feedback_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: query_traces query_traces_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.query_traces
+ ADD CONSTRAINT query_traces_pkey PRIMARY KEY (trace_id);
+
+
+--
+-- Name: rerank_feedback rerank_feedback_pkey; Type: CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.rerank_feedback
+ ADD CONSTRAINT rerank_feedback_pkey PRIMARY KEY (id);
+
+
+--
+-- Name: category_centroids_type_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX category_centroids_type_idx ON public.category_centroids USING btree (document_type);
+
+
+--
+-- Name: category_centroids_user_id_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX category_centroids_user_id_idx ON public.category_centroids USING btree (user_id);
+
+
+--
+-- Name: category_centroids_user_type_uidx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE UNIQUE INDEX category_centroids_user_type_uidx ON public.category_centroids USING btree (user_id, document_type);
+
+
+--
+-- Name: chat_memory_user_id_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX chat_memory_user_id_idx ON public.chat_memory USING btree (user_id);
+
+
+--
+-- Name: doc_node_type_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX doc_node_type_idx ON public.documents USING btree (node_type);
+
+
+--
+-- Name: documents_content_fts_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX documents_content_fts_idx ON public.documents USING gin (to_tsvector('english'::regconfig, content));
+
+
+--
+-- Name: documents_embedding_hnsw_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX documents_embedding_hnsw_idx ON public.documents USING hnsw (((embedding)::extensions.halfvec(2048)) extensions.halfvec_cosine_ops) WITH (m='16', ef_construction='64');
+
+
+--
+-- Name: documents_metadata_filehash_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX documents_metadata_filehash_idx ON public.documents USING btree (((metadata ->> 'file_hash'::text)));
+
+
+--
+-- Name: documents_metadata_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX documents_metadata_idx ON public.documents USING gin (metadata);
+
+
+--
+-- Name: documents_user_id_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX documents_user_id_idx ON public.documents USING btree (user_id);
+
+
+--
+-- Name: evaluation_logs_evaluated_at_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX evaluation_logs_evaluated_at_idx ON public.evaluation_logs USING btree (evaluated_at DESC);
+
+
+--
+-- Name: evaluation_logs_run_label_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX evaluation_logs_run_label_idx ON public.evaluation_logs USING btree (run_label);
+
+
+--
+-- Name: idx_answer_feedback_review_state_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_answer_feedback_review_state_created ON public.answer_feedback USING btree (review_state, created_at DESC);
+
+
+--
+-- Name: idx_answer_feedback_trace_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_answer_feedback_trace_created ON public.answer_feedback USING btree (trace_id, created_at DESC);
+
+
+--
+-- Name: idx_answer_feedback_user_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_answer_feedback_user_created ON public.answer_feedback USING btree (user_id, created_at DESC);
+
+
+--
+-- Name: idx_chat_memory_session; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_chat_memory_session ON public.chat_memory USING btree (session_id);
+
+
+--
+-- Name: idx_document_trees_json; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_document_trees_json ON public.document_trees USING gin (tree_json);
+
+
+--
+-- Name: idx_evaluation_datasets_active_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_evaluation_datasets_active_created ON public.evaluation_datasets USING btree (is_active, created_at DESC);
+
+
+--
+-- Name: idx_graph_edges_user_source; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_graph_edges_user_source ON public.graph_edges USING btree (user_id, source_node_key);
+
+
+--
+-- Name: idx_graph_edges_user_target; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_graph_edges_user_target ON public.graph_edges USING btree (user_id, target_node_key);
+
+
+--
+-- Name: idx_graph_nodes_user_label; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_graph_nodes_user_label ON public.graph_nodes USING btree (user_id, label);
+
+
+--
+-- Name: idx_graph_nodes_user_type; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_graph_nodes_user_type ON public.graph_nodes USING btree (user_id, node_type);
+
+
+--
+-- Name: idx_query_traces_review_state_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_query_traces_review_state_created ON public.query_traces USING btree (review_state, created_at DESC);
+
+
+--
+-- Name: idx_query_traces_session_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_query_traces_session_created ON public.query_traces USING btree (session_id, created_at DESC);
+
+
+--
+-- Name: idx_query_traces_user_created; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX idx_query_traces_user_created ON public.query_traces USING btree (user_id, created_at DESC);
+
+
+--
+-- Name: ingested_files_hash_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX ingested_files_hash_idx ON public.ingested_files USING btree (file_hash);
+
+
+--
+-- Name: ingested_files_user_file_hash_uidx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE UNIQUE INDEX ingested_files_user_file_hash_uidx ON public.ingested_files USING btree (user_id, file_hash);
+
+
+--
+-- Name: ingested_files_user_id_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX ingested_files_user_id_idx ON public.ingested_files USING btree (user_id);
+
+
+--
+-- Name: ingestion_retry_logs_created_at_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX ingestion_retry_logs_created_at_idx ON public.ingestion_retry_logs USING btree (created_at DESC);
+
+
+--
+-- Name: ingestion_retry_logs_user_file_event_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX ingestion_retry_logs_user_file_event_idx ON public.ingestion_retry_logs USING btree (user_id, file_hash, event_type, created_at DESC);
+
+
+--
+-- Name: ingestion_retry_logs_user_id_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX ingestion_retry_logs_user_id_idx ON public.ingestion_retry_logs USING btree (user_id);
+
+
+--
+-- Name: intent_feedback_user_id_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX intent_feedback_user_id_idx ON public.intent_feedback USING btree (user_id);
+
+
+--
+-- Name: rerank_feedback_doc_type_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX rerank_feedback_doc_type_idx ON public.rerank_feedback USING btree (document_type);
+
+
+--
+-- Name: rerank_feedback_user_created_idx; Type: INDEX; Schema: public; Owner: -
+--
+
+CREATE INDEX rerank_feedback_user_created_idx ON public.rerank_feedback USING btree (user_id, created_at DESC);
+
+
+--
+-- Name: category_centroids trg_centroids_updated_at; Type: TRIGGER; Schema: public; Owner: -
+--
+
+CREATE TRIGGER trg_centroids_updated_at BEFORE UPDATE ON public.category_centroids FOR EACH ROW EXECUTE FUNCTION public._trg_set_updated_at();
+
+
+--
+-- Name: answer_feedback answer_feedback_trace_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.answer_feedback
+ ADD CONSTRAINT answer_feedback_trace_id_fkey FOREIGN KEY (trace_id) REFERENCES public.query_traces(trace_id) ON DELETE CASCADE;
+
+
+--
+-- Name: evaluation_datasets evaluation_datasets_trace_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: -
+--
+
+ALTER TABLE ONLY public.evaluation_datasets
+ ADD CONSTRAINT evaluation_datasets_trace_id_fkey FOREIGN KEY (trace_id) REFERENCES public.query_traces(trace_id) ON DELETE SET NULL;
+
+
+--
+-- Name: answer_feedback; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.answer_feedback ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: category_centroids; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.category_centroids ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: chat_memory; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.chat_memory ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: chat_memory chat_memory_delete_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY chat_memory_delete_own ON public.chat_memory FOR DELETE USING ((user_id = auth.uid()));
+
+
+--
+-- Name: chat_memory chat_memory_insert_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY chat_memory_insert_own ON public.chat_memory FOR INSERT WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: chat_memory chat_memory_select_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY chat_memory_select_own ON public.chat_memory FOR SELECT USING ((user_id = auth.uid()));
+
+
+--
+-- Name: chat_memory chat_memory_update_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY chat_memory_update_own ON public.chat_memory FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: document_trees; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.document_trees ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: document_trees document_trees_delete_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY document_trees_delete_own ON public.document_trees FOR DELETE USING ((user_id = auth.uid()));
+
+
+--
+-- Name: document_trees document_trees_insert_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY document_trees_insert_own ON public.document_trees FOR INSERT WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: document_trees document_trees_select_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY document_trees_select_own ON public.document_trees FOR SELECT USING ((user_id = auth.uid()));
+
+
+--
+-- Name: document_trees document_trees_update_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY document_trees_update_own ON public.document_trees FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: documents; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.documents ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: documents documents_delete_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY documents_delete_own ON public.documents FOR DELETE USING ((user_id = auth.uid()));
+
+
+--
+-- Name: documents documents_insert_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY documents_insert_own ON public.documents FOR INSERT WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: documents documents_select_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY documents_select_own ON public.documents FOR SELECT USING ((user_id = auth.uid()));
+
+
+--
+-- Name: documents documents_update_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY documents_update_own ON public.documents FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: evaluation_datasets; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.evaluation_datasets ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: evaluation_logs; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.evaluation_logs ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: graph_edges; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.graph_edges ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: graph_nodes; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.graph_nodes ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: ingested_files; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.ingested_files ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: ingested_files ingested_files_delete_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY ingested_files_delete_own ON public.ingested_files FOR DELETE USING ((user_id = auth.uid()));
+
+
+--
+-- Name: ingested_files ingested_files_insert_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY ingested_files_insert_own ON public.ingested_files FOR INSERT WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: ingested_files ingested_files_select_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY ingested_files_select_own ON public.ingested_files FOR SELECT USING ((user_id = auth.uid()));
+
+
+--
+-- Name: ingested_files ingested_files_update_own; Type: POLICY; Schema: public; Owner: -
+--
+
+CREATE POLICY ingested_files_update_own ON public.ingested_files FOR UPDATE USING ((user_id = auth.uid())) WITH CHECK ((user_id = auth.uid()));
+
+
+--
+-- Name: ingestion_retry_logs; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.ingestion_retry_logs ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: intent_feedback; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.intent_feedback ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: query_traces; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.query_traces ENABLE ROW LEVEL SECURITY;
+
+--
+-- Name: rerank_feedback; Type: ROW SECURITY; Schema: public; Owner: -
+--
+
+ALTER TABLE public.rerank_feedback ENABLE ROW LEVEL SECURITY;
+
+--
+-- PostgreSQL database dump complete
+--
+
+\unrestrict 32urOXpOnsQS0zoo7jGTkIs0BeRgGPyJVLWPDJ6IexS9GSsM4lpkxJaAg6FM0Ua