Spaces:
Running
feat: implement ingestion backoff, CI build modes, and rebrand to Morpheus
- **Resilience**: Added exponential backoff and retry loops for vector embeddings (`FallbackEmbeddings`) and Supabase RPC document inserts, preventing 429 crashes during heavy ingestion.
- **Telemetry**: Added `ingestion_retry_logs` tracking to monitor database rate limits.
- **Build Stability**: Introduced `NEXUS_BUILD_ASSETS_MODE` (light/full) to conditionally control heavy ML asset downloads and intent model training during Docker builds.
- **Graceful Degradation**: Bypassed Supabase operations in `intent_classifier` if credentials are missing, preventing crashes in CI environments.
- **Observability**: Added `/health/details` endpoint exposing the intent classifier's readiness and bootstrap status.
- **Rebranding**: Renamed the project from "NEXUS" to "Morpheus" across FastAPI configs, frontend UI, logging, and chat agent personas.
- .claude/settings.local.json +15 -8
- .github/workflows/smoke.yml +37 -0
- Dockerfile +8 -2
- NEXUS_PROJECT_GUIDE.md +569 -0
- README.md +24 -0
- backend/core/build_ml_assets.py +33 -24
- backend/core/config.py +6 -2
- backend/core/intent_classifier.py +80 -35
- backend/core/pipeline.py +185 -74
- backend/core/providers.py +12 -9
- backend/main.py +26 -16
- frontend/index.html +6 -6
- frontend/js/chat.js +3 -3
- supabase/migrations/0004_ingestion_retry_logs.sql +21 -0
|
@@ -1,36 +1,43 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"hooks": {
|
| 3 |
-
"
|
| 4 |
{
|
|
|
|
| 5 |
"hooks": [
|
| 6 |
{
|
| 7 |
"type": "command",
|
| 8 |
"command": "powershell -NoProfile -File \"D:/Work/Projects/proj/.dual-graph/prime.ps1\""
|
| 9 |
}
|
| 10 |
-
]
|
| 11 |
-
"matcher": ""
|
| 12 |
}
|
| 13 |
],
|
| 14 |
"Stop": [
|
| 15 |
{
|
|
|
|
| 16 |
"hooks": [
|
| 17 |
{
|
| 18 |
"type": "command",
|
| 19 |
"command": "powershell -NoProfile -File \"D:/Work/Projects/proj/.dual-graph/stop_hook.ps1\""
|
| 20 |
}
|
| 21 |
-
]
|
| 22 |
-
"matcher": ""
|
| 23 |
}
|
| 24 |
],
|
| 25 |
-
"
|
| 26 |
{
|
|
|
|
| 27 |
"hooks": [
|
| 28 |
{
|
| 29 |
"type": "command",
|
| 30 |
"command": "powershell -NoProfile -File \"D:/Work/Projects/proj/.dual-graph/prime.ps1\""
|
| 31 |
}
|
| 32 |
-
]
|
| 33 |
-
"matcher": ""
|
| 34 |
}
|
| 35 |
]
|
| 36 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"Bash(where launch:*)",
|
| 5 |
+
"Bash(ls -la D:/Work/Projects/proj/launch*)",
|
| 6 |
+
"mcp__dual-graph__graph_read"
|
| 7 |
+
]
|
| 8 |
+
},
|
| 9 |
"hooks": {
|
| 10 |
+
"SessionStart": [
|
| 11 |
{
|
| 12 |
+
"matcher": "",
|
| 13 |
"hooks": [
|
| 14 |
{
|
| 15 |
"type": "command",
|
| 16 |
"command": "powershell -NoProfile -File \"D:/Work/Projects/proj/.dual-graph/prime.ps1\""
|
| 17 |
}
|
| 18 |
+
]
|
|
|
|
| 19 |
}
|
| 20 |
],
|
| 21 |
"Stop": [
|
| 22 |
{
|
| 23 |
+
"matcher": "",
|
| 24 |
"hooks": [
|
| 25 |
{
|
| 26 |
"type": "command",
|
| 27 |
"command": "powershell -NoProfile -File \"D:/Work/Projects/proj/.dual-graph/stop_hook.ps1\""
|
| 28 |
}
|
| 29 |
+
]
|
|
|
|
| 30 |
}
|
| 31 |
],
|
| 32 |
+
"PreCompact": [
|
| 33 |
{
|
| 34 |
+
"matcher": "",
|
| 35 |
"hooks": [
|
| 36 |
{
|
| 37 |
"type": "command",
|
| 38 |
"command": "powershell -NoProfile -File \"D:/Work/Projects/proj/.dual-graph/prime.ps1\""
|
| 39 |
}
|
| 40 |
+
]
|
|
|
|
| 41 |
}
|
| 42 |
]
|
| 43 |
}
|
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Smoke Checks
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: ["**"]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: ["**"]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
smoke:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- name: Checkout
|
| 14 |
+
uses: actions/checkout@v4
|
| 15 |
+
|
| 16 |
+
- name: Setup Python
|
| 17 |
+
uses: actions/setup-python@v5
|
| 18 |
+
with:
|
| 19 |
+
python-version: "3.10"
|
| 20 |
+
|
| 21 |
+
- name: Install dependencies
|
| 22 |
+
run: |
|
| 23 |
+
python -m pip install --upgrade pip
|
| 24 |
+
pip install -r requirements.txt
|
| 25 |
+
|
| 26 |
+
- name: Build assets (light mode)
|
| 27 |
+
env:
|
| 28 |
+
NEXUS_DISABLE_INTENT_BOOTSTRAP: "true"
|
| 29 |
+
NEXUS_BUILD_ASSETS_MODE: "light"
|
| 30 |
+
run: |
|
| 31 |
+
python -m backend.core.build_ml_assets
|
| 32 |
+
|
| 33 |
+
- name: Intent classifier smoke predict
|
| 34 |
+
env:
|
| 35 |
+
NEXUS_DISABLE_INTENT_BOOTSTRAP: "true"
|
| 36 |
+
run: |
|
| 37 |
+
python -c "from backend.core.intent_classifier import intent_classifier as ic; print(ic.predict('what are the key points?', False, False))"
|
|
@@ -28,8 +28,14 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 28 |
COPY --chown=user:user . .
|
| 29 |
|
| 30 |
# 7. Pre-build ML assets (downloads models to cache, trains intent classifier)
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# 8. Start FastAPI (7860 is the HF standard, but Railway uses $PORT)
|
| 34 |
ENV PORT=7860
|
| 35 |
-
CMD uvicorn backend.main:app --host 0.0.0.0 --port $PORT
|
|
|
|
| 28 |
COPY --chown=user:user . .
|
| 29 |
|
| 30 |
# 7. Pre-build ML assets (downloads models to cache, trains intent classifier)
|
| 31 |
+
ARG PREBUILD_ML_ASSETS=1
|
| 32 |
+
ARG NEXUS_BUILD_ASSETS_MODE=light
|
| 33 |
+
RUN if [ "$PREBUILD_ML_ASSETS" = "1" ]; then \
|
| 34 |
+
NEXUS_BUILD_ASSETS_MODE=$NEXUS_BUILD_ASSETS_MODE python -m backend.core.build_ml_assets ; \
|
| 35 |
+
else \
|
| 36 |
+
echo "Skipping ML asset pre-build"; \
|
| 37 |
+
fi
|
| 38 |
|
| 39 |
# 8. Start FastAPI (7860 is the HF standard, but Railway uses $PORT)
|
| 40 |
ENV PORT=7860
|
| 41 |
+
CMD ["sh", "-c", "uvicorn backend.main:app --host 0.0.0.0 --port ${PORT}"]
|
|
@@ -0,0 +1,569 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# NEXUS — Complete Project Understanding Guide
|
| 2 |
+
|
| 3 |
+
> Read this when starting a new session, onboarding someone, or after a long break.
|
| 4 |
+
> Everything you built, how it connects, and why decisions were made.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## What Is NEXUS?
|
| 9 |
+
|
| 10 |
+
NEXUS is a **multi-tenant RAG (Retrieval-Augmented Generation) platform**.
|
| 11 |
+
|
| 12 |
+
Users upload PDF documents. They ask questions in natural language. NEXUS finds the most relevant passages and uses an AI to generate accurate answers with source citations and confidence scores.
|
| 13 |
+
|
| 14 |
+
**What makes it non-trivial:**
|
| 15 |
+
- Each user sees only their own documents, enforced at the database level (RLS)
|
| 16 |
+
- Retrieval combines keyword search + semantic search + neural reranking
|
| 17 |
+
- The intent classifier learns from user behaviour and retrains automatically
|
| 18 |
+
- Similar questions get instant answers from a semantic cache
|
| 19 |
+
- Conversations are remembered across sessions (episodic memory)
|
| 20 |
+
- If one AI provider fails, the system automatically tries the next one
|
| 21 |
+
- Documents are indexed as hierarchical RAPTOR trees, not just flat chunks
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## Project Structure
|
| 26 |
+
|
| 27 |
+
```
proj/
├── app.py                       # FastAPI entry point
├── backend/
│   ├── main.py                  # App startup, Celery worker, rate limiter
│   ├── api/
│   │   ├── auth.py              # /api/v1/auth/* — login, /me, /verify
│   │   ├── query.py             # /api/v1/query — SSE streaming RAG endpoint
│   │   ├── corpus.py            # /api/v1/corpus/* — files, delete, rename, recategorise
│   │   ├── ingest.py            # /api/v1/ingest/* — upload, status polling
│   │   ├── frontend_config.py   # /api/v1/config — serves Supabase keys to frontend
│   │   └── admin.py             # /api/v1/admin — master key, daily code
│   └── core/
│       ├── pipeline.py          # The entire brain — ingestion + retrieval + generation
│       ├── providers.py         # All AI provider wrappers (Groq, Gemini, OpenRouter)
│       ├── classifier.py        # 3-stage document classifier
│       ├── intent_classifier.py # Neural intent classifier (sklearn, retrains online)
│       ├── cache_manager.py     # Semantic Redis cache with version invalidation
│       ├── auth_utils.py        # JWT decoding helpers
│       ├── config.py            # All constants and model lists
│       └── tasks.py             # Celery task definition
├── frontend/
│   ├── index.html
│   └── js/
│       ├── main.js              # Boot sequence, auth gate, tab hook
│       ├── api.js               # All fetch() calls — single source of truth
│       ├── state.js             # Global STATE object
│       ├── corpus.js            # Upload, document list, category review
│       ├── chat.js              # Chat UI, streaming token renderer
│       ├── graph.js             # D3 force-directed graph
│       ├── inspect.js           # Right-panel node inspector
│       ├── ui.js                # switchView, toast, shared UI utils
│       └── config.js            # CONFIG object, initSupabase()
└── supabase/
    ├── rls/multi_tenancy_rls.sql
    └── migrations/
```
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## The Database (Supabase / PostgreSQL)
|
| 68 |
+
|
| 69 |
+
### Tables
|
| 70 |
+
|
| 71 |
+
**`documents`** — The vector store. Every chunk from every PDF lives here.
|
| 72 |
+
|
| 73 |
+
| Column | Type | Purpose |
|
| 74 |
+
|--------|------|---------|
|
| 75 |
+
| `id` | uuid | Chunk ID (deterministic: uuid5 of file_hash + chunk_index) |
|
| 76 |
+
| `content` | text | The chunk text that gets searched |
|
| 77 |
+
| `metadata` | jsonb | Source, file_hash, document_type, page_numbers, chunk_index, relevance_score |
|
| 78 |
+
| `embedding` | vector(2048) | The nvidia-nemotron embedding for pgvector semantic search |
|
| 79 |
+
| `user_id` | uuid | RLS tenant isolation — set by insert_document_chunk RPC |
|
| 80 |
+
|
| 81 |
+
**`ingested_files`** — Dedup registry. Checked before every upload.
|
| 82 |
+
|
| 83 |
+
| Column | Type | Purpose |
|
| 84 |
+
|--------|------|---------|
|
| 85 |
+
| `file_hash` | text | SHA-256 of the PDF — the dedup key |
|
| 86 |
+
| `filename` | text | Display name shown in the UI |
|
| 87 |
+
| `document_type` | text | Category e.g. academic_syllabus |
|
| 88 |
+
| `chunk_count` | int | How many chunks (includes RAPTOR tree nodes) |
|
| 89 |
+
| `user_id` | uuid | Tenant isolation |
|
| 90 |
+
| `user_overridden` | bool | True if user manually changed the category — classifier skips when true |
|
| 91 |
+
|
| 92 |
+
**`chat_memory`** — Episodic memory. Past Q&A pairs, searchable by semantic similarity.
|
| 93 |
+
|
| 94 |
+
| Column | Type | Purpose |
|
| 95 |
+
|--------|------|---------|
|
| 96 |
+
| `session_id` | text | Groups messages from the same conversation |
|
| 97 |
+
| `role` | text | "user" or "assistant" |
|
| 98 |
+
| `content` | text | The message |
|
| 99 |
+
| `embedding` | vector | For semantic search via match_memory RPC |
|
| 100 |
+
| `user_id` | uuid | Tenant isolation |
|
| 101 |
+
|
| 102 |
+
**`category_centroids`** — The classifier's learned memory.
|
| 103 |
+
|
| 104 |
+
| Column | Type | Purpose |
|
| 105 |
+
|--------|------|---------|
|
| 106 |
+
| `document_type` | text | Category label |
|
| 107 |
+
| `centroid_vector` | array | Running average embedding of all docs of this type |
|
| 108 |
+
| `document_count` | int | How many documents contributed |
|
| 109 |
+
| `user_id` | uuid | Per-tenant centroids |
|
| 110 |
+
|
| 111 |
+
**`evaluation_logs`** — RAGAS quality metrics. Written after every query.
|
| 112 |
+
|
| 113 |
+
| Column | Type | Purpose |
|
| 114 |
+
|--------|------|---------|
|
| 115 |
+
| `run_label` | text | "production" for live, timestamp for offline eval runs |
|
| 116 |
+
| `question` | text | The query |
|
| 117 |
+
| `relevance_proxy` | float | Average Cohere relevance score across chunks |
|
| 118 |
+
| `precision_at_k` | float | Max Cohere relevance score (most relevant chunk) |
|
| 119 |
+
| `final_score` | float | Best quality proxy available online |
|
| 120 |
+
|
| 121 |
+
**`rerank_feedback`** — Every Cohere rerank decision, stored for future CrossEncoder distillation.
|
| 122 |
+
|
| 123 |
+
**`intent_feedback`** — Online training data for the intent classifier.
|
| 124 |
+
|
| 125 |
+
**`mv_document_types`** — Materialized view. The category filter dropdown reads from here.
|
| 126 |
+
|
| 127 |
+
### Supabase RPC Functions
|
| 128 |
+
|
| 129 |
+
These are stored procedures called from Python like regular functions:
|
| 130 |
+
|
| 131 |
+
- `hybrid_search(query_text, query_embedding, match_count, filter, semantic_weight, keyword_weight)` — Combined BM25 + pgvector search
|
| 132 |
+
- `match_memory(query_embedding, match_session_id, match_count)` — Semantic search over chat history
|
| 133 |
+
- `insert_document_chunk(p_id, p_content, p_metadata, p_embedding, p_user_id)` — Secure insert that bypasses RLS by accepting user_id explicitly
|
| 134 |
+
- `refresh_document_types_mv()` — Refreshes the category filter view
|
| 135 |
+
- `get_document_types()` — Returns distinct categories for this tenant
|
| 136 |
+
|
| 137 |
+
### Row Level Security
|
| 138 |
+
|
| 139 |
+
Every table has RLS policies. The core rule: `user_id = auth.uid()` for reads. Writes from Celery workers use the service role key but always inject `user_id` explicitly via the `insert_document_chunk` RPC. This means the security check happens at the API boundary (JWT validation) and the database enforces it for reads.
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## The Ingestion Pipeline
|
| 144 |
+
|
| 145 |
+
When a user uploads a PDF:
|
| 146 |
+
|
| 147 |
+
```
|
| 148 |
+
Browser
|
| 149 |
+
POST /api/v1/ingest/upload
|
| 150 |
+
FastAPI validates JWT (require_auth_token)
|
| 151 |
+
Saves PDF to temp file
|
| 152 |
+
process_pdf_task.delay() pushes to Redis queue
|
| 153 |
+
Returns {task_id} immediately (200 OK)
|
| 154 |
+
Browser polls /api/v1/ingest/status/{task_id} every 2 seconds
|
| 155 |
+
|
| 156 |
+
Celery worker:
|
| 157 |
+
|
| 158 |
+
Step 1: SHA-256 fingerprint
|
| 159 |
+
Check ingested_files for this hash (O(1) indexed lookup)
|
| 160 |
+
If found: return "already_ingested"
|
| 161 |
+
Check if user_overridden=True: load forced_category, skip classifier
|
| 162 |
+
|
| 163 |
+
Step 2: PDF partitioning (unstructured library)
|
| 164 |
+
partition_pdf() β OCR + layout detection
|
| 165 |
+
extract_images_from_pdf() β PyMuPDF, filters tiny/skewed images
|
| 166 |
+
Returns list of Element objects (Title, NarrativeText, Table, Image...)
|
| 167 |
+
|
| 168 |
+
Step 3: Classification (classifier.py)
|
| 169 |
+
Three-stage cascade:
|
| 170 |
+
Stage 1: Centroid nearest-neighbour (cosine similarity, no API call)
|
| 171 |
+
If confidence >= 0.72: done
|
| 172 |
+
Stage 2: Ensemble vote (centroid + label-embed + TF-IDF)
|
| 173 |
+
If confidence >= 0.38: done
|
| 174 |
+
Stage 3: LLM chain-of-thought (last resort for novel document types)
|
| 175 |
+
Special: Sparse/tabular pre-check routes to visual classification
|
| 176 |
+
If user_overridden=True: skip all stages, use forced_category directly
|
| 177 |
+
|
| 178 |
+
Step 4: Chunking + AI summaries
|
| 179 |
+
chunk_by_title() groups elements into logical sections
|
| 180 |
+
For chunks with tables or images: parallel AI vision summarisation (5 workers)
|
| 181 |
+
Each chunk becomes a LangChain Document with rich metadata
|
| 182 |
+
|
| 183 |
+
Step 5: RAPTOR tree indexing
|
| 184 |
+
Groups leaf chunks into clusters of 5
|
| 185 |
+
Generates LLM parent summary for each cluster
|
| 186 |
+
Repeats until single root node
|
| 187 |
+
All nodes (leaves + summaries) get uploaded
|
| 188 |
+
Root node answers "what is this document about?"
|
| 189 |
+
Leaf nodes answer specific detail questions
|
| 190 |
+
|
| 191 |
+
Step 6: Embedding + upload
|
| 192 |
+
Batch embed all nodes via nvidia-nemotron (2048 dims)
|
| 193 |
+
Insert each via insert_document_chunk RPC (explicit user_id, no RLS issue)
|
| 194 |
+
Register in ingested_files
|
| 195 |
+
Invalidate semantic cache for this user (kb_version++)
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
---
|
| 199 |
+
|
| 200 |
+
## The Retrieval Pipeline
|
| 201 |
+
|
| 202 |
+
When a user asks a question:
|
| 203 |
+
|
| 204 |
+
```
|
| 205 |
+
Browser
|
| 206 |
+
POST /api/v1/query {query, category, history, session_id, alpha}
|
| 207 |
+
X-Auth-Token header
|
| 208 |
+
|
| 209 |
+
FastAPI validates JWT, starts SSE streaming response
|
| 210 |
+
|
| 211 |
+
Step 1: Intent analysis (analyse_intent)
|
| 212 |
+
Local sklearn classifier, <5ms, no API call
|
| 213 |
+
Inputs: query text, has_category, has_history
|
| 214 |
+
Output: {is_clear, enriched_query, clarification_question}
|
| 215 |
+
If needs_clarification: stream question back, stop
|
| 216 |
+
After 2 consecutive clarification turns: proceed regardless
|
| 217 |
+
Enrichment: if reference query ("summarise it"), replace with previous query
|
| 218 |
+
Logs to intent_feedback for online retraining
|
| 219 |
+
|
| 220 |
+
Step 2: retrieve_chunks()
|
| 221 |
+
a) Follow-up detection
|
| 222 |
+
Short query (<=8 words) with pronouns (it/this/that/they)?
|
| 223 |
+
Reuse _last_chunks[session_key] — no re-search needed
|
| 224 |
+
|
| 225 |
+
b) Semantic cache check
|
| 226 |
+
Embed original query (256-entry in-memory LRU cache)
|
| 227 |
+
Scan Redis for cosine similarity >= 0.92
|
| 228 |
+
Hit: return __CACHE_HIT__ sentinel document
|
| 229 |
+
|
| 230 |
+
c) Query rewriting
|
| 231 |
+
LLM breaks query into 1-3 targeted sub-queries
|
| 232 |
+
Short queries (<=3 words) skip this step
|
| 233 |
+
|
| 234 |
+
d) Hybrid search (per sub-query)
|
| 235 |
+
hybrid_search RPC: BM25 keywords + pgvector semantics combined
|
| 236 |
+
alpha=0.5 means equal weight (user can adjust via slider)
|
| 237 |
+
Deduplicates across sub-queries by chunk ID
|
| 238 |
+
If category filter active: hard filter on document_type
|
| 239 |
+
|
| 240 |
+
e) Reranking (3-tier fallback)
|
| 241 |
+
Tier 1: Cohere rerank-multilingual-v3.0 (cloud API, best quality)
|
| 242 |
+
Tier 2: CrossEncoder ms-marco-MiniLM-L-6-v2 (local CUDA, free)
|
| 243 |
+
Tier 3: Lexical Jaccard similarity (pure Python, always works)
|
| 244 |
+
Relevance threshold: 0.35 (relaxed to 0.05 for small corpus)
|
| 245 |
+
Diversity filter: max 2 chunks per source, cross-category seeding
|
| 246 |
+
Context budget: trim if total chars > 14,000
|
| 247 |
+
|
| 248 |
+
f) Log rerank feedback (fire-and-forget thread)
|
| 249 |
+
All Cohere scores stored for future CrossEncoder distillation
|
| 250 |
+
|
| 251 |
+
Step 3: generate_answer_stream()
|
| 252 |
+
If __CACHE_HIT__: stream cached answer directly (skip LLM entirely)
|
| 253 |
+
Retrieve episodic memory: match_memory RPC (past relevant Q&A pairs)
|
| 254 |
+
Build prompt: system role + retrieved chunks + memories + history + query
|
| 255 |
+
Stream tokens via Groq (primary) -> Gemini -> OpenRouter fallback
|
| 256 |
+
After streaming: save Q&A pair to chat_memory with embeddings (thread)
|
| 257 |
+
Store in semantic cache (version key + TTL by document type)
|
| 258 |
+
|
| 259 |
+
Step 4: Emit sources
|
| 260 |
+
Collect metadata from retrieved chunks
|
| 261 |
+
Send {type: "done", sources: [...], images: [...]} SSE event
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
---
|
| 265 |
+
|
| 266 |
+
## The Provider System
|
| 267 |
+
|
| 268 |
+
`providers.py` routes each task to the best available AI provider.
|
| 269 |
+
|
| 270 |
+
```
|
| 271 |
+
ProviderFactory.build_chat_llm(purpose=...)
|
| 272 |
+
|
| 273 |
+
purpose="text" Groq (fast, generous limits) -> Gemini -> OpenRouter
|
| 274 |
+
purpose="ingestion" Gemini (1M context, good at summaries) -> OpenRouter
|
| 275 |
+
purpose="vision" Gemini (native multimodal) -> OpenRouter vision models
|
| 276 |
+
purpose="rewriter" OpenRouter (cheap per-call) -> Groq fallback
|
| 277 |
+
purpose="classifier" OpenRouter classifier models only
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
**Current model lists:**
|
| 281 |
+
- Groq: `llama-4-scout-17b` -> `llama-3.3-70b-versatile` -> `qwen3-32b` -> `llama-3.1-8b-instant`
|
| 282 |
+
- Gemini: `gemini-2.5-flash` -> `gemini-2.5-flash-lite` (updated from deprecated 1.5/2.0)
|
| 283 |
+
- OpenRouter text: `stepfun/step-3.5-flash:free` -> `nvidia/nemotron-3-super-120b:free` -> `arcee-ai/trinity-large-preview:free` -> more
|
| 284 |
+
- Embeddings: `nvidia/llama-nemotron-embed-vl-1b-v2:free` (2048 dims) -> `text-embedding-3-small`
|
| 285 |
+
|
| 286 |
+
**FallbackEmbeddings null guard:** OpenRouter sometimes returns HTTP 200 with `data=null`. The guard raises `ValueError` on null response and retries the next model instead of crashing ingestion.
|
| 287 |
+
|
| 288 |
+
---
|
| 289 |
+
|
| 290 |
+
## The Semantic Cache
|
| 291 |
+
|
| 292 |
+
`cache_manager.py` — graduated invalidation with semantic similarity lookup.
|
| 293 |
+
|
| 294 |
+
**How it works:**
|
| 295 |
+
1. Each user has a `kb_version` integer in Redis: `nexus:kb_version:{user_id}`
|
| 296 |
+
2. Cache entries use version in key: `nexus:qcache:{user_id}:v{version}:...`
|
| 297 |
+
3. On lookup: scan all entries for this user+version, find best cosine similarity match
|
| 298 |
+
4. Hit threshold: 0.92 (strict to avoid returning wrong answers)
|
| 299 |
+
5. On corpus change: `increment_kb_version()` -> version goes N to N+1
|
| 300 |
+
6. Old v1 entries invisible under v2 — effectively invalidated
|
| 301 |
+
|
| 302 |
+
**Critical fix applied:** The version key must be written to Redis on first store via `r.setnx(version_key, kb_version)`. Without this, `r.incr()` on a non-existent key initialises to 0 then increments to 1 — same as the default — so old cache entries remain visible after delete.
|
| 303 |
+
|
| 304 |
+
**TTL by document type:** academic_syllabus/reference_chart = 7 days, technical_manual/research_paper = 3 days, financial_report/hr_policy = 1 day, general_document = 1 hour.
|
| 305 |
+
|
| 306 |
+
---
|
| 307 |
+
|
| 308 |
+
## The Intent Classifier
|
| 309 |
+
|
| 310 |
+
`intent_classifier.py` — sklearn-based, runs locally, under 5ms per query.
|
| 311 |
+
|
| 312 |
+
**What it classifies:** Does this query need clarification or is it clear enough to proceed?
|
| 313 |
+
|
| 314 |
+
**Features:** `has_category`, `has_history`, query text embedding via `all-MiniLM-L6-v2`.
|
| 315 |
+
|
| 316 |
+
**Online learning:** Every 25 queries logged to `intent_feedback`, the model retrains automatically and saves to `intent_model.pkl`. Currently at v2 with 158+ examples.
|
| 317 |
+
|
| 318 |
+
**Clarification limit:** After 2 consecutive clarification turns, the system proceeds regardless. Prevents the system from getting stuck in a clarification loop.
|
| 319 |
+
|
| 320 |
+
---
|
| 321 |
+
|
| 322 |
+
## The Document Classifier
|
| 323 |
+
|
| 324 |
+
`classifier.py` — three-stage cascade.
|
| 325 |
+
|
| 326 |
+
```
|
| 327 |
+
Incoming document
|
| 328 |
+
|
|
| 329 |
+
v
|
| 330 |
+
Sparse/tabular pre-check (words < 200 OR unique_ratio > 0.85)
|
| 331 |
+
YES: visual classification (structural fingerprint to LLM)
|
| 332 |
+
NO: continue
|
| 333 |
+
|
|
| 334 |
+
v
|
| 335 |
+
Stage 1: Centroid nearest-neighbour
|
| 336 |
+
Cosine similarity to stored category centroids
|
| 337 |
+
Confidence >= 0.72: done
|
| 338 |
+
|
|
| 339 |
+
v
|
| 340 |
+
Stage 2: Ensemble vote
|
| 341 |
+
Signal A: cosine to known centroids (weight 0.45)
|
| 342 |
+
Signal B: cosine to category label embeddings (weight 0.30)
|
| 343 |
+
Signal C: TF-IDF keyword matching (weight 0.25)
|
| 344 |
+
Score >= 0.38: done
|
| 345 |
+
|
|
| 346 |
+
v
|
| 347 |
+
Stage 3: LLM chain-of-thought
|
| 348 |
+
Sends excerpt to classifier LLM
|
| 349 |
+
Classifies FORMAT and STRUCTURE (not just topic)
|
| 350 |
+
Fallback: "general_document"
|
| 351 |
+
```
|
| 352 |
+
|
| 353 |
+
After classification, the centroid is updated with this document's vector — the classifier learns with every ingestion.
|
| 354 |
+
|
| 355 |
+
**User override lock:** If `ingested_files.user_overridden=True` for this file hash, the entire classifier is skipped. Returns synthetic result with `stage_used="user_override"`, `confidence=1.0`.
|
| 356 |
+
|
| 357 |
+
---
|
| 358 |
+
|
| 359 |
+
## The Frontend
|
| 360 |
+
|
| 361 |
+
### Authentication Flow
|
| 362 |
+
```
|
| 363 |
+
Page load
|
| 364 |
+
initSupabase() fetches Supabase keys from /api/v1/config
|
| 365 |
+
supabaseClient.auth.getSession()
|
| 366 |
+
Session exists: showApp() + bootApp()
|
| 367 |
+
No session: showLogin()
|
| 368 |
+
|
| 369 |
+
Login
|
| 370 |
+
supabaseClient.auth.signInWithPassword(email, password)
|
| 371 |
+
JWT stored in localStorage by supabase-js automatically
|
| 372 |
+
Every request: getSupabaseToken() reads it from localStorage
|
| 373 |
+
Sent as X-Auth-Token header on every API call
|
| 374 |
+
Backend require_auth_token Depends() validates JWT and returns user_id
|
| 375 |
+
```
|
| 376 |
+
|
| 377 |
+
### Global State
|
| 378 |
+
`state.js` — the single source of truth for UI state:
|
| 379 |
+
- `STATE.files` — list of ingested documents from /api/v1/corpus/files
- `STATE.categories` — list of category strings
- `STATE.catColors` — color mapping for graph visualization
- `STATE.chatHistory` — current conversation turns
- `STATE.sessionId` — UUID generated per browser tab
- `STATE.simulation` — D3 force simulation reference
- `STATE.alpha` — retrieval weight slider (0=keyword, 1=semantic)
- `STATE.isThinking` — prevents double-submit
|
| 387 |
+
|
| 388 |
+
### Upload + Progress
|
| 389 |
+
`corpus.js` — `processUpload()` calls `apiIngestFile()`, then enters `pollIngestStatus()` which is an infinite loop (no timeout) that exits only on COMPLETED or FAILED. Shows heartbeat messages cycling through pipeline stages while waiting.
|
| 390 |
+
|
| 391 |
+
### Chat Streaming
|
| 392 |
+
`chat.js` — `sendChat()` has 500ms debounce guard. Creates the assistant bubble immediately with thinking dots. `async onToken()` yields to the browser with `await new Promise(r => setTimeout(r, 0))` after each token update so the DOM repaints during streaming rather than all at once at the end.
|
| 393 |
+
|
| 394 |
+
### Graph
|
| 395 |
+
`graph.js` — Obsidian-style D3 force simulation. Key: `graphReheat()` uses `alpha(0.3)` not `alphaTarget(0.2)`. The alpha method sets current energy directly — works even when the simulation has fully stopped. alphaTarget only sets where energy wants to decay toward, useless if the simulation is already stopped. `onGraphTabVisible()` is called from `main.js` tab wrapper with 50ms delay for the CSS display change to propagate before D3 reads panel dimensions.
|
| 396 |
+
|
| 397 |
+
---
|
| 398 |
+
|
| 399 |
+
## Complete Request Flow Example
|
| 400 |
+
|
| 401 |
+
```
|
| 402 |
+
User asks "What are the core courses?"
|
| 403 |
+
|
| 404 |
+
1. Browser POST /api/v1/query
|
| 405 |
+
Headers: X-Auth-Token: eyJ...
|
| 406 |
+
Body: {query, category="academic_syllabus", history, session_id, alpha=0.5}
|
| 407 |
+
|
| 408 |
+
2. FastAPI: require_auth_token decodes JWT, returns user_id="ee903934..."
|
| 409 |
+
|
| 410 |
+
3. analyse_intent()
|
| 411 |
+
sklearn: needs_clarification=False, conf=1.00
|
| 412 |
+
category active: enriched = "query academic_syllabus"
|
| 413 |
+
Logs to intent_feedback
|
| 414 |
+
|
| 415 |
+
4. retrieve_chunks()
|
| 416 |
+
No follow-up pronouns in query
|
| 417 |
+
Cache check: MISS (first time this query)
|
| 418 |
+
generate_sub_queries -> ["B.Tech CSE core courses", "program core credits", ...]
|
| 419 |
+
hybrid_search RPC x3 sub-queries -> 12 raw candidates
|
| 420 |
+
Cohere rerank -> ranked by relevance score
|
| 421 |
+
Threshold + diversity filter -> 3 final chunks
|
| 422 |
+
Store in _last_chunks[session_key]
|
| 423 |
+
Log rerank feedback (background thread)
|
| 424 |
+
|
| 425 |
+
5. generate_answer_stream()
|
| 426 |
+
No __CACHE_HIT__ sentinel
|
| 427 |
+
match_memory RPC -> 2 past relevant Q&A pairs from this session
|
| 428 |
+
Build prompt: system + 3 chunks + 2 memories + history + query
|
| 429 |
+
Groq astream() -> tokens arrive one by one
|
| 430 |
+
Yield {type:"token", content:"The"}, {type:"token", content:" core"}, ...
|
| 431 |
+
After streaming: save Q&A to chat_memory (background thread)
|
| 432 |
+
Store in semantic cache (version v4, TTL 3600s)
|
| 433 |
+
|
| 434 |
+
6. Yield {type:"done", sources:[...], images:[...]}
|
| 435 |
+
|
| 436 |
+
7. Browser: onToken() fills bubble token by token with DOM repaint
|
| 437 |
+
onDone() appends source chips (filtering __CACHE_HIT__ sentinels)
|
| 438 |
+
```
|
| 439 |
+
|
| 440 |
+
---
|
| 441 |
+
|
| 442 |
+
## Key Design Decisions
|
| 443 |
+
|
| 444 |
+
**Why Celery + Redis?** Ingestion takes 60-120 seconds (OCR, AI summaries, RAPTOR). FastAPI requests time out before that. Celery lets the task run in background while the browser polls for status.
|
| 445 |
+
|
| 446 |
+
**Why service role key for writes?** Celery workers have no browser session so `auth.uid()` is NULL. Security boundary is at the API level (JWT validation). The `insert_document_chunk` RPC accepts `user_id` as an explicit verified parameter extracted from the JWT.
|
| 447 |
+
|
| 448 |
+
**Why RAPTOR tree indexing?** Flat chunking misses questions that span multiple sections ("total credits across all categories"). RAPTOR builds parent summaries aggregating child content, enabling retrieval at multiple granularities — root nodes for overview questions, leaf nodes for specific details.
|
| 449 |
+
|
| 450 |
+
**Why semantic cache with version invalidation?** Repeated questions should not cost API calls. But cached answers must go stale when the corpus changes. Version invalidation solves the second problem without needing to track which cache entry references which document — increment the version, and all old entries become invisible.
|
| 451 |
+
|
| 452 |
+
**Why 3-tier reranker?** Cohere costs money and has rate limits. CrossEncoder is free but needs local GPU. Lexical always works. This order maximises quality while guaranteeing retrieval never fails completely.
|
| 453 |
+
|
| 454 |
+
**Why `alpha(0.3)` not `alphaTarget(0.2)` in graph reheat?** alphaTarget sets where the simulation wants to decay toward. If the simulation has already stopped (alpha < alphaMin = 0.001), alphaTarget does nothing — the simulation stays stopped. The alpha method sets the current energy directly and always forces a restart.
|
| 455 |
+
|
| 456 |
+
---
|
| 457 |
+
|
| 458 |
+
## Environment Variables
|
| 459 |
+
|
| 460 |
+
```
|
| 461 |
+
SUPABASE_URL=https://....supabase.co
|
| 462 |
+
SUPABASE_ANON_KEY=eyJ... # Frontend-safe, used for user-scoped reads
|
| 463 |
+
SUPABASE_SERVICE_KEY=eyJ... # Server-only, bypasses RLS for writes
|
| 464 |
+
OPENROUTER_API_KEY=sk-or-...
|
| 465 |
+
OPEN_ROUTER_BASE_URL=https://openrouter.ai/api/v1
|
| 466 |
+
GROQ_API_KEY=gsk_...
|
| 467 |
+
GEMINI_API_KEY=AI...
|
| 468 |
+
COHERE_API_KEY=lm1X...
|
| 469 |
+
REDIS_URL=redis://default:...@...redislabs.com:10519/0
|
| 470 |
+
MASTER_ADMIN_KEY=...
|
| 471 |
+
LOG_LEVEL=INFO
|
| 472 |
+
```
|
| 473 |
+
|
| 474 |
+
---
|
| 475 |
+
|
| 476 |
+
## Test Status
|
| 477 |
+
|
| 478 |
+
| Test | Status | Notes |
|
| 479 |
+
|------|--------|-------|
|
| 480 |
+
| T2.2 Cross-section question | PASS | 160 credits answered correctly |
|
| 481 |
+
| T2.3 Keyword-specific (Capstone) | FAIL | 0 chunks → needs PageIndex |
|
| 482 |
+
| T2.4 Out-of-corpus | PASS | Returns clean "No relevant documents" |
|
| 483 |
+
| T2.5 Category filter | PASS | Hard filter active in logs |
|
| 484 |
+
| T3.1 Vague no context | NOT RUN | |
|
| 485 |
+
| T3.2 Vague with category | NOT RUN | |
|
| 486 |
+
| T3.3 Clarification limit | NOT RUN | |
|
| 487 |
+
| T3.4 Follow-up detection | PASS | Reusing cached chunks confirmed |
|
| 488 |
+
| T-Cache.2 Same query cache hit | PASS | similarity 1.000 |
|
| 489 |
+
| T-Cache.3 Delete invalidates cache | PASS | v2 to v3 on delete |
|
| 490 |
+
| T-Provider Groq fallback | PASS | Groq 200 OK after Gemini 404 |
|
| 491 |
+
| T-Override User category lock | NOT RUN | Implemented, not tested |
|
| 492 |
+
| T-ErrorMsg Error message format | NOT RUN | Implemented, not tested |
|
| 493 |
+
|
| 494 |
+
---
|
| 495 |
+
|
| 496 |
+
## What Is Next
|
| 497 |
+
|
| 498 |
+
### Before showing to anyone
|
| 499 |
+
- README + architecture doc
|
| 500 |
+
- Deployment to Railway or HF Spaces (currently localhost only)
|
| 501 |
+
- Rate limiting per user on query endpoint (60/hour)
|
| 502 |
+
- Run remaining tests T3.1-T3.3, T-Override, T-ErrorMsg
|
| 503 |
+
|
| 504 |
+
### Next major feature β PageIndex
|
| 505 |
+
Fixes T2.3. The Capstone Project (DSN4097, 8 credits) is buried in a table in chunk 1 alongside 17 other items. Vector search and Cohere both miss it because the chunk summary emphasises the overall credit structure, not individual items. PageIndex builds a hierarchical tree index from the document structure and uses LLM reasoning to navigate it — not similarity search.
|
| 506 |
+
|
| 507 |
+
Build order:
|
| 508 |
+
1. Fork PageIndex, swap OpenAI for FallbackChatLLM
|
| 509 |
+
2. Add tree generation as optional step in run_ingestion()
|
| 510 |
+
3. New table: `document_trees` (file_hash, tree_json, user_id)
|
| 511 |
+
4. `route_query()`: vector path for simple queries, tree path for structured docs
|
| 512 |
+
5. D3 graph: show tree nodes when clicking into a document
|
| 513 |
+
|
| 514 |
+
### After PageIndex
|
| 515 |
+
- SetFit intent classifier upgrade (158+ examples, enough now)
|
| 516 |
+
- 3-class intent: clear / clarify / follow_up (follow_up currently heuristic)
|
| 517 |
+
- Corpus health dashboard (chunks, coverage, cache hit rate, avg relevance)
|
| 518 |
+
- Predictive cache prefetching
|
| 519 |
+
|
| 520 |
+
---
|
| 521 |
+
|
| 522 |
+
## The Three Self-Improvement Loops
|
| 523 |
+
|
| 524 |
+
NEXUS has three feedback loops that make it smarter over time:
|
| 525 |
+
|
| 526 |
+
**Loop 1 — Intent classifier (every 25 queries)**
|
| 527 |
+
User queries logged to intent_feedback. Every 25 rows, classifier retrains on accumulated examples. Learns your users' specific query patterns over time.
|
| 528 |
+
|
| 529 |
+
**Loop 2 — Document classifier (every ingestion)**
|
| 530 |
+
Each ingested document updates its category centroid. Next similar document gets Stage 1 centroid match instead of needing LLM. Classification gets faster and more accurate as corpus grows.
|
| 531 |
+
|
| 532 |
+
**Loop 3 — Reranker distillation (background, future)**
|
| 533 |
+
Every query logs Cohere rerank scores to rerank_feedback. Accumulated labels will be used to train local CrossEncoder to match Cohere quality without the API cost.
|
| 534 |
+
|
| 535 |
+
---
|
| 536 |
+
|
| 537 |
+
## Common Debugging
|
| 538 |
+
|
| 539 |
+
**Ingestion crashes at step 5 (embedding)**
|
| 540 |
+
Look for: `ValueError: Model X returned null embeddings`
|
| 541 |
+
Cause: OpenRouter returns HTTP 200 with data=null
|
| 542 |
+
Fix: FallbackEmbeddings null guard retries next model β should be in providers.py
|
| 543 |
+
|
| 544 |
+
**Cache not invalidating after delete**
|
| 545 |
+
Check Redis for key `nexus:kb_version:{user_id}`
|
| 546 |
+
If missing: first ingest happened before the setnx fix was applied
|
| 547 |
+
Fix: run a fresh ingest — `store_cached_answer()` calls `r.setnx()` which writes the key
|
| 548 |
+
|
| 549 |
+
**Graph not reheating on tab switch**
|
| 550 |
+
Check: `onGraphTabVisible` defined at bottom of graph.js
|
| 551 |
+
Check: `_hookGraphTabVisible` IIFE at bottom of main.js
|
| 552 |
+
Expected: graph animates within 50ms of tab click
|
| 553 |
+
|
| 554 |
+
**Classifier ignoring user category**
|
| 555 |
+
Check: `ingested_files.user_overridden = true` for that file hash
|
| 556 |
+
Look for in logs: `User override active β forcing category 'X', skipping classifier`
|
| 557 |
+
|
| 558 |
+
**`__CACHE_HIT__` showing as source chip**
|
| 559 |
+
Hard refresh browser (Ctrl+Shift+R) to load new chat.js
|
| 560 |
+
The `visibleSources` filter in `onDone()` strips it
|
| 561 |
+
|
| 562 |
+
**Gemini 404 errors during ingestion**
|
| 563 |
+
Check config.py `GEMINI_TEXT_MODELS` and `GEMINI_VISION_MODELS`
|
| 564 |
+
Must be `gemini-2.5-flash` and `gemini-2.5-flash-lite`
|
| 565 |
+
`gemini-1.5-flash` and `gemini-2.0-flash` are deprecated
|
| 566 |
+
|
| 567 |
+
---
|
| 568 |
+
|
| 569 |
+
*Last updated: March 2026*
|
|
@@ -92,6 +92,15 @@ pip install -r backend/requirements.txt
|
|
| 92 |
uvicorn backend.main:app --reload --port 8000
|
| 93 |
```
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
### 4. Open the frontend
|
| 96 |
Open `frontend/index.html` in your browser.
|
| 97 |
`config.js` already points to `http://localhost:8000`.
|
|
@@ -113,6 +122,14 @@ curl -X POST http://localhost:8000/api/v1/admin/warmup \
|
|
| 113 |
4. Add all your `.env` values in the Render dashboard (Environment tab)
|
| 114 |
5. Deploy → copy your Render URL (e.g. `https://nexus-api.onrender.com`)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
### Frontend — Vercel
|
| 117 |
1. Update `API_URL` in `frontend/js/config.js` to your Render URL
|
| 118 |
2. Go to [vercel.com](https://vercel.com) → New Project → connect your repo
|
|
@@ -136,6 +153,8 @@ All routes are at `/api/v1/`. Interactive docs at `/docs` (disable in prod via `
|
|
| 136 |
|
| 137 |
| Method | Path | Description |
|
| 138 |
|--------|------|-------------|
|
|
|
|
|
|
|
| 139 |
| POST | `/auth/verify` | Verify daily guest password |
|
| 140 |
| POST | `/auth/admin` | Verify master key, get today's code |
|
| 141 |
| GET | `/corpus/files` | List all ingested files |
|
|
@@ -158,3 +177,8 @@ The project is designed so you can scale each layer independently:
|
|
| 158 |
- **Add JWT auth**: Swap `verify_password()` in `api/auth.py` for JWT issuance. `services/auth.py` is unchanged.
|
| 159 |
- **Add new corpus operations**: Add a route to `api/corpus.py`. `pipeline.py` is unchanged.
|
| 160 |
- **Add new document types**: The classifier learns them automatically. Run warmup to reinforce centroids.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
uvicorn backend.main:app --reload --port 8000
|
| 93 |
```
|
| 94 |
|
| 95 |
+
### Optional: pre-build ML assets locally
|
| 96 |
+
```bash
|
| 97 |
+
# Light mode (downloads embedder only; skips intent training)
|
| 98 |
+
NEXUS_BUILD_ASSETS_MODE=light python -m backend.core.build_ml_assets
|
| 99 |
+
|
| 100 |
+
# Full mode (downloads embedder + trains intent model)
|
| 101 |
+
NEXUS_BUILD_ASSETS_MODE=full python -m backend.core.build_ml_assets
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
### 4. Open the frontend
|
| 105 |
Open `frontend/index.html` in your browser.
|
| 106 |
`config.js` already points to `http://localhost:8000`.
|
|
|
|
| 122 |
4. Add all your `.env` values in the Render dashboard (Environment tab)
|
| 123 |
5. Deploy → copy your Render URL (e.g. `https://nexus-api.onrender.com`)
|
| 124 |
|
| 125 |
+
#### Docker build tuning
|
| 126 |
+
The Dockerfile supports optional ML prebuild knobs:
|
| 127 |
+
|
| 128 |
+
- `PREBUILD_ML_ASSETS=1` (default) or `0`
|
| 129 |
+
- `NEXUS_BUILD_ASSETS_MODE=light` (default) or `full`
|
| 130 |
+
|
| 131 |
+
`light` is recommended for stability and faster builds.
|
| 132 |
+
|
| 133 |
### Frontend — Vercel
|
| 134 |
1. Update `API_URL` in `frontend/js/config.js` to your Render URL
|
| 135 |
2. Go to [vercel.com](https://vercel.com) → New Project → connect your repo
|
|
|
|
| 153 |
|
| 154 |
| Method | Path | Description |
|
| 155 |
|--------|------|-------------|
|
| 156 |
+
| GET | `/health` | Basic liveness check |
|
| 157 |
+
| GET | `/health/details` | Liveness + intent classifier readiness |
|
| 158 |
| POST | `/auth/verify` | Verify daily guest password |
|
| 159 |
| POST | `/auth/admin` | Verify master key, get today's code |
|
| 160 |
| GET | `/corpus/files` | List all ingested files |
|
|
|
|
| 177 |
- **Add JWT auth**: Swap `verify_password()` in `api/auth.py` for JWT issuance. `services/auth.py` is unchanged.
|
| 178 |
- **Add new corpus operations**: Add a route to `api/corpus.py`. `pipeline.py` is unchanged.
|
| 179 |
- **Add new document types**: The classifier learns them automatically. Run warmup to reinforce centroids.
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
docker stop $(docker ps -a -q)
|
| 183 |
+
docker build -t nexus-rag .
|
| 184 |
+
docker run -p 8000:7860 --env-file .env nexus-rag
|
|
@@ -9,7 +9,6 @@ triggering heavy downloads or training loops on the first request.
|
|
| 9 |
|
| 10 |
import os
|
| 11 |
import logging
|
| 12 |
-
from pathlib import Path
|
| 13 |
|
| 14 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 15 |
log = logging.getLogger("nexus.build_assets")
|
|
@@ -17,31 +16,41 @@ log = logging.getLogger("nexus.build_assets")
|
|
| 17 |
def build_assets():
|
| 18 |
log.info("Starting ML asset pre-build...")
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
_ = SentenceTransformer("all-MiniLM-L6-v2")
|
| 25 |
-
log.info("Embedding model downloaded successfully.")
|
| 26 |
-
except Exception as e:
|
| 27 |
-
log.error("Failed to download embedding model: %s", e)
|
| 28 |
|
| 29 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# This creates backend/core/intent_model.pkl
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
train_initial_model()
|
| 42 |
-
log.info("Intent Classifier trained and saved locally.")
|
| 43 |
-
except Exception as e:
|
| 44 |
-
log.error("Failed to train Intent Classifier: %s", e)
|
| 45 |
|
| 46 |
log.info("ML asset pre-build complete.")
|
| 47 |
|
|
|
|
| 9 |
|
| 10 |
import os
|
| 11 |
import logging
|
|
|
|
| 12 |
|
| 13 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
| 14 |
log = logging.getLogger("nexus.build_assets")
|
|
|
|
| 16 |
def build_assets():
|
| 17 |
log.info("Starting ML asset pre-build...")
|
| 18 |
|
| 19 |
+
# Build-time safety:
|
| 20 |
+
# Prevent intent_classifier singleton from starting background bootstrap
|
| 21 |
+
# threads while we run deterministic synchronous training below.
|
| 22 |
+
os.environ["NEXUS_DISABLE_INTENT_BOOTSTRAP"] = "true"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
# In CI/build environments we may not have real Supabase credentials.
|
| 25 |
+
# Keep train/upload logic local-only in that case.
|
| 26 |
+
os.environ.setdefault("SUPABASE_URL", "")
|
| 27 |
+
os.environ.setdefault("SUPABASE_SERVICE_KEY", "")
|
| 28 |
+
|
| 29 |
+
mode = os.getenv("NEXUS_BUILD_ASSETS_MODE", "light").strip().lower()
|
| 30 |
+
log.info("Build asset mode: %s", mode)
|
| 31 |
+
|
| 32 |
+
# 1. Optional pre-download sentence-transformers (used by Intent Classifier)
|
| 33 |
+
if mode in {"light", "full"}:
|
| 34 |
+
log.info("Downloading all-MiniLM-L6-v2 embedding model...")
|
| 35 |
+
try:
|
| 36 |
+
from sentence_transformers import SentenceTransformer
|
| 37 |
+
_ = SentenceTransformer("all-MiniLM-L6-v2")
|
| 38 |
+
log.info("Embedding model downloaded successfully.")
|
| 39 |
+
except Exception as e:
|
| 40 |
+
log.error("Failed to download embedding model: %s", e)
|
| 41 |
+
|
| 42 |
+
# 2. Optional pre-train the Intent Classifier (heavy step)
|
| 43 |
# This creates backend/core/intent_model.pkl
|
| 44 |
+
if mode == "full":
|
| 45 |
+
log.info("Training initial Intent Classifier...")
|
| 46 |
+
try:
|
| 47 |
+
from backend.core.intent_classifier import train_initial_model
|
| 48 |
+
train_initial_model()
|
| 49 |
+
log.info("Intent Classifier trained and saved locally.")
|
| 50 |
+
except Exception as e:
|
| 51 |
+
log.error("Failed to train Intent Classifier: %s", e)
|
| 52 |
+
else:
|
| 53 |
+
log.info("Skipping intent training in '%s' mode (runtime bootstrap handles it if enabled).", mode)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
log.info("ML asset pre-build complete.")
|
| 56 |
|
|
@@ -105,10 +105,14 @@ INTENT_MODEL_PATH = "backend/core/intent_model.pkl"
|
|
| 105 |
INTENT_FEEDBACK_PATH = "backend/core/intent_feedback.jsonl"
|
| 106 |
INTENT_RETRAIN_EVERY = 25
|
| 107 |
INTENT_MIN_CONFIDENCE = 0.65
|
|
|
|
| 108 |
|
| 109 |
# ==================== UPLOAD BATCHING ====================
|
| 110 |
-
UPLOAD_BATCH_SIZE =
|
| 111 |
-
UPLOAD_BATCH_SLEEP_S =
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
# ==================== RETRIEVAL ====================
|
| 114 |
CHAT_MEMORY_TURNS = 3
|
|
|
|
| 105 |
INTENT_FEEDBACK_PATH = "backend/core/intent_feedback.jsonl"
|
| 106 |
INTENT_RETRAIN_EVERY = 25
|
| 107 |
INTENT_MIN_CONFIDENCE = 0.65
|
| 108 |
+
INTENT_BOOTSTRAP_ON_STARTUP = os.getenv("INTENT_BOOTSTRAP_ON_STARTUP", "false").lower() in {"1", "true", "yes"}
|
| 109 |
|
| 110 |
# ==================== UPLOAD BATCHING ====================
|
| 111 |
+
UPLOAD_BATCH_SIZE = 5
|
| 112 |
+
UPLOAD_BATCH_SLEEP_S = 5
|
| 113 |
+
UPLOAD_RETRY_MAX_ATTEMPTS = int(os.getenv("UPLOAD_RETRY_MAX_ATTEMPTS", "4"))
|
| 114 |
+
UPLOAD_RETRY_BASE_SLEEP_S = float(os.getenv("UPLOAD_RETRY_BASE_SLEEP_S", "2"))
|
| 115 |
+
UPLOAD_RETRY_MAX_SLEEP_S = float(os.getenv("UPLOAD_RETRY_MAX_SLEEP_S", "20"))
|
| 116 |
|
| 117 |
# ==================== RETRIEVAL ====================
|
| 118 |
CHAT_MEMORY_TURNS = 3
|
|
@@ -33,6 +33,18 @@ from supabase.client import create_client
|
|
| 33 |
|
| 34 |
log = logging.getLogger("nexus.intent")
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
# ββ Lazy imports (heavy β only load once at first use) βββββββββββββββββββββββ
|
| 37 |
_embedder = None
|
| 38 |
_embedder_lock = threading.Lock()
|
|
@@ -228,7 +240,12 @@ def _build_features(query: str, has_category: bool, has_history: bool) -> np.nda
|
|
| 228 |
[384:392] β 8 structural context signals
|
| 229 |
"""
|
| 230 |
embedder = _get_embedder()
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
q = query.lower().strip()
|
| 234 |
words = q.split()
|
|
@@ -296,7 +313,9 @@ class IntentClassifier:
|
|
| 296 |
else:
|
| 297 |
log.info("No intent model found β will use fallback until trained.")
|
| 298 |
self._ready = False
|
| 299 |
-
if
|
|
|
|
|
|
|
| 300 |
self._bootstrap_started = True
|
| 301 |
threading.Thread(target=train_initial_model, daemon=True).start()
|
| 302 |
except Exception as e:
|
|
@@ -305,6 +324,9 @@ class IntentClassifier:
|
|
| 305 |
|
| 306 |
def _upload_model_to_supabase(self, model_path: str):
|
| 307 |
from backend.core import config
|
|
|
|
|
|
|
|
|
|
| 308 |
try:
|
| 309 |
with open(model_path, "rb") as f:
|
| 310 |
data = f.read()
|
|
@@ -320,6 +342,8 @@ class IntentClassifier:
|
|
| 320 |
|
| 321 |
def _download_model_from_supabase(self, model_path: str) -> bool:
|
| 322 |
from backend.core import config
|
|
|
|
|
|
|
| 323 |
try:
|
| 324 |
sb = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 325 |
data = sb.storage.from_("rag-models").download("intent_model.pkl")
|
|
@@ -382,18 +406,19 @@ class IntentClassifier:
|
|
| 382 |
"user_id": user_id,
|
| 383 |
}
|
| 384 |
supabase_ok = False
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
"
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
|
|
|
| 397 |
|
| 398 |
if not supabase_ok:
|
| 399 |
try:
|
|
@@ -407,6 +432,8 @@ class IntentClassifier:
|
|
| 407 |
def _maybe_retrain(self):
|
| 408 |
from backend.core import config
|
| 409 |
try:
|
|
|
|
|
|
|
| 410 |
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 411 |
result = supabase.table("intent_feedback").select("id", count="exact").execute()
|
| 412 |
n = result.count or 0
|
|
@@ -429,14 +456,15 @@ class IntentClassifier:
|
|
| 429 |
# Load feedback
|
| 430 |
feedback = []
|
| 431 |
loaded_from_supabase = False
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
|
|
|
| 440 |
|
| 441 |
if not loaded_from_supabase:
|
| 442 |
try:
|
|
@@ -489,6 +517,18 @@ class IntentClassifier:
|
|
| 489 |
return {"needs_clarification": True, "confidence": 0.75}
|
| 490 |
return {"needs_clarification": False, "confidence": 0.85}
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
# =========================================================================== #
|
| 494 |
# TRAINING #
|
|
@@ -570,18 +610,19 @@ def train_initial_model():
|
|
| 570 |
joblib.dump({"classifier": clf, "version": 1, "n_examples": n}, config.INTENT_MODEL_PATH)
|
| 571 |
log.info("Initial model saved to %s (%d examples).", config.INTENT_MODEL_PATH, n)
|
| 572 |
# Upload to Supabase directly without going through singleton
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
|
|
|
| 585 |
|
| 586 |
# Force singleton to reload
|
| 587 |
inst = IntentClassifier()
|
|
@@ -590,4 +631,8 @@ def train_initial_model():
|
|
| 590 |
|
| 591 |
|
| 592 |
# Module-level singleton β imported by pipeline.py
|
| 593 |
-
intent_classifier = IntentClassifier()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
log = logging.getLogger("nexus.intent")
|
| 35 |
|
| 36 |
+
|
| 37 |
+
def _bootstrap_disabled() -> bool:
|
| 38 |
+
from backend.core import config
|
| 39 |
+
force_disabled = os.getenv("NEXUS_DISABLE_INTENT_BOOTSTRAP", "false").lower() in {"1", "true", "yes"}
|
| 40 |
+
# Default production-safe posture: do not train on import unless explicitly enabled.
|
| 41 |
+
return force_disabled or (not config.INTENT_BOOTSTRAP_ON_STARTUP)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _supabase_config_available() -> bool:
|
| 45 |
+
from backend.core import config
|
| 46 |
+
return bool(config.SUPABASE_URL and config.SUPABASE_SERVICE_KEY)
|
| 47 |
+
|
| 48 |
# ββ Lazy imports (heavy β only load once at first use) βββββββββββββββββββββββ
|
| 49 |
_embedder = None
|
| 50 |
_embedder_lock = threading.Lock()
|
|
|
|
| 240 |
[384:392] β 8 structural context signals
|
| 241 |
"""
|
| 242 |
embedder = _get_embedder()
|
| 243 |
+
# Disable tqdm bars to keep Docker/build logs stable and readable.
|
| 244 |
+
query_embedding = embedder.encode(
|
| 245 |
+
query,
|
| 246 |
+
normalize_embeddings=True,
|
| 247 |
+
show_progress_bar=False,
|
| 248 |
+
)
|
| 249 |
|
| 250 |
q = query.lower().strip()
|
| 251 |
words = q.split()
|
|
|
|
| 313 |
else:
|
| 314 |
log.info("No intent model found β will use fallback until trained.")
|
| 315 |
self._ready = False
|
| 316 |
+
if _bootstrap_disabled():
|
| 317 |
+
log.info("Intent bootstrap disabled by NEXUS_DISABLE_INTENT_BOOTSTRAP.")
|
| 318 |
+
elif not getattr(self, "_bootstrap_started", False):
|
| 319 |
self._bootstrap_started = True
|
| 320 |
threading.Thread(target=train_initial_model, daemon=True).start()
|
| 321 |
except Exception as e:
|
|
|
|
| 324 |
|
| 325 |
def _upload_model_to_supabase(self, model_path: str):
|
| 326 |
from backend.core import config
|
| 327 |
+
if not _supabase_config_available():
|
| 328 |
+
log.info("Skipping model upload: Supabase config unavailable.")
|
| 329 |
+
return
|
| 330 |
try:
|
| 331 |
with open(model_path, "rb") as f:
|
| 332 |
data = f.read()
|
|
|
|
| 342 |
|
| 343 |
def _download_model_from_supabase(self, model_path: str) -> bool:
|
| 344 |
from backend.core import config
|
| 345 |
+
if not _supabase_config_available():
|
| 346 |
+
return False
|
| 347 |
try:
|
| 348 |
sb = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 349 |
data = sb.storage.from_("rag-models").download("intent_model.pkl")
|
|
|
|
| 406 |
"user_id": user_id,
|
| 407 |
}
|
| 408 |
supabase_ok = False
|
| 409 |
+
if _supabase_config_available():
|
| 410 |
+
try:
|
| 411 |
+
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 412 |
+
supabase.table("intent_feedback").insert({
|
| 413 |
+
"user_id": user_id,
|
| 414 |
+
"query": query,
|
| 415 |
+
"has_category": has_category,
|
| 416 |
+
"has_history": has_history,
|
| 417 |
+
"label": int(was_needed),
|
| 418 |
+
}).execute()
|
| 419 |
+
supabase_ok = True
|
| 420 |
+
except Exception as e:
|
| 421 |
+
log.warning("Supabase intent_feedback insert failed: %s", e)
|
| 422 |
|
| 423 |
if not supabase_ok:
|
| 424 |
try:
|
|
|
|
| 432 |
def _maybe_retrain(self):
|
| 433 |
from backend.core import config
|
| 434 |
try:
|
| 435 |
+
if not _supabase_config_available():
|
| 436 |
+
raise RuntimeError("Supabase config unavailable")
|
| 437 |
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 438 |
result = supabase.table("intent_feedback").select("id", count="exact").execute()
|
| 439 |
n = result.count or 0
|
|
|
|
| 456 |
# Load feedback
|
| 457 |
feedback = []
|
| 458 |
loaded_from_supabase = False
|
| 459 |
+
if _supabase_config_available():
|
| 460 |
+
try:
|
| 461 |
+
supabase = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 462 |
+
rows = supabase.table("intent_feedback").select("query,has_category,has_history,label").execute()
|
| 463 |
+
for r in (rows.data or []):
|
| 464 |
+
feedback.append((r["query"], r["has_category"], r["has_history"], r["label"]))
|
| 465 |
+
loaded_from_supabase = True
|
| 466 |
+
except Exception as e:
|
| 467 |
+
log.info("Supabase feedback read not available yet: %s", e)
|
| 468 |
|
| 469 |
if not loaded_from_supabase:
|
| 470 |
try:
|
|
|
|
| 517 |
return {"needs_clarification": True, "confidence": 0.75}
|
| 518 |
return {"needs_clarification": False, "confidence": 0.85}
|
| 519 |
|
| 520 |
+
def status(self) -> dict:
|
| 521 |
+
"""
|
| 522 |
+
Lightweight runtime status for health endpoints/observability.
|
| 523 |
+
"""
|
| 524 |
+
from backend.core import config
|
| 525 |
+
return {
|
| 526 |
+
"ready": bool(self._ready and self._clf is not None),
|
| 527 |
+
"model_path": config.INTENT_MODEL_PATH,
|
| 528 |
+
"model_exists": Path(config.INTENT_MODEL_PATH).exists(),
|
| 529 |
+
"bootstrap_enabled": (not _bootstrap_disabled()),
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
|
| 533 |
# =========================================================================== #
|
| 534 |
# TRAINING #
|
|
|
|
| 610 |
joblib.dump({"classifier": clf, "version": 1, "n_examples": n}, config.INTENT_MODEL_PATH)
|
| 611 |
log.info("Initial model saved to %s (%d examples).", config.INTENT_MODEL_PATH, n)
|
| 612 |
# Upload to Supabase directly without going through singleton
|
| 613 |
+
if _supabase_config_available():
|
| 614 |
+
try:
|
| 615 |
+
with open(config.INTENT_MODEL_PATH, "rb") as f:
|
| 616 |
+
model_bytes = f.read()
|
| 617 |
+
sb = create_client(config.SUPABASE_URL, config.SUPABASE_SERVICE_KEY)
|
| 618 |
+
sb.storage.from_("rag-models").upload(
|
| 619 |
+
path="intent_model.pkl",
|
| 620 |
+
file=model_bytes,
|
| 621 |
+
file_options={"content-type": "application/octet-stream", "upsert": "true"},
|
| 622 |
+
)
|
| 623 |
+
log.info("Initial model uploaded to Supabase Storage.")
|
| 624 |
+
except Exception as e:
|
| 625 |
+
log.warning("Could not upload initial model: %s", e)
|
| 626 |
|
| 627 |
# Force singleton to reload
|
| 628 |
inst = IntentClassifier()
|
|
|
|
| 631 |
|
| 632 |
|
| 633 |
# Module-level singleton β imported by pipeline.py
|
| 634 |
+
intent_classifier = IntentClassifier()
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
def get_intent_classifier_status() -> dict:
|
| 638 |
+
return intent_classifier.status()
|
|
@@ -233,6 +233,7 @@ def get_cached_embedding(text: str) -> list:
|
|
| 233 |
# Schema: supabase/migrations/0003_rerank_feedback.sql #
|
| 234 |
# =========================================================================== #
|
| 235 |
|
|
|
|
| 236 |
def _log_rerank_feedback(
|
| 237 |
query: str,
|
| 238 |
all_candidates: list,
|
|
@@ -244,6 +245,7 @@ def _log_rerank_feedback(
|
|
| 244 |
Write rerank results to rerank_feedback table via a daemon thread.
|
| 245 |
Completely non-blocking -- exceptions are swallowed so query never fails.
|
| 246 |
"""
|
|
|
|
| 247 |
def _write():
|
| 248 |
try:
|
| 249 |
sb = _build_service_supabase_client()
|
|
@@ -265,25 +267,31 @@ def _log_rerank_feedback(
|
|
| 265 |
doc_type = chunk.get("metadata", {}).get("document_type")
|
| 266 |
chunk_id_raw = chunk.get("id")
|
| 267 |
try:
|
| 268 |
-
chunk_uuid =
|
|
|
|
|
|
|
| 269 |
except Exception:
|
| 270 |
chunk_uuid = None
|
| 271 |
|
| 272 |
-
rows.append(
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
|
|
|
|
|
|
| 283 |
|
| 284 |
if rows:
|
| 285 |
for start in range(0, len(rows), 50):
|
| 286 |
-
sb.table("rerank_feedback").insert(
|
|
|
|
|
|
|
| 287 |
log.debug("Logged %d rerank feedback rows.", len(rows))
|
| 288 |
except Exception as exc:
|
| 289 |
log.debug("rerank_feedback logging skipped: %s", exc)
|
|
@@ -296,7 +304,6 @@ def _log_rerank_feedback(
|
|
| 296 |
# =========================================================================== #
|
| 297 |
|
| 298 |
|
| 299 |
-
|
| 300 |
def get_existing_categories(access_token: str = None) -> List[str]:
|
| 301 |
"""Server-side DISTINCT via get_document_types() SQL function."""
|
| 302 |
supabase = _build_supabase_client(access_token)
|
|
@@ -690,7 +697,9 @@ def process_chunks(
|
|
| 690 |
return docs, ids
|
| 691 |
|
| 692 |
|
| 693 |
-
def build_raptor_tree(
|
|
|
|
|
|
|
| 694 |
"""
|
| 695 |
RAPTOR implementation: recursively clusters documents and generates
|
| 696 |
parent summaries until we reach a single root node.
|
|
@@ -712,7 +721,7 @@ def build_raptor_tree(leaf_docs: List[Document], leaf_ids: List[str]) -> tuple[L
|
|
| 712 |
CLUSTER_SIZE = 5
|
| 713 |
|
| 714 |
llm = _build_llm(needs_vision=False, use_ingestion=True)
|
| 715 |
-
|
| 716 |
# Prompt for collapsing children into a parent concept node
|
| 717 |
cluster_prompt = (
|
| 718 |
"SYSTEM ROLE: You are an expert document synthesist building a hierarchical reasoning tree.\n"
|
|
@@ -727,13 +736,15 @@ def build_raptor_tree(leaf_docs: List[Document], leaf_ids: List[str]) -> tuple[L
|
|
| 727 |
)
|
| 728 |
|
| 729 |
while len(current_level_docs) > 1:
|
| 730 |
-
print(
|
|
|
|
|
|
|
| 731 |
next_level_docs = []
|
| 732 |
|
| 733 |
# Iterate in clusters
|
| 734 |
for i in range(0, len(current_level_docs), CLUSTER_SIZE):
|
| 735 |
cluster = current_level_docs[i : i + CLUSTER_SIZE]
|
| 736 |
-
|
| 737 |
# Combine the underlying texts (use the existing summary if available, else raw text)
|
| 738 |
sections_text = ""
|
| 739 |
for j, child in enumerate(cluster):
|
|
@@ -742,7 +753,7 @@ def build_raptor_tree(leaf_docs: List[Document], leaf_ids: List[str]) -> tuple[L
|
|
| 742 |
child_text = child.metadata.get("summary", child.page_content)
|
| 743 |
if not child_text or child_text == "No summary available.":
|
| 744 |
child_text = child.page_content
|
| 745 |
-
sections_text += f"--- SECTION {j+1} ---\n{child_text}\n\n"
|
| 746 |
|
| 747 |
# Generate the parent summary
|
| 748 |
prompt = cluster_prompt.format(count=len(cluster), sections=sections_text)
|
|
@@ -750,19 +761,26 @@ def build_raptor_tree(leaf_docs: List[Document], leaf_ids: List[str]) -> tuple[L
|
|
| 750 |
response = llm.invoke([HumanMessage(content=prompt)])
|
| 751 |
parent_text = response.content
|
| 752 |
except Exception as e:
|
| 753 |
-
log.warning(
|
|
|
|
|
|
|
| 754 |
# Fallback: just concatenate
|
| 755 |
-
parent_text = "Merged Content:\n" + "\n".join(
|
|
|
|
|
|
|
| 756 |
|
| 757 |
# Generate deterministic ID for the parent
|
| 758 |
import hashlib
|
|
|
|
| 759 |
parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
|
| 760 |
-
parent_id = str(
|
|
|
|
|
|
|
| 761 |
|
| 762 |
# Create the parent document
|
| 763 |
# Inherit metadata from the first child (source array, file hash, document type)
|
| 764 |
base_meta = cluster[0].metadata
|
| 765 |
-
|
| 766 |
# Gather all unique page numbers from children
|
| 767 |
all_pages = set()
|
| 768 |
for c in cluster:
|
|
@@ -779,10 +797,10 @@ def build_raptor_tree(leaf_docs: List[Document], leaf_ids: List[str]) -> tuple[L
|
|
| 779 |
"node_type": "summary",
|
| 780 |
"node_level": current_level,
|
| 781 |
"node_id": parent_id,
|
| 782 |
-
"parent_node_id": None,
|
| 783 |
"page_numbers": sorted(list(all_pages)),
|
| 784 |
-
"children_count": len(cluster)
|
| 785 |
-
}
|
| 786 |
)
|
| 787 |
|
| 788 |
# Update children to point to this parent
|
|
@@ -796,7 +814,9 @@ def build_raptor_tree(leaf_docs: List[Document], leaf_ids: List[str]) -> tuple[L
|
|
| 796 |
current_level_docs = next_level_docs
|
| 797 |
current_level += 1
|
| 798 |
|
| 799 |
-
print(
|
|
|
|
|
|
|
| 800 |
return all_docs, all_ids
|
| 801 |
|
| 802 |
|
|
@@ -902,9 +922,9 @@ def _apply_category_override(
|
|
| 902 |
).execute()
|
| 903 |
|
| 904 |
# Update ingested_files registry
|
| 905 |
-
supabase.table("ingested_files").update(
|
| 906 |
-
"
|
| 907 |
-
).execute()
|
| 908 |
|
| 909 |
# Refresh materialized view so sidebar filter updates immediately
|
| 910 |
try:
|
|
@@ -963,11 +983,41 @@ def delete_document(file_hash: str, access_token: str = None) -> None:
|
|
| 963 |
except Exception:
|
| 964 |
pass
|
| 965 |
|
|
|
|
| 966 |
def upload_to_supabase(
|
| 967 |
documents: List[Document],
|
| 968 |
ids: List[str],
|
| 969 |
access_token: str = None,
|
| 970 |
) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 971 |
BATCH_SIZE = config.UPLOAD_BATCH_SIZE
|
| 972 |
BATCH_SLEEP = config.UPLOAD_BATCH_SLEEP_S
|
| 973 |
|
|
@@ -995,25 +1045,77 @@ def upload_to_supabase(
|
|
| 995 |
|
| 996 |
log.info("Batch %d/%d (%d docs)...", batch_num, total_batches, len(batch_docs))
|
| 997 |
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1017 |
|
| 1018 |
if start + BATCH_SIZE < len(documents):
|
| 1019 |
time.sleep(BATCH_SLEEP)
|
|
@@ -1063,7 +1165,7 @@ def run_ingestion(
|
|
| 1063 |
if not force and is_file_already_ingested(file_hash, access_token=access_token):
|
| 1064 |
log.info("SKIPPING β already ingested.")
|
| 1065 |
return "already_ingested"
|
| 1066 |
-
|
| 1067 |
# NEW: Check if user has previously overridden the category for this file.
|
| 1068 |
# If so, skip the classifier and use their choice directly.
|
| 1069 |
forced_category = None
|
|
@@ -1553,7 +1655,7 @@ def retrieve_chunks(
|
|
| 1553 |
return [cache_doc]
|
| 1554 |
except Exception as e:
|
| 1555 |
log.warning("Cache check failed, proceeding normally: %s", e)
|
| 1556 |
-
|
| 1557 |
queries_to_run = generate_sub_queries(query)
|
| 1558 |
|
| 1559 |
dynamic_k = 10 if len(queries_to_run) > 1 else 5
|
|
@@ -1646,7 +1748,7 @@ def retrieve_chunks(
|
|
| 1646 |
base_threshold = 0.0001 # sigmoid of ms-marco logits is very small
|
| 1647 |
else:
|
| 1648 |
base_threshold = RELEVANCE_THRESHOLD # 0.35 for Cohere
|
| 1649 |
-
|
| 1650 |
effective_threshold = base_threshold
|
| 1651 |
if filter_dict or len(all_candidates) <= 10:
|
| 1652 |
if reranker == "cohere":
|
|
@@ -1813,7 +1915,9 @@ def retrieve_chunks(
|
|
| 1813 |
if budgeted:
|
| 1814 |
log.info(
|
| 1815 |
"Context budget: %d chars across %d/%d chunks.",
|
| 1816 |
-
total_chars,
|
|
|
|
|
|
|
| 1817 |
)
|
| 1818 |
retrieved = budgeted
|
| 1819 |
|
|
@@ -1878,7 +1982,7 @@ def generate_answer(
|
|
| 1878 |
for i, chunk in enumerate(chunks, 1):
|
| 1879 |
prompt += f"--- Source {i} ---\n"
|
| 1880 |
meta = chunk.metadata
|
| 1881 |
-
|
| 1882 |
# Determine if this is a raw chunk or a RAPTOR summary node
|
| 1883 |
node_type = meta.get("node_type", "leaf")
|
| 1884 |
node_level = meta.get("node_level", 0)
|
|
@@ -1887,7 +1991,7 @@ def generate_answer(
|
|
| 1887 |
# For summary nodes, page_content IS the summary. There is no raw original_content.
|
| 1888 |
prompt += f"[SYNTHESIZED CHAPTER SUMMARY - LEVEL {node_level}]\n"
|
| 1889 |
prompt += f"TEXT:\n{chunk.page_content}\n\n"
|
| 1890 |
-
original = {}
|
| 1891 |
else:
|
| 1892 |
# Traditional leaf chunk
|
| 1893 |
original = meta.get("original_content")
|
|
@@ -1896,7 +2000,7 @@ def generate_answer(
|
|
| 1896 |
original = json.loads(original)
|
| 1897 |
if isinstance(original, str):
|
| 1898 |
original = json.loads(original)
|
| 1899 |
-
except:
|
| 1900 |
original = {}
|
| 1901 |
elif not isinstance(original, dict):
|
| 1902 |
original = {}
|
|
@@ -1996,7 +2100,7 @@ async def generate_answer_stream(
|
|
| 1996 |
yield {"type": "done", "images": []}
|
| 1997 |
return
|
| 1998 |
# ββ Cache hit handler ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1999 |
-
|
| 2000 |
if len(chunks) == 1 and chunks[0].page_content == "__CACHE_HIT__":
|
| 2001 |
cached = chunks[0].metadata.get("__cache__", {})
|
| 2002 |
answer = cached.get("answer", "")
|
|
@@ -2007,7 +2111,7 @@ async def generate_answer_stream(
|
|
| 2007 |
await asyncio.sleep(0)
|
| 2008 |
yield {"type": "done", "sources": cached.get("sources", []), "images": []}
|
| 2009 |
return
|
| 2010 |
-
|
| 2011 |
# ββ TASK 3: Log RAGAS reward signal to evaluation_logs βββββββββββββββββββ
|
| 2012 |
# Runs fire-and-forget in an executor so it never blocks streaming.
|
| 2013 |
# Uses the Cohere relevance scores already embedded in chunk metadata.
|
|
@@ -2092,7 +2196,7 @@ async def generate_answer_stream(
|
|
| 2092 |
page_ref = pages[0] if pages else "unknown"
|
| 2093 |
loc_key = f"{file_hash}_p{page_ref}"
|
| 2094 |
chunk_relevance = meta.get("relevance_score", 0)
|
| 2095 |
-
|
| 2096 |
if (
|
| 2097 |
chunk_relevance >= config.IMAGE_RELEVANCE_THRESHOLD
|
| 2098 |
): # only show images from highly relevant chunks
|
|
@@ -2154,32 +2258,40 @@ async def generate_answer_stream(
|
|
| 2154 |
yield {"type": "token", "content": suffix}
|
| 2155 |
full_answer += suffix
|
| 2156 |
# ββ Store in query cache βββββββββββββββββββββββββββββββββββββββββββββ
|
| 2157 |
-
|
| 2158 |
try:
|
| 2159 |
if access_token:
|
| 2160 |
from backend.core.auth_utils import extract_jwt_sub
|
| 2161 |
from backend.core.cache_manager import store_cached_answer
|
|
|
|
| 2162 |
_uid = extract_jwt_sub(access_token)
|
| 2163 |
_query_vec = get_cached_embedding(query)
|
| 2164 |
_chunk_ids = [c.metadata.get("id", "") for c in chunks]
|
| 2165 |
-
_doc_types = list(
|
| 2166 |
-
|
| 2167 |
-
|
|
|
|
|
|
|
|
|
|
| 2168 |
_sources = []
|
| 2169 |
for c in chunks:
|
| 2170 |
meta = c.metadata
|
| 2171 |
orig = meta.get("original_content", "{}")
|
| 2172 |
if isinstance(orig, str):
|
| 2173 |
-
try:
|
| 2174 |
-
|
| 2175 |
-
|
| 2176 |
-
|
| 2177 |
-
|
| 2178 |
-
|
| 2179 |
-
|
| 2180 |
-
|
| 2181 |
-
|
| 2182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2183 |
store_cached_answer(
|
| 2184 |
user_id=_uid,
|
| 2185 |
query_embedding=_query_vec,
|
|
@@ -2193,7 +2305,6 @@ async def generate_answer_stream(
|
|
| 2193 |
except Exception:
|
| 2194 |
pass # cache store never blocks response
|
| 2195 |
|
| 2196 |
-
|
| 2197 |
# Save to memory async
|
| 2198 |
try:
|
| 2199 |
loop = asyncio.get_event_loop()
|
|
|
|
| 233 |
# Schema: supabase/migrations/0003_rerank_feedback.sql #
|
| 234 |
# =========================================================================== #
|
| 235 |
|
| 236 |
+
|
| 237 |
def _log_rerank_feedback(
|
| 238 |
query: str,
|
| 239 |
all_candidates: list,
|
|
|
|
| 245 |
Write rerank results to rerank_feedback table via a daemon thread.
|
| 246 |
Completely non-blocking -- exceptions are swallowed so query never fails.
|
| 247 |
"""
|
| 248 |
+
|
| 249 |
def _write():
|
| 250 |
try:
|
| 251 |
sb = _build_service_supabase_client()
|
|
|
|
| 267 |
doc_type = chunk.get("metadata", {}).get("document_type")
|
| 268 |
chunk_id_raw = chunk.get("id")
|
| 269 |
try:
|
| 270 |
+
chunk_uuid = (
|
| 271 |
+
str(uuid.UUID(str(chunk_id_raw))) if chunk_id_raw else None
|
| 272 |
+
)
|
| 273 |
except Exception:
|
| 274 |
chunk_uuid = None
|
| 275 |
|
| 276 |
+
rows.append(
|
| 277 |
+
{
|
| 278 |
+
"user_id": user_id,
|
| 279 |
+
"query_hash": q_hash,
|
| 280 |
+
"query_text": query, # Added for local distillation
|
| 281 |
+
"chunk_id": chunk_uuid,
|
| 282 |
+
"chunk_hash": c_hash,
|
| 283 |
+
"chunk_text": content[:500], # Added (truncated to save space)
|
| 284 |
+
"document_type": doc_type,
|
| 285 |
+
"cohere_score": float(score),
|
| 286 |
+
"was_selected": c_hash in selected_hashes,
|
| 287 |
+
}
|
| 288 |
+
)
|
| 289 |
|
| 290 |
if rows:
|
| 291 |
for start in range(0, len(rows), 50):
|
| 292 |
+
sb.table("rerank_feedback").insert(
|
| 293 |
+
rows[start : start + 50]
|
| 294 |
+
).execute()
|
| 295 |
log.debug("Logged %d rerank feedback rows.", len(rows))
|
| 296 |
except Exception as exc:
|
| 297 |
log.debug("rerank_feedback logging skipped: %s", exc)
|
|
|
|
| 304 |
# =========================================================================== #
|
| 305 |
|
| 306 |
|
|
|
|
| 307 |
def get_existing_categories(access_token: str = None) -> List[str]:
|
| 308 |
"""Server-side DISTINCT via get_document_types() SQL function."""
|
| 309 |
supabase = _build_supabase_client(access_token)
|
|
|
|
| 697 |
return docs, ids
|
| 698 |
|
| 699 |
|
| 700 |
+
def build_raptor_tree(
|
| 701 |
+
leaf_docs: List[Document], leaf_ids: List[str]
|
| 702 |
+
) -> tuple[List[Document], List[str]]:
|
| 703 |
"""
|
| 704 |
RAPTOR implementation: recursively clusters documents and generates
|
| 705 |
parent summaries until we reach a single root node.
|
|
|
|
| 721 |
CLUSTER_SIZE = 5
|
| 722 |
|
| 723 |
llm = _build_llm(needs_vision=False, use_ingestion=True)
|
| 724 |
+
|
| 725 |
# Prompt for collapsing children into a parent concept node
|
| 726 |
cluster_prompt = (
|
| 727 |
"SYSTEM ROLE: You are an expert document synthesist building a hierarchical reasoning tree.\n"
|
|
|
|
| 736 |
)
|
| 737 |
|
| 738 |
while len(current_level_docs) > 1:
|
| 739 |
+
print(
|
| 740 |
+
f" [RAPTOR] Building Level {current_level} (from {len(current_level_docs)} children)..."
|
| 741 |
+
)
|
| 742 |
next_level_docs = []
|
| 743 |
|
| 744 |
# Iterate in clusters
|
| 745 |
for i in range(0, len(current_level_docs), CLUSTER_SIZE):
|
| 746 |
cluster = current_level_docs[i : i + CLUSTER_SIZE]
|
| 747 |
+
|
| 748 |
# Combine the underlying texts (use the existing summary if available, else raw text)
|
| 749 |
sections_text = ""
|
| 750 |
for j, child in enumerate(cluster):
|
|
|
|
| 753 |
child_text = child.metadata.get("summary", child.page_content)
|
| 754 |
if not child_text or child_text == "No summary available.":
|
| 755 |
child_text = child.page_content
|
| 756 |
+
sections_text += f"--- SECTION {j + 1} ---\n{child_text}\n\n"
|
| 757 |
|
| 758 |
# Generate the parent summary
|
| 759 |
prompt = cluster_prompt.format(count=len(cluster), sections=sections_text)
|
|
|
|
| 761 |
response = llm.invoke([HumanMessage(content=prompt)])
|
| 762 |
parent_text = response.content
|
| 763 |
except Exception as e:
|
| 764 |
+
log.warning(
|
| 765 |
+
f"RAPTOR summarization failed at level {current_level}, segment {i}: {e}"
|
| 766 |
+
)
|
| 767 |
# Fallback: just concatenate
|
| 768 |
+
parent_text = "Merged Content:\n" + "\n".join(
|
| 769 |
+
[c.page_content[:500] for c in cluster]
|
| 770 |
+
)
|
| 771 |
|
| 772 |
# Generate deterministic ID for the parent
|
| 773 |
import hashlib
|
| 774 |
+
|
| 775 |
parent_hash = hashlib.md5(parent_text.encode()).hexdigest()
|
| 776 |
+
parent_id = str(
|
| 777 |
+
uuid.uuid5(uuid.NAMESPACE_DNS, f"raptor_{current_level}_{parent_hash}")
|
| 778 |
+
)
|
| 779 |
|
| 780 |
# Create the parent document
|
| 781 |
# Inherit metadata from the first child (source array, file hash, document type)
|
| 782 |
base_meta = cluster[0].metadata
|
| 783 |
+
|
| 784 |
# Gather all unique page numbers from children
|
| 785 |
all_pages = set()
|
| 786 |
for c in cluster:
|
|
|
|
| 797 |
"node_type": "summary",
|
| 798 |
"node_level": current_level,
|
| 799 |
"node_id": parent_id,
|
| 800 |
+
"parent_node_id": None, # Will be set by the NEXT level up
|
| 801 |
"page_numbers": sorted(list(all_pages)),
|
| 802 |
+
"children_count": len(cluster),
|
| 803 |
+
},
|
| 804 |
)
|
| 805 |
|
| 806 |
# Update children to point to this parent
|
|
|
|
| 814 |
current_level_docs = next_level_docs
|
| 815 |
current_level += 1
|
| 816 |
|
| 817 |
+
print(
|
| 818 |
+
f" [RAPTOR] Tree built. Total nodes: {len(all_docs)} (Leaves: {len(leaf_docs)}, Summaries: {len(all_docs) - len(leaf_docs)})"
|
| 819 |
+
)
|
| 820 |
return all_docs, all_ids
|
| 821 |
|
| 822 |
|
|
|
|
| 922 |
).execute()
|
| 923 |
|
| 924 |
# Update ingested_files registry
|
| 925 |
+
supabase.table("ingested_files").update(
|
| 926 |
+
{"document_type": new_category, "user_overridden": True}
|
| 927 |
+
).eq("file_hash", file_hash).execute()
|
| 928 |
|
| 929 |
# Refresh materialized view so sidebar filter updates immediately
|
| 930 |
try:
|
|
|
|
| 983 |
except Exception:
|
| 984 |
pass
|
| 985 |
|
| 986 |
+
|
| 987 |
def upload_to_supabase(
|
| 988 |
documents: List[Document],
|
| 989 |
ids: List[str],
|
| 990 |
access_token: str = None,
|
| 991 |
) -> None:
|
| 992 |
+
def _log_ingestion_retry_event(
|
| 993 |
+
*,
|
| 994 |
+
user_id: str,
|
| 995 |
+
batch_num: int,
|
| 996 |
+
total_batches: int,
|
| 997 |
+
attempt: int,
|
| 998 |
+
event_type: str,
|
| 999 |
+
message: str = "",
|
| 1000 |
+
sleep_s: float = 0.0,
|
| 1001 |
+
) -> None:
|
| 1002 |
+
"""
|
| 1003 |
+
Best-effort telemetry for ingestion retry behavior.
|
| 1004 |
+
Table: public.ingestion_retry_logs
|
| 1005 |
+
"""
|
| 1006 |
+
try:
|
| 1007 |
+
_build_service_supabase_client().table("ingestion_retry_logs").insert(
|
| 1008 |
+
{
|
| 1009 |
+
"user_id": user_id,
|
| 1010 |
+
"batch_num": batch_num,
|
| 1011 |
+
"total_batches": total_batches,
|
| 1012 |
+
"attempt": attempt,
|
| 1013 |
+
"event_type": event_type,
|
| 1014 |
+
"message": message[:500],
|
| 1015 |
+
"sleep_s": sleep_s,
|
| 1016 |
+
}
|
| 1017 |
+
).execute()
|
| 1018 |
+
except Exception:
|
| 1019 |
+
pass
|
| 1020 |
+
|
| 1021 |
BATCH_SIZE = config.UPLOAD_BATCH_SIZE
|
| 1022 |
BATCH_SLEEP = config.UPLOAD_BATCH_SLEEP_S
|
| 1023 |
|
|
|
|
| 1045 |
|
| 1046 |
log.info("Batch %d/%d (%d docs)...", batch_num, total_batches, len(batch_docs))
|
| 1047 |
|
| 1048 |
+
max_attempts = max(1, int(config.UPLOAD_RETRY_MAX_ATTEMPTS))
|
| 1049 |
+
base_sleep = float(config.UPLOAD_RETRY_BASE_SLEEP_S)
|
| 1050 |
+
max_sleep = float(config.UPLOAD_RETRY_MAX_SLEEP_S)
|
| 1051 |
+
attempt = 0
|
| 1052 |
+
while True:
|
| 1053 |
+
attempt += 1
|
| 1054 |
+
try:
|
| 1055 |
+
# Embed the batch
|
| 1056 |
+
texts = [doc.page_content for doc in batch_docs]
|
| 1057 |
+
vectors = embedder.embed_documents(texts)
|
| 1058 |
+
|
| 1059 |
+
# Insert via RPC β user_id is explicit, not from metadata
|
| 1060 |
+
for doc, doc_id, vector in zip(batch_docs, batch_ids, vectors):
|
| 1061 |
+
sb.rpc(
|
| 1062 |
+
"insert_document_chunk",
|
| 1063 |
+
{
|
| 1064 |
+
"p_id": doc_id,
|
| 1065 |
+
"p_content": doc.page_content,
|
| 1066 |
+
"p_metadata": doc.metadata,
|
| 1067 |
+
"p_embedding": vector,
|
| 1068 |
+
"p_user_id": user_id,
|
| 1069 |
+
"p_node_type": doc.metadata.get("node_type", "leaf"),
|
| 1070 |
+
"p_parent_node_id": doc.metadata.get("parent_node_id"),
|
| 1071 |
+
"p_node_level": doc.metadata.get("node_level", 0),
|
| 1072 |
+
},
|
| 1073 |
+
).execute()
|
| 1074 |
+
_log_ingestion_retry_event(
|
| 1075 |
+
user_id=user_id,
|
| 1076 |
+
batch_num=batch_num,
|
| 1077 |
+
total_batches=total_batches,
|
| 1078 |
+
attempt=attempt,
|
| 1079 |
+
event_type="success",
|
| 1080 |
+
message="batch uploaded",
|
| 1081 |
+
)
|
| 1082 |
+
break
|
| 1083 |
+
except Exception as e:
|
| 1084 |
+
err = str(e).lower()
|
| 1085 |
+
retryable = any(
|
| 1086 |
+
x in err for x in ["429", "rate", "too many requests", "quota"]
|
| 1087 |
+
)
|
| 1088 |
+
if (not retryable) or attempt >= max_attempts:
|
| 1089 |
+
_log_ingestion_retry_event(
|
| 1090 |
+
user_id=user_id,
|
| 1091 |
+
batch_num=batch_num,
|
| 1092 |
+
total_batches=total_batches,
|
| 1093 |
+
attempt=attempt,
|
| 1094 |
+
event_type="failed",
|
| 1095 |
+
message=str(e),
|
| 1096 |
+
)
|
| 1097 |
+
raise
|
| 1098 |
+
# Exponential backoff with cap to stay below burst limits.
|
| 1099 |
+
sleep_s = min(max_sleep, base_sleep * (2 ** (attempt - 1)))
|
| 1100 |
+
log.warning(
|
| 1101 |
+
"Batch %d/%d rate-limited, retrying in %.1fs (attempt %d/%d): %s",
|
| 1102 |
+
batch_num,
|
| 1103 |
+
total_batches,
|
| 1104 |
+
sleep_s,
|
| 1105 |
+
attempt,
|
| 1106 |
+
max_attempts,
|
| 1107 |
+
str(e)[:120],
|
| 1108 |
+
)
|
| 1109 |
+
_log_ingestion_retry_event(
|
| 1110 |
+
user_id=user_id,
|
| 1111 |
+
batch_num=batch_num,
|
| 1112 |
+
total_batches=total_batches,
|
| 1113 |
+
attempt=attempt,
|
| 1114 |
+
event_type="retry",
|
| 1115 |
+
message=str(e),
|
| 1116 |
+
sleep_s=sleep_s,
|
| 1117 |
+
)
|
| 1118 |
+
time.sleep(sleep_s)
|
| 1119 |
|
| 1120 |
if start + BATCH_SIZE < len(documents):
|
| 1121 |
time.sleep(BATCH_SLEEP)
|
|
|
|
| 1165 |
if not force and is_file_already_ingested(file_hash, access_token=access_token):
|
| 1166 |
log.info("SKIPPING β already ingested.")
|
| 1167 |
return "already_ingested"
|
| 1168 |
+
|
| 1169 |
# NEW: Check if user has previously overridden the category for this file.
|
| 1170 |
# If so, skip the classifier and use their choice directly.
|
| 1171 |
forced_category = None
|
|
|
|
| 1655 |
return [cache_doc]
|
| 1656 |
except Exception as e:
|
| 1657 |
log.warning("Cache check failed, proceeding normally: %s", e)
|
| 1658 |
+
|
| 1659 |
queries_to_run = generate_sub_queries(query)
|
| 1660 |
|
| 1661 |
dynamic_k = 10 if len(queries_to_run) > 1 else 5
|
|
|
|
| 1748 |
base_threshold = 0.0001 # sigmoid of ms-marco logits is very small
|
| 1749 |
else:
|
| 1750 |
base_threshold = RELEVANCE_THRESHOLD # 0.35 for Cohere
|
| 1751 |
+
|
| 1752 |
effective_threshold = base_threshold
|
| 1753 |
if filter_dict or len(all_candidates) <= 10:
|
| 1754 |
if reranker == "cohere":
|
|
|
|
| 1915 |
if budgeted:
|
| 1916 |
log.info(
|
| 1917 |
"Context budget: %d chars across %d/%d chunks.",
|
| 1918 |
+
total_chars,
|
| 1919 |
+
len(budgeted),
|
| 1920 |
+
len(retrieved),
|
| 1921 |
)
|
| 1922 |
retrieved = budgeted
|
| 1923 |
|
|
|
|
| 1982 |
for i, chunk in enumerate(chunks, 1):
|
| 1983 |
prompt += f"--- Source {i} ---\n"
|
| 1984 |
meta = chunk.metadata
|
| 1985 |
+
|
| 1986 |
# Determine if this is a raw chunk or a RAPTOR summary node
|
| 1987 |
node_type = meta.get("node_type", "leaf")
|
| 1988 |
node_level = meta.get("node_level", 0)
|
|
|
|
| 1991 |
# For summary nodes, page_content IS the summary. There is no raw original_content.
|
| 1992 |
prompt += f"[SYNTHESIZED CHAPTER SUMMARY - LEVEL {node_level}]\n"
|
| 1993 |
prompt += f"TEXT:\n{chunk.page_content}\n\n"
|
| 1994 |
+
original = {} # summaries don't have tables/images directly attached yet
|
| 1995 |
else:
|
| 1996 |
# Traditional leaf chunk
|
| 1997 |
original = meta.get("original_content")
|
|
|
|
| 2000 |
original = json.loads(original)
|
| 2001 |
if isinstance(original, str):
|
| 2002 |
original = json.loads(original)
|
| 2003 |
+
except: # noqa: E722
|
| 2004 |
original = {}
|
| 2005 |
elif not isinstance(original, dict):
|
| 2006 |
original = {}
|
|
|
|
| 2100 |
yield {"type": "done", "images": []}
|
| 2101 |
return
|
| 2102 |
# ββ Cache hit handler ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2103 |
+
|
| 2104 |
if len(chunks) == 1 and chunks[0].page_content == "__CACHE_HIT__":
|
| 2105 |
cached = chunks[0].metadata.get("__cache__", {})
|
| 2106 |
answer = cached.get("answer", "")
|
|
|
|
| 2111 |
await asyncio.sleep(0)
|
| 2112 |
yield {"type": "done", "sources": cached.get("sources", []), "images": []}
|
| 2113 |
return
|
| 2114 |
+
|
| 2115 |
# ββ TASK 3: Log RAGAS reward signal to evaluation_logs βββββββββββββββββββ
|
| 2116 |
# Runs fire-and-forget in an executor so it never blocks streaming.
|
| 2117 |
# Uses the Cohere relevance scores already embedded in chunk metadata.
|
|
|
|
| 2196 |
page_ref = pages[0] if pages else "unknown"
|
| 2197 |
loc_key = f"{file_hash}_p{page_ref}"
|
| 2198 |
chunk_relevance = meta.get("relevance_score", 0)
|
| 2199 |
+
|
| 2200 |
if (
|
| 2201 |
chunk_relevance >= config.IMAGE_RELEVANCE_THRESHOLD
|
| 2202 |
): # only show images from highly relevant chunks
|
|
|
|
| 2258 |
yield {"type": "token", "content": suffix}
|
| 2259 |
full_answer += suffix
|
| 2260 |
# ββ Store in query cache βββββββββββββββββββββββββββββββββββββββββββββ
|
| 2261 |
+
|
| 2262 |
try:
|
| 2263 |
if access_token:
|
| 2264 |
from backend.core.auth_utils import extract_jwt_sub
|
| 2265 |
from backend.core.cache_manager import store_cached_answer
|
| 2266 |
+
|
| 2267 |
_uid = extract_jwt_sub(access_token)
|
| 2268 |
_query_vec = get_cached_embedding(query)
|
| 2269 |
_chunk_ids = [c.metadata.get("id", "") for c in chunks]
|
| 2270 |
+
_doc_types = list(
|
| 2271 |
+
set(
|
| 2272 |
+
c.metadata.get("document_type", "general_document")
|
| 2273 |
+
for c in chunks
|
| 2274 |
+
)
|
| 2275 |
+
)
|
| 2276 |
_sources = []
|
| 2277 |
for c in chunks:
|
| 2278 |
meta = c.metadata
|
| 2279 |
orig = meta.get("original_content", "{}")
|
| 2280 |
if isinstance(orig, str):
|
| 2281 |
+
try:
|
| 2282 |
+
orig = json.loads(orig) # noqa: E701
|
| 2283 |
+
except:
|
| 2284 |
+
orig = {} # noqa: E701, E722
|
| 2285 |
+
_sources.append(
|
| 2286 |
+
{
|
| 2287 |
+
"source": meta.get("source", "Unknown"),
|
| 2288 |
+
"score": meta.get("relevance_score"),
|
| 2289 |
+
"chunk": meta.get("chunk_index"),
|
| 2290 |
+
"snippet": (orig.get("raw_text") or c.page_content)[:200],
|
| 2291 |
+
"doc_type": meta.get("document_type"),
|
| 2292 |
+
"pages": meta.get("page_numbers"),
|
| 2293 |
+
}
|
| 2294 |
+
)
|
| 2295 |
store_cached_answer(
|
| 2296 |
user_id=_uid,
|
| 2297 |
query_embedding=_query_vec,
|
|
|
|
| 2305 |
except Exception:
|
| 2306 |
pass # cache store never blocks response
|
| 2307 |
|
|
|
|
| 2308 |
# Save to memory async
|
| 2309 |
try:
|
| 2310 |
loop = asyncio.get_event_loop()
|
|
@@ -472,17 +472,20 @@ class FallbackEmbeddings:
|
|
| 472 |
def embed_documents(self, texts: Sequence[str]) -> List[List[float]]:
|
| 473 |
if not texts:
|
| 474 |
return []
|
|
|
|
| 475 |
last_exc: Optional[BaseException] = None
|
| 476 |
for model in self._models:
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
| 486 |
if last_exc:
|
| 487 |
raise last_exc
|
| 488 |
raise RuntimeError("Embeddings failed without exception")
|
|
|
|
| 472 |
def embed_documents(self, texts: Sequence[str]) -> List[List[float]]:
|
| 473 |
if not texts:
|
| 474 |
return []
|
| 475 |
+
import time
|
| 476 |
last_exc: Optional[BaseException] = None
|
| 477 |
for model in self._models:
|
| 478 |
+
for attempt in range(3):
|
| 479 |
+
try:
|
| 480 |
+
result = self._build(model).embed_documents(list(texts))
|
| 481 |
+
# Guard against OpenRouter returning 200 OK with data=None
|
| 482 |
+
if result is None or any(v is None for v in result):
|
| 483 |
+
raise ValueError(f"Model {model} returned null embeddings")
|
| 484 |
+
return result
|
| 485 |
+
except Exception as exc:
|
| 486 |
+
last_exc = exc
|
| 487 |
+
time.sleep(2 * (attempt + 1)) # Backoff: 2s, 4s, 6s
|
| 488 |
+
continue
|
| 489 |
if last_exc:
|
| 490 |
raise last_exc
|
| 491 |
raise RuntimeError("Embeddings failed without exception")
|
|
@@ -10,7 +10,6 @@ from slowapi import Limiter, _rate_limit_exceeded_handler
|
|
| 10 |
from slowapi.util import get_remote_address
|
| 11 |
from slowapi.errors import RateLimitExceeded
|
| 12 |
from starlette.requests import Request
|
| 13 |
-
from starlette.responses import JSONResponse
|
| 14 |
|
| 15 |
|
| 16 |
def _rate_limit_key(request: Request) -> str:
|
|
@@ -20,25 +19,28 @@ def _rate_limit_key(request: Request) -> str:
|
|
| 20 |
|
| 21 |
|
| 22 |
limiter = Limiter(key_func=_rate_limit_key)
|
| 23 |
-
import logging
|
| 24 |
-
import subprocess
|
| 25 |
-
from contextlib import asynccontextmanager
|
| 26 |
-
from fastapi import FastAPI
|
| 27 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 28 |
-
from fastapi.staticfiles import StaticFiles
|
| 29 |
-
from fastapi.responses import FileResponse
|
| 30 |
-
|
| 31 |
-
from dotenv import load_dotenv
|
|
|
|
| 32 |
load_dotenv()
|
| 33 |
|
| 34 |
-
from backend.api import auth, corpus, ingest, query, admin,frontend_config
|
| 35 |
-
|
|
|
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
|
| 39 |
@asynccontextmanager
|
| 40 |
async def lifespan(app: FastAPI):
|
| 41 |
-
log.info("
|
| 42 |
|
| 43 |
# Auto-start Celery worker unless explicitly disabled
|
| 44 |
# Set AUTO_START_CELERY=false in HF Secrets (start.sh handles it there)
|
|
@@ -67,11 +69,11 @@ async def lifespan(app: FastAPI):
|
|
| 67 |
celery_process.kill()
|
| 68 |
log.info("Celery worker stopped.")
|
| 69 |
|
| 70 |
-
log.info("
|
| 71 |
|
| 72 |
|
| 73 |
app = FastAPI(
|
| 74 |
-
title="
|
| 75 |
docs_url = "/docs" if os.getenv("DOCS_ENABLED", "true").lower() == "true" else None,
|
| 76 |
redoc_url = "/redoc" if os.getenv("DOCS_ENABLED", "true").lower() == "true" else None,
|
| 77 |
)
|
|
@@ -95,9 +97,17 @@ app.include_router(frontend_config.router, prefix="/api/v1/config", tags=["confi
|
|
| 95 |
def health():
|
| 96 |
return {"status": "healthy"}
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
@app.get("/api/status")
|
| 99 |
def status():
|
| 100 |
-
return {"status": "ok", "service": "
|
| 101 |
|
| 102 |
# ββ Static Frontend βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 103 |
# Mount the entire frontend folder at the root of the app so it serves the index.html.
|
|
|
|
| 10 |
from slowapi.util import get_remote_address
|
| 11 |
from slowapi.errors import RateLimitExceeded
|
| 12 |
from starlette.requests import Request
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def _rate_limit_key(request: Request) -> str:
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
limiter = Limiter(key_func=_rate_limit_key)
|
| 22 |
+
import logging # noqa: E402
|
| 23 |
+
import subprocess # noqa: E402
|
| 24 |
+
from contextlib import asynccontextmanager # noqa: E402
|
| 25 |
+
from fastapi import FastAPI # noqa: E402
|
| 26 |
+
from fastapi.middleware.cors import CORSMiddleware # noqa: E402
|
| 27 |
+
from fastapi.staticfiles import StaticFiles # noqa: E402
|
| 28 |
+
from fastapi.responses import FileResponse # noqa: E402
|
| 29 |
+
|
| 30 |
+
from dotenv import load_dotenv # noqa: E402
|
| 31 |
+
|
| 32 |
load_dotenv()
|
| 33 |
|
| 34 |
+
from backend.api import auth, corpus, ingest, query, admin,frontend_config # noqa: E402
|
| 35 |
+
from backend.core.intent_classifier import get_intent_classifier_status # noqa: E402
|
| 36 |
+
|
| 37 |
+
log = logging.getLogger("morpheus.main")
|
| 38 |
|
| 39 |
|
| 40 |
|
| 41 |
@asynccontextmanager
|
| 42 |
async def lifespan(app: FastAPI):
|
| 43 |
+
log.info("MORPHEUS API starting")
|
| 44 |
|
| 45 |
# Auto-start Celery worker unless explicitly disabled
|
| 46 |
# Set AUTO_START_CELERY=false in HF Secrets (start.sh handles it there)
|
|
|
|
| 69 |
celery_process.kill()
|
| 70 |
log.info("Celery worker stopped.")
|
| 71 |
|
| 72 |
+
log.info("MORPHEUS API stopped")
|
| 73 |
|
| 74 |
|
| 75 |
app = FastAPI(
|
| 76 |
+
title="Morpheus RAG API", version="1.0.0", lifespan=lifespan,
|
| 77 |
docs_url = "/docs" if os.getenv("DOCS_ENABLED", "true").lower() == "true" else None,
|
| 78 |
redoc_url = "/redoc" if os.getenv("DOCS_ENABLED", "true").lower() == "true" else None,
|
| 79 |
)
|
|
|
|
| 97 |
def health():
|
| 98 |
return {"status": "healthy"}
|
| 99 |
|
| 100 |
+
|
| 101 |
+
@app.get("/health/details")
|
| 102 |
+
def health_details():
|
| 103 |
+
return {
|
| 104 |
+
"status": "healthy",
|
| 105 |
+
"intent_classifier": get_intent_classifier_status(),
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
@app.get("/api/status")
|
| 109 |
def status():
|
| 110 |
+
return {"status": "ok", "service": "Morpheus RAG API", "version": "1.0.0"}
|
| 111 |
|
| 112 |
# ββ Static Frontend βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 113 |
# Mount the entire frontend folder at the root of the app so it serves the index.html.
|
|
@@ -3,7 +3,7 @@
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8" />
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
-
<title>
|
| 7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/7.8.5/d3.min.js"></script>
|
| 8 |
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 9 |
<link
|
|
@@ -46,12 +46,12 @@
|
|
| 46 |
margin-bottom: 4px;
|
| 47 |
"
|
| 48 |
>
|
| 49 |
-
|
| 50 |
</div>
|
| 51 |
<div
|
| 52 |
style="font-size: 0.75rem; color: var(--muted); letter-spacing: 0.12em"
|
| 53 |
>
|
| 54 |
-
|
| 55 |
</div>
|
| 56 |
<input
|
| 57 |
type="email"
|
|
@@ -68,7 +68,7 @@
|
|
| 68 |
<input
|
| 69 |
type="password"
|
| 70 |
id="loginPassword"
|
| 71 |
-
placeholder="
|
| 72 |
style="
|
| 73 |
width: 260px;
|
| 74 |
text-align: center;
|
|
@@ -171,7 +171,7 @@
|
|
| 171 |
<div id="app" style="display: none">
|
| 172 |
<!-- TOPBAR -->
|
| 173 |
<header id="topbar">
|
| 174 |
-
<div class="topbar-logo">
|
| 175 |
<nav class="topbar-nav">
|
| 176 |
<button
|
| 177 |
class="nav-btn active"
|
|
@@ -343,7 +343,7 @@
|
|
| 343 |
</div>
|
| 344 |
<div class="chat-messages" id="chatMessages">
|
| 345 |
<div class="msg assistant">
|
| 346 |
-
<div class="msg-role">
|
| 347 |
<div class="msg-bubble">
|
| 348 |
Corpus loaded. Ask me anything about your documents.
|
| 349 |
</div>
|
|
|
|
| 3 |
<head>
|
| 4 |
<meta charset="UTF-8" />
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>Morpheus β RAG Corpus Explorer</title>
|
| 7 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/7.8.5/d3.min.js"></script>
|
| 8 |
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 9 |
<link
|
|
|
|
| 46 |
margin-bottom: 4px;
|
| 47 |
"
|
| 48 |
>
|
| 49 |
+
MOR<span style="opacity: 0.5">PHEUS</span>
|
| 50 |
</div>
|
| 51 |
<div
|
| 52 |
style="font-size: 0.75rem; color: var(--muted); letter-spacing: 0.12em"
|
| 53 |
>
|
| 54 |
+
AUTHENTICATION REQUIRED
|
| 55 |
</div>
|
| 56 |
<input
|
| 57 |
type="email"
|
|
|
|
| 68 |
<input
|
| 69 |
type="password"
|
| 70 |
id="loginPassword"
|
| 71 |
+
placeholder="Passwordβ¦"
|
| 72 |
style="
|
| 73 |
width: 260px;
|
| 74 |
text-align: center;
|
|
|
|
| 171 |
<div id="app" style="display: none">
|
| 172 |
<!-- TOPBAR -->
|
| 173 |
<header id="topbar">
|
| 174 |
+
<div class="topbar-logo">MOR<span>PHEUS</span></div>
|
| 175 |
<nav class="topbar-nav">
|
| 176 |
<button
|
| 177 |
class="nav-btn active"
|
|
|
|
| 343 |
</div>
|
| 344 |
<div class="chat-messages" id="chatMessages">
|
| 345 |
<div class="msg assistant">
|
| 346 |
+
<div class="msg-role">MORPHEUS</div>
|
| 347 |
<div class="msg-bubble">
|
| 348 |
Corpus loaded. Ask me anything about your documents.
|
| 349 |
</div>
|
|
@@ -264,7 +264,7 @@ function appendMsg(role, text, sources = [], images = []) {
|
|
| 264 |
: renderMarkdown(text);
|
| 265 |
|
| 266 |
div.innerHTML = `
|
| 267 |
-
<div class="msg-role">${role === 'user' ? 'YOU' : '
|
| 268 |
<div class="msg-bubble">${bubbleContent}</div>
|
| 269 |
${imgHtml}
|
| 270 |
${srcHtml}`;
|
|
@@ -278,7 +278,7 @@ function appendThinking() {
|
|
| 278 |
const div = document.createElement('div');
|
| 279 |
div.className = 'msg assistant';
|
| 280 |
div.innerHTML = `
|
| 281 |
-
<div class="msg-role">
|
| 282 |
<div class="msg-bubble">
|
| 283 |
<div class="thinking-dots"><span></span><span></span><span></span></div>
|
| 284 |
</div>`;
|
|
@@ -294,7 +294,7 @@ function clearChat() {
|
|
| 294 |
document.getElementById('alphaLabel').textContent = 'β 0.5';
|
| 295 |
document.getElementById('chatMessages').innerHTML = `
|
| 296 |
<div class="msg assistant">
|
| 297 |
-
<div class="msg-role">
|
| 298 |
<div class="msg-bubble"><p class="msg-p">Chat cleared. Ask me anything about your documents.</p></div>
|
| 299 |
</div>`;
|
| 300 |
}
|
|
|
|
| 264 |
: renderMarkdown(text);
|
| 265 |
|
| 266 |
div.innerHTML = `
|
| 267 |
+
<div class="msg-role">${role === 'user' ? 'YOU' : 'MORPHEUS'}</div>
|
| 268 |
<div class="msg-bubble">${bubbleContent}</div>
|
| 269 |
${imgHtml}
|
| 270 |
${srcHtml}`;
|
|
|
|
| 278 |
const div = document.createElement('div');
|
| 279 |
div.className = 'msg assistant';
|
| 280 |
div.innerHTML = `
|
| 281 |
+
<div class="msg-role">MORPHEUS</div>
|
| 282 |
<div class="msg-bubble">
|
| 283 |
<div class="thinking-dots"><span></span><span></span><span></span></div>
|
| 284 |
</div>`;
|
|
|
|
| 294 |
document.getElementById('alphaLabel').textContent = 'β 0.5';
|
| 295 |
document.getElementById('chatMessages').innerHTML = `
|
| 296 |
<div class="msg assistant">
|
| 297 |
+
<div class="msg-role">MORPHEUS</div>
|
| 298 |
<div class="msg-bubble"><p class="msg-p">Chat cleared. Ask me anything about your documents.</p></div>
|
| 299 |
</div>`;
|
| 300 |
}
|
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- Telemetry table for ingestion retry/backoff behavior.
|
| 2 |
+
-- Helps track 429 pressure and tune batch/backoff settings.
|
| 3 |
+
|
| 4 |
+
CREATE TABLE IF NOT EXISTS public.ingestion_retry_logs (
|
| 5 |
+
id bigserial PRIMARY KEY,
|
| 6 |
+
created_at timestamptz NOT NULL DEFAULT now(),
|
| 7 |
+
user_id uuid,
|
| 8 |
+
batch_num integer NOT NULL,
|
| 9 |
+
total_batches integer NOT NULL,
|
| 10 |
+
attempt integer NOT NULL,
|
| 11 |
+
event_type text NOT NULL, -- retry | success | failed
|
| 12 |
+
message text,
|
| 13 |
+
sleep_s double precision DEFAULT 0
|
| 14 |
+
);
|
| 15 |
+
|
| 16 |
+
CREATE INDEX IF NOT EXISTS ingestion_retry_logs_created_at_idx
|
| 17 |
+
ON public.ingestion_retry_logs (created_at DESC);
|
| 18 |
+
|
| 19 |
+
CREATE INDEX IF NOT EXISTS ingestion_retry_logs_user_id_idx
|
| 20 |
+
ON public.ingestion_retry_logs (user_id);
|
| 21 |
+
|