Spaces:

ub-aac-chatbot
/

aac-chatbot

Sleeping

App Files Files Community

shwetangisingh committed on Apr 16

Commit

5187368

1 Parent(s): 2d47b97

Remove library bloat

Browse files

Files changed (24) hide show

.env.example +15 -26
.gitignore +3 -7
CLAUDE.md +51 -24
README.md +124 -22
backend/api/main.py +4 -6
backend/config/settings.py +14 -23
backend/generation/llm_client.py +7 -32
backend/main.py +5 -6
backend/pipeline/graph.py +16 -45
backend/pipeline/nodes/feedback.py +42 -52
backend/pipeline/nodes/intent.py +1 -2
backend/pipeline/nodes/planner.py +3 -7
backend/pipeline/nodes/retrieval.py +1 -3
backend/pipeline/state.py +6 -7
backend/retrieval/clustering.py +0 -84
backend/retrieval/vector_store.py +64 -70
backend/sensing/air_writing.py +0 -155
backend/sensing/face_mesh.py +0 -145
backend/sensing/gaze.py +0 -92
backend/sensing/gesture.py +0 -102
backend/sensing/labels.py +6 -0
backend/ui/app.py +0 -163
requirements.txt +5 -24
setup.sh +5 -17

.env.example CHANGED Viewed

@@ -1,36 +1,25 @@
-# Copy this file to .env and fill in your values.
 # Settings here override the defaults in config/settings.py.
-# ── Active LLM tier ────────────────────────────────────────────────────────────
-# "local"   → Ollama on MacBook M2  (dev, no GPU needed)
-# "primary" → Qwen3-30B-A3B on GCP A100/T4 via vLLM
-# "fallback" → Qwen3-8B on same vLLM server
-ACTIVE_LLM_TIER=local
-# ── Primary vLLM server (GCP) ─────────────────────────────────────────────────
-PRIMARY_BASE_URL=http://<GCP_IP>:8000/v1
-PRIMARY_API_KEY=token-abc
-PRIMARY_MODEL=Qwen/Qwen3-30B-A3B
-# ── Fallback model (same vLLM server) ─────────────────────────────────────────
-FALLBACK_MODEL=Qwen/Qwen3-8B
-FALLBACK_BASE_URL=http://<GCP_IP>:8000/v1
-# ── Local Ollama (dev) ────────────────────────────────────────────────────────
-LOCAL_BASE_URL=http://localhost:11434/v1
-LOCAL_MODEL=gemma4:31b-cloud
-# ── MLflow ────────────────────────────────────────────────────────────────────
-MLFLOW_TRACKING_URI=sqlite:///mlflow.db
-MLFLOW_EXPERIMENT=aac-chatbot
-# ── Thinking mode ─────────────────────────────────────────────────────────────
-# "off"   — suppress thinking (fastest, best for latency-sensitive AAC)
-# "strip" — let model think, but strip <think> tags from output
-# "full"  — return raw response including <think> blocks
 THINKING_MODE=off
-# Extra tokens added when thinking is enabled (strip/full). Ignored when off.
 THINKING_TOKEN_BUDGET=4096
-# ── Latency fallback threshold (seconds) ──────────────────────────────────────
 FALLBACK_LATENCY_THRESHOLD=3.5

+# Copy to .env and fill in your values.
 # Settings here override the defaults in config/settings.py.
+# Active tier: "primary" | "fallback"
+ACTIVE_LLM_TIER=primary
+# Both tiers hit Ollama Cloud over the OpenAI-compatible endpoint.
+# Use a larger model on primary; the fallback fires when cumulative
+# latency exceeds FALLBACK_LATENCY_THRESHOLD seconds.
+PRIMARY_BASE_URL=http://localhost:11434/v1
+PRIMARY_MODEL=gemma4:31b-cloud
+PRIMARY_API_KEY=ollama
+FALLBACK_BASE_URL=http://localhost:11434/v1
+FALLBACK_MODEL=gemma4:31b-cloud
+FALLBACK_API_KEY=ollama
+# Per-turn logs are written as JSONL to <LOGS_DIR>/turns.jsonl
+LOGS_DIR=logs
+# off | strip | full | suppress
 THINKING_MODE=off
 THINKING_TOKEN_BUDGET=4096
 FALLBACK_LATENCY_THRESHOLD=3.5

.gitignore CHANGED Viewed

@@ -19,14 +19,10 @@ env/
 # Data — indexes are rebuilt from source; do NOT commit binaries
 data/faiss_store/
-# Air-writing templates (large numpy files, track separately if needed)
-data/air_write_templates/
-# MLflow
-mlruns/
-mlflow.db
-# Latency logs
 timings.csv
 *.csv

 # Data — indexes are rebuilt from source; do NOT commit binaries
 data/faiss_store/
+# Per-turn JSONL logs (contain user conversation content)
+logs/
+# Latency CSVs (legacy)
 timings.csv
 *.csv

CLAUDE.md CHANGED Viewed

@@ -5,7 +5,8 @@
 An AI chatbot that **speaks as an AAC user**, not to them. Given a user persona
 (Mia, Gerald, or Arjun), it fuses real-time multimodal non-verbal signals with
 personal memory retrieval to generate responses in that person's authentic voice.
-Orchestrated as a **LangGraph stateful directed graph** across five layers.
 ---
@@ -19,33 +20,45 @@ frontend/                         React + Vite + TypeScript
 backend/                          Python (conda env: aac-chatbot)
   main.py                         CLI entry point
   api/main.py                     FastAPI REST API
-  pipeline/graph.py               LangGraph StateGraph (5 nodes + conditional edges)
     pipeline/nodes/intent.py        L2 — LLM + Pydantic intent routing
-    pipeline/nodes/retrieval.py     L3 — FAISS + BGE retrieval (fast / full)
     pipeline/nodes/planner.py       L4 — expression-conditioned generation
-    pipeline/nodes/feedback.py      L5 — MLflow logging + Bayesian priors
-  sensing/                        L1 — MediaPipe face mesh, gesture, gaze, air writing (Python, CLI use)
-  retrieval/                      FAISS ops, HDBSCAN clustering, Bayesian bucket priors
-  generation/                     Multi-tier LLM client (vLLM primary / fallback / Ollama local)
   guardrails/                     Input + output safety checks
   config/                         Pydantic BaseSettings — all config in one place
-data/                             Shared data (personas, FAISS indexes)
 ```
 ## Key Design Decisions
-- **LangGraph** orchestrates the pipeline as a stateful directed graph with
-  conditional edges (affect → fast/full retrieval; latency → primary/fallback LLM)
 - **BGE-small-en-v1.5** for embeddings (beats MiniLM on MTEB at same speed)
-- **BGE-reranker-v2-m3** cross-encoder — multilingual, handles Arjun's Hindi
-- **FAISS IndexFlatIP** with L2-normalised vectors (inner product = cosine sim)
-- **Qwen3-30B-A3B** MoE via vLLM — 3B active params/token, sub-3s on T4
-- **Three-tier LLM fallback**: primary (vLLM GCP) → fallback (Qwen3-8B) → local (Ollama)
-- **Pydantic-validated** LLM routing output — LangGraph retries on schema failures
 - **Expression-conditioned response shaping** — affect steers tone, retrieval depth,
   and candidate ranking (not just metadata annotation)
 - **Bayesian bucket priors** — session-level P(bucket) updated after each accepted turn
 - **Browser-side sensing** — MediaPipe JS runs in React frontend, only classified
   labels (affect, gesture, gaze bucket) are sent to the backend API
@@ -69,7 +82,7 @@ data/                             Shared data (personas, FAISS indexes)
 # One-time setup
 bash setup.sh
-# CLI (local Ollama tier)
 python -m backend.main --debug
 # Full stack
@@ -84,9 +97,10 @@ pnpm --dir frontend dev                  # React on :7550
 All config lives in [backend/config/settings.py](backend/config/settings.py) as Pydantic `BaseSettings`.
 Copy `.env.example` → `.env` and set:
-- `ACTIVE_LLM_TIER` — `local` (dev) | `primary` (GCP A100) | `fallback` (Qwen3-8B)
-- `PRIMARY_BASE_URL` — vLLM server address on GCP
-- `MLFLOW_TRACKING_URI` — where MLflow stores runs (default: `mlruns/`)
 ---
@@ -96,23 +110,36 @@ Copy `.env.example` → `.env` and set:
 |------|---------|
 | `data/users.json` | Flat user index (id, name, condition, style) |
 | `data/memories/<uid>.json` | Full persona JSON with bucketed memories |
-| `data/faiss_store/<uid>/` | FAISS index + metadata — **rebuild after any persona edit** |
 | `data/generate_users.py` | Regenerates memories + users.json |
 ---
 ## Development Notes
 - **NEVER use local Ollama models** (e.g. `qwen3:8b`, `gemma3:1b`) — this machine
   is not powerful enough and will break. Always use cloud-backed models like
-  `qwen3.5:397b-cloud` or `gpt-oss:20b-cloud` via Ollama, or vLLM tiers.
 - **Adding a persona**: add to `PERSONAS` in `data/generate_users.py`, re-run it,
   then `python -m backend.retrieval.vector_store` to rebuild indexes
 - **Changing LLM**: set `ACTIVE_LLM_TIER` in `.env` — no code changes needed
-- **Extending sensing**: add module under `backend/sensing/`, wire output into
-  `PipelineState` fields in `backend/pipeline/state.py`
 - **Guardrail tuning**: edit signal lists in `backend/guardrails/checks.py`
 - **Affect → generation mapping**: `_AFFECT_CONFIG` in `backend/pipeline/nodes/intent.py`
   and `_PERSONA_TONE_OVERRIDES` in `backend/pipeline/nodes/planner.py`
-- FAISS indexes in `data/faiss_store/` are gitignored — rebuilt from source JSONs
 - Frontend uses pnpm, Node 22+

 An AI chatbot that **speaks as an AAC user**, not to them. Given a user persona
 (Mia, Gerald, or Arjun), it fuses real-time multimodal non-verbal signals with
 personal memory retrieval to generate responses in that person's authentic voice.
+Orchestrated as a **plain Python function chain** across five layers, with two
+conditional branches.
 ---
 backend/                          Python (conda env: aac-chatbot)
   main.py                         CLI entry point
   api/main.py                     FastAPI REST API
+  pipeline/graph.py               run_pipeline() — plain function chain with 2 conditional branches
     pipeline/nodes/intent.py        L2 — LLM + Pydantic intent routing
+    pipeline/nodes/retrieval.py     L3 — BGE embeddings + torch tensor cosine search (fast / full)
     pipeline/nodes/planner.py       L4 — expression-conditioned generation
+    pipeline/nodes/feedback.py      L5 — JSONL turn logging + Bayesian bucket priors
+  sensing/labels.py               GESTURE_TO_TAG label map (sensing itself runs in browser)
+  retrieval/                      BGE embeddings (torch), Bayesian bucket priors
+  generation/                     Two-tier LLM client (primary / fallback, both Ollama Cloud)
   guardrails/                     Input + output safety checks
   config/                         Pydantic BaseSettings — all config in one place
+data/                             Shared data (personas, vector indexes)
+logs/                             Per-turn JSONL logs (gitignored)
 ```
 ## Key Design Decisions
+- **Plain function chain** orchestrates the pipeline (`run_pipeline` in
+  `backend/pipeline/graph.py`): intent → retrieval → planner → feedback,
+  with two conditional branches (affect picks fast/full retrieval; cumulative
+  latency picks primary/fallback LLM). No LangGraph / LangChain dependency.
 - **BGE-small-en-v1.5** for embeddings (beats MiniLM on MTEB at same speed)
+- **Torch tensor matmul** for vector search on the embedder's device
+  (mps → cuda → cpu). No FAISS, no separate index format. Stored as
+  `vectors.pt` per user. Headroom is ~100k vectors before approximate
+  search (`hnswlib`) becomes worthwhile.
+- **No reranker** — cosine score from BGE-small carries the ranking signal
+  at current scales. Revisit when per-query `top_k` grows past ~30.
+- **Two-tier Ollama Cloud LLM**: `primary` → `fallback` (when cumulative
+  latency exceeds `FALLBACK_LATENCY_THRESHOLD`). Both tiers hit Ollama
+  Cloud over the OpenAI-compatible endpoint. Models default to
+  `gemma4:31b-cloud`; swap one when a larger cloud model is provisioned.
+- **Pydantic-validated** LLM routing output — `intent.py` retries on schema
+  failures (3 attempts) before falling back to a default route
 - **Expression-conditioned response shaping** — affect steers tone, retrieval depth,
   and candidate ranking (not just metadata annotation)
 - **Bayesian bucket priors** — session-level P(bucket) updated after each accepted turn
+- **Per-turn JSONL logging** — one line per turn appended to
+  `logs/turns.jsonl` (no MLflow). Query ad-hoc with DuckDB if needed.
 - **Browser-side sensing** — MediaPipe JS runs in React frontend, only classified
   labels (affect, gesture, gaze bucket) are sent to the backend API
 # One-time setup
 bash setup.sh
+# CLI
 python -m backend.main --debug
 # Full stack
 All config lives in [backend/config/settings.py](backend/config/settings.py) as Pydantic `BaseSettings`.
 Copy `.env.example` → `.env` and set:
+- `ACTIVE_LLM_TIER` — `primary` | `fallback`
+- `PRIMARY_MODEL` / `FALLBACK_MODEL` — Ollama Cloud model identifiers
+  (e.g. `gemma4:31b-cloud`)
+- `LOGS_DIR` — where per-turn JSONL logs are written (default: `logs/`)
 ---
 |------|---------|
 | `data/users.json` | Flat user index (id, name, condition, style) |
 | `data/memories/<uid>.json` | Full persona JSON with bucketed memories |
+| `data/faiss_store/<uid>/` | `vectors.pt` + `meta.json` — **rebuild after any persona edit** |
 | `data/generate_users.py` | Regenerates memories + users.json |
 ---
+## Code Style
+- **Keep comments to a minimum.** Only comment what isn't obvious from the
+  code. No file headers explaining what a module does (the name and code
+  show that). No section divider banners (`# ── Foo ──`). No restating
+  what the next line does. Prefer one-line comments when needed.
+- **Skip `from __future__ import annotations`.** The project is Python 3.10+
+  and uses native `X | None` / `list[dict]` syntax — the import adds nothing.
 ## Development Notes
 - **NEVER use local Ollama models** (e.g. `qwen3:8b`, `gemma3:1b`) — this machine
   is not powerful enough and will break. Always use cloud-backed models like
+  `gemma4:31b-cloud` via Ollama Cloud.
 - **Adding a persona**: add to `PERSONAS` in `data/generate_users.py`, re-run it,
   then `python -m backend.retrieval.vector_store` to rebuild indexes
 - **Changing LLM**: set `ACTIVE_LLM_TIER` in `.env` — no code changes needed
+- **Extending sensing**: sensing runs in the React frontend
+  (`frontend/src/hooks/useSensing.ts`); to add a new signal, classify it
+  there and add a label field to `PipelineState` in
+  `backend/pipeline/state.py`. Keep purely-data label maps in
+  `backend/sensing/labels.py`.
 - **Guardrail tuning**: edit signal lists in `backend/guardrails/checks.py`
 - **Affect → generation mapping**: `_AFFECT_CONFIG` in `backend/pipeline/nodes/intent.py`
   and `_PERSONA_TONE_OVERRIDES` in `backend/pipeline/nodes/planner.py`
+- Vector indexes in `data/faiss_store/` are gitignored — rebuilt from source JSONs
+  via `python -m backend.retrieval.vector_store`
 - Frontend uses pnpm, Node 22+

README.md CHANGED Viewed

@@ -4,7 +4,9 @@ An AI chatbot that **speaks as an AAC user**, not to them. Given a persona (Mia,
 it fuses real-time multimodal non-verbal signals — facial expressions, hand gestures, gaze, and
 air writing — with personal memory retrieval to generate responses in that person's authentic voice.
-Built as a training-free, agentic RAG pipeline orchestrated via **LangGraph**.
 ---
@@ -36,7 +38,7 @@ a personalized digital twin that communicates on their behalf.
 ```
 React Frontend (browser)                    Backend (Python)
   MediaPipe JS sensing ──┐
-  Chat UI ───────────────┼── POST /chat ──► FastAPI ──► LangGraph Pipeline
   Webcam feed ───────────┘                                │
                                             L2 Intent ──► L3 Retrieval ──► L4 Generation ──► L5 Feedback
 ```
@@ -44,13 +46,13 @@ React Frontend (browser)                    Backend (Python)
 | Layer | Module | What it does |
 |-------|--------|-------------|
 | L1 | `frontend/src/hooks/useSensing.ts` | MediaPipe JS — affect, gesture, gaze, air writing (browser-side) |
-| L2 | `backend/pipeline/nodes/intent.py` | LLM + Pydantic-validated intent routing |
-| L3 | `backend/pipeline/nodes/retrieval.py` | FAISS + BGE embeddings + cross-encoder reranking |
 | L4 | `backend/pipeline/nodes/planner.py` | Expression-conditioned response generation (Qwen3) |
-| L5 | `backend/pipeline/nodes/feedback.py` | MLflow tracking + Bayesian bucket prior update |
-The pipeline runs as a **LangGraph stateful directed graph** with conditional edges:
-- FRUSTRATED affect → fast retrieval path (k=2, no reranker)
 - Latency > 3.5s → fallback to smaller Qwen3-8B model
 ---
@@ -59,7 +61,8 @@ The pipeline runs as a **LangGraph stateful directed graph** with conditional ed
 - Python **3.10+** (via conda)
 - Node.js **22+** and **pnpm**
-- [Ollama](https://ollama.com) installed locally for the `local` LLM tier
 - A webcam (for live sensing; optional for CLI mode)
 ---
@@ -76,8 +79,8 @@ The setup script handles:
 - Conda environment creation (`aac-chatbot`, Python 3.12)
 - Python dependency installation
 - `.env` file creation from template
-- FAISS index building (downloads BGE models on first run)
-- Ollama model pull
 - Frontend dependency installation (pnpm)
 ---
@@ -88,13 +91,12 @@ All settings live in [backend/config/settings.py](backend/config/settings.py) an
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `ACTIVE_LLM_TIER` | `local` | `local` (Ollama) \| `primary` (vLLM GCP) \| `fallback` (Qwen3-8B) |
-| `LOCAL_MODEL` | `qwen3:8b` | Ollama model name for local dev |
-| `LOCAL_BASE_URL` | `http://localhost:11434/v1` | Ollama OpenAI-compatible endpoint |
-| `PRIMARY_BASE_URL` | *(GCP IP)* | vLLM server URL on GCP |
-| `PRIMARY_MODEL` | `Qwen/Qwen3-30B-A3B` | Primary MoE model served via vLLM |
 | `FALLBACK_LATENCY_THRESHOLD` | `3.5` | Seconds before falling back to smaller model |
-| `MLFLOW_TRACKING_URI` | `mlruns` | Local MLflow storage path |
 ---
@@ -106,7 +108,7 @@ All settings live in [backend/config/settings.py](backend/config/settings.py) an
 bash run.sh
 ```
-This starts Ollama (if needed), FastAPI on `:8000`, and React on `:7550`.
 Open [http://localhost:7550](http://localhost:7550) in your browser.
 ### CLI only
@@ -147,18 +149,19 @@ multimodal_aac_chatbot/
 │   ├── api/main.py                FastAPI REST API
 │   ├── config/settings.py         Pydantic BaseSettings
 │   ├── pipeline/
-│   │   ├── graph.py               LangGraph StateGraph
 │   │   ├── state.py               PipelineState TypedDict
 │   │   └── nodes/                 intent, retrieval, planner, feedback
-│   ├── sensing/                   MediaPipe modules (Python, CLI use)
-│   ├── retrieval/                 FAISS, BGE, HDBSCAN, bucket priors
-│   ├── generation/llm_client.py   3-tier LLM client (vLLM / Ollama)
 │   └── guardrails/checks.py      Input + output safety checks
 │
 ├── data/
 │   ├── users.json                 Persona index
 │   ├── memories/                  Per-persona memory JSONs
-│   └── faiss_store/               FAISS indexes (gitignored, rebuilt)
 │
 ├── setup.sh                       One-time setup script
 ├── run.sh                         Start backend + frontend
@@ -184,7 +187,106 @@ To add a new persona, edit `data/generate_users.py` and re-run `python -m backen
 ## TODO
 From the spec (pages 10–11). Tags: **[Core]** = must do, **[Bonus]** = nice to have, **[Eval]** = for the grade.
 Heads up: all camera/sensing stuff is in the frontend (MediaPipe JS). Backend just gets the labels (`affect`, `gesture_tag`, `gaze_bucket`). The `backend/sensing/` python modules are dead code.

 it fuses real-time multimodal non-verbal signals — facial expressions, hand gestures, gaze, and
 air writing — with personal memory retrieval to generate responses in that person's authentic voice.
+Built as a training-free, agentic RAG pipeline — a plain-Python function chain
+with two conditional branches (no LangGraph / LangChain), torch-tensor
+retrieval (no FAISS), and JSONL turn logging (no MLflow).
 ---
 ```
 React Frontend (browser)                    Backend (Python)
   MediaPipe JS sensing ──┐
+  Chat UI ───────────────┼── POST /chat ──► FastAPI ──► run_pipeline()
   Webcam feed ───────────┘                                │
                                             L2 Intent ──► L3 Retrieval ──► L4 Generation ──► L5 Feedback
 ```
 | Layer | Module | What it does |
 |-------|--------|-------------|
 | L1 | `frontend/src/hooks/useSensing.ts` | MediaPipe JS — affect, gesture, gaze, air writing (browser-side) |
+| L2 | `backend/pipeline/nodes/intent.py` | Keyword-based intent routing (no LLM) |
+| L3 | `backend/pipeline/nodes/retrieval.py` | BGE-small embeddings + torch tensor cosine search (mps/cuda/cpu) |
 | L4 | `backend/pipeline/nodes/planner.py` | Expression-conditioned response generation (Ollama Cloud LLM; `gemma4:31b-cloud` by default) |
+| L5 | `backend/pipeline/nodes/feedback.py` | JSONL turn logging + Bayesian bucket prior update |
+The pipeline is a plain Python function chain with two conditional branches:
+- FRUSTRATED affect → fast retrieval path (k=2)
 - Cumulative latency > `FALLBACK_LATENCY_THRESHOLD` (default 3.5s) → switch to the `fallback` LLM tier (`FALLBACK_MODEL`)
 ---
 - Python **3.10+** (via conda)
 - Node.js **22+** and **pnpm**
+- An [Ollama Cloud](https://ollama.com) account — both LLM tiers hit
+  cloud-hosted models; no local Ollama daemon required
 - A webcam (for live sensing; optional for CLI mode)
 ---
 - Conda environment creation (`aac-chatbot`, Python 3.12)
 - Python dependency installation
 - `.env` file creation from template
+- Vector index building (downloads BGE-small embedder on first run, saves
+  per-user `vectors.pt` under `data/faiss_store/`)
 - Frontend dependency installation (pnpm)
 ---
 | Variable | Default | Description |
 |----------|---------|-------------|
+| `ACTIVE_LLM_TIER` | `primary` | `primary` \| `fallback` |
+| `PRIMARY_MODEL` | `gemma4:31b-cloud` | Ollama Cloud model for primary tier |
+| `FALLBACK_MODEL` | `gemma4:31b-cloud` | Ollama Cloud model for fallback tier (intended to be smaller/faster; defaults to the same model as primary) |
+| `PRIMARY_BASE_URL` | `http://localhost:11434/v1` | Ollama's OpenAI-compatible endpoint |
 | `FALLBACK_LATENCY_THRESHOLD` | `3.5` | Seconds before falling back to smaller model |
+| `LOGS_DIR` | `logs` | Where per-turn JSONL logs are written |
 ---
 bash run.sh
 ```
+This starts FastAPI on `:8000` and React on `:7550`.
 Open [http://localhost:7550](http://localhost:7550) in your browser.
 ### CLI only
 │   ├── api/main.py                FastAPI REST API
 │   ├── config/settings.py         Pydantic BaseSettings
 │   ├── pipeline/
+│   │   ├── graph.py               run_pipeline() — plain function chain
 │   │   ├── state.py               PipelineState TypedDict
 │   │   └── nodes/                 intent, retrieval, planner, feedback
+│   ├── sensing/labels.py          GESTURE_TO_TAG (sensing runs in browser)
+│   ├── retrieval/                 BGE embeddings (torch tensor) + bucket priors
+│   ├── generation/llm_client.py   2-tier Ollama Cloud LLM client (primary/fallback)
 │   └── guardrails/checks.py      Input + output safety checks
 │
 ├── data/
 │   ├── users.json                 Persona index
 │   ├── memories/                  Per-persona memory JSONs
+│   └── faiss_store/               vectors.pt + meta.json (gitignored, rebuilt)
+├── logs/                          Per-turn JSONL logs (gitignored)
 │
 ├── setup.sh                       One-time setup script
 ├── run.sh                         Start backend + frontend
 ## TODO
 From the spec (pages 10–11). Tags: **[Core]** = must do, **[Bonus]** = nice to have, **[Eval]** = for the grade.
+Roadmap derived from the project spec (pages 10–11). Items are grouped by spec
+area and tagged by type. Bracketed tags map back to the spec:
+**[Core]** = required deliverable, **[Bonus]** = stretch goal, **[Eval]** = validation.
+Project-internal tags: **[Polish]** = cleanup, **[Perf]** = performance, **[Scale]** = scaling work.
+> **Note on sensing:** all camera capture and signal classification happens in
+> the **frontend** (MediaPipe JS). The backend only consumes pre-classified
+> labels (`affect`, `gesture_tag`, `gaze_bucket`).
+### Dataset
+- [ ] **[Core]** Add **heterogeneous** memory types per persona — currently only
+      autobiographical narratives exist.
+  - [ ] Add a set of synthetic social-media posts per persona (voice-matched)
+  - [ ] Add a set of synthetic past communication logs per persona
+  - [ ] Regenerate the synthesis script to produce both, then rebuild embeddings
+  - [ ] Make ingestion type-aware so the retriever knows which chunk-type a hit came from
+- [ ] **[Core]** Document the dataset schema so it is reusable by the evaluation harness.
+### Multimodal Sensing (frontend)
+- [ ] **[Core]** Detect **head-nod / sharp tilt as dissatisfaction**, distinct
+      from a generic frustrated affect read.
+  - [ ] Send a `dissatisfaction_signal` to the backend alongside the existing labels
+  - [ ] When the signal fires, branch the planner to a **"Turnaround Option"** —
+        a clarification candidate ("Did you mean X or Y?") instead of a plain answer
+- [ ] **[Bonus]** Add **vocalisation capture** (Web Speech API) and a
+      **conflict-resolution** step that compares the spoken intent against the
+      air-written intent, sending a single `resolved_intent` to the backend.
+- [ ] **[Polish]** Tighten the **thumbs-up boost** — today it only annotates the
+      prompt. The retriever should also bias affirmative-leaning candidates when
+      a thumbs-up is present.
+### Agentic Intent Decomposition
+> **Current state:** intent routing is **keyword-based**, not LLM-based.
+> The original LLM-driven router (Pydantic-validated JSON output) was
+> dropped because `gemma4:31b-cloud` consistently emitted the wrong JSON
+> shape and got truncated by `max_tokens`, triggering 3 retries + a
+> hard-fallback on every turn — adding ~30s of dead latency before the
+> generation call. The keyword router (~5 buckets matched against
+> hardcoded word lists in `intent.py`) handles the demo personas
+> reliably and adds ~0ms per turn.
+>
+> **Trade-off:** the router is limited to the 5 hardcoded buckets
+> (`family`, `medical`, `hobbies`, `daily_routine`, `social`) and can't
+> distinguish `OPEN_DOMAIN` from `PERSONAL` queries. Acceptable today
+> because all current personas only have personal memories.
+- [ ] **[Core]** Make Personal / Contextual / Open-domain routing actually hit
+      **different retrieval pools** — today all sub-queries fall back to the same
+      vector index. Requires re-introducing some form of intent classification
+      (likely a constrained-output LLM call once `response_format=json_schema`
+      is supported on Ollama Cloud, or a tiny local classifier).
+- [ ] **[Perf]** When/if we re-add LLM intent: cache the schema prompt,
+      use a smaller routing model, and parallelise sub-query retrieval.
+### Retrieval
+- [ ] **[Bonus]** Persist **bucket priors** per user across conversations
+      (currently per-session only).
+- [ ] **[Bonus]** Extend the **latency-optimised fallback** beyond a single
+      LLM-tier switch:
+  - [ ] Return a cached canned response when end-to-end latency blows the budget
+  - [ ] Use the spec's **< 6s end-to-end** target instead of the current 3.5s threshold
+- [ ] **[Scale]** When per-user memory grows past ~100k chunks, swap the
+      torch-tensor matmul search for `hnswlib` (a ~2 MB approximate-NN library);
+      reintroduce a cross-encoder reranker once `top_k > ~30`.
+### Training-Free Response Generation
+- [ ] **[Core]** Return **multiple candidate responses** from the API so the
+      user can pick one (today the endpoint returns a single string).
+- [ ] **[Bonus]** On user selection, upsert the `(query, selected_response)` pair
+      into a small "accepted-pairs" index and consult it as a high-prior shortcut
+      on the next turn — the spec's lightweight retrieval-index update.
+### Evaluation & Validation
+- [ ] **[Eval]** **Factual Faithfulness** — NLI-based groundedness metric over
+      (retrieved evidence, generated response) pairs, reported as a hallucination
+      rate on a held-out set of partner-style queries per persona.
+- [ ] **[Eval]** **Communication Efficiency** — p50 / p95 end-to-end latency
+      across both LLM tiers (primary and fallback), with a pass/fail gate at the
+      spec target of **< 6s p95**.
+- [ ] **[Eval]** **Perceived Authenticity** — generate paired (persona, query,
+      response) samples and a 5-point Likert rating sheet for the live in-class eval.
+- [ ] **[Eval]** **Multimodal Alignment** — synthetic (gesture, query) scenarios
+      checked against expected response traits (e.g. thumbs-up ⇒ affirmative
+      lexicon present), reported as alignment accuracy.
+### Polish
+- [ ] **[Polish]** Move the hard-coded affect→tone and persona-override dicts
+      into a single YAML so tone-shaping can be tuned without touching code.
+- [x] **[Polish]** Delete the unused `backend/sensing/` Python modules now that
+      sensing lives entirely in the frontend. *(Done — only `labels.py` remains.)*
 Heads up: all camera/sensing stuff is in the frontend (MediaPipe JS). Backend just gets the labels (`affect`, `gesture_tag`, `gaze_bucket`). The old `backend/sensing/` Python modules were dead code and have been removed; only `labels.py` remains.

backend/api/main.py CHANGED Viewed

@@ -14,10 +14,10 @@ from backend.generation.llm_client import (  # active_model used by /debug/confi
     get_client,
 )
 from backend.guardrails.checks import check_input
-from backend.pipeline.graph import aac_graph
 from backend.pipeline.state import PipelineState
 from backend.retrieval.bucket_priors import uniform_priors
-from backend.retrieval.vector_store import _get_embedder, _get_reranker
 app = FastAPI(
     title="Multimodal AAC Chatbot API",
@@ -45,7 +45,6 @@ def _warmup():
     logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
     print("Loading models...", end=" ", flush=True)
     _get_embedder()
-    _get_reranker()
     get_client()
     _models_ready = True
     print("ready.")
@@ -151,7 +150,7 @@ def _build_initial_state(req: ChatRequest, session: dict) -> PipelineState:
             "t_generation": 0.0,
             "t_total": 0.0,
         },
-        mlflow_run_id=None,
         guardrail_passed=True,
     )
@@ -172,7 +171,6 @@ def debug_config():
         "active_model": active_model(),
         "thinking_mode": settings.thinking_mode,
         "embed_model": settings.embed_model,
-        "rerank_model": settings.rerank_model,
         "retrieval_top_k": settings.retrieval_top_k,
         "retrieval_rerank_k": settings.retrieval_rerank_k,
         "fallback_latency_threshold": settings.fallback_latency_threshold,
@@ -217,7 +215,7 @@ def chat(req: ChatRequest):
     session = _get_or_init_session(req.user_id)
     initial_state = _build_initial_state(req, session)
-    result: PipelineState = aac_graph.invoke(initial_state)
     # Persist updated session state
     session["session_history"] = result["session_history"]

     get_client,
 )
 from backend.guardrails.checks import check_input
+from backend.pipeline.graph import run_pipeline
 from backend.pipeline.state import PipelineState
 from backend.retrieval.bucket_priors import uniform_priors
+from backend.retrieval.vector_store import _get_embedder
 app = FastAPI(
     title="Multimodal AAC Chatbot API",
     logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
     print("Loading models...", end=" ", flush=True)
     _get_embedder()
     get_client()
     _models_ready = True
     print("ready.")
             "t_generation": 0.0,
             "t_total": 0.0,
         },
+        run_id=None,
         guardrail_passed=True,
     )
         "active_model": active_model(),
         "thinking_mode": settings.thinking_mode,
         "embed_model": settings.embed_model,
         "retrieval_top_k": settings.retrieval_top_k,
         "retrieval_rerank_k": settings.retrieval_rerank_k,
         "fallback_latency_threshold": settings.fallback_latency_threshold,
     session = _get_or_init_session(req.user_id)
     initial_state = _build_initial_state(req, session)
+    result: PipelineState = run_pipeline(initial_state)
     # Persist updated session state
     session["session_history"] = result["session_history"]

backend/config/settings.py CHANGED Viewed

@@ -10,35 +10,30 @@ class Settings(BaseSettings):
     # ── Paths ──────────────────────────────────────────────────────────────────
     data_dir: Path = Path("data")
-    faiss_store_dir: Path = Path("data/faiss_store")
     memories_dir: Path = Path("data/memories")
     users_json: Path = Path("data/users.json")
-    # ── Retrieval models ───────────────────────────────────────────────────────
     embed_model: str = "BAAI/bge-small-en-v1.5"
-    rerank_model: str = "BAAI/bge-reranker-v2-m3"
     retrieval_top_k: int = 5
     retrieval_rerank_k: int = 3
     retrieval_fast_k: int = 2  # used when affect == FRUSTRATED
-    # ── LLM tiers ─────────────────────────────────────────────────────────────
-    # Tier 1 — primary (Qwen3-30B-A3B via vLLM on GCP)
-    primary_model: str = "Qwen/Qwen3-30B-A3B"
-    primary_base_url: str = "http://localhost:8000/v1"
-    primary_api_key: str = "token-abc"  # vLLM default
-    # Tier 2 — fallback dense model (Qwen3-8B via vLLM, same server)
-    fallback_model: str = "Qwen/Qwen3-8B"
-    fallback_base_url: str = "http://localhost:8000/v1"
-    fallback_api_key: str = "token-abc"
-    # Tier 3 — local dev (Ollama on MacBook M2)
-    local_model: str = "qwen3:8b"
-    local_base_url: str = "http://localhost:11434/v1"
-    local_api_key: str = "ollama"
-    # Active tier: "primary" | "fallback" | "local"
-    active_llm_tier: str = "local"
     # off | strip | full | suppress
     thinking_mode: str = "off"
@@ -60,10 +55,6 @@ class Settings(BaseSettings):
     air_write_end_gap_ms: int = 200  # ms of stillness to end a stroke
     conflict_overlap_ms: int = 500  # audio + gesture co-occurrence window
-    # ── MLflow ────────────────────────────────────────────────────────────────
-    mlflow_tracking_uri: str = "sqlite:///mlflow.db"
-    mlflow_experiment: str = "aac-chatbot"
     # ── Candidate ranking weights ───────────────────────────────────────────────
     rank_alpha: float = 0.4  # faithfulness weight
     rank_beta: float = 0.3  # style similarity weight

     # ── Paths ──────────────────────────────────────────────────────────────────
     data_dir: Path = Path("data")
+    faiss_store_dir: Path = Path("data/faiss_store")  # name kept for back-compat
     memories_dir: Path = Path("data/memories")
     users_json: Path = Path("data/users.json")
+    logs_dir: Path = Path("logs")
+    # ── Retrieval ────────────────────────────────────────────────────────────
     embed_model: str = "BAAI/bge-small-en-v1.5"
     retrieval_top_k: int = 5
     retrieval_rerank_k: int = 3
     retrieval_fast_k: int = 2  # used when affect == FRUSTRATED
+    # LLM tiers — both hit Ollama Cloud via OpenAI-compatible endpoint.
+    # Same model on both tiers for now; swap one when a larger cloud model
+    # is provisioned and the latency-fallback should branch.
+    primary_model: str = "gemma4:31b-cloud"
+    primary_base_url: str = "http://localhost:11434/v1"
+    primary_api_key: str = "ollama"
+    fallback_model: str = "gemma4:31b-cloud"
+    fallback_base_url: str = "http://localhost:11434/v1"
+    fallback_api_key: str = "ollama"
+    # Active tier: "primary" | "fallback"
+    active_llm_tier: str = "primary"
     # off | strip | full | suppress
     thinking_mode: str = "off"
     air_write_end_gap_ms: int = 200  # ms of stillness to end a stroke
     conflict_overlap_ms: int = 500  # audio + gesture co-occurrence window
     # ── Candidate ranking weights ───────────────────────────────────────────────
     rank_alpha: float = 0.4  # faithfulness weight
     rank_beta: float = 0.3  # style similarity weight

backend/generation/llm_client.py CHANGED Viewed

@@ -1,6 +1,4 @@
-# Multi-tier LLM client — primary (vLLM) / fallback / local (Ollama), all OpenAI-compatible.
-from __future__ import annotations
 import re
 from functools import lru_cache
 from typing import Any
@@ -10,33 +8,23 @@ from openai import OpenAI
 from backend.config.settings import settings
-@lru_cache(maxsize=3)
 def _build_client(base_url: str, api_key: str) -> OpenAI:
     return OpenAI(base_url=base_url, api_key=api_key)
 def get_client(tier: str | None = None) -> OpenAI:
     resolved = tier or settings.active_llm_tier
-    if resolved == "primary":
-        return _build_client(settings.primary_base_url, settings.primary_api_key)
     if resolved == "fallback":
         return _build_client(settings.fallback_base_url, settings.fallback_api_key)
-    # local / default
-    return _build_client(settings.local_base_url, settings.local_api_key)
 def active_model(tier: str | None = None) -> str:
     resolved = tier or settings.active_llm_tier
-    models = {
-        "primary": settings.primary_model,
-        "fallback": settings.fallback_model,
-        "local": settings.local_model,
-    }
     if resolved not in models:
-        raise ValueError(
-            f"Unknown LLM tier: '{resolved}'. Must be primary/fallback/local."
-        )
     return models[resolved]
@@ -61,29 +49,16 @@ def chat_complete(
     temperature: float = 0.7,
     **kwargs: Any,
 ) -> str:
-    # Returns response text. Handles thinking mode and local-tier collapsing.
     resolved_tier = tier or settings.active_llm_tier
-    # Local dev: no GCP server available — collapse all tiers to Ollama
-    if settings.active_llm_tier == "local":
-        resolved_tier = "local"
     model = active_model(resolved_tier)
     client = get_client(resolved_tier)
     patched_messages = messages
     extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
-    # Suppress thinking for models that think by default.
     if settings.thinking_mode == "suppress":
-        if resolved_tier == "local":
-            patched_messages = _apply_no_think(messages)
-        else:
-            extra_body = {
-                **extra_body,
-                "chat_template_kwargs": {"enable_thinking": False},
-            }
-    # Add thinking budget when enabled.
     effective_max_tokens = max_tokens
     if settings.thinking_mode in ("strip", "full"):
         effective_max_tokens = max_tokens + settings.thinking_token_budget

+# Two-tier LLM client — primary / fallback, both Ollama Cloud over OpenAI-compatible HTTP.
 import re
 from functools import lru_cache
 from typing import Any
 from backend.config.settings import settings
+@lru_cache(maxsize=2)
 def _build_client(base_url: str, api_key: str) -> OpenAI:
     return OpenAI(base_url=base_url, api_key=api_key)
 def get_client(tier: str | None = None) -> OpenAI:
     resolved = tier or settings.active_llm_tier
     if resolved == "fallback":
         return _build_client(settings.fallback_base_url, settings.fallback_api_key)
+    return _build_client(settings.primary_base_url, settings.primary_api_key)
 def active_model(tier: str | None = None) -> str:
     resolved = tier or settings.active_llm_tier
+    models = {"primary": settings.primary_model, "fallback": settings.fallback_model}
     if resolved not in models:
+        raise ValueError(f"Unknown LLM tier: '{resolved}'. Must be primary/fallback.")
     return models[resolved]
     temperature: float = 0.7,
     **kwargs: Any,
 ) -> str:
     resolved_tier = tier or settings.active_llm_tier
     model = active_model(resolved_tier)
     client = get_client(resolved_tier)
     patched_messages = messages
     extra_body: dict[str, Any] = kwargs.pop("extra_body", {})
     if settings.thinking_mode == "suppress":
+        patched_messages = _apply_no_think(messages)
     effective_max_tokens = max_tokens
     if settings.thinking_mode in ("strip", "full"):
         effective_max_tokens = max_tokens + settings.thinking_token_budget

backend/main.py CHANGED Viewed

@@ -9,10 +9,10 @@ import time
 from backend.config.settings import settings
 from backend.guardrails.checks import check_input
-from backend.pipeline.graph import aac_graph
 from backend.pipeline.state import GenerationConfig, PipelineState
 from backend.retrieval.bucket_priors import uniform_priors
-from backend.retrieval.vector_store import _get_embedder, _get_reranker
 def parse_args() -> argparse.Namespace:
@@ -28,7 +28,7 @@ def parse_args() -> argparse.Namespace:
         "--tier",
         type=str,
         default=None,
-        choices=["primary", "fallback", "local"],
         help="Override LLM tier (default: settings.active_llm_tier)",
     )
     return p.parse_args()
@@ -134,7 +134,6 @@ def main() -> None:
     # Warm up models
     print(f"\nLoading models for {profile['name']} …", end=" ", flush=True)
     _get_embedder()
-    _get_reranker()
     print("ready.\n")
     session_history: list[dict] = []
@@ -197,11 +196,11 @@ def main() -> None:
                 "t_generation": 0.0,
                 "t_total": 0.0,
             },
-            mlflow_run_id=None,
             guardrail_passed=True,
         )
-        result: PipelineState = aac_graph.invoke(state)
         print(f"AAC Bot: {result['selected_response']}\n")

 from backend.config.settings import settings
 from backend.guardrails.checks import check_input
+from backend.pipeline.graph import run_pipeline
 from backend.pipeline.state import GenerationConfig, PipelineState
 from backend.retrieval.bucket_priors import uniform_priors
+from backend.retrieval.vector_store import _get_embedder
 def parse_args() -> argparse.Namespace:
         "--tier",
         type=str,
         default=None,
+        choices=["primary", "fallback"],
         help="Override LLM tier (default: settings.active_llm_tier)",
     )
     return p.parse_args()
     # Warm up models
     print(f"\nLoading models for {profile['name']} …", end=" ", flush=True)
     _get_embedder()
     print("ready.\n")
     session_history: list[dict] = []
                 "t_generation": 0.0,
                 "t_total": 0.0,
             },
+            run_id=None,
             guardrail_passed=True,
         )
+        result: PipelineState = run_pipeline(state)
         print(f"AAC Bot: {result['selected_response']}\n")

backend/pipeline/graph.py CHANGED Viewed

@@ -1,65 +1,36 @@
-# LangGraph pipeline graph — intent → retrieval → generation → feedback.
-from langgraph.graph import END, StateGraph
 from backend.pipeline.nodes import feedback, intent, planner, retrieval
 from backend.pipeline.state import PipelineState
 def _route_by_affect(state: PipelineState) -> str:
-    """Conditional edge: FRUSTRATED → fast path, otherwise full retrieval."""
     emotion = (state.get("affect") or {}).get("emotion", "NEUTRAL")
     return "fast" if emotion == "FRUSTRATED" else "full"
 def _route_by_latency(state: PipelineState) -> str:
-    """Conditional edge: if cumulative latency > threshold, use fallback LLM."""
-    from backend.config.settings import settings
     log = state.get("latency_log") or {}
     elapsed = log.get("t_intent", 0.0) + log.get("t_retrieval", 0.0)
     return "fallback" if elapsed > settings.fallback_latency_threshold else "primary"
-def build_graph() -> StateGraph:
-    graph = StateGraph(PipelineState)
-    # ── Nodes ──────────────────────────────────────────────────────────────────
-    graph.add_node("intent", intent.run)
-    graph.add_node("fast_retrieval", retrieval.run_fast)
-    graph.add_node("full_retrieval", retrieval.run_full)
-    graph.add_node("primary_gen", planner.run_primary)
-    graph.add_node("fallback_gen", planner.run_fallback)
-    graph.add_node("feedback", feedback.run)
-    # ── Entry ──────────────────────────────────────────────────────────────────
-    graph.set_entry_point("intent")
-    # ── Affect-aware routing after intent ─────────────────────────────────────
-    graph.add_conditional_edges(
-        "intent",
-        _route_by_affect,
-        {"fast": "fast_retrieval", "full": "full_retrieval"},
-    )
-    # ── Latency-aware routing after retrieval ─────────────────────────────────
-    graph.add_conditional_edges(
-        "fast_retrieval",
-        _route_by_latency,
-        {"primary": "primary_gen", "fallback": "fallback_gen"},
-    )
-    graph.add_conditional_edges(
-        "full_retrieval",
-        _route_by_latency,
-        {"primary": "primary_gen", "fallback": "fallback_gen"},
-    )
-    # ── Feedback loop ─────────────────────────────────────────────────────────
-    graph.add_edge("primary_gen", "feedback")
-    graph.add_edge("fallback_gen", "feedback")
-    graph.add_edge("feedback", END)
-    return graph.compile()
-# Module-level compiled graph — import this everywhere
-aac_graph = build_graph()

+# Pipeline orchestrator: intent → retrieval → generation → feedback.
+from backend.config.settings import settings
 from backend.pipeline.nodes import feedback, intent, planner, retrieval
 from backend.pipeline.state import PipelineState
 def _route_by_affect(state: PipelineState) -> str:
     emotion = (state.get("affect") or {}).get("emotion", "NEUTRAL")
     return "fast" if emotion == "FRUSTRATED" else "full"
 def _route_by_latency(state: PipelineState) -> str:
     """Choose the generation tier from the time already spent this turn.
     Returns "fallback" when t_intent + t_retrieval exceeds
     settings.fallback_latency_threshold, otherwise "primary".
     """
     timings = state.get("latency_log") or {}
     spent = timings.get("t_intent", 0.0) + timings.get("t_retrieval", 0.0)
     if spent > settings.fallback_latency_threshold:
         return "fallback"
     return "primary"
+def _merge(state: PipelineState, update: dict) -> None:
+    state.update(update)  # type: ignore[typeddict-item]
+def run_pipeline(state: PipelineState) -> PipelineState:
+    _merge(state, intent.run(state))
+    if _route_by_affect(state) == "fast":
+        _merge(state, retrieval.run_fast(state))
+    else:
+        _merge(state, retrieval.run_full(state))
+    if _route_by_latency(state) == "fallback":
+        _merge(state, planner.run_fallback(state))
+    else:
+        _merge(state, planner.run_primary(state))
+    _merge(state, feedback.run(state))
+    return state

backend/pipeline/nodes/feedback.py CHANGED Viewed

@@ -1,5 +1,8 @@
-# Feedback node — MLflow logging, bucket prior update, history append.
-from __future__ import annotations
 from backend.config.settings import settings
 from backend.pipeline.state import PipelineState
@@ -7,64 +10,52 @@ from backend.retrieval.bucket_priors import update_priors
 def run(state: PipelineState) -> dict:
     try:
-        mlflow_run_id = _log_to_mlflow(state)
-    except Exception:
-        mlflow_run_id = None
     updated_priors = _update_bucket_priors(state)
     updated_history = _append_turn_to_history(state)
     return {
         "bucket_priors": updated_priors,
         "session_history": updated_history,
-        "mlflow_run_id": mlflow_run_id,
     }
-# ── MLflow logging ─────────────────────────────────────────────────────────────
-def _log_to_mlflow(state: PipelineState) -> str:
-    import mlflow
-    mlflow.set_tracking_uri(settings.mlflow_tracking_uri)
-    mlflow.set_experiment(settings.mlflow_experiment)
     latency = state.get("latency_log") or {}
     affect = (state.get("affect") or {}).get("emotion", "UNKNOWN")
-    with mlflow.start_run(run_name=f"turn-{state['turn_id']}") as run:
-        mlflow.log_params(
-            {
-                "user_id": state["user_id"],
-                "turn_id": state["turn_id"],
-                "llm_tier": state.get("llm_tier_used", "unknown"),
-                "retrieval_mode": state.get("retrieval_mode_used", "unknown"),
-                "affect": affect,
-                "guardrail_passed": state.get("guardrail_passed", True),
-            }
-        )
-        mlflow.log_metrics(
-            {
-                "t_sensing": latency.get("t_sensing", 0.0),
-                "t_intent": latency.get("t_intent", 0.0),
-                "t_retrieval": latency.get("t_retrieval", 0.0),
-                "t_generation": latency.get("t_generation", 0.0),
-                "t_total": latency.get("t_total", 0.0),
-                "num_chunks": float(len(state.get("retrieved_chunks") or [])),
-            }
-        )
-        # Log the selected response as artifact text for qualitative review
-        mlflow.log_text(
-            state.get("selected_response") or "",
-            f"responses/turn_{state['turn_id']}.txt",
-        )
-        return run.info.run_id
-# ── Bayesian bucket prior update ───────────────────────────────────────────────
 def _update_bucket_priors(state: PipelineState) -> dict[str, float]:
@@ -83,11 +74,10 @@ def _update_bucket_priors(state: PipelineState) -> dict[str, float]:
     )
-# ── Session history append ─────────────────────────────────────────────────────
 def _append_turn_to_history(state: PipelineState) -> list[dict]:
-    return [
-        {"role": "partner", "content": state["raw_query"]},
-        {"role": "aac_user", "content": state.get("selected_response") or ""},
-    ]

+# Feedback node — JSONL turn logging, bucket prior update, history append.
+import json
+import time
+import uuid
+from pathlib import Path
 from backend.config.settings import settings
 from backend.pipeline.state import PipelineState
 def run(state: PipelineState) -> dict:
+    run_id = uuid.uuid4().hex
     try:
+        _log_to_jsonl(state, run_id)
+    except Exception as exc:
+        # logging never blocks the response path, but make the failure visible
+        print(f"[feedback] JSONL log failed: {exc!r}")
     updated_priors = _update_bucket_priors(state)
     updated_history = _append_turn_to_history(state)
     return {
         "bucket_priors": updated_priors,
         "session_history": updated_history,
+        "run_id": run_id,
     }
+def _log_to_jsonl(state: PipelineState, run_id: str) -> None:
+    logs_dir = Path(settings.logs_dir)
+    logs_dir.mkdir(parents=True, exist_ok=True)
+    log_path = logs_dir / "turns.jsonl"
     latency = state.get("latency_log") or {}
     affect = (state.get("affect") or {}).get("emotion", "UNKNOWN")
+    entry = {
+        "run_id": run_id,
+        "ts": time.time(),
+        "user_id": state["user_id"],
+        "turn_id": state["turn_id"],
+        "llm_tier": state.get("llm_tier_used", "unknown"),
+        "retrieval_mode": state.get("retrieval_mode_used", "unknown"),
+        "affect": affect,
+        "guardrail_passed": state.get("guardrail_passed", True),
+        "num_chunks": len(state.get("retrieved_chunks") or []),
+        "latency": {
+            "t_sensing": latency.get("t_sensing", 0.0),
+            "t_intent": latency.get("t_intent", 0.0),
+            "t_retrieval": latency.get("t_retrieval", 0.0),
+            "t_generation": latency.get("t_generation", 0.0),
+            "t_total": latency.get("t_total", 0.0),
+        },
+        "response": state.get("selected_response") or "",
+    }
+    with open(log_path, "a", encoding="utf-8") as f:
+        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
 def _update_bucket_priors(state: PipelineState) -> dict[str, float]:
     )
 def _append_turn_to_history(state: PipelineState) -> list[dict]:
+    history = list(state.get("session_history") or [])
+    history.append({"role": "partner", "content": state["raw_query"]})
+    history.append(
+        {"role": "aac_user", "content": state.get("selected_response") or ""}
+    )
+    return history

backend/pipeline/nodes/intent.py CHANGED Viewed

@@ -106,7 +106,6 @@ def _build_user_prompt(
 def run(state: PipelineState) -> dict:
-    """LangGraph node: intent decomposition."""
     t0 = time.perf_counter()
     # --fast mode: intent_route already resolved by keyword routing in main.py
@@ -123,7 +122,7 @@ def run(state: PipelineState) -> dict:
     route: IntentRoute | None = None
     last_error: str = ""
-    for attempt in range(3):  # LangGraph retry logic (up to 2 retries)
         messages = [
             {"role": "system", "content": _SYSTEM_PROMPT},
             {

 def run(state: PipelineState) -> dict:
     t0 = time.perf_counter()
     # --fast mode: intent_route already resolved by keyword routing in main.py
     route: IntentRoute | None = None
     last_error: str = ""
+    for attempt in range(3):  # up to 2 retries on schema validation failure
         messages = [
             {"role": "system", "content": _SYSTEM_PROMPT},
             {

backend/pipeline/nodes/planner.py CHANGED Viewed

@@ -7,7 +7,7 @@ from backend.config.settings import settings
 from backend.generation.llm_client import active_model, chat_complete
 from backend.guardrails.checks import check_output
 from backend.pipeline.state import PipelineState
-from backend.sensing.gesture import GESTURE_TO_TAG
 # ── Persona-specific tone tags (applied on top of affect base tag) ─────────────
@@ -94,16 +94,12 @@ def _run(state: PipelineState, tier: str) -> dict:
         4,
     )
-    # Mirror chat_complete's tier collapsing so the reported model matches what ran.
-    actual_tier = "local" if settings.active_llm_tier == "local" else tier
-    actual_model = active_model(actual_tier)
     return {
         "augmented_prompt": prompt,
         "candidates": candidates,
         "selected_response": selected,
-        "llm_tier_used": actual_tier,
-        "llm_model_used": actual_model,
         "latency_log": latency_log,
         "guardrail_passed": guard["passed"],
     }

 from backend.generation.llm_client import active_model, chat_complete
 from backend.guardrails.checks import check_output
 from backend.pipeline.state import PipelineState
+from backend.sensing.labels import GESTURE_TO_TAG
 # ── Persona-specific tone tags (applied on top of affect base tag) ─────────────
         4,
     )
     return {
         "augmented_prompt": prompt,
         "candidates": candidates,
         "selected_response": selected,
+        "llm_tier_used": tier,
+        "llm_model_used": active_model(tier),
         "latency_log": latency_log,
         "guardrail_passed": guard["passed"],
     }

backend/pipeline/nodes/retrieval.py CHANGED Viewed

@@ -26,14 +26,13 @@ def run_fast(state: PipelineState) -> dict:
         top_k=settings.retrieval_fast_k,
         rerank_k=settings.retrieval_fast_k,
         bucket_filter=bucket_hint,
-        use_reranker=False,
     )
     return _build_return(state, chunks, "fast", t0)
 def run_full(state: PipelineState) -> dict:
-    """Full retrieval path with BGE cross-encoder reranking."""
     t0 = time.perf_counter()
     # Prefer gaze hint > intent bucket hint > None
@@ -49,7 +48,6 @@ def run_full(state: PipelineState) -> dict:
         top_k=settings.retrieval_top_k,
         rerank_k=settings.retrieval_rerank_k,
         bucket_filter=bucket_hint,
-        use_reranker=True,
     )
     return _build_return(state, chunks, "full", t0)

         top_k=settings.retrieval_fast_k,
         rerank_k=settings.retrieval_fast_k,
         bucket_filter=bucket_hint,
     )
     return _build_return(state, chunks, "fast", t0)
 def run_full(state: PipelineState) -> dict:
+    """Full retrieval path: top_k cosine matches narrowed to rerank_k."""
     t0 = time.perf_counter()
     # Prefer gaze hint > intent bucket hint > None
         top_k=settings.retrieval_top_k,
         rerank_k=settings.retrieval_rerank_k,
         bucket_filter=bucket_hint,
     )
     return _build_return(state, chunks, "full", t0)

backend/pipeline/state.py CHANGED Viewed

@@ -1,8 +1,7 @@
-# Typed state flowing through every LangGraph node.
 from __future__ import annotations
-import operator
-from typing import Annotated, Any
 from typing_extensions import TypedDict
@@ -26,7 +25,7 @@ class RetrievedChunk(TypedDict):
     text: str
     bucket: str  # family | medical | hobbies | daily_routine | social
     user: str
-    score: float  # cross-encoder rerank score
 class SubIntent(TypedDict):
@@ -66,7 +65,7 @@ class PipelineState(TypedDict):
     # ── Session context (set at turn start, stable across nodes) ──────────────
     user_id: str
     persona_profile: dict[str, Any]  # full profile from users.json
-    session_history: Annotated[list[dict], operator.add]  # auto-appended
     turn_id: int
     # ── L1: Sensing outputs ───────────────────────────────────────────────────
@@ -89,10 +88,10 @@ class PipelineState(TypedDict):
     augmented_prompt: str | None
     candidates: list[str]  # 2-3 candidate responses
     selected_response: str | None
-    llm_tier_used: str  # "primary" | "fallback" | "local"
     llm_model_used: str  # actual model name (e.g. "gemma4:31b-cloud")
     # ── L5: Feedback / tracking ───────────────────────────────────────────────
     latency_log: LatencyLog | None
-    mlflow_run_id: str | None
     guardrail_passed: bool

+# Typed state flowing through every pipeline node.
 from __future__ import annotations
+from typing import Any
 from typing_extensions import TypedDict
     text: str
     bucket: str  # family | medical | hobbies | daily_routine | social
     user: str
+    score: float  # cosine similarity from the embedder
 class SubIntent(TypedDict):
     # ── Session context (set at turn start, stable across nodes) ──────────────
     user_id: str
     persona_profile: dict[str, Any]  # full profile from users.json
+    session_history: list[dict]
     turn_id: int
     # ── L1: Sensing outputs ───────────────────────────────────────────────────
     augmented_prompt: str | None
     candidates: list[str]  # 2-3 candidate responses
     selected_response: str | None
+    llm_tier_used: str  # "primary" | "fallback"
     llm_model_used: str  # actual model name (e.g. "gemma4:31b-cloud")
     # ── L5: Feedback / tracking ───────────────────────────────────────────────
     latency_log: LatencyLog | None
+    run_id: str | None  # UUID assigned per turn; logged to logs/turns.jsonl
     guardrail_passed: bool

backend/retrieval/clustering.py DELETED Viewed

@@ -1,84 +0,0 @@
-# HDBSCAN-based semantic bucketing over BGE embeddings.
-from __future__ import annotations
-import json
-import numpy as np
-from backend.config.settings import settings
-from backend.retrieval.vector_store import _get_embedder
-BUCKET_LABELS = ["family", "medical", "hobbies", "daily_routine", "social"]
-def cluster_persona_memories(user_id: str) -> dict[str, list[str]]:
-    # Embed all memory chunks for a persona and cluster with HDBSCAN.
-    import hdbscan
-    memory_path = settings.memories_dir / f"{user_id}.json"
-    with open(memory_path) as f:
-        persona = json.load(f)
-    texts, true_buckets = [], []
-    for bucket, memories in persona["memory_buckets"].items():
-        for mem in memories:
-            texts.append(mem)
-            true_buckets.append(bucket)
-    embedder = _get_embedder()
-    vecs = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
-    clusterer = hdbscan.HDBSCAN(
-        min_cluster_size=3,
-        min_samples=2,
-        metric="euclidean",
-    )
-    labels = clusterer.fit_predict(vecs)
-    clusters: dict[str, list[str]] = {}
-    for text, label, _true_bucket in zip(texts, labels, true_buckets):
-        key = f"cluster_{label}" if label >= 0 else "noise"
-        clusters.setdefault(key, []).append(text)
-    return clusters
-def evaluate_bucket_alignment(user_id: str) -> dict:
-    # Compare HDBSCAN clusters against hand-authored bucket labels, return purity scores.
-    import hdbscan
-    memory_path = settings.memories_dir / f"{user_id}.json"
-    with open(memory_path) as f:
-        persona = json.load(f)
-    texts, true_buckets = [], []
-    for bucket, memories in persona["memory_buckets"].items():
-        for mem in memories:
-            texts.append(mem)
-            true_buckets.append(bucket)
-    embedder = _get_embedder()
-    vecs = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
-    clusterer = hdbscan.HDBSCAN(min_cluster_size=3, min_samples=2, metric="euclidean")
-    pred_labels = clusterer.fit_predict(vecs)
-    cluster_bucket_counts: dict[int, dict[str, int]] = {}
-    for pred, true in zip(pred_labels, true_buckets):
-        cluster_bucket_counts.setdefault(pred, {})
-        cluster_bucket_counts[pred][true] = cluster_bucket_counts[pred].get(true, 0) + 1
-    purity_scores = {}
-    for cluster_id, bucket_counts in cluster_bucket_counts.items():
-        total = sum(bucket_counts.values())
-        dominant = max(bucket_counts.values())
-        purity_scores[cluster_id] = round(dominant / total, 3)
-    return {
-        "n_clusters": len([k for k in purity_scores if k >= 0]),
-        "n_noise": cluster_bucket_counts.get(-1, {}),
-        "cluster_purity": purity_scores,
-        "mean_purity": round(
-            np.mean([v for k, v in purity_scores.items() if k >= 0] or [0.0]), 3
-        ),
-    }

backend/retrieval/vector_store.py CHANGED Viewed

@@ -1,102 +1,96 @@
-# FAISS retrieval with BGE embeddings and cross-encoder reranking.
-from __future__ import annotations
 import json
 from functools import lru_cache
 from pathlib import Path
-import numpy as np
 from backend.config.settings import settings
 from backend.pipeline.state import RetrievedChunk
-@lru_cache(maxsize=1)
-def _get_embedder():
-    from sentence_transformers import SentenceTransformer
-    return SentenceTransformer(settings.embed_model)
-@lru_cache(maxsize=1)
-def _get_reranker():
-    from sentence_transformers import CrossEncoder
-    return CrossEncoder(settings.rerank_model)
 @lru_cache(maxsize=1)
-def _get_faiss():
-    import faiss
-    return faiss
-# ── Index cache (one FAISS index per user_id) ─────────────────────────────────
-_index_cache: dict[str, tuple] = {}
-def load_index(user_id: str):
     if user_id not in _index_cache:
-        faiss = _get_faiss()
         store_path = settings.faiss_store_dir / user_id
-        index = faiss.read_index(str(store_path / "index.faiss"))
         with open(store_path / "meta.json") as f:
             meta = json.load(f)
-        _index_cache[user_id] = (index, meta)
     return _index_cache[user_id]
-# ── Core retrieve function ─────────────────────────────────────────────────────
 def retrieve(
     query: str,
     user_id: str,
     top_k: int = 5,
     rerank_k: int = 3,
     bucket_filter: str | None = None,
-    use_reranker: bool = True,
 ) -> list[RetrievedChunk]:
     embedder = _get_embedder()
-    index, meta = load_index(user_id)
-    q_vec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
-    _, idxs = index.search(q_vec, top_k)
-    candidates = [meta[i] for i in idxs[0] if 0 <= i < len(meta)]
     if bucket_filter:
-        filtered = [c for c in candidates if c["bucket"] == bucket_filter]
         candidates = filtered if filtered else candidates  # fallback: all buckets
-    if use_reranker and len(candidates) > 1:
-        reranker = _get_reranker()
-        pairs = [(query, c["text"]) for c in candidates]
-        ce_scores = reranker.predict(pairs)
-        ranked = sorted(zip(ce_scores, candidates), key=lambda x: x[0], reverse=True)
-        top = [
-            RetrievedChunk(
-                text=c["text"], bucket=c["bucket"], user=c["user"], score=float(s)
-            )
-            for s, c in ranked[:rerank_k]
-        ]
-    else:
-        top = [
-            RetrievedChunk(
-                text=c["text"], bucket=c["bucket"], user=c["user"], score=1.0
-            )
-            for c in candidates[:rerank_k]
-        ]
-    return top
-# ── Index builder ──────────────────────────────────────────────────────────────
-def build_index(persona_path: str | Path):
     with open(persona_path) as f:
         persona = json.load(f)
@@ -109,19 +103,20 @@ def build_index(persona_path: str | Path):
             meta.append({"text": mem, "bucket": bucket, "user": user_name})
     embedder = _get_embedder()
-    vecs = embedder.encode(chunks, convert_to_numpy=True, normalize_embeddings=True)
-    dim = vecs.shape[1]
-    faiss = _get_faiss()
-    index = faiss.IndexFlatIP(dim)
-    index.add(vecs.astype(np.float32))
-    return index, meta
-def save_index(index, meta: list[dict], save_dir: str | Path) -> None:
     p = Path(save_dir)
     p.mkdir(parents=True, exist_ok=True)
-    _get_faiss().write_index(index, str(p / "index.faiss"))
     with open(p / "meta.json", "w") as f:
         json.dump(meta, f, indent=2)
@@ -133,16 +128,15 @@ def build_all(
     memories_dir = Path(memories_dir or settings.memories_dir)
     store_dir = Path(store_dir or settings.faiss_store_dir)
     for persona_file in sorted(memories_dir.glob("*.json")):
         uid = persona_file.stem
         print(f"  Building index for {uid} …")
-        index, meta = build_index(persona_file)
-        save_index(index, meta, store_dir / uid)
         print(f"    Saved {len(meta)} chunks → {store_dir / uid}/")
     print("\nAll indexes built.")
-# ── Entrypoint ────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     build_all()

+# BGE embeddings + torch-tensor cosine search (mps → cuda → cpu).
 import json
 from functools import lru_cache
 from pathlib import Path
+import torch
 from backend.config.settings import settings
 from backend.pipeline.state import RetrievedChunk
+def _select_device() -> str:
+    if torch.backends.mps.is_available():
+        return "mps"
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"
+_DEVICE = _select_device()
+def get_device() -> str:
+    return _DEVICE
 @lru_cache(maxsize=1)
+def _get_embedder():
+    from sentence_transformers import SentenceTransformer
+    return SentenceTransformer(settings.embed_model, device=_DEVICE)
+# Index cache: one (vectors_tensor, meta) per user_id.
+_index_cache: dict[str, tuple[torch.Tensor, list[dict]]] = {}
+def load_index(user_id: str) -> tuple[torch.Tensor, list[dict]]:
     if user_id not in _index_cache:
         store_path = settings.faiss_store_dir / user_id
+        vecs = torch.load(
+            store_path / "vectors.pt", map_location=_DEVICE, weights_only=True
+        )
         with open(store_path / "meta.json") as f:
             meta = json.load(f)
+        _index_cache[user_id] = (vecs, meta)
     return _index_cache[user_id]
+# Retrieve.
 def retrieve(
     query: str,
     user_id: str,
     top_k: int = 5,
     rerank_k: int = 3,
     bucket_filter: str | None = None,
 ) -> list[RetrievedChunk]:
     embedder = _get_embedder()
+    vecs, meta = load_index(user_id)
+    q_vec = embedder.encode(
+        [query],
+        convert_to_tensor=True,
+        normalize_embeddings=True,
+        device=_DEVICE,
+    )[0]
+    scores = vecs @ q_vec  # cosine sim, vectors are L2-normalised
+    k = min(top_k, scores.shape[0])
+    top_scores, top_idxs = torch.topk(scores, k)
+    top_scores_list = top_scores.tolist()
+    top_idxs_list = top_idxs.tolist()
+    candidates = [
+        (top_scores_list[i], meta[idx])
+        for i, idx in enumerate(top_idxs_list)
+        if 0 <= idx < len(meta)
+    ]
     if bucket_filter:
+        filtered = [(s, c) for s, c in candidates if c["bucket"] == bucket_filter]
         candidates = filtered if filtered else candidates  # fallback: all buckets
+    return [
+        RetrievedChunk(
+            text=c["text"], bucket=c["bucket"], user=c["user"], score=float(s)
+        )
+        for s, c in candidates[:rerank_k]
+    ]
+# Index builder.
+def build_index(persona_path: str | Path) -> tuple[torch.Tensor, list[dict]]:
     with open(persona_path) as f:
         persona = json.load(f)
             meta.append({"text": mem, "bucket": bucket, "user": user_name})
     embedder = _get_embedder()
+    vecs = embedder.encode(
+        chunks,
+        convert_to_tensor=True,
+        normalize_embeddings=True,
+        device=_DEVICE,
+    )
+    return vecs, meta
+def save_index(vecs: torch.Tensor, meta: list[dict], save_dir: str | Path) -> None:
     p = Path(save_dir)
     p.mkdir(parents=True, exist_ok=True)
+    # Move to CPU before saving so the file is portable across devices.
+    torch.save(vecs.detach().cpu(), p / "vectors.pt")
     with open(p / "meta.json", "w") as f:
         json.dump(meta, f, indent=2)
     memories_dir = Path(memories_dir or settings.memories_dir)
     store_dir = Path(store_dir or settings.faiss_store_dir)
+    print(f"Embedder device: {_DEVICE}")
     for persona_file in sorted(memories_dir.glob("*.json")):
         uid = persona_file.stem
         print(f"  Building index for {uid} …")
+        vecs, meta = build_index(persona_file)
+        save_index(vecs, meta, store_dir / uid)
         print(f"    Saved {len(meta)} chunks → {store_dir / uid}/")
     print("\nAll indexes built.")
 if __name__ == "__main__":
     build_all()

backend/sensing/air_writing.py DELETED Viewed

@@ -1,155 +0,0 @@
-# Air writing recognition — fingertip trajectory → DTW character matching.
-from __future__ import annotations
-import time
-from dataclasses import dataclass, field
-import numpy as np
-from backend.config.settings import settings
-mp = None
-# ── Landmark index ─────────────────────────────────────────────────────────────
-_INDEX_TIP = 8
-@dataclass
-class AirWriter:
-    """
-    Stateful air-writing recogniser. Feed frames from a webcam loop.
-    Call `get_text()` to retrieve and clear the current buffer.
-    """
-    _trajectory: list[tuple[float, float]] = field(default_factory=list)
-    _in_stroke: bool = False
-    _stroke_end_time: float = field(default=0.0)
-    _text_buffer: list[str] = field(default_factory=list)
-    _templates: dict[str, np.ndarray] = field(default_factory=dict)
-    def __post_init__(self):
-        global mp
-        import mediapipe as mp
-        self._hands = mp.solutions.hands.Hands(
-            static_image_mode=False,
-            max_num_hands=1,
-            min_detection_confidence=0.6,
-            min_tracking_confidence=0.5,
-        )
-        self._prev_pt: tuple[float, float] | None = None
-        self._templates = _load_templates()
-    def process_frame(self, bgr_frame) -> str | None:
-        """
-        Process one frame. Returns a recognised character when a stroke
-        completes, or None otherwise.
-        """
-        import cv2
-        rgb = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
-        result = self._hands.process(rgb)
-        if not result.multi_hand_landmarks:
-            self._prev_pt = None
-            return self._check_stroke_end()
-        h, w = bgr_frame.shape[:2]
-        lm = result.multi_hand_landmarks[0].landmark
-        tip = (lm[_INDEX_TIP].x * w, lm[_INDEX_TIP].y * h)
-        velocity = 0.0
-        if self._prev_pt is not None:
-            velocity = np.linalg.norm(np.array(tip) - np.array(self._prev_pt))
-        self._prev_pt = tip
-        start_v = settings.air_write_velocity_start
-        end_v = settings.air_write_velocity_end
-        if velocity > start_v:
-            self._in_stroke = True
-            self._trajectory.append(tip)
-            self._stroke_end_time = 0.0
-        elif self._in_stroke and velocity < end_v:
-            if self._stroke_end_time == 0.0:
-                self._stroke_end_time = time.time()
-            return self._check_stroke_end()
-        return None
-    def _check_stroke_end(self) -> str | None:
-        if not self._in_stroke or self._stroke_end_time == 0.0:
-            return None
-        gap_s = settings.air_write_end_gap_ms / 1000.0
-        if time.time() - self._stroke_end_time >= gap_s:
-            char = self._recognise(self._trajectory)
-            self._trajectory = []
-            self._in_stroke = False
-            self._stroke_end_time = 0.0
-            if char:
-                self._text_buffer.append(char)
-            return char
-        return None
-    def _recognise(self, trajectory: list[tuple[float, float]]) -> str | None:
-        if len(trajectory) < 5 or not self._templates:
-            return None
-        query = _normalise_trajectory(np.array(trajectory))
-        best_char, best_dist = None, float("inf")
-        for char, template in self._templates.items():
-            dist = _dtw_distance(query, template)
-            if dist < best_dist:
-                best_dist = dist
-                best_char = char
-        return best_char
-    def get_text(self) -> str:
-        """Return and clear the accumulated air-written text."""
-        text = "".join(self._text_buffer)
-        self._text_buffer.clear()
-        return text
-    def release(self):
-        self._hands.close()
-# ── DTW helpers ───────────────────────────────────────────────────────────────
-def _normalise_trajectory(pts: np.ndarray) -> np.ndarray:
-    """Scale trajectory to unit bounding box, resample to 32 points."""
-    pts = pts - pts.min(axis=0)
-    scale = pts.max(axis=0) + 1e-6
-    pts = pts / scale
-    # Resample to fixed length via linear interpolation
-    t_old = np.linspace(0, 1, len(pts))
-    t_new = np.linspace(0, 1, 32)
-    return np.column_stack(
-        [
-            np.interp(t_new, t_old, pts[:, 0]),
-            np.interp(t_new, t_old, pts[:, 1]),
-        ]
-    )
-def _dtw_distance(a: np.ndarray, b: np.ndarray) -> float:
-    """Simple O(n²) DTW — trajectories are short (32 pts), so this is fine."""
-    n, m = len(a), len(b)
-    dtw = np.full((n + 1, m + 1), np.inf)
-    dtw[0, 0] = 0.0
-    for i in range(1, n + 1):
-        for j in range(1, m + 1):
-            cost = np.linalg.norm(a[i - 1] - b[j - 1])
-            dtw[i, j] = cost + min(dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1])
-    return float(dtw[n, m])
-def _load_templates() -> dict[str, np.ndarray]:
-    template_dir = settings.data_dir / "air_write_templates"
-    if not template_dir.exists():
-        return {}
-    templates = {}
-    for f in template_dir.glob("*.npy"):
-        char = f.stem  # filename = character label
-        templates[char] = np.load(f)
-    return templates

backend/sensing/face_mesh.py DELETED Viewed

@@ -1,145 +0,0 @@
-# Facial affect detection via MediaPipe Face Mesh (MAR/EAR/BRI/LCP → emotion).
-from __future__ import annotations
-from dataclasses import dataclass, field
-import numpy as np
-from backend.config.settings import settings
-from backend.pipeline.state import AffectState, AffectVector
-mp = None
-cv2 = None
-# ── MediaPipe landmark indices ────────────────────────────────────────────────
-# MAR — mouth vertical / horizontal ratio
-_MOUTH_TOP = 13
-_MOUTH_BOTTOM = 14
-_MOUTH_LEFT = 61
-_MOUTH_RIGHT = 291
-# EAR — eye vertical / horizontal ratio (right eye)
-_EYE_TOP = 159
-_EYE_BOTTOM = 145
-_EYE_LEFT = 33
-_EYE_RIGHT = 133
-# BRI — brow vertical displacement relative to eye centre
-_BROW_LEFT = 70
-_BROW_RIGHT = 300
-# LCP — mouth corner horizontal displacement from neutral baseline
-_CORNER_LEFT = 61
-_CORNER_RIGHT = 291
-# ── Affect classes ────────────────────────────────────────────────────────────
-AFFECT_CLASSES = ["HAPPY", "FRUSTRATED", "NEUTRAL", "SURPRISED"]
-@dataclass
-class AffectDetector:
-    """
-    Stateful detector that maintains EMA-smoothed affect across frames.
-    Create one instance per session and call `process_frame` each frame.
-    """
-    _smoothed: AffectVector = field(
-        default_factory=lambda: AffectVector(MAR=0.0, EAR=0.3, BRI=0.0, LCP=0.0)
-    )
-    _neutral_lcp: float = 0.0  # calibrated at session start
-    _calibrated: bool = False
-    def __post_init__(self):
-        global mp, cv2
-        import cv2
-        import mediapipe as mp
-        self._face_mesh = mp.solutions.face_mesh.FaceMesh(
-            static_image_mode=False,
-            max_num_faces=1,
-            refine_landmarks=True,  # enables iris landmarks (468-477)
-            min_detection_confidence=0.5,
-            min_tracking_confidence=0.5,
-        )
-    def process_frame(self, bgr_frame: np.ndarray) -> AffectState | None:
-        """
-        Process one BGR frame from OpenCV and return the current AffectState,
-        or None if no face is detected.
-        """
-        rgb = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
-        result = self._face_mesh.process(rgb)
-        if not result.multi_face_landmarks:
-            return None
-        lm = result.multi_face_landmarks[0].landmark
-        h, w = bgr_frame.shape[:2]
-        def pt(idx):
-            l = lm[idx]
-            return np.array([l.x * w, l.y * h])
-        raw = self._compute_features(pt)
-        if not self._calibrated:
-            self._neutral_lcp = raw["LCP"]
-            self._calibrated = True
-        raw["LCP"] = raw["LCP"] - self._neutral_lcp  # relative to neutral baseline
-        alpha = settings.affect_ema_alpha
-        smoothed = AffectVector(
-            MAR=alpha * raw["MAR"] + (1 - alpha) * self._smoothed["MAR"],
-            EAR=alpha * raw["EAR"] + (1 - alpha) * self._smoothed["EAR"],
-            BRI=alpha * raw["BRI"] + (1 - alpha) * self._smoothed["BRI"],
-            LCP=alpha * raw["LCP"] + (1 - alpha) * self._smoothed["LCP"],
-        )
-        self._smoothed = smoothed
-        emotion = self._classify(smoothed)
-        return AffectState(emotion=emotion, vector=raw, smoothed=smoothed)
-    def _compute_features(self, pt) -> dict:
-        # MAR
-        mouth_v = np.linalg.norm(pt(_MOUTH_TOP) - pt(_MOUTH_BOTTOM))
-        mouth_h = np.linalg.norm(pt(_MOUTH_LEFT) - pt(_MOUTH_RIGHT))
-        MAR = mouth_v / (mouth_h + 1e-6)
-        # EAR
-        eye_v = np.linalg.norm(pt(_EYE_TOP) - pt(_EYE_BOTTOM))
-        eye_h = np.linalg.norm(pt(_EYE_LEFT) - pt(_EYE_RIGHT))
-        EAR = eye_v / (eye_h + 1e-6)
-        # BRI — average brow displacement relative to eye centre
-        eye_center = (pt(_EYE_LEFT) + pt(_EYE_RIGHT)) / 2
-        inter_ocular = eye_h
-        brow_mid = (pt(_BROW_LEFT) + pt(_BROW_RIGHT)) / 2
-        BRI = (eye_center[1] - brow_mid[1]) / (inter_ocular + 1e-6)
-        # LCP — average horizontal mouth corner displacement
-        LCP = float((pt(_CORNER_LEFT)[0] + pt(_CORNER_RIGHT)[0]) / 2)
-        return {
-            "MAR": float(MAR),
-            "EAR": float(EAR),
-            "BRI": float(BRI),
-            "LCP": float(LCP),
-        }
-    @staticmethod
-    def _classify(v: AffectVector) -> str:
-        if v["BRI"] > 0.25 and v["MAR"] > 0.3:
-            return "SURPRISED"
-        if v["EAR"] < 0.15 and v["LCP"] < -5:
-            return "FRUSTRATED"
-        if v["LCP"] > 5:
-            return "HAPPY"
-        return "NEUTRAL"
-    def release(self):
-        self._face_mesh.close()

backend/sensing/gaze.py DELETED Viewed

@@ -1,92 +0,0 @@
-# Gaze-based retrieval bucket hinting via MediaPipe iris landmarks.
-from __future__ import annotations
-import time
-from dataclasses import dataclass, field
-from backend.config.settings import settings
-mp = None
-# ── Iris landmark indices ──────────────────────────────────────────────────────
-# MediaPipe refine_landmarks=True adds iris landmarks 468-477
-_LEFT_IRIS_CENTER = 468
-_RIGHT_IRIS_CENTER = 473
-# ── Screen region → bucket map ─────────────────────────────────────────────────
-# Defined as (x_min, y_min, x_max, y_max) in normalised [0,1] coords
-_REGION_BUCKET: list[tuple[tuple[float, float, float, float], str]] = [
-    ((0.3, 0.3, 0.7, 0.7), "social"),  # centre checked first (most specific)
-    ((0.0, 0.0, 0.5, 0.5), "family"),
-    ((0.5, 0.0, 1.0, 0.5), "medical"),
-    ((0.0, 0.5, 0.5, 1.0), "hobbies"),
-    ((0.5, 0.5, 1.0, 1.0), "daily_routine"),
-]
-@dataclass
-class GazeTracker:
-    """
-    Stateful gaze tracker. Call `process_frame` each frame.
-    Returns the bucket name when dwell threshold is exceeded, else None.
-    """
-    _dwell_start: float = field(default=0.0)
-    _current_region: str | None = field(default=None)
-    def __post_init__(self):
-        global mp
-        import mediapipe as mp
-        self._face_mesh = mp.solutions.face_mesh.FaceMesh(
-            static_image_mode=False,
-            max_num_faces=1,
-            refine_landmarks=True,
-            min_detection_confidence=0.5,
-            min_tracking_confidence=0.5,
-        )
-    def process_frame(self, bgr_frame) -> str | None:
-        import cv2
-        rgb = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
-        result = self._face_mesh.process(rgb)
-        if not result.multi_face_landmarks:
-            self._reset()
-            return None
-        lm = result.multi_face_landmarks[0].landmark
-        # Average left + right iris centres for gaze estimate
-        gaze_x = (lm[_LEFT_IRIS_CENTER].x + lm[_RIGHT_IRIS_CENTER].x) / 2
-        gaze_y = (lm[_LEFT_IRIS_CENTER].y + lm[_RIGHT_IRIS_CENTER].y) / 2
-        bucket = self._region_for(gaze_x, gaze_y)
-        if bucket != self._current_region:
-            self._current_region = bucket
-            self._dwell_start = time.time()
-            return None
-        dwell = time.time() - self._dwell_start
-        if dwell >= settings.gaze_dwell_threshold_s and bucket is not None:
-            self._reset()
-            return bucket
-        return None
-    @staticmethod
-    def _region_for(x: float, y: float) -> str | None:
-        for (x0, y0, x1, y1), bucket in _REGION_BUCKET:
-            if x0 <= x <= x1 and y0 <= y <= y1:
-                return bucket
-        return None
-    def _reset(self):
-        self._dwell_start = 0.0
-        self._current_region = None
-    def release(self):
-        self._face_mesh.close()

backend/sensing/gesture.py DELETED Viewed

@@ -1,102 +0,0 @@
-# Hand gesture recognition via MediaPipe Hands.
-from __future__ import annotations
-import numpy as np
-mp = None
-# Gesture → prompt constraint tag mapping
-GESTURE_TO_TAG: dict[str, str] = {
-    "THUMBS_UP": "[GESTURE:THUMBS_UP][TONE:AFFIRMATIVE]",
-    "THUMBS_DOWN": "[GESTURE:THUMBS_DOWN][TONE:NEGATIVE]",
-    "POINTING": "[GESTURE:POINTING][INTENT:REFERENTIAL]",
-    "WAVING": "[GESTURE:WAVING][INTENT:GREETING]",
-}
-class GestureClassifier:
-    """
-    Stateful classifier — create one instance per session.
-    Feed BGR frames to `process_frame` each frame; MediaPipe Hands runs internally.
-    """
-    def __init__(self):
-        global mp
-        import mediapipe as mp
-        self._hands = mp.solutions.hands.Hands(
-            static_image_mode=False,
-            max_num_hands=1,
-            min_detection_confidence=0.6,
-            min_tracking_confidence=0.5,
-        )
-    def process_frame(self, bgr_frame) -> str | None:
-        """
-        Returns a gesture label string or None if no clear gesture is detected.
-        """
-        import cv2
-        rgb = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
-        result = self._hands.process(rgb)
-        if not result.multi_hand_landmarks:
-            return None
-        lm = result.multi_hand_landmarks[0].landmark
-        pts = np.array([[l.x, l.y, l.z] for l in lm])
-        return self._classify(pts)
-    def gesture_tag(self, bgr_frame) -> str | None:
-        """Convenience: returns the prompt tag directly, or None."""
-        gesture = self.process_frame(bgr_frame)
-        return GESTURE_TO_TAG.get(gesture) if gesture else None
-    @staticmethod
-    def _classify(pts: np.ndarray) -> str | None:
-        # Normalise: wrist at origin, scale by palm width
-        wrist = pts[0]
-        palm_width = np.linalg.norm(pts[5] - pts[17]) + 1e-6
-        p = (pts - wrist) / palm_width
-        thumb_tip = p[4]
-        index_tip = p[8]
-        middle_tip = p[12]
-        ring_tip = p[16]
-        pinky_tip = p[20]
-        index_mcp = p[5]  # knuckle
-        # THUMBS_UP: thumb tip above wrist, other fingers curled
-        fingers_curled = all(
-            np.linalg.norm(tip) < np.linalg.norm(mcp)
-            for tip, mcp in [(index_tip, p[5]), (middle_tip, p[9]), (ring_tip, p[13])]
-        )
-        if thumb_tip[1] < -0.3 and fingers_curled:
-            return "THUMBS_UP"
-        # THUMBS_DOWN: thumb tip below wrist, other fingers curled
-        if thumb_tip[1] > 0.3 and fingers_curled:
-            return "THUMBS_DOWN"
-        # POINTING: index extended, others curled
-        index_extended = np.linalg.norm(index_tip) > np.linalg.norm(index_mcp) * 1.3
-        others_curled = all(
-            np.linalg.norm(tip) < 0.5 for tip in [middle_tip, ring_tip, pinky_tip]
-        )
-        if index_extended and others_curled:
-            return "POINTING"
-        # WAVING: index, middle, ring and pinky all extended (orientation is not checked)
-        all_extended = all(
-            np.linalg.norm(tip) > 0.5
-            for tip in [index_tip, middle_tip, ring_tip, pinky_tip]
-        )
-        if all_extended:
-            return "WAVING"
-        return None
-    def release(self):
-        self._hands.close()

backend/sensing/labels.py ADDED Viewed

	@@ -0,0 +1,6 @@

+GESTURE_TO_TAG: dict[str, str] = {
+    "THUMBS_UP": "[GESTURE:THUMBS_UP][TONE:AFFIRMATIVE]",
+    "THUMBS_DOWN": "[GESTURE:THUMBS_DOWN][TONE:NEGATIVE]",
+    "POINTING": "[GESTURE:POINTING][INTENT:REFERENTIAL]",
+    "WAVING": "[GESTURE:WAVING][INTENT:GREETING]",
+}

backend/ui/app.py DELETED Viewed

@@ -1,163 +0,0 @@
-"""
-Streamlit frontend — webcam + chat + live metrics dashboard.
-Panels:
-  Left sidebar  — persona selector, session controls, live affect display
-  Centre        — chat interface with streaming response
-  Right sidebar — latency breakdown, bucket priors bar chart
-Run: streamlit run ui/app.py
-"""
-from __future__ import annotations
-import requests
-import streamlit as st
-# ── Config ─────────────────────────────────────────────────────────────────────
-API_BASE = "http://localhost:8000"
-st.set_page_config(
-    page_title="AAC Chatbot",
-    layout="wide",
-    initial_sidebar_state="expanded",
-)
-# ── Session state init ─────────────────────────────────────────────────────────
-if "user_id" not in st.session_state:
-    st.session_state.user_id = None
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-if "last_latency" not in st.session_state:
-    st.session_state.last_latency = {}
-if "last_affect" not in st.session_state:
-    st.session_state.last_affect = "NEUTRAL"
-if "affect_override" not in st.session_state:
-    st.session_state.affect_override = None
-# ── Sidebar ────────────────────────────────────────────────────────────────────
-with st.sidebar:
-    st.title("AAC Chatbot")
-    # Persona selection
-    try:
-        users_resp = requests.get(f"{API_BASE}/users", timeout=3)
-        users = users_resp.json().get("users", [])
-    except Exception:
-        users = []
-        st.error("API not reachable — start the FastAPI server first.")
-    user_options = {u["id"]: f"{u['name']} ({u['condition']})" for u in users}
-    selected = st.selectbox(
-        "Select persona",
-        options=list(user_options.keys()),
-        format_func=lambda k: user_options.get(k, k),
-    )
-    if selected != st.session_state.user_id:
-        st.session_state.user_id = selected
-        st.session_state.messages = []
-        try:
-            requests.post(f"{API_BASE}/session/reset", params={"user_id": selected})
-        except Exception:
-            pass
-    st.divider()
-    # Affect override (for demo / testing without webcam)
-    st.subheader("Affect Override")
-    st.caption("Simulates webcam affect detection")
-    affect_choice = st.radio(
-        "Current affect",
-        ["Auto (webcam)", "HAPPY", "FRUSTRATED", "NEUTRAL", "SURPRISED"],
-        index=0,
-    )
-    st.session_state.affect_override = (
-        None if affect_choice == "Auto (webcam)" else affect_choice
-    )
-    st.divider()
-    # Live affect indicator
-    st.subheader("Detected Affect")
-    affect_emoji = {
-        "HAPPY": "😊",
-        "FRUSTRATED": "😤",
-        "NEUTRAL": "😐",
-        "SURPRISED": "😲",
-    }
-    af = st.session_state.last_affect
-    st.markdown(f"### {affect_emoji.get(af, '❓')} {af}")
-    # Webcam placeholder
-    st.divider()
-    st.subheader("Webcam Feed")
-    st.info(
-        "Live webcam sensing runs in the sensing client.\nAffect is sent to the API automatically."
-    )
-# ── Main chat area ─────────────────────────────────────────────────────────────
-st.header(f"Talking as: {user_options.get(st.session_state.user_id, '—')}")
-chat_col, metrics_col = st.columns([3, 1])
-with chat_col:
-    for msg in st.session_state.messages:
-        role_label = "Partner" if msg["role"] == "partner" else "AAC User"
-        with st.chat_message("user" if msg["role"] == "partner" else "assistant"):
-            st.markdown(f"**{role_label}:** {msg['content']}")
-    query = st.chat_input("Type as the communication partner…")
-    if query and st.session_state.user_id:
-        st.session_state.messages.append({"role": "partner", "content": query})
-        with st.chat_message("user"):
-            st.markdown(f"**Partner:** {query}")
-        with st.chat_message("assistant"):
-            with st.spinner("Generating response…"):
-                try:
-                    payload = {
-                        "user_id": st.session_state.user_id,
-                        "query": query,
-                        "affect_override": st.session_state.affect_override,
-                    }
-                    resp = requests.post(f"{API_BASE}/chat", json=payload, timeout=15)
-                    resp.raise_for_status()
-                    data = resp.json()
-                    response_text = data.get("response", "I don't know.")
-                    st.markdown(f"**AAC User:** {response_text}")
-                    st.session_state.messages.append(
-                        {"role": "aac_user", "content": response_text}
-                    )
-                    st.session_state.last_affect = data.get("affect", "NEUTRAL")
-                    st.session_state.last_latency = data.get("latency", {})
-                    if not data.get("guardrail_passed", True):
-                        st.warning("⚠ Guardrail triggered — response was sanitised.")
-                except requests.exceptions.Timeout:
-                    st.error("Request timed out. Is the server running?")
-                except Exception as e:
-                    st.error(f"Error: {e}")
-with metrics_col:
-    st.subheader("Turn Latency (s)")
-    lat = st.session_state.last_latency
-    if lat:
-        for key, label in [
-            ("t_sensing", "Sensing"),
-            ("t_intent", "Intent"),
-            ("t_retrieval", "Retrieval"),
-            ("t_generation", "Generation"),
-            ("t_total", "**Total**"),
-        ]:
-            val = lat.get(key, 0.0)
-            st.metric(label=label, value=f"{val:.3f}s")
-    else:
-        st.caption("No turn yet.")

requirements.txt CHANGED Viewed

@@ -1,38 +1,19 @@
-# ── Orchestration ──────────────────────────────────────────────────────────────
-langgraph>=1.1
-langchain-core>=0.2
-pydantic>=2.0
-pydantic-settings>=2.0
-# ── LLM clients ────────────────────────────────────────────────────────────────
-openai>=1.0          # OpenAI-compatible client for vLLM + Ollama
-ollama>=0.2          # local dev fallback (direct Ollama SDK)
 # ── Retrieval ──────────────────────────────────────────────────────────────────
-faiss-cpu>=1.7
 sentence-transformers>=3.0
 torch>=2.0
 transformers>=4.40
 numpy>=1.24
-# ── Clustering ─────────────────────────────────────────────────────────────────
-hdbscan>=0.8.29
-scikit-learn>=1.3
-# ── Sensing ────────────────────────────────────────────────────────────────────
-mediapipe>=0.10
-opencv-python>=4.8
 # ── API backend ────────────────────────────────────────────────────────────────
 fastapi>=0.111
 uvicorn[standard]>=0.29
-# ── UI ─────────────────────────────────────────────────────────────────────────
-streamlit>=1.35
-requests>=2.31      # Streamlit → FastAPI calls
-# ── Experiment tracking ────────────────────────────────────────────────────────
-mlflow>=2.13
 # ── Utilities ──────────────────────────────────────────────────────────────────
 python-dotenv>=1.0

+# ── LLM client ────────────────────────────────────────────────────────────────
+openai>=1.0          # talks to Ollama Cloud over OpenAI-compatible HTTP
 # ── Retrieval ──────────────────────────────────────────────────────────────────
 sentence-transformers>=3.0
 torch>=2.0
 transformers>=4.40
 numpy>=1.24
 # ── API backend ────────────────────────────────────────────────────────────────
 fastapi>=0.111
 uvicorn[standard]>=0.29
+# ── Config / validation ───────────────────────────────────────────────────────
+pydantic>=2.0
+pydantic-settings>=2.0
 # ── Utilities ──────────────────────────────────────────────────────────────────
 python-dotenv>=1.0

setup.sh CHANGED Viewed

@@ -10,10 +10,8 @@ ok()    { printf "\033[1;32m==> %s\033[0m\n" "$1"; }
 warn()  { printf "\033[1;33m==> %s\033[0m\n" "$1"; }
 fail()  { printf "\033[1;31mERROR: %s\033[0m\n" "$1"; exit 1; }
-# ── Pre-flight: conda ────────────────────────────────────────────────────────
 command -v conda >/dev/null 2>&1 || fail "conda not found. Install Miniconda/Anaconda first."
-# ── Conda environment ────────────────────────────────────────────────────────
 if conda info --envs | grep -q "^${CONDA_ENV} "; then
   info "Conda env '$CONDA_ENV' already exists — reusing it"
 else
@@ -26,38 +24,29 @@ fi
 eval "$(conda shell.bash hook)"
 conda activate "$CONDA_ENV"
-# ── Install dependencies ─────────────────────────────────────────────────────
 info "Installing Python dependencies..."
 pip install --upgrade pip --quiet
 pip install -r requirements.txt --quiet
 ok "Dependencies installed"
-# ── Environment file ─────────────────────────────────────────────────────────
 if [ -f "$ENV_FILE" ]; then
   warn ".env already exists — skipping copy (review $ENV_EXAMPLE for new vars)"
 else
   info "Copying $ENV_EXAMPLE → $ENV_FILE..."
   cp "$ENV_EXAMPLE" "$ENV_FILE"
-  ok ".env created — edit it to configure LLM tiers and endpoints"
 fi
-# ── FAISS index build ────────────────────────────────────────────────────────
-info "Building FAISS indexes (downloads BGE embedder + reranker on first run)..."
 python -m backend.retrieval.vector_store
-ok "FAISS indexes built in data/faiss_store/"
-# ── Ollama model pull ────────────────────────────────────────────────────────
 if ! command -v ollama >/dev/null 2>&1; then
   warn "Ollama not installed — install it from https://ollama.com then re-run this script"
-else
-  LOCAL_MODEL=$(grep -E '^LOCAL_MODEL=' "$ENV_FILE" 2>/dev/null | cut -d= -f2 | sed 's/#.*//' | tr -d ' ' || echo "qwen3:8b")
-  [ -z "$LOCAL_MODEL" ] && LOCAL_MODEL="qwen3:8b"
-  info "Pulling Ollama model: $LOCAL_MODEL (skips if already pulled)..."
-  ollama pull "$LOCAL_MODEL"
-  ok "Ollama model $LOCAL_MODEL ready"
 fi
-# ── Frontend dependencies ────────────────────────────────────────────────────
 if command -v pnpm >/dev/null 2>&1; then
   info "Installing frontend dependencies..."
   pnpm --dir frontend install --silent
@@ -66,7 +55,6 @@ else
   warn "pnpm not found — install it (npm i -g pnpm) then run: pnpm --dir frontend install"
 fi
-# ── Done ──────────────────────────────────────────────────────────────────────
 echo ""
 ok "Setup complete!"
 echo ""

 warn()  { printf "\033[1;33m==> %s\033[0m\n" "$1"; }
 fail()  { printf "\033[1;31mERROR: %s\033[0m\n" "$1"; exit 1; }
 command -v conda >/dev/null 2>&1 || fail "conda not found. Install Miniconda/Anaconda first."
 if conda info --envs | grep -q "^${CONDA_ENV} "; then
   info "Conda env '$CONDA_ENV' already exists — reusing it"
 else
 eval "$(conda shell.bash hook)"
 conda activate "$CONDA_ENV"
 info "Installing Python dependencies..."
 pip install --upgrade pip --quiet
 pip install -r requirements.txt --quiet
 ok "Dependencies installed"
 if [ -f "$ENV_FILE" ]; then
   warn ".env already exists — skipping copy (review $ENV_EXAMPLE for new vars)"
 else
   info "Copying $ENV_EXAMPLE → $ENV_FILE..."
   cp "$ENV_EXAMPLE" "$ENV_FILE"
+  ok ".env created — edit it to configure Ollama Cloud model names"
 fi
+info "Building vector indexes (downloads BGE-small embedder on first run)..."
 python -m backend.retrieval.vector_store
+ok "Vector indexes built in data/faiss_store/"
+# Ollama: tiers point at Ollama Cloud — no local pull needed. Just check that
+# the `ollama` CLI is installed (the daemon itself is not probed) so the
+# OpenAI-compatible proxy can work.
 if ! command -v ollama >/dev/null 2>&1; then
   warn "Ollama not installed — install it from https://ollama.com then re-run this script"
 fi
 if command -v pnpm >/dev/null 2>&1; then
   info "Installing frontend dependencies..."
   pnpm --dir frontend install --silent
   warn "pnpm not found — install it (npm i -g pnpm) then run: pnpm --dir frontend install"
 fi
 echo ""
 ok "Setup complete!"
 echo ""