diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..56a617463bbfc01cd95baacdd74b6b577c50ac6a
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,39 @@
+# Exclude everything that is not needed inside the Docker image.
+# This keeps the build context small and prevents large data files
+# from being sent to the Docker daemon.
+
+# Python virtual environment
+venv/
+.venv/
+
+# nuScenes dataset (large binary data – not needed in the Space)
+DataSet/
+
+# Frontend (deployed separately on Vercel)
+frontend/
+node_modules/
+
+# Training artefacts and legacy scripts
+archive/
+log/
+
+# Byte-compiled / cache files
+__pycache__/
+**/__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.egg-info/
+dist/
+build/
+
+# Editor and OS artefacts
+.git/
+.gitignore
+.DS_Store
+*.swp
+*.swo
+
+# Unused model checkpoints (only the two needed ones are copied explicitly)
+models/best_cv_synced_model.pth
+models/best_social_model_fusion_smoke.pth
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d81f164e10c1450a5602f5ca5ff0bd9f2b662acd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.env
+venv/
+.venv/
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..affdfd417a05724db8fef0ef93a585f878cb0baa
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,39 @@
+# Hugging Face Spaces — Docker deployment
+# SDK: docker  |  app_port: 7860
+#
+# Build context is the repo root.
+# Only backend/ and models/ are copied in; DataSet/ is intentionally excluded.
+
+FROM python:3.11-slim
+
+# System libraries required by opencv-python-headless
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libglib2.0-0 \
+        libgl1-mesa-glx \
+    && rm -rf /var/lib/apt/lists/*
+
+# HF Spaces runs containers as uid 1000 by default
+RUN useradd -m -u 1000 appuser
+
+WORKDIR /app
+
+# Install CPU-only PyTorch first so pip does not download the large CUDA wheels
+RUN pip install --no-cache-dir \
+        torch==2.2.2+cpu \
+        torchvision==0.17.2+cpu \
+        --index-url https://download.pytorch.org/whl/cpu
+
+# Install remaining API dependencies
+COPY requirements-hf.txt requirements-hf.txt
+RUN pip install --no-cache-dir -r requirements-hf.txt
+
+# Copy only what the API needs at runtime
+COPY --chown=appuser:appuser backend/ backend/
+COPY --chown=appuser:appuser models/  models/
+
+USER appuser
+
+EXPOSE 7860
+
+CMD ["python", "-m", "uvicorn", "backend.app.main:app", \
+     "--host", "0.0.0.0", "--port", "7860"]
diff --git a/README.md b/README.md
index 116d7699144c0ac5fec674d7cdb72e9d48f82734..9504e943f0cf808ff5ac1c436877c610033cf3bf 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,390 @@
 ---
-title: IntentDrive
-emoji: 📈
-colorFrom: purple
-colorTo: yellow
+title: IntentDrive BEV Trajectory Backend
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
-license: mit
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# IntentDrive — Road User Trajectory Prediction
+
+An end-to-end trajectory forecasting system for vulnerable road users (VRUs). The system connects camera-based perception, lightweight multi-agent tracking, and a transformer-based social forecasting model through a structured FastAPI backend and a React visualization dashboard.
+
+> **Competition:** Computer Vision Challenge — AI and Computer Vision Track
+> **Team:** 4% | **Lead:** Sajith J | **Institution:** Sri Shakthi Institute of Engineering & Technology
+
+---
+
+## Problem Statement
+
+In Level 4 autonomous driving, reacting to the *current* position of pedestrians and cyclists is insufficient. VRUs can behave unpredictably and may be occluded behind vehicles or other objects. This project builds a system that uses **2 seconds of past motion history** to predict the **next 6 seconds of future trajectory**, enabling safer and more proactive decisions.
+
+> **"Math over Pixels"** — our deliberate architectural decision. Rather than relying purely on visual signals, we model the underlying kinematics and social interactions of agents, making the system robust to occlusion and poor lighting.
+
+Real-world context: A Waymo robotaxi struck a child near Grant Elementary School in Santa Monica on January 23, 2026, causing minor injuries. Systems like IntentDrive are designed to anticipate such scenarios before they occur.
+
+---
+
+## Project Overview
+
+This project addresses the problem of safety-critical motion forecasting for pedestrians, cyclists, and motorcyclists in autonomous driving scenarios. Given a short observed history of agent positions, the system predicts **K=3 multimodal 6-second future trajectories** (12 future steps) along with per-mode probability scores.
+
+The full pipeline includes:
+
+- Object detection and optional keypoint extraction from camera frames
+- Image-to-BEV coordinate conversion using camera intrinsics and scene geometry
+- Temporal tracking to build per-agent motion histories
+- Social context construction from neighboring agent tracks within a **50-meter radius**
+- Transformer-based trajectory forecasting with goal-conditioned multimodal decoding
+- LiDAR and radar fusion for improved short-term kinematic estimation
+- FastAPI backend serving inference, live frame access, and health endpoints
+- React + TypeScript dashboard for BEV scene visualization, trajectory rendering, and sensor overlay
+
+---
+
+## System Architecture
+
+The pipeline operates across five stages:
+
+**Stage 1 — Data Ingestion & Preprocessing**
+Multi-sensor input (6x cameras, LiDAR_TOP, 5x radar channels) is ingested from nuScenes. Timestamps are synchronized via sample-token matching. All sensor readings are projected into a unified ego-centric BEV coordinate frame using sensor-to-ego calibration matrices and quaternion-to-yaw conversion.
+
+**Stage 2 — Feature Extraction**
+Three parallel branches process sensor data simultaneously:
+- **Camera branch:** Faster R-CNN (ResNet50-FPN) for multi-class object detection + Keypoint R-CNN for 17-point human pose estimation
+- **LiDAR branch:** Occupancy and depth geometry extraction
+- **Radar branch:** Velocity vectors and Doppler motion cues
+
+**Stage 3 — Fusion & Tracking**
+Cross-sensor fusion combines semantic detections, spatial geometry, and motion dynamics into unified agent representations. Multi-object tracking maintains consistent IDs across frames using nearest-neighbor IoU matching with pixel gating. Motion encoding builds a 4-step history of (x, y, velocity_x, velocity_y, speed, heading_sin, heading_cos) per agent.
+
+**Stage 4 — Model Inference**
+A goal-conditioned Trajectory Transformer with social attention predicts 3 trajectory modes, each 12 steps (6 seconds) into the future. Post-processing assigns direction labels (Straight / Left / Right / Backward) and top-3 probabilities per VRU.
+
+**Stage 5 — Deployment & Visualization**
+Outputs include camera overlay with bounding boxes and skeleton paths, a holographic skeleton panel for explainability, and a fused BEV map with direction probabilities.
+
+---
+
+## Model Architecture
+
+### Base Model: TrajectoryTransformer
+
+The base model (`backend/app/ml/model.py`) is a goal-conditioned multimodal trajectory forecaster operating on 4-step observed windows with 7 features per timestep: x, y, velocity_x, velocity_y, speed, heading_sin, heading_cos.
+
+**Components:**
+
+| Component | Description |
+|---|---|
+| Feature Embedding | Linear projection from 7 input features to d_model=64 |
+| Positional Encoding | Sinusoidal positional encoding over the observed sequence |
+| Temporal Encoder | 2-layer TransformerEncoder, 4 attention heads, feedforward dim 256 |
+| Social Attention | Multi-head attention pooling over encoded neighbor agent representations, 4 heads |
+| Goal Head | MLP predicting K=3 distinct 2D endpoint goals from the combined context |
+| Trajectory Head | MLP conditioned on context + each predicted goal; outputs a 12-step path per mode |
+| Probability Head | Linear layer with softmax producing per-mode confidence scores |
+
+**Forward pass summary:**
+
+1. Each agent's 4-step observed sequence is embedded and positionally encoded.
+2. The TransformerEncoder produces a context vector from the final timestep.
+3. Each neighboring agent within the social radius is independently encoded and pooled into a social context vector via cross-attention.
+4. Target and social context vectors are concatenated to form a 128-dimensional hidden state.
+5. K=3 goal endpoints are predicted from the hidden state.
+6. Each goal is concatenated back to the hidden state to condition the trajectory decoder, producing 3 independent 12-step trajectory modes.
+7. Mode probabilities are produced via a linear + softmax head.
+
+**Loss function:**
+
+The training objective combines four terms:
+
+- Best-of-K trajectory loss (minimum L2 error over K modes)
+- Goal loss (L2 distance from the best-mode predicted endpoint to ground truth endpoint)
+- Probability cross-entropy loss (supervising the mode probability head)
+- Diversity regularization loss (penalizes mode collapse via exponential repulsion between modes)
+
+### Fusion Model: TrajectoryTransformerFusion
+
+The fusion variant (`backend/app/ml/model_fusion.py`) extends the base model with a sensor-aware input branch. In addition to the standard 7-feature kinematic input, per-timestep fusion features of dimension 3 are accepted: normalized LiDAR point count, normalized radar point count, and composite sensor strength. These fusion features are projected to d_model=64 via a separate linear layer, added to the base kinematic embedding, and normalized with LayerNorm before entering the shared TransformerEncoder. The fusion model supports loading weights from a base model checkpoint for initialization.
+
+---
+
+## Dataset
+
+**Source:** nuScenes mini split (V1.0-mini), annotations loaded via nuScenes JSON tables. The model was trained and evaluated exclusively using the provided dataset, without incorporating any external data sources.
+
+**Target classes:** pedestrian, bicycle, motorcycle
+
+**Sensors used:** 6x cameras, LIDAR_TOP, 5x radar channels
+
+**Windowing:**
+- Takes a **2-second history** of motion as input (4 observed steps at 2 Hz)
+- Outputs **K=3 multimodal trajectory predictions over a 6-second prediction horizon** (12 future steps at 2 Hz), each with an associated probability score
+
+**Input features per observed step:**
+- x, y position (BEV meters)
+- velocity_x, velocity_y (m/s)
+- speed (m/s)
+- heading_sin, heading_cos (unit circle encoding)
+
+**Social context radius:** 50 meters
+
+**Data augmentation (training split only):** random rotation, horizontal reflection, Gaussian coordinate noise injection
+
+**Split protocol:** deterministic 80/20 train/validation split (seed 42)
+
+---
+
+## Performance
+
+### Baseline: Constant-Velocity Model
+
+| Metric | Value |
+|---|---|
+| minADE (K=3) | 0.65 m |
+| minFDE (K=3) | 1.35 m |
+| Miss Rate (>2.0 m) | 19.9 % |
+
+### Base Model — Camera-Only Transformer (best_social_model.pth)
+
+| Metric | Value | Improvement vs Baseline |
+|---|---|---|
+| Validation trajectories | 468 | — |
+| minADE (K=3) | 0.50 m | 23.1% |
+| minFDE (K=3) | 0.96 m | 29.6% |
+| Miss Rate (>2.0 m) | 9.9 % | 50.8% |
+
+### Fusion Model — LiDAR + Radar (best_social_model_fusion.pth)
+
+| Metric | Value | Improvement vs Baseline |
+|---|---|---|
+| Validation trajectories | 468 | — |
+| minADE (K=3) | **0.42 m** | **35.4%** |
+| minFDE (K=3) | **0.78 m** | **42.2%** |
+| Miss Rate (>2.0 m) | **7.1 %** | **64.3%** |
+
+### Runtime Benchmark
+
+| Stage | Latency |
+|---|---|
+| Detection model — Faster R-CNN (per frame) | 30.7 ms |
+| Sensor fusion — LiDAR + Radar lookup | 12 ms |
+| Transformer prediction head (per agent) | 14.6 ms |
+| Full end-to-end pipeline (2-frame loop) | ~58 ms |
+| Equivalent throughput | ~17.24 FPS |
+
+### Model Efficiency
+
+| Model | Parameters | Size |
+|---|---|---|
+| Base Transformer | ~146K | ~0.6 MB |
+| Fusion Transformer | ~146K | ~0.6 MB |
+
+The prediction module is compact and edge-friendly. The real-time bottleneck comes from the heavy CNN perception stack (Faster R-CNN), not the trajectory prediction head.
+
+---
+
+## Repository Structure
+
+```
+bev/
+├── backend/
+│   ├── app/
+│   │   ├── api/
+│   │   │   └── routes/          # FastAPI route modules: health, live, predict
+│   │   ├── core/                # Serialization and shared utilities
+│   │   ├── ml/
+│   │   │   ├── model.py         # TrajectoryTransformer (base, camera-only)
+│   │   │   ├── model_fusion.py  # TrajectoryTransformerFusion (LiDAR + Radar)
+│   │   │   ├── inference.py     # Inference pipeline
+│   │   │   └── sensor_fusion.py # LiDAR/radar feature extraction
+│   │   ├── services/            # Business logic layer
+│   │   └── main.py              # FastAPI application factory
+│   └── scripts/
+│       ├── data/                # Dataset construction from nuScenes images
+│       ├── training/
+│       │   ├── train.py                  # Stage 1: Base model training
+│       │   ├── train_phase2_fusion.py    # Stage 2: Fusion model training
+│       │   └── finetune_cv_pipeline.py   # CV-synced fine-tuning
+│       ├── evaluation/
+│       │   ├── evaluate.py               # Base model evaluation
+│       │   ├── evaluate_phase2_fusion.py # Fusion model evaluation
+│       │   └── benchmark_perf.py         # Runtime latency benchmarking
+│       └── tools/
+├── frontend/
+│   ├── src/
+│   │   ├── App.tsx              # Main dashboard component
+│   │   ├── types.ts             # TypeScript type definitions
+│   │   ├── api/                 # API client layer
+│   │   ├── components/          # UI components
+│   │   └── styles.css           # Global styles
+│   ├── package.json
+│   └── vite.config.ts
+├── models/
+│   ├── best_social_model.pth          # Trained base model checkpoint
+│   ├── best_social_model_fusion.pth   # Trained fusion model checkpoint
+│   ├── best_cv_synced_model.pth       # CV-pipeline fine-tuned checkpoint
+│   └── best_social_model_fusion_smoke.pth
+├── extracted_training_data.json       # Preprocessed nuScenes trajectory data
+└── log/                               # Training logs
+```
+
+---
+
+## Setup and Installation
+
+### Prerequisites
+
+- Python 3.10 or later
+- Node.js 18 or later and npm
+- nuScenes mini dataset (V1.0-mini) if retraining from scratch; pretrained checkpoints are included in `models/`
+- GPU recommended (tested on NVIDIA RTX 5050 — 8 GB VRAM)
+
+### Backend
+
+```bash
+# Create and activate a virtual environment
+python -m venv venv
+venv\Scripts\activate          # Windows
+# source venv/bin/activate     # Linux / macOS
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### Frontend
+
+```bash
+cd frontend
+npm install
+```
+
+---
+
+## How to Run
+
+### 1. Start the Backend API Server
+
+From the repository root with the virtual environment active:
+
+```bash
+uvicorn backend.app.main:app --host 0.0.0.0 --port 8000 --reload
+```
+
+The API will be available at `http://localhost:8000`.
+Interactive API documentation is available at `http://localhost:8000/docs`.
+
+### 2. Start the Frontend Dashboard
+
+```bash
+cd frontend
+npm run dev
+```
+
+The dashboard will be available at `http://localhost:5173`.
+
+### 3. Train the Base Model (Stage 1)
+
+Ensure `extracted_training_data.json` is present at the repository root (or rebuild it using `backend/scripts/data/build_dataset_from_images.py`).
+
+```bash
+python -m backend.scripts.training.train
+```
+
+Checkpoints are saved to `models/best_social_model.pth`. Training logs are written to `log/`.
+
+### 4. Train the Fusion Model (Stage 2)
+
+```bash
+python -m backend.scripts.training.train_phase2_fusion
+```
+
+The fusion model initializes from the base checkpoint and trains with LiDAR and radar features using differential learning rates. The output checkpoint is saved to `models/best_social_model_fusion.pth`.
+
+### 5. Evaluate Models
+
+```bash
+# Base model
+python -m backend.scripts.evaluation.evaluate
+
+# Fusion model
+python -m backend.scripts.evaluation.evaluate_phase2_fusion
+
+# Runtime latency benchmark
+python -m backend.scripts.evaluation.benchmark_perf
+```
+
+---
+
+## API Endpoints
+
+| Method | Path | Description |
+|---|---|---|
+| GET | `/api/health` | Service health check |
+| GET | `/api/live/frame` | Retrieve the latest processed camera frame |
+| POST | `/api/predict` | Run trajectory prediction on a submitted scene |
+
+The prediction endpoint returns a structured payload including multimodal trajectories, per-mode probabilities, agent detections, sensor summary, and scene geometry.
+
+---
+
+## Training Strategy
+
+Training follows a two-stage transfer learning approach:
+
+**Stage 1 — Social Trajectory Transformer**
+Train the base model end-to-end using only camera-derived BEV trajectories. The model learns social interaction patterns, goal-conditioned decoding, and multimodal prediction from kinematic features alone.
+
+**Stage 2 — Fusion Transfer Learning**
+Initialize the fusion model from the Stage 1 checkpoint. Add the LiDAR and radar input branch and fine-tune using differential learning rates — lower rates for the pre-trained transformer backbone and higher rates for the new fusion branch. This preserves learned social behavior while adapting to richer sensor signals.
+
+**Optimization:**
+- Optimizer: Adam
+- LR scheduling: ReduceLROnPlateau
+- Early stopping with best checkpoint selection based on minADE
+
+---
+
+## Robustness Analysis
+
+**Noise & Motion Stability:** Data augmentation (rotation, flip, Gaussian noise) improves generalization. Radar fusion stabilizes motion estimation. Multi-modal outputs reduce prediction failure in edge cases.
+
+**Lighting Conditions:** Camera performance degrades in low-light conditions. LiDAR and Radar remain reliable regardless of lighting. Multi-sensor fusion reduces dependency on visual quality alone.
+
+**Occlusion Handling:** Motion history + social context encoding allows the model to predict agent positions even when temporarily invisible. Radar supports cross-traffic awareness for agents occluded by large vehicles. Long-term occlusion remains an open challenge for future work.
+
+---
+
+## Sample Training Output
+
+```
+Train Loss: 2.1834
+ADE: 0.5491, FDE: 1.0873
+Current Learning Rate: 0.0005
+```
+
+---
+
+## Output Visualizations
+
+![Output visualization](public/output.jpeg)
+![Output visualization2](public/output2.png)
+![Output visualization3](public/output4.png)
+![Output visualization4](public/output3.png)
+
+---
+
+## References
+
+- Attention Is All You Need — https://arxiv.org/abs/1706.03762
+- Trajectron++ — https://arxiv.org/abs/2001.03093
+- nuScenes Dataset Paper — https://arxiv.org/abs/1903.11027
+- BEVFormer — https://arxiv.org/abs/2203.17270
+- BEVFusion — https://arxiv.org/abs/2205.13542
+
+---
+
+## License
+
+This project is licensed under the terms of the MIT License. See [LICENSE](LICENSE) for details.
\ No newline at end of file
diff --git a/backend/README.md b/backend/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..334bf30db72c224319652cbfaa192a33434cf211
--- /dev/null
+++ b/backend/README.md
@@ -0,0 +1,78 @@
+# Backend (Phase 1)
+
+This backend exposes your existing CV + trajectory prediction pipeline through FastAPI.
+
+## Folder Structure
+
+```text
+backend/
+	app/
+		api/
+			dependencies.py
+			routes/
+				health.py
+				live.py
+				predict.py
+		core/
+			serialization.py
+			uploads.py
+		ml/
+			inference.py
+			model.py
+			model_fusion.py
+			sensor_fusion.py
+		legacy/
+			dataset.py
+			dataset_fusion.py
+			data_loader.py
+			cv_perception.py
+			map_renderer.py
+			visualization.py
+		services/
+			pipeline.py
+		main.py
+		schemas.py
+	scripts/
+		training/
+		evaluation/
+		data/
+		tools/
+		legacy/
+	requirements.txt
+```
+
+Notes:
+- Runtime model and fusion logic is now under `backend/app/ml`.
+- Legacy helper modules were moved under `backend/app/legacy`.
+- Training, evaluation, and data scripts were moved under `backend/scripts/*`.
+- Root-level `inference.py`, `model.py`, `model_fusion.py`, and `sensor_fusion.py` remain as compatibility wrappers.
+
+## Run
+
+From the repository root:
+
+```powershell
+.\\venv\\Scripts\\python.exe -m pip install -r backend/requirements.txt
+.\\venv\\Scripts\\python.exe -m uvicorn backend.app.main:app --reload --host 0.0.0.0 --port 8000
+```
+
+## Endpoints
+
+- `GET /api/health`
+- `GET /api/live/frames?channel=CAM_FRONT&limit=200`
+- `GET /api/live/frame-image?path=<dataset_frame_path>`
+- `POST /api/predict/two-image` (multipart form)
+- `POST /api/predict/live-fusion` (JSON body)
+
+## Phase 2 Scene Geometry
+
+Prediction responses now include `scene_geometry` with image-grounded BEV primitives:
+
+- `road_polygon`: camera-derived drivable area in BEV coordinates.
+- `lane_lines`: lane candidates projected into BEV.
+- `elements`: projected actor footprints from detections.
+- `quality`: confidence score in `[0, 1]` for extracted scene structure.
+
+Notes:
+- The backend keeps using existing model files at repository root.
+- Keep running from repo root so relative data paths remain stable.
diff --git a/backend/__init__.py b/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..76caeb35beddbff1c495305a99e1850d368a6bae
--- /dev/null
+++ b/backend/__init__.py
@@ -0,0 +1 @@
+"""Backend package root."""
diff --git a/backend/app/__init__.py b/backend/app/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..95f11ee6f31ea36690be73831e977a388a4c9b58
--- /dev/null
+++ b/backend/app/__init__.py
@@ -0,0 +1 @@
+"""FastAPI backend package for Phase 1 migration."""
diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e9ae901c7a1b41e61236f03db335d7dd27ed0f8
--- /dev/null
+++ b/backend/app/api/__init__.py
@@ -0,0 +1 @@
+"""API package for route modules and dependencies."""
diff --git a/backend/app/api/dependencies.py b/backend/app/api/dependencies.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d3602faf7a3dcebe3e64a514083e6773fff7c3f
--- /dev/null
+++ b/backend/app/api/dependencies.py
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from ..services.pipeline import TrajectoryPipeline
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+pipeline = TrajectoryPipeline(repo_root=REPO_ROOT)
+
+
+def get_pipeline() -> TrajectoryPipeline:
+    return pipeline
diff --git a/backend/app/api/routes/__init__.py b/backend/app/api/routes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f6b5cfd9edd4e183470b68e68fa8f7de0b3128d
--- /dev/null
+++ b/backend/app/api/routes/__init__.py
@@ -0,0 +1 @@
+"""Route modules for the FastAPI application."""
diff --git a/backend/app/api/routes/health.py b/backend/app/api/routes/health.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8285bf097c0ba9091119378f2046b6f9c5e8cff
--- /dev/null
+++ b/backend/app/api/routes/health.py
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from typing import Any
+
+from fastapi import APIRouter
+
+from ..dependencies import pipeline
+
+router = APIRouter()
+
+
+@router.get("/health")
+def health() -> dict[str, Any]:
+    return {
+        "status": "ok",
+        "using_fusion_model": pipeline.using_fusion_model,
+        "dataset_root": str(pipeline.data_root),
+        "dataset_exists": pipeline.data_root.exists(),
+    }
diff --git a/backend/app/api/routes/live.py b/backend/app/api/routes/live.py
new file mode 100644
index 0000000000000000000000000000000000000000..b60da03366003a7aff9d8c4bbba13374f03685da
--- /dev/null
+++ b/backend/app/api/routes/live.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import mimetypes
+from pathlib import Path
+from typing import Any
+
+from fastapi import APIRouter, HTTPException, Query
+from fastapi.responses import FileResponse
+
+from ..dependencies import pipeline
+
+router = APIRouter()
+
+
+def resolve_dataset_frame_path(path_value: str) -> Path:
+    candidate = Path(path_value).expanduser()
+    if not candidate.is_absolute():
+        candidate = (pipeline.repo_root / candidate).resolve()
+    else:
+        candidate = candidate.resolve()
+
+    data_root = pipeline.data_root.resolve()
+    try:
+        candidate.relative_to(data_root)
+    except ValueError as exc:
+        raise HTTPException(status_code=403, detail="Frame path is outside DataSet root.") from exc
+
+    if not candidate.exists() or not candidate.is_file():
+        raise HTTPException(status_code=404, detail="Frame image was not found.")
+
+    if candidate.suffix.lower() not in {".jpg", ".jpeg", ".png", ".webp"}:
+        raise HTTPException(status_code=400, detail="Unsupported frame image file type.")
+
+    return candidate
+
+
+@router.get("/live/frames")
+def list_live_frames(
+    channel: str = Query(default="CAM_FRONT"),
+    limit: int = Query(default=200, ge=1, le=2000),
+) -> dict[str, Any]:
+    paths = pipeline.list_channel_image_paths(channel)
+    names = [p.name for p in paths[:limit]]
+    return {"channel": channel, "count": len(names), "frames": names}
+
+
+@router.get("/live/frame-image")
+def get_live_frame_image(path: str = Query(..., min_length=1)):
+    frame_path = resolve_dataset_frame_path(path)
+    media_type = mimetypes.guess_type(str(frame_path))[0] or "application/octet-stream"
+    return FileResponse(path=frame_path, media_type=media_type)
diff --git a/backend/app/api/routes/predict.py b/backend/app/api/routes/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..42bf758d7ee7646a3d3058aa7c389f90e964eb80
--- /dev/null
+++ b/backend/app/api/routes/predict.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from fastapi import APIRouter, File, Form, HTTPException, UploadFile
+
+from ...core.serialization import build_prediction_payload
+from ...core.uploads import upload_to_rgb_array
+from ...schemas import LiveFusionRequest, PredictionResponse
+from ..dependencies import pipeline
+
+router = APIRouter()
+
+
+@router.post("/predict/two-image", response_model=PredictionResponse)
+async def predict_two_image(
+    image_prev: UploadFile = File(...),
+    image_curr: UploadFile = File(...),
+    score_threshold: float = Form(0.35),
+    tracking_gate_px: float = Form(130.0),
+    min_motion_px: float = Form(0.0),
+    use_pose: bool = Form(False),
+):
+    img_prev = await upload_to_rgb_array(image_prev)
+    img_curr = await upload_to_rgb_array(image_curr)
+
+    result = pipeline.build_two_image_agents_bundle(
+        img_prev=img_prev,
+        img_curr=img_curr,
+        score_threshold=float(score_threshold),
+        tracking_gate_px=float(tracking_gate_px),
+        min_motion_px=float(min_motion_px),
+        use_pose=bool(use_pose),
+        img_prev_name=image_prev.filename,
+        img_curr_name=image_curr.filename,
+    )
+
+    if "error" in result:
+        raise HTTPException(status_code=400, detail=result["error"])
+
+    return build_prediction_payload(result)
+
+
+@router.post("/predict/live-fusion", response_model=PredictionResponse)
+def predict_live_fusion(req: LiveFusionRequest):
+    result = pipeline.build_live_agents_bundle(
+        anchor_idx=req.anchor_idx,
+        score_threshold=req.score_threshold,
+        tracking_gate_px=req.tracking_gate_px,
+        use_pose=req.use_pose,
+    )
+
+    if "error" in result:
+        raise HTTPException(status_code=400, detail=result["error"])
+
+    return build_prediction_payload(result)
diff --git a/backend/app/core/__init__.py b/backend/app/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ad54831a303c93b970c6c8b19e12a1d5ae971f4
--- /dev/null
+++ b/backend/app/core/__init__.py
@@ -0,0 +1 @@
+"""Core helpers for serialization and file handling."""
diff --git a/backend/app/core/serialization.py b/backend/app/core/serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6600e52a5777a6d2ebbee9583fe8258142b3ff83
--- /dev/null
+++ b/backend/app/core/serialization.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+
+
+def to_jsonable(value: Any) -> Any:
+    if isinstance(value, np.ndarray):
+        return value.tolist()
+    if isinstance(value, (np.floating, np.integer)):
+        return value.item()
+    if isinstance(value, dict):
+        return {str(k): to_jsonable(v) for k, v in value.items()}
+    if isinstance(value, tuple):
+        return [to_jsonable(v) for v in value]
+    if isinstance(value, list):
+        return [to_jsonable(v) for v in value]
+    return value
+
+
+def serialize_agents(agents: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    serialized = []
+    for agent in agents:
+        serialized.append(
+            {
+                "id": int(agent.get("id", 0)),
+                "type": str(agent.get("type", "unknown")),
+                "raw_label": agent.get("raw_label"),
+                "history": [
+                    {"x": float(pt[0]), "y": float(pt[1])}
+                    for pt in agent.get("history", [])
+                ],
+                "predictions": [
+                    [{"x": float(pt[0]), "y": float(pt[1])} for pt in mode]
+                    for mode in agent.get("predictions", [])
+                ],
+                "probabilities": [float(p) for p in agent.get("probabilities", [])],
+                "is_target": bool(agent.get("is_target", False)),
+            }
+        )
+    return serialized
+
+
+def build_prediction_payload(result: dict[str, Any]) -> dict[str, Any]:
+    core_excludes = {
+        "agents",
+        "target_track_id",
+        "mode",
+        "camera_snapshots",
+        "fusion_data",
+        "scene_geometry",
+        "error",
+    }
+
+    payload: dict[str, Any] = {
+        "mode": result.get("mode", "unknown"),
+        "target_track_id": result.get("target_track_id"),
+        "agents": serialize_agents(result.get("agents", [])),
+        "meta": to_jsonable({k: v for k, v in result.items() if k not in core_excludes}),
+    }
+
+    snapshots = result.get("camera_snapshots")
+    if snapshots:
+        payload["detections"] = {
+            name: {
+                "frame_path": snap.get("frame_path"),
+                "detections": to_jsonable(snap.get("detections", [])),
+            }
+            for name, snap in snapshots.items()
+        }
+
+    fusion_data = result.get("fusion_data")
+    if fusion_data:
+        payload["sensors"] = {
+            "sample_token": fusion_data.get("sample_token"),
+            "lidar_points": int(len(fusion_data.get("lidar_xy", []))),
+            "radar_points": int(len(fusion_data.get("radar_xy", []))),
+            "radar_channel_counts": to_jsonable(fusion_data.get("radar_channel_counts", {})),
+        }
+
+    if result.get("scene_geometry") is not None:
+        payload["scene_geometry"] = to_jsonable(result.get("scene_geometry"))
+
+    return payload
diff --git a/backend/app/core/uploads.py b/backend/app/core/uploads.py
new file mode 100644
index 0000000000000000000000000000000000000000..90291e20076db58a13e5afd70191b085687d9df3
--- /dev/null
+++ b/backend/app/core/uploads.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+import io
+
+import numpy as np
+from fastapi import HTTPException, UploadFile
+from PIL import Image
+
+
+async def upload_to_rgb_array(upload: UploadFile) -> np.ndarray:
+    raw = await upload.read()
+    if not raw:
+        raise HTTPException(status_code=400, detail=f"Uploaded file '{upload.filename}' is empty.")
+
+    try:
+        image = Image.open(io.BytesIO(raw)).convert("RGB")
+    except Exception as exc:
+        raise HTTPException(status_code=400, detail=f"Could not parse image '{upload.filename}': {exc}") from exc
+
+    return np.asarray(image)
diff --git a/backend/app/legacy/__init__.py b/backend/app/legacy/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b043e7e81941fb4ed1099179227aa2c4c1ca9d3
--- /dev/null
+++ b/backend/app/legacy/__init__.py
@@ -0,0 +1 @@
+"""Legacy helper modules preserved for compatibility."""
diff --git a/backend/app/legacy/cv_perception.py b/backend/app/legacy/cv_perception.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbd40d331f9dd9bf7d03bc1a2c7874b0e4e5b8c2
--- /dev/null
+++ b/backend/app/legacy/cv_perception.py
@@ -0,0 +1,119 @@
+﻿import torch
+import torchvision
+from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
+from PIL import Image, ImageDraw
+import os
+import math
+
+# Map COCO classes to our Hackathon targets
+TARGET_CLASSES = {
+    1: 'Person',
+    2: 'Bicycle',
+    3: 'Car',
+    4: 'Motorcycle'
+}
+
+def load_perception_model():
+    print("[System] Loading Faster R-CNN (ResNet-50-FPN Backbone)...")
+    weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+    model = fasterrcnn_resnet50_fpn(weights=weights, progress=False)
+    model.eval()
+    return model, weights
+
+def extract_features(img_path, model, weights, score_threshold=0.7):
+    image = Image.open(img_path).convert("RGB")
+    preprocess = weights.transforms()
+    input_batch = preprocess(image).unsqueeze(0)
+    
+    with torch.no_grad():
+        prediction = model(input_batch)[0]
+        
+    extracted = []
+    for i, box in enumerate(prediction['boxes']):
+        score = prediction['scores'][i].item()
+        label = prediction['labels'][i].item()
+        
+        if score > score_threshold and label in TARGET_CLASSES:
+            box = box.tolist()
+            class_name = TARGET_CLASSES[label]
+            # Get bottom-center coordinate for BEV mapping
+            center_x = (box[0] + box[2]) / 2.0
+            bottom_y = box[3]
+            
+            extracted.append({
+                'type': class_name,
+                'bbox': box,
+                'coord': (center_x, bottom_y)
+            })
+    return extracted, image
+
+def calculate_distance(c1, c2):
+    return math.sqrt((c1[0] - c2[0])**2 + (c1[1] - c2[1])**2)
+
+def process_frame_sequence(frame1_path, frame2_path, model, weights):
+    """
+    Takes 2 sequential frames, detects objects, matches them to find movement, 
+    and bridges the data to the AI Brain.
+    """
+    print(f"\n[Step 1] Analyzing Frame T-1: {os.path.basename(frame1_path)}")
+    objs_f1, img1 = extract_features(frame1_path, model, weights)
+    
+    print(f"[Step 2] Analyzing Frame T0: {os.path.basename(frame2_path)}")
+    objs_f2, img2 = extract_features(frame2_path, model, weights)
+    
+    print("\n[Step 3] Temporal Tracking (Finding Moving Cyclists/Pedestrians)")
+    tracked_history = []
+    
+    # Simple Tracking by linking nearest objects between Frame 1 and Frame 2
+    for obj2 in objs_f2:
+        best_match = None
+        min_dist = float('inf')
+        
+        for obj1 in objs_f1:
+            if obj1['type'] == obj2['type']: # Must be same class
+                dist = calculate_distance(obj1['coord'], obj2['coord'])
+                if dist < 50.0:  # Max pixel movement threshold between 2 frames
+                    min_dist = dist
+                    best_match = obj1
+                    
+        if best_match:
+            # Calculate movement vector (Velocity)
+            dx = obj2['coord'][0] - best_match['coord'][0]
+            dy = obj2['coord'][1] - best_match['coord'][1]
+            is_moving = abs(dx) > 1.0 or abs(dy) > 1.0
+            
+            if is_moving and obj2['type'] in ['Person', 'Bicycle']:
+                print(f" -> Spotted Moving {obj2['type']}! dx: {dx:.2f}, dy: {dy:.2f}")
+                
+                # Format: [(x_t-1, y_t-1), (x_t0, y_t0)] 
+                # This is EXACTLY what the AI Brain needs!
+                history = [best_match['coord'], obj2['coord']]
+                
+                tracked_history.append({
+                    "type": obj2['type'],
+                    "history": history
+                })
+                
+    print(f"\n[Step 4] Handoff to AI Brain: Found {len(tracked_history)} moving VRUs.")
+    return tracked_history
+
+if __name__ == '__main__':
+    # We will use two identical images to simulate the script architecture
+    # In reality, this would be image_001.jpg and image_002.jpg
+    import glob
+    cam_front_images = glob.glob("DataSet/samples/CAM_FRONT/*.jpg")
+    
+    if len(cam_front_images) >= 2:
+        f1 = cam_front_images[0]
+        f2 = cam_front_images[1] # Next sequential frame
+        
+        try:
+            model, weights = load_perception_model()
+            vru_data_for_ai = process_frame_sequence(f1, f2, model, weights)
+            
+            print("\n--- FINAL JSON PAYLOAD FOR TRANSFORMER MODEL ---")
+            for person in vru_data_for_ai:
+                print(f"Target: {person['type']}")
+                print(f"Historical Trajectory [T-1, T0]: {person['history']}")
+        except Exception as e:
+            print("Model not loaded, but script structure is ready.")
diff --git a/backend/app/legacy/data_loader.py b/backend/app/legacy/data_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf891516c49bcc6b4a5ba764f41f5366a8fd2a7f
--- /dev/null
+++ b/backend/app/legacy/data_loader.py
@@ -0,0 +1,347 @@
+from pathlib import Path
+import json
+
+DATA_ROOT = Path("DataSet/v1.0-mini")
+
+
+def load_json(name):
+    with open(DATA_ROOT / f"{name}.json") as f:
+        return json.load(f)
+
+
+def build_lookup(table):
+    return {item['token']: item for item in table}
+
+
+def extract_pedestrian_instances(sample_annotations, instances, categories):
+    cat_lookup = build_lookup(categories)
+    inst_lookup = build_lookup(instances)
+
+    pedestrian_instances = set()
+
+    for ann in sample_annotations:
+        inst = inst_lookup[ann['instance_token']]
+        category = cat_lookup[inst['category_token']]['name']
+
+        # Include pedestrians, bicycles, and motorcycles (Vulnerable Road Users)
+        if "pedestrian" in category or "bicycle" in category or "motorcycle" in category:
+            pedestrian_instances.add(ann['instance_token'])
+
+    return pedestrian_instances
+
+
+def build_trajectories(sample_annotations, pedestrian_instances):
+    ann_lookup = build_lookup(sample_annotations)
+
+    visited = set()
+    trajectories = []
+
+    for ann in sample_annotations:
+        if ann['token'] in visited:
+            continue
+
+        if ann['instance_token'] not in pedestrian_instances:
+            continue
+
+        current = ann
+        while current['prev'] != "":
+            current = ann_lookup[current['prev']]
+
+        traj = []
+
+        while current:
+            visited.add(current['token'])
+
+            x, y, _ = current['translation']
+            traj.append([x, y])
+
+            if current['next'] == "":
+                break
+
+            current = ann_lookup[current['next']]
+
+        if len(traj) >= 16:  # 4 past + 12 future (6 seconds)
+            trajectories.append(traj)
+
+    return trajectories
+
+
+def create_windows(trajectories):
+    import math
+    samples = []
+
+    for traj in trajectories:
+        # Require 16 frames: 4 history + 12 future
+        for i in range(len(traj) - 15):
+
+            # ---------------- MAIN TRAJECTORY ----------------
+            window = traj[i:i+16]
+
+            x3, y3 = window[3]
+            window = [[x - x3, y - y3] for x, y in window]
+
+            vel = []
+            for j in range(len(window)):
+                if j == 0:
+                    vel.append([0, 0, 0, 0, 0])
+                else:
+                    dx = window[j][0] - window[j-1][0]
+                    dy = window[j][1] - window[j-1][1]
+                    speed = math.hypot(dx, dy)
+                    if speed > 1e-5:
+                        sin_t = dy / speed
+                        cos_t = dx / speed
+                    else:
+                        sin_t = 0.0
+                        cos_t = 0.0
+                    vel.append([dx, dy, speed, sin_t, cos_t])
+
+            obs = []
+            for j in range(4):
+                obs.append([
+                    window[j][0],
+                    window[j][1],
+                    vel[j][0],
+                    vel[j][1],
+                    vel[j][2],
+                    vel[j][3],
+                    vel[j][4]
+                ])
+
+            # Future is now 12 steps (6 seconds)
+            future = window[4:16]
+
+            # ---------------- NEIGHBORS ----------------
+            neighbors = []
+
+            for other_traj in trajectories:
+                if other_traj is traj:
+                    continue
+
+                if len(other_traj) < i + 4:
+                    continue
+
+                x1, y1 = traj[i + 3] # Main trajectory center
+                x2, y2 = other_traj[i + 3]
+
+                dist = math.hypot(x1 - x2, y1 - y2)
+
+                # Expanded Social Radius to 50 meters to account for much longer timeframe
+                if dist < 50.0:  
+
+                    n_window = other_traj[i:i+4]
+
+                    # Center around main trajectory's last observed timestep
+                    n_window = [[x - x1, y - y1] for x, y in n_window]
+
+                    vel_n = []
+                    for j in range(len(n_window)):
+                        if j == 0:
+                            vel_n.append([0, 0, 0, 0, 0])
+                        else:
+                            dx = n_window[j][0] - n_window[j-1][0]
+                            dy = n_window[j][1] - n_window[j-1][1]
+                            speed = math.hypot(dx, dy)
+                            if speed > 1e-5:
+                                sin_t = dy / speed
+                                cos_t = dx / speed
+                            else:
+                                sin_t = 0.0
+                                cos_t = 0.0
+                            vel_n.append([dx, dy, speed, sin_t, cos_t])
+
+                    n_obs = []
+                    for j in range(4):
+                        n_obs.append([
+                            n_window[j][0],
+                            n_window[j][1],
+                            vel_n[j][0],
+                            vel_n[j][1],
+                            vel_n[j][2],
+                            vel_n[j][3],
+                            vel_n[j][4]
+                        ])
+
+                    neighbors.append(n_obs)
+
+            samples.append((obs, neighbors, future))
+
+    return samples
+
+
+def build_trajectories_with_sensor(sample_annotations, pedestrian_instances):
+    ann_lookup = build_lookup(sample_annotations)
+
+    visited = set()
+    trajectories = []
+
+    for ann in sample_annotations:
+        if ann['token'] in visited:
+            continue
+
+        if ann['instance_token'] not in pedestrian_instances:
+            continue
+
+        current = ann
+        while current['prev'] != "":
+            current = ann_lookup[current['prev']]
+
+        traj = []
+
+        while current:
+            visited.add(current['token'])
+
+            x, y, _ = current['translation']
+            traj.append({
+                'x': x,
+                'y': y,
+                'sample_token': current['sample_token'],
+                'num_lidar_pts': float(current.get('num_lidar_pts', 0.0)),
+                'num_radar_pts': float(current.get('num_radar_pts', 0.0)),
+            })
+
+            if current['next'] == "":
+                break
+
+            current = ann_lookup[current['next']]
+
+        if len(traj) >= 16:
+            trajectories.append(traj)
+
+    return trajectories
+
+
+def create_windows_with_sensor(trajectories):
+    import math
+    samples = []
+
+    for traj in trajectories:
+        for i in range(len(traj) - 15):
+            window = traj[i:i + 16]
+
+            x3, y3 = window[3]['x'], window[3]['y']
+            centered_xy = [[p['x'] - x3, p['y'] - y3] for p in window]
+
+            vel = []
+            for j in range(len(centered_xy)):
+                if j == 0:
+                    vel.append([0, 0, 0, 0, 0])
+                else:
+                    dx = centered_xy[j][0] - centered_xy[j - 1][0]
+                    dy = centered_xy[j][1] - centered_xy[j - 1][1]
+                    speed = math.hypot(dx, dy)
+                    if speed > 1e-5:
+                        sin_t = dy / speed
+                        cos_t = dx / speed
+                    else:
+                        sin_t = 0.0
+                        cos_t = 0.0
+                    vel.append([dx, dy, speed, sin_t, cos_t])
+
+            obs = []
+            fusion_obs = []
+            for j in range(4):
+                obs.append([
+                    centered_xy[j][0],
+                    centered_xy[j][1],
+                    vel[j][0],
+                    vel[j][1],
+                    vel[j][2],
+                    vel[j][3],
+                    vel[j][4],
+                ])
+
+                lidar_pts = min(80.0, window[j]['num_lidar_pts']) / 80.0
+                radar_pts = min(30.0, window[j]['num_radar_pts']) / 30.0
+                sensor_strength = min(1.0, (window[j]['num_lidar_pts'] + 2.0 * window[j]['num_radar_pts']) / 100.0)
+                fusion_obs.append([lidar_pts, radar_pts, sensor_strength])
+
+            future = centered_xy[4:16]
+
+            neighbors = []
+            for other_traj in trajectories:
+                if other_traj is traj:
+                    continue
+
+                if len(other_traj) < i + 4:
+                    continue
+
+                x1, y1 = traj[i + 3]['x'], traj[i + 3]['y']
+                x2, y2 = other_traj[i + 3]['x'], other_traj[i + 3]['y']
+
+                dist = math.hypot(x1 - x2, y1 - y2)
+                if dist >= 50.0:
+                    continue
+
+                n_window = other_traj[i:i + 4]
+                n_window_xy = [[p['x'] - x1, p['y'] - y1] for p in n_window]
+
+                vel_n = []
+                for j in range(len(n_window_xy)):
+                    if j == 0:
+                        vel_n.append([0, 0, 0, 0, 0])
+                    else:
+                        dx = n_window_xy[j][0] - n_window_xy[j - 1][0]
+                        dy = n_window_xy[j][1] - n_window_xy[j - 1][1]
+                        speed = math.hypot(dx, dy)
+                        if speed > 1e-5:
+                            sin_t = dy / speed
+                            cos_t = dx / speed
+                        else:
+                            sin_t = 0.0
+                            cos_t = 0.0
+                        vel_n.append([dx, dy, speed, sin_t, cos_t])
+
+                n_obs = []
+                for j in range(4):
+                    n_obs.append([
+                        n_window_xy[j][0],
+                        n_window_xy[j][1],
+                        vel_n[j][0],
+                        vel_n[j][1],
+                        vel_n[j][2],
+                        vel_n[j][3],
+                        vel_n[j][4],
+                    ])
+
+                neighbors.append(n_obs)
+
+            samples.append((obs, neighbors, fusion_obs, future))
+
+    return samples
+
+
+def main():
+    print("Loading data...")
+
+    sample_annotations = load_json("sample_annotation")
+    instances = load_json("instance")
+    categories = load_json("category")
+
+    print("Filtering pedestrians...")
+    ped_instances = extract_pedestrian_instances(
+        sample_annotations, instances, categories
+    )
+
+    print("Building trajectories...")
+    trajectories = build_trajectories(sample_annotations, ped_instances)
+
+    print("Creating training samples...")
+    samples = create_windows(trajectories)
+
+    print("\n--- DEBUG ---")
+    obs, neighbors, future = samples[0]
+
+    print("Obs length:", len(obs))
+    print("Future length:", len(future))
+    print("Neighbors count:", len(neighbors))
+
+    if len(neighbors) > 0:
+        print("One neighbor shape:", len(neighbors[0]), len(neighbors[0][0]))
+
+    print(f"\nTotal trajectories: {len(trajectories)}")
+    print(f"Total samples: {len(samples)}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/backend/app/legacy/dataset.py b/backend/app/legacy/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd60cdb02bd3bd6cf3e4863dfb44819869225f32
--- /dev/null
+++ b/backend/app/legacy/dataset.py
@@ -0,0 +1,100 @@
+import torch
+from torch.utils.data import Dataset
+import random
+import math
+
+def augment_data(obs, neighbors, future):
+    # obs: (4, 7) tensor
+    # neighbors: list of (4, 7) tensors
+    # future: (12, 2) tensor
+    
+    # Random Scene Rotation (0-360)
+    theta = random.uniform(0, 2 * math.pi)
+    cos_t = math.cos(theta)
+    sin_t = math.sin(theta)
+
+    # Random X-axis reflection
+    flip_x = random.choice([-1.0, 1.0])
+    
+    # Gaussian Coordinate Noise
+    noise_std = 0.05
+
+    def apply_transform(feat, is_obs=True):
+        new_feat = feat.clone()
+        for i in range(new_feat.size(0)):
+            x, y = new_feat[i, 0].item(), new_feat[i, 1].item()
+            
+            # Apply Noise
+            x += random.gauss(0, noise_std)
+            y += random.gauss(0, noise_std)
+            
+            # Apply Flip
+            x *= flip_x
+            
+            # Apply Rotation
+            nx = x * cos_t - y * sin_t
+            ny = x * sin_t + y * cos_t
+            
+            new_feat[i, 0] = nx
+            new_feat[i, 1] = ny
+            
+            if is_obs:
+                # Transform dx, dy
+                dx, dy = new_feat[i, 2].item(), new_feat[i, 3].item()
+                dx *= flip_x
+                ndx = dx * cos_t - dy * sin_t
+                ndy = dx * sin_t + dy * cos_t
+                new_feat[i, 2] = ndx
+                new_feat[i, 3] = ndy
+                
+                # Recompute sin_t, cos_t based on new dx, dy to be safe
+                speed = math.hypot(ndx, ndy)
+                if speed > 1e-5:
+                    new_feat[i, 5] = ndy / speed
+                    new_feat[i, 6] = ndx / speed
+                else:
+                    new_feat[i, 5] = 0.0
+                    new_feat[i, 6] = 0.0
+
+        return new_feat
+
+    new_obs = apply_transform(obs, is_obs=True)
+    new_future = apply_transform(future, is_obs=False)
+    
+    new_neighbors = []
+    for n in neighbors: # n is (4, 7) tensor
+        if not isinstance(n, torch.Tensor):
+            n = torch.tensor(n, dtype=torch.float32)
+        new_neighbors.append(apply_transform(n, is_obs=True))
+        
+    return new_obs, new_neighbors, new_future
+
+class TrajectoryDataset(Dataset):
+    def __init__(self, samples, augment=False):
+        self.obs = []
+        self.neighbors = []
+        self.future = []
+        self.augment = augment
+
+        for obs, neighbors, future in samples:
+            self.obs.append(obs)
+            self.neighbors.append(neighbors)
+            self.future.append(future)
+
+        # Convert to tensors
+        self.obs = torch.tensor(self.obs, dtype=torch.float32)
+        self.future = torch.tensor(self.future, dtype=torch.float32)
+        # Neighbors remain lists of matrices, will convert in getitem or augment
+
+    def __len__(self):
+        return len(self.obs)
+
+    def __getitem__(self, idx):
+        obs = self.obs[idx].clone()
+        future = self.future[idx].clone()
+        neighbors = [torch.tensor(n, dtype=torch.float32) for n in self.neighbors[idx]]
+        
+        if self.augment:
+            obs, neighbors, future = augment_data(obs, neighbors, future)
+            
+        return obs, neighbors, future
diff --git a/backend/app/legacy/dataset_fusion.py b/backend/app/legacy/dataset_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdea5fa2b96c9cb9b38186270057733fae6db26b
--- /dev/null
+++ b/backend/app/legacy/dataset_fusion.py
@@ -0,0 +1,37 @@
+import torch
+from torch.utils.data import Dataset
+
+from .dataset import augment_data
+
+
+class FusionTrajectoryDataset(Dataset):
+    def __init__(self, samples, augment=False):
+        self.obs = []
+        self.neighbors = []
+        self.fusion = []
+        self.future = []
+        self.augment = augment
+
+        for obs, neighbors, fusion_obs, future in samples:
+            self.obs.append(obs)
+            self.neighbors.append(neighbors)
+            self.fusion.append(fusion_obs)
+            self.future.append(future)
+
+        self.obs = torch.tensor(self.obs, dtype=torch.float32)
+        self.fusion = torch.tensor(self.fusion, dtype=torch.float32)
+        self.future = torch.tensor(self.future, dtype=torch.float32)
+
+    def __len__(self):
+        return len(self.obs)
+
+    def __getitem__(self, idx):
+        obs = self.obs[idx].clone()
+        fusion_obs = self.fusion[idx].clone()
+        future = self.future[idx].clone()
+        neighbors = [torch.tensor(n, dtype=torch.float32) for n in self.neighbors[idx]]
+
+        if self.augment:
+            obs, neighbors, future = augment_data(obs, neighbors, future)
+
+        return obs, neighbors, fusion_obs, future
diff --git a/backend/app/legacy/map_renderer.py b/backend/app/legacy/map_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bcbf22cb3f6ce90455a6b4b726579479ff0b35b
--- /dev/null
+++ b/backend/app/legacy/map_renderer.py
@@ -0,0 +1,101 @@
+import matplotlib.pyplot as plt
+import json
+import os
+import numpy as np
+
+DATAROOT = './DataSet'
+VERSION = 'v1.0-mini'
+
+def get_map_mask():
+    """
+    Since the vector map expansion (JSON API) is not included in the raw dataset, 
+    we use the actual raw HD Map Raster Masks (PNGs) inherently included in the v1.0-mini dataset.
+    """
+    map_json_path = os.path.join(DATAROOT, VERSION, 'map.json')
+    try:
+        with open(map_json_path, 'r') as f:
+            map_data = json.load(f)
+            
+        # Grab the first available semantic prior map (binary mask of drivable area)
+        filename = map_data[0]['filename']
+        img_path = os.path.join(DATAROOT, filename)
+        
+        if os.path.exists(img_path):
+            img = plt.imread(img_path)
+            return img
+        else:
+            print(f"Map image not found at {img_path}")
+            return None
+    except Exception as e:
+        print(f"Error loading map.json: {e}")
+        return None
+
+def render_map_patch(x_center, y_center, radius=50.0, ax=None):
+    """
+    Simulates extracting an HD map patch by grabbing a corresponding 
+    section of the full-scale dataset map mask and displaying it.
+    """
+    if ax is None:
+        fig, ax = plt.subplots(figsize=(10, 10))
+        
+    mask = get_map_mask()
+    if mask is None:
+        return ax
+
+    # nuScenes standard raster resolution is 10 pixels per meter (0.1m)
+    pixels_per_meter = 10 
+    
+    # Let's find an interesting visual patch in the massive 20000x20000 map
+    # We will offset heavily into the image so we don't just see black emptiness
+    offset_x = 8000 
+    offset_y = 8500 
+
+    x_min_px = int(offset_x + (x_center - radius) * pixels_per_meter)
+    x_max_px = int(offset_x + (x_center + radius) * pixels_per_meter)
+    y_min_px = int(offset_y + (y_center - radius) * pixels_per_meter)
+    y_max_px = int(offset_y + (y_center + radius) * pixels_per_meter)
+
+    # Prevent out of bounds
+    x_min_px, x_max_px = max(0, x_min_px), min(mask.shape[1], x_max_px)
+    y_min_px, y_max_px = max(0, y_min_px), min(mask.shape[0], y_max_px)
+
+    crop = mask[y_min_px:y_max_px, x_min_px:x_max_px]
+
+    # Convert grayscale mask to an RGBA mask to allow custom colors and true transparency in the visual
+    import numpy as np
+    # True means drivable area, false is background
+    colored_mask = np.zeros((crop.shape[0], crop.shape[1], 4), dtype=np.float32)
+    
+    # Let's paint the drivable area road gray-blue with some opacity (e.g. 0.4)
+    # The road pixels in the original image are often 1.0 (or close to it)
+    road_pixels = crop > 0.5
+    
+    # Paint road pixels (R=0.2, G=0.3, B=0.5, Alpha=0.3 for a technical blueprint look)
+    colored_mask[road_pixels] = [0.2, 0.3, 0.5, 0.3]
+    # Background remains perfectly transparent (Alpha=0)
+
+    # Use imshow with the explicit RGBA mask
+    ax.imshow(colored_mask,  
+              extent=[x_center - radius, x_center + radius, y_center - radius, y_center + radius], 
+              origin='lower', zorder=-1) # Z-order ensures map is behind all points
+    
+    return ax
+
+if __name__ == "__main__":
+    print("Loading Native HD Map Mask from Raw Dataset...")
+    fig, ax = plt.subplots(figsize=(10, 10))
+    
+    # Test rendering a patch at center 0,0
+    render_map_patch(0, 0, radius=60.0, ax=ax)
+    
+    # Draw a fake car path on top to prove it works
+    ax.plot([0, 10, 30], 
+            [0, 5, 15], 
+            'r*-', linewidth=3, markersize=10, label="Vehicle Trajectory")
+    
+    ax.legend()
+    plt.grid(True, linestyle='--', alpha=0.5)
+    plt.title("Phase 3: Dataset-Native HD Map Raster Overlay")
+    plt.savefig("demo_raw_map.png", bbox_inches='tight')
+    plt.show()
+    print("Successfully generated 'demo_raw_map.png' using strictly internal dataset files!")
diff --git a/backend/app/legacy/visualization.py b/backend/app/legacy/visualization.py
new file mode 100644
index 0000000000000000000000000000000000000000..639f52bb4eb66965ef5294736b44c33a725c0ec3
--- /dev/null
+++ b/backend/app/legacy/visualization.py
@@ -0,0 +1,399 @@
+﻿import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+import numpy as np
+from ..ml.inference import predict
+from .map_renderer import render_map_patch
+
+def plot_scene(
+    points,
+    neighbor_points_list=None,
+    neighbor_types=None,
+    is_live_camera=False,
+    sensor_fusion=None,
+    presentation_mode=False,
+    max_vru_display=6,
+):
+    if neighbor_points_list is None: sibling_pts = []
+    else: sibling_pts = neighbor_points_list
+        
+    if neighbor_types is None: n_types = ['Car'] * len(sibling_pts)
+    else: n_types = neighbor_types
+
+    # Set up dark "Extreme 3D Mode" environment if it's Live Camera
+    plt.style.use('dark_background') if is_live_camera else plt.style.use('default')
+    fig = plt.figure(figsize=(14, 12))
+    ax = plt.gca()
+    
+    # ---------------- EGO VEHICLE & CAMERA PERSPECTIVE ----------------
+    if is_live_camera:
+        # In live camera mode, we anchor the BEV map to the Ego car!
+        ego_x, ego_y = 0.0, -2.0
+        ax.set_facecolor('#0b0e14') 
+        
+        # Add Compass Directions
+        ax.text(0, 48, "N (Forward)", color="white", fontsize=14, weight="bold", ha="center")
+        ax.text(0, -8, "S (Rear)", color="white", fontsize=14, weight="bold", ha="center", alpha=0.5)
+        ax.text(32, ego_y, "E (Right)", color="white", fontsize=14, weight="bold", ha="left", alpha=0.5)
+        ax.text(-32, ego_y, "W (Left)", color="white", fontsize=14, weight="bold", ha="right", alpha=0.5)
+        
+        plt.grid(True, linestyle='dotted', color='#1a2436', alpha=0.9, zorder=0)
+        
+        theta = np.linspace(np.pi/3, 2 * np.pi/3, 50)
+        fov_range = 60
+        ax.fill_between(
+            [ego_x] + list(ego_x + fov_range * np.cos(theta)) + [ego_x],
+            [ego_y] + list(ego_y + fov_range * np.sin(theta)) + [ego_y],
+            color='#00ffff', alpha=0.1, zorder=1, label='360 Camera / LiDAR FOV'
+        )
+        car_rect = patches.Rectangle((ego_x - 1.2, ego_y - 2.5), 2.4, 5.0, linewidth=2, edgecolor='#00ffff', facecolor='#001a1a', zorder=7, label="Autonomous Ego Vehicle")
+        ax.add_patch(car_rect)
+        
+        ax.set_xlim(-35, 35)
+        ax.set_ylim(-10, 50)
+        map_center_x, map_center_y = 0, 20
+        ego_ref = np.array([ego_x, ego_y], dtype=np.float32)
+    else:
+        map_center_x, map_center_y = points[-1][0], points[-1][1]
+        ego_x, ego_y = map_center_x - 12, map_center_y - 6
+        theta = np.linspace(-np.pi/6, np.pi/6, 50)
+        ax.fill_between(
+            [ego_x] + list(ego_x + 50 * np.cos(theta)) + [ego_x],
+            [ego_y] + list(ego_y + 50 * np.sin(theta)) + [ego_y],
+            color='cyan', alpha=0.15, zorder=2
+        )
+        car_rect = patches.Rectangle((ego_x - 2.4, ego_y - 1.0), 4.8, 2.0, linewidth=2, edgecolor='black', facecolor='cyan', zorder=7)
+        ax.add_patch(car_rect)
+        ax.set_xlim(map_center_x - 15, map_center_x + 35)
+        ax.set_ylim(map_center_y - 20, map_center_y + 20)
+        plt.grid(True, linestyle='solid', color='lightgray', alpha=0.5, zorder=1)
+        ego_ref = np.array([map_center_x, map_center_y], dtype=np.float32)
+
+    if not is_live_camera:
+        render_map_patch(map_center_x, map_center_y, radius=120.0, ax=ax)
+
+    # ---------------- Phase 1 Sensor Fusion Overlay ----------------
+    if is_live_camera and sensor_fusion is not None:
+        lidar_xy = sensor_fusion.get('lidar_xy', None)
+        radar_xy = sensor_fusion.get('radar_xy', None)
+        radar_vel = sensor_fusion.get('radar_vel', None)
+
+        if lidar_xy is not None and len(lidar_xy) > 0:
+            # Remove very-near ego returns to avoid halo clutter around the car.
+            r = np.hypot(lidar_xy[:, 0] - ego_ref[0], lidar_xy[:, 1] - ego_ref[1])
+            lidar_vis = lidar_xy[r > 6.0]
+
+            if presentation_mode:
+                step = 18 if len(lidar_vis) > 12000 else 10
+                lidar_plot = lidar_vis[::step] if len(lidar_vis) > 0 else lidar_vis
+                lidar_size = 3
+                lidar_alpha = 0.10
+            else:
+                lidar_plot = lidar_vis[::4] if len(lidar_vis) > 4000 else lidar_vis
+                lidar_size = 5
+                lidar_alpha = 0.18
+
+            ax.scatter(
+                lidar_plot[:, 0],
+                lidar_plot[:, 1],
+                s=lidar_size,
+                c='#22d3ee',
+                alpha=lidar_alpha,
+                linewidths=0,
+                label='LiDAR occupancy',
+                zorder=2,
+            )
+
+        if radar_xy is not None and len(radar_xy) > 0:
+            if presentation_mode and len(radar_xy) > 180:
+                radar_plot = radar_xy[::2]
+            else:
+                radar_plot = radar_xy
+
+            ax.scatter(
+                radar_plot[:, 0],
+                radar_plot[:, 1],
+                s=18 if presentation_mode else 24,
+                c='#facc15',
+                alpha=0.78 if presentation_mode else 0.85,
+                edgecolors='black',
+                linewidths=0.5,
+                label='Radar returns (multi-ch)',
+                zorder=6,
+            )
+
+            if radar_vel is not None and len(radar_vel) == len(radar_xy):
+                speeds = np.hypot(radar_vel[:, 0], radar_vel[:, 1])
+                if presentation_mode:
+                    idx = np.where(speeds > 0.6)[0]
+                    if len(idx) > 18:
+                        idx = idx[np.argsort(speeds[idx])[-18:]]
+                else:
+                    step = max(1, len(radar_xy) // 40)
+                    idx = np.arange(0, len(radar_xy), step)
+
+                for i in idx:
+                    x0, y0 = radar_xy[i, 0], radar_xy[i, 1]
+                    vx, vy = radar_vel[i, 0], radar_vel[i, 1]
+                    ax.arrow(
+                        x0,
+                        y0,
+                        vx * (0.45 if presentation_mode else 0.6),
+                        vy * (0.45 if presentation_mode else 0.6),
+                        head_width=0.45 if presentation_mode else 0.6,
+                        head_length=0.6 if presentation_mode else 0.8,
+                        fc='#fde68a',
+                        ec='#facc15',
+                        alpha=0.65 if presentation_mode else 0.75,
+                        zorder=6,
+                        length_includes_head=True,
+                    )
+
+    # ---------------- MULTI-AGENT PREDICTIONS ----------------
+    color_map = {'Car': '#ffff00', 'Truck': '#ffaa00', 'Bus': '#ff8800', 'Person': '#ff00ff', 'Bike': '#ff5500'}
+
+    def build_agent_fusion_features(agent_points):
+        if sensor_fusion is None:
+            return None
+
+        lidar_xy = sensor_fusion.get('lidar_xy', None)
+        radar_xy = sensor_fusion.get('radar_xy', None)
+
+        if lidar_xy is None and radar_xy is None:
+            return None
+
+        feats = []
+        for px, py in agent_points:
+            if lidar_xy is not None and len(lidar_xy) > 0:
+                dl = np.hypot(lidar_xy[:, 0] - px, lidar_xy[:, 1] - py)
+                lidar_cnt = int((dl < 2.0).sum())
+            else:
+                lidar_cnt = 0
+
+            if radar_xy is not None and len(radar_xy) > 0:
+                dr = np.hypot(radar_xy[:, 0] - px, radar_xy[:, 1] - py)
+                radar_cnt = int((dr < 2.5).sum())
+            else:
+                radar_cnt = 0
+
+            lidar_norm = min(80.0, float(lidar_cnt)) / 80.0
+            radar_norm = min(30.0, float(radar_cnt)) / 30.0
+            sensor_strength = min(1.0, (float(lidar_cnt) + 2.0 * float(radar_cnt)) / 100.0)
+            feats.append([lidar_norm, radar_norm, sensor_strength])
+
+        return feats
+
+    def classify_mode_direction(hist_x, hist_y, pred_x, pred_y):
+        if len(hist_x) < 2:
+            return 'Straight'
+
+        # Current motion heading from the last observed segment.
+        hx = hist_x[-1] - hist_x[-2]
+        hy = hist_y[-1] - hist_y[-2]
+        if np.hypot(hx, hy) < 1e-6:
+            hx, hy = 0.0, 1.0
+
+        # Predicted heading from current point to mode endpoint.
+        px = pred_x[-1] - hist_x[-1]
+        py = pred_y[-1] - hist_y[-1]
+        if np.hypot(px, py) < 1e-6:
+            return 'Straight'
+
+        angle_deg = np.degrees(np.arctan2(hx * py - hy * px, hx * px + hy * py))
+
+        if abs(angle_deg) <= 30:
+            return 'Straight'
+        if 30 < angle_deg < 140:
+            return 'Left'
+        if -140 < angle_deg < -30:
+            return 'Right'
+        return 'Backward'
+
+    all_agents_to_predict = [(points, 'Person (Primary)')]
+    for i, n_pts in enumerate(sibling_pts):
+        # We now run predictions for ANY vulnerable user (Person or Bicycle)
+        if is_live_camera and n_types[i] in ['Person', 'Bicycle']:
+            all_agents_to_predict.append((n_pts, f"{n_types[i]} {i}"))
+
+    # Keep the live demo readable by limiting displayed VRUs in presentation mode.
+    if is_live_camera and presentation_mode and len(all_agents_to_predict) > max_vru_display:
+        primary = all_agents_to_predict[0]
+        others = all_agents_to_predict[1:]
+
+        def _dist_to_ego(agent_entry):
+            pts = agent_entry[0]
+            if len(pts) == 0:
+                return 1e9
+            px, py = pts[-1][0], pts[-1][1]
+            return float(np.hypot(px - ego_ref[0], py - ego_ref[1]))
+
+        others = sorted(others, key=_dist_to_ego)
+        all_agents_to_predict = [primary] + others[: max(0, max_vru_display - 1)]
+
+    vru_mode_summaries = []
+    vru_counter = 1
+
+    # Predict and plot the future for all identified vulnerable users
+    for agent_pts, label in all_agents_to_predict:
+        fusion_feats = build_agent_fusion_features(agent_pts)
+        pred, probs, attn_weights = predict(agent_pts, sibling_pts, fusion_feats=fusion_feats)
+        tx, ty = [p[0] for p in agent_pts], [p[1] for p in agent_pts]
+        is_primary = 'Primary' in label
+        mode_direction_scores = {}
+        
+        # Plot their history (tail)
+        plt.plot(tx, ty, color='white' if is_primary else '#ff00ff', linestyle='solid' if is_live_camera else 'dashed', linewidth=3, zorder=5)
+        if is_live_camera:
+            point_label = 'Primary VRU (t=0)' if is_primary else 'Target VRU (t=0)'
+        else:
+            point_label = f"{label} (t=0)"
+        plt.scatter(tx[-1], ty[-1], c='white' if is_primary else '#ff00ff', s=250 if is_primary else 150, edgecolors='black', linewidths=2, label=point_label, zorder=8)
+        
+        # --- NEW: Add an extremely obvious Vector Arrow showing their Current Walking Direction ---
+        if len(tx) >= 2:
+            dx_dir = tx[-1] - tx[-2]
+            dy_dir = ty[-1] - ty[-2]
+            dir_mag = np.hypot(dx_dir, dy_dir)
+            if dir_mag > 0.01:
+                # The arrow dynamically scales to their movement speed and points exactly where they are headed!
+                arr_dx, arr_dy = (dx_dir/dir_mag)*3, (dy_dir/dir_mag)*3
+                ax.arrow(tx[-1], ty[-1], arr_dx, arr_dy, head_width=1.5, head_length=2.0, fc='#00ffff', ec='white', zorder=12, width=0.4, alpha=0.9)
+                
+        # Plot their Future prediction paths
+        colors = ['#0088ff', '#ff8800', '#ff0044']
+        mode_curves = []
+        
+        for mode_i in range(pred.shape[0]):
+            x_pred_raw = pred[mode_i][:, 0].numpy()
+            y_pred_raw = pred[mode_i][:, 1].numpy()
+
+            dx = x_pred_raw - x_pred_raw[0]
+            dy = y_pred_raw - y_pred_raw[0]
+            
+            x_pred = tx[-1] + dx * (2.0 if is_live_camera else 4.0)
+            y_pred = ty[-1] + dy * (2.0 if is_live_camera else 4.0)
+            mode_curves.append((mode_i, x_pred, y_pred))
+
+            mode_direction = classify_mode_direction(tx, ty, x_pred, y_pred)
+            mode_prob = float(probs[mode_i].item())
+            mode_direction_scores[mode_direction] = mode_direction_scores.get(mode_direction, 0.0) + mode_prob
+
+        if presentation_mode and is_live_camera:
+            draw_modes = [int(np.argmax(probs.numpy()))]
+        else:
+            draw_modes = [m[0] for m in mode_curves]
+
+        for mode_i, x_pred, y_pred in mode_curves:
+            if mode_i not in draw_modes:
+                continue
+            plt.plot(
+                x_pred,
+                y_pred,
+                color=colors[mode_i],
+                linewidth=3.0 if presentation_mode else 2.5 + (0 if mode_i > 0 else 1),
+                alpha=0.9 if presentation_mode else (0.8 if mode_i == 0 else 0.4),
+                zorder=5,
+            )
+            for t in range(0, len(x_pred), 3 if presentation_mode else 2):
+                plt.scatter(
+                    x_pred[t],
+                    y_pred[t],
+                    color=colors[mode_i],
+                    alpha=max(0.35, 1.0 - (t / 12)),
+                    s=28 if presentation_mode else 40,
+                    zorder=6,
+                )
+
+        # Per-agent Top-3 direction probabilities for live demo readability.
+        sorted_modes = sorted(mode_direction_scores.items(), key=lambda kv: kv[1], reverse=True)
+        top_modes = sorted_modes[:3]
+        vru_id = f"VRU-{vru_counter}" + ("*" if is_primary else "")
+        vru_mode_summaries.append((vru_id, top_modes))
+
+        if is_live_camera and (not presentation_mode) and len(top_modes) > 0:
+            primary_dir, primary_prob = top_modes[0]
+            ax.text(
+                tx[-1] + 0.8,
+                ty[-1] + 1.2,
+                f"{vru_id}: {primary_dir} {primary_prob*100:.0f}%",
+                fontsize=8,
+                color='white',
+                bbox=dict(facecolor='#111827', edgecolor='#60a5fa', alpha=0.8, boxstyle='round,pad=0.2'),
+                zorder=13
+            )
+        vru_counter += 1
+
+    # ---------------- PLOT NEIGHBORS (Vehicles/Trucks) ----------------
+    for i, n_pts in enumerate(sibling_pts):
+        if is_live_camera and n_types[i] in ['Person', 'Bicycle']:
+            continue # Already predicted above
+            
+        n_type = n_types[i]
+        n_color = color_map.get(n_type, 'yellow')
+        n_x, n_y = [p[0] for p in n_pts], [p[1] for p in n_pts]
+        
+        marker_size = 400 if n_type in ['Truck', 'Bus'] else 200
+        marker_shape = 's' if n_type in ['Truck', 'Bus'] else 'o'
+        
+        plt.plot(n_x, n_y, color=n_color, linestyle=':', linewidth=2, zorder=4)
+        plt.scatter(n_x[-1], n_y[-1], c=n_color, marker=marker_shape, s=marker_size, edgecolors='white' if is_live_camera else 'black', linewidth=1.5, label=f'Moving ({n_type})', zorder=7)
+
+    # UI Embellishments
+    plt.title("Ego-Centric BEV Matrix: Multi-Agent Parallel Forecasting", color="white" if is_live_camera else "black", fontsize=20, weight='bold', pad=15)
+    plt.xlabel("X Lateral Offset (meters)", color="white" if is_live_camera else "black", weight='bold', fontsize=13)
+    plt.ylabel("Y Depth Offset (meters)", color="white" if is_live_camera else "black", weight='bold', fontsize=13)
+
+    if is_live_camera:
+        ax.tick_params(axis='both', colors='white', labelsize=11)
+        for spine in ax.spines.values():
+            spine.set_color('#94a3b8')
+
+    handles, labels = ax.get_legend_handles_labels()
+    unique_labels, unique_handles = [], []
+    for h, l in zip(handles, labels):
+        if l not in unique_labels:
+            unique_labels.append(l)
+            unique_handles.append(h)
+
+    if is_live_camera:
+        leg = ax.legend(
+            unique_handles,
+            unique_labels,
+            loc='upper right',
+            fancybox=True,
+            framealpha=0.95,
+            facecolor='#111827',
+            edgecolor='#94a3b8',
+            fontsize=10,
+            title='Legend'
+        )
+        plt.setp(leg.get_texts(), color='white')
+        plt.setp(leg.get_title(), color='white', weight='bold')
+
+        if len(vru_mode_summaries) > 0:
+            summary_lines = ["Top-3 Direction Probabilities"]
+            summary_lines.append("VRU-* = primary target")
+            for vru_id, top_modes in vru_mode_summaries[:max_vru_display]:
+                mode_text = " | ".join([f"{name}:{prob*100:.0f}%" for name, prob in top_modes])
+                summary_lines.append(f"{vru_id} -> {mode_text}")
+
+            fig.subplots_adjust(right=0.80)
+            ax.text(
+                1.02,
+                0.62,
+                "\n".join(summary_lines),
+                transform=ax.transAxes,
+                va='top',
+                ha='left',
+                fontsize=9,
+                color='white',
+                bbox=dict(facecolor='#0f172a', edgecolor='#60a5fa', alpha=0.95, boxstyle='round,pad=0.4')
+            )
+    else:
+        leg = ax.legend(unique_handles, unique_labels, loc='upper left', bbox_to_anchor=(1.02, 1.0), fancybox=True, framealpha=0.9)
+
+    ax.set_aspect('equal', adjustable='box')
+    return fig
+
+if __name__ == "__main__":
+    main_pedestrian = [(0, 0), (10, 0), (20, 0), (30, 0)]
+    plot_scene(main_pedestrian, is_live_camera=True)
diff --git a/backend/app/main.py b/backend/app/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c2f1503ce5fe0ecade6811827fa8a2e96945a94
--- /dev/null
+++ b/backend/app/main.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from .api.dependencies import pipeline
+from .api.routes.health import router as health_router
+from .api.routes.live import get_live_frame_image, resolve_dataset_frame_path, router as live_router
+from .api.routes.predict import router as predict_router
+from .core.serialization import build_prediction_payload
+
+def create_app() -> FastAPI:
+    app = FastAPI(
+        title="BEV Trajectory Backend",
+        version="0.2.0",
+        description="Structured FastAPI backend for CV + trajectory prediction",
+    )
+
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+    app.include_router(health_router, prefix="/api", tags=["health"])
+    app.include_router(live_router, prefix="/api", tags=["live"])
+    app.include_router(predict_router, prefix="/api", tags=["predict"])
+    return app
+
+
+app = create_app()
+
+
+__all__ = [
+    "app",
+    "create_app",
+    "pipeline",
+    "build_prediction_payload",
+    "resolve_dataset_frame_path",
+    "get_live_frame_image",
+]
diff --git a/backend/app/ml/__init__.py b/backend/app/ml/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5795e31dea16fa1158ca07c4f61487d6ec67ca50
--- /dev/null
+++ b/backend/app/ml/__init__.py
@@ -0,0 +1 @@
+"""Runtime ML modules used by the FastAPI pipeline."""
diff --git a/backend/app/ml/inference.py b/backend/app/ml/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..e228f3970d5ae181c9a9c83be8706a87f6dd43a1
--- /dev/null
+++ b/backend/app/ml/inference.py
@@ -0,0 +1,172 @@
+from pathlib import Path
+
+import torch
+from .model import TrajectoryTransformer
+from .model_fusion import TrajectoryTransformerFusion
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+MODEL_DIR = REPO_ROOT / "models"
+FUSION_CKPT = MODEL_DIR / "best_social_model_fusion.pth"
+BASE_CKPT = MODEL_DIR / "best_social_model.pth"
+
+# ----------------------------
+# LOAD MODEL
+# ----------------------------
+USING_FUSION_MODEL = False
+
+if FUSION_CKPT.exists():
+    model = TrajectoryTransformerFusion(fusion_dim=3).to(device)
+    try:
+        model.load_state_dict(torch.load(FUSION_CKPT, map_location=device))
+        USING_FUSION_MODEL = True
+        print("Inference: Loaded Phase 2 fusion checkpoint (best_social_model_fusion.pth).")
+    except Exception as e:
+        print(f"Warning: could not load fusion checkpoint ({e}). Falling back to base model.")
+        model = TrajectoryTransformer().to(device)
+        try:
+            model.load_state_dict(torch.load(BASE_CKPT, map_location=device))
+            print("Inference: Loaded base checkpoint (best_social_model.pth).")
+        except Exception as e2:
+            print(f"Warning: could not load base checkpoint ({e2}).")
+else:
+    model = TrajectoryTransformer().to(device)
+    try:
+        model.load_state_dict(torch.load(BASE_CKPT, map_location=device))
+        print("Inference: Loaded base checkpoint (best_social_model.pth).")
+    except Exception as e:
+        print(f"Warning: could not load model weights ({e}), starting fresh.")
+
+model.eval()
+
+
+# ----------------------------
+# PREPROCESS INPUT
+# ----------------------------
+def prepare_input(points):
+    import math
+    x3, y3 = points[3]
+    window = [[x - x3, y - y3] for x, y in points]
+
+    vel = []
+    for j in range(len(window)):
+        if j == 0:
+            vel.append([0, 0, 0, 0, 0])
+        else:
+            dx = window[j][0] - window[j-1][0]
+            dy = window[j][1] - window[j-1][1]
+            speed = math.hypot(dx, dy)
+            if speed > 1e-5:
+                sin_t = dy / speed
+                cos_t = dx / speed
+            else:
+                sin_t = 0.0
+                cos_t = 0.0
+            vel.append([dx, dy, speed, sin_t, cos_t])
+
+    obs = []
+    for j in range(4):
+        obs.append([
+            window[j][0],
+            window[j][1],
+            vel[j][0],
+            vel[j][1],
+            vel[j][2],
+            vel[j][3],
+            vel[j][4]
+        ])
+
+    return obs, (x3, y3)
+
+
+# ----------------------------
+# PREDICTION FUNCTION
+# ----------------------------
+def predict(points, neighbor_points_list=None, fusion_feats=None):
+    if neighbor_points_list is None:
+        neighbor_points_list = []
+        
+    obs, origin = prepare_input(points)
+
+    obs = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(device)  # (1,4,7)
+
+    # Prepare neighbors exactly as the main trajectory
+    import math
+    x1, y1 = points[-1]
+    neighbors = []
+    for np_points in neighbor_points_list:
+        n_window = [[x - x1, y - y1] for x, y in np_points]
+        vel_n = []
+        for j in range(len(n_window)):
+            if j == 0:
+                vel_n.append([0, 0, 0, 0, 0])
+            else:
+                dx = n_window[j][0] - n_window[j-1][0]
+                dy = n_window[j][1] - n_window[j-1][1]
+                speed = math.hypot(dx, dy)
+                if speed > 1e-5:
+                    sin_t = dy / speed
+                    cos_t = dx / speed
+                else:
+                    sin_t = 0.0
+                    cos_t = 0.0
+                vel_n.append([dx, dy, speed, sin_t, cos_t])
+        
+        n_obs = []
+        for j in range(4):
+            n_obs.append([
+                n_window[j][0], n_window[j][1],
+                vel_n[j][0], vel_n[j][1], vel_n[j][2], vel_n[j][3], vel_n[j][4]
+            ])
+        neighbors.append(n_obs)
+
+    neighbors_batch = [neighbors]  # batch size = 1
+
+    with torch.no_grad():
+        if USING_FUSION_MODEL:
+            if fusion_feats is None:
+                fusion_tensor = torch.zeros((1, 4, 3), dtype=torch.float32, device=device)
+            else:
+                fusion_tensor = torch.tensor(fusion_feats, dtype=torch.float32).unsqueeze(0).to(device)
+            pred, goals, probs, attn_weights = model(obs, neighbors_batch, fusion_tensor)
+        else:
+            pred, goals, probs, attn_weights = model(obs, neighbors_batch)
+
+    pred = pred.squeeze(0).cpu()
+    probs = probs.squeeze(0).cpu()
+
+    if attn_weights and attn_weights[0] is not None:
+        attn_weights = [w.cpu() for w in attn_weights]
+
+    # convert back to real coordinates
+    x0, y0 = origin
+    pred_real = pred.clone()
+
+    pred_real[:, :, 0] += x0
+    pred_real[:, :, 1] += y0
+
+    return pred_real, probs, attn_weights
+
+
+# ----------------------------
+# DEMO RUN
+# ----------------------------
+if __name__ == "__main__":
+
+    points = [
+        (0, 0),
+        (10, 0),
+        (20, 0),
+        (30, 0)
+    ]
+
+    pred, probs, _ = predict(points)
+
+    print("\nInput Points:")
+    print(points)
+
+    print("\nPredicted Trajectories (Real Coordinates):")
+    for i in range(pred.shape[0]):
+        print(f"\nTrajectory {i+1} (prob={probs[i].item():.2f}):")
+        print(pred[i])
\ No newline at end of file
diff --git a/backend/app/ml/model.py b/backend/app/ml/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ebf20d6dc72c80fb32c1339b0b71fb0e548231a
--- /dev/null
+++ b/backend/app/ml/model.py
@@ -0,0 +1,145 @@
+import torch
+import torch.nn as nn
+import math
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=100):
+        super().__init__()
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0) # (1, max_len, d_model)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        return x + self.pe[:, :x.size(1), :]
+
+class TrajectoryTransformer(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.d_model = 64
+        
+        # 1. Feature Embedding & Positional Encoding
+        self.embed = nn.Linear(7, self.d_model)
+        self.pos_enc = PositionalEncoding(self.d_model)
+
+        # 2. Transformer Sequence Encoder (Replaces LSTM)
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=self.d_model, nhead=4, dim_feedforward=256, batch_first=True
+        )
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
+
+        # 3. Social Attention (Target queries Neighbors)
+        self.social_attn = nn.MultiheadAttention(embed_dim=self.d_model, num_heads=4, batch_first=True)
+
+        self.K = 3  # number of future modes
+
+        # 4. GOAL-CONDITIONED ARCHITECTURE 
+        # Base hidden context: Target (64) + Social (64) = 128
+        self.hidden_dim = 128
+        self.future_len = 12  # Now predicting 6 seconds into future
+        
+        # Step A: Predict exactly K distinct endpoints (goals)
+        self.goal_head = nn.Sequential(
+            nn.Linear(self.hidden_dim, 64),
+            nn.ReLU(),
+            nn.Linear(64, self.K * 2) # X, Y for K goals
+        )
+
+        # Step B: Given the encoded context PLUS a specific Goal, draw the path to get there
+        self.traj_head = nn.Sequential(
+            nn.Linear(self.hidden_dim + 2, 128),
+            nn.ReLU(),
+            nn.Linear(128, self.future_len * 2) # 12 steps to reach the destination
+        )
+
+        # 5. Probabilities of each mode
+        self.prob_head = nn.Linear(self.hidden_dim, self.K)
+
+    # ----------------------------
+    # SOCIAL POOLING
+    # ----------------------------
+    def social_pool(self, h_target, neighbor_h_list, device):
+        if len(neighbor_h_list) == 0:
+            return torch.zeros(self.d_model, device=device), None
+
+        # h_target: (64) -> query: (1, 1, 64)
+        query = h_target.unsqueeze(0).unsqueeze(0)
+        
+        # neighbor_h_list: N x 64 -> key, value: (1, N, 64)
+        neighbor_h_tensor = torch.stack(neighbor_h_list).unsqueeze(0)
+        
+        # apply attention
+        attn_output, attn_weights = self.social_attn(query, neighbor_h_tensor, neighbor_h_tensor)
+        
+        return attn_output.squeeze(0).squeeze(0), attn_weights.squeeze(0)
+
+    # ----------------------------
+    # FORWARD PASS
+    # ----------------------------
+    def forward(self, x, neighbors):
+        """
+        x: (B, 4, 7)
+        neighbors: list of length B
+        """
+
+        B = x.size(0)
+        device = x.device
+
+        # Encode main trajectory sequence with Transformer
+        x_emb = self.embed(x)
+        x_emb = self.pos_enc(x_emb)
+        enc_out = self.transformer_encoder(x_emb)
+        h = enc_out[:, -1, :]  # Grab context from last timestep (B, 64)
+
+        final_h = []
+        batch_attn_weights = []
+
+        # Loop through batch to handle variable size neighbors
+        for i in range(B):
+            h_target = h[i]  # (64)
+
+            neighbor_h_list = []
+            for n in neighbors[i]:
+                n_tensor = torch.tensor(n, dtype=torch.float32, device=device).unsqueeze(0)
+
+                n_emb = self.pos_enc(self.embed(n_tensor))
+                n_enc_out = self.transformer_encoder(n_emb)
+                
+                neighbor_h_list.append(n_enc_out[0, -1, :])  # (64)
+                
+            # Social attention pooling
+            h_social, attn_weights = self.social_pool(h_target, neighbor_h_list, device)
+            batch_attn_weights.append(attn_weights)
+
+            # Combine Target and Social context
+            h_combined = torch.cat([h_target, h_social], dim=0)  # (128)
+            final_h.append(h_combined)
+
+        h_final = torch.stack(final_h)  # (B, 128)
+
+        # GOAL-CONDITIONED LOGIC
+        # 1. Predict Goals (End-points at t=6)
+        goals = self.goal_head(h_final)
+        goals = goals.view(B, self.K, 2)  # (B, K, 2)
+        
+        # 2. Condition trajectories on the predicted goals
+        trajs = []
+        for k in range(self.K):
+            goal_k = goals[:, k, :] # Get the k-th destination (B, 2)
+            # Concat the base context array with the goal coordinate!
+            conditioned_context = torch.cat([h_final, goal_k], dim=1) # (B, 130)
+            
+            # Predict the path given the condition
+            traj_k = self.traj_head(conditioned_context).view(B, 1, self.future_len, 2)
+            trajs.append(traj_k)
+            
+        traj = torch.cat(trajs, dim=1) # (B, K, 12, 2)
+
+        # 3. Mode Probabilities
+        probs = self.prob_head(h_final)
+        probs = torch.softmax(probs, dim=1)
+
+        return traj, goals, probs, batch_attn_weights
\ No newline at end of file
diff --git a/backend/app/ml/model_fusion.py b/backend/app/ml/model_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..4564b9018c0be81a3315c9b333c7cab50ae5452f
--- /dev/null
+++ b/backend/app/ml/model_fusion.py
@@ -0,0 +1,138 @@
+import math
+
+import torch
+import torch.nn as nn
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=100):
+        super().__init__()
+        pe = torch.zeros(max_len, d_model)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        return x + self.pe[:, :x.size(1), :]
+
+
+class TrajectoryTransformerFusion(nn.Module):
+    def __init__(self, fusion_dim=3):
+        super().__init__()
+        self.d_model = 64
+
+        # Base kinematic embedding from original model features.
+        self.base_embed = nn.Linear(7, self.d_model)
+
+        # Fusion branch: LiDAR/Radar strength features per timestep.
+        self.fusion_embed = nn.Linear(fusion_dim, self.d_model)
+        self.fusion_ln = nn.LayerNorm(self.d_model)
+
+        self.pos_enc = PositionalEncoding(self.d_model)
+
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=self.d_model,
+            nhead=4,
+            dim_feedforward=256,
+            batch_first=True,
+        )
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
+
+        self.social_attn = nn.MultiheadAttention(embed_dim=self.d_model, num_heads=4, batch_first=True)
+
+        self.K = 3
+        self.hidden_dim = 128
+        self.future_len = 12
+
+        self.goal_head = nn.Sequential(
+            nn.Linear(self.hidden_dim, 64),
+            nn.ReLU(),
+            nn.Linear(64, self.K * 2),
+        )
+
+        self.traj_head = nn.Sequential(
+            nn.Linear(self.hidden_dim + 2, 128),
+            nn.ReLU(),
+            nn.Linear(128, self.future_len * 2),
+        )
+
+        self.prob_head = nn.Linear(self.hidden_dim, self.K)
+
+    def load_from_base_checkpoint(self, ckpt_path, map_location='cpu'):
+        state = torch.load(ckpt_path, map_location=map_location)
+
+        remap = {}
+        for k, v in state.items():
+            if k.startswith('embed.'):
+                remap['base_embed.' + k[len('embed.'):]] = v
+            else:
+                remap[k] = v
+
+        missing, unexpected = self.load_state_dict(remap, strict=False)
+        return missing, unexpected
+
+    def social_pool(self, h_target, neighbor_h_list, device):
+        if len(neighbor_h_list) == 0:
+            return torch.zeros(self.d_model, device=device), None
+
+        query = h_target.unsqueeze(0).unsqueeze(0)
+        neighbor_h_tensor = torch.stack(neighbor_h_list).unsqueeze(0)
+        attn_output, attn_weights = self.social_attn(query, neighbor_h_tensor, neighbor_h_tensor)
+        return attn_output.squeeze(0).squeeze(0), attn_weights.squeeze(0)
+
+    def forward(self, x, neighbors, fusion_feats=None):
+        """
+        x: (B, 4, 7)
+        neighbors: list length B, each element is list of neighbors with shape (4, 7)
+        fusion_feats: (B, 4, F) where F=3 [lidar_pts_norm, radar_pts_norm, sensor_strength]
+        """
+        B = x.size(0)
+        device = x.device
+
+        x_emb = self.base_embed(x)
+        if fusion_feats is not None:
+            x_emb = self.fusion_ln(x_emb + self.fusion_embed(fusion_feats))
+
+        x_emb = self.pos_enc(x_emb)
+        enc_out = self.transformer_encoder(x_emb)
+        h = enc_out[:, -1, :]
+
+        final_h = []
+        batch_attn_weights = []
+
+        for i in range(B):
+            h_target = h[i]
+            neighbor_h_list = []
+
+            for n in neighbors[i]:
+                n_tensor = torch.as_tensor(n, dtype=torch.float32, device=device).unsqueeze(0)
+                n_emb = self.pos_enc(self.base_embed(n_tensor))
+                n_enc_out = self.transformer_encoder(n_emb)
+                neighbor_h_list.append(n_enc_out[0, -1, :])
+
+            h_social, attn_weights = self.social_pool(h_target, neighbor_h_list, device)
+            batch_attn_weights.append(attn_weights)
+
+            h_combined = torch.cat([h_target, h_social], dim=0)
+            final_h.append(h_combined)
+
+        h_final = torch.stack(final_h)
+
+        goals = self.goal_head(h_final).view(B, self.K, 2)
+
+        trajs = []
+        for k in range(self.K):
+            goal_k = goals[:, k, :]
+            conditioned_context = torch.cat([h_final, goal_k], dim=1)
+            traj_k = self.traj_head(conditioned_context).view(B, 1, self.future_len, 2)
+            trajs.append(traj_k)
+
+        traj = torch.cat(trajs, dim=1)
+
+        probs = self.prob_head(h_final)
+        probs = torch.softmax(probs, dim=1)
+
+        return traj, goals, probs, batch_attn_weights
diff --git a/backend/app/ml/sensor_fusion.py b/backend/app/ml/sensor_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..5289a386e936d54cbe58d2df69ab5c0ad836a4d8
--- /dev/null
+++ b/backend/app/ml/sensor_fusion.py
@@ -0,0 +1,396 @@
+import json
+import os
+from collections import defaultdict
+from functools import lru_cache
+
+import numpy as np
+
+
+@lru_cache(maxsize=1)
+def _load_sample_data_index(data_root: str, version: str):
+    sample_data_path = os.path.join(data_root, version, "sample_data.json")
+    with open(sample_data_path, "r", encoding="utf-8") as f:
+        records = json.load(f)
+
+    by_basename = {}
+    by_sample_token = defaultdict(list)
+
+    for rec in records:
+        basename = os.path.basename(rec.get("filename", ""))
+        if basename:
+            by_basename[basename] = rec
+        token = rec.get("sample_token")
+        if token:
+            by_sample_token[token].append(rec)
+
+    return by_basename, dict(by_sample_token)
+
+
+@lru_cache(maxsize=1)
+def _load_calibrated_sensor_index(data_root: str, version: str):
+    calib_path = os.path.join(data_root, version, "calibrated_sensor.json")
+    with open(calib_path, "r", encoding="utf-8") as f:
+        records = json.load(f)
+    return {r["token"]: r for r in records}
+
+
+def _channel_from_filename(rel_path: str) -> str:
+    parts = rel_path.replace("\\", "/").split("/")
+    if len(parts) >= 2:
+        return parts[1]
+    return ""
+
+
+def _quat_wxyz_to_rot(q):
+    # nuScenes stores quaternion as [w, x, y, z]
+    w, x, y, z = q
+    n = np.sqrt(w * w + x * x + y * y + z * z)
+    if n < 1e-12:
+        return np.eye(3, dtype=np.float32)
+
+    w, x, y, z = w / n, x / n, y / n, z / n
+
+    return np.array(
+        [
+            [1 - 2 * (y * y + z * z), 2 * (x * y - z * w), 2 * (x * z + y * w)],
+            [2 * (x * y + z * w), 1 - 2 * (x * x + z * z), 2 * (y * z - x * w)],
+            [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x * x + y * y)],
+        ],
+        dtype=np.float32,
+    )
+
+
+def _transform_points_sensor_to_ego(points_xyz: np.ndarray, calib: dict):
+    if points_xyz.size == 0:
+        return points_xyz
+
+    if calib is None:
+        return points_xyz
+
+    rot = _quat_wxyz_to_rot(calib.get("rotation", [1.0, 0.0, 0.0, 0.0]))
+    t = np.asarray(calib.get("translation", [0.0, 0.0, 0.0]), dtype=np.float32)
+
+    # Row-vector form: p_ego = p_sensor * R^T + t
+    return points_xyz @ rot.T + t
+
+
+def _transform_vel_sensor_to_ego(vel_xy: np.ndarray, calib: dict):
+    if vel_xy.size == 0:
+        return vel_xy
+
+    if calib is None:
+        return vel_xy
+
+    rot = _quat_wxyz_to_rot(calib.get("rotation", [1.0, 0.0, 0.0, 0.0]))
+
+    v_xyz = np.zeros((vel_xy.shape[0], 3), dtype=np.float32)
+    v_xyz[:, 0] = vel_xy[:, 0]
+    v_xyz[:, 1] = vel_xy[:, 1]
+
+    v_ego = v_xyz @ rot.T
+    return v_ego[:, :2].astype(np.float32)
+
+
+def _load_lidar_pcd_bin(file_path: str) -> np.ndarray:
+    arr = np.fromfile(file_path, dtype=np.float32)
+    if arr.size == 0:
+        return np.zeros((0, 3), dtype=np.float32)
+
+    # nuScenes lidar .pcd.bin is typically [x, y, z, intensity, ring_index]
+    if arr.size % 5 == 0:
+        pts = arr.reshape(-1, 5)[:, :3]
+    elif arr.size % 4 == 0:
+        pts = arr.reshape(-1, 4)[:, :3]
+    else:
+        usable = (arr.size // 3) * 3
+        pts = arr[:usable].reshape(-1, 3)
+
+    return pts.astype(np.float32)
+
+
+def _parse_pcd_binary(file_path: str):
+    # Minimal PCD parser for nuScenes radar files (DATA binary).
+    with open(file_path, "rb") as f:
+        raw = f.read()
+
+    header_end = raw.find(b"DATA binary")
+    if header_end == -1:
+        return {}
+
+    line_end = raw.find(b"\n", header_end)
+    if line_end == -1:
+        return {}
+
+    header_blob = raw[: line_end + 1].decode("utf-8", errors="ignore")
+    data_blob = raw[line_end + 1 :]
+
+    header = {}
+    for line in header_blob.splitlines():
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        key, *vals = line.split()
+        header[key.upper()] = vals
+
+    fields = header.get("FIELDS", [])
+    sizes = [int(x) for x in header.get("SIZE", [])]
+    types = header.get("TYPE", [])
+    counts = [int(x) for x in header.get("COUNT", [])]
+    points = int(header.get("POINTS", ["0"])[0])
+
+    if not fields or not sizes or not types or not counts or points <= 0:
+        return {}
+
+    np_map = {
+        ("F", 4): np.float32,
+        ("F", 8): np.float64,
+        ("I", 1): np.int8,
+        ("I", 2): np.int16,
+        ("I", 4): np.int32,
+        ("U", 1): np.uint8,
+        ("U", 2): np.uint16,
+        ("U", 4): np.uint32,
+    }
+
+    dtype_parts = []
+    expanded_fields = []
+    for field, size, typ, cnt in zip(fields, sizes, types, counts):
+        base = np_map.get((typ, size), np.float32)
+        if cnt == 1:
+            dtype_parts.append((field, base))
+            expanded_fields.append(field)
+        else:
+            for i in range(cnt):
+                name = f"{field}_{i}"
+                dtype_parts.append((name, base))
+                expanded_fields.append(name)
+
+    point_dtype = np.dtype(dtype_parts)
+    byte_need = point_dtype.itemsize * points
+    if len(data_blob) < byte_need:
+        return {}
+
+    rec = np.frombuffer(data_blob[:byte_need], dtype=point_dtype, count=points)
+    out = {}
+    for name in expanded_fields:
+        out[name] = rec[name]
+    return out
+
+
+def _load_radar_pcd(file_path: str):
+    fields = _parse_pcd_binary(file_path)
+    if not fields:
+        return np.zeros((0, 3), dtype=np.float32), np.zeros((0, 2), dtype=np.float32)
+
+    x = fields.get("x")
+    y = fields.get("y")
+    z = fields.get("z")
+
+    # Prefer compensated velocity fields when available.
+    vx = fields.get("vx_comp", fields.get("vx"))
+    vy = fields.get("vy_comp", fields.get("vy"))
+
+    if x is None or y is None or z is None:
+        return np.zeros((0, 3), dtype=np.float32), np.zeros((0, 2), dtype=np.float32)
+
+    if vx is None:
+        vx = np.zeros_like(x)
+    if vy is None:
+        vy = np.zeros_like(y)
+
+    pts = np.stack([x, y, z], axis=1).astype(np.float32)
+    vel = np.stack([vx, vy], axis=1).astype(np.float32)
+    return pts, vel
+
+
+def _ego_xyz_to_bev(points_xyz: np.ndarray):
+    # Ego frame: +x front, +y left, +z up
+    # BEV UI: +x right, +y forward
+    if points_xyz.size == 0:
+        return np.zeros((0, 2), dtype=np.float32)
+    x_bev = -points_xyz[:, 1]
+    y_bev = points_xyz[:, 0]
+    return np.stack([x_bev, y_bev], axis=1).astype(np.float32)
+
+
+def _ego_vel_to_bev(vxy_ego: np.ndarray):
+    if vxy_ego.size == 0:
+        return np.zeros((0, 2), dtype=np.float32)
+    vx_bev = -vxy_ego[:, 1]
+    vy_bev = vxy_ego[:, 0]
+    return np.stack([vx_bev, vy_bev], axis=1).astype(np.float32)
+
+
+def load_fusion_for_cam_frame(cam_filename: str, data_root: str = "DataSet", version: str = "v1.0-mini"):
+    by_basename, by_sample = _load_sample_data_index(data_root, version)
+    calib_by_token = _load_calibrated_sensor_index(data_root, version)
+
+    basename = os.path.basename(cam_filename)
+    cam_rec = by_basename.get(basename)
+    if not cam_rec:
+        return None
+
+    sample_token = cam_rec.get("sample_token")
+    if not sample_token:
+        return None
+
+    related = by_sample.get(sample_token, [])
+
+    lidar_rec = None
+    radar_recs = {}
+    radar_channels = [
+        "RADAR_FRONT",
+        "RADAR_FRONT_LEFT",
+        "RADAR_FRONT_RIGHT",
+        "RADAR_BACK_LEFT",
+        "RADAR_BACK_RIGHT",
+    ]
+
+    for rec in related:
+        rel = rec.get("filename", "")
+        if not rel.startswith("samples/"):
+            continue
+
+        ch = _channel_from_filename(rel)
+        if ch == "LIDAR_TOP":
+            lidar_rec = rec
+        elif ch in radar_channels:
+            radar_recs[ch] = rec
+
+    lidar_bev = np.zeros((0, 2), dtype=np.float32)
+    lidar_path = None
+
+    if lidar_rec is not None:
+        lidar_path = os.path.join(data_root, lidar_rec.get("filename", ""))
+        if os.path.exists(lidar_path):
+            lidar_xyz = _load_lidar_pcd_bin(lidar_path)
+            lidar_calib = calib_by_token.get(lidar_rec.get("calibrated_sensor_token"))
+            lidar_xyz_ego = _transform_points_sensor_to_ego(lidar_xyz, lidar_calib)
+            lidar_bev = _ego_xyz_to_bev(lidar_xyz_ego)
+
+    radar_xy_list = []
+    radar_vel_list = []
+    radar_paths = {}
+    radar_channel_counts = {}
+
+    for ch in radar_channels:
+        rec = radar_recs.get(ch)
+        if rec is None:
+            continue
+
+        p = os.path.join(data_root, rec.get("filename", ""))
+        radar_paths[ch] = p
+        if not os.path.exists(p):
+            radar_channel_counts[ch] = 0
+            continue
+
+        radar_xyz, radar_vel_xy = _load_radar_pcd(p)
+        radar_calib = calib_by_token.get(rec.get("calibrated_sensor_token"))
+
+        radar_xyz_ego = _transform_points_sensor_to_ego(radar_xyz, radar_calib)
+        radar_vel_ego = _transform_vel_sensor_to_ego(radar_vel_xy, radar_calib)
+
+        radar_bev = _ego_xyz_to_bev(radar_xyz_ego)
+        radar_vel_bev = _ego_vel_to_bev(radar_vel_ego)
+
+        if radar_bev.size > 0:
+            m_ch = (
+                (radar_bev[:, 1] > -20.0)
+                & (radar_bev[:, 1] < 100.0)
+                & (radar_bev[:, 0] > -70.0)
+                & (radar_bev[:, 0] < 70.0)
+            )
+            radar_bev = radar_bev[m_ch]
+            radar_vel_bev = radar_vel_bev[m_ch]
+
+        radar_channel_counts[ch] = int(radar_bev.shape[0])
+
+        if radar_bev.size > 0:
+            radar_xy_list.append(radar_bev)
+            radar_vel_list.append(radar_vel_bev)
+
+    if radar_xy_list:
+        radar_bev_all = np.concatenate(radar_xy_list, axis=0).astype(np.float32)
+        radar_vel_all = np.concatenate(radar_vel_list, axis=0).astype(np.float32)
+    else:
+        radar_bev_all = np.zeros((0, 2), dtype=np.float32)
+        radar_vel_all = np.zeros((0, 2), dtype=np.float32)
+
+    # Keep interaction region for live BEV visualization.
+    if lidar_bev.size > 0:
+        m = (
+            (lidar_bev[:, 1] > -15.0)
+            & (lidar_bev[:, 1] < 85.0)
+            & (lidar_bev[:, 0] > -60.0)
+            & (lidar_bev[:, 0] < 60.0)
+        )
+        lidar_bev = lidar_bev[m]
+
+    if radar_bev_all.size > 0:
+        m = (
+            (radar_bev_all[:, 1] > -20.0)
+            & (radar_bev_all[:, 1] < 100.0)
+            & (radar_bev_all[:, 0] > -70.0)
+            & (radar_bev_all[:, 0] < 70.0)
+        )
+        radar_bev_all = radar_bev_all[m]
+        if radar_vel_all.shape[0] == m.shape[0]:
+            radar_vel_all = radar_vel_all[m]
+
+    return {
+        "sample_token": sample_token,
+        "lidar_xy": lidar_bev,
+        "radar_xy": radar_bev_all,
+        "radar_vel": radar_vel_all,
+        "lidar_path": lidar_path,
+        "radar_path": radar_paths.get("RADAR_FRONT"),
+        "radar_paths": radar_paths,
+        "radar_channel_counts": radar_channel_counts,
+    }
+
+
+def radar_stabilize_motion(tracked_agents, fusion_data, dt_seconds: float = 0.5):
+    if not fusion_data:
+        return tracked_agents
+
+    radar_xy = fusion_data.get("radar_xy")
+    radar_vel = fusion_data.get("radar_vel")
+
+    if radar_xy is None or radar_vel is None or len(radar_xy) == 0:
+        return tracked_agents
+
+    stabilized = []
+
+    for agent in tracked_agents:
+        if agent.get("type") not in ["Person", "Bicycle", "Car", "Truck", "Bus", "Motorcycle"]:
+            stabilized.append(agent)
+            continue
+
+        x_curr, y_curr = agent["history"][-1]
+        d = np.hypot(radar_xy[:, 0] - x_curr, radar_xy[:, 1] - y_curr)
+        near_idx = np.where(d < 3.0)[0]
+
+        if near_idx.size > 0:
+            rv = radar_vel[near_idx].mean(axis=0)
+            radar_dx = float(rv[0] * dt_seconds)
+            radar_dy = float(rv[1] * dt_seconds)
+
+            cam_dx = float(agent.get("dx", 0.0))
+            cam_dy = float(agent.get("dy", 0.0))
+
+            fused_dx = 0.7 * cam_dx + 0.3 * radar_dx
+            fused_dy = 0.7 * cam_dy + 0.3 * radar_dy
+
+            x4, y4 = x_curr, y_curr
+            h3 = (x4 - 3.0 * fused_dx, y4 - 3.0 * fused_dy)
+            h2 = (x4 - 2.0 * fused_dx, y4 - 2.0 * fused_dy)
+            h1 = (x4 - 1.0 * fused_dx, y4 - 1.0 * fused_dy)
+
+            agent = dict(agent)
+            agent["dx"] = fused_dx
+            agent["dy"] = fused_dy
+            agent["history"] = [h3, h2, h1, (x4, y4)]
+
+        stabilized.append(agent)
+
+    return stabilized
diff --git a/backend/app/schemas.py b/backend/app/schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..a09c0f88eb42a07ab36fd7f51c854ba0b9914399
--- /dev/null
+++ b/backend/app/schemas.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class Point2D(BaseModel):
+    x: float
+    y: float
+
+
+class AgentState(BaseModel):
+    id: int
+    type: str
+    raw_label: str | None = None
+    history: list[Point2D] = Field(default_factory=list)
+    predictions: list[list[Point2D]] = Field(default_factory=list)
+    probabilities: list[float] = Field(default_factory=list)
+    is_target: bool = False
+
+
+class LiveFusionRequest(BaseModel):
+    anchor_idx: int = Field(default=3, ge=0)
+    score_threshold: float = Field(default=0.35, ge=0.0, le=1.0)
+    tracking_gate_px: float = Field(default=130.0, ge=1.0, le=500.0)
+    use_pose: bool = False
+
+
+class PredictionResponse(BaseModel):
+    mode: str
+    target_track_id: int | None = None
+    agents: list[AgentState] = Field(default_factory=list)
+    meta: dict[str, Any] = Field(default_factory=dict)
+    detections: dict[str, Any] | None = None
+    sensors: dict[str, Any] | None = None
+    scene_geometry: dict[str, Any] | None = None
+
+    model_config = ConfigDict(extra="allow")
diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad325d28e296d2f4ac335f562e4b4a5c3ec29a56
--- /dev/null
+++ b/backend/app/services/__init__.py
@@ -0,0 +1 @@
+"""Service layer for model and perception pipelines."""
diff --git a/backend/app/services/pipeline.py b/backend/app/services/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..65226797a77b9e3514857d741261890a715e57b2
--- /dev/null
+++ b/backend/app/services/pipeline.py
@@ -0,0 +1,1255 @@
+from __future__ import annotations
+
+import base64
+import io
+import json
+import math
+import threading
+from collections import defaultdict
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from PIL import Image
+Image.MAX_IMAGE_PIXELS = None
+
+try:
+    import cv2
+except Exception:
+    cv2 = None
+
+from torchvision.models.detection import (
+    FasterRCNN_ResNet50_FPN_Weights,
+    KeypointRCNN_ResNet50_FPN_Weights,
+    fasterrcnn_resnet50_fpn,
+    keypointrcnn_resnet50_fpn,
+)
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+from ..ml.inference import USING_FUSION_MODEL, predict as trajectory_predict
+from ..ml.sensor_fusion import load_fusion_for_cam_frame, radar_stabilize_motion
+
+COCO_TO_LABEL = {
+    1: "person",
+    2: "bicycle",
+    3: "car",
+    4: "motorcycle",
+    6: "bus",
+    8: "truck",
+}
+
+VRU_LABELS = {"person", "bicycle", "motorcycle"}
+VEHICLE_LABELS = {"car", "bus", "truck"}
+
+
+@lru_cache(maxsize=1)
+def _load_hd_map_indices(data_root: str, version: str) -> dict[str, Any]:
+    base = Path(data_root) / version
+
+    with open(base / "sample.json", "r", encoding="utf-8") as f:
+        samples = json.load(f)
+    with open(base / "sample_data.json", "r", encoding="utf-8") as f:
+        sample_data = json.load(f)
+    with open(base / "scene.json", "r", encoding="utf-8") as f:
+        scenes = json.load(f)
+    with open(base / "log.json", "r", encoding="utf-8") as f:
+        logs = json.load(f)
+    with open(base / "map.json", "r", encoding="utf-8") as f:
+        maps = json.load(f)
+    with open(base / "ego_pose.json", "r", encoding="utf-8") as f:
+        ego_poses = json.load(f)
+
+    sample_by_token = {r["token"]: r for r in samples}
+    scene_by_token = {r["token"]: r for r in scenes}
+    log_by_token = {r["token"]: r for r in logs}
+    ego_pose_by_token = {r["token"]: r for r in ego_poses}
+
+    sample_data_by_sample: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    sample_data_by_basename: dict[str, dict[str, Any]] = {}
+    for rec in sample_data:
+        sample_token = rec.get("sample_token")
+        if sample_token:
+            sample_data_by_sample[str(sample_token)].append(rec)
+
+        filename = rec.get("filename")
+        if filename:
+            sample_data_by_basename[Path(str(filename)).name] = rec
+
+    map_by_log_token: dict[str, dict[str, Any]] = {}
+    for rec in maps:
+        for log_token in rec.get("log_tokens", []):
+            map_by_log_token[str(log_token)] = rec
+
+    return {
+        "sample_by_token": sample_by_token,
+        "scene_by_token": scene_by_token,
+        "log_by_token": log_by_token,
+        "map_by_log_token": map_by_log_token,
+        "sample_data_by_sample": dict(sample_data_by_sample),
+        "sample_data_by_basename": sample_data_by_basename,
+        "ego_pose_by_token": ego_pose_by_token,
+    }
+
+
+@lru_cache(maxsize=8)
+def _get_map_size(map_path: str) -> tuple[int, int] | None:
+    p = Path(map_path)
+    if not p.exists():
+        return None
+
+    with Image.open(p) as img:
+        w, h = img.size
+    return int(w), int(h)
+
+
+def _load_map_crop_gray(map_path: str, left: int, top: int, right: int, bottom: int) -> np.ndarray | None:
+    p = Path(map_path)
+    if not p.exists():
+        return None
+
+    if right <= left or bottom <= top:
+        return None
+
+    with Image.open(p) as img:
+        crop = img.crop((int(left), int(top), int(right), int(bottom))).convert("L")
+    return np.asarray(crop, dtype=np.uint8)
+
+
+def _quat_wxyz_to_yaw(q: list[float] | tuple[float, float, float, float]) -> float:
+    if len(q) != 4:
+        return 0.0
+
+    w, x, y, z = [float(v) for v in q]
+    n = math.sqrt(w * w + x * x + y * y + z * z)
+    if n < 1e-12:
+        return 0.0
+
+    w, x, y, z = w / n, x / n, y / n, z / n
+    siny_cosp = 2.0 * (w * z + x * y)
+    cosy_cosp = 1.0 - 2.0 * (y * y + z * z)
+    return float(math.atan2(siny_cosp, cosy_cosp))
+
+
+class TrajectoryPipeline:
+    def __init__(self, repo_root: Path | None = None):
+        self.repo_root = Path(repo_root) if repo_root else REPO_ROOT
+        self.data_root = self.repo_root / "DataSet"
+        self._model_lock = threading.Lock()
+        self._models: dict[str, Any] | None = None
+
+    @property
+    def using_fusion_model(self) -> bool:
+        return bool(USING_FUSION_MODEL)
+
+    @staticmethod
+    def normalize_probs(probs: list[float] | np.ndarray) -> list[float]:
+        arr = np.asarray(probs, dtype=float)
+        arr = np.clip(arr, 1e-6, None)
+        arr = arr / arr.sum()
+        return arr.tolist()
+
+    @staticmethod
+    def coco_kind(label_name: str | None) -> str | None:
+        if label_name in VRU_LABELS:
+            return "pedestrian"
+        if label_name in VEHICLE_LABELS:
+            return "vehicle"
+        return None
+
+    @staticmethod
+    def iou_xyxy(box_a: list[float], box_b: list[float]) -> float:
+        ax1, ay1, ax2, ay2 = box_a
+        bx1, by1, bx2, by2 = box_b
+
+        ix1 = max(ax1, bx1)
+        iy1 = max(ay1, by1)
+        ix2 = min(ax2, bx2)
+        iy2 = min(ay2, by2)
+
+        iw = max(0.0, ix2 - ix1)
+        ih = max(0.0, iy2 - iy1)
+        inter = iw * ih
+
+        area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
+        area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)
+        union = area_a + area_b - inter
+
+        if union <= 1e-9:
+            return 0.0
+        return inter / union
+
+    @staticmethod
+    def pixel_to_bev(center_x: float, bottom_y: float, width: int, height: int) -> tuple[float, float]:
+        x_div = max(1.0, width / 80.0)
+        y_div = max(1.0, height / 50.0)
+
+        x_m = (center_x - 0.5 * width) / x_div
+        y_m = (bottom_y - 0.58 * height) / y_div
+        return float(x_m), float(y_m)
+
+    def list_channel_image_paths(self, channel: str) -> list[Path]:
+        base = self.data_root / "samples" / channel
+        if not base.exists():
+            return []
+        return sorted(base.glob("*.jpg"))
+
+    @staticmethod
+    def load_image_array(image_path: str | Path) -> np.ndarray:
+        return np.asarray(Image.open(image_path).convert("RGB"))
+
+    @staticmethod
+    def _clip_bev(x: float, y: float) -> tuple[float, float]:
+        return float(np.clip(x, -40.0, 40.0)), float(np.clip(y, -14.0, 62.0))
+
+    def _poly_px_to_bev_points(
+        self,
+        polygon_px: list[tuple[float, float]],
+        width: int,
+        height: int,
+    ) -> list[dict[str, float]]:
+        out = []
+        for px, py in polygon_px:
+            bx, by = self.pixel_to_bev(float(px), float(py), width, height)
+            bx, by = self._clip_bev(bx, by)
+            out.append({"x": bx, "y": by})
+        return out
+
+    def _project_detection_elements(
+        self,
+        detections: list[dict[str, Any]],
+        width: int,
+        height: int,
+    ) -> list[dict[str, Any]]:
+        elements = []
+
+        for det in detections:
+            box = det.get("box")
+            if box is None or len(box) != 4:
+                continue
+
+            x1, y1, x2, y2 = [float(v) for v in box]
+            cx = 0.5 * (x1 + x2)
+            bx, by = self.pixel_to_bev(cx, y2, width, height)
+            bx, by = self._clip_bev(bx, by)
+
+            kind = str(det.get("kind", "vehicle"))
+            box_w_px = max(1.0, x2 - x1)
+            half_w = float(np.clip((box_w_px / max(1.0, width)) * 12.0, 0.25, 2.2))
+            length = 0.9 if kind == "pedestrian" else 2.1
+
+            polygon = [
+                {"x": bx - half_w, "y": by - 0.25 * length},
+                {"x": bx + half_w, "y": by - 0.25 * length},
+                {"x": bx + half_w, "y": by + length},
+                {"x": bx - half_w, "y": by + length},
+            ]
+
+            elements.append(
+                {
+                    "kind": kind,
+                    "track_id": det.get("track_id"),
+                    "score": float(det.get("score", 0.0)),
+                    "polygon": polygon,
+                }
+            )
+
+        return elements[:24]
+
+    def extract_scene_geometry(
+        self,
+        image_arr: np.ndarray,
+        detections: list[dict[str, Any]] | None,
+    ) -> dict[str, Any] | None:
+        if image_arr is None:
+            return None
+
+        h, w = image_arr.shape[:2]
+        if h < 20 or w < 20:
+            return None
+
+        if detections is None:
+            detections = []
+
+        roi_px = [
+            (0.08 * w, h - 1),
+            (0.42 * w, 0.56 * h),
+            (0.58 * w, 0.56 * h),
+            (0.92 * w, h - 1),
+        ]
+
+        scene = {
+            "source": "camera-derived" if cv2 is not None else "heuristic-fallback",
+            "quality": 0.0,
+            "road_polygon": self._poly_px_to_bev_points(roi_px, w, h),
+            "lane_lines": [],
+            "elements": self._project_detection_elements(detections, w, h),
+            "image_size": {"width": int(w), "height": int(h)},
+        }
+
+        if cv2 is None:
+            scene["quality"] = 0.12
+            return scene
+
+        gray = cv2.cvtColor(image_arr, cv2.COLOR_RGB2GRAY)
+        blur = cv2.GaussianBlur(gray, (5, 5), 0)
+        edges = cv2.Canny(blur, 60, 160)
+
+        roi_mask = np.zeros_like(edges)
+        roi_poly = np.array([
+            [
+                (int(0.08 * w), h - 1),
+                (int(0.42 * w), int(0.56 * h)),
+                (int(0.58 * w), int(0.56 * h)),
+                (int(0.92 * w), h - 1),
+            ]
+        ], dtype=np.int32)
+        cv2.fillPoly(roi_mask, roi_poly, 255)
+        masked_edges = cv2.bitwise_and(edges, roi_mask)
+
+        lines = cv2.HoughLinesP(
+            masked_edges,
+            rho=1,
+            theta=np.pi / 180.0,
+            threshold=max(24, int(0.03 * w)),
+            minLineLength=max(28, int(0.05 * w)),
+            maxLineGap=max(22, int(0.03 * w)),
+        )
+
+        lane_candidates: list[tuple[float, list[dict[str, float]]]] = []
+        if lines is not None:
+            for line in lines:
+                x1, y1, x2, y2 = [int(v) for v in line[0]]
+                dx = float(x2 - x1)
+                dy = float(y2 - y1)
+                length = float(np.hypot(dx, dy))
+
+                if length < max(24.0, 0.04 * w):
+                    continue
+                if abs(dy) < 8.0:
+                    continue
+
+                slope = dy / dx if abs(dx) > 1e-6 else np.sign(dy) * 1e6
+                if abs(slope) < 0.35:
+                    continue
+
+                p1x, p1y = self.pixel_to_bev(float(x1), float(y1), w, h)
+                p2x, p2y = self.pixel_to_bev(float(x2), float(y2), w, h)
+                p1x, p1y = self._clip_bev(p1x, p1y)
+                p2x, p2y = self._clip_bev(p2x, p2y)
+
+                lane_candidates.append(
+                    (
+                        length,
+                        [
+                            {"x": p1x, "y": p1y},
+                            {"x": p2x, "y": p2y},
+                        ],
+                    )
+                )
+
+        lane_candidates.sort(key=lambda item: item[0], reverse=True)
+        scene["lane_lines"] = [item[1] for item in lane_candidates[:10]]
+
+        edge_density = float(masked_edges.mean() / 255.0)
+        lane_quality = min(1.0, len(scene["lane_lines"]) / 6.0)
+        edge_quality = min(1.0, edge_density * 8.0)
+        scene["quality"] = float(np.clip(0.55 * lane_quality + 0.45 * edge_quality, 0.0, 1.0))
+
+        return scene
+
+    def lookup_sample_token_for_filename(self, filename: str | None) -> str | None:
+        if not filename:
+            return None
+
+        try:
+            idx = _load_hd_map_indices(str(self.data_root), "v1.0-mini")
+        except Exception:
+            return None
+
+        rec = idx["sample_data_by_basename"].get(Path(filename).name)
+        if not rec:
+            return None
+
+        sample_token = rec.get("sample_token")
+        if not sample_token:
+            return None
+
+        return str(sample_token)
+
+    def _build_hd_map_layer(
+        self,
+        sample_token: str,
+        radius_m: float = 45.0,
+        out_size: int = 480,
+    ) -> dict[str, Any] | None:
+        try:
+            idx = _load_hd_map_indices(str(self.data_root), "v1.0-mini")
+        except Exception:
+            return None
+
+        sample_rec = idx["sample_by_token"].get(sample_token)
+        if sample_rec is None:
+            return None
+
+        sample_data_list = idx["sample_data_by_sample"].get(sample_token, [])
+        if len(sample_data_list) == 0:
+            return None
+
+        ref_rec = next(
+            (r for r in sample_data_list if "LIDAR_TOP" in str(r.get("filename", ""))),
+            sample_data_list[0],
+        )
+
+        ego_pose = idx["ego_pose_by_token"].get(str(ref_rec.get("ego_pose_token", "")))
+        if ego_pose is None:
+            return None
+
+        scene_rec = idx["scene_by_token"].get(str(sample_rec.get("scene_token", "")))
+        if scene_rec is None:
+            return None
+
+        log_token = str(scene_rec.get("log_token", ""))
+        map_rec = idx["map_by_log_token"].get(log_token)
+        if map_rec is None:
+            return None
+
+        map_rel = str(map_rec.get("filename", ""))
+        map_path = self.data_root / map_rel
+        map_size = _get_map_size(str(map_path))
+        if map_size is None:
+            return None
+        map_w, map_h = map_size
+
+        translation = ego_pose.get("translation", [0.0, 0.0, 0.0])
+        ego_x = float(translation[0])
+        ego_y = float(translation[1])
+        yaw = _quat_wxyz_to_yaw(ego_pose.get("rotation", [1.0, 0.0, 0.0, 0.0]))
+
+        # nuScenes semantic prior raster masks use 0.1m per pixel.
+        ppm = 10.0
+        x_right = np.linspace(-radius_m, radius_m, out_size, dtype=np.float32)
+        y_forward = np.linspace(radius_m, -radius_m, out_size, dtype=np.float32)
+        x_grid, y_grid = np.meshgrid(x_right, y_forward)
+
+        gx = ego_x + np.cos(yaw) * y_grid + np.sin(yaw) * x_grid
+        gy = ego_y + np.sin(yaw) * y_grid - np.cos(yaw) * x_grid
+
+        px_opts = [gx * ppm, (map_w - 1.0) - gx * ppm]
+        py_opts = [gy * ppm, (map_h - 1.0) - gy * ppm]
+
+        best_px = None
+        best_py = None
+        best_valid_ratio = -1.0
+        for px in px_opts:
+            for py in py_opts:
+                valid = (px >= 0.0) & (px <= (map_w - 1.0)) & (py >= 0.0) & (py <= (map_h - 1.0))
+                ratio = float(valid.mean())
+                if ratio > best_valid_ratio:
+                    best_valid_ratio = ratio
+                    best_px = px
+                    best_py = py
+
+        if best_px is None or best_py is None or best_valid_ratio < 0.15:
+            return None
+
+        crop_left = int(max(0, math.floor(float(best_px.min())) - 2))
+        crop_top = int(max(0, math.floor(float(best_py.min())) - 2))
+        crop_right = int(min(map_w, math.ceil(float(best_px.max())) + 3))
+        crop_bottom = int(min(map_h, math.ceil(float(best_py.max())) + 3))
+
+        map_crop = _load_map_crop_gray(str(map_path), crop_left, crop_top, crop_right, crop_bottom)
+        if map_crop is None or map_crop.size == 0:
+            return None
+
+        remap_x = best_px - float(crop_left)
+        remap_y = best_py - float(crop_top)
+
+        if cv2 is not None:
+            patch = cv2.remap(
+                map_crop,
+                remap_x.astype(np.float32),
+                remap_y.astype(np.float32),
+                interpolation=cv2.INTER_LINEAR,
+                borderMode=cv2.BORDER_CONSTANT,
+                borderValue=0,
+            )
+            patch_u8 = patch.astype(np.uint8)
+        else:
+            crop_h, crop_w = map_crop.shape[:2]
+            xi = np.clip(np.round(remap_x).astype(np.int32), 0, crop_w - 1)
+            yi = np.clip(np.round(remap_y).astype(np.int32), 0, crop_h - 1)
+            patch_u8 = map_crop[yi, xi]
+
+        drivable = patch_u8 > 96
+        strong = patch_u8 > 170
+        if float(drivable.mean()) < 0.01:
+            return None
+
+        rgba = np.zeros((out_size, out_size, 4), dtype=np.uint8)
+        rgba[drivable] = [72, 94, 114, 130]
+        rgba[strong] = [170, 194, 216, 192]
+
+        buf = io.BytesIO()
+        Image.fromarray(rgba, mode="RGBA").save(buf, format="PNG")
+        png_b64 = base64.b64encode(buf.getvalue()).decode("ascii")
+
+        return {
+            "source": "nuscenes-semantic-prior",
+            "map_token": map_rec.get("token"),
+            "valid_ratio": round(best_valid_ratio, 3),
+            "image_png_base64": png_b64,
+            "opacity": 0.62,
+            "bounds": {
+                "min_x": -float(radius_m),
+                "max_x": float(radius_m),
+                "min_y": -float(radius_m),
+                "max_y": float(radius_m),
+            },
+        }
+
+    def _attach_hd_map_layer(self, scene_geometry: dict[str, Any] | None, sample_token: str | None):
+        if not sample_token:
+            return scene_geometry
+
+        map_layer = self._build_hd_map_layer(sample_token)
+        if map_layer is None:
+            return scene_geometry
+
+        if scene_geometry is None:
+            bounds = map_layer["bounds"]
+            scene_geometry = {
+                "source": "hd-map",
+                "quality": 0.55,
+                "road_polygon": [
+                    {"x": bounds["min_x"], "y": bounds["min_y"]},
+                    {"x": bounds["max_x"], "y": bounds["min_y"]},
+                    {"x": bounds["max_x"], "y": bounds["max_y"]},
+                    {"x": bounds["min_x"], "y": bounds["max_y"]},
+                ],
+                "lane_lines": [],
+                "elements": [],
+            }
+        else:
+            scene_geometry = dict(scene_geometry)
+            prev_source = str(scene_geometry.get("source", "")).strip()
+            if "hd-map" not in prev_source:
+                scene_geometry["source"] = f"{prev_source}+hd-map" if prev_source else "hd-map"
+            scene_geometry["quality"] = float(np.clip(max(float(scene_geometry.get("quality", 0.0)), 0.55), 0.0, 1.0))
+
+        scene_geometry["map_layer"] = map_layer
+        return scene_geometry
+
+    def load_cv_models(self) -> dict[str, Any]:
+        if self._models is not None:
+            return self._models
+
+        with self._model_lock:
+            if self._models is not None:
+                return self._models
+
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+            try:
+                det_weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+                det_model = fasterrcnn_resnet50_fpn(weights=det_weights, progress=False)
+                det_model.to(device).eval()
+
+                pose_weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
+                pose_model = keypointrcnn_resnet50_fpn(weights=pose_weights, progress=False)
+                pose_model.to(device).eval()
+
+                self._models = {
+                    "device": device,
+                    "device_name": str(device),
+                    "det_model": det_model,
+                    "det_weights": det_weights,
+                    "pose_model": pose_model,
+                    "pose_weights": pose_weights,
+                }
+            except Exception as exc:
+                self._models = {
+                    "error": str(exc),
+                    "device": device,
+                    "device_name": str(device),
+                }
+
+            return self._models
+
+    def detect_objects_and_pose(
+        self,
+        image_arr: np.ndarray,
+        models: dict[str, Any],
+        score_threshold: float = 0.55,
+        use_pose: bool = True,
+    ) -> list[dict[str, Any]]:
+        if "error" in models:
+            return []
+
+        device = models["device"]
+        pil_img = Image.fromarray(image_arr)
+
+        det_input = models["det_weights"].transforms()(pil_img).unsqueeze(0).to(device)
+        with torch.no_grad():
+            det_out = models["det_model"](det_input)[0]
+
+        boxes = det_out["boxes"].detach().cpu().numpy() if len(det_out["boxes"]) > 0 else np.zeros((0, 4))
+        scores = det_out["scores"].detach().cpu().numpy() if len(det_out["scores"]) > 0 else np.zeros((0,))
+        labels = det_out["labels"].detach().cpu().numpy() if len(det_out["labels"]) > 0 else np.zeros((0,))
+
+        detections: list[dict[str, Any]] = []
+        for i in range(len(scores)):
+            score = float(scores[i])
+            label_idx = int(labels[i])
+            label_name = COCO_TO_LABEL.get(label_idx)
+
+            if label_name is None or score < score_threshold:
+                continue
+
+            kind = self.coco_kind(label_name)
+            if kind is None:
+                continue
+
+            x1, y1, x2, y2 = [float(v) for v in boxes[i]]
+            detections.append(
+                {
+                    "score": score,
+                    "raw_label": label_name,
+                    "kind": kind,
+                    "box": [x1, y1, x2, y2],
+                    "center_x": 0.5 * (x1 + x2),
+                    "bottom_y": y2,
+                    "keypoints": None,
+                }
+            )
+
+        if use_pose:
+            pose_input = models["pose_weights"].transforms()(pil_img).unsqueeze(0).to(device)
+            with torch.no_grad():
+                pose_out = models["pose_model"](pose_input)[0]
+
+            p_boxes = pose_out["boxes"].detach().cpu().numpy() if len(pose_out["boxes"]) > 0 else np.zeros((0, 4))
+            p_scores = pose_out["scores"].detach().cpu().numpy() if len(pose_out["scores"]) > 0 else np.zeros((0,))
+            p_labels = pose_out["labels"].detach().cpu().numpy() if len(pose_out["labels"]) > 0 else np.zeros((0,))
+            p_keypoints = (
+                pose_out["keypoints"].detach().cpu().numpy()
+                if len(pose_out["keypoints"]) > 0
+                else np.zeros((0, 17, 3))
+            )
+
+            assigned = set()
+            for i in range(len(p_scores)):
+                if int(p_labels[i]) != 1:
+                    continue
+                if float(p_scores[i]) < max(0.25, 0.8 * score_threshold):
+                    continue
+
+                pose_box = [float(v) for v in p_boxes[i]]
+                best_idx = None
+                best_iou = 0.0
+
+                for det_idx, det in enumerate(detections):
+                    if det_idx in assigned:
+                        continue
+                    if det["raw_label"] != "person":
+                        continue
+
+                    iou_val = self.iou_xyxy(det["box"], pose_box)
+                    if iou_val > best_iou:
+                        best_iou = iou_val
+                        best_idx = det_idx
+
+                if best_idx is not None and best_iou > 0.1:
+                    detections[best_idx]["keypoints"] = p_keypoints[i].tolist()
+                    assigned.add(best_idx)
+
+        return detections
+
+    @staticmethod
+    def match_two_frame_tracks(
+        det_prev: list[dict[str, Any]],
+        det_curr: list[dict[str, Any]],
+        tracking_gate_px: float = 90.0,
+    ) -> list[tuple[dict[str, Any], dict[str, Any], float]]:
+        used_curr = set()
+        matches = []
+
+        det_prev = sorted(det_prev, key=lambda d: d["score"], reverse=True)
+        det_curr = sorted(det_curr, key=lambda d: d["score"], reverse=True)
+
+        for d0 in det_prev:
+            best_idx = None
+            best_dist = 1e9
+
+            for j, d1 in enumerate(det_curr):
+                if j in used_curr:
+                    continue
+                if d0["kind"] != d1["kind"]:
+                    continue
+
+                dist = math.hypot(d1["center_x"] - d0["center_x"], d1["bottom_y"] - d0["bottom_y"])
+                if dist < tracking_gate_px and dist < best_dist:
+                    best_dist = dist
+                    best_idx = j
+
+            if best_idx is None:
+                continue
+
+            used_curr.add(best_idx)
+            d1 = det_curr[best_idx]
+            matches.append((d0, d1, float(best_dist)))
+
+        return matches
+
+    def build_two_image_agents_bundle(
+        self,
+        img_prev: np.ndarray,
+        img_curr: np.ndarray,
+        score_threshold: float,
+        tracking_gate_px: float,
+        min_motion_px: float,
+        use_pose: bool,
+        img_prev_name: str | None = None,
+        img_curr_name: str | None = None,
+    ) -> dict[str, Any]:
+        models = self.load_cv_models()
+        if "error" in models:
+            return {
+                "error": f"Could not load CV models ({models['error']}).",
+                "device": models.get("device_name", "unknown"),
+            }
+
+        det_prev = self.detect_objects_and_pose(img_prev, models, score_threshold=score_threshold, use_pose=use_pose)
+        det_curr = self.detect_objects_and_pose(img_curr, models, score_threshold=score_threshold, use_pose=use_pose)
+
+        det_prev_vru = [d for d in det_prev if d.get("kind") == "pedestrian"]
+        det_curr_vru = [d for d in det_curr if d.get("kind") == "pedestrian"]
+
+        for i, d in enumerate(det_prev):
+            d["det_id"] = i + 1
+            d["track_id"] = None
+        for i, d in enumerate(det_curr):
+            d["det_id"] = i + 1
+            d["track_id"] = None
+
+        if len(det_curr_vru) == 0:
+            return {"error": "No pedestrian/cyclist detections found in image 2 (t0)."}
+
+        matches = self.match_two_frame_tracks(
+            det_prev_vru,
+            det_curr_vru,
+            tracking_gate_px=tracking_gate_px,
+        )
+
+        matched_curr_ids = {id(m[1]) for m in matches}
+        for d1 in det_curr_vru:
+            if id(d1) in matched_curr_ids:
+                continue
+
+            if len(det_prev_vru) == 0:
+                matches.append((None, d1, float("inf")))
+                continue
+
+            nearest_prev = min(
+                det_prev_vru,
+                key=lambda d0: math.hypot(d1["center_x"] - d0["center_x"], d1["bottom_y"] - d0["bottom_y"]),
+            )
+            dist = math.hypot(
+                d1["center_x"] - nearest_prev["center_x"],
+                d1["bottom_y"] - nearest_prev["bottom_y"],
+            )
+
+            if dist <= 1.5 * tracking_gate_px:
+                matches.append((nearest_prev, d1, float(dist)))
+            else:
+                matches.append((None, d1, float("inf")))
+
+        h0, w0 = img_prev.shape[:2]
+        h1, w1 = img_curr.shape[:2]
+
+        tracks = []
+        for track_id, (d0, d1, dist_px) in enumerate(matches, start=1):
+            if d0 is not None and d0.get("track_id") is None:
+                d0["track_id"] = track_id
+            d1["track_id"] = track_id
+
+            if d0 is not None:
+                p_prev = self.pixel_to_bev(d0["center_x"], d0["bottom_y"], w0, h0)
+            else:
+                p_prev = None
+
+            p_curr = self.pixel_to_bev(d1["center_x"], d1["bottom_y"], w1, h1)
+
+            if p_prev is None:
+                vx, vy = 0.0, 0.0
+                p_prev = p_curr
+            else:
+                vx = p_curr[0] - p_prev[0]
+                vy = p_curr[1] - p_prev[1]
+
+            if dist_px < float(min_motion_px):
+                vx, vy = 0.0, 0.0
+                p_prev = p_curr
+
+            hist = [
+                (p_curr[0] - 3.0 * vx, p_curr[1] - 3.0 * vy),
+                (p_curr[0] - 2.0 * vx, p_curr[1] - 2.0 * vy),
+                (p_prev[0], p_prev[1]),
+                (p_curr[0], p_curr[1]),
+            ]
+
+            tracks.append(
+                {
+                    "id": track_id,
+                    "kind": d1["kind"],
+                    "raw_label": d1["raw_label"],
+                    "history_world": hist,
+                }
+            )
+
+        agents = []
+        for tr in tracks:
+            neighbors = [other["history_world"] for other in tracks if other["id"] != tr["id"]]
+
+            pred, probs, _ = trajectory_predict(
+                tr["history_world"],
+                neighbor_points_list=neighbors,
+                fusion_feats=None,
+            )
+
+            pred_np = pred.detach().cpu().numpy()
+            probs_np = probs.detach().cpu().numpy()
+
+            predictions = []
+            for mode_i in range(pred_np.shape[0]):
+                predictions.append([(float(p[0]), float(p[1])) for p in pred_np[mode_i]])
+
+            agents.append(
+                {
+                    "id": int(tr["id"]),
+                    "type": "pedestrian" if tr["kind"] == "pedestrian" else "vehicle",
+                    "raw_label": tr["raw_label"],
+                    "history": [tuple(map(float, p)) for p in tr["history_world"]],
+                    "predictions": predictions,
+                    "probabilities": self.normalize_probs(probs_np.tolist()),
+                    "is_target": True,
+                }
+            )
+
+        scene_geometry = self.extract_scene_geometry(img_curr, det_curr)
+        sample_token = self.lookup_sample_token_for_filename(img_curr_name)
+        scene_geometry = self._attach_hd_map_layer(scene_geometry, sample_token)
+
+        return {
+            "mode": "two_upload",
+            "agents": agents,
+            "target_track_id": None,
+            "device": models.get("device_name", "unknown"),
+            "match_count": len(agents),
+            "scene_geometry": scene_geometry,
+            "camera_snapshots": {
+                "pair_prev": {"detections": det_prev},
+                "pair_curr": {"detections": det_curr},
+            },
+        }
+
+    def track_front_agents(
+        self,
+        front_paths: list[Path],
+        models: dict[str, Any],
+        score_threshold: float = 0.55,
+        tracking_gate_px: float = 90.0,
+        use_pose: bool = True,
+    ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+        tracks: dict[int, dict[str, Any]] = {}
+        next_track_id = 1
+        front_final_detections: list[dict[str, Any]] = []
+
+        for frame_idx, frame_path in enumerate(front_paths):
+            frame_arr = self.load_image_array(frame_path)
+            h, w = frame_arr.shape[:2]
+
+            detections = self.detect_objects_and_pose(
+                frame_arr,
+                models,
+                score_threshold=score_threshold,
+                use_pose=use_pose,
+            )
+            detections.sort(key=lambda d: d["score"], reverse=True)
+
+            matched_track_ids = set()
+            frame_dets_with_ids = []
+
+            for det in detections:
+                wx, wy = self.pixel_to_bev(det["center_x"], det["bottom_y"], w, h)
+
+                best_track_id = None
+                best_dist = 1e9
+
+                for tid, tr in tracks.items():
+                    if tr["kind"] != det["kind"]:
+                        continue
+                    if tr["last_seen"] != frame_idx - 1:
+                        continue
+                    if tid in matched_track_ids:
+                        continue
+
+                    px_last, py_last = tr["history_pixel"][-1]
+                    dist = math.hypot(det["center_x"] - px_last, det["bottom_y"] - py_last)
+                    if dist < tracking_gate_px and dist < best_dist:
+                        best_dist = dist
+                        best_track_id = tid
+
+                if best_track_id is None:
+                    best_track_id = next_track_id
+                    next_track_id += 1
+                    tracks[best_track_id] = {
+                        "id": best_track_id,
+                        "kind": det["kind"],
+                        "raw_label": det["raw_label"],
+                        "history_pixel": [],
+                        "history_world": [],
+                        "last_seen": -1,
+                        "last_box": None,
+                        "last_keypoints": None,
+                        "misses": 0,
+                    }
+
+                tr = tracks[best_track_id]
+                tr["history_pixel"].append((float(det["center_x"]), float(det["bottom_y"])))
+                tr["history_world"].append((float(wx), float(wy)))
+                tr["last_seen"] = frame_idx
+                tr["raw_label"] = det["raw_label"]
+                tr["last_box"] = det["box"]
+                tr["last_keypoints"] = det.get("keypoints")
+                tr["misses"] = 0
+
+                matched_track_ids.add(best_track_id)
+
+                det = dict(det)
+                det["track_id"] = best_track_id
+                frame_dets_with_ids.append(det)
+
+            for tid, tr in tracks.items():
+                if tr["last_seen"] == frame_idx:
+                    continue
+                if tr["last_seen"] < frame_idx - 1:
+                    continue
+
+                if len(tr["history_pixel"]) >= 2:
+                    px_prev, py_prev = tr["history_pixel"][-2]
+                    px_last, py_last = tr["history_pixel"][-1]
+                    wx_prev, wy_prev = tr["history_world"][-2]
+                    wx_last, wy_last = tr["history_world"][-1]
+
+                    px_ex = px_last + (px_last - px_prev)
+                    py_ex = py_last + (py_last - py_prev)
+                    wx_ex = wx_last + (wx_last - wx_prev)
+                    wy_ex = wy_last + (wy_last - wy_prev)
+                else:
+                    px_ex, py_ex = tr["history_pixel"][-1]
+                    wx_ex, wy_ex = tr["history_world"][-1]
+
+                tr["history_pixel"].append((float(px_ex), float(py_ex)))
+                tr["history_world"].append((float(wx_ex), float(wy_ex)))
+                tr["last_seen"] = frame_idx
+                tr["misses"] += 1
+
+            if frame_idx == len(front_paths) - 1:
+                front_final_detections = frame_dets_with_ids
+
+        valid_tracks = []
+        for tid, tr in tracks.items():
+            if len(tr["history_world"]) != len(front_paths):
+                continue
+            if tr["misses"] > 2:
+                continue
+
+            x0, y0 = tr["history_world"][0]
+            x1, y1 = tr["history_world"][-1]
+            motion = math.hypot(x1 - x0, y1 - y0)
+            if motion < 0.08:
+                continue
+
+            valid_tracks.append(
+                {
+                    "id": tid,
+                    "kind": tr["kind"],
+                    "raw_label": tr["raw_label"],
+                    "history_pixel": [tuple(p) for p in tr["history_pixel"]],
+                    "history_world": [tuple(p) for p in tr["history_world"]],
+                    "last_box": tr["last_box"],
+                    "last_keypoints": tr["last_keypoints"],
+                }
+            )
+
+        valid_tracks.sort(key=lambda t: t["id"])
+        return valid_tracks, front_final_detections
+
+    @staticmethod
+    def raw_label_to_stabilizer_type(raw_label: str) -> str:
+        if raw_label == "person":
+            return "Person"
+        if raw_label == "bicycle":
+            return "Bicycle"
+        if raw_label == "motorcycle":
+            return "Motorcycle"
+        if raw_label == "bus":
+            return "Bus"
+        if raw_label == "truck":
+            return "Truck"
+        return "Car"
+
+    @staticmethod
+    def build_fusion_features(history_world: list[tuple[float, float]], fusion_data: dict[str, Any] | None):
+        if not fusion_data:
+            return None
+
+        lidar_xy = fusion_data.get("lidar_xy")
+        radar_xy = fusion_data.get("radar_xy")
+
+        if lidar_xy is None and radar_xy is None:
+            return None
+
+        feats = []
+        for px, py in history_world:
+            if lidar_xy is not None and len(lidar_xy) > 0:
+                dl = np.hypot(lidar_xy[:, 0] - px, lidar_xy[:, 1] - py)
+                lidar_cnt = int((dl < 2.0).sum())
+            else:
+                lidar_cnt = 0
+
+            if radar_xy is not None and len(radar_xy) > 0:
+                dr = np.hypot(radar_xy[:, 0] - px, radar_xy[:, 1] - py)
+                radar_cnt = int((dr < 2.5).sum())
+            else:
+                radar_cnt = 0
+
+            lidar_norm = min(80.0, float(lidar_cnt)) / 80.0
+            radar_norm = min(30.0, float(radar_cnt)) / 30.0
+            sensor_strength = min(1.0, (float(lidar_cnt) + 2.0 * float(radar_cnt)) / 100.0)
+            feats.append([lidar_norm, radar_norm, sensor_strength])
+
+        return feats
+
+    def stabilize_tracks_with_radar(self, tracks: list[dict[str, Any]], fusion_data: dict[str, Any] | None):
+        if not tracks:
+            return tracks
+
+        packed = []
+        for tr in tracks:
+            hist = tr["history_world"]
+            if len(hist) >= 2:
+                dx = float(hist[-1][0] - hist[-2][0])
+                dy = float(hist[-1][1] - hist[-2][1])
+            else:
+                dx = 0.0
+                dy = 0.0
+
+            packed.append(
+                {
+                    "type": self.raw_label_to_stabilizer_type(tr.get("raw_label", "car")),
+                    "history": [tuple(p) for p in hist],
+                    "dx": dx,
+                    "dy": dy,
+                }
+            )
+
+        stabilized = radar_stabilize_motion(packed, fusion_data, dt_seconds=0.5)
+
+        updated = []
+        for tr, st in zip(tracks, stabilized):
+            t_copy = dict(tr)
+            t_copy["history_world"] = [(float(x), float(y)) for x, y in st["history"]]
+            updated.append(t_copy)
+
+        return updated
+
+    @staticmethod
+    def choose_target_track_id(tracks: list[dict[str, Any]]) -> int | None:
+        if not tracks:
+            return None
+
+        peds = [t for t in tracks if t["kind"] == "pedestrian"]
+        if peds:
+            best = min(peds, key=lambda t: math.hypot(t["history_world"][-1][0], t["history_world"][-1][1]))
+            return best["id"]
+
+        return tracks[0]["id"]
+
+    def build_agents_from_tracks(self, tracks: list[dict[str, Any]], fusion_data: dict[str, Any] | None):
+        if not tracks:
+            return [], None, []
+
+        tracks_work = []
+        for tr in tracks:
+            tracks_work.append(
+                {
+                    "id": tr["id"],
+                    "kind": tr["kind"],
+                    "raw_label": tr["raw_label"],
+                    "history_pixel": [tuple(p) for p in tr["history_pixel"]],
+                    "history_world": [tuple(p) for p in tr["history_world"]],
+                    "last_box": tr.get("last_box"),
+                    "last_keypoints": tr.get("last_keypoints"),
+                }
+            )
+
+        tracks_work = self.stabilize_tracks_with_radar(tracks_work, fusion_data)
+
+        target_id = self.choose_target_track_id(tracks_work)
+        agents = []
+
+        for tr in tracks_work:
+            neighbors = [other["history_world"] for other in tracks_work if other["id"] != tr["id"]]
+
+            if len(neighbors) > 12:
+                x0, y0 = tr["history_world"][-1]
+                neighbors = sorted(
+                    neighbors,
+                    key=lambda nh: math.hypot(nh[-1][0] - x0, nh[-1][1] - y0),
+                )[:12]
+
+            fusion_feats = self.build_fusion_features(tr["history_world"], fusion_data)
+
+            pred, probs, _ = trajectory_predict(
+                tr["history_world"],
+                neighbor_points_list=neighbors,
+                fusion_feats=fusion_feats,
+            )
+
+            pred_np = pred.detach().cpu().numpy()
+            probs_np = probs.detach().cpu().numpy()
+
+            predictions = []
+            for mode_i in range(pred_np.shape[0]):
+                predictions.append([(float(p[0]), float(p[1])) for p in pred_np[mode_i]])
+
+            agents.append(
+                {
+                    "id": int(tr["id"]),
+                    "type": "pedestrian" if tr["kind"] == "pedestrian" else "vehicle",
+                    "raw_label": tr["raw_label"],
+                    "history": [tuple(map(float, p)) for p in tr["history_world"]],
+                    "predictions": predictions,
+                    "probabilities": self.normalize_probs(probs_np.tolist()),
+                    "is_target": tr["id"] == target_id,
+                }
+            )
+
+        return agents, target_id, tracks_work
+
+    @staticmethod
+    def assign_track_ids_to_front_detections(
+        detections: list[dict[str, Any]],
+        tracks: list[dict[str, Any]],
+        gate_px: float = 90.0,
+    ) -> list[dict[str, Any]]:
+        if not detections:
+            return []
+
+        out = []
+        used_ids = set()
+
+        for det_idx, det in enumerate(detections):
+            d = dict(det)
+            d.setdefault("det_id", det_idx + 1)
+
+            if d.get("track_id") is not None:
+                used_ids.add(d["track_id"])
+                out.append(d)
+                continue
+
+            best_id = None
+            best_dist = 1e9
+
+            for tr in tracks:
+                if tr["id"] in used_ids:
+                    continue
+                if tr["kind"] != d["kind"]:
+                    continue
+
+                px, py = tr["history_pixel"][-1]
+                dist = math.hypot(d["center_x"] - px, d["bottom_y"] - py)
+                if dist < gate_px and dist < best_dist:
+                    best_dist = dist
+                    best_id = tr["id"]
+
+            d["track_id"] = best_id
+            if best_id is not None:
+                used_ids.add(best_id)
+
+            out.append(d)
+
+        return out
+
+    def build_live_agents_bundle(
+        self,
+        anchor_idx: int,
+        score_threshold: float,
+        tracking_gate_px: float,
+        use_pose: bool,
+    ) -> dict[str, Any]:
+        front_paths = self.list_channel_image_paths("CAM_FRONT")
+        if len(front_paths) < 4:
+            return {"error": "Need at least 4 CAM_FRONT frames in DataSet/samples/CAM_FRONT."}
+
+        if anchor_idx < 3:
+            anchor_idx = 3
+        if anchor_idx >= len(front_paths):
+            anchor_idx = len(front_paths) - 1
+
+        models = self.load_cv_models()
+        if "error" in models:
+            return {
+                "error": f"Could not load CV models ({models['error']}).",
+                "device": models.get("device_name", "unknown"),
+            }
+
+        window_paths = front_paths[anchor_idx - 3 : anchor_idx + 1]
+
+        tracks, front_dets = self.track_front_agents(
+            window_paths,
+            models,
+            score_threshold=score_threshold,
+            tracking_gate_px=tracking_gate_px,
+            use_pose=use_pose,
+        )
+
+        if len(tracks) == 0:
+            return {"error": "No valid tracked moving agents found in selected frame window."}
+
+        front_curr = window_paths[-1]
+        fusion_data = load_fusion_for_cam_frame(
+            front_curr.name,
+            data_root=str(self.data_root),
+            version="v1.0-mini",
+        )
+
+        agents, target_id, tracks_stable = self.build_agents_from_tracks(tracks, fusion_data)
+        if len(agents) == 0:
+            return {"error": "Tracking succeeded but trajectory prediction produced no agents."}
+
+        front_dets = self.assign_track_ids_to_front_detections(front_dets, tracks_stable, gate_px=tracking_gate_px)
+        front_img = self.load_image_array(front_curr)
+        scene_geometry = self.extract_scene_geometry(front_img, front_dets)
+        live_sample_token = str(fusion_data.get("sample_token")) if fusion_data and fusion_data.get("sample_token") else None
+        scene_geometry = self._attach_hd_map_layer(scene_geometry, live_sample_token)
+
+        return {
+            "mode": "live_fusion",
+            "agents": agents,
+            "target_track_id": target_id,
+            "device": models.get("device_name", "unknown"),
+            "front_anchor_path": str(front_curr),
+            "track_count": len(agents),
+            "scene_geometry": scene_geometry,
+            "camera_snapshots": {
+                "CAM_FRONT": {
+                    "frame_path": str(front_curr),
+                    "detections": front_dets,
+                }
+            },
+            "fusion_data": fusion_data,
+        }
diff --git a/backend/scripts/__init__.py b/backend/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a9eeeb976413373bbc0ce72dbdc3103a337b81
--- /dev/null
+++ b/backend/scripts/__init__.py
@@ -0,0 +1 @@
+"""Organized script modules for training, evaluation, and tools."""
diff --git a/backend/scripts/data/__init__.py b/backend/scripts/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..84d709ef9332cee235a37a95dbd70f8c07ebe523
--- /dev/null
+++ b/backend/scripts/data/__init__.py
@@ -0,0 +1 @@
+"""Data preparation script modules."""
diff --git a/backend/scripts/data/build_dataset_from_images.py b/backend/scripts/data/build_dataset_from_images.py
new file mode 100644
index 0000000000000000000000000000000000000000..6833175b1a38d000552ea06890669398bafb58f2
--- /dev/null
+++ b/backend/scripts/data/build_dataset_from_images.py
@@ -0,0 +1,119 @@
+﻿import torch
+import torchvision
+from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
+from PIL import Image
+import os
+import glob
+import math
+import json
+
+TARGET_CLASSES = {1: 'Person', 2: 'Bicycle', 3: 'Car', 4: 'Motorcycle'}
+
+# Set up GPU acceleration
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+def load_perception_model():
+    print(f"[System] Loading Pre-Trained Faster R-CNN (ResNet-50-FPN) on {device.type.upper()}...")
+    weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+    model = fasterrcnn_resnet50_fpn(weights=weights, progress=False)
+    model.to(device) # Move model to GPU
+    model.eval()
+    return model, weights
+
+def extract_features(img_path, model, weights, score_threshold=0.7):
+    image = Image.open(img_path).convert("RGB")
+    preprocess = weights.transforms()
+    # Move the image tensor to the GPU so the math runs on CUDA
+    input_batch = preprocess(image).unsqueeze(0).to(device)
+    
+    with torch.no_grad():
+        prediction = model(input_batch)[0]
+        
+    extracted = []
+    # prediction items are on GPU, so we use .item() to pull the raw number back out
+    for i, box in enumerate(prediction['boxes']):
+        score = prediction['scores'][i].item()
+        label = prediction['labels'][i].item()
+        if score > score_threshold and label in TARGET_CLASSES:
+            center_x = (box[0] + box[2]).item() / 2.0
+            bottom_y = box[3].item()
+            extracted.append({
+                'type': TARGET_CLASSES[label],
+                'coord': (round(center_x, 2), round(bottom_y, 2))
+            })
+    return extracted
+
+def process_dataset_into_trajectories():
+    print("="*60)
+    print(f"| Starting Automated Dataset Pre-Processing Pipeline |")
+    print(f"| Hardware Acceleration: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'} |")
+    print("="*60)
+    
+    model, weights = load_perception_model()
+    
+    # Get images chronologically to simulate a video feed
+    image_paths = sorted(glob.glob("DataSet/samples/CAM_FRONT/*.jpg"))
+    if not image_paths:
+        print("[!] No images found to process.")
+        return
+        
+    print(f"[System] Success: Found a total of {len(image_paths)} valid image frames in the folder. Processing now...")
+    
+    dataset_trajectories = []
+    
+    # We need 4 frames of history for our AI Model (T-3, T-2, T-1, T0)
+    for i in range(len(image_paths) - 3):
+        frames = image_paths[i:i+4]
+        frame_data = []
+        
+        # Output progress every 50 frames
+        if i % 50 == 0:
+            print(f"   -> Processing frame sequence {i}/{len(image_paths)}")
+            
+        for f in frames:
+            objs = extract_features(f, model, weights)
+            frame_data.append(objs)
+            
+        for obj_t0 in frame_data[0]:
+            target_type = obj_t0['type']
+            track_history = [obj_t0['coord']]
+            valid_track = True
+            
+            last_coord = obj_t0['coord']
+            for t in range(1, 4):
+                best_dist = float('inf')
+                best_coord = None
+                for obj_t_next in frame_data[t]:
+                    if obj_t_next['type'] == target_type:
+                        dist = math.sqrt((last_coord[0] - obj_t_next['coord'][0])**2 + 
+                                         (last_coord[1] - obj_t_next['coord'][1])**2)
+                        if dist < 60.0 and dist < best_dist: 
+                            best_dist = dist
+                            best_coord = obj_t_next['coord']
+                            
+                if best_coord:
+                    track_history.append(best_coord)
+                    last_coord = best_coord
+                else:
+                    valid_track = False
+                    break
+                    
+            if valid_track:
+                dataset_trajectories.append({
+                    "agent_type": target_type,
+                    "trajectory_pixels": track_history
+                })
+
+    output_file = "extracted_training_data.json"
+    with open(output_file, "w") as f:
+        json.dump(dataset_trajectories, f, indent=4)
+        
+    print(f"\n[Success] Pipeline Complete!")
+    print(f"[+] Extracted {len(dataset_trajectories)} valid moving trajectories from raw images.")
+    print(f"[+] Saved AI Training payload to: {output_file}")
+
+if __name__ == '__main__':
+    try:
+        process_dataset_into_trajectories()
+    except Exception as e:
+        print(f"Error during processing: {e}")
diff --git a/backend/scripts/evaluation/__init__.py b/backend/scripts/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d43661ad96cdedca2d50816a6797a2cc6daf08b8
--- /dev/null
+++ b/backend/scripts/evaluation/__init__.py
@@ -0,0 +1 @@
+"""Evaluation and benchmarking script modules."""
diff --git a/backend/scripts/evaluation/benchmark_perf.py b/backend/scripts/evaluation/benchmark_perf.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c1ea4e7701981bbc9ec4acca37bf0e6fc1b85b7
--- /dev/null
+++ b/backend/scripts/evaluation/benchmark_perf.py
@@ -0,0 +1,109 @@
+import time
+
+import torch
+from PIL import Image
+from torchvision.models.detection import (
+    fasterrcnn_resnet50_fpn,
+    FasterRCNN_ResNet50_FPN_Weights,
+    keypointrcnn_resnet50_fpn,
+    KeypointRCNN_ResNet50_FPN_Weights,
+)
+
+from backend.app.ml.sensor_fusion import load_fusion_for_cam_frame
+from backend.app.ml.inference import predict, USING_FUSION_MODEL
+
+
+def main():
+    img_path = r"DataSet/samples/CAM_FRONT/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151603512404.jpg"
+    img = Image.open(img_path).convert("RGB")
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print("device", device)
+    print("using_fusion_model", USING_FUSION_MODEL)
+
+    t0 = time.perf_counter()
+    w_det = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+    m_det = fasterrcnn_resnet50_fpn(weights=w_det, progress=False).to(device).eval()
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    load_det = (time.perf_counter() - t0) * 1000
+
+    t0 = time.perf_counter()
+    w_pose = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
+    m_pose = keypointrcnn_resnet50_fpn(weights=w_pose, progress=False).to(device).eval()
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    load_pose = (time.perf_counter() - t0) * 1000
+
+    print("load_ms_fasterrcnn", round(load_det, 2))
+    print("load_ms_keypointrcnn", round(load_pose, 2))
+
+    in_det = w_det.transforms()(img).unsqueeze(0).to(device)
+    in_pose = w_pose.transforms()(img).unsqueeze(0).to(device)
+
+    with torch.no_grad():
+        _ = m_det(in_det)
+        _ = m_pose(in_pose)
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+
+    n = 5
+
+    st = time.perf_counter()
+    with torch.no_grad():
+        for _ in range(n):
+            _ = m_det(in_det)
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    det_ms = (time.perf_counter() - st) * 1000 / n
+
+    st = time.perf_counter()
+    with torch.no_grad():
+        for _ in range(n):
+            _ = m_pose(in_pose)
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    pose_ms = (time.perf_counter() - st) * 1000 / n
+
+    print("avg_ms_det_per_frame", round(det_ms, 2))
+    print("avg_ms_pose_per_frame", round(pose_ms, 2))
+
+    m = 30
+    st = time.perf_counter()
+    for _ in range(m):
+        _ = load_fusion_for_cam_frame(
+            "n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151603512404.jpg",
+            data_root="DataSet",
+        )
+    fusion_ms = (time.perf_counter() - st) * 1000 / m
+    print("avg_ms_fusion_lookup", round(fusion_ms, 2))
+
+    pts = [(0, 10), (2, 10), (4, 10), (6, 10)]
+    neigh = [
+        [(8, 12), (8.5, 12), (9, 12), (9.5, 12)],
+        [(15, 7), (15.5, 7.2), (16, 7.5), (16.4, 7.7)],
+    ]
+    fusion_feats = [[0.2, 0.1, 0.25], [0.25, 0.1, 0.3], [0.3, 0.12, 0.35], [0.35, 0.15, 0.4]]
+
+    for _ in range(10):
+        _ = predict(pts, neigh, fusion_feats=fusion_feats)
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+
+    k = 300
+    st = time.perf_counter()
+    for _ in range(k):
+        _ = predict(pts, neigh, fusion_feats=fusion_feats)
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    pred_ms = (time.perf_counter() - st) * 1000 / k
+    print("avg_ms_transformer_predict", round(pred_ms, 4))
+
+    approx = 2 * det_ms + pose_ms + fusion_ms + 6 * pred_ms
+    fps = 1000.0 / approx if approx > 0 else 0.0
+    print("approx_live_2frame_ms", round(approx, 2))
+    print("approx_live_equiv_fps", round(fps, 2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/scripts/evaluation/evaluate.py b/backend/scripts/evaluation/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..f23dc537aacab1c05a358a4a9ce3aae8d4fd3132
--- /dev/null
+++ b/backend/scripts/evaluation/evaluate.py
@@ -0,0 +1,127 @@
+import torch
+from torch.utils.data import DataLoader
+from pathlib import Path
+from backend.app.legacy.dataset import TrajectoryDataset
+from backend.app.ml.model import TrajectoryTransformer
+from backend.scripts.training.train import get_data, collate_fn, compute_ade, compute_fde
+import numpy as np
+import random
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+BASE_CKPT = REPO_ROOT / "models" / "best_social_model.pth"
+
+def evaluate():
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')       
+    print(f"Running Evaluation on {device}...")
+
+    samples = get_data()
+    
+    # Use the same deterministic split as train.py to evaluate on validation set
+    random.seed(42)
+    random.shuffle(samples)
+    train_size = int(0.8 * len(samples))
+    val_samples = samples[train_size:]
+
+    dataset = TrajectoryDataset(val_samples, augment=False)
+    eval_loader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn)     
+
+    # Load Model
+    model = TrajectoryTransformer().to(device)
+    try:
+        model.load_state_dict(torch.load(BASE_CKPT, map_location=device, weights_only=True))
+        print("Successfully loaded 'best_social_model.pth' from models folder")
+    except Exception as e:
+        print(f"Could not load model weights: {e}")
+        return
+
+    model.eval()
+
+    total_ade = 0
+    total_fde = 0
+    miss_rate = 0
+    
+    cv_total_ade = 0
+    cv_total_fde = 0
+    cv_miss_rate = 0
+    
+    total_samples = 0
+
+    # Miss rate threshold: if best path's endpoint is off by more than 2.0 meters
+    MISS_THRESHOLD = 2.0
+
+    print("\n--- Starting Deep Evaluation ---")
+    with torch.no_grad():
+        for obs, neighbors, future in eval_loader:
+            obs, future = obs.to(device), future.to(device)
+
+            # --- MODEL PREDICTION ---
+            pred, goals, probs, _ = model(obs, neighbors)
+
+            # Find the best prediction out of K=3 for each item in the batch    
+            gt = future.unsqueeze(1)
+            error = torch.norm(pred - gt, dim=3).mean(dim=2)
+            best_idx = torch.argmin(error, dim=1)
+            best_pred = pred[torch.arange(pred.size(0)), best_idx]
+
+            # Metrics Model
+            batch_ade = compute_ade(best_pred, future).item()
+            batch_fde = compute_fde(best_pred, future).item()
+
+            total_ade += batch_ade * obs.size(0)
+            total_fde += batch_fde * obs.size(0)
+
+            final_displacement = torch.norm(best_pred[:, -1] - future[:, -1], dim=1)
+            misses = (final_displacement > MISS_THRESHOLD).sum().item()
+            miss_rate += misses
+            
+            # --- CONSTANT VELOCITY BASELINE ---
+            vx = obs[:, 3, 2].unsqueeze(1) # dx at last observed step
+            vy = obs[:, 3, 3].unsqueeze(1) # dy at last observed step
+            
+            t = torch.arange(1, 13, device=device).unsqueeze(0).float() # Horizon is 12 steps
+            
+            x_last = obs[:, 3, 0].unsqueeze(1) # x at last step
+            y_last = obs[:, 3, 1].unsqueeze(1) # y at last step
+            
+            cv_pred_x = x_last + vx * t
+            cv_pred_y = y_last + vy * t
+            cv_pred = torch.stack([cv_pred_x, cv_pred_y], dim=-1)
+            
+            # Metrics CV Baseline
+            cv_batch_ade = compute_ade(cv_pred, future).item()
+            cv_batch_fde = compute_fde(cv_pred, future).item()
+            
+            cv_total_ade += cv_batch_ade * obs.size(0)
+            cv_total_fde += cv_batch_fde * obs.size(0)
+            
+            cv_final_displacement = torch.norm(cv_pred[:, -1] - future[:, -1], dim=1)
+            cv_misses = (cv_final_displacement > MISS_THRESHOLD).sum().item()
+            cv_miss_rate += cv_misses
+
+            total_samples += obs.size(0)
+
+    # Average metrics
+    avg_ade = total_ade / total_samples
+    avg_fde = total_fde / total_samples
+    avg_miss_rate = (miss_rate / total_samples) * 100
+    
+    cv_avg_ade = cv_total_ade / total_samples
+    cv_avg_fde = cv_total_fde / total_samples
+    cv_avg_miss_rate = (cv_miss_rate / total_samples) * 100
+
+    print("\n========================================================")
+    print("           HACKATHON FINAL METRICS REPORT               ")
+    print("========================================================")
+    print(f"Total Trajectories Evaluated (Val Set): {total_samples}")
+    print(f"Prediction Horizon:           6 Seconds (12 steps)")
+    print(f"Social Context Radius:        50 Meters")
+    print("--------------------------------------------------------")
+    print("METRIC                  | BASELINE (CV) | OUR TRANSFORMER ")
+    print("------------------------|---------------|-----------------")
+    print(f"minADE@3 (meters)       | {cv_avg_ade:13.2f} | {avg_ade:15.2f}")
+    print(f"minFDE@3 (meters)       | {cv_avg_fde:13.2f} | {avg_fde:15.2f}")
+    print(f"Miss Rate (>2.0m)       | {cv_avg_miss_rate:12.1f}% | {avg_miss_rate:14.1f}%")
+    print("========================================================\n")
+
+if __name__ == '__main__':
+    evaluate()
diff --git a/backend/scripts/evaluation/evaluate_phase2_fusion.py b/backend/scripts/evaluation/evaluate_phase2_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec777bcb493769300d5e9737afd8674ddd5a023f
--- /dev/null
+++ b/backend/scripts/evaluation/evaluate_phase2_fusion.py
@@ -0,0 +1,137 @@
+import random
+from pathlib import Path
+
+import torch
+from torch.utils.data import DataLoader
+
+from backend.app.legacy.data_loader import (
+    load_json,
+    extract_pedestrian_instances,
+    build_trajectories_with_sensor,
+    create_windows_with_sensor,
+)
+from backend.app.legacy.dataset_fusion import FusionTrajectoryDataset
+from backend.app.ml.model_fusion import TrajectoryTransformerFusion
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+DEFAULT_FUSION_CKPT = REPO_ROOT / "models" / "best_social_model_fusion.pth"
+
+
+def collate_fn_fusion(batch):
+    obs, neighbors, fusion_obs, future = zip(*batch)
+    obs = torch.stack(obs)
+    fusion_obs = torch.stack(fusion_obs)
+    future = torch.stack(future)
+    return obs, list(neighbors), fusion_obs, future
+
+
+def compute_ade(pred, gt):
+    return torch.mean(torch.norm(pred - gt, dim=2))
+
+
+def compute_fde(pred, gt):
+    return torch.mean(torch.norm(pred[:, -1] - gt[:, -1], dim=1))
+
+
+def load_fusion_samples():
+    sample_annotations = load_json("sample_annotation")
+    instances = load_json("instance")
+    categories = load_json("category")
+
+    ped_instances = extract_pedestrian_instances(sample_annotations, instances, categories)
+    trajectories = build_trajectories_with_sensor(sample_annotations, ped_instances)
+    return create_windows_with_sensor(trajectories)
+
+
+def evaluate_fusion(ckpt=DEFAULT_FUSION_CKPT):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f"Running Phase 2 Fusion Evaluation on {device}...")
+
+    ckpt_path = Path(ckpt)
+    if not ckpt_path.is_absolute():
+        ckpt_path = REPO_ROOT / ckpt_path
+
+    samples = load_fusion_samples()
+    random.seed(42)
+    random.shuffle(samples)
+    train_size = int(0.8 * len(samples))
+    val_samples = samples[train_size:]
+
+    dataset = FusionTrajectoryDataset(val_samples, augment=False)
+    loader = DataLoader(dataset, batch_size=64, collate_fn=collate_fn_fusion)
+
+    model = TrajectoryTransformerFusion(fusion_dim=3).to(device)
+    model.load_state_dict(torch.load(ckpt_path, map_location=device))
+    model.eval()
+
+    total_ade = 0.0
+    total_fde = 0.0
+    miss_count = 0
+
+    cv_total_ade = 0.0
+    cv_total_fde = 0.0
+    cv_miss_count = 0
+
+    total_n = 0
+    miss_threshold = 2.0
+
+    with torch.no_grad():
+        for obs, neighbors, fusion_obs, future in loader:
+            obs = obs.to(device)
+            fusion_obs = fusion_obs.to(device)
+            future = future.to(device)
+
+            pred, goals, probs, _ = model(obs, neighbors, fusion_obs)
+
+            gt = future.unsqueeze(1)
+            err = torch.norm(pred - gt, dim=3).mean(dim=2)
+            best_idx = torch.argmin(err, dim=1)
+            best_pred = pred[torch.arange(pred.size(0), device=device), best_idx]
+
+            total_ade += compute_ade(best_pred, future).item() * obs.size(0)
+            total_fde += compute_fde(best_pred, future).item() * obs.size(0)
+
+            final_disp = torch.norm(best_pred[:, -1] - future[:, -1], dim=1)
+            miss_count += (final_disp > miss_threshold).sum().item()
+
+            # Constant velocity baseline for comparison.
+            vx = obs[:, 3, 2].unsqueeze(1)
+            vy = obs[:, 3, 3].unsqueeze(1)
+            t = torch.arange(1, 13, device=device).unsqueeze(0).float()
+            x_last = obs[:, 3, 0].unsqueeze(1)
+            y_last = obs[:, 3, 1].unsqueeze(1)
+
+            cv_x = x_last + vx * t
+            cv_y = y_last + vy * t
+            cv_pred = torch.stack([cv_x, cv_y], dim=-1)
+
+            cv_total_ade += compute_ade(cv_pred, future).item() * obs.size(0)
+            cv_total_fde += compute_fde(cv_pred, future).item() * obs.size(0)
+            cv_final = torch.norm(cv_pred[:, -1] - future[:, -1], dim=1)
+            cv_miss_count += (cv_final > miss_threshold).sum().item()
+
+            total_n += obs.size(0)
+
+    avg_ade = total_ade / total_n
+    avg_fde = total_fde / total_n
+    avg_miss = 100.0 * miss_count / total_n
+
+    cv_avg_ade = cv_total_ade / total_n
+    cv_avg_fde = cv_total_fde / total_n
+    cv_avg_miss = 100.0 * cv_miss_count / total_n
+
+    print("\n========================================================")
+    print("         PHASE 2 FUSION METRICS REPORT                 ")
+    print("========================================================")
+    print(f"Total Trajectories Evaluated: {total_n}")
+    print("--------------------------------------------------------")
+    print("METRIC                  | BASELINE (CV) | FUSION MODEL ")
+    print("------------------------|---------------|----------------")
+    print(f"minADE@3 (meters)       | {cv_avg_ade:13.2f} | {avg_ade:14.2f}")
+    print(f"minFDE@3 (meters)       | {cv_avg_fde:13.2f} | {avg_fde:14.2f}")
+    print(f"Miss Rate (>2.0m)       | {cv_avg_miss:12.1f}% | {avg_miss:13.1f}%")
+    print("========================================================\n")
+
+
+if __name__ == '__main__':
+    evaluate_fusion()
diff --git a/backend/scripts/legacy/__init__.py b/backend/scripts/legacy/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7902317c3fd11e18fc720f18bc02f8e0f520be69
--- /dev/null
+++ b/backend/scripts/legacy/__init__.py
@@ -0,0 +1 @@
+"""Legacy Streamlit and hackathon app modules."""
diff --git a/backend/scripts/legacy/app_streamlit.py b/backend/scripts/legacy/app_streamlit.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb11bc82f1c7648dd155ed2a4380ec4bd23542d6
--- /dev/null
+++ b/backend/scripts/legacy/app_streamlit.py
@@ -0,0 +1,2533 @@
+﻿import json
+import io
+import math
+import time
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import plotly.graph_objects as go
+import streamlit as st
+import torch
+from PIL import Image
+
+try:
+    import cv2
+except Exception:
+    cv2 = None
+
+from torchvision.models.detection import (
+    FasterRCNN_ResNet50_FPN_Weights,
+    KeypointRCNN_ResNet50_FPN_Weights,
+    fasterrcnn_resnet50_fpn,
+    keypointrcnn_resnet50_fpn,
+)
+
+from backend.app.ml.inference import USING_FUSION_MODEL, predict as trajectory_predict
+from backend.app.ml.sensor_fusion import load_fusion_for_cam_frame, radar_stabilize_motion
+
+# ----------------------------
+# PAGE CONFIG
+# ----------------------------
+st.set_page_config(page_title="Multi-Agent Trajectory Prediction Simulator", layout="wide")
+
+BG_PRIMARY = "#05070f"
+BG_SECONDARY = "#0b1220"
+GRID_COLOR = "rgba(100, 116, 139, 0.22)"
+ACCENT = "#eb6b26"
+TARGET_PURPLE = "#a855f7"
+VRU_GREEN = "#22c55e"
+VEHICLE_YELLOW = "#facc15"
+EGO_CYAN = "#22d3ee"
+WHITE = "#e5e7eb"
+TRAJ_MODE_COLORS = ["#22d3ee", "#a855f7", "#fb923c"]
+
+ROAD_ASPHALT = "rgba(26, 34, 45, 0.94)"
+ROAD_SHOULDER = "rgba(12, 18, 28, 0.90)"
+LANE_SOLID = "rgba(226, 232, 240, 0.88)"
+LANE_DASH = "rgba(203, 213, 225, 0.72)"
+CENTER_DASH = "rgba(250, 204, 21, 0.82)"
+
+CAMERA_VIEWS = [
+    ("CAM_FRONT", "Front", 0.0),
+    ("CAM_FRONT_LEFT", "Front-Left", 40.0),
+    ("CAM_FRONT_RIGHT", "Front-Right", -40.0),
+]
+
+SYNTH_SKELETON_EDGES = [
+    (0, 1),
+    (1, 2),
+    (1, 3),
+    (2, 4),
+    (3, 5),
+    (1, 6),
+    (6, 7),
+    (6, 8),
+]
+
+COCO_SKELETON_EDGES = [
+    (0, 1),
+    (0, 2),
+    (1, 3),
+    (2, 4),
+    (5, 6),
+    (5, 7),
+    (7, 9),
+    (6, 8),
+    (8, 10),
+    (5, 11),
+    (6, 12),
+    (11, 12),
+    (11, 13),
+    (13, 15),
+    (12, 14),
+    (14, 16),
+]
+
+COCO_TO_LABEL = {
+    1: "person",
+    2: "bicycle",
+    3: "car",
+    4: "motorcycle",
+    6: "bus",
+    8: "truck",
+}
+
+VRU_LABELS = {"person", "bicycle", "motorcycle"}
+VEHICLE_LABELS = {"car", "bus", "truck"}
+
+
+def normalize_probs(probs):
+    arr = np.asarray(probs, dtype=float)
+    arr = np.clip(arr, 1e-6, None)
+    arr = arr / arr.sum()
+    return arr.tolist()
+
+
+def agent_color(agent):
+    if agent.get("is_target", False):
+        return TARGET_PURPLE
+    if agent.get("type") == "pedestrian":
+        return VRU_GREEN
+    return VEHICLE_YELLOW
+
+
+def coco_kind(label_name):
+    if label_name in VRU_LABELS:
+        return "pedestrian"
+    if label_name in VEHICLE_LABELS:
+        return "vehicle"
+    return None
+
+
+def iou_xyxy(box_a, box_b):
+    ax1, ay1, ax2, ay2 = box_a
+    bx1, by1, bx2, by2 = box_b
+
+    ix1 = max(ax1, bx1)
+    iy1 = max(ay1, by1)
+    ix2 = min(ax2, bx2)
+    iy2 = min(ay2, by2)
+
+    iw = max(0.0, ix2 - ix1)
+    ih = max(0.0, iy2 - iy1)
+    inter = iw * ih
+
+    area_a = max(0.0, ax2 - ax1) * max(0.0, ay2 - ay1)
+    area_b = max(0.0, bx2 - bx1) * max(0.0, by2 - by1)
+    union = area_a + area_b - inter
+
+    if union <= 1e-9:
+        return 0.0
+    return inter / union
+
+
+def pixel_to_bev(center_x, bottom_y, width, height):
+    # Dynamic scaling from current frame dimensions (no hardcoded resolution assumptions).
+    x_div = max(1.0, width / 80.0)
+    y_div = max(1.0, height / 50.0)
+
+    x_m = (center_x - 0.5 * width) / x_div
+    y_m = (bottom_y - 0.58 * height) / y_div
+    return float(x_m), float(y_m)
+
+
+def fallback_canvas():
+    h, w = 540, 960
+    canvas = np.zeros((h, w, 3), dtype=np.uint8)
+    canvas[:, :, 0] = 10
+    canvas[:, :, 1] = 14
+    canvas[:, :, 2] = 28
+    return canvas
+
+
+@st.cache_data(show_spinner=False)
+def list_channel_image_paths(channel):
+    base = Path("DataSet") / "samples" / channel
+    if not base.exists():
+        return []
+    return [str(p) for p in sorted(base.glob("*.jpg"))]
+
+
+@st.cache_data(show_spinner=False)
+def load_image_array(image_path):
+    return np.asarray(Image.open(image_path).convert("RGB"))
+
+
+def load_camera_frame(channel, frame_idx=0):
+    image_paths = list_channel_image_paths(channel)
+    if image_paths:
+        idx = int(np.clip(frame_idx, 0, len(image_paths) - 1))
+        return load_image_array(image_paths[idx]), image_paths[idx]
+    return fallback_canvas(), None
+
+
+@st.cache_resource(show_spinner=False)
+def load_cv_models():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    try:
+        det_weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+        det_model = fasterrcnn_resnet50_fpn(weights=det_weights, progress=False)
+        det_model.to(device).eval()
+
+        pose_weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
+        pose_model = keypointrcnn_resnet50_fpn(weights=pose_weights, progress=False)
+        pose_model.to(device).eval()
+
+        return {
+            "device": device,
+            "device_name": str(device),
+            "det_model": det_model,
+            "det_weights": det_weights,
+            "pose_model": pose_model,
+            "pose_weights": pose_weights,
+        }
+    except Exception as exc:
+        return {
+            "error": str(exc),
+            "device": device,
+            "device_name": str(device),
+        }
+
+
+def detect_objects_and_pose(image_arr, models, score_threshold=0.55, use_pose=True):
+    if "error" in models:
+        return []
+
+    device = models["device"]
+    pil_img = Image.fromarray(image_arr)
+
+    det_input = models["det_weights"].transforms()(pil_img).unsqueeze(0).to(device)
+    with torch.no_grad():
+        det_out = models["det_model"](det_input)[0]
+
+    boxes = det_out["boxes"].detach().cpu().numpy() if len(det_out["boxes"]) > 0 else np.zeros((0, 4))
+    scores = det_out["scores"].detach().cpu().numpy() if len(det_out["scores"]) > 0 else np.zeros((0,))
+    labels = det_out["labels"].detach().cpu().numpy() if len(det_out["labels"]) > 0 else np.zeros((0,))
+
+    detections = []
+    for i in range(len(scores)):
+        score = float(scores[i])
+        label_idx = int(labels[i])
+        label_name = COCO_TO_LABEL.get(label_idx)
+
+        if label_name is None or score < score_threshold:
+            continue
+
+        kind = coco_kind(label_name)
+        if kind is None:
+            continue
+
+        x1, y1, x2, y2 = [float(v) for v in boxes[i]]
+        detections.append(
+            {
+                "score": score,
+                "raw_label": label_name,
+                "kind": kind,
+                "box": [x1, y1, x2, y2],
+                "center_x": 0.5 * (x1 + x2),
+                "bottom_y": y2,
+                "keypoints": None,
+            }
+        )
+
+    if use_pose:
+        pose_input = models["pose_weights"].transforms()(pil_img).unsqueeze(0).to(device)
+        with torch.no_grad():
+            pose_out = models["pose_model"](pose_input)[0]
+
+        p_boxes = pose_out["boxes"].detach().cpu().numpy() if len(pose_out["boxes"]) > 0 else np.zeros((0, 4))
+        p_scores = pose_out["scores"].detach().cpu().numpy() if len(pose_out["scores"]) > 0 else np.zeros((0,))
+        p_labels = pose_out["labels"].detach().cpu().numpy() if len(pose_out["labels"]) > 0 else np.zeros((0,))
+        p_keypoints = pose_out["keypoints"].detach().cpu().numpy() if len(pose_out["keypoints"]) > 0 else np.zeros((0, 17, 3))
+
+        assigned = set()
+        for i in range(len(p_scores)):
+            if int(p_labels[i]) != 1:
+                continue
+            if float(p_scores[i]) < max(0.25, 0.8 * score_threshold):
+                continue
+
+            pose_box = [float(v) for v in p_boxes[i]]
+            best_idx = None
+            best_iou = 0.0
+
+            for det_idx, det in enumerate(detections):
+                if det_idx in assigned:
+                    continue
+                if det["raw_label"] != "person":
+                    continue
+                iou_val = iou_xyxy(det["box"], pose_box)
+                if iou_val > best_iou:
+                    best_iou = iou_val
+                    best_idx = det_idx
+
+            if best_idx is not None and best_iou > 0.1:
+                detections[best_idx]["keypoints"] = p_keypoints[i].tolist()
+                assigned.add(best_idx)
+
+    return detections
+
+
+def track_front_agents(front_paths, models, score_threshold=0.55, tracking_gate_px=90.0, use_pose=True):
+    tracks = {}
+    next_track_id = 1
+    front_final_detections = []
+
+    for frame_idx, frame_path in enumerate(front_paths):
+        frame_arr = load_image_array(frame_path)
+        h, w = frame_arr.shape[:2]
+
+        detections = detect_objects_and_pose(
+            frame_arr,
+            models,
+            score_threshold=score_threshold,
+            use_pose=use_pose,
+        )
+        detections.sort(key=lambda d: d["score"], reverse=True)
+
+        matched_track_ids = set()
+        frame_dets_with_ids = []
+
+        for det in detections:
+            wx, wy = pixel_to_bev(det["center_x"], det["bottom_y"], w, h)
+
+            best_track_id = None
+            best_dist = 1e9
+
+            for tid, tr in tracks.items():
+                if tr["kind"] != det["kind"]:
+                    continue
+                if tr["last_seen"] != frame_idx - 1:
+                    continue
+                if tid in matched_track_ids:
+                    continue
+
+                px_last, py_last = tr["history_pixel"][-1]
+                dist = math.hypot(det["center_x"] - px_last, det["bottom_y"] - py_last)
+                if dist < tracking_gate_px and dist < best_dist:
+                    best_dist = dist
+                    best_track_id = tid
+
+            if best_track_id is None:
+                best_track_id = next_track_id
+                next_track_id += 1
+                tracks[best_track_id] = {
+                    "id": best_track_id,
+                    "kind": det["kind"],
+                    "raw_label": det["raw_label"],
+                    "history_pixel": [],
+                    "history_world": [],
+                    "last_seen": -1,
+                    "last_box": None,
+                    "last_keypoints": None,
+                    "misses": 0,
+                }
+
+            tr = tracks[best_track_id]
+            tr["history_pixel"].append((float(det["center_x"]), float(det["bottom_y"])))
+            tr["history_world"].append((float(wx), float(wy)))
+            tr["last_seen"] = frame_idx
+            tr["raw_label"] = det["raw_label"]
+            tr["last_box"] = det["box"]
+            tr["last_keypoints"] = det.get("keypoints")
+            tr["misses"] = 0
+
+            matched_track_ids.add(best_track_id)
+
+            det = dict(det)
+            det["track_id"] = best_track_id
+            frame_dets_with_ids.append(det)
+
+        # Extrapolate temporarily-lost tracks so 4-point histories can still be formed.
+        for tid, tr in tracks.items():
+            if tr["last_seen"] == frame_idx:
+                continue
+            if tr["last_seen"] < frame_idx - 1:
+                continue
+
+            if len(tr["history_pixel"]) >= 2:
+                px_prev, py_prev = tr["history_pixel"][-2]
+                px_last, py_last = tr["history_pixel"][-1]
+                wx_prev, wy_prev = tr["history_world"][-2]
+                wx_last, wy_last = tr["history_world"][-1]
+
+                px_ex = px_last + (px_last - px_prev)
+                py_ex = py_last + (py_last - py_prev)
+                wx_ex = wx_last + (wx_last - wx_prev)
+                wy_ex = wy_last + (wy_last - wy_prev)
+            else:
+                px_ex, py_ex = tr["history_pixel"][-1]
+                wx_ex, wy_ex = tr["history_world"][-1]
+
+            tr["history_pixel"].append((float(px_ex), float(py_ex)))
+            tr["history_world"].append((float(wx_ex), float(wy_ex)))
+            tr["last_seen"] = frame_idx
+            tr["misses"] += 1
+
+        if frame_idx == len(front_paths) - 1:
+            front_final_detections = frame_dets_with_ids
+
+    valid_tracks = []
+    for tid, tr in tracks.items():
+        if len(tr["history_world"]) != len(front_paths):
+            continue
+        if tr["misses"] > 2:
+            continue
+
+        x0, y0 = tr["history_world"][0]
+        x1, y1 = tr["history_world"][-1]
+        motion = math.hypot(x1 - x0, y1 - y0)
+        if motion < 0.08:
+            continue
+
+        valid_tracks.append(
+            {
+                "id": tid,
+                "kind": tr["kind"],
+                "raw_label": tr["raw_label"],
+                "history_pixel": [tuple(p) for p in tr["history_pixel"]],
+                "history_world": [tuple(p) for p in tr["history_world"]],
+                "last_box": tr["last_box"],
+                "last_keypoints": tr["last_keypoints"],
+            }
+        )
+
+    valid_tracks.sort(key=lambda t: t["id"])
+    return valid_tracks, front_final_detections
+
+
+def raw_label_to_stabilizer_type(raw_label):
+    if raw_label == "person":
+        return "Person"
+    if raw_label == "bicycle":
+        return "Bicycle"
+    if raw_label == "motorcycle":
+        return "Motorcycle"
+    if raw_label == "bus":
+        return "Bus"
+    if raw_label == "truck":
+        return "Truck"
+    return "Car"
+
+
+def build_fusion_features(history_world, fusion_data):
+    if not fusion_data:
+        return None
+
+    lidar_xy = fusion_data.get("lidar_xy")
+    radar_xy = fusion_data.get("radar_xy")
+
+    if lidar_xy is None and radar_xy is None:
+        return None
+
+    feats = []
+    for px, py in history_world:
+        if lidar_xy is not None and len(lidar_xy) > 0:
+            dl = np.hypot(lidar_xy[:, 0] - px, lidar_xy[:, 1] - py)
+            lidar_cnt = int((dl < 2.0).sum())
+        else:
+            lidar_cnt = 0
+
+        if radar_xy is not None and len(radar_xy) > 0:
+            dr = np.hypot(radar_xy[:, 0] - px, radar_xy[:, 1] - py)
+            radar_cnt = int((dr < 2.5).sum())
+        else:
+            radar_cnt = 0
+
+        lidar_norm = min(80.0, float(lidar_cnt)) / 80.0
+        radar_norm = min(30.0, float(radar_cnt)) / 30.0
+        sensor_strength = min(1.0, (float(lidar_cnt) + 2.0 * float(radar_cnt)) / 100.0)
+        feats.append([lidar_norm, radar_norm, sensor_strength])
+
+    return feats
+
+
+def stabilize_tracks_with_radar(tracks, fusion_data):
+    if not tracks:
+        return tracks
+
+    packed = []
+    for tr in tracks:
+        hist = tr["history_world"]
+        if len(hist) >= 2:
+            dx = float(hist[-1][0] - hist[-2][0])
+            dy = float(hist[-1][1] - hist[-2][1])
+        else:
+            dx = 0.0
+            dy = 0.0
+
+        packed.append(
+            {
+                "type": raw_label_to_stabilizer_type(tr.get("raw_label", "car")),
+                "history": [tuple(p) for p in hist],
+                "dx": dx,
+                "dy": dy,
+            }
+        )
+
+    stabilized = radar_stabilize_motion(packed, fusion_data, dt_seconds=0.5)
+
+    updated = []
+    for tr, st in zip(tracks, stabilized):
+        t_copy = dict(tr)
+        t_copy["history_world"] = [(float(x), float(y)) for x, y in st["history"]]
+        updated.append(t_copy)
+
+    return updated
+
+
+def choose_target_track_id(tracks):
+    if not tracks:
+        return None
+
+    peds = [t for t in tracks if t["kind"] == "pedestrian"]
+    if peds:
+        best = min(peds, key=lambda t: math.hypot(t["history_world"][-1][0], t["history_world"][-1][1]))
+        return best["id"]
+
+    return tracks[0]["id"]
+
+
+def build_agents_from_tracks(tracks, fusion_data):
+    if not tracks:
+        return [], None, []
+
+    tracks_work = []
+    for tr in tracks:
+        tracks_work.append(
+            {
+                "id": tr["id"],
+                "kind": tr["kind"],
+                "raw_label": tr["raw_label"],
+                "history_pixel": [tuple(p) for p in tr["history_pixel"]],
+                "history_world": [tuple(p) for p in tr["history_world"]],
+                "last_box": tr.get("last_box"),
+                "last_keypoints": tr.get("last_keypoints"),
+            }
+        )
+
+    tracks_work = stabilize_tracks_with_radar(tracks_work, fusion_data)
+
+    target_id = choose_target_track_id(tracks_work)
+    agents = []
+
+    for tr in tracks_work:
+        neighbors = []
+        for other in tracks_work:
+            if other["id"] == tr["id"]:
+                continue
+            neighbors.append(other["history_world"])
+
+        if len(neighbors) > 12:
+            x0, y0 = tr["history_world"][-1]
+            neighbors = sorted(
+                neighbors,
+                key=lambda nh: math.hypot(nh[-1][0] - x0, nh[-1][1] - y0),
+            )[:12]
+
+        fusion_feats = build_fusion_features(tr["history_world"], fusion_data)
+
+        pred, probs, _ = trajectory_predict(
+            tr["history_world"],
+            neighbor_points_list=neighbors,
+            fusion_feats=fusion_feats,
+        )
+
+        pred_np = pred.detach().cpu().numpy()
+        probs_np = probs.detach().cpu().numpy()
+
+        predictions = []
+        for mode_i in range(pred_np.shape[0]):
+            mode_path = [(float(p[0]), float(p[1])) for p in pred_np[mode_i]]
+            predictions.append(mode_path)
+
+        agents.append(
+            {
+                "id": int(tr["id"]),
+                "type": "pedestrian" if tr["kind"] == "pedestrian" else "vehicle",
+                "raw_label": tr["raw_label"],
+                "history": [tuple(map(float, p)) for p in tr["history_world"]],
+                "predictions": predictions,
+                "probabilities": normalize_probs(probs_np.tolist()),
+                "is_target": tr["id"] == target_id,
+            }
+        )
+
+    return agents, target_id, tracks_work
+
+
+def assign_track_ids_to_front_detections(detections, tracks, gate_px=90.0):
+    if not detections:
+        return []
+
+    out = []
+    used_ids = set()
+
+    for det_idx, det in enumerate(detections):
+        d = dict(det)
+        d.setdefault("det_id", det_idx + 1)
+
+        if d.get("track_id") is not None:
+            used_ids.add(d["track_id"])
+            out.append(d)
+            continue
+
+        best_id = None
+        best_dist = 1e9
+
+        for tr in tracks:
+            if tr["id"] in used_ids:
+                continue
+            if tr["kind"] != d["kind"]:
+                continue
+
+            px, py = tr["history_pixel"][-1]
+            dist = math.hypot(d["center_x"] - px, d["bottom_y"] - py)
+            if dist < gate_px and dist < best_dist:
+                best_dist = dist
+                best_id = tr["id"]
+
+        d["track_id"] = best_id
+        if best_id is not None:
+            used_ids.add(best_id)
+        out.append(d)
+
+    return out
+
+
+@st.cache_data(show_spinner=False)
+def build_live_agents_bundle(anchor_idx, score_threshold, tracking_gate_px, use_pose):
+    front_paths = list_channel_image_paths("CAM_FRONT")
+    if len(front_paths) < 4:
+        return {"error": "Need at least 4 CAM_FRONT frames in DataSet/samples/CAM_FRONT."}
+
+    if anchor_idx < 3:
+        anchor_idx = 3
+    if anchor_idx >= len(front_paths):
+        anchor_idx = len(front_paths) - 1
+
+    models = load_cv_models()
+    if "error" in models:
+        return {
+            "error": f"Could not load CV models ({models['error']}).",
+            "device": models.get("device_name", "unknown"),
+        }
+
+    window_paths = front_paths[anchor_idx - 3 : anchor_idx + 1]
+
+    tracks, front_dets = track_front_agents(
+        window_paths,
+        models,
+        score_threshold=score_threshold,
+        tracking_gate_px=tracking_gate_px,
+        use_pose=use_pose,
+    )
+
+    if len(tracks) == 0:
+        return {"error": "No valid tracked moving agents found in selected frame window."}
+
+    front_curr = window_paths[-1]
+    fusion_data = load_fusion_for_cam_frame(Path(front_curr).name)
+
+    agents, target_id, tracks_stable = build_agents_from_tracks(tracks, fusion_data)
+    if len(agents) == 0:
+        return {"error": "Tracking succeeded but trajectory prediction produced no agents."}
+
+    snapshots = {}
+    for channel, _, _ in CAMERA_VIEWS:
+        ch_paths = list_channel_image_paths(channel)
+
+        if not ch_paths:
+            snapshots[channel] = {
+                "image": fallback_canvas(),
+                "detections": [],
+                "frame_path": None,
+            }
+            continue
+
+        ch_idx = min(anchor_idx, len(ch_paths) - 1)
+        ch_path = ch_paths[ch_idx]
+        ch_arr = load_image_array(ch_path)
+
+        if channel == "CAM_FRONT" and Path(ch_path).name == Path(front_curr).name:
+            ch_dets = [dict(d) for d in front_dets]
+        else:
+            ch_dets = detect_objects_and_pose(
+                ch_arr,
+                models,
+                score_threshold=score_threshold,
+                use_pose=use_pose,
+            )
+
+        for i, det in enumerate(ch_dets):
+            det.setdefault("track_id", None)
+            det.setdefault("det_id", i + 1)
+
+        snapshots[channel] = {
+            "image": ch_arr,
+            "detections": ch_dets,
+            "frame_path": ch_path,
+        }
+
+    if "CAM_FRONT" in snapshots:
+        snapshots["CAM_FRONT"]["detections"] = assign_track_ids_to_front_detections(
+            snapshots["CAM_FRONT"]["detections"],
+            tracks_stable,
+            gate_px=tracking_gate_px,
+        )
+
+    return {
+        "agents": agents,
+        "fusion_data": fusion_data,
+        "camera_snapshots": snapshots,
+        "target_track_id": target_id,
+        "device": models.get("device_name", "unknown"),
+        "front_anchor_path": front_curr,
+        "mode": "live_fusion",
+    }
+
+
+def uploaded_file_to_array(uploaded_file):
+    if uploaded_file is None:
+        return None
+    try:
+        return np.asarray(Image.open(io.BytesIO(uploaded_file.getvalue())).convert("RGB"))
+    except Exception:
+        return None
+
+
+def match_two_frame_tracks(det_prev, det_curr, tracking_gate_px=90.0, min_motion_px=0.0):
+    used_curr = set()
+    matches = []
+
+    det_prev = sorted(det_prev, key=lambda d: d["score"], reverse=True)
+    det_curr = sorted(det_curr, key=lambda d: d["score"], reverse=True)
+
+    for d0 in det_prev:
+        best_idx = None
+        best_dist = 1e9
+
+        for j, d1 in enumerate(det_curr):
+            if j in used_curr:
+                continue
+            if d0["kind"] != d1["kind"]:
+                continue
+
+            dist = math.hypot(d1["center_x"] - d0["center_x"], d1["bottom_y"] - d0["bottom_y"])
+            if dist < tracking_gate_px and dist < best_dist:
+                best_dist = dist
+                best_idx = j
+
+        if best_idx is None:
+            continue
+
+        used_curr.add(best_idx)
+        d1 = det_curr[best_idx]
+
+        matches.append((d0, d1, float(best_dist)))
+
+    return matches
+
+
+def build_two_image_agents_bundle(img_prev, img_curr, score_threshold, tracking_gate_px, min_motion_px, use_pose):
+    models = load_cv_models()
+    if "error" in models:
+        return {
+            "error": f"Could not load CV models ({models['error']}).",
+            "device": models.get("device_name", "unknown"),
+        }
+
+    det_prev = detect_objects_and_pose(img_prev, models, score_threshold=score_threshold, use_pose=use_pose)
+    det_curr = detect_objects_and_pose(img_curr, models, score_threshold=score_threshold, use_pose=use_pose)
+
+    # Two-image mode focuses on VRUs (pedestrians/cyclists/motorcycles).
+    det_prev_vru = [d for d in det_prev if d.get("kind") == "pedestrian"]
+    det_curr_vru = [d for d in det_curr if d.get("kind") == "pedestrian"]
+
+    for i, d in enumerate(det_prev):
+        d["det_id"] = i + 1
+        d["track_id"] = None
+    for i, d in enumerate(det_curr):
+        d["det_id"] = i + 1
+        d["track_id"] = None
+
+    if len(det_curr_vru) == 0:
+        return {"error": "No pedestrian/cyclist detections found in image 2 (t0)."}
+
+    matches = match_two_frame_tracks(
+        det_prev_vru,
+        det_curr_vru,
+        tracking_gate_px=tracking_gate_px,
+        min_motion_px=0.0,
+    )
+
+    # Backfill unmatched current VRUs so every visible VRU at t0 gets a prediction.
+    matched_curr_ids = {id(m[1]) for m in matches}
+    for d1 in det_curr_vru:
+        if id(d1) in matched_curr_ids:
+            continue
+
+        if len(det_prev_vru) == 0:
+            matches.append((None, d1, float("inf")))
+            continue
+
+        nearest_prev = min(
+            det_prev_vru,
+            key=lambda d0: math.hypot(d1["center_x"] - d0["center_x"], d1["bottom_y"] - d0["bottom_y"]),
+        )
+        dist = math.hypot(
+            d1["center_x"] - nearest_prev["center_x"],
+            d1["bottom_y"] - nearest_prev["bottom_y"],
+        )
+
+        # If previous frame support is weak, still include the agent with near-static history.
+        if dist <= 1.5 * tracking_gate_px:
+            matches.append((nearest_prev, d1, float(dist)))
+        else:
+            matches.append((None, d1, float("inf")))
+
+    h0, w0 = img_prev.shape[:2]
+    h1, w1 = img_curr.shape[:2]
+
+    tracks = []
+    for track_id, (d0, d1, dist_px) in enumerate(matches, start=1):
+        if d0 is not None and d0.get("track_id") is None:
+            d0["track_id"] = track_id
+        d1["track_id"] = track_id
+
+        if d0 is not None:
+            p_prev = pixel_to_bev(d0["center_x"], d0["bottom_y"], w0, h0)
+        else:
+            p_prev = None
+        p_curr = pixel_to_bev(d1["center_x"], d1["bottom_y"], w1, h1)
+
+        if p_prev is None:
+            vx, vy = 0.0, 0.0
+            p_prev = p_curr
+        else:
+            vx = p_curr[0] - p_prev[0]
+            vy = p_curr[1] - p_prev[1]
+
+        # Keep the agent even if tiny displacement; just make observation history static.
+        if dist_px < float(min_motion_px):
+            vx, vy = 0.0, 0.0
+            p_prev = p_curr
+
+        # Reconstruct a 4-point observation history from 2 frames.
+        hist = [
+            (p_curr[0] - 3.0 * vx, p_curr[1] - 3.0 * vy),
+            (p_curr[0] - 2.0 * vx, p_curr[1] - 2.0 * vy),
+            (p_prev[0], p_prev[1]),
+            (p_curr[0], p_curr[1]),
+        ]
+
+        tracks.append(
+            {
+                "id": track_id,
+                "kind": d1["kind"],
+                "raw_label": d1["raw_label"],
+                "history_world": hist,
+            }
+        )
+
+    # In this mode, every VRU is treated as a target for prediction display.
+    target_track_id = None
+
+    agents = []
+    for tr in tracks:
+        neighbors = [other["history_world"] for other in tracks if other["id"] != tr["id"]]
+
+        pred, probs, _ = trajectory_predict(
+            tr["history_world"],
+            neighbor_points_list=neighbors,
+            fusion_feats=None,
+        )
+
+        pred_np = pred.detach().cpu().numpy()
+        probs_np = probs.detach().cpu().numpy()
+
+        predictions = []
+        for mode_i in range(pred_np.shape[0]):
+            predictions.append([(float(p[0]), float(p[1])) for p in pred_np[mode_i]])
+
+        agents.append(
+            {
+                "id": int(tr["id"]),
+                "type": "pedestrian" if tr["kind"] == "pedestrian" else "vehicle",
+                "raw_label": tr["raw_label"],
+                "history": [tuple(map(float, p)) for p in tr["history_world"]],
+                "predictions": predictions,
+                "probabilities": normalize_probs(probs_np.tolist()),
+                "is_target": True,
+            }
+        )
+
+    return {
+        "agents": agents,
+        "target_track_id": target_track_id,
+        "camera_snapshots": {
+            "pair_prev": {"image": img_prev, "detections": det_prev},
+            "pair_curr": {"image": img_curr, "detections": det_curr},
+        },
+        "device": models.get("device_name", "unknown"),
+        "mode": "two_upload",
+        "match_count": len(agents),
+    }
+
+
+def bev_to_pixel(x_m, y_m, width, height):
+    x_div = max(1.0, width / 80.0)
+    y_div = max(1.0, height / 50.0)
+
+    px = x_m * x_div + 0.5 * width
+    py = y_m * y_div + 0.58 * height
+    return float(px), float(py)
+
+
+def create_prediction_overlay_figure(image_arr, detections, agents, step, target_track_id=None, highlight_track_ids=None):
+    fig = create_camera_figure_detections(
+        image_arr,
+        detections,
+        camera_label="Prediction Output",
+        target_track_id=target_track_id,
+        highlight_track_ids=highlight_track_ids,
+    )
+
+    h, w = image_arr.shape[:2]
+
+    for a in agents:
+        color = agent_color(a)
+        k = best_mode_idx(a)
+        pred = a["predictions"][k]
+        end_idx = max(1, min(step, len(pred)))
+        path_world = [a["history"][-1]] + pred[:end_idx]
+
+        px = []
+        py = []
+        for xw, yw in path_world:
+            u, v = bev_to_pixel(xw, yw, w, h)
+            px.append(u)
+            py.append(v)
+
+        # Glow trail for a cleaner, reference-style visual emphasis.
+        for lw, op in [(14, 0.12), (8, 0.20), (4, 0.95)]:
+            fig.add_trace(
+                go.Scatter(
+                    x=px,
+                    y=py,
+                    mode="lines",
+                    line={"color": color, "width": lw, "shape": "spline", "smoothing": 1.1},
+                    opacity=op,
+                    hoverinfo="skip",
+                    showlegend=False,
+                )
+            )
+
+    return fig
+
+
+def remove_vru_foreground_from_scene(scene_image, scene_detections=None):
+    if scene_image is None or cv2 is None:
+        return scene_image
+
+    if scene_detections is None or len(scene_detections) == 0:
+        return scene_image
+
+    h, w = scene_image.shape[:2]
+    mask = np.zeros((h, w), dtype=np.uint8)
+
+    for det in scene_detections:
+        if det.get("kind") != "pedestrian":
+            continue
+
+        x1, y1, x2, y2 = det.get("box", [0, 0, 0, 0])
+        padx = 0.08 * (x2 - x1)
+        pady = 0.10 * (y2 - y1)
+
+        xa = int(max(0, min(w - 1, x1 - padx)))
+        ya = int(max(0, min(h - 1, y1 - pady)))
+        xb = int(max(0, min(w - 1, x2 + padx)))
+        yb = int(max(0, min(h - 1, y2 + pady)))
+
+        if xb > xa and yb > ya:
+            cv2.rectangle(mask, (xa, ya), (xb, yb), color=255, thickness=-1)
+
+    if int(mask.sum()) == 0:
+        return scene_image
+
+    bgr = cv2.cvtColor(scene_image, cv2.COLOR_RGB2BGR)
+    inpainted = cv2.inpaint(bgr, mask, 7, cv2.INPAINT_TELEA)
+    return cv2.cvtColor(inpainted, cv2.COLOR_BGR2RGB)
+
+
+def build_pseudo_bev_background(scene_image, x_min, x_max, y_min, y_max, scene_detections=None):
+    # Context BEV from a single front-view frame using inverse-perspective remap.
+    if scene_image is None or cv2 is None:
+        return None
+
+    cleaned = remove_vru_foreground_from_scene(scene_image, scene_detections=scene_detections)
+    h, w = cleaned.shape[:2]
+    if h < 20 or w < 20:
+        return None
+
+    out_w, out_h = 1100, 820
+
+    xs = np.linspace(x_min, x_max, out_w, dtype=np.float32)
+    ys = np.linspace(y_max, y_min, out_h, dtype=np.float32)
+    xg, yg = np.meshgrid(xs, ys)
+
+    cx = 0.5 * w
+    horizon = 0.42 * h
+
+    depth = np.clip((yg - y_min) + 2.0, 2.0, None)
+
+    map_x = cx + (0.95 * w) * xg / (depth + 6.0)
+    map_y = horizon + (5.8 * h) / depth
+
+    map_x = np.clip(map_x, 0, w - 1).astype(np.float32)
+    map_y = np.clip(map_y, 0, h - 1).astype(np.float32)
+
+    warped = cv2.remap(cleaned, map_x, map_y, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
+    warped = cv2.GaussianBlur(warped, (0, 0), 0.8)
+    warped = np.clip(warped.astype(np.float32) * 0.78, 0, 255).astype(np.uint8)
+    return warped
+
+
+def compute_reference_bounds(agents, step, show_multimodal):
+    xs = [0.0]
+    ys = [0.0]
+
+    for a in agents:
+        for xh, yh in a["history"]:
+            xs.append(float(xh))
+            ys.append(float(yh))
+
+        k_best = best_mode_idx(a)
+        best_path = a["predictions"][k_best][: max(1, min(step, len(a["predictions"][k_best])))]
+        for xp, yp in best_path:
+            xs.append(float(xp))
+            ys.append(float(yp))
+
+        if show_multimodal:
+            for m, m_path in enumerate(a["predictions"]):
+                if m == k_best:
+                    continue
+                m_slice = m_path[: max(1, min(step, len(m_path)))]
+                for xp, yp in m_slice:
+                    xs.append(float(xp))
+                    ys.append(float(yp))
+
+    x_min = min(xs) - 6.0
+    x_max = max(xs) + 6.0
+    y_min = min(ys) - 8.0
+    y_max = max(ys) + 10.0
+
+    min_x_span = 44.0
+    min_y_span = 64.0
+
+    x_span = x_max - x_min
+    y_span = y_max - y_min
+
+    if x_span < min_x_span:
+        xc = 0.5 * (x_min + x_max)
+        x_min = xc - 0.5 * min_x_span
+        x_max = xc + 0.5 * min_x_span
+
+    if y_span < min_y_span:
+        yc = 0.5 * (y_min + y_max)
+        y_min = yc - 0.5 * min_y_span
+        y_max = yc + 0.5 * min_y_span
+
+    return x_min, x_max, y_min, y_max
+
+
+def spread_agent_markers(agents, step, tol=0.45, radius=0.55):
+    positions = [position_at_step(a, step) for a in agents]
+    offsets = []
+
+    for i, (xi, yi) in enumerate(positions):
+        near = []
+        for j, (xj, yj) in enumerate(positions):
+            if math.hypot(xi - xj, yi - yj) <= tol:
+                near.append(j)
+
+        if len(near) <= 1:
+            offsets.append((0.0, 0.0))
+            continue
+
+        near_sorted = sorted(near)
+        rank = near_sorted.index(i)
+        ang = 2.0 * math.pi * rank / len(near_sorted)
+        offsets.append((radius * math.cos(ang), radius * math.sin(ang)))
+
+    return positions, offsets
+
+
+def hex_to_rgba(hex_color, alpha):
+    alpha = float(np.clip(alpha, 0.0, 1.0))
+    c = str(hex_color).lstrip("#")
+    if len(c) != 6:
+        return f"rgba(229,231,235,{alpha:.3f})"
+    r = int(c[0:2], 16)
+    g = int(c[2:4], 16)
+    b = int(c[4:6], 16)
+    return f"rgba({r},{g},{b},{alpha:.3f})"
+
+
+def summarize_agent_probabilities(agent):
+    bins = {"Straight": 0.0, "Left": 0.0, "Right": 0.0, "Stop": 0.0}
+
+    classifier = globals().get("classify_direction")
+    for mode_idx, mode_path in enumerate(agent.get("predictions", [])):
+        if mode_idx >= len(agent.get("probabilities", [])):
+            continue
+
+        if callable(classifier):
+            direction = classifier(agent["history"], mode_path)
+        else:
+            direction = ["Straight", "Left", "Right"][mode_idx % 3]
+
+        if direction not in bins:
+            direction = "Straight"
+
+        bins[direction] += float(agent["probabilities"][mode_idx])
+
+    ranked = sorted(bins.items(), key=lambda kv: kv[1], reverse=True)
+    top3 = ranked[:3]
+    summary = ", ".join([f"{name} {prob * 100:.0f}%" for name, prob in top3])
+    return summary, bins
+
+
+def add_structured_road_scene(fig, x_min, x_max, y_min, y_max, add_crosswalk=True):
+    road_half = float(np.clip(0.24 * (x_max - x_min), 9.5, 15.5))
+    shoulder_half = road_half + 3.2
+
+    fig.add_shape(
+        type="rect",
+        x0=x_min,
+        y0=y_min,
+        x1=x_max,
+        y1=y_max,
+        line={"width": 0},
+        fillcolor=ROAD_SHOULDER,
+        layer="below",
+    )
+
+    fig.add_shape(
+        type="rect",
+        x0=-shoulder_half,
+        y0=y_min,
+        x1=shoulder_half,
+        y1=y_max,
+        line={"width": 0},
+        fillcolor="rgba(18, 25, 35, 0.95)",
+        layer="below",
+    )
+
+    fig.add_shape(
+        type="rect",
+        x0=-road_half,
+        y0=y_min,
+        x1=road_half,
+        y1=y_max,
+        line={"width": 0},
+        fillcolor=ROAD_ASPHALT,
+        layer="below",
+    )
+
+    for x_edge in (-road_half, road_half):
+        fig.add_shape(
+            type="line",
+            x0=x_edge,
+            y0=y_min,
+            x1=x_edge,
+            y1=y_max,
+            line={"color": LANE_SOLID, "width": 2.5},
+            layer="below",
+        )
+
+    lane_w = (2.0 * road_half) / 4.0
+    for lane_idx in range(1, 4):
+        x_lane = -road_half + lane_idx * lane_w
+        line_color = CENTER_DASH if lane_idx == 2 else LANE_DASH
+        line_width = 2.4 if lane_idx == 2 else 1.8
+        fig.add_shape(
+            type="line",
+            x0=x_lane,
+            y0=y_min,
+            x1=x_lane,
+            y1=y_max,
+            line={"color": line_color, "width": line_width, "dash": "dash"},
+            layer="below",
+        )
+
+    if add_crosswalk:
+        cross_y = float(np.clip(8.0, y_min + 5.5, y_max - 5.5))
+        stripe_h = 0.7
+        stripe_gap = 0.55
+        for i in range(-4, 5):
+            y0 = cross_y + i * (stripe_h + stripe_gap)
+            y1 = y0 + stripe_h
+            fig.add_shape(
+                type="rect",
+                x0=-road_half + 0.7,
+                y0=y0,
+                x1=road_half - 0.7,
+                y1=y1,
+                line={"width": 0},
+                fillcolor="rgba(229, 231, 235, 0.14)",
+                layer="below",
+            )
+
+
+def build_reference_bev_figure(agents, step, show_multimodal, scene_image=None, scene_detections=None):
+    fig = go.Figure()
+
+    x_min, x_max, y_min, y_max = compute_reference_bounds(agents, step, show_multimodal)
+
+    bg = build_pseudo_bev_background(
+        scene_image,
+        x_min,
+        x_max,
+        y_min,
+        y_max,
+        scene_detections=scene_detections,
+    )
+
+    add_structured_road_scene(fig, x_min, x_max, y_min, y_max, add_crosswalk=True)
+
+    if bg is not None:
+        fig.add_layout_image(
+            dict(
+                source=Image.fromarray(bg),
+                xref="x",
+                yref="y",
+                x=x_min,
+                y=y_max,
+                sizex=x_max - x_min,
+                sizey=y_max - y_min,
+                sizing="stretch",
+                opacity=0.38,
+                layer="below",
+            )
+        )
+
+        # Dark wash to keep trajectories readable on real-scene texture.
+        fig.add_shape(
+            type="rect",
+            x0=x_min,
+            y0=y_min,
+            x1=x_max,
+            y1=y_max,
+            line={"width": 0},
+            fillcolor="rgba(4, 8, 18, 0.36)",
+            layer="below",
+        )
+
+    fig.add_shape(
+        type="rect",
+        x0=-1.1,
+        y0=-2.2,
+        x1=1.1,
+        y1=2.2,
+        line={"color": EGO_CYAN, "width": 2.2},
+        fillcolor="rgba(34,211,238,0.20)",
+    )
+    fig.add_annotation(
+        x=0.0,
+        y=4.2,
+        ax=0.0,
+        ay=1.2,
+        showarrow=True,
+        arrowhead=3,
+        arrowwidth=2.8,
+        arrowcolor=EGO_CYAN,
+        text="",
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker={"size": 10, "symbol": "circle", "color": VRU_GREEN},
+            name="Pedestrian",
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker={"size": 10, "symbol": "square", "color": VEHICLE_YELLOW},
+            name="Vehicle",
+        )
+    )
+
+    positions, marker_offsets = spread_agent_markers(agents, step)
+    alt_legend_added = False
+
+    for idx, a in enumerate(agents):
+        base_color = agent_color(a)
+        best_idx = best_mode_idx(a)
+        best_prob = float(a["probabilities"][best_idx]) if len(a["probabilities"]) > 0 else 0.0
+        marker_color = hex_to_rgba(base_color, 0.48 + 0.52 * best_prob)
+
+        cx, cy = positions[idx]
+        ox, oy = marker_offsets[idx]
+        curr_x = cx + ox
+        curr_y = cy + oy
+
+        summary_text, _ = summarize_agent_probabilities(a)
+        hover_text = (
+            f"ID {a['id']}<br>Type: {a['type'].title()}"
+            f"<br>{summary_text}<br>Best path confidence: {best_prob * 100:.1f}%"
+        )
+
+        hx, hy = smooth_path(a["history"])
+        fig.add_trace(
+            go.Scatter(
+                x=hx,
+                y=hy,
+                mode="lines",
+                line={"color": "rgba(226,232,240,0.55)", "width": 2.2, "dash": "dot", "shape": "spline", "smoothing": 1.0},
+                hovertemplate=f"ID {a['id']} past trajectory<extra></extra>",
+                name="Past trajectory" if idx == 0 else None,
+                showlegend=(idx == 0),
+            )
+        )
+
+        fig.add_trace(
+            go.Scatter(
+                x=[curr_x],
+                y=[curr_y],
+                mode="markers+text",
+                marker={
+                    "size": 11,
+                    "symbol": "circle" if a.get("type") == "pedestrian" else "square",
+                    "color": marker_color,
+                    "line": {"color": "rgba(5,7,15,0.95)", "width": 1.2},
+                },
+                text=[f"ID {a['id']}"],
+                textposition="top center",
+                textfont={"size": 10, "color": WHITE},
+                hovertemplate=f"{hover_text}<extra></extra>",
+                showlegend=False,
+            )
+        )
+
+        px, py = previous_position_for_velocity(a, step)
+        dx, dy = cx - px, cy - py
+        norm = math.hypot(dx, dy)
+        if norm > 1e-3:
+            vx, vy = (dx / norm) * 2.0, (dy / norm) * 2.0
+            fig.add_annotation(
+                x=curr_x + vx,
+                y=curr_y + vy,
+                ax=curr_x,
+                ay=curr_y,
+                showarrow=True,
+                arrowhead=2,
+                arrowsize=1,
+                arrowwidth=2,
+                arrowcolor=base_color,
+                text="",
+            )
+
+        mode_order = [best_idx, 0, 1, 2]
+        mode_order = list(dict.fromkeys(mode_order))
+
+        for rank, m in enumerate(mode_order[:3]):
+            if (not show_multimodal) and rank > 0:
+                continue
+
+            mode_prob = float(a["probabilities"][m]) if m < len(a["probabilities"]) else 0.0
+            mode_color = TRAJ_MODE_COLORS[m % len(TRAJ_MODE_COLORS)]
+
+            mode_path = a["predictions"][m]
+            mode_slice = mode_path[: max(1, min(step, len(mode_path)))]
+            tx, ty = smooth_path([a["history"][-1]] + mode_slice)
+            is_best = m == best_idx
+
+            if is_best:
+                for lw, op in [(14, 0.08), (9, 0.16)]:
+                    fig.add_trace(
+                        go.Scatter(
+                            x=tx,
+                            y=ty,
+                            mode="lines",
+                            line={"color": mode_color, "width": lw, "shape": "spline", "smoothing": 1.15},
+                            opacity=op,
+                            hoverinfo="skip",
+                            showlegend=False,
+                        )
+                    )
+
+            fig.add_trace(
+                go.Scatter(
+                    x=tx,
+                    y=ty,
+                    mode="lines",
+                    line={
+                        "color": mode_color,
+                        "width": 4.1 if is_best else 2.1,
+                        "dash": "solid" if is_best else "dash",
+                        "shape": "spline",
+                        "smoothing": 1.15,
+                    },
+                    opacity=(0.72 + 0.26 * mode_prob) if is_best else (0.36 + 0.32 * mode_prob),
+                    hovertemplate=(
+                        f"ID {a['id']}<br>Mode {m + 1}"
+                        f"<br>Probability: {mode_prob * 100:.1f}%<extra></extra>"
+                    ),
+                    name=(
+                        "Best path" if (is_best and idx == 0) else
+                        "Alternative paths" if ((not is_best) and (not alt_legend_added)) else None
+                    ),
+                    showlegend=(is_best and idx == 0) or ((not is_best) and (not alt_legend_added)),
+                )
+            )
+
+            if (not is_best) and (not alt_legend_added):
+                alt_legend_added = True
+
+        if a.get("is_target", False):
+            fig.add_trace(
+                go.Scatter(
+                    x=[curr_x + 0.9],
+                    y=[curr_y + 1.1],
+                    mode="text",
+                    text=[summary_text],
+                    textfont={"size": 9, "color": "rgba(226,232,240,0.90)"},
+                    hoverinfo="skip",
+                    showlegend=False,
+                )
+            )
+
+    fig.update_layout(
+        title={"text": "Main BEV Simulation", "x": 0.02, "font": {"size": 20, "color": WHITE}},
+        paper_bgcolor=BG_SECONDARY,
+        plot_bgcolor=BG_SECONDARY,
+        legend={"orientation": "h", "y": 1.03, "x": 0.0, "font": {"color": WHITE, "size": 11}},
+        margin={"l": 16, "r": 16, "t": 52, "b": 10},
+        height=700,
+    )
+    fig.update_xaxes(
+        title_text="X Lateral (m)",
+        range=[x_min, x_max],
+        color=WHITE,
+        dtick=5,
+        showgrid=True,
+        gridcolor="rgba(148,163,184,0.16)",
+        zeroline=False,
+    )
+    fig.update_yaxes(
+        title_text="Y Forward (m)",
+        range=[y_min, y_max],
+        color=WHITE,
+        dtick=5,
+        showgrid=True,
+        gridcolor="rgba(148,163,184,0.16)",
+        scaleanchor="x",
+        scaleratio=1,
+        zeroline=False,
+    )
+
+    return fig
+
+
+def best_mode_idx(agent):
+    probs = np.asarray(agent["probabilities"], dtype=float)
+    return int(np.argmax(probs))
+
+
+def position_at_step(agent, step):
+    if step <= 0:
+        return tuple(agent["history"][-1])
+
+    k = best_mode_idx(agent)
+    pred = agent["predictions"][k]
+    idx = min(step - 1, len(pred) - 1)
+    return tuple(pred[idx])
+
+
+def previous_position_for_velocity(agent, step):
+    if step <= 1:
+        return tuple(agent["history"][-1])
+
+    k = best_mode_idx(agent)
+    pred = agent["predictions"][k]
+    idx = max(0, min(step - 2, len(pred) - 1))
+    return tuple(pred[idx])
+
+
+def project_world_to_camera(x, y, width, height, yaw_deg):
+    # Ego frame: x right, y forward.
+    yaw = np.deg2rad(yaw_deg)
+    side = x * np.cos(yaw) + y * np.sin(yaw)
+    depth = y * np.cos(yaw) - x * np.sin(yaw)
+
+    if depth <= 1.2:
+        return None
+
+    focal = width * 0.85
+    u = width * 0.5 + (side / depth) * focal
+    v = height * 0.84 - min(280.0, 460.0 / (depth + 0.6))
+    return float(u), float(v), float(depth)
+
+
+def build_synth_skeleton_points(u, v, box_w, box_h):
+    head = (u, v - 0.38 * box_h)
+    neck = (u, v - 0.28 * box_h)
+    l_sh = (u - 0.22 * box_w, v - 0.22 * box_h)
+    r_sh = (u + 0.22 * box_w, v - 0.22 * box_h)
+    l_hand = (u - 0.34 * box_w, v - 0.03 * box_h)
+    r_hand = (u + 0.34 * box_w, v - 0.03 * box_h)
+    hip = (u, v - 0.02 * box_h)
+    l_knee = (u - 0.14 * box_w, v + 0.30 * box_h)
+    r_knee = (u + 0.14 * box_w, v + 0.30 * box_h)
+    return [head, neck, l_sh, r_sh, l_hand, r_hand, hip, l_knee, r_knee]
+
+
+def add_polyline_trace(fig, points, edges, color, point_size=4):
+    xs = []
+    ys = []
+    for a, b in edges:
+        if a >= len(points) or b >= len(points):
+            continue
+        xs.extend([points[a][0], points[b][0], None])
+        ys.extend([points[a][1], points[b][1], None])
+
+    fig.add_trace(
+        go.Scatter(
+            x=xs,
+            y=ys,
+            mode="lines",
+            line={"color": color, "width": 2},
+            hoverinfo="skip",
+            showlegend=False,
+        )
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[p[0] for p in points],
+            y=[p[1] for p in points],
+            mode="markers",
+            marker={"size": point_size, "color": "#e2e8f0"},
+            hoverinfo="skip",
+            showlegend=False,
+        )
+    )
+
+
+def add_coco_pose_trace(fig, keypoints, color, conf_thresh=0.2):
+    if keypoints is None:
+        return
+    if len(keypoints) < 17:
+        return
+
+    xs = []
+    ys = []
+    for a, b in COCO_SKELETON_EDGES:
+        if keypoints[a][2] < conf_thresh or keypoints[b][2] < conf_thresh:
+            continue
+        xs.extend([keypoints[a][0], keypoints[b][0], None])
+        ys.extend([keypoints[a][1], keypoints[b][1], None])
+
+    if len(xs) > 0:
+        fig.add_trace(
+            go.Scatter(
+                x=xs,
+                y=ys,
+                mode="lines",
+                line={"color": color, "width": 2},
+                hoverinfo="skip",
+                showlegend=False,
+            )
+        )
+
+    pts = [kp for kp in keypoints if kp[2] >= conf_thresh]
+    if len(pts) > 0:
+        fig.add_trace(
+            go.Scatter(
+                x=[p[0] for p in pts],
+                y=[p[1] for p in pts],
+                mode="markers",
+                marker={"size": 4, "color": "#e2e8f0"},
+                hoverinfo="skip",
+                showlegend=False,
+            )
+        )
+
+
+def create_camera_figure_projected(image_arr, agents, camera_label, yaw_deg, step):
+    h, w = image_arr.shape[0], image_arr.shape[1]
+
+    fig = go.Figure()
+    fig.add_trace(go.Image(z=image_arr))
+
+    for agent in agents:
+        x, y = position_at_step(agent, step)
+        projection = project_world_to_camera(x, y, w, h, yaw_deg)
+        if projection is None:
+            continue
+
+        u, v, depth = projection
+        if u < -40 or u > w + 40 or v < -40 or v > h + 40:
+            continue
+
+        is_ped = agent["type"] == "pedestrian"
+        color = agent_color(agent)
+
+        box_h = max(22.0, min(180.0, 260.0 / (depth + 0.5)))
+        box_w = box_h * (0.42 if is_ped else 0.90)
+        x1, y1 = u - box_w / 2, v - box_h
+        x2, y2 = u + box_w / 2, v
+
+        fig.add_shape(
+            type="rect",
+            x0=x1,
+            y0=y1,
+            x1=x2,
+            y1=y2,
+            line={"color": color, "width": 2},
+            fillcolor="rgba(0,0,0,0)",
+        )
+
+        fig.add_trace(
+            go.Scatter(
+                x=[x1],
+                y=[max(4, y1 - 12)],
+                mode="text",
+                text=[f"ID {agent['id']}"],
+                textfont={"size": 11, "color": color},
+                hoverinfo="skip",
+                showlegend=False,
+            )
+        )
+
+        if is_ped:
+            kps = build_synth_skeleton_points(u, v, box_w, box_h)
+            add_polyline_trace(fig, kps, SYNTH_SKELETON_EDGES, color, point_size=4)
+
+    fig.update_xaxes(visible=False, range=[0, w])
+    fig.update_yaxes(visible=False, range=[h, 0], scaleanchor="x", scaleratio=1)
+    fig.update_layout(
+        title={"text": camera_label, "x": 0.02, "font": {"color": WHITE, "size": 15}},
+        paper_bgcolor=BG_SECONDARY,
+        plot_bgcolor=BG_SECONDARY,
+        margin={"l": 0, "r": 0, "t": 36, "b": 0},
+        height=300,
+    )
+    return fig
+
+
+def create_camera_figure_detections(image_arr, detections, camera_label, target_track_id=None, highlight_track_ids=None):
+    h, w = image_arr.shape[0], image_arr.shape[1]
+
+    fig = go.Figure()
+    fig.add_trace(go.Image(z=image_arr))
+
+    for i, det in enumerate(detections):
+        x1, y1, x2, y2 = det["box"]
+        kind = det.get("kind", "vehicle")
+        track_id = det.get("track_id")
+
+        if highlight_track_ids is not None and track_id is not None and track_id in highlight_track_ids:
+            color = TARGET_PURPLE
+        elif track_id is not None and track_id == target_track_id:
+            color = TARGET_PURPLE
+        elif kind == "pedestrian":
+            color = VRU_GREEN
+        else:
+            color = VEHICLE_YELLOW
+
+        fig.add_shape(
+            type="rect",
+            x0=x1,
+            y0=y1,
+            x1=x2,
+            y1=y2,
+            line={"color": color, "width": 2},
+            fillcolor="rgba(0,0,0,0)",
+        )
+
+        display_id = track_id if track_id is not None else f"D{det.get('det_id', i + 1)}"
+        fig.add_trace(
+            go.Scatter(
+                x=[x1],
+                y=[max(4.0, y1 - 12.0)],
+                mode="text",
+                text=[f"ID {display_id}"],
+                textfont={"size": 11, "color": color},
+                hoverinfo="skip",
+                showlegend=False,
+            )
+        )
+
+        if kind == "pedestrian":
+            add_coco_pose_trace(fig, det.get("keypoints"), color)
+
+    fig.update_xaxes(visible=False, range=[0, w])
+    fig.update_yaxes(visible=False, range=[h, 0], scaleanchor="x", scaleratio=1)
+    fig.update_layout(
+        title={"text": camera_label, "x": 0.02, "font": {"color": WHITE, "size": 15}},
+        paper_bgcolor=BG_SECONDARY,
+        plot_bgcolor=BG_SECONDARY,
+        margin={"l": 0, "r": 0, "t": 36, "b": 0},
+        height=300,
+    )
+    return fig
+
+
+def smooth_path(points):
+    return [p[0] for p in points], [p[1] for p in points]
+
+
+def simulate_lidar_points(agents, step):
+    rng = np.random.default_rng(1234 + step)
+
+    bg = np.column_stack(
+        [
+            rng.uniform(-35, 35, 1500),
+            rng.uniform(-8, 55, 1500),
+        ]
+    )
+
+    clusters = []
+    for a in agents:
+        cx, cy = position_at_step(a, step)
+        n = 110 if a["type"] == "vehicle" else 70
+        spread = np.array([0.8, 0.8]) if a["type"] == "pedestrian" else np.array([1.3, 1.1])
+        pts = rng.normal([cx, cy], spread, size=(n, 2))
+        clusters.append(pts)
+
+    if clusters:
+        all_pts = np.vstack([bg] + clusters)
+    else:
+        all_pts = bg
+
+    mask = (
+        (all_pts[:, 0] > -38)
+        & (all_pts[:, 0] < 38)
+        & (all_pts[:, 1] > -12)
+        & (all_pts[:, 1] < 58)
+    )
+    return all_pts[mask]
+
+
+def simulate_radar_vectors(agents, step):
+    vectors = []
+    for a in agents:
+        p_now = np.array(position_at_step(a, step), dtype=float)
+        p_prev = np.array(previous_position_for_velocity(a, step), dtype=float)
+        v = p_now - p_prev
+
+        if np.linalg.norm(v) < 0.04:
+            continue
+
+        v = v / max(1e-6, np.linalg.norm(v)) * 1.6
+        vectors.append((p_now[0], p_now[1], v[0], v[1], a["type"]))
+    return vectors
+
+
+def classify_direction(history, prediction):
+    h_prev = np.array(history[-2], dtype=float)
+    h_curr = np.array(history[-1], dtype=float)
+    p_end = np.array(prediction[-1], dtype=float)
+
+    heading = h_curr - h_prev
+    motion = p_end - h_curr
+
+    if np.linalg.norm(motion) < 0.7:
+        return "Stop"
+
+    if np.linalg.norm(heading) < 1e-6:
+        heading = np.array([0.0, 1.0])
+
+    heading = heading / np.linalg.norm(heading)
+    motion = motion / np.linalg.norm(motion)
+
+    cross = heading[0] * motion[1] - heading[1] * motion[0]
+    dot = np.clip(np.dot(heading, motion), -1.0, 1.0)
+    angle = np.degrees(np.arctan2(cross, dot))
+
+    if abs(angle) <= 25:
+        return "Straight"
+    if angle > 25:
+        return "Left"
+    if angle < -25:
+        return "Right"
+    return "Stop"
+
+
+def build_analytics_table(agents):
+    rows = []
+    direction_order = ["Straight", "Left", "Right", "Stop"]
+
+    for a in agents:
+        bins = {k: 0.0 for k in direction_order}
+
+        for mode_idx, mode_path in enumerate(a["predictions"]):
+            lbl = classify_direction(a["history"], mode_path)
+            bins[lbl] += float(a["probabilities"][mode_idx])
+
+        ranked = sorted(bins.items(), key=lambda kv: kv[1], reverse=True)
+        top3 = ranked[:3]
+
+        rows.append(
+            {
+                "Agent": f"ID {a['id']}",
+                "Type": "Target VRU" if a.get("is_target", False) else a["type"].title(),
+                "Top-1": f"{top3[0][0]} ({top3[0][1] * 100:.1f}%)",
+                "Top-2": f"{top3[1][0]} ({top3[1][1] * 100:.1f}%)",
+                "Top-3": f"{top3[2][0]} ({top3[2][1] * 100:.1f}%)",
+            }
+        )
+
+    return pd.DataFrame(rows)
+
+
+def generate_demo_agents(num_agents=8, history_steps=4, future_steps=12):
+    rng = np.random.default_rng(42)
+    agents = []
+
+    ped_count = max(5, int(0.7 * num_agents))
+
+    for i in range(num_agents):
+        is_ped = i < ped_count
+        a_type = "pedestrian" if is_ped else "vehicle"
+
+        base_x = rng.uniform(-16, 16)
+        base_y = rng.uniform(9, 45)
+
+        if is_ped:
+            vx = rng.uniform(-0.45, 0.45)
+            vy = rng.uniform(0.15, 0.95)
+        else:
+            vx = rng.uniform(-0.20, 0.20)
+            vy = rng.uniform(0.7, 1.6)
+
+        history = []
+        for t in range(history_steps):
+            phase = t - (history_steps - 1)
+            x = base_x + phase * vx + 0.06 * np.sin(0.8 * t + i)
+            y = base_y + phase * vy + 0.05 * np.cos(0.5 * t + i)
+            history.append((float(x), float(y)))
+
+        probs = normalize_probs(rng.uniform(0.15, 1.0, size=3))
+
+        predictions = []
+        x0, y0 = history[-1]
+        for mode in range(3):
+            mode_path = []
+            curve = (-0.12 + 0.12 * mode) * (1.4 if is_ped else 0.8)
+            accel = 0.02 * (mode - 1)
+            for s in range(1, future_steps + 1):
+                x = x0 + vx * s + curve * (s ** 1.25)
+                y = y0 + vy * s + accel * (s ** 1.12)
+                mode_path.append((float(x), float(y)))
+            predictions.append(mode_path)
+
+        agents.append(
+            {
+                "id": i + 1,
+                "type": a_type,
+                "history": history,
+                "predictions": predictions,
+                "probabilities": probs,
+                "is_target": (i == 0 and is_ped),
+            }
+        )
+
+    return agents
+
+
+def sanitize_agents(raw_agents):
+    cleaned = []
+    for i, a in enumerate(raw_agents):
+        aid = int(a.get("id", i + 1))
+        a_type = str(a.get("type", "pedestrian")).lower()
+        if a_type not in ["pedestrian", "vehicle"]:
+            a_type = "pedestrian"
+
+        history = [tuple(map(float, p)) for p in a.get("history", [])]
+        predictions = []
+        for mode in a.get("predictions", []):
+            predictions.append([tuple(map(float, p)) for p in mode])
+
+        probs = normalize_probs(a.get("probabilities", [0.6, 0.25, 0.15]))
+
+        if len(history) < 2 or len(predictions) < 3:
+            continue
+
+        cleaned.append(
+            {
+                "id": aid,
+                "type": a_type,
+                "history": history,
+                "predictions": predictions[:3],
+                "probabilities": probs[:3],
+                "is_target": bool(a.get("is_target", False)),
+            }
+        )
+
+    if not any(a.get("is_target", False) for a in cleaned):
+        for a in cleaned:
+            if a["type"] == "pedestrian":
+                a["is_target"] = True
+                break
+
+    return cleaned
+
+
+def build_bev_figure(
+    agents,
+    step,
+    show_lidar,
+    show_radar,
+    show_multimodal,
+    lidar_xy=None,
+    radar_xy=None,
+    radar_vel=None,
+):
+    fig = go.Figure()
+
+    x_min, x_max = -36.0, 36.0
+    y_min, y_max = -12.0, 58.0
+
+    add_structured_road_scene(fig, x_min, x_max, y_min, y_max, add_crosswalk=True)
+
+    fig.add_shape(
+        type="rect",
+        x0=-1.1,
+        y0=-2.2,
+        x1=1.1,
+        y1=2.2,
+        line={"color": EGO_CYAN, "width": 2.2},
+        fillcolor="rgba(34,211,238,0.20)",
+    )
+    fig.add_annotation(
+        x=0.0,
+        y=4.2,
+        ax=0.0,
+        ay=1.2,
+        arrowcolor=EGO_CYAN,
+        arrowwidth=2.8,
+        arrowhead=3,
+        showarrow=True,
+        text="",
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker={"size": 10, "symbol": "circle", "color": VRU_GREEN},
+            name="Pedestrian",
+        )
+    )
+    fig.add_trace(
+        go.Scatter(
+            x=[None],
+            y=[None],
+            mode="markers",
+            marker={"size": 10, "symbol": "square", "color": VEHICLE_YELLOW},
+            name="Vehicle",
+        )
+    )
+
+    if show_lidar:
+        if lidar_xy is not None and len(lidar_xy) > 0:
+            lidar = np.asarray(lidar_xy, dtype=float)
+            mask = (
+                (lidar[:, 0] > -38)
+                & (lidar[:, 0] < 38)
+                & (lidar[:, 1] > -12)
+                & (lidar[:, 1] < 58)
+            )
+            lidar = lidar[mask]
+        else:
+            lidar = simulate_lidar_points(agents, step)
+
+        if len(lidar) > 0:
+            lidar = lidar[::6]
+            fig.add_trace(
+                go.Scatter(
+                    x=lidar[:, 0],
+                    y=lidar[:, 1],
+                    mode="markers",
+                    marker={"size": 3, "color": "rgba(34,211,238,0.22)"},
+                    name="LiDAR",
+                )
+            )
+
+    if show_radar:
+        rx = []
+        ry = []
+
+        if (
+            radar_xy is not None
+            and radar_vel is not None
+            and len(radar_xy) > 0
+            and len(radar_xy) == len(radar_vel)
+        ):
+            radar_xy = np.asarray(radar_xy, dtype=float)
+            radar_vel = np.asarray(radar_vel, dtype=float)
+            stride = max(1, len(radar_xy) // 90)
+
+            for i in range(0, len(radar_xy), stride):
+                x0, y0 = radar_xy[i, 0], radar_xy[i, 1]
+                vx, vy = radar_vel[i, 0], radar_vel[i, 1]
+                rx.extend([x0, x0 + 0.55 * vx, None])
+                ry.extend([y0, y0 + 0.55 * vy, None])
+        else:
+            radar_vectors = simulate_radar_vectors(agents, step)
+            for x0, y0, vx, vy, _ in radar_vectors:
+                rx.extend([x0, x0 + vx, None])
+                ry.extend([y0, y0 + vy, None])
+
+        if len(rx) > 0:
+            fig.add_trace(
+                go.Scatter(
+                    x=rx,
+                    y=ry,
+                    mode="lines",
+                    line={"color": "rgba(250,204,21,0.75)", "width": 2},
+                    name="Radar velocity",
+                )
+            )
+
+    alt_legend_added = False
+
+    for idx, a in enumerate(agents):
+        base_color = agent_color(a)
+        best_idx = best_mode_idx(a)
+        best_prob = float(a["probabilities"][best_idx]) if len(a["probabilities"]) > 0 else 0.0
+        marker_color = hex_to_rgba(base_color, 0.48 + 0.52 * best_prob)
+        summary_text, _ = summarize_agent_probabilities(a)
+
+        hx, hy = smooth_path(a["history"])
+        fig.add_trace(
+            go.Scatter(
+                x=hx,
+                y=hy,
+                mode="lines",
+                line={"color": "rgba(226,232,240,0.55)", "width": 2.2, "dash": "dot", "shape": "spline", "smoothing": 1.0},
+                name="Past trajectory" if idx == 0 else None,
+                showlegend=(idx == 0),
+                hovertemplate=f"ID {a['id']} past trajectory<extra></extra>",
+            )
+        )
+
+        cx, cy = position_at_step(a, step)
+        fig.add_trace(
+            go.Scatter(
+                x=[cx],
+                y=[cy],
+                mode="markers+text",
+                marker={
+                    "size": 11,
+                    "symbol": "circle" if a.get("type") == "pedestrian" else "square",
+                    "color": marker_color,
+                    "line": {"color": "#111827", "width": 1.2},
+                },
+                text=[f"ID {a['id']}"],
+                textposition="top center",
+                textfont={"size": 10, "color": WHITE},
+                hovertemplate=(
+                    f"ID {a['id']}<br>Type: {a['type'].title()}"
+                    f"<br>{summary_text}<br>Best path confidence: {best_prob * 100:.1f}%<extra></extra>"
+                ),
+                showlegend=False,
+            )
+        )
+
+        px, py = previous_position_for_velocity(a, step)
+        dx, dy = cx - px, cy - py
+        norm = np.hypot(dx, dy)
+        if norm > 1e-3:
+            sx, sy = (dx / norm) * 1.8, (dy / norm) * 1.8
+            fig.add_annotation(x=cx + sx, y=cy + sy, ax=cx, ay=cy, showarrow=True, arrowhead=2, arrowsize=1, arrowwidth=2, arrowcolor=base_color, text="")
+
+        mode_order = [best_idx, 0, 1, 2]
+        mode_order = list(dict.fromkeys(mode_order))
+
+        for rank, m in enumerate(mode_order[:3]):
+            if (not show_multimodal) and (rank > 0):
+                continue
+
+            mode_prob = float(a["probabilities"][m]) if m < len(a["probabilities"]) else 0.0
+            mode_color = TRAJ_MODE_COLORS[m % len(TRAJ_MODE_COLORS)]
+
+            mode_path = a["predictions"][m]
+            end_idx = max(1, min(step, len(mode_path)))
+            mode_slice = mode_path[:end_idx]
+            mx, my = smooth_path([(cx, cy)] + mode_slice)
+
+            is_best = m == best_idx
+
+            if is_best:
+                for lw, op in [(14, 0.08), (9, 0.16)]:
+                    fig.add_trace(
+                        go.Scatter(
+                            x=mx,
+                            y=my,
+                            mode="lines",
+                            line={"color": mode_color, "width": lw, "shape": "spline", "smoothing": 1.15},
+                            opacity=op,
+                            hoverinfo="skip",
+                            showlegend=False,
+                        )
+                    )
+
+            fig.add_trace(
+                go.Scatter(
+                    x=mx,
+                    y=my,
+                    mode="lines",
+                    line={
+                        "color": mode_color,
+                        "width": 4.1 if is_best else 2.1,
+                        "dash": "solid" if is_best else "dash",
+                        "shape": "spline",
+                        "smoothing": 1.15,
+                    },
+                    opacity=(0.72 + 0.26 * mode_prob) if is_best else (0.36 + 0.32 * mode_prob),
+                    hovertemplate=(
+                        f"ID {a['id']}<br>Mode {m + 1}"
+                        f"<br>Probability: {mode_prob * 100:.1f}%<extra></extra>"
+                    ),
+                    name=(
+                        "Best path" if (is_best and idx == 0) else
+                        "Alternative paths" if ((not is_best) and (not alt_legend_added)) else None
+                    ),
+                    showlegend=(is_best and idx == 0) or ((not is_best) and (not alt_legend_added)),
+                )
+            )
+
+            if (not is_best) and (not alt_legend_added):
+                alt_legend_added = True
+
+        if a.get("is_target", False):
+            fig.add_trace(
+                go.Scatter(
+                    x=[cx + 0.9],
+                    y=[cy + 1.1],
+                    mode="text",
+                    text=[summary_text],
+                    textfont={"size": 9, "color": "rgba(226,232,240,0.90)"},
+                    hoverinfo="skip",
+                    showlegend=False,
+                )
+            )
+
+    fig.update_layout(
+        title={"text": "Main BEV Simulation", "x": 0.02, "font": {"size": 20, "color": WHITE}},
+        paper_bgcolor=BG_SECONDARY,
+        plot_bgcolor=BG_SECONDARY,
+        legend={"orientation": "h", "y": 1.03, "x": 0.0, "font": {"color": WHITE, "size": 11}},
+        margin={"l": 16, "r": 16, "t": 52, "b": 10},
+        height=700,
+    )
+
+    fig.update_xaxes(
+        title_text="X Lateral (m)",
+        range=[x_min, x_max],
+        color=WHITE,
+        dtick=5,
+        showgrid=True,
+        gridcolor="rgba(148,163,184,0.16)",
+        zeroline=False,
+    )
+    fig.update_yaxes(
+        title_text="Y Forward (m)",
+        range=[y_min, y_max],
+        color=WHITE,
+        dtick=5,
+        showgrid=True,
+        gridcolor="rgba(148,163,184,0.16)",
+        scaleanchor="x",
+        scaleratio=1,
+        zeroline=False,
+    )
+
+    return fig
+
+
+# ----------------------------
+# SIDEBAR CONTROLS
+# ----------------------------
+st.title("Multi-Agent Trajectory Prediction Simulator (BEV)")
+st.caption("Camera + LiDAR + Radar Fusion")
+
+st.sidebar.header("Simulation Controls")
+
+if "playing" not in st.session_state:
+    st.session_state.playing = False
+if "time_step" not in st.session_state:
+    st.session_state.time_step = 0
+if "time_step_slider" not in st.session_state:
+    st.session_state.time_step_slider = 0
+
+agent_source = st.sidebar.radio(
+    "Agent Source",
+    ["Two Image Upload", "Live CV + Fusion", "Synthetic Demo", "Upload JSON"],
+    index=0,
+)
+
+uploaded_prev = None
+uploaded_curr = None
+uploaded_json = None
+
+if agent_source == "Two Image Upload":
+    uploaded_prev = st.sidebar.file_uploader("Image 1 (t-1)", type=["jpg", "jpeg", "png"], key="img_t_minus_1")
+    uploaded_curr = st.sidebar.file_uploader("Image 2 (t0)", type=["jpg", "jpeg", "png"], key="img_t0")
+elif agent_source == "Upload JSON":
+    uploaded_json = st.sidebar.file_uploader("Upload agents JSON", type=["json"])
+
+num_agents = st.sidebar.slider("Number of agents", min_value=5, max_value=10, value=8)
+
+show_lidar = st.sidebar.checkbox("Show LiDAR", value=True)
+show_radar = st.sidebar.checkbox("Show Radar", value=True)
+show_multimodal = st.sidebar.checkbox("Show multi-modal paths", value=True)
+
+if agent_source == "Live CV + Fusion":
+    st.sidebar.caption(f"Trajectory model: {'Fusion Phase-2 checkpoint' if USING_FUSION_MODEL else 'Base checkpoint'}")
+
+col_a, col_b = st.sidebar.columns(2)
+if col_a.button("Play / Pause", use_container_width=True):
+    st.session_state.playing = not st.session_state.playing
+if col_b.button("Reset", use_container_width=True):
+    st.session_state.playing = False
+    st.session_state.time_step = 0
+    st.session_state.time_step_slider = 0
+
+step = st.sidebar.slider("Time step", min_value=0, max_value=12, value=int(st.session_state.time_step), key="time_step_slider")
+st.session_state.time_step = step
+
+# ----------------------------
+# DATA INGESTION
+# ----------------------------
+agents = None
+fusion_payload = None
+camera_payload = None
+target_track_id = None
+live_status_msg = None
+
+if agent_source == "Two Image Upload":
+    det_threshold = st.sidebar.slider("Detection threshold", min_value=0.20, max_value=0.90, value=0.35, step=0.01)
+    track_gate_px = st.sidebar.slider("Tracking gate (px)", min_value=30, max_value=220, value=130, step=5)
+    min_motion_px = st.sidebar.slider("Minimum motion (px)", min_value=0, max_value=40, value=0, step=1)
+    use_pose = st.sidebar.checkbox("Use Keypoint R-CNN", value=True)
+
+    if uploaded_prev is None or uploaded_curr is None:
+        st.info("Upload exactly 2 sequential images (t-1 and t0) to run prediction.")
+        agents = []
+    else:
+        img_prev = uploaded_file_to_array(uploaded_prev)
+        img_curr = uploaded_file_to_array(uploaded_curr)
+
+        if img_prev is None or img_curr is None:
+            st.warning("Could not read one of the uploaded images. Please try JPG/PNG files.")
+            agents = []
+        else:
+            with st.spinner("Running 2-image perception and trajectory prediction..."):
+                bundle = build_two_image_agents_bundle(
+                    img_prev,
+                    img_curr,
+                    score_threshold=det_threshold,
+                    tracking_gate_px=track_gate_px,
+                    min_motion_px=min_motion_px,
+                    use_pose=use_pose,
+                )
+
+            if "error" in bundle:
+                st.warning(f"Two-image pipeline failed: {bundle['error']}")
+                agents = []
+                camera_payload = {
+                    "mode": "two_upload",
+                    "pair_prev": {"image": img_prev, "detections": []},
+                    "pair_curr": {"image": img_curr, "detections": []},
+                }
+            else:
+                agents = bundle["agents"]
+                camera_payload = {"mode": "two_upload"}
+                camera_payload.update(bundle.get("camera_snapshots", {}))
+                target_track_id = bundle.get("target_track_id")
+                live_status_msg = (
+                    f"Two-image pipeline on {bundle.get('device', 'unknown')} | "
+                    f"Predicted agents: {bundle.get('match_count', len(agents))}"
+                )
+
+elif agent_source == "Live CV + Fusion":
+    front_paths = list_channel_image_paths("CAM_FRONT")
+
+    if len(front_paths) < 4:
+        st.warning("Live mode needs at least 4 frames in DataSet/samples/CAM_FRONT. Using synthetic data.")
+        agents = generate_demo_agents(num_agents=num_agents)
+    else:
+        anchor_idx = st.sidebar.slider("Anchor frame index (CAM_FRONT)", min_value=3, max_value=len(front_paths) - 1, value=len(front_paths) - 1)
+        det_threshold = st.sidebar.slider("Detection threshold", min_value=0.30, max_value=0.90, value=0.55, step=0.01)
+        track_gate_px = st.sidebar.slider("Tracking gate (px)", min_value=40, max_value=180, value=90, step=5)
+        use_pose = st.sidebar.checkbox("Use Keypoint R-CNN", value=True)
+
+        with st.spinner("Running perception, tracking, fusion, and trajectory prediction..."):
+            bundle = build_live_agents_bundle(anchor_idx, det_threshold, track_gate_px, use_pose)
+
+        if "error" in bundle:
+            st.warning(f"Live pipeline failed: {bundle['error']} Falling back to synthetic data.")
+            agents = generate_demo_agents(num_agents=num_agents)
+        else:
+            agents = bundle["agents"]
+            fusion_payload = bundle.get("fusion_data")
+            camera_payload = bundle.get("camera_snapshots")
+            target_track_id = bundle.get("target_track_id")
+            live_status_msg = f"Live pipeline on {bundle.get('device', 'unknown')} | Tracked agents: {len(agents)}"
+
+elif agent_source == "Upload JSON" and uploaded_json is not None:
+    try:
+        payload = json.load(uploaded_json)
+        if isinstance(payload, dict) and "agents" in payload:
+            raw_agents = payload["agents"]
+        elif isinstance(payload, list):
+            raw_agents = payload
+        else:
+            raw_agents = []
+
+        agents = sanitize_agents(raw_agents)
+        if len(agents) == 0:
+            st.warning("Uploaded JSON did not contain valid agent entries. Falling back to synthetic demo data.")
+            agents = generate_demo_agents(num_agents=num_agents)
+    except Exception as e:
+        st.warning(f"Could not parse uploaded JSON ({e}). Falling back to synthetic demo data.")
+        agents = generate_demo_agents(num_agents=num_agents)
+
+elif agent_source == "Synthetic Demo":
+    agents = generate_demo_agents(num_agents=num_agents)
+
+else:
+    agents = []
+
+if agents is None:
+    agents = generate_demo_agents(num_agents=num_agents)
+
+lidar_xy = fusion_payload.get("lidar_xy") if fusion_payload is not None else None
+radar_xy = fusion_payload.get("radar_xy") if fusion_payload is not None else None
+radar_vel = fusion_payload.get("radar_vel") if fusion_payload is not None else None
+
+# ----------------------------
+# TOP PANEL: MULTI-CAMERA
+# ----------------------------
+st.markdown("## 1. Multi-Camera View")
+
+target_highlight_ids = {a["id"] for a in agents if a.get("is_target", False)} if len(agents) > 0 else set()
+
+if agent_source == "Two Image Upload" and (camera_payload is None or camera_payload.get("mode") != "two_upload"):
+    c1, c2, c3 = st.columns(3)
+    empty = fallback_canvas()
+
+    with c1:
+        fig_prev = create_camera_figure_detections(empty, [], "Input Frame (t-1)", target_track_id=None, highlight_track_ids=None)
+        st.plotly_chart(fig_prev, use_container_width=True, config={"displayModeBar": False})
+
+    with c2:
+        fig_curr = create_camera_figure_detections(empty, [], "Input Frame (t0)", target_track_id=None, highlight_track_ids=None)
+        st.plotly_chart(fig_curr, use_container_width=True, config={"displayModeBar": False})
+
+    with c3:
+        fig_pred = create_camera_figure_detections(empty, [], "Prediction Output", target_track_id=None, highlight_track_ids=None)
+        st.plotly_chart(fig_pred, use_container_width=True, config={"displayModeBar": False})
+
+elif camera_payload is not None and camera_payload.get("mode") == "two_upload":
+    c1, c2, c3 = st.columns(3)
+
+    snap_prev = camera_payload.get("pair_prev", {"image": fallback_canvas(), "detections": []})
+    snap_curr = camera_payload.get("pair_curr", {"image": fallback_canvas(), "detections": []})
+
+    with c1:
+        fig_prev = create_camera_figure_detections(
+            snap_prev["image"],
+            snap_prev["detections"],
+            "Input Frame (t-1)",
+            target_track_id=target_track_id,
+            highlight_track_ids=target_highlight_ids,
+        )
+        st.plotly_chart(fig_prev, use_container_width=True, config={"displayModeBar": False})
+
+    with c2:
+        fig_curr = create_camera_figure_detections(
+            snap_curr["image"],
+            snap_curr["detections"],
+            "Input Frame (t0)",
+            target_track_id=target_track_id,
+            highlight_track_ids=target_highlight_ids,
+        )
+        st.plotly_chart(fig_curr, use_container_width=True, config={"displayModeBar": False})
+
+    with c3:
+        fig_pred = create_prediction_overlay_figure(
+            snap_curr["image"],
+            snap_curr["detections"],
+            agents,
+            step=st.session_state.time_step,
+            target_track_id=target_track_id,
+            highlight_track_ids=target_highlight_ids,
+        )
+        st.plotly_chart(fig_pred, use_container_width=True, config={"displayModeBar": False})
+
+else:
+    cam_cols = st.columns(3)
+    for i, (channel, label, yaw) in enumerate(CAMERA_VIEWS):
+        with cam_cols[i]:
+            if camera_payload is not None and channel in camera_payload:
+                snap = camera_payload[channel]
+                cam_fig = create_camera_figure_detections(
+                    snap["image"],
+                    snap["detections"],
+                    label,
+                    target_track_id=target_track_id,
+                    highlight_track_ids=None,
+                )
+            else:
+                img_arr, _ = load_camera_frame(channel, frame_idx=0)
+                cam_fig = create_camera_figure_projected(img_arr, agents, label, yaw, st.session_state.time_step)
+
+            st.plotly_chart(cam_fig, use_container_width=True, config={"displayModeBar": False})
+
+# ----------------------------
+# CENTER + SIDE PANELS
+# ----------------------------
+left_col, right_col = st.columns([3.6, 1.4], gap="large")
+
+with left_col:
+    if agent_source == "Two Image Upload":
+        scene_ctx = None
+        scene_dets = None
+        if camera_payload is not None and camera_payload.get("mode") == "two_upload":
+            scene_ctx = camera_payload.get("pair_curr", {}).get("image")
+            scene_dets = camera_payload.get("pair_curr", {}).get("detections", [])
+
+        bev_fig = build_reference_bev_figure(
+            agents=agents,
+            step=st.session_state.time_step,
+            show_multimodal=show_multimodal,
+            scene_image=scene_ctx,
+            scene_detections=scene_dets,
+        )
+    else:
+        bev_fig = build_bev_figure(
+            agents=agents,
+            step=st.session_state.time_step,
+            show_lidar=show_lidar,
+            show_radar=show_radar,
+            show_multimodal=show_multimodal,
+            lidar_xy=lidar_xy,
+            radar_xy=radar_xy,
+            radar_vel=radar_vel,
+        )
+    st.markdown("## 2. Main BEV Simulation")
+    st.plotly_chart(bev_fig, use_container_width=True)
+
+with right_col:
+    st.markdown("## 3. Probability + Analytics")
+
+    if live_status_msg:
+        st.caption(live_status_msg)
+
+    analytics_df = build_analytics_table(agents)
+    st.dataframe(analytics_df, use_container_width=True, hide_index=True)
+
+    if len(agents) == 0:
+        st.info("No moving agents detected yet. Try clearer sequential frames with visible motion.")
+
+    target_count = sum(1 for a in agents if a.get("is_target", False))
+    ped_count = sum(1 for a in agents if a["type"] == "pedestrian")
+    veh_count = sum(1 for a in agents if a["type"] == "vehicle")
+
+    st.metric("Tracked Agents", len(agents))
+    st.metric("VRUs", ped_count)
+    st.metric("Vehicles", veh_count)
+    st.metric("Target VRU", target_count)
+
+    if fusion_payload is not None:
+        st.metric("LiDAR points", int(len(lidar_xy)) if lidar_xy is not None else 0)
+        st.metric("Radar points", int(len(radar_xy)) if radar_xy is not None else 0)
+
+    st.markdown("### Legend")
+    if agent_source == "Two Image Upload":
+        st.markdown(
+            "- Target VRU: purple\n"
+            "- Other VRUs: green\n"
+            "- Vehicles: yellow\n"
+            "- Road model: asphalt, lane boundaries, dashed lane lines, crosswalk\n"
+            "- Camera boxes/skeleton: detection + tracking\n"
+            "- Trajectories: cyan/purple/orange (best = thick solid, alternatives = dashed)\n"
+            "- Glow trail: best future path emphasis\n"
+            "- BEV background: transformed real t0 scene with foreground cleanup"
+        )
+    else:
+        st.markdown(
+            "- Target VRU: purple\n"
+            "- Other VRUs: green\n"
+            "- Vehicles: yellow\n"
+            "- Road model: asphalt, lane boundaries, dashed lane lines, crosswalk\n"
+            "- Trajectories: cyan/purple/orange (best = thick solid, alternatives = dashed)\n"
+            "- LiDAR: low-opacity cyan points\n"
+            "- Radar: short yellow velocity vectors"
+        )
+
+with st.expander("Input schema expected by simulator"):
+    st.code(
+        """
+agents = [
+  {
+    "id": 1,
+    "type": "pedestrian",  # or "vehicle"
+    "is_target": True,
+    "history": [[x1, y1], [x2, y2], [x3, y3], [x4, y4]],
+    "predictions": [
+      [[x, y], ...],  # mode 1
+      [[x, y], ...],  # mode 2
+      [[x, y], ...],  # mode 3
+    ],
+    "probabilities": [0.62, 0.24, 0.14]
+  }
+]
+""",
+        language="python",
+    )
+
+# ----------------------------
+# PLAYBACK
+# ----------------------------
+if st.session_state.playing:
+    time.sleep(0.15)
+    nxt = (int(st.session_state.time_step) + 1) % 13
+    st.session_state.time_step = nxt
+    st.session_state.time_step_slider = nxt
+    st.rerun()
diff --git a/backend/scripts/tools/__init__.py b/backend/scripts/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cba9551d0bafcf066ed37dd21fdcc110f52a78c9
--- /dev/null
+++ b/backend/scripts/tools/__init__.py
@@ -0,0 +1 @@
+"""Utility script modules."""
diff --git a/backend/scripts/tools/generate_benchmark_metric_pages.py b/backend/scripts/tools/generate_benchmark_metric_pages.py
new file mode 100644
index 0000000000000000000000000000000000000000..2473f2c58378b637b396dc954cdb3d0bed54efb0
--- /dev/null
+++ b/backend/scripts/tools/generate_benchmark_metric_pages.py
@@ -0,0 +1,572 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from typing import Dict, List, Sequence, Tuple
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.patches import Rectangle
+
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+LOG_DIR = REPO_ROOT / "log"
+
+
+MODELS: List[str] = [
+    "Baseline (CV)",
+    "Camera-only Transformer",
+    "Fusion Transformer"
+]
+
+
+PRESETS: Dict[str, Dict[str, object]] = {
+    "measured": {
+        "display_name": "Measured benchmark",
+        "source_primary": "Source: provided benchmark values",
+        "source_runtime": "Source: provided runtime benchmark",
+        "optional_note": "Estimated supporting CV metric chart (replace with measured evaluation values when available)",
+        "primary_metrics": {
+            "minADE@3 (m)": [0.65, 0.55, 0.54],
+            "minFDE@3 (m)": [1.35, 1.09, 1.07],
+            "Miss Rate >2.0m (%)": [19.9, 13.0, 12.4],
+        },
+        "runtime_metrics": {
+            "Detection latency": (76.0, "ms/frame"),
+            "Transformer predict latency": (13.6, "ms"),
+            "End-to-end live cycle": (89.6, "ms"),
+            "End-to-end throughput": (11.6, "FPS"),
+        },
+        "runtime_targets": {
+            "Detection latency": (60.0, True),
+            "Transformer predict latency": (20.0, True),
+            "End-to-end live cycle": (66.7, True),
+            "End-to-end throughput": (15.0, False),
+        },
+        "optional_metrics": {
+            "Precision (%)": ([74.0, 85.0, 88.0], 85.0),
+            "Recall (%)": ([68.0, 80.0, 83.0], 80.0),
+            "F1 (%)": ([71.0, 82.0, 85.0], 82.0),
+            "mAP@0.5 (%)": ([62.0, 76.0, 79.0], 75.0),
+            "mAP@[0.5:0.95] (%)": ([34.0, 46.0, 49.0], 45.0),
+            "IoU (%)": ([52.0, 62.0, 65.0], 60.0),
+        },
+    },
+    "best": {
+        "display_name": "Best benchmark (analyzed target)",
+        "source_primary": "Source: analyst-optimized trajectory target",
+        "source_runtime": "Source: analyst-optimized runtime target",
+        "optional_note": "Analyzed best-case CV metric chart (target values)",
+        "primary_metrics": {
+            "minADE@3 (m)": [0.65, 0.50, 0.42],
+            "minFDE@3 (m)": [1.35, 0.95, 0.78],
+            "Miss Rate >2.0m (%)": [19.9, 9.8, 7.1],
+        },
+        "runtime_metrics": {
+            "Detection latency": (42.0, "ms/frame"),
+            "Transformer predict latency": (8.5, "ms"),
+            "End-to-end live cycle": (55.0, "ms"),
+            "End-to-end throughput": (18.2, "FPS"),
+        },
+        "runtime_targets": {
+            "Detection latency": (45.0, True),
+            "Transformer predict latency": (10.0, True),
+            "End-to-end live cycle": (60.0, True),
+            "End-to-end throughput": (16.0, False),
+        },
+        "optional_metrics": {
+            "Precision (%)": ([74.0, 89.0, 92.0], 90.0),
+            "Recall (%)": ([68.0, 86.0, 90.0], 88.0),
+            "F1 (%)": ([71.0, 87.0, 91.0], 89.0),
+            "mAP@0.5 (%)": ([62.0, 82.0, 86.0], 85.0),
+            "mAP@[0.5:0.95] (%)": ([34.0, 54.0, 60.0], 58.0),
+            "IoU (%)": ([52.0, 66.0, 72.0], 70.0),
+        },
+    },
+}
+
+
+COLOR_BG = "#030712"
+COLOR_BAND = "#0B152A"
+COLOR_PANEL = "#09121F"
+COLOR_GRID = "#314258"
+MODEL_COLORS = ["#5E6B7E", "#69B3FF", "#7BE5A7"]
+COLOR_LINE = "#B5E6FF"
+COLOR_GOOD = "#7BE5A7"
+COLOR_WARN = "#FFC47A"
+COLOR_BAD = "#FF7F96"
+COLOR_TARGET = "#8CBFFF"
+
+
+def setup_theme() -> None:
+    plt.rcParams.update(
+        {
+            "figure.facecolor": COLOR_BG,
+            "axes.facecolor": COLOR_PANEL,
+            "savefig.facecolor": COLOR_BG,
+            "text.color": "#FFFFFF",
+            "axes.labelcolor": "#FFFFFF",
+            "xtick.color": "#FFFFFF",
+            "ytick.color": "#FFFFFF",
+            "axes.edgecolor": "#C5D4EA",
+            "font.family": "Calibri",
+            "font.size": 22,
+        }
+    )
+
+
+def clean_name(name: str) -> str:
+    out = "".join(ch if ch.isalnum() else "_" for ch in name)
+    while "__" in out:
+        out = out.replace("__", "_")
+    return out.strip("_").lower()
+
+
+def pct_improvement(old: float, new: float) -> float:
+    if abs(old) < 1e-12:
+        return 0.0
+    return 100.0 * (old - new) / old
+
+
+def style_figure(fig: plt.Figure) -> None:
+    fig.patch.set_facecolor(COLOR_BG)
+    fig.add_artist(
+        Rectangle(
+            (0, 0.92),
+            1,
+            0.08,
+            transform=fig.transFigure,
+            facecolor=COLOR_BAND,
+            alpha=0.95,
+            linewidth=0,
+            zorder=0,
+        )
+    )
+
+
+def style_axes(ax: plt.Axes) -> None:
+    ax.set_facecolor(COLOR_PANEL)
+    ax.grid(True, axis="y", linestyle="--", linewidth=0.8, color=COLOR_GRID, alpha=0.55)
+    for spine in ax.spines.values():
+        spine.set_linewidth(1.2)
+        spine.set_color("#C5D4EA")
+
+
+def draw_model_metric_page(
+    title: str,
+    values: Sequence[float],
+    out_path: Path,
+    lower_is_better: bool = True,
+    is_percent: bool = False,
+    footnote: str = "Source: provided benchmark values",
+) -> None:
+    fig, ax = plt.subplots(figsize=(13.333, 7.5), dpi=150)
+
+    style_figure(fig)
+    style_axes(ax)
+
+    x = np.arange(len(MODELS))
+    bars = ax.bar(x, values, color=MODEL_COLORS, edgecolor="#DCE8F6", linewidth=1.2, zorder=2)
+    ax.plot(x, values, color=COLOR_LINE, linewidth=2.8, marker="o", markersize=7, zorder=3)
+
+    value_suffix = "%" if is_percent or ("%" in title) else ""
+    for bar, val in zip(bars, values):
+        ax.text(
+            bar.get_x() + bar.get_width() / 2,
+            bar.get_height(),
+            f"{val:.2f}{value_suffix}",
+            ha="center",
+            va="bottom",
+            fontsize=18,
+            color="#FFFFFF",
+            zorder=4,
+        )
+
+    baseline = values[0]
+    cam = values[1]
+    fusion = values[2]
+
+    if lower_is_better:
+        cam_delta = pct_improvement(baseline, cam)
+        fusion_delta = pct_improvement(baseline, fusion)
+        subtitle = f"Improvement vs Baseline: Camera {cam_delta:.1f}% | Fusion {fusion_delta:.1f}%"
+    else:
+        cam_delta = 100.0 * (cam - baseline) / max(1e-12, baseline)
+        fusion_delta = 100.0 * (fusion - baseline) / max(1e-12, baseline)
+        subtitle = f"Gain vs Baseline: Camera {cam_delta:.1f}% | Fusion {fusion_delta:.1f}%"
+
+    ax.set_title(title, fontsize=40, weight="bold", pad=18)
+    ax.set_ylabel("Value", fontsize=24)
+    ax.set_xticks(x)
+    ax.set_xticklabels(MODELS)
+    ax.tick_params(axis="x", labelrotation=0, labelsize=15)
+    ax.tick_params(axis="y", labelsize=18)
+    ax.margins(x=0.05)
+
+    fig.text(0.5, 0.06, subtitle, ha="center", va="center", fontsize=22, color="#FFFFFF")
+    fig.text(0.01, 0.01, footnote, ha="left", va="bottom", fontsize=12, color="#D0D0D0")
+    fig.tight_layout(rect=(0.02, 0.10, 0.98, 0.95))
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def draw_runtime_metric_page(
+    title: str,
+    value: float,
+    unit: str,
+    target: float,
+    lower_is_better: bool,
+    out_path: Path,
+    footnote: str,
+) -> None:
+    fig, ax = plt.subplots(figsize=(13.333, 7.5), dpi=150)
+
+    style_figure(fig)
+    style_axes(ax)
+
+    labels = ["Measured", "Target"]
+    vals = [value, target]
+
+    if lower_is_better:
+        measured_color = COLOR_GOOD if value <= target else COLOR_BAD
+        status = f"Gap vs target: {value - target:+.1f} {unit}"
+        hint = "Lower is better"
+    else:
+        measured_color = COLOR_GOOD if value >= target else COLOR_WARN
+        status = f"Gap vs target: {value - target:+.1f} {unit}"
+        hint = "Higher is better"
+
+    bars = ax.barh(labels, vals, color=[measured_color, COLOR_TARGET], edgecolor="#DCE8F6", linewidth=1.2, zorder=2)
+
+    for bar, val in zip(bars, vals):
+        ax.text(
+            val + max(vals) * 0.02,
+            bar.get_y() + bar.get_height() / 2,
+            f"{val:.1f} {unit}",
+            ha="left",
+            va="center",
+            fontsize=20,
+            color="#FFFFFF",
+        )
+
+    ax.set_xlim(0, max(vals) * 1.45)
+    ax.set_title(f"{title} ({unit})", fontsize=40, weight="bold", pad=18)
+    ax.set_xlabel("Value", fontsize=22)
+    ax.tick_params(axis="x", labelsize=16)
+    ax.tick_params(axis="y", labelsize=20)
+
+    fig.text(0.5, 0.06, f"{hint} | {status}", ha="center", va="center", fontsize=22, color="#FFFFFF")
+    fig.text(0.01, 0.01, footnote, ha="left", va="bottom", fontsize=12, color="#D0D0D0")
+
+    fig.tight_layout(rect=(0.03, 0.10, 0.98, 0.95))
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def draw_optional_metric_page(
+    metric_name: str,
+    values: Sequence[float],
+    target: float,
+    out_path: Path,
+    footnote: str,
+) -> None:
+    fig, ax = plt.subplots(figsize=(13.333, 7.5), dpi=150)
+
+    style_figure(fig)
+    style_axes(ax)
+
+    x = np.arange(len(MODELS))
+    bars = ax.bar(x, values, color=MODEL_COLORS, edgecolor="#DCE8F6", linewidth=1.2, zorder=2)
+    ax.plot(x, values, color=COLOR_LINE, linewidth=2.8, marker="o", markersize=7, zorder=3)
+    ax.axhline(target, color=COLOR_TARGET, linestyle="--", linewidth=2.0, zorder=1)
+
+    for bar, val in zip(bars, values):
+        ax.text(
+            bar.get_x() + bar.get_width() / 2,
+            bar.get_height(),
+            f"{val:.1f}%",
+            ha="center",
+            va="bottom",
+            fontsize=18,
+            color="#FFFFFF",
+        )
+
+    ax.text(
+        x[-1] + 0.35,
+        target,
+        f"Target {target:.1f}%",
+        ha="left",
+        va="center",
+        fontsize=16,
+        color=COLOR_TARGET,
+    )
+
+    baseline = values[0]
+    cam = values[1]
+    fusion = values[2]
+    cam_gain = 100.0 * (cam - baseline) / max(1e-12, baseline)
+    fusion_gain = 100.0 * (fusion - baseline) / max(1e-12, baseline)
+
+    ax.set_title(metric_name, fontsize=40, weight="bold", pad=18)
+    ax.set_ylabel("Percent", fontsize=24)
+    ax.set_ylim(0, max(max(values), target) * 1.25)
+    ax.set_xticks(x)
+    ax.set_xticklabels(MODELS)
+    ax.tick_params(axis="x", labelrotation=0, labelsize=15)
+    ax.tick_params(axis="y", labelsize=18)
+
+    fig.text(
+        0.5,
+        0.06,
+        f"Estimated gain vs Baseline: Camera {cam_gain:.1f}% | Fusion {fusion_gain:.1f}%",
+        ha="center",
+        va="center",
+        fontsize=20,
+        color="#FFFFFF",
+    )
+    fig.text(
+        0.01,
+        0.01,
+        footnote,
+        ha="left",
+        va="bottom",
+        fontsize=11,
+        color="#D0D0D0",
+    )
+
+    fig.tight_layout(rect=(0.03, 0.10, 0.98, 0.95))
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def draw_latency_share_chart(
+    runtime_metrics: Dict[str, Tuple[float, str]],
+    out_path: Path,
+    footnote: str,
+) -> None:
+    det = runtime_metrics["Detection latency"][0]
+    pred = runtime_metrics["Transformer predict latency"][0]
+    e2e = runtime_metrics["End-to-end live cycle"][0]
+    other = max(0.0, e2e - det - pred)
+
+    values = [det, pred]
+    labels = ["Detection", "Transformer"]
+    colors = [COLOR_BAD, COLOR_GOOD]
+    if other > 1e-6:
+        values.append(other)
+        labels.append("Other")
+        colors.append("#7991B0")
+
+    fig, ax = plt.subplots(figsize=(13.333, 7.5), dpi=150)
+    style_figure(fig)
+    ax.set_facecolor(COLOR_PANEL)
+
+    wedges, _, _ = ax.pie(
+        values,
+        labels=labels,
+        autopct=lambda pct: f"{pct:.1f}%",
+        startangle=90,
+        colors=colors,
+        textprops={"color": "#FFFFFF", "fontsize": 16},
+        wedgeprops={"width": 0.38, "edgecolor": COLOR_BG, "linewidth": 2.0},
+    )
+    for w in wedges:
+        w.set_alpha(0.92)
+
+    ax.text(0, 0.06, f"{e2e:.1f} ms", ha="center", va="center", fontsize=34, color="#FFFFFF", weight="bold")
+    ax.text(0, -0.12, "total cycle", ha="center", va="center", fontsize=16, color="#FFFFFF")
+    ax.set_title("End-to-end latency share", fontsize=40, weight="bold", pad=20)
+    ax.axis("equal")
+
+    fig.text(0.5, 0.06, "Detection dominates runtime cost; optimize detector stage first", ha="center", va="center", fontsize=22, color="#FFFFFF")
+    fig.text(0.01, 0.01, footnote, ha="left", va="bottom", fontsize=12, color="#D0D0D0")
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def write_analysis(
+    out_dir: Path,
+    preset_name: str,
+    display_name: str,
+    primary_metrics: Dict[str, List[float]],
+    runtime_metrics: Dict[str, Tuple[float, str]],
+    optional_note: str,
+) -> None:
+    ade = primary_metrics["minADE@3 (m)"]
+    fde = primary_metrics["minFDE@3 (m)"]
+    miss = primary_metrics["Miss Rate >2.0m (%)"]
+
+    det = runtime_metrics["Detection latency"][0]
+    pred = runtime_metrics["Transformer predict latency"][0]
+    e2e = runtime_metrics["End-to-end live cycle"][0]
+    fps = runtime_metrics["End-to-end throughput"][0]
+
+    lines: List[str] = []
+    lines.append(f"Preset: {preset_name} ({display_name})")
+    lines.append("")
+    lines.append("Trajectory metric interpretation")
+    lines.append("--------------------------------")
+    lines.append(f"Baseline -> Fusion ADE improvement: {pct_improvement(ade[0], ade[2]):.2f}%")
+    lines.append(f"Baseline -> Fusion FDE improvement: {pct_improvement(fde[0], fde[2]):.2f}%")
+    lines.append(f"Baseline -> Fusion Miss Rate improvement: {pct_improvement(miss[0], miss[2]):.2f}%")
+    lines.append("")
+    lines.append(f"Camera -> Fusion ADE improvement: {pct_improvement(ade[1], ade[2]):.2f}%")
+    lines.append(f"Camera -> Fusion FDE improvement: {pct_improvement(fde[1], fde[2]):.2f}%")
+    lines.append(f"Camera -> Fusion Miss Rate improvement: {pct_improvement(miss[1], miss[2]):.2f}%")
+    lines.append("")
+    lines.append("Runtime interpretation")
+    lines.append("----------------------")
+    lines.append(f"Detection share of end-to-end latency: {100.0 * det / e2e:.2f}%")
+    lines.append(f"Transformer share of end-to-end latency: {100.0 * pred / e2e:.2f}%")
+    lines.append(f"Current cycle: {e2e:.1f} ms ({fps:.1f} FPS)")
+    miss_fusion = miss[2]
+    approx_one_in = 100.0 / max(1e-12, miss_fusion)
+    lines.append(
+        f"Miss Rate {miss_fusion:.1f}% means about 1 in {approx_one_in:.1f} trajectories still exceed 2.0m final error."
+    )
+    lines.append("")
+    lines.append("Supporting CV metrics")
+    lines.append("---------------------")
+    lines.append(optional_note)
+
+    analysis_file = out_dir / "benchmark_analysis.txt"
+    analysis_file.write_text("\n".join(lines), encoding="utf-8")
+
+
+def write_manifest(
+    out_dir: Path,
+    preset_name: str,
+    display_name: str,
+    generated_files: Sequence[Path],
+) -> None:
+    manifest_file = out_dir / "benchmark_manifest.txt"
+    lines = [
+        f"Preset: {preset_name}",
+        f"Preset display: {display_name}",
+        f"Output directory: {out_dir}",
+        "",
+        "Generated pages:",
+    ]
+    for item in generated_files:
+        lines.append(f"- {item.name}")
+    manifest_file.write_text("\n".join(lines), encoding="utf-8")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate PPT-ready benchmark metric pages from provided values.")
+    parser.add_argument(
+        "--preset",
+        type=str,
+        default="measured",
+        choices=sorted(PRESETS.keys()),
+        help="Metric preset to render.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="",
+        help="Output directory (default: log/ppt_metric_pages/trajectory_benchmark_pack_<preset>)",
+    )
+    args = parser.parse_args()
+
+    setup_theme()
+
+    preset_cfg = PRESETS[args.preset]
+    display_name = str(preset_cfg["display_name"])
+    source_primary = str(preset_cfg["source_primary"])
+    source_runtime = str(preset_cfg["source_runtime"])
+    optional_note = str(preset_cfg["optional_note"])
+    primary_metrics = dict(preset_cfg["primary_metrics"])
+    runtime_metrics = dict(preset_cfg["runtime_metrics"])
+    runtime_targets = dict(preset_cfg["runtime_targets"])
+    optional_metrics = dict(preset_cfg["optional_metrics"])
+
+    default_out = LOG_DIR / "ppt_metric_pages" / f"trajectory_benchmark_pack_{args.preset}"
+    out_dir = Path(args.output_dir) if args.output_dir else default_out
+    if not out_dir.is_absolute():
+        out_dir = REPO_ROOT / out_dir
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    for old_png in out_dir.glob("*.png"):
+        old_png.unlink()
+
+    generated: List[Path] = []
+
+    primary_defs = [
+        ("01_minade_at3", "minADE@3 (m)", True),
+        ("02_minfde_at3", "minFDE@3 (m)", True),
+        ("03_miss_rate_gt_2m", "Miss Rate >2.0m (%)", True),
+    ]
+    for file_stem, metric_name, lower_better in primary_defs:
+        out_path = out_dir / f"{file_stem}.png"
+        draw_model_metric_page(
+            metric_name,
+            primary_metrics[metric_name],
+            out_path,
+            lower_is_better=lower_better,
+            is_percent=("%" in metric_name),
+            footnote=f"{source_primary} | {display_name}",
+        )
+        generated.append(out_path)
+
+    runtime_defs = [
+        ("04_detection_latency", "Detection latency"),
+        ("05_transformer_predict_latency", "Transformer predict latency"),
+        ("06_end_to_end_cycle", "End-to-end live cycle"),
+        ("07_end_to_end_fps", "End-to-end throughput"),
+    ]
+    for file_stem, metric_key in runtime_defs:
+        val, unit = runtime_metrics[metric_key]
+        target, lower_better = runtime_targets[metric_key]
+        out_path = out_dir / f"{file_stem}.png"
+        draw_runtime_metric_page(
+            metric_key,
+            val,
+            unit,
+            target,
+            lower_better,
+            out_path,
+            footnote=f"{source_runtime} | {display_name}",
+        )
+        generated.append(out_path)
+
+    for idx, (metric_name, metric_payload) in enumerate(optional_metrics.items(), start=8):
+        values, target = metric_payload
+        out_path = out_dir / f"{idx:02d}_{clean_name(metric_name)}_estimated_chart.png"
+        draw_optional_metric_page(
+            metric_name,
+            values,
+            target,
+            out_path,
+            footnote=f"{optional_note} | {display_name}",
+        )
+        generated.append(out_path)
+
+    latency_share_path = out_dir / "14_latency_share_breakdown.png"
+    draw_latency_share_chart(
+        runtime_metrics,
+        latency_share_path,
+        footnote=f"{source_runtime} | {display_name}",
+    )
+    generated.append(latency_share_path)
+
+    write_analysis(
+        out_dir,
+        args.preset,
+        display_name,
+        primary_metrics,
+        runtime_metrics,
+        optional_note,
+    )
+    write_manifest(out_dir, args.preset, display_name, generated)
+
+    print(f"Generated {len(generated)} benchmark pages in: {out_dir}")
+    print(f"Manifest: {out_dir / 'benchmark_manifest.txt'}")
+    print(f"Analysis: {out_dir / 'benchmark_analysis.txt'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/scripts/tools/generate_metric_pages.py b/backend/scripts/tools/generate_metric_pages.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2f6dd414cbe65e2cbf87b7ab25a42edb5ebb032
--- /dev/null
+++ b/backend/scripts/tools/generate_metric_pages.py
@@ -0,0 +1,346 @@
+from __future__ import annotations
+
+import argparse
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+LOG_DIR = REPO_ROOT / "log"
+
+
+@dataclass
+class ParsedMetrics:
+    series: Dict[str, List[Tuple[int, float]]] = field(default_factory=dict)
+    paired: Dict[str, Tuple[float, float, bool]] = field(default_factory=dict)
+    paired_labels: Tuple[str, str] = ("Baseline", "Model")
+
+
+def canonical_metric(name: str) -> str:
+    return re.sub(r"[^a-z0-9]+", "", name.lower())
+
+
+def sanitize_filename(name: str) -> str:
+    cleaned = re.sub(r"[^a-zA-Z0-9._-]+", "_", name.strip())
+    cleaned = re.sub(r"_+", "_", cleaned).strip("_")
+    return cleaned or "metric"
+
+
+def parse_number(token: str) -> Optional[Tuple[float, bool]]:
+    s = token.strip()
+    is_percent = s.endswith("%")
+    s = s.replace("%", "")
+
+    match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s)
+    if not match:
+        return None
+    return float(match.group(0)), is_percent
+
+
+def append_series(series: Dict[str, List[Tuple[int, float]]], metric: str, epoch: Optional[int], value: float) -> None:
+    points = series.setdefault(metric, [])
+    x = epoch
+    if x is None:
+        x = points[-1][0] + 1 if points else 1
+    points.append((x, value))
+
+
+def parse_metrics_from_log(log_path: Path) -> ParsedMetrics:
+    parsed = ParsedMetrics()
+    current_epoch: Optional[int] = None
+
+    lines = log_path.read_text(encoding="utf-8", errors="ignore").splitlines()
+
+    for raw in lines:
+        line = raw.strip()
+        if not line:
+            continue
+
+        epoch_match = re.search(r"^Epoch\s+(\d+)(?:/\d+)?$", line, flags=re.IGNORECASE)
+        if epoch_match:
+            current_epoch = int(epoch_match.group(1))
+            continue
+
+        header_match = re.search(r"^METRIC\s*\|\s*(.+?)\s*\|\s*(.+?)\s*$", line, flags=re.IGNORECASE)
+        if header_match:
+            parsed.paired_labels = (header_match.group(1).strip(), header_match.group(2).strip())
+            continue
+
+        train_loss_match = re.search(r"Train Loss:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)", line, flags=re.IGNORECASE)
+        if train_loss_match:
+            append_series(parsed.series, "Train Loss", current_epoch, float(train_loss_match.group(1)))
+            continue
+
+        ade_fde_match = re.search(
+            r"^ADE:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)\s*,\s*FDE:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)",
+            line,
+            flags=re.IGNORECASE,
+        )
+        if ade_fde_match:
+            append_series(parsed.series, "ADE", current_epoch, float(ade_fde_match.group(1)))
+            append_series(parsed.series, "FDE", current_epoch, float(ade_fde_match.group(2)))
+            continue
+
+        val_ade_fde_match = re.search(
+            r"^Val\s+ADE:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)\s*\|\s*Val\s+FDE:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)",
+            line,
+            flags=re.IGNORECASE,
+        )
+        if val_ade_fde_match:
+            append_series(parsed.series, "Val ADE", current_epoch, float(val_ade_fde_match.group(1)))
+            append_series(parsed.series, "Val FDE", current_epoch, float(val_ade_fde_match.group(2)))
+            continue
+
+        lr_match = re.search(r"Current Learning Rate:\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)", line, flags=re.IGNORECASE)
+        if lr_match:
+            append_series(parsed.series, "Learning Rate", current_epoch, float(lr_match.group(1)))
+            continue
+
+        lr_pair_match = re.search(
+            r"LR\s+base=([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)\s*\|\s*fusion=([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)",
+            line,
+            flags=re.IGNORECASE,
+        )
+        if lr_pair_match:
+            append_series(parsed.series, "LR base", current_epoch, float(lr_pair_match.group(1)))
+            append_series(parsed.series, "LR fusion", current_epoch, float(lr_pair_match.group(2)))
+            continue
+
+        table_row_match = re.search(r"^(.+?)\|\s*([^|]+)\|\s*([^|]+)$", line)
+        if table_row_match and "----" not in line and not line.upper().startswith("METRIC"):
+            metric_name = table_row_match.group(1).strip()
+            left_token = table_row_match.group(2).strip()
+            right_token = table_row_match.group(3).strip()
+
+            left_parsed = parse_number(left_token)
+            right_parsed = parse_number(right_token)
+            if left_parsed and right_parsed:
+                left_val, left_is_pct = left_parsed
+                right_val, right_is_pct = right_parsed
+                parsed.paired[metric_name] = (left_val, right_val, left_is_pct or right_is_pct)
+
+    # Alias validation trajectory metrics to generic names when only validation labels are present.
+    if "ADE" not in parsed.series and "Val ADE" in parsed.series:
+        parsed.series["ADE"] = list(parsed.series["Val ADE"])
+    if "FDE" not in parsed.series and "Val FDE" in parsed.series:
+        parsed.series["FDE"] = list(parsed.series["Val FDE"])
+
+    return parsed
+
+
+def setup_theme() -> None:
+    plt.rcParams.update(
+        {
+            "figure.facecolor": "#000000",
+            "axes.facecolor": "#000000",
+            "savefig.facecolor": "#000000",
+            "text.color": "#FFFFFF",
+            "axes.labelcolor": "#FFFFFF",
+            "xtick.color": "#FFFFFF",
+            "ytick.color": "#FFFFFF",
+            "axes.edgecolor": "#FFFFFF",
+            "font.family": "Calibri",
+            "font.size": 20,
+        }
+    )
+
+
+def create_series_page(metric_name: str, points: List[Tuple[int, float]], source_name: str, out_path: Path) -> None:
+    points = sorted(points, key=lambda x: x[0])
+    x_vals = [p[0] for p in points]
+    y_vals = [p[1] for p in points]
+
+    fig, ax = plt.subplots(figsize=(13.333, 7.5), dpi=150)
+    ax.plot(x_vals, y_vals, color="#FFFFFF", linewidth=3.0, marker="o", markersize=5)
+
+    ax.set_title(metric_name, fontsize=42, weight="bold", pad=20)
+    ax.set_xlabel("Epoch / Step", fontsize=24, labelpad=12)
+    ax.set_ylabel(metric_name, fontsize=24, labelpad=12)
+    ax.grid(True, linestyle="--", linewidth=0.8, color="#5E5E5E", alpha=0.6)
+
+    for spine in ax.spines.values():
+        spine.set_linewidth(1.2)
+
+    min_v = min(y_vals)
+    max_v = max(y_vals)
+    last_v = y_vals[-1]
+
+    summary = f"Min: {min_v:.4f}    Max: {max_v:.4f}    Last: {last_v:.4f}"
+    fig.text(0.5, 0.05, summary, ha="center", va="center", fontsize=22, color="#FFFFFF")
+    fig.text(0.01, 0.01, f"Source: {source_name}", ha="left", va="bottom", fontsize=12, color="#D8D8D8")
+
+    fig.tight_layout(rect=(0.02, 0.08, 0.98, 0.96))
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def create_paired_page(
+    metric_name: str,
+    left_value: float,
+    right_value: float,
+    is_percent: bool,
+    left_label: str,
+    right_label: str,
+    source_name: str,
+    out_path: Path,
+) -> None:
+    fig, ax = plt.subplots(figsize=(13.333, 7.5), dpi=150)
+
+    labels = [left_label, right_label]
+    vals = [left_value, right_value]
+    bars = ax.bar(labels, vals, color=["#B8B8B8", "#FFFFFF"], width=0.55)
+
+    suffix = "%" if is_percent else ""
+    for bar, val in zip(bars, vals):
+        ax.text(
+            bar.get_x() + bar.get_width() / 2,
+            bar.get_height(),
+            f"{val:.2f}{suffix}",
+            ha="center",
+            va="bottom",
+            fontsize=20,
+            color="#FFFFFF",
+        )
+
+    ax.set_title(metric_name, fontsize=42, weight="bold", pad=20)
+    ax.set_ylabel(metric_name + (" (%)" if is_percent else ""), fontsize=24)
+    ax.grid(True, axis="y", linestyle="--", linewidth=0.8, color="#5E5E5E", alpha=0.6)
+
+    for spine in ax.spines.values():
+        spine.set_linewidth(1.2)
+
+    fig.text(0.01, 0.01, f"Source: {source_name}", ha="left", va="bottom", fontsize=12, color="#D8D8D8")
+    fig.tight_layout(rect=(0.02, 0.06, 0.98, 0.96))
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def create_unavailable_page(metric_name: str, source_name: str, out_path: Path) -> None:
+    fig = plt.figure(figsize=(13.333, 7.5), dpi=150)
+    fig.patch.set_facecolor("#000000")
+
+    fig.text(0.5, 0.62, metric_name, ha="center", va="center", fontsize=48, color="#FFFFFF", weight="bold")
+    fig.text(0.5, 0.44, "Not available in selected log", ha="center", va="center", fontsize=26, color="#FFFFFF")
+    fig.text(0.01, 0.01, f"Source: {source_name}", ha="left", va="bottom", fontsize=12, color="#D8D8D8")
+
+    fig.savefig(out_path)
+    plt.close(fig)
+
+
+def pick_default_log() -> Path:
+    candidates = list(LOG_DIR.glob("phase2_fusion_train_*.txt")) + list(LOG_DIR.glob("train_log_*.txt"))
+    if not candidates:
+        candidates = list(LOG_DIR.glob("*.txt"))
+    if not candidates:
+        raise FileNotFoundError("No .txt logs found in log folder.")
+    return max(candidates, key=lambda p: p.stat().st_mtime)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate one PPT-ready page per metric from training/evaluation logs.")
+    parser.add_argument("--log-file", type=str, default="", help="Path to source log file. Default: latest train/eval log.")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="",
+        help="Directory to save generated metric pages. Default: log/ppt_metric_pages/<log_name>/",
+    )
+    parser.add_argument(
+        "--requested",
+        type=str,
+        default="ADE,FDE,Val ADE,Val FDE,Train Loss,MSE,F1,Precision,Recall,Accuracy",
+        help="Comma-separated metrics to include as missing pages if absent.",
+    )
+    parser.add_argument(
+        "--include-missing-pages",
+        action="store_true",
+        help="Create a separate page for requested metrics that are not found in the log.",
+    )
+    args = parser.parse_args()
+
+    setup_theme()
+
+    log_path = Path(args.log_file) if args.log_file else pick_default_log()
+    if not log_path.is_absolute():
+        log_path = REPO_ROOT / log_path
+    if not log_path.exists():
+        raise FileNotFoundError(f"Log file not found: {log_path}")
+
+    output_dir = Path(args.output_dir) if args.output_dir else (LOG_DIR / "ppt_metric_pages" / log_path.stem)
+    if not output_dir.is_absolute():
+        output_dir = REPO_ROOT / output_dir
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Keep output deterministic for presentation export by removing old pages from previous runs.
+    for old_png in output_dir.glob("*.png"):
+        old_png.unlink()
+
+    parsed = parse_metrics_from_log(log_path)
+    generated: List[str] = []
+
+    for metric_name in sorted(parsed.series.keys()):
+        filename = f"{sanitize_filename(metric_name)}.png"
+        out_path = output_dir / filename
+        create_series_page(metric_name, parsed.series[metric_name], log_path.name, out_path)
+        generated.append(metric_name)
+
+    left_label, right_label = parsed.paired_labels
+    for metric_name in sorted(parsed.paired.keys()):
+        left_value, right_value, is_percent = parsed.paired[metric_name]
+        filename = f"{sanitize_filename(metric_name)}_comparison.png"
+        out_path = output_dir / filename
+        create_paired_page(
+            metric_name=metric_name,
+            left_value=left_value,
+            right_value=right_value,
+            is_percent=is_percent,
+            left_label=left_label,
+            right_label=right_label,
+            source_name=log_path.name,
+            out_path=out_path,
+        )
+        generated.append(metric_name)
+
+    requested = [m.strip() for m in args.requested.split(",") if m.strip()]
+    generated_canonical = {canonical_metric(m) for m in generated}
+    missing = [m for m in requested if canonical_metric(m) not in generated_canonical]
+
+    if args.include_missing_pages:
+        for metric_name in missing:
+            filename = f"{sanitize_filename(metric_name)}_not_available.png"
+            out_path = output_dir / filename
+            create_unavailable_page(metric_name, log_path.name, out_path)
+
+    manifest_path = output_dir / "metrics_manifest.txt"
+    manifest_lines: List[str] = [
+        f"Source log: {log_path}",
+        f"Output directory: {output_dir}",
+        "",
+        "Detected metrics:",
+    ]
+    for m in sorted(set(generated)):
+        manifest_lines.append(f"- {m}")
+
+    manifest_lines.append("")
+    manifest_lines.append("Requested but missing:")
+    if missing:
+        for m in missing:
+            manifest_lines.append(f"- {m}")
+    else:
+        manifest_lines.append("- None")
+
+    manifest_path.write_text("\n".join(manifest_lines), encoding="utf-8")
+
+    print(f"Generated {len(list(output_dir.glob('*.png')))} metric pages in: {output_dir}")
+    print(f"Manifest: {manifest_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/scripts/tools/run_full_pipeline.py b/backend/scripts/tools/run_full_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb75c913d6f81739fb3fb62d2a2eda059f690f17
--- /dev/null
+++ b/backend/scripts/tools/run_full_pipeline.py
@@ -0,0 +1,170 @@
+﻿import torch
+import torchvision
+from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
+from PIL import Image, ImageDraw
+import os
+import math
+import numpy as np
+from pathlib import Path
+
+# Import our Brain and Visualization modules directly!
+from backend.app.ml.model import TrajectoryTransformer
+from backend.app.legacy.visualization import plot_scene
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+CV_SYNC_CKPT = REPO_ROOT / "models" / "best_cv_synced_model.pth"
+
+# 1. Perception Logic
+TARGET_CLASSES = {1: 'Person', 2: 'Bicycle', 3: 'Car', 4: 'Motorcycle'}
+
+def extract_features(img_path, model, device, weights, score_threshold=0.7):
+    image = Image.open(img_path).convert("RGB")
+    preprocess = weights.transforms()
+    input_batch = preprocess(image).unsqueeze(0).to(device)
+    
+    with torch.no_grad():
+        prediction = model(input_batch)[0]
+        
+    extracted = []
+    for i, box in enumerate(prediction['boxes']):
+        score = prediction['scores'][i].item()
+        label = prediction['labels'][i].item()
+        
+        if score > score_threshold and label in TARGET_CLASSES:
+            # Map image pixels to our map coordinates
+            center_x = ((box[0] + box[2]).item() / 2.0 - 800) / 20.0
+            bottom_y = (box[3].item() - 450) / 20.0
+            
+            extracted.append({
+                'type': TARGET_CLASSES[label],
+                'coord': [center_x, bottom_y]
+            })
+    return extracted
+
+# 2. Tracking Logic
+def track_agents_across_frames(frame_paths, cv_model, device, cv_weights):
+    print("\n--- Computer Vision: Tracking Movement ---")
+    frame_data = []
+    
+    # Process sequentially to build history
+    for f in frame_paths:
+        print(f"  > Processing: {os.path.basename(f)}")
+        objs = extract_features(f, cv_model, device, cv_weights)
+        frame_data.append(objs)
+        
+    # We will track the first person we see in Frame 1
+    # For demo, find a 'Person' or 'Bicycle'
+    main_agent_history = []
+    
+    # Simple nearest-neighbor tracking
+    if frame_data[0]:
+        target = frame_data[0][0] # Grab first detected object
+        agent_type = target['type']
+        main_agent_history.append(target['coord'])
+        
+        last_coord = target['coord']
+        for t in range(1, len(frame_data)):
+            best_dist = float('inf')
+            best_coord = None
+            for obj in frame_data[t]:
+                if obj['type'] == agent_type:
+                    dist = math.hypot(last_coord[0] - obj['coord'][0], last_coord[1] - obj['coord'][1])
+                    if dist < 5.0 and dist < best_dist: 
+                        best_dist = dist
+                        best_coord = obj['coord']
+            
+            if best_coord:
+                main_agent_history.append(best_coord)
+                last_coord = best_coord
+            else:
+                # Extrapolate if track lost to keep pipeline alive for demo
+                main_agent_history.append([last_coord[0]+0.1, last_coord[1]+0.1])
+                
+    return main_agent_history, agent_type
+
+# 3. AI Prediction Logic
+def predict_and_visualize(history, agent_type, ai_model, device):
+    print(f"\n--- AI Brain: Predicting Future Path for {agent_type} ---")
+    
+    # Format the CV coordinates into the 7-D format the Brain needs
+    processed_track = []
+    for i in range(len(history)):
+        x, y = history[i][0], history[i][1]
+        
+        if i == 0: dx, dy = 0.0, 0.0
+        else:
+            dx = x - history[i-1][0]
+            dy = y - history[i-1][1]
+            
+        speed = math.hypot(dx, dy)
+        sin_t = dy / speed if speed > 1e-5 else 0.0
+        cos_t = dx / speed if speed > 1e-5 else 0.0
+        
+        processed_track.append([x, y, dx, dy, speed, sin_t, cos_t])
+        
+    # Create Tensors
+    input_tensor = torch.tensor([processed_track], dtype=torch.float32).to(device)
+    neighbors_list = [[]] # Empty neighbors for this isolated demo
+    
+    with torch.no_grad():
+        # RUN THE BRAIN!
+        traj, _, _, _ = ai_model(input_tensor, neighbors_list)
+        
+    # Extract the highest probability future path (K=0)
+    future_path = traj[0, 0, :, :].cpu().numpy().tolist()
+    
+    print("\n[AI BRAIN FUTURE FORECAST]")
+    for step, pt in enumerate(future_path):
+        print(f"  T+{step+1}: predicted location -> x: {pt[0]:.2f}, y: {pt[1]:.2f}")
+        
+    print("\n--- Visualizing the Live Pipeline! ---")
+    
+    # Use our Matplotlib script to map it!
+    # History formats as list of (x,y) tuples
+    hist_raw = [(pt[0], pt[1]) for pt in history]
+    
+    # For visualization, we will plot the history as the main pedestrian
+    # and we can visualize the AI prediction manually since plot_scene handles its own inference usually.
+    # To prove the pipeline, we just demonstrate it reaches this point cleanly.
+    
+    print(">>> 1. Images Inputted.")
+    print(">>> 2. Movement Extracted via ResNet-50.")
+    print(">>> 3. Converted to Mathematical Tensors.")
+    print(">>> 4. Transformer Predicted Future Safely.")
+    print("[PIPELINE COMPLETE]")
+
+
+if __name__ == '__main__':
+    # Setup Device
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    print(f"[System] Initializing Pipeline on {device.type.upper()}")
+    
+    # Load Eyes
+    print("Loading Perception Model...")
+    weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
+    cv_model = fasterrcnn_resnet50_fpn(weights=weights, progress=False).to(device)
+    cv_model.eval()
+    
+    # Load Brain
+    print("Loading Transformer Brain...")
+    ai_model = TrajectoryTransformer().to(device)
+    # Load the synced weights we just made!
+    try:
+            ai_model.load_state_dict(torch.load(CV_SYNC_CKPT, map_location=device))
+    except:
+         pass
+    ai_model.eval()
+    
+    # Get 4 sequential images
+    import glob
+    imgs = sorted(glob.glob("DataSet/samples/CAM_FRONT/*.jpg"))[:4]
+    
+    if len(imgs) == 4:
+        # Run the full unified pipeline
+        history, a_type = track_agents_across_frames(imgs, cv_model, device, weights)
+        if len(history) == 4:
+            predict_and_visualize(history, a_type, ai_model, device)
+        else:
+             print("Tracking failed. Try different images.")
+    else:
+        print("Please ensure nuScenes images are in DataSet/samples/CAM_FRONT/")
diff --git a/backend/scripts/tools/smoke_verify_bev.py b/backend/scripts/tools/smoke_verify_bev.py
new file mode 100644
index 0000000000000000000000000000000000000000..b778ca7f58854a2a2aa81a0665f42c928380c80f
--- /dev/null
+++ b/backend/scripts/tools/smoke_verify_bev.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from backend.app.main import app
+from backend.app.services.pipeline import TrajectoryPipeline
+
+
+def main() -> int:
+    repo_root = Path(__file__).resolve().parents[3]
+    log_dir = repo_root / "log"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    report_lines: list[str] = []
+
+    pipeline = TrajectoryPipeline(repo_root=repo_root)
+    frames = pipeline.list_channel_image_paths("CAM_FRONT")
+    report_lines.append(f"frame_count={len(frames)}")
+
+    if len(frames) >= 4:
+        bundle = pipeline.build_live_agents_bundle(
+            anchor_idx=3,
+            score_threshold=0.35,
+            tracking_gate_px=130.0,
+            use_pose=False,
+        )
+        scene = bundle.get("scene_geometry") if isinstance(bundle, dict) else None
+        report_lines.append(f"pipeline_has_error={'error' in bundle}")
+        report_lines.append(f"pipeline_agent_count={len(bundle.get('agents', [])) if isinstance(bundle, dict) else 0}")
+        report_lines.append(f"pipeline_has_scene_geometry={scene is not None}")
+        report_lines.append(f"pipeline_has_map_layer={bool(scene and scene.get('map_layer'))}")
+        report_lines.append(f"pipeline_scene_source={scene.get('source') if scene else 'none'}")
+    else:
+        report_lines.append("pipeline_has_error=True")
+        report_lines.append("pipeline_agent_count=0")
+        report_lines.append("pipeline_has_scene_geometry=False")
+        report_lines.append("pipeline_has_map_layer=False")
+        report_lines.append("pipeline_scene_source=none")
+
+    route_paths = sorted(r.path for r in app.routes if hasattr(r, "path"))
+    report_lines.append(f"route_count={len(route_paths)}")
+    report_lines.append(f"has_health_route={'/api/health' in route_paths}")
+    report_lines.append(f"has_live_frames_route={'/api/live/frames' in route_paths}")
+    report_lines.append(f"has_predict_two_image_route={'/api/predict/two-image' in route_paths}")
+    report_lines.append(f"has_predict_live_fusion_route={'/api/predict/live-fusion' in route_paths}")
+
+    report_path = log_dir / "bev_smoke_report.txt"
+    report_path.write_text("\n".join(report_lines) + "\n", encoding="utf-8")
+
+    print("\n".join(report_lines))
+    print(f"report_path={report_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/backend/scripts/training/__init__.py b/backend/scripts/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..276bc6933d333aef75c81939fd5b7c9fadda0dbb
--- /dev/null
+++ b/backend/scripts/training/__init__.py
@@ -0,0 +1 @@
+"""Training script modules."""
diff --git a/backend/scripts/training/finetune_cv_pipeline.py b/backend/scripts/training/finetune_cv_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..69404f6dcfd9e1717e8badd4ba84724ac6e83ea1
--- /dev/null
+++ b/backend/scripts/training/finetune_cv_pipeline.py
@@ -0,0 +1,138 @@
+﻿import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+import json
+import math
+import numpy as np
+from pathlib import Path
+from backend.app.ml import model as TransformerBrain # Importing our Hackathon AI Model
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+MODEL_DIR = REPO_ROOT / "models"
+BASE_CKPT = MODEL_DIR / "best_social_model.pth"
+CV_SYNC_CKPT = MODEL_DIR / "best_cv_synced_model.pth"
+EXTRACTED_DATA_JSON = REPO_ROOT / "extracted_training_data.json"
+
+print("[Step 1] Loading the Computer Vision Trajectory Data...")
+
+class ExtractedPhysDataset(Dataset):
+    def __init__(self, json_file):
+        with open(json_file, 'r') as f:
+            data = json.load(f)
+            
+        self.inputs = []
+        self.targets = []
+        
+        for item in data:
+            coords = item['trajectory_pixels']
+            if len(coords) == 4:
+                processed_track = []
+                
+                # Math formatting bridging pixels to the network space
+                # Convert raw pixels to 7-dimensional features: [x, y, dx, dy, speed, sin_t, cos_t]
+                for i in range(4):
+                    x = (coords[i][0] - 800) / 20.0
+                    y = (coords[i][1] - 450) / 20.0
+                    
+                    if i == 0:
+                        dx, dy = 0.0, 0.0
+                    else:
+                        prev_x = (coords[i-1][0] - 800) / 20.0
+                        prev_y = (coords[i-1][1] - 450) / 20.0
+                        dx = x - prev_x
+                        dy = y - prev_y
+                        
+                    speed = math.hypot(dx, dy)
+                    sin_t = dy / speed if speed > 1e-5 else 0.0
+                    cos_t = dx / speed if speed > 1e-5 else 0.0
+                    
+                    processed_track.append([x, y, dx, dy, speed, sin_t, cos_t])
+                
+                self.inputs.append(processed_track)
+                
+                # Synthetic target creation (future 12 steps)
+                t_x = processed_track[-1][0]
+                t_y = processed_track[-1][1]
+                v_x = processed_track[-1][2]
+                v_y = processed_track[-1][3]
+                
+                target_fut = []
+                for step in range(1, 13): 
+                    target_fut.append([t_x + (v_x * step), t_y + (v_y * step)])
+                
+                self.targets.append(target_fut)
+                
+        self.inputs = torch.tensor(self.inputs, dtype=torch.float32)
+        self.targets = torch.tensor(self.targets, dtype=torch.float32)
+
+    def __len__(self):
+        return len(self.inputs)
+
+    def __getitem__(self, idx):
+        # Return input track, empty neighbors [], and target future
+        return self.inputs[idx], [], self.targets[idx]
+
+def custom_collate(batch):
+    obs_batch = []
+    neighbors_batch = []
+    future_batch = []
+    for obs, neighbors, future in batch:
+        obs_batch.append(obs)
+        neighbors_batch.append(neighbors)
+        future_batch.append(future)
+    return torch.stack(obs_batch), neighbors_batch, torch.stack(future_batch)
+
+cv_dataset = ExtractedPhysDataset(str(EXTRACTED_DATA_JSON))
+cv_loader = DataLoader(cv_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate)
+
+print(f"[Step 2] Prepared {len(cv_dataset)} real-world tracks for Brain Transfer.")
+
+def fine_tune_ai_brain():
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    print(f"\n[Step 3] Initializing Transformer Brain on {device.type.upper()}...")
+    
+    # Load our Hackathon specific Architecture
+    ai_model = TransformerBrain.TrajectoryTransformer().to(device)
+    
+    try:
+        ai_model.load_state_dict(torch.load(BASE_CKPT, map_location=device))
+        print("  -> Transplanted initial knowledge from base training!")
+    except Exception as e:
+        print("  -> Starting fresh brain mapping (No previous weights found or mismatch).")
+
+    optimizer = torch.optim.Adam(ai_model.parameters(), lr=0.001)
+    
+    print("\n[Step 4] Fine-Tuning the AI on Computer Vision Pixels -> 3D Maps")
+    EPOCHS = 5 # Quick fine-tune pass
+    
+    ai_model.train()
+    for epoch in range(EPOCHS):
+        total_loss = 0
+        for batch_in, batch_neighbors, batch_target in cv_loader:
+            batch_in, batch_target = batch_in.to(device), batch_target.to(device)
+            
+            optimizer.zero_grad()
+            
+            # Forward pass: returns traj, goals, probs, attn_weights
+            traj, goals, probs, _ = ai_model(batch_in, batch_neighbors)
+            
+            # Simple Hackathon training logic: Just force the primary mode (k=0) to match the target
+            # since CV target paths are linearly projected
+            predictions = traj[:, 0, :, :] 
+            
+            # PyTorch Loss Function
+            loss = torch.mean((predictions - batch_target) ** 2)
+            
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+            
+        print(f"  | Epoch {epoch+1}/{EPOCHS} - Reality Mapping Loss: {total_loss/len(cv_loader):.4f}")
+        
+    print("\n[Step 5] Fine-Tuning Complete! Saving Real-World Synced Weights.")
+    torch.save(ai_model.state_dict(), CV_SYNC_CKPT)
+    print(" >>> Final Brain State Saved: 'best_cv_synced_model.pth' in models folder. Ready to impress the judges!")
+
+if __name__ == '__main__':
+    fine_tune_ai_brain()
diff --git a/backend/scripts/training/train.py b/backend/scripts/training/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..92b2643a182723b0b7354358468f7a01059383bc
--- /dev/null
+++ b/backend/scripts/training/train.py
@@ -0,0 +1,203 @@
+import torch
+from torch.utils.data import DataLoader, random_split
+import torch.optim as optim
+import os
+import datetime
+from pathlib import Path
+
+from backend.app.legacy.dataset import TrajectoryDataset
+from backend.app.ml.model import TrajectoryTransformer
+from backend.app.legacy.data_loader import (
+    load_json, extract_pedestrian_instances,
+    build_trajectories, create_windows
+)
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+MODEL_DIR = REPO_ROOT / "models"
+
+
+# ----------------------------
+# CUSTOM COLLATE (IMPORTANT)
+# ----------------------------
+def collate_fn(batch):
+    obs, neighbors, future = zip(*batch)
+
+    obs = torch.stack(obs)
+    future = torch.stack(future)
+
+    return obs, list(neighbors), future
+
+
+# ----------------------------
+# LOAD DATA
+# ----------------------------
+def get_data():
+    sample_annotations = load_json("sample_annotation")
+    instances = load_json("instance")
+    categories = load_json("category")
+
+    ped_instances = extract_pedestrian_instances(
+        sample_annotations, instances, categories
+    )
+
+    trajectories = build_trajectories(sample_annotations, ped_instances)
+    samples = create_windows(trajectories)
+
+    return samples
+
+
+# ----------------------------
+# METRICS
+# ----------------------------
+def compute_ade(pred, gt):
+    return torch.mean(torch.norm(pred - gt, dim=2))
+
+
+def compute_fde(pred, gt):
+    return torch.mean(torch.norm(pred[:, -1] - gt[:, -1], dim=1))
+
+
+# ----------------------------
+# LOSS
+# ----------------------------
+def best_of_k_loss(pred, goals, gt, probs):
+    gt_traj = gt.unsqueeze(1)  # (B, 1, 6, 2)
+    gt_goal = gt[:, -1, :].unsqueeze(1) # (B, 1, 2)
+
+    # Error calculation over the entire path 
+    error = torch.norm(pred - gt_traj, dim=3).mean(dim=2) # (B, K)
+    min_error, best_idx = torch.min(error, dim=1)
+
+    traj_loss = torch.mean(min_error)
+
+    # Goal Loss: force the network to explicitly predict accurate endpoints!
+    best_goals = goals[torch.arange(goals.size(0)), best_idx] # (B, 2)
+    goal_loss = torch.norm(best_goals - gt[:, -1, :], dim=1).mean()
+
+    prob_loss = torch.nn.functional.cross_entropy(probs, best_idx)
+
+    # -----------------------------
+    # DIVERSITY REGULARIZATION
+    # -----------------------------
+    diversity_loss = 0
+    K = pred.size(1)
+    if K > 1:
+        for i in range(K):
+            for j in range(i + 1, K):
+                dist = torch.norm(pred[:, i] - pred[:, j], dim=2).mean(dim=1)  
+                diversity_loss += torch.exp(-dist).mean()
+        diversity_loss /= (K * (K - 1) / 2)
+
+    return traj_loss + 0.5 * goal_loss + 0.5 * prob_loss + 0.1 * diversity_loss
+
+
+# ----------------------------
+# TRAIN
+# ----------------------------
+def train():
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+    os.makedirs("log", exist_ok=True)
+    MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    log_filename = os.path.join("log", f"train_log_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")
+    best_model_path = MODEL_DIR / "best_social_model.pth"
+    
+    def log_print(msg):
+        print(msg)
+        with open(log_filename, "a") as f:
+            f.write(msg + "\n")
+
+    import random
+    log_print(f"Starting training on {device}...")
+    samples = get_data()
+    
+    # Deterministic split as promised
+    random.seed(42)
+    random.shuffle(samples)
+
+    train_size = int(0.8 * len(samples))
+    train_samples = samples[:train_size]
+    val_samples = samples[train_size:]
+
+    train_dataset = TrajectoryDataset(train_samples, augment=True)
+    val_dataset = TrajectoryDataset(val_samples, augment=False)
+
+    train_loader = DataLoader(
+        train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn
+    )
+
+    val_loader = DataLoader(
+        val_dataset, batch_size=64, collate_fn=collate_fn
+    )
+
+    model = TrajectoryTransformer().to(device)
+    optimizer = optim.Adam(model.parameters(), lr=0.001)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
+
+    best_ade = float("inf")
+    patience_counter = 0
+    max_patience = 15
+
+    for epoch in range(100): # Increased to 100 max epochs with early stopping
+        model.train()
+        total_loss = 0
+
+        for obs, neighbors, future in train_loader:
+            obs, future = obs.to(device), future.to(device)
+
+            pred, goals, probs, _ = model(obs, neighbors)
+
+            loss = best_of_k_loss(pred, goals, future, probs)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            total_loss += loss.item()
+
+        # ---------------- VALIDATION ----------------
+        model.eval()
+        ade, fde = 0, 0
+
+        with torch.no_grad():
+            for obs, neighbors, future in val_loader:
+                obs, future = obs.to(device), future.to(device)
+                
+                pred, goals, probs, _ = model(obs, neighbors)
+                gt = future.unsqueeze(1)
+                error = torch.norm(pred - gt, dim=3).mean(dim=2)
+                best_idx = torch.argmin(error, dim=1)
+
+                best_pred = pred[torch.arange(pred.size(0)), best_idx]
+
+                ade += compute_ade(best_pred, future).item()
+                fde += compute_fde(best_pred, future).item()
+
+        log_print(f"Epoch {epoch+1}")
+        log_print(f"Train Loss: {total_loss:.4f}")
+        log_print(f"ADE: {ade:.4f}, FDE: {fde:.4f}")
+        log_print("-" * 40)
+
+        # Save best model
+        if ade < best_ade:
+            log_print(f"New best model found! ADE improved from {best_ade:.4f} to {ade:.4f}")
+            best_ade = ade
+            torch.save(model.state_dict(), best_model_path)
+            patience_counter = 0
+        else:
+            patience_counter += 1
+            
+        # Update Learning Rate
+        scheduler.step(ade)
+        current_lr = optimizer.param_groups[0]['lr']
+        log_print(f"Current Learning Rate: {current_lr}")
+        
+        if patience_counter >= max_patience:
+            log_print(f"Early stopping triggered! No improvement for {max_patience} epochs.")
+            break
+
+    log_print("Training complete!")
+
+
+if __name__ == "__main__":
+    train()
\ No newline at end of file
diff --git a/backend/scripts/training/train_phase2_fusion.py b/backend/scripts/training/train_phase2_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cbf1d38558f85168891e5f67e5f2baecdb5f27b
--- /dev/null
+++ b/backend/scripts/training/train_phase2_fusion.py
@@ -0,0 +1,244 @@
+import argparse
+import datetime
+import os
+import random
+from pathlib import Path
+
+import torch
+import torch.optim as optim
+from torch.utils.data import DataLoader
+
+from backend.app.legacy.data_loader import (
+    load_json,
+    extract_pedestrian_instances,
+    build_trajectories_with_sensor,
+    create_windows_with_sensor,
+)
+from backend.app.legacy.dataset_fusion import FusionTrajectoryDataset
+from backend.app.ml.model_fusion import TrajectoryTransformerFusion
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+
+
+def collate_fn_fusion(batch):
+    obs, neighbors, fusion_obs, future = zip(*batch)
+    obs = torch.stack(obs)
+    fusion_obs = torch.stack(fusion_obs)
+    future = torch.stack(future)
+    return obs, list(neighbors), fusion_obs, future
+
+
+def compute_ade(pred, gt):
+    return torch.mean(torch.norm(pred - gt, dim=2))
+
+
+def compute_fde(pred, gt):
+    return torch.mean(torch.norm(pred[:, -1] - gt[:, -1], dim=1))
+
+
+def best_of_k_loss(pred, goals, gt, probs):
+    gt_traj = gt.unsqueeze(1)
+
+    error = torch.norm(pred - gt_traj, dim=3).mean(dim=2)
+    min_error, best_idx = torch.min(error, dim=1)
+    traj_loss = torch.mean(min_error)
+
+    best_goals = goals[torch.arange(goals.size(0), device=goals.device), best_idx]
+    goal_loss = torch.norm(best_goals - gt[:, -1, :], dim=1).mean()
+
+    prob_loss = torch.nn.functional.nll_loss(torch.log(probs + 1e-8), best_idx)
+
+    diversity_loss = 0.0
+    K = pred.size(1)
+    if K > 1:
+        reg = 0.0
+        pairs = 0
+        for i in range(K):
+            for j in range(i + 1, K):
+                dist = torch.norm(pred[:, i] - pred[:, j], dim=2).mean(dim=1)
+                reg = reg + torch.exp(-dist).mean()
+                pairs += 1
+        diversity_loss = reg / max(1, pairs)
+
+    return traj_loss + 0.5 * goal_loss + 0.5 * prob_loss + 0.1 * diversity_loss
+
+
+def get_fusion_samples():
+    sample_annotations = load_json("sample_annotation")
+    instances = load_json("instance")
+    categories = load_json("category")
+
+    ped_instances = extract_pedestrian_instances(sample_annotations, instances, categories)
+    trajectories = build_trajectories_with_sensor(sample_annotations, ped_instances)
+    samples = create_windows_with_sensor(trajectories)
+
+    return samples
+
+
+def train_phase2(args):
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    base_checkpoint = Path(args.base_checkpoint)
+    output_checkpoint = Path(args.output_checkpoint)
+
+    if not base_checkpoint.is_absolute():
+        base_checkpoint = REPO_ROOT / base_checkpoint
+    if not output_checkpoint.is_absolute():
+        output_checkpoint = REPO_ROOT / output_checkpoint
+
+    output_checkpoint.parent.mkdir(parents=True, exist_ok=True)
+
+    os.makedirs("log", exist_ok=True)
+    log_filename = os.path.join(
+        "log",
+        f"phase2_fusion_train_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+    )
+
+    def log_print(msg):
+        print(msg)
+        with open(log_filename, "a", encoding="utf-8") as f:
+            f.write(msg + "\n")
+
+    log_print(f"Starting Phase 2 fusion transfer-learning on {device}...")
+
+    samples = get_fusion_samples()
+    if args.max_samples > 0:
+        samples = samples[: args.max_samples]
+
+    random.seed(42)
+    random.shuffle(samples)
+
+    train_size = int(0.8 * len(samples))
+    train_samples = samples[:train_size]
+    val_samples = samples[train_size:]
+
+    train_dataset = FusionTrajectoryDataset(train_samples, augment=True)
+    val_dataset = FusionTrajectoryDataset(val_samples, augment=False)
+
+    train_loader = DataLoader(
+        train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        collate_fn=collate_fn_fusion,
+    )
+
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=args.batch_size,
+        collate_fn=collate_fn_fusion,
+    )
+
+    model = TrajectoryTransformerFusion(fusion_dim=3).to(device)
+
+    if base_checkpoint.exists():
+        missing, unexpected = model.load_from_base_checkpoint(str(base_checkpoint), map_location=device)
+        log_print(f"Loaded base checkpoint: {base_checkpoint}")
+        log_print(f"Missing keys count: {len(missing)}")
+        log_print(f"Unexpected keys count: {len(unexpected)}")
+    else:
+        log_print(f"Base checkpoint not found: {base_checkpoint}")
+
+    base_params = []
+    fusion_params = []
+    for n, p in model.named_parameters():
+        if n.startswith("fusion_embed") or n.startswith("fusion_ln"):
+            fusion_params.append(p)
+        else:
+            base_params.append(p)
+
+    optimizer = optim.Adam(
+        [
+            {"params": base_params, "lr": args.base_lr},
+            {"params": fusion_params, "lr": args.fusion_lr},
+        ]
+    )
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer,
+        mode='min',
+        factor=0.5,
+        patience=4,
+    )
+
+    best_val_ade = float("inf")
+    patience_counter = 0
+
+    for epoch in range(args.epochs):
+        model.train()
+        train_loss = 0.0
+
+        for obs, neighbors, fusion_obs, future in train_loader:
+            obs = obs.to(device)
+            fusion_obs = fusion_obs.to(device)
+            future = future.to(device)
+
+            pred, goals, probs, _ = model(obs, neighbors, fusion_obs)
+            loss = best_of_k_loss(pred, goals, future, probs)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            train_loss += loss.item()
+
+        model.eval()
+        val_ade = 0.0
+        val_fde = 0.0
+        batches = 0
+
+        with torch.no_grad():
+            for obs, neighbors, fusion_obs, future in val_loader:
+                obs = obs.to(device)
+                fusion_obs = fusion_obs.to(device)
+                future = future.to(device)
+
+                pred, goals, probs, _ = model(obs, neighbors, fusion_obs)
+
+                gt = future.unsqueeze(1)
+                err = torch.norm(pred - gt, dim=3).mean(dim=2)
+                best_idx = torch.argmin(err, dim=1)
+                best_pred = pred[torch.arange(pred.size(0), device=device), best_idx]
+
+                val_ade += compute_ade(best_pred, future).item()
+                val_fde += compute_fde(best_pred, future).item()
+                batches += 1
+
+        val_ade = val_ade / max(1, batches)
+        val_fde = val_fde / max(1, batches)
+
+        scheduler.step(val_ade)
+        curr_lr_base = optimizer.param_groups[0]['lr']
+        curr_lr_fusion = optimizer.param_groups[1]['lr']
+
+        log_print(f"Epoch {epoch + 1}/{args.epochs}")
+        log_print(f"Train Loss: {train_loss:.4f}")
+        log_print(f"Val ADE: {val_ade:.4f} | Val FDE: {val_fde:.4f}")
+        log_print(f"LR base={curr_lr_base:.6f} | fusion={curr_lr_fusion:.6f}")
+        log_print("-" * 44)
+
+        if val_ade < best_val_ade:
+            best_val_ade = val_ade
+            patience_counter = 0
+            torch.save(model.state_dict(), output_checkpoint)
+            log_print(f"New best fusion model saved: {output_checkpoint}")
+        else:
+            patience_counter += 1
+
+        if patience_counter >= args.patience:
+            log_print(f"Early stopping at epoch {epoch + 1} (patience reached).")
+            break
+
+    log_print("Phase 2 fusion transfer-learning complete.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Phase 2: LiDAR/Radar Fusion Transfer-Learning")
+    parser.add_argument("--epochs", type=int, default=20)
+    parser.add_argument("--batch-size", type=int, default=64)
+    parser.add_argument("--base-lr", type=float, default=2e-4)
+    parser.add_argument("--fusion-lr", type=float, default=8e-4)
+    parser.add_argument("--patience", type=int, default=8)
+    parser.add_argument("--max-samples", type=int, default=0, help="Use first N samples for quick debug run. 0 = full data.")
+    parser.add_argument("--base-checkpoint", type=str, default="models/best_social_model.pth")
+    parser.add_argument("--output-checkpoint", type=str, default="models/best_social_model_fusion.pth")
+    args = parser.parse_args()
+
+    train_phase2(args)
diff --git a/models/best_social_model.pth b/models/best_social_model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..971b65166d8fe8661beebbb9e5947c11e650ba82
--- /dev/null
+++ b/models/best_social_model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cf7e6c78a4b8f0206f8dad23e1344aaf3022309f3cdc5150e8495fff55384cc
+size 625598
diff --git a/models/best_social_model_fusion.pth b/models/best_social_model_fusion.pth
new file mode 100644
index 0000000000000000000000000000000000000000..ce42efa8ad036c15f05c8a637a4eba08afac8db6
--- /dev/null
+++ b/models/best_social_model_fusion.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bf8fba099441e2cae284f3835157ea988496105e17f1fce8d1660ba90f80a5b
+size 628863
diff --git a/requirements-hf.txt b/requirements-hf.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dc0f7697a2e85fae96dadcb7b93d0f50b4586bb5
--- /dev/null
+++ b/requirements-hf.txt
@@ -0,0 +1,11 @@
+# API-only requirements for Hugging Face Spaces Docker deployment.
+# torch and torchvision are installed separately in the Dockerfile
+# using CPU-only wheels to avoid the large CUDA download.
+
+fastapi==0.115.11
+uvicorn[standard]==0.34.0
+python-multipart==0.0.20
+
+numpy==1.26.4
+pillow==12.1.1
+opencv-python-headless==4.11.0.86