Spaces:

Draken1606
/

undertrial-ai

Sleeping

App Files Files Community

Shabista Sehar committed on Apr 24

Commit

a085ad1

1 Parent(s): d8f8a45

modified

Browse files

Files changed (11) hide show

client.py +2 -1
demo_comparison.py +222 -0
models.py +1 -0
openenv.yaml +1 -1
pyproject.toml +1 -0
requirements.txt +8 -0
server/app.py +11 -1
server/performance_tracker.py +3 -0
server/reward.py +9 -0
server/undertrial_environment.py +2 -1
training/train_grpo.py +110 -9

client.py CHANGED Viewed

@@ -19,7 +19,7 @@ from .models import (
     AssessSuretyAction, ClassifyBailTypeAction,
     ReadSubmissionsAction, AssessFlightRiskAction,
     CheckCaseFactorsAction, ApplyProportionalityAction,
-    SubmitMemoAction,
     StepResult,
 )
@@ -141,5 +141,6 @@ __all__ = [
     "AssessFlightRiskAction",
     "CheckCaseFactorsAction",
     "ApplyProportionalityAction",
     "SubmitMemoAction",
 ]

     AssessSuretyAction, ClassifyBailTypeAction,
     ReadSubmissionsAction, AssessFlightRiskAction,
     CheckCaseFactorsAction, ApplyProportionalityAction,
+    PullCriminalHistoryAction, SubmitMemoAction,
     StepResult,
 )
     "AssessFlightRiskAction",
     "CheckCaseFactorsAction",
     "ApplyProportionalityAction",
+    "PullCriminalHistoryAction",
     "SubmitMemoAction",
 ]

demo_comparison.py ADDED Viewed

	@@ -0,0 +1,222 @@

+"""
+UndertriAI — Before/After Demo Comparison Script
+Demonstrates the environment using DEMO001 (Ramesh Kumar — IPC 420 cheating case).
+Shows two simulated agent trajectories on the SAME case:
+  1. Naive agent: skips tools, guesses wrong
+  2. Skilled agent: uses tools properly, reaches correct conclusion
+This script does NOT require a trained model — it simulates both agent
+behaviors programmatically to show the reward difference.
+Usage:
+    python demo_comparison.py
+"""
+import sys
+import os
+import json
+# Add parent of project root so relative imports within the package work
+_project_root = os.path.dirname(os.path.abspath(__file__))
+_parent = os.path.dirname(_project_root)
+_pkg_name = os.path.basename(_project_root)
+if _parent not in sys.path:
+    sys.path.insert(0, _parent)
+# Import via package name (needed for relative imports in server/)
+_env_mod = __import__(f"{_pkg_name}.server.undertrial_environment", fromlist=["UndertriAIEnvironment"])
+UndertriAIEnvironment = _env_mod.UndertriAIEnvironment
+_models_mod = __import__(f"{_pkg_name}.models", fromlist=[
+    "ComputeStatutoryEligibilityAction", "AssessFlightRiskAction",
+    "ReadSubmissionsAction", "CheckCaseFactorsAction", "SubmitMemoAction",
+])
+ComputeStatutoryEligibilityAction = _models_mod.ComputeStatutoryEligibilityAction
+AssessFlightRiskAction = _models_mod.AssessFlightRiskAction
+ReadSubmissionsAction = _models_mod.ReadSubmissionsAction
+CheckCaseFactorsAction = _models_mod.CheckCaseFactorsAction
+SubmitMemoAction = _models_mod.SubmitMemoAction
+def run_demo():
+    """Run before/after comparison on DEMO001."""
+    print("=" * 65)
+    print("  UndertriAI — Before vs After Training Demo")
+    print("  Case: DEMO001 — Ramesh Kumar vs State of Delhi (IPC 420)")
+    print("=" * 65)
+    env = UndertriAIEnvironment()
+    # ================================================================
+    # NAIVE AGENT (simulates untrained model behavior)
+    # ================================================================
+    print("\n" + "─" * 65)
+    print("  NAIVE AGENT (before training)")
+    print("─" * 65)
+    obs = env.reset(stage=1, seed=0)
+    print(f"  Case: {obs.case_title}")
+    print(f"  Crime: {obs.crime_type} | Sections: {obs.ipc_sections}")
+    print(f"  Custody: {env._episode.get('custody_months')} months")
+    # Naive agent: calls one tool minimally, then submits wrong answer
+    print("\n  Step 1: Read submissions (both)")
+    result = env.step(ReadSubmissionsAction(
+        party="both",
+    ))
+    print(f"    → {result.observation.action_result[:80]}...")
+    # Naive agent gets the outcome WRONG (denies bail when it should be granted)
+    print("\n  Step 2: Submit memo (WRONG — denies bail)")
+    result = env.step(SubmitMemoAction(
+        flight_risk="High",
+        flight_risk_justification="Accused may flee",
+        statutory_eligible=False,
+        statutory_computation="Unknown sections, cannot determine",
+        grounds_for_bail=["None identified"],
+        grounds_against_bail=["Serious charge"],
+        recommended_outcome="Bail Denied",
+        recommended_conditions=[],
+    ))
+    naive_reward = result.reward
+    naive_info = result.info
+    print(f"\n  NAIVE REWARD: {naive_reward:.4f}")
+    print(f"    Outcome match:     {naive_info.get('outcome_match', 'N/A')}")
+    print(f"    Flight risk acc:   {naive_info.get('flight_risk_accuracy', 'N/A')}")
+    print(f"    Statutory acc:     {naive_info.get('statutory_accuracy', 'N/A')}")
+    print(f"    Condition score:   {naive_info.get('condition_appropriateness', 'N/A')}")
+    print(f"    Bias penalty:      {naive_info.get('bias_penalty', 'N/A')}")
+    print(f"    Ground truth:      {naive_info.get('ground_truth_outcome', 'N/A')}")
+    # ================================================================
+    # SKILLED AGENT (simulates trained model behavior)
+    # ================================================================
+    print("\n" + "─" * 65)
+    print("  SKILLED AGENT (after training)")
+    print("─" * 65)
+    obs = env.reset(stage=1, seed=0)  # Same case
+    print(f"  Case: {obs.case_title}")
+    # Skilled agent: uses multiple relevant tools
+    print("\n  Step 1: Read submissions (both)")
+    result = env.step(ReadSubmissionsAction(party="both"))
+    print(f"    → {result.observation.action_result[:80]}...")
+    print("\n  Step 2: Compute statutory eligibility")
+    result = env.step(ComputeStatutoryEligibilityAction(
+        sections_invoked=["420"],
+        max_sentence_years=7.0,
+        custody_months=8.0,
+        special_law_applicable=False,
+    ))
+    print(f"    → {result.observation.action_result[:100]}...")
+    print("\n  Step 3: Assess flight risk")
+    result = env.step(AssessFlightRiskAction(
+        severity_of_offence="moderate",
+        roots_in_community="Permanent resident of Delhi, family with minor children",
+        prior_absconding=False,
+        passport_status="unknown",
+    ))
+    print(f"    → {result.observation.action_result[:100]}...")
+    print("\n  Step 4: Check case factors")
+    result = env.step(CheckCaseFactorsAction(
+        factors_to_check=["nature_of_offence", "criminal_history", "evidence_tampering"],
+    ))
+    print(f"    → {result.observation.action_result[:100]}...")
+    # Skilled agent: correct outcome with proper reasoning
+    print("\n  Step 5: Submit memo (CORRECT — grants bail with conditions)")
+    result = env.step(SubmitMemoAction(
+        flight_risk="Low",
+        flight_risk_justification=(
+            "Accused is a permanent resident of Delhi with family ties including "
+            "two minor children. No prior criminal record. IPC 420 is a moderate "
+            "offence. No evidence of prior absconding. Prosecution has not cited "
+            "any flight risk. Community roots are strong."
+        ),
+        statutory_eligible=False,
+        statutory_computation=(
+            "IPC Section 420: max sentence 7 years (84 months). "
+            "BNSS 479 threshold = 42 months (50%). "
+            "Time served = 8 months (9.5%). "
+            "Threshold NOT yet met — not eligible for default bail. "
+            "However, bail sought on merits, not statutory default."
+        ),
+        grounds_for_bail=[
+            "No prior criminal record — first-time offender",
+            "Permanent resident of Delhi with strong family ties",
+            "Two minor children dependent on accused",
+            "No flight risk identified by prosecution",
+            "Offence is non-violent (cheating, not bodily harm)",
+        ],
+        grounds_against_bail=[
+            "Investigation still pending per prosecution",
+            "Alleged fraud of Rs. 50,000",
+        ],
+        recommended_outcome="Bail Granted",
+        recommended_conditions=[
+            "Personal bond of Rs. 25,000 with one local surety",
+            "Weekly reporting to the concerned police station",
+            "Surrender passport if held",
+            "Not to leave Delhi without court permission",
+            "Cooperate with ongoing investigation",
+        ],
+    ))
+    skilled_reward = result.reward
+    skilled_info = result.info
+    print(f"\n  SKILLED REWARD: {skilled_reward:.4f}")
+    print(f"    Outcome match:     {skilled_info.get('outcome_match', 'N/A')}")
+    print(f"    Flight risk acc:   {skilled_info.get('flight_risk_accuracy', 'N/A')}")
+    print(f"    Statutory acc:     {skilled_info.get('statutory_accuracy', 'N/A')}")
+    print(f"    Condition score:   {skilled_info.get('condition_appropriateness', 'N/A')}")
+    print(f"    Bias penalty:      {skilled_info.get('bias_penalty', 'N/A')}")
+    print(f"    Ground truth:      {skilled_info.get('ground_truth_outcome', 'N/A')}")
+    # ================================================================
+    # COMPARISON
+    # ================================================================
+    print("\n" + "═" * 65)
+    print("  COMPARISON SUMMARY")
+    print("═" * 65)
+    delta = skilled_reward - naive_reward
+    print(f"  Naive agent reward:   {naive_reward:.4f}")
+    print(f"  Skilled agent reward: {skilled_reward:.4f}")
+    print(f"  Improvement:          {delta:+.4f} ({delta/max(0.01, abs(naive_reward))*100:+.0f}%)")
+    print()
+    # Component-by-component comparison
+    components = [
+        ("Outcome Match", "outcome_match"),
+        ("Flight Risk",   "flight_risk_accuracy"),
+        ("Statutory",     "statutory_accuracy"),
+        ("Conditions",    "condition_appropriateness"),
+        ("Bias Penalty",  "bias_penalty"),
+    ]
+    print(f"  {'Component':<20} {'Naive':>8} {'Skilled':>8} {'Delta':>8}")
+    print(f"  {'─'*20} {'─'*8} {'─'*8} {'─'*8}")
+    for name, key in components:
+        n = naive_info.get(key, 0)
+        s = skilled_info.get(key, 0)
+        d = s - n
+        sign = "+" if d >= 0 else ""
+        print(f"  {name:<20} {n:>8.3f} {s:>8.3f} {sign}{d:>7.3f}")
+    print()
+    print(f"  Ground truth: {skilled_info.get('ground_truth_outcome', '?')}")
+    print(f"  Naive agent:  Bail Denied (WRONG)")
+    print(f"  Skilled agent: Bail Granted (CORRECT)")
+    print("═" * 65)
+    return {
+        "naive_reward": naive_reward,
+        "skilled_reward": skilled_reward,
+        "delta": delta,
+    }
+if __name__ == "__main__":
+    results = run_demo()

models.py CHANGED Viewed

@@ -200,6 +200,7 @@ BailAction = Union[
     AssessFlightRiskAction,
     CheckCaseFactorsAction,
     ApplyProportionalityAction,
     SubmitMemoAction,
 ]

     AssessFlightRiskAction,
     CheckCaseFactorsAction,
     ApplyProportionalityAction,
+    PullCriminalHistoryAction,
     SubmitMemoAction,
 ]

openenv.yaml CHANGED Viewed

@@ -131,7 +131,7 @@ endpoints:
 training:
   method: GRPO
   framework: TRL + Unsloth
-  model: unsloth/Qwen2.5-7B-Instruct
   notebook: training/UndertriAI_GRPO_Training.ipynb
   script: training/train_grpo.py
   modes:

 training:
   method: GRPO
   framework: TRL + Unsloth
+  model: unsloth/Qwen2.5-3B-Instruct
   notebook: training/UndertriAI_GRPO_Training.ipynb
   script: training/train_grpo.py
   modes:

pyproject.toml CHANGED Viewed

@@ -32,6 +32,7 @@ train = [
     "torch>=2.1.0",
     "datasets>=2.18.0",
     "transformers>=4.40.0",
 ]
 [project.scripts]

     "torch>=2.1.0",
     "datasets>=2.18.0",
     "transformers>=4.40.0",
+    "matplotlib>=3.7.0",
 ]
 [project.scripts]

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+# UndertriAI — Server dependencies
+fastapi>=0.110.0
+uvicorn[standard]>=0.27.0
+pydantic>=2.6.0
+websockets>=12.0
+openenv-core>=0.1.0
+matplotlib>=3.7.0
+httpx>=0.27.0

server/app.py CHANGED Viewed

@@ -4,6 +4,7 @@ Wraps UndertriAIEnvironment as an OpenEnv-compatible HTTP + WebSocket server.
 """
 import os
 from pathlib import Path
 from dataclasses import dataclass, field
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
@@ -13,6 +14,8 @@ import json
 import uuid
 from typing import List, Optional
 from .undertrial_environment import UndertriAIEnvironment
 from .performance_tracker import PerformanceTracker
 from .adaptive_selector import AdaptiveSelector
@@ -215,6 +218,11 @@ def step(payload: dict):
                         v["curriculum_stage"] = stage
                         env.dataset._episodes.setdefault(stage, []).append(v)
                     session.synthetic_cases_generated += len(variants)
     return {
         "session_id": session_id,
@@ -255,6 +263,7 @@ def list_tools():
             {"name": "assess_flight_risk",           "description": "Systematic flight risk assessment with scoring matrix"},
             {"name": "check_case_factors",           "description": "Examine specific case factors (parity, evidence tampering, victim vulnerability)"},
             {"name": "apply_proportionality",        "description": "Apply BNSS 479 proportionality: custody vs. max sentence vs. trial timeline"},
             {"name": "submit_memo",                  "description": "TERMINAL — Submit structured bail assessment memo"},
         ]
     }
@@ -337,7 +346,7 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
                     AssessSuretyAction, ClassifyBailTypeAction,
                     ReadSubmissionsAction, AssessFlightRiskAction,
                     CheckCaseFactorsAction, ApplyProportionalityAction,
-                    SubmitMemoAction,
                 )
                 ACTION_MAP = {
                     "request_document":              RequestDocumentAction,
@@ -350,6 +359,7 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
                     "assess_flight_risk":            AssessFlightRiskAction,
                     "check_case_factors":            CheckCaseFactorsAction,
                     "apply_proportionality":         ApplyProportionalityAction,
                     "submit_memo":                   SubmitMemoAction,
                 }
                 action_cls = ACTION_MAP.get(tool_name)

 """
 import os
+import logging
 from pathlib import Path
 from dataclasses import dataclass, field
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 import uuid
 from typing import List, Optional
+logger = logging.getLogger("undertrial")
 from .undertrial_environment import UndertriAIEnvironment
 from .performance_tracker import PerformanceTracker
 from .adaptive_selector import AdaptiveSelector
                         v["curriculum_stage"] = stage
                         env.dataset._episodes.setdefault(stage, []).append(v)
                     session.synthetic_cases_generated += len(variants)
+                    for v in variants:
+                        logger.info(
+                            f"Synthetic case generated: {v['case_id']} "
+                            f"({v.get('perturbation_type', 'unknown')})"
+                        )
     return {
         "session_id": session_id,
             {"name": "assess_flight_risk",           "description": "Systematic flight risk assessment with scoring matrix"},
             {"name": "check_case_factors",           "description": "Examine specific case factors (parity, evidence tampering, victim vulnerability)"},
             {"name": "apply_proportionality",        "description": "Apply BNSS 479 proportionality: custody vs. max sentence vs. trial timeline"},
+            {"name": "pull_criminal_history",        "description": "Pull accused's prior criminal record, bail history, and conviction status"},
             {"name": "submit_memo",                  "description": "TERMINAL — Submit structured bail assessment memo"},
         ]
     }
                     AssessSuretyAction, ClassifyBailTypeAction,
                     ReadSubmissionsAction, AssessFlightRiskAction,
                     CheckCaseFactorsAction, ApplyProportionalityAction,
+                    PullCriminalHistoryAction, SubmitMemoAction,
                 )
                 ACTION_MAP = {
                     "request_document":              RequestDocumentAction,
                     "assess_flight_risk":            AssessFlightRiskAction,
                     "check_case_factors":            CheckCaseFactorsAction,
                     "apply_proportionality":         ApplyProportionalityAction,
+                    "pull_criminal_history":         PullCriminalHistoryAction,
                     "submit_memo":                   SubmitMemoAction,
                 }
                 action_cls = ACTION_MAP.get(tool_name)

server/performance_tracker.py CHANGED Viewed

@@ -37,6 +37,9 @@ class PerformanceTracker:
     Thread-safe for single-session use (no locks needed).
     All public methods handle missing/malformed input gracefully.
     """
     def __init__(self, alpha: float = 0.1):

     Thread-safe for single-session use (no locks needed).
     All public methods handle missing/malformed input gracefully.
+    NOTE: Tracker state is in-memory only. Server restart clears history.
+    For production: persist via tracker.get_profile() → JSON file on /reset.
     """
     def __init__(self, alpha: float = 0.1):

server/reward.py CHANGED Viewed

@@ -208,6 +208,11 @@ def compute_statutory_accuracy(
             return 0.0
     # ── Standard IPC/BNSS statutory scoring ──────────────────────────────
     # Compute ground-truth eligibility for cases with known custody duration
     half_sent_months = (max_sent * 12) / 2
     truly_eligible   = (custody_mo >= half_sent_months) and not special_laws
@@ -244,6 +249,10 @@ def compute_statutory_accuracy(
     elif has_numbers or has_time_ref:
         score += 0.15 if direction_correct else 0.05
     return min(1.0, score)

             return 0.0
     # ── Standard IPC/BNSS statutory scoring ──────────────────────────────
+    # D4: Detect unreliable custody_months=6.0 default on serious crimes.
+    # 74% of episodes have custody_months=6.0 which may be a dataset default.
+    # Cap score at 0.60 to avoid rewarding threshold arithmetic on unreliable data.
+    custody_unreliable = (custody_mo == 6.0 and max_sent > 3.0)
     # Compute ground-truth eligibility for cases with known custody duration
     half_sent_months = (max_sent * 12) / 2
     truly_eligible   = (custody_mo >= half_sent_months) and not special_laws
     elif has_numbers or has_time_ref:
         score += 0.15 if direction_correct else 0.05
+    # D4: Cap score when custody data is unreliable (likely dataset default)
+    if custody_unreliable:
+        score = min(score, 0.60)
     return min(1.0, score)

server/undertrial_environment.py CHANGED Viewed

@@ -307,11 +307,12 @@ class UndertriAIEnvironment(Environment):
         elif isinstance(action, AssessSuretyAction):
             feasible = action.proposed_amount <= (action.income_estimate or 50000) * 3
             return (
                 f"Surety Assessment:\n"
                 f"  Proposed Amount: ₹{action.proposed_amount:,}\n"
                 f"  Accused Occupation: {action.accused_occupation}\n"
-                f"  Income Estimate: ₹{action.income_estimate:,}/month\n"
                 f"  → {'FINANCIALLY FEASIBLE ✓' if feasible else 'AMOUNT MAY BE EXCESSIVE — consider reduction'}"
             )

         elif isinstance(action, AssessSuretyAction):
             feasible = action.proposed_amount <= (action.income_estimate or 50000) * 3
+            income_str = f"₹{action.income_estimate:,}/month" if action.income_estimate is not None else "Not provided"
             return (
                 f"Surety Assessment:\n"
                 f"  Proposed Amount: ₹{action.proposed_amount:,}\n"
                 f"  Accused Occupation: {action.accused_occupation}\n"
+                f"  Income Estimate: {income_str}\n"
                 f"  → {'FINANCIALLY FEASIBLE ✓' if feasible else 'AMOUNT MAY BE EXCESSIVE — consider reduction'}"
             )

training/train_grpo.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 UndertriAI — GRPO Training Script
-Fine-tunes Qwen2.5-7B-Instruct using Group Relative Policy Optimization
 against the UndertriAI bail assessment environment.
 Run in Google Colab (T4 GPU recommended):
@@ -373,8 +373,13 @@ def reward_conditions(completions: List[str], episode_batch: List[Dict], **kwarg
                 if kw in cond_text:
                     score = min(1.0, score + 0.04)
         else:
-            # Denial should have empty conditions
-            score = 1.0 if len(conditions) == 0 else 0.5
         scores.append(min(1.0, score))
     return scores
@@ -508,12 +513,19 @@ def load_episodes(
         test   = last test_fraction
     """
     path = Path(episodes_dir) / f"episodes_stage_{stage}.jsonl"
     if not path.exists():
         path = Path(episodes_dir) / "episodes_all.jsonl"
     if not path.exists():
         raise FileNotFoundError(f"No episodes found in {episodes_dir}.")
     with open(path, encoding="utf-8") as f:
         all_eps = [json.loads(l) for l in f if l.strip()]
     n = len(all_eps)
     n_test = max(1, int(n * test_fraction))
@@ -678,14 +690,14 @@ def train(
 ):
     print("=" * 60)
     print("  UndertriAI — GRPO Training with Unsloth")
-    print(f"  Model: Qwen2.5-7B-Instruct | Stage: {stage}")
     print("=" * 60)
     # ── Load model ──────────────────────────────────────────
     from unsloth import FastLanguageModel  # type: ignore
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name   = "unsloth/Qwen2.5-7B-Instruct",
         max_seq_length = max_seq_len,
         load_in_4bit = True,
         fast_inference = False,
@@ -791,23 +803,101 @@ def train(
     model.save_pretrained(output_dir, save_adapters_only=True)
     tokenizer.save_pretrained(output_dir)
     print(f"\nModel adapters saved to {output_dir}")
     return results
 # ============================================================
 # CELL 7 — Evaluate baseline (before training)
 # ============================================================
 def evaluate_baseline(episodes_dir: str, n_samples: int = 20):
     """
-    Quick evaluation of a zero-shot Qwen2.5-7B-Instruct on bail cases.
     Run this BEFORE training to get the baseline reward curve starting point.
     """
     print("\nEvaluating zero-shot baseline...")
     from unsloth import FastLanguageModel  # type: ignore
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name   = "unsloth/Qwen2.5-7B-Instruct",
         max_seq_length = 3072,
         load_in_4bit = True,
     )
@@ -970,7 +1060,7 @@ def train_curriculum(
     # Load model once — reused across all stages
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name="unsloth/Qwen2.5-7B-Instruct",
         max_seq_length=3072,
         load_in_4bit=True,
         fast_inference=False,
@@ -1131,6 +1221,12 @@ def train_curriculum(
     }, indent=2))
     print(f"  Results saved: {results_path}")
     return stage_results
@@ -1175,7 +1271,7 @@ def train_adaptive(
     # Load model once
     model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name="unsloth/Qwen2.5-7B-Instruct",
         max_seq_length=3072,
         load_in_4bit=True,
         fast_inference=False,
@@ -1372,6 +1468,11 @@ def train_adaptive(
     tokenizer.save_pretrained(final_dir)
     print(f"  Final model saved: {final_dir}")
     return results

 """
 UndertriAI — GRPO Training Script
+Fine-tunes Qwen2.5-3B-Instruct using Group Relative Policy Optimization
 against the UndertriAI bail assessment environment.
 Run in Google Colab (T4 GPU recommended):
                 if kw in cond_text:
                     score = min(1.0, score + 0.04)
         else:
+            # Denial: empty conditions is correct ONLY when GT also denied
+            gt_outcome = ep.get("ground_truth", {}).get("outcome", "").lower()
+            gt_denied = "deni" in gt_outcome
+            if len(conditions) == 0:
+                score = 1.0 if gt_denied else 0.3  # H3: 0.3 not 1.0 when GT=granted
+            else:
+                score = 0.5  # Denied but listed conditions — inconsistent
         scores.append(min(1.0, score))
     return scores
         test   = last test_fraction
     """
     path = Path(episodes_dir) / f"episodes_stage_{stage}.jsonl"
+    use_all_fallback = False
     if not path.exists():
         path = Path(episodes_dir) / "episodes_all.jsonl"
+        use_all_fallback = True
     if not path.exists():
         raise FileNotFoundError(f"No episodes found in {episodes_dir}.")
     with open(path, encoding="utf-8") as f:
         all_eps = [json.loads(l) for l in f if l.strip()]
+    # H1: filter by curriculum_stage when falling back to episodes_all.jsonl
+    if use_all_fallback:
+        filtered = [ep for ep in all_eps if ep.get("curriculum_stage") == stage]
+        if filtered:
+            all_eps = filtered
     n = len(all_eps)
     n_test = max(1, int(n * test_fraction))
 ):
     print("=" * 60)
     print("  UndertriAI — GRPO Training with Unsloth")
+    print(f"  Model: Qwen2.5-3B-Instruct | Stage: {stage}")
     print("=" * 60)
     # ── Load model ──────────────────────────────────────────
     from unsloth import FastLanguageModel  # type: ignore
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name   = "unsloth/Qwen2.5-3B-Instruct",
         max_seq_length = max_seq_len,
         load_in_4bit = True,
         fast_inference = False,
     model.save_pretrained(output_dir, save_adapters_only=True)
     tokenizer.save_pretrained(output_dir)
     print(f"\nModel adapters saved to {output_dir}")
+    # Save training plots (C6)
+    save_training_plots(trainer.state.log_history, output_dir)
     return results
+# ============================================================
+# Plot saving utility (C6)
+# ============================================================
+def save_training_plots(log_history: list, output_dir: str) -> None:
+    """
+    Save training reward curve and loss plots.
+    Called at the end of train(), train_curriculum(), and train_adaptive().
+    """
+    try:
+        import matplotlib
+        matplotlib.use("Agg")  # Non-interactive backend
+        import matplotlib.pyplot as plt
+        import numpy as np
+    except ImportError:
+        print("[WARNING] matplotlib not installed — skipping plot generation.")
+        return
+    plots_dir = Path(output_dir) / "plots"
+    plots_dir.mkdir(parents=True, exist_ok=True)
+    # Extract reward data from training log
+    steps   = [e["step"]   for e in log_history if "reward" in e]
+    rewards = [e["reward"] for e in log_history if "reward" in e]
+    if not steps:
+        print("[WARNING] No reward data in training log — skipping plots.")
+        return
+    # Plot 1: Reward curve
+    fig, ax = plt.subplots(figsize=(10, 5))
+    fig.patch.set_facecolor("#0a0d1a")
+    ax.set_facecolor("#0a0d1a")
+    ax.plot(steps, rewards, color="#6366f1", linewidth=1.5, alpha=0.6, label="Raw")
+    if len(rewards) > 5:
+        smooth = np.convolve(rewards, np.ones(5) / 5, mode="valid")
+        ax.plot(steps[2:-2], smooth, color="#14b8a6", linewidth=2, label="Smoothed")
+    ax.set_xlabel("Training Step", color="#94a3b8")
+    ax.set_ylabel("Reward", color="#94a3b8")
+    ax.set_title("UndertriAI — Training Reward Curve", color="#e2e8f0", pad=12)
+    ax.tick_params(colors="#94a3b8")
+    ax.grid(True, alpha=0.2)
+    ax.legend(facecolor="#111827", edgecolor="#1e2d45", labelcolor="#94a3b8")
+    for spine in ax.spines.values():
+        spine.set_color("#1e2d45")
+    fig.tight_layout()
+    reward_path = plots_dir / "reward_curve.png"
+    fig.savefig(str(reward_path), dpi=150, bbox_inches="tight", facecolor="#0a0d1a")
+    plt.close(fig)
+    print(f"  Plot saved: {reward_path}")
+    # Plot 2: Loss curve (if available)
+    loss_steps  = [e["step"] for e in log_history if "loss" in e]
+    loss_values = [e["loss"] for e in log_history if "loss" in e]
+    if loss_steps:
+        fig2, ax2 = plt.subplots(figsize=(10, 5))
+        fig2.patch.set_facecolor("#0a0d1a")
+        ax2.set_facecolor("#0a0d1a")
+        ax2.plot(loss_steps, loss_values, color="#f97316", linewidth=1.5)
+        ax2.set_xlabel("Training Step", color="#94a3b8")
+        ax2.set_ylabel("Loss", color="#94a3b8")
+        ax2.set_title("UndertriAI — Training Loss", color="#e2e8f0", pad=12)
+        ax2.tick_params(colors="#94a3b8")
+        ax2.grid(True, alpha=0.2)
+        for spine in ax2.spines.values():
+            spine.set_color("#1e2d45")
+        fig2.tight_layout()
+        loss_path = plots_dir / "training_loss.png"
+        fig2.savefig(str(loss_path), dpi=150, bbox_inches="tight", facecolor="#0a0d1a")
+        plt.close(fig2)
+        print(f"  Plot saved: {loss_path}")
 # ============================================================
 # CELL 7 — Evaluate baseline (before training)
 # ============================================================
 def evaluate_baseline(episodes_dir: str, n_samples: int = 20):
     """
+    Quick evaluation of a zero-shot Qwen2.5-3B-Instruct on bail cases.
     Run this BEFORE training to get the baseline reward curve starting point.
     """
     print("\nEvaluating zero-shot baseline...")
     from unsloth import FastLanguageModel  # type: ignore
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name   = "unsloth/Qwen2.5-3B-Instruct",
         max_seq_length = 3072,
         load_in_4bit = True,
     )
     # Load model once — reused across all stages
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name="unsloth/Qwen2.5-3B-Instruct",
         max_seq_length=3072,
         load_in_4bit=True,
         fast_inference=False,
     }, indent=2))
     print(f"  Results saved: {results_path}")
+    # Save training plots (C6) — use last trainer's log
+    try:
+        save_training_plots(trainer.state.log_history, output_dir)
+    except Exception:
+        print("  [WARNING] Could not save training plots.")
     return stage_results
     # Load model once
     model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name="unsloth/Qwen2.5-3B-Instruct",
         max_seq_length=3072,
         load_in_4bit=True,
         fast_inference=False,
     tokenizer.save_pretrained(final_dir)
     print(f"  Final model saved: {final_dir}")
+    # Save training plots (C6)
+    # Build a synthetic log_history from reward_curve for adaptive mode
+    adaptive_log = [{"step": s, "reward": r} for s, r in reward_curve]
+    save_training_plots(adaptive_log, output_dir)
     return results