Draken1606 committed on
Commit
37edd09
·
1 Parent(s): c1adced

Fix 5 bugs: inference mode reset, step_counts in curriculum, adapter-only save (x3), DEMO001 false defence claim, episode_id in /reset

Browse files
Files changed (3) hide show
  1. server/app.py +2 -2
  2. server/dataset.py +1 -1
  3. training/train_grpo.py +19 -10
server/app.py CHANGED
@@ -69,12 +69,12 @@ def health():
69
 
70
 
71
  @app.post("/reset")
72
- def reset(stage: int = 1, session_id: str = None, seed: int = None):
73
  if session_id is None:
74
  session_id = str(uuid.uuid4())
75
  env = get_or_create_env(session_id)
76
  env.set_stage(stage)
77
- obs = env.reset(stage=stage, seed=seed)
78
  return {
79
  "session_id": session_id,
80
  "observation": obs.model_dump(),
 
69
 
70
 
71
  @app.post("/reset")
72
+ def reset(stage: int = 1, session_id: str = None, seed: int = None, episode_id: str = None):
73
  if session_id is None:
74
  session_id = str(uuid.uuid4())
75
  env = get_or_create_env(session_id)
76
  env.set_stage(stage)
77
+ obs = env.reset(stage=stage, seed=seed, episode_id=episode_id)
78
  return {
79
  "session_id": session_id,
80
  "observation": obs.model_dump(),
server/dataset.py CHANGED
@@ -74,7 +74,7 @@ class BailDataset:
74
  "Investigation is still pending and accused may tamper with evidence.",
75
  ],
76
  "defence_arguments": [
77
- "Accused has been in custody for 8 months on a 7-year max offence — already served more than half the equivalent.",
78
  "No prior criminal record. Permanent resident of Delhi with family ties.",
79
  "No evidence of flight risk or evidence tampering.",
80
  ],
 
74
  "Investigation is still pending and accused may tamper with evidence.",
75
  ],
76
  "defence_arguments": [
77
+ "Accused has been in custody for 8 months; threshold under BNSS 479 for a 7-year offence is 42 months not yet met. Bail is sought on community ties and clean record, not statutory default.",
78
  "No prior criminal record. Permanent resident of Delhi with family ties.",
79
  "No evidence of flight risk or evidence tampering.",
80
  ],
training/train_grpo.py CHANGED
@@ -656,10 +656,10 @@ def train(
656
  results_path.write_text(json.dumps(results, indent=2))
657
  print(f"\nResults saved to {results_path}")
658
 
659
- # ── Save model ────────────────────────────────────────────
660
- model.save_pretrained(output_dir)
661
  tokenizer.save_pretrained(output_dir)
662
- print(f"\nModel saved to {output_dir}")
663
  return results
664
 
665
 
@@ -904,7 +904,10 @@ def train_curriculum(
904
 
905
  def reward_fn(completions: List[str], episode: List[str], **kwargs) -> List[float]:
906
  ep_objs = [json.loads(e) for e in episode]
907
- return combined_reward(completions, ep_objs)
 
 
 
908
 
909
  stage_output = f"{output_dir}/stage_{stage}"
910
  config = GRPOConfig(
@@ -924,6 +927,11 @@ def train_curriculum(
924
  remove_unused_columns=False,
925
  )
926
 
 
 
 
 
 
927
  trainer = GRPOTrainer(
928
  model=model,
929
  processing_class=tokenizer,
@@ -963,10 +971,11 @@ def train_curriculum(
963
  print(f" ✗ Stage {stage} below threshold ({post_reward:.2f} < {threshold:.2f})")
964
  print(f" → Continuing to next stage anyway (curriculum mode)")
965
 
966
- # Save checkpoint after each stage
967
- model.save_pretrained(stage_output)
 
968
  tokenizer.save_pretrained(stage_output)
969
- print(f" Checkpoint saved: {stage_output}")
970
 
971
  # ── Final summary ──
972
  print(f"\n{'═' * 60}")
@@ -978,11 +987,11 @@ def train_curriculum(
978
  f"(Δ = {r['delta']:+.4f})")
979
  print(f" Total traces harvested: {len(accumulated_traces)}")
980
 
981
- # Save final model
982
  final_dir = f"{output_dir}/final"
983
- model.save_pretrained(final_dir)
984
  tokenizer.save_pretrained(final_dir)
985
- print(f"\n Final model saved: {final_dir}")
986
 
987
  # Save results
988
  results_path = Path(output_dir) / "curriculum_results.json"
 
656
  results_path.write_text(json.dumps(results, indent=2))
657
  print(f"\nResults saved to {results_path}")
658
 
659
+ # Save LoRA adapters only — safe for 4-bit quantized models
660
+ model.save_pretrained(output_dir, save_adapters_only=True)
661
  tokenizer.save_pretrained(output_dir)
662
+ print(f"\nModel adapters saved to {output_dir}")
663
  return results
664
 
665
 
 
904
 
905
  def reward_fn(completions: List[str], episode: List[str], **kwargs) -> List[float]:
906
  ep_objs = [json.loads(e) for e in episode]
907
+ # Pass step_count=1 for curriculum training (single-shot XML, no multi-step env loop)
908
+ # This keeps efficiency contribution honest rather than silently 0.0
909
+ step_counts = [1] * len(completions)
910
+ return combined_reward(completions, ep_objs, step_counts=step_counts)
911
 
912
  stage_output = f"{output_dir}/stage_{stage}"
913
  config = GRPOConfig(
 
927
  remove_unused_columns=False,
928
  )
929
 
930
+ # ── Switch model back to training mode before trainer.train() ──
931
+ # evaluate_on_stage calls FastLanguageModel.for_inference(model);
932
+ # without this reset, stages 2-4 train in inference mode silently.
933
+ FastLanguageModel.for_training(model)
934
+
935
  trainer = GRPOTrainer(
936
  model=model,
937
  processing_class=tokenizer,
 
971
  print(f" ✗ Stage {stage} below threshold ({post_reward:.2f} < {threshold:.2f})")
972
  print(f" → Continuing to next stage anyway (curriculum mode)")
973
 
974
+ # Save LoRA adapters only — safe for 4-bit models (save_pretrained_merged
975
+ # requires a full merge which can OOM on T4)
976
+ model.save_pretrained(stage_output, save_adapters_only=True)
977
  tokenizer.save_pretrained(stage_output)
978
+ print(f" Checkpoint saved (adapters): {stage_output}")
979
 
980
  # ── Final summary ──
981
  print(f"\n{'═' * 60}")
 
987
  f"(Δ = {r['delta']:+.4f})")
988
  print(f" Total traces harvested: {len(accumulated_traces)}")
989
 
990
+ # Save final model (adapters only — merge separately if needed)
991
  final_dir = f"{output_dir}/final"
992
+ model.save_pretrained(final_dir, save_adapters_only=True)
993
  tokenizer.save_pretrained(final_dir)
994
+ print(f"\n Final model saved (adapters): {final_dir}")
995
 
996
  # Save results
997
  results_path = Path(output_dir) / "curriculum_results.json"