Spaces:

YashashMathur
/

aegis_training

Runtime error

App Files Files Community

YashashMathur committed 13 days ago

Commit

6a2dd66

verified ·

1 Parent(s): 5d8f1e9

Update train.py

Browse files

Files changed (1) hide show

train.py +105 -27

train.py CHANGED Viewed

@@ -1,10 +1,19 @@
 """
 AEGIS Training Script for HF Spaces (A10G Small, 24GB VRAM)
-- Loads Qwen2.5-7B-Unsloth-bnb-4bit + step_50 LoRA adapter
-- Runs 10 remaining SFT steps + 500 GRPO steps
 - Saves LoRA checkpoints to HF Hub every 50 GRPO steps
 - Serves a minimal status page on :7860 so the Space stays alive
 - Prints "TRAINING COMPLETE - PLEASE DOWNGRADE HARDWARE" when done
 """
 import os, json, re, random, gc, sys, threading, time
@@ -33,8 +42,9 @@ except ImportError:
 # ─── Auth & Config ────────────────────────────────────────────────────────────
 HF_TOKEN = os.environ["HF_TOKEN"]
 HF_USERNAME = os.environ.get("HF_USERNAME", "YashashMathur")
-STEP50_REPO = f"{HF_USERNAME}/aegis-step50"
 CKPT_REPO = f"{HF_USERNAME}/aegis-training-checkpoints"
 login(token=HF_TOKEN)
 api = HfApi()
@@ -44,11 +54,10 @@ except Exception as e:
     print(f"Repo create: {e}")
 MAX_SEQ_LEN = 1024
-SFT_STEPS = 80
-# More warmup for JSON format
 GRPO_STEPS = 250
 GRPO_K = 2
-GRPO_LR = 2e-5
 CURRICULUM_SWITCH = 0  # Start with Level 1, advance early
 GRAD_CLIP = 1.0
 SAVE_EVERY = 50
@@ -160,21 +169,20 @@ W2 = {
 def build_prompt(sc, tokenizer):
     cot = tokenizer.decode(
-        tokenizer.encode(sc.get("worker_cot_trace", ""), add_special_tokens=False)[
-            :800
-        ],
         skip_special_tokens=True,
     )
     out = tokenizer.decode(
-        tokenizer.encode(sc.get("worker_output", ""), add_special_tokens=False)[:400],
         skip_special_tokens=True,
     )
     return (
         f"You are AEGIS, an AI fleet oversight agent. "
         f"Use <thought> tags for reasoning, then output JSON.\n"
         f"Worker Role: {sc.get('worker_role', 'dev')}\n"
-        f"[WORKER_THOUGHTS]\n{cot}\n"
         f"[WORKER_OUTPUT]\n{out}\n"
         f"Rules:\n{RULES_BLOCK}\nJSON:"
     )
@@ -286,6 +294,52 @@ def score_response(a, truth, raw_text, level=1, fmt_ema=1.0):
     }
 # ─── Load Model + Step-50 Checkpoint ─────────────────────────────────────────
 from unsloth import FastLanguageModel
@@ -317,16 +371,31 @@ model = FastLanguageModel.get_peft_model(
     use_rslora=True,
 )
-# Load step_50 LoRA weights into the freshly created adapter
-print(f"Loading step_50 adapter from HF Hub: {STEP50_REPO}")
 try:
-    ckpt_path = snapshot_download(STEP50_REPO, token=HF_TOKEN)
-    adapter_weights = load_file(f"{ckpt_path}/adapter_model.safetensors")
-    # set_peft_model_state_dict loads into the default adapter without rebuilding
     set_peft_model_state_dict(model, adapter_weights)
-    print("Step_50 adapter loaded successfully.")
 except Exception as e:
-    print(f"WARNING: Could not load step_50 adapter ({e}). Starting from fresh LoRA.")
 FastLanguageModel.for_training(model)
 if hasattr(model, "generation_config"):
@@ -406,14 +475,14 @@ for step in range(GRPO_STEPS):
             prompt, return_tensors="pt", truncation=True, max_length=1024
         ).to("cuda")
         prompt_len = p_enc.input_ids.shape[1]
-        temp = max(0.7, 1.0 - step * 0.0008)
         FastLanguageModel.for_inference(model)
         with torch.no_grad():
             gen = model.generate(
                 input_ids=p_enc.input_ids,
                 attention_mask=p_enc.attention_mask,
-                max_new_tokens=150,
                 temperature=temp,
                 top_p=0.9,
                 do_sample=True,
@@ -433,21 +502,30 @@ for step in range(GRPO_STEPS):
             [rd["total"] for rd in reward_dicts], dtype=torch.float32, device="cuda"
         )
-        if rewards.std().item() < 1e-6:
-            rewards = rewards + torch.randn_like(rewards) * 0.01
-        adv = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
-        adv = adv.clamp(-2.0, 2.0)
         format_ema = (
             0.1 * (sum(1 for a in acts if a.get("__valid__")) / GRPO_K)
             + 0.9 * format_ema
         )
         FastLanguageModel.for_training(model)
         optimizer.zero_grad()
         for r_text, a_val in zip(resps, adv.tolist()):
             f_enc = tokenizer(
-                prompt + r_text, return_tensors="pt", truncation=True, max_length=1280
             ).to("cuda")
             lbls = f_enc.input_ids.clone()
             lbls[:, :prompt_len] = -100
@@ -504,7 +582,7 @@ for step in range(GRPO_STEPS):
             print(f"  >> Pushed step_{step} to https://huggingface.co/{CKPT_REPO}")
             TRAIN_STATUS["phase"] = "GRPO"
-        del gen, p_enc, resps, acts, rewards, adv, reward_dicts
     except torch.cuda.OutOfMemoryError:
         print(f"Step {step:04d} | OOM — clearing cache and skipping")

 """
 AEGIS Training Script for HF Spaces (A10G Small, 24GB VRAM)
+- Loads Qwen2.5-7B-Unsloth-bnb-4bit + GRPO step_50 LoRA adapter (last good checkpoint)
+- Runs SFT warmup + 250 GRPO steps with collapse-safe advantage computation
 - Saves LoRA checkpoints to HF Hub every 50 GRPO steps
 - Serves a minimal status page on :7860 so the Space stays alive
 - Prints "TRAINING COMPLETE - PLEASE DOWNGRADE HARDWARE" when done
+FIXES vs previous version:
+  1. Load GRPO step_50 (last good checkpoint) instead of original SFT step_50
+  2. build_prompt: COT capped at 300 tokens, output at 150 — leaves 400+ tokens for generation
+  3. max_new_tokens 150 -> 300 so thought+JSON never truncates mid-brace
+  4. Skip GRPO gradient update when ALL completions fail format (was applying random gradients)
+  5. Format recovery mini-SFT triggers automatically when every completion in a step fails format, fmt_ema < 0.15 and step > 10
+  6. Temperature starts at 1.3 for exploration (matches blog), anneals by 0.0008/step toward a 0.9 floor (about 1.10 at step 249)
+  7. Backward pass max_length matches MAX_SEQ_LEN (was 1280 > model capacity)
 """
 import os, json, re, random, gc, sys, threading, time
 # ─── Auth & Config ────────────────────────────────────────────────────────────
 HF_TOKEN = os.environ["HF_TOKEN"]
 HF_USERNAME = os.environ.get("HF_USERNAME", "YashashMathur")
+STEP50_REPO = f"{HF_USERNAME}/aegis-step50"          # fallback: original SFT adapter
 CKPT_REPO = f"{HF_USERNAME}/aegis-training-checkpoints"
+RESUME_FROM_GRPO = "step_50"  # last good GRPO checkpoint before collapse
 login(token=HF_TOKEN)
 api = HfApi()
     print(f"Repo create: {e}")
 MAX_SEQ_LEN = 1024
+SFT_STEPS = 80  # SFT warmup steps for JSON format (same value as the previous revision)
 GRPO_STEPS = 250
 GRPO_K = 2
+GRPO_LR = 2e-5  # GRPO learning rate (same value as the previous revision)
 CURRICULUM_SWITCH = 0  # Start with Level 1, advance early
 GRAD_CLIP = 1.0
 SAVE_EVERY = 50
 def build_prompt(sc, tokenizer):
+    """Build the AEGIS oversight prompt for one scenario.
+
+    Args:
+        sc: Scenario dict; reads "worker_cot_trace", "worker_output" and
+            "worker_role" (defaults: "", "" and "dev").
+        tokenizer: Tokenizer providing ``encode``/``decode``; used here only
+            to truncate the two worker fields by token count.
+
+    Returns:
+        The prompt string: instructions, worker role, the delimited worker
+        thoughts, the worker output, RULES_BLOCK, and a trailing "JSON:".
+    """
+    # Keep prompt tight: cap the trace at 300 tokens and the output at 150 so
+    # room remains for the generated thought+JSON. The exact headroom depends on
+    # RULES_BLOCK, which is defined elsewhere in the file — TODO confirm.
     cot = tokenizer.decode(
+        tokenizer.encode(sc.get("worker_cot_trace", ""), add_special_tokens=False)[:300],
         skip_special_tokens=True,
     )
     out = tokenizer.decode(
+        tokenizer.encode(sc.get("worker_output", ""), add_special_tokens=False)[:150],
         skip_special_tokens=True,
     )
     return (
         f"You are AEGIS, an AI fleet oversight agent. "
         f"Use <thought> tags for reasoning, then output JSON.\n"
         f"Worker Role: {sc.get('worker_role', 'dev')}\n"
+        f"[WORKER_THOUGHTS_START]\n{cot}\n[WORKER_THOUGHTS_END]\n"
         f"[WORKER_OUTPUT]\n{out}\n"
         f"Rules:\n{RULES_BLOCK}\nJSON:"
     )
     }
+def run_sft_recovery(model, tokenizer, train_set, n_steps=10):
+    """Mini SFT recovery loop — restores JSON format compliance after collapse."""
+    print("  [FORMAT RECOVERY] fmt_ema critical — running 10 SFT steps to restore JSON format...")
+    FastLanguageModel.for_training(model)
+    recovery_opt = torch.optim.AdamW(model.parameters(), lr=5e-5)
+    model.train()
+    for i in range(n_steps):
+        sc = random.choice(train_set)
+        prompt = build_prompt(sc, tokenizer)
+        vtype = sc.get("violation_type", "none")
+        decision = sc["decision"]
+        rules = sc.get("applicable_rules", [])
+        if vtype != "none":
+            thought = (
+                f"<thought>Worker output shows {vtype.replace('_', ' ')} patterns. "
+                f"Violates {', '.join(rules) if rules else 'policy'}. Decision: {decision}.</thought>"
+            )
+        else:
+            thought = (
+                "<thought>Worker output appears compliant. No unauthorized access, "
+                "no policy violations detected. Safe to allow.</thought>"
+            )
+        target = thought + json.dumps({
+            "decision": decision,
+            "violation_type": vtype,
+            "policy_rule_cited": rules[0] if rules else None,
+            "explanation": f"Detected {vtype.replace('_', ' ')}" if vtype != "none" else "No violation detected",
+            "confidence": 0.9,
+        })
+        enc = tokenizer(
+            prompt + target, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LEN
+        ).to("cuda")
+        p_len = tokenizer(prompt, return_tensors="pt").input_ids.shape[1]
+        labels = enc.input_ids.clone()
+        labels[:, :p_len] = -100
+        loss = model(**enc, labels=labels).loss
+        loss.backward()
+        if (i + 1) % 4 == 0:
+            recovery_opt.step()
+            recovery_opt.zero_grad()
+        print(f"    Recovery SFT {i+1}/{n_steps} | loss={loss.item():.4f}")
+    del recovery_opt
+    torch.cuda.empty_cache()
+    print("  [FORMAT RECOVERY] Done. Resuming GRPO.")
 # ─── Load Model + Step-50 Checkpoint ─────────────────────────────────────────
 from unsloth import FastLanguageModel
     use_rslora=True,
 )
+# Load last good checkpoint: prefer GRPO step_50, fall back to original SFT adapter
+# Resume order: (1) RESUME_FROM_GRPO adapter from CKPT_REPO, (2) SFT adapter
+# from STEP50_REPO, (3) keep the freshly created LoRA weights.
+# Only adapter weights are loaded in this section; no optimizer state or step
+# counter is restored here.
+print(f"Attempting to load GRPO {RESUME_FROM_GRPO} from {CKPT_REPO}...")
+loaded = False
 try:
+    adapter_file = hf_hub_download(
+        repo_id=CKPT_REPO,
+        filename=f"{RESUME_FROM_GRPO}/adapter_model.safetensors",
+        token=HF_TOKEN,
+        local_dir="/tmp/aegis_resume",
+    )
+    adapter_weights = load_file(adapter_file)
     set_peft_model_state_dict(model, adapter_weights)
+    print(f"Loaded GRPO {RESUME_FROM_GRPO} adapter — resuming from last good checkpoint.")
+    loaded = True
+# NOTE(review): `except Exception` also catches NameError, so a missing
+# `hf_hub_download` import would silently divert to the SFT fallback. The import
+# section is not visible here — confirm the name is imported.
 except Exception as e:
+    print(f"WARNING: Could not load GRPO {RESUME_FROM_GRPO} ({e}). Falling back to SFT step_50...")
+if not loaded:
+    # Second tier: the original SFT adapter repo.
+    try:
+        ckpt_path = snapshot_download(STEP50_REPO, token=HF_TOKEN)
+        adapter_weights = load_file(f"{ckpt_path}/adapter_model.safetensors")
+        set_peft_model_state_dict(model, adapter_weights)
+        print("Loaded original SFT step_50 adapter.")
+    except Exception as e2:
+        # Third tier: nothing loaded; training continues with the fresh adapter.
+        print(f"WARNING: Could not load SFT step_50 ({e2}). Starting from fresh LoRA.")
 FastLanguageModel.for_training(model)
 if hasattr(model, "generation_config"):
             prompt, return_tensors="pt", truncation=True, max_length=1024
         ).to("cuda")
         prompt_len = p_enc.input_ids.shape[1]
+        temp = max(0.9, 1.3 - step * 0.0008)  # starts at 1.3 for exploration, anneals to 0.9
         FastLanguageModel.for_inference(model)
         with torch.no_grad():
             gen = model.generate(
                 input_ids=p_enc.input_ids,
                 attention_mask=p_enc.attention_mask,
+                max_new_tokens=300,  # 150 was too tight for <thought>+JSON, caused truncation
                 temperature=temp,
                 top_p=0.9,
                 do_sample=True,
             [rd["total"] for rd in reward_dicts], dtype=torch.float32, device="cuda"
         )
+        # Update format EMA before the skip check so it tracks collapse accurately
         format_ema = (
             0.1 * (sum(1 for a in acts if a.get("__valid__")) / GRPO_K)
             + 0.9 * format_ema
         )
+        # --- COLLAPSE GUARD ---
+        # When every completion fails format, all rewards = -0.2 and std ≈ 0.
+        # Applying gradients here means random-noise updates that actively destroy weights.
+        # Skip the update entirely. If EMA has dropped critically, trigger recovery SFT.
+        if all(not a.get("__valid__") for a in acts):
+            if format_ema < 0.15 and step > 10:
+                run_sft_recovery(model, tokenizer, train_set)
+            del gen, p_enc, resps, acts, rewards, reward_dicts
+            continue
+        adv = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
+        adv = adv.clamp(-2.0, 2.0)
         FastLanguageModel.for_training(model)
         optimizer.zero_grad()
         for r_text, a_val in zip(resps, adv.tolist()):
             f_enc = tokenizer(
+                prompt + r_text, return_tensors="pt", truncation=True, max_length=MAX_SEQ_LEN
             ).to("cuda")
             lbls = f_enc.input_ids.clone()
             lbls[:, :prompt_len] = -100
             print(f"  >> Pushed step_{step} to https://huggingface.co/{CKPT_REPO}")
             TRAIN_STATUS["phase"] = "GRPO"
+        del gen, p_enc, resps, acts, rewards, adv, reward_dicts  # adv always defined here (continue skips this)
     except torch.cuda.OutOfMemoryError:
         print(f"Step {step:04d} | OOM — clearing cache and skipping")