ScottzillaSystems
/

self-healing-training

Model card Files Files and versions

xet

Community

ScottzillaSystems commited on 7 days ago

Commit

fa60c5e

verified ·

1 Parent(s): cf520c9

Upload examples/demo_sft_self_healing.py

Browse files

Files changed (1) hide show

examples/demo_sft_self_healing.py +121 -0

examples/demo_sft_self_healing.py ADDED Viewed

	@@ -0,0 +1,121 @@

+#!/usr/bin/env python3
+"""
+Demo: Self-Healing SFT Training
+===============================
+Loads Qwen2.5-0.5B, trains on Capybara dataset with full self-healing.
+Usage:
+    python demo_sft_self_healing.py
+Requirements:
+    pip install transformers trl datasets torch self-healing-training
+"""
+import os, sys, json, time
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from self_healing import SelfHealingTrainer, HealingConfig
+def main():
+    print("\n" + "=" * 60)
+    print("  SELF-HEALING SFT TRAINING DEMO")
+    print("=" * 60 + "\n")
+    # Model
+    model_id = "Qwen/Qwen2.5-0.5B"
+    print(f"[1/4] Loading model: {model_id}")
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Dataset
+    print("[2/4] Loading dataset: trl-lib/Capybara")
+    dataset = load_dataset("trl-lib/Capybara", split="train[:2000]")
+    # Config
+    from trl import SFTConfig, SFTTrainer
+    training_args = SFTConfig(
+        output_dir="./sft-output",
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        learning_rate=2e-5,
+        max_steps=200,
+        logging_steps=10,
+        logging_strategy="steps",
+        logging_first_step=True,
+        save_steps=500,
+        bf16=True,
+        report_to="none",  # Set to "trackio" for live monitoring
+        run_name="selfheal-sft-demo",
+        disable_tqdm=True,
+    )
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset,
+        tokenizer=tokenizer,
+    )
+    # Self-healing wrapper
+    print("[3/4] Wrapping with SelfHealingTrainer...")
+    healing_config = HealingConfig(
+        nan_patience=3,
+        loss_spike_factor=5.0,
+        divergence_patience=50,
+        max_recovery_attempts=5,
+        max_lr_reductions=3,
+        max_batch_reductions=2,
+        zclip_enabled=True,
+        zclip_z_threshold=3.0,
+        postmortem_path="./sft-postmortem.json",
+    )
+    sh_trainer = SelfHealingTrainer(trainer, healing_config)
+    # Dry-run validation
+    try:
+        sh_trainer.dry_run(num_steps=2)
+        print("  ✓ Dry-run passed!\n")
+    except Exception as e:
+        print(f"  ✗ Dry-run failed: {e}")
+        sys.exit(1)
+    # Train
+    print("[4/4] Training with self-healing...\n")
+    result = sh_trainer.train()
+    # Report
+    print("\n" + "=" * 60)
+    print("  DEMO COMPLETE")
+    print("=" * 60)
+    report = sh_trainer.get_report()
+    print(f"  Converged:     {report['converged']}")
+    print(f"  Attempts:      {report['attempts']}")
+    print(f"  Recoveries:    {report['total_recoveries']}")
+    print(f"  ZClip clips:   {report['zclip_total_clips']}")
+    print(f"  NaN count:     {report['nan_count']}")
+    print(f"  LR reductions: {report['lr_reductions']}")
+    if report["recovery_history"]:
+        print("\n  Recovery log:")
+        for i, rec in enumerate(report["recovery_history"]):
+            print(f"    [{i+1}] {rec['failure']}: {rec['actions']}")
+    # Save postmortem
+    if os.path.exists(healing_config.postmortem_path):
+        with open(healing_config.postmortem_path) as f:
+            pm = json.load(f)
+        print(f"\n  Postmortem: {pm.get('exit_reason', 'unknown')} "
+              f"at step {pm.get('last_step', '?')}")
+if __name__ == "__main__":
+    main()