"""
Self-Healing Training System (SHTS)
===================================

A fully autonomous self-healing layer for Hugging Face TRL trainers.

Architecture:
    1. DETECTION — SelfHealingCallback monitors loss, gradients, OOM, memory
    2. DIAGNOSIS — Root-cause classifier: NaN/divergence/OOM/data/API errors
    3. RECOVERY — HealingActions applies fixes: rollback, reduce LR, halve batch
    4. ORCHESTRATION — SelfHealingTrainer retry loop with state persistence

Based on:
    - Unicron (arxiv:2401.00134): Cost-aware self-healing at cluster scale
    - ZClip (arxiv:2504.02507): Z-score adaptive gradient clipping
    - PTT (post-training-toolkit): DiagnosticsCallback + postmortem pattern
    - Pioneer Agent (arxiv:2604.09791): Structured decision tree for iteration
    - Deep Researcher (arxiv:2604.05854): Dry-run validation pattern

Usage:
    from self_healing import SelfHealingTrainer, HealingConfig
    from trl import SFTTrainer, SFTConfig

    training_args = SFTConfig(output_dir="outputs")
    trainer = SFTTrainer(model=model, args=training_args, train_dataset=ds, tokenizer=tok)
    sh = SelfHealingTrainer(trainer, HealingConfig())
    sh.train()

Author: Autonomous ML Intern
"""

# Re-export the public API from the implementation module so callers can
# import these names directly from the package root.
from .core import (
    HealingConfig,
    SelfHealingCallback,
    HealingActions,
    SelfHealingTrainer,
    ZClip,
    FailureType,
    FAILURE_RECIPES,
)

# Package version string.
__version__ = "1.0.0"

# Names exposed by a wildcard import of this package; kept in the same order
# as the import list above.
__all__ = [
    "HealingConfig",
    "SelfHealingCallback",
    "HealingActions",
    "SelfHealingTrainer",
    "ZClip",
    "FailureType",
    "FAILURE_RECIPES",
]