ScottzillaSystems committed on
Commit
c343cc2
·
verified ·
1 Parent(s): 6b06c9e

Upload self_healing/__init__.py

Browse files
Files changed (1) hide show
  1. self_healing/__init__.py +50 -0
self_healing/__init__.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Self-Healing Training System (SHTS)
===================================

A fully autonomous self-healing layer for Hugging Face TRL trainers.

Architecture:
    1. DETECTION     — SelfHealingCallback monitors loss, gradients, OOM, memory
    2. DIAGNOSIS     — Root-cause classifier: NaN/divergence/OOM/data/API errors
    3. RECOVERY      — HealingActions applies fixes: rollback, reduce LR, halve batch
    4. ORCHESTRATION — SelfHealingTrainer retry loop with state persistence

Based on:
    - Unicron (arxiv:2401.00134): Cost-aware self-healing at cluster scale
    - ZClip (arxiv:2504.02507): Z-score adaptive gradient clipping
    - PTT (post-training-toolkit): DiagnosticsCallback + postmortem pattern
    - Pioneer Agent (arxiv:2604.09791): Structured decision tree for iteration
    - Deep Researcher (arxiv:2604.05854): Dry-run validation pattern

Usage:
    The snippet assumes ``model``, ``training_args``, ``ds`` and ``tok`` are
    already defined by the caller.

        from self_healing import SelfHealingTrainer, HealingConfig
        from trl import SFTTrainer, SFTConfig

        trainer = SFTTrainer(model=model, args=training_args, train_dataset=ds, tokenizer=tok)
        sh = SelfHealingTrainer(trainer, HealingConfig())
        sh.train()

    NOTE(review): some TRL releases expect ``processing_class=`` rather than
    ``tokenizer=`` on ``SFTTrainer`` — confirm against the TRL version in use.

Author: Autonomous ML Intern
"""

# Re-export the package's public names from the `.core` submodule so callers
# can import them directly from `self_healing`.
from .core import (
    FAILURE_RECIPES,
    FailureType,
    HealingActions,
    HealingConfig,
    SelfHealingCallback,
    SelfHealingTrainer,
    ZClip,
)

# Package version string.
__version__ = "1.0.0"

# Names exposed by `from self_healing import *`; one entry per name imported
# from `.core` above.
__all__ = [
    "HealingConfig",
    "SelfHealingCallback",
    "HealingActions",
    "SelfHealingTrainer",
    "ZClip",
    "FailureType",
    "FAILURE_RECIPES",
]