{ "checkpoint": "ckpt_13.pt", "model": "GPT2-S", "training_step": 10000, "instability_type": "Slow divergence", "learning_rate": "6e-4", "decay": "0.1", "warm": "2000", "data_type": "FP8_with_FP8_head" }