Training checkpoint at step 34000
Browse files- trainer_state.json +186 -6
trainer_state.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"best_global_step":
|
| 3 |
-
"best_metric": 2.
|
| 4 |
-
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 100,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -12068,6 +12068,186 @@
|
|
| 12068 |
"eval_samples_per_second": 2.47,
|
| 12069 |
"eval_steps_per_second": 1.235,
|
| 12070 |
"step": 33500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12071 |
}
|
| 12072 |
],
|
| 12073 |
"logging_steps": 25,
|
|
@@ -12087,7 +12267,7 @@
|
|
| 12087 |
"attributes": {}
|
| 12088 |
}
|
| 12089 |
},
|
| 12090 |
-
"total_flos": 7.
|
| 12091 |
"train_batch_size": 1,
|
| 12092 |
"trial_name": null,
|
| 12093 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
+
"best_global_step": 34000,
|
| 3 |
+
"best_metric": 2.479717493057251,
|
| 4 |
+
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-34000",
|
| 5 |
+
"epoch": 0.68,
|
| 6 |
"eval_steps": 100,
|
| 7 |
+
"global_step": 34000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 12068 |
"eval_samples_per_second": 2.47,
|
| 12069 |
"eval_steps_per_second": 1.235,
|
| 12070 |
"step": 33500
|
| 12071 |
+
},
|
| 12072 |
+
{
|
| 12073 |
+
"epoch": 0.6705,
|
| 12074 |
+
"grad_norm": 1.441673290204565,
|
| 12075 |
+
"learning_rate": 3.6613333333333334e-06,
|
| 12076 |
+
"loss": 2.4663,
|
| 12077 |
+
"step": 33525
|
| 12078 |
+
},
|
| 12079 |
+
{
|
| 12080 |
+
"epoch": 0.671,
|
| 12081 |
+
"grad_norm": 1.5853407892539593,
|
| 12082 |
+
"learning_rate": 3.6557777777777782e-06,
|
| 12083 |
+
"loss": 2.4595,
|
| 12084 |
+
"step": 33550
|
| 12085 |
+
},
|
| 12086 |
+
{
|
| 12087 |
+
"epoch": 0.6715,
|
| 12088 |
+
"grad_norm": 1.5822227773393136,
|
| 12089 |
+
"learning_rate": 3.6502222222222226e-06,
|
| 12090 |
+
"loss": 2.4752,
|
| 12091 |
+
"step": 33575
|
| 12092 |
+
},
|
| 12093 |
+
{
|
| 12094 |
+
"epoch": 0.672,
|
| 12095 |
+
"grad_norm": 1.4559128373290389,
|
| 12096 |
+
"learning_rate": 3.644666666666667e-06,
|
| 12097 |
+
"loss": 2.4839,
|
| 12098 |
+
"step": 33600
|
| 12099 |
+
},
|
| 12100 |
+
{
|
| 12101 |
+
"epoch": 0.672,
|
| 12102 |
+
"eval_loss": 2.480393648147583,
|
| 12103 |
+
"eval_runtime": 43.6412,
|
| 12104 |
+
"eval_samples_per_second": 2.383,
|
| 12105 |
+
"eval_steps_per_second": 1.192,
|
| 12106 |
+
"step": 33600
|
| 12107 |
+
},
|
| 12108 |
+
{
|
| 12109 |
+
"epoch": 0.6725,
|
| 12110 |
+
"grad_norm": 1.532302660160229,
|
| 12111 |
+
"learning_rate": 3.6391111111111114e-06,
|
| 12112 |
+
"loss": 2.4842,
|
| 12113 |
+
"step": 33625
|
| 12114 |
+
},
|
| 12115 |
+
{
|
| 12116 |
+
"epoch": 0.673,
|
| 12117 |
+
"grad_norm": 1.8197828444753166,
|
| 12118 |
+
"learning_rate": 3.633555555555556e-06,
|
| 12119 |
+
"loss": 2.4769,
|
| 12120 |
+
"step": 33650
|
| 12121 |
+
},
|
| 12122 |
+
{
|
| 12123 |
+
"epoch": 0.6735,
|
| 12124 |
+
"grad_norm": 1.5577840126586067,
|
| 12125 |
+
"learning_rate": 3.6280000000000002e-06,
|
| 12126 |
+
"loss": 2.4755,
|
| 12127 |
+
"step": 33675
|
| 12128 |
+
},
|
| 12129 |
+
{
|
| 12130 |
+
"epoch": 0.674,
|
| 12131 |
+
"grad_norm": 1.8234890351489574,
|
| 12132 |
+
"learning_rate": 3.6224444444444447e-06,
|
| 12133 |
+
"loss": 2.4763,
|
| 12134 |
+
"step": 33700
|
| 12135 |
+
},
|
| 12136 |
+
{
|
| 12137 |
+
"epoch": 0.674,
|
| 12138 |
+
"eval_loss": 2.480318546295166,
|
| 12139 |
+
"eval_runtime": 42.0675,
|
| 12140 |
+
"eval_samples_per_second": 2.472,
|
| 12141 |
+
"eval_steps_per_second": 1.236,
|
| 12142 |
+
"step": 33700
|
| 12143 |
+
},
|
| 12144 |
+
{
|
| 12145 |
+
"epoch": 0.6745,
|
| 12146 |
+
"grad_norm": 1.580558693036695,
|
| 12147 |
+
"learning_rate": 3.616888888888889e-06,
|
| 12148 |
+
"loss": 2.4674,
|
| 12149 |
+
"step": 33725
|
| 12150 |
+
},
|
| 12151 |
+
{
|
| 12152 |
+
"epoch": 0.675,
|
| 12153 |
+
"grad_norm": 1.8364115704619381,
|
| 12154 |
+
"learning_rate": 3.6113333333333335e-06,
|
| 12155 |
+
"loss": 2.468,
|
| 12156 |
+
"step": 33750
|
| 12157 |
+
},
|
| 12158 |
+
{
|
| 12159 |
+
"epoch": 0.6755,
|
| 12160 |
+
"grad_norm": 1.441698909778923,
|
| 12161 |
+
"learning_rate": 3.605777777777778e-06,
|
| 12162 |
+
"loss": 2.4705,
|
| 12163 |
+
"step": 33775
|
| 12164 |
+
},
|
| 12165 |
+
{
|
| 12166 |
+
"epoch": 0.676,
|
| 12167 |
+
"grad_norm": 1.6076672764771844,
|
| 12168 |
+
"learning_rate": 3.6002222222222227e-06,
|
| 12169 |
+
"loss": 2.4667,
|
| 12170 |
+
"step": 33800
|
| 12171 |
+
},
|
| 12172 |
+
{
|
| 12173 |
+
"epoch": 0.676,
|
| 12174 |
+
"eval_loss": 2.480243444442749,
|
| 12175 |
+
"eval_runtime": 42.1706,
|
| 12176 |
+
"eval_samples_per_second": 2.466,
|
| 12177 |
+
"eval_steps_per_second": 1.233,
|
| 12178 |
+
"step": 33800
|
| 12179 |
+
},
|
| 12180 |
+
{
|
| 12181 |
+
"epoch": 0.6765,
|
| 12182 |
+
"grad_norm": 1.449175055825869,
|
| 12183 |
+
"learning_rate": 3.5946666666666667e-06,
|
| 12184 |
+
"loss": 2.4666,
|
| 12185 |
+
"step": 33825
|
| 12186 |
+
},
|
| 12187 |
+
{
|
| 12188 |
+
"epoch": 0.677,
|
| 12189 |
+
"grad_norm": 1.3909591726588006,
|
| 12190 |
+
"learning_rate": 3.5891111111111115e-06,
|
| 12191 |
+
"loss": 2.4654,
|
| 12192 |
+
"step": 33850
|
| 12193 |
+
},
|
| 12194 |
+
{
|
| 12195 |
+
"epoch": 0.6775,
|
| 12196 |
+
"grad_norm": 2.238175776696452,
|
| 12197 |
+
"learning_rate": 3.5835555555555555e-06,
|
| 12198 |
+
"loss": 2.4774,
|
| 12199 |
+
"step": 33875
|
| 12200 |
+
},
|
| 12201 |
+
{
|
| 12202 |
+
"epoch": 0.678,
|
| 12203 |
+
"grad_norm": 1.4514072202089707,
|
| 12204 |
+
"learning_rate": 3.5780000000000003e-06,
|
| 12205 |
+
"loss": 2.4641,
|
| 12206 |
+
"step": 33900
|
| 12207 |
+
},
|
| 12208 |
+
{
|
| 12209 |
+
"epoch": 0.678,
|
| 12210 |
+
"eval_loss": 2.480393648147583,
|
| 12211 |
+
"eval_runtime": 42.0258,
|
| 12212 |
+
"eval_samples_per_second": 2.475,
|
| 12213 |
+
"eval_steps_per_second": 1.237,
|
| 12214 |
+
"step": 33900
|
| 12215 |
+
},
|
| 12216 |
+
{
|
| 12217 |
+
"epoch": 0.6785,
|
| 12218 |
+
"grad_norm": 1.5826502787680072,
|
| 12219 |
+
"learning_rate": 3.5724444444444443e-06,
|
| 12220 |
+
"loss": 2.4659,
|
| 12221 |
+
"step": 33925
|
| 12222 |
+
},
|
| 12223 |
+
{
|
| 12224 |
+
"epoch": 0.679,
|
| 12225 |
+
"grad_norm": 1.6232233026873348,
|
| 12226 |
+
"learning_rate": 3.566888888888889e-06,
|
| 12227 |
+
"loss": 2.4767,
|
| 12228 |
+
"step": 33950
|
| 12229 |
+
},
|
| 12230 |
+
{
|
| 12231 |
+
"epoch": 0.6795,
|
| 12232 |
+
"grad_norm": 1.4754806955160604,
|
| 12233 |
+
"learning_rate": 3.561333333333334e-06,
|
| 12234 |
+
"loss": 2.4734,
|
| 12235 |
+
"step": 33975
|
| 12236 |
+
},
|
| 12237 |
+
{
|
| 12238 |
+
"epoch": 0.68,
|
| 12239 |
+
"grad_norm": 1.605195867452613,
|
| 12240 |
+
"learning_rate": 3.555777777777778e-06,
|
| 12241 |
+
"loss": 2.4667,
|
| 12242 |
+
"step": 34000
|
| 12243 |
+
},
|
| 12244 |
+
{
|
| 12245 |
+
"epoch": 0.68,
|
| 12246 |
+
"eval_loss": 2.479717493057251,
|
| 12247 |
+
"eval_runtime": 42.0643,
|
| 12248 |
+
"eval_samples_per_second": 2.472,
|
| 12249 |
+
"eval_steps_per_second": 1.236,
|
| 12250 |
+
"step": 34000
|
| 12251 |
}
|
| 12252 |
],
|
| 12253 |
"logging_steps": 25,
|
|
|
|
| 12267 |
"attributes": {}
|
| 12268 |
}
|
| 12269 |
},
|
| 12270 |
+
"total_flos": 7.630345833004243e+19,
|
| 12271 |
"train_batch_size": 1,
|
| 12272 |
"trial_name": null,
|
| 12273 |
"trial_params": null
|