irodkin committed on
Commit
4838b38
·
verified ·
1 Parent(s): 2e7c346

Training checkpoint at step 34000

Browse files
Files changed (1) hide show
  1. trainer_state.json +186 -6
trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 33400,
3
- "best_metric": 2.48046875,
4
- "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-33000",
5
- "epoch": 0.67,
6
  "eval_steps": 100,
7
- "global_step": 33500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12068,6 +12068,186 @@
12068
  "eval_samples_per_second": 2.47,
12069
  "eval_steps_per_second": 1.235,
12070
  "step": 33500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12071
  }
12072
  ],
12073
  "logging_steps": 25,
@@ -12087,7 +12267,7 @@
12087
  "attributes": {}
12088
  }
12089
  },
12090
- "total_flos": 7.518134864593918e+19,
12091
  "train_batch_size": 1,
12092
  "trial_name": null,
12093
  "trial_params": null
 
1
  {
2
+ "best_global_step": 34000,
3
+ "best_metric": 2.479717493057251,
4
+ "best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-34000",
5
+ "epoch": 0.68,
6
  "eval_steps": 100,
7
+ "global_step": 34000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12068
  "eval_samples_per_second": 2.47,
12069
  "eval_steps_per_second": 1.235,
12070
  "step": 33500
12071
+ },
12072
+ {
12073
+ "epoch": 0.6705,
12074
+ "grad_norm": 1.441673290204565,
12075
+ "learning_rate": 3.6613333333333334e-06,
12076
+ "loss": 2.4663,
12077
+ "step": 33525
12078
+ },
12079
+ {
12080
+ "epoch": 0.671,
12081
+ "grad_norm": 1.5853407892539593,
12082
+ "learning_rate": 3.6557777777777782e-06,
12083
+ "loss": 2.4595,
12084
+ "step": 33550
12085
+ },
12086
+ {
12087
+ "epoch": 0.6715,
12088
+ "grad_norm": 1.5822227773393136,
12089
+ "learning_rate": 3.6502222222222226e-06,
12090
+ "loss": 2.4752,
12091
+ "step": 33575
12092
+ },
12093
+ {
12094
+ "epoch": 0.672,
12095
+ "grad_norm": 1.4559128373290389,
12096
+ "learning_rate": 3.644666666666667e-06,
12097
+ "loss": 2.4839,
12098
+ "step": 33600
12099
+ },
12100
+ {
12101
+ "epoch": 0.672,
12102
+ "eval_loss": 2.480393648147583,
12103
+ "eval_runtime": 43.6412,
12104
+ "eval_samples_per_second": 2.383,
12105
+ "eval_steps_per_second": 1.192,
12106
+ "step": 33600
12107
+ },
12108
+ {
12109
+ "epoch": 0.6725,
12110
+ "grad_norm": 1.532302660160229,
12111
+ "learning_rate": 3.6391111111111114e-06,
12112
+ "loss": 2.4842,
12113
+ "step": 33625
12114
+ },
12115
+ {
12116
+ "epoch": 0.673,
12117
+ "grad_norm": 1.8197828444753166,
12118
+ "learning_rate": 3.633555555555556e-06,
12119
+ "loss": 2.4769,
12120
+ "step": 33650
12121
+ },
12122
+ {
12123
+ "epoch": 0.6735,
12124
+ "grad_norm": 1.5577840126586067,
12125
+ "learning_rate": 3.6280000000000002e-06,
12126
+ "loss": 2.4755,
12127
+ "step": 33675
12128
+ },
12129
+ {
12130
+ "epoch": 0.674,
12131
+ "grad_norm": 1.8234890351489574,
12132
+ "learning_rate": 3.6224444444444447e-06,
12133
+ "loss": 2.4763,
12134
+ "step": 33700
12135
+ },
12136
+ {
12137
+ "epoch": 0.674,
12138
+ "eval_loss": 2.480318546295166,
12139
+ "eval_runtime": 42.0675,
12140
+ "eval_samples_per_second": 2.472,
12141
+ "eval_steps_per_second": 1.236,
12142
+ "step": 33700
12143
+ },
12144
+ {
12145
+ "epoch": 0.6745,
12146
+ "grad_norm": 1.580558693036695,
12147
+ "learning_rate": 3.616888888888889e-06,
12148
+ "loss": 2.4674,
12149
+ "step": 33725
12150
+ },
12151
+ {
12152
+ "epoch": 0.675,
12153
+ "grad_norm": 1.8364115704619381,
12154
+ "learning_rate": 3.6113333333333335e-06,
12155
+ "loss": 2.468,
12156
+ "step": 33750
12157
+ },
12158
+ {
12159
+ "epoch": 0.6755,
12160
+ "grad_norm": 1.441698909778923,
12161
+ "learning_rate": 3.605777777777778e-06,
12162
+ "loss": 2.4705,
12163
+ "step": 33775
12164
+ },
12165
+ {
12166
+ "epoch": 0.676,
12167
+ "grad_norm": 1.6076672764771844,
12168
+ "learning_rate": 3.6002222222222227e-06,
12169
+ "loss": 2.4667,
12170
+ "step": 33800
12171
+ },
12172
+ {
12173
+ "epoch": 0.676,
12174
+ "eval_loss": 2.480243444442749,
12175
+ "eval_runtime": 42.1706,
12176
+ "eval_samples_per_second": 2.466,
12177
+ "eval_steps_per_second": 1.233,
12178
+ "step": 33800
12179
+ },
12180
+ {
12181
+ "epoch": 0.6765,
12182
+ "grad_norm": 1.449175055825869,
12183
+ "learning_rate": 3.5946666666666667e-06,
12184
+ "loss": 2.4666,
12185
+ "step": 33825
12186
+ },
12187
+ {
12188
+ "epoch": 0.677,
12189
+ "grad_norm": 1.3909591726588006,
12190
+ "learning_rate": 3.5891111111111115e-06,
12191
+ "loss": 2.4654,
12192
+ "step": 33850
12193
+ },
12194
+ {
12195
+ "epoch": 0.6775,
12196
+ "grad_norm": 2.238175776696452,
12197
+ "learning_rate": 3.5835555555555555e-06,
12198
+ "loss": 2.4774,
12199
+ "step": 33875
12200
+ },
12201
+ {
12202
+ "epoch": 0.678,
12203
+ "grad_norm": 1.4514072202089707,
12204
+ "learning_rate": 3.5780000000000003e-06,
12205
+ "loss": 2.4641,
12206
+ "step": 33900
12207
+ },
12208
+ {
12209
+ "epoch": 0.678,
12210
+ "eval_loss": 2.480393648147583,
12211
+ "eval_runtime": 42.0258,
12212
+ "eval_samples_per_second": 2.475,
12213
+ "eval_steps_per_second": 1.237,
12214
+ "step": 33900
12215
+ },
12216
+ {
12217
+ "epoch": 0.6785,
12218
+ "grad_norm": 1.5826502787680072,
12219
+ "learning_rate": 3.5724444444444443e-06,
12220
+ "loss": 2.4659,
12221
+ "step": 33925
12222
+ },
12223
+ {
12224
+ "epoch": 0.679,
12225
+ "grad_norm": 1.6232233026873348,
12226
+ "learning_rate": 3.566888888888889e-06,
12227
+ "loss": 2.4767,
12228
+ "step": 33950
12229
+ },
12230
+ {
12231
+ "epoch": 0.6795,
12232
+ "grad_norm": 1.4754806955160604,
12233
+ "learning_rate": 3.561333333333334e-06,
12234
+ "loss": 2.4734,
12235
+ "step": 33975
12236
+ },
12237
+ {
12238
+ "epoch": 0.68,
12239
+ "grad_norm": 1.605195867452613,
12240
+ "learning_rate": 3.555777777777778e-06,
12241
+ "loss": 2.4667,
12242
+ "step": 34000
12243
+ },
12244
+ {
12245
+ "epoch": 0.68,
12246
+ "eval_loss": 2.479717493057251,
12247
+ "eval_runtime": 42.0643,
12248
+ "eval_samples_per_second": 2.472,
12249
+ "eval_steps_per_second": 1.236,
12250
+ "step": 34000
12251
  }
12252
  ],
12253
  "logging_steps": 25,
 
12267
  "attributes": {}
12268
  }
12269
  },
12270
+ "total_flos": 7.630345833004243e+19,
12271
  "train_batch_size": 1,
12272
  "trial_name": null,
12273
  "trial_params": null