| { |
| "best_global_step": 88000, |
| "best_metric": 1.4495242834091187, |
| "best_model_checkpoint": "debertav3-ddp-8gpu-continue/checkpoint-88000", |
| "epoch": 15.0, |
| "eval_steps": 1000, |
| "global_step": 88200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00017006802721088434, |
| "grad_norm": 3.9429101943969727, |
| "learning_rate": 0.0, |
| "loss": 10.4075, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.008503401360544218, |
| "grad_norm": 1.971747636795044, |
| "learning_rate": 1.2250000000000001e-05, |
| "loss": 9.8035, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.017006802721088437, |
| "grad_norm": 1.3740853071212769, |
| "learning_rate": 2.4750000000000002e-05, |
| "loss": 8.7689, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.025510204081632654, |
| "grad_norm": 0.9082542061805725, |
| "learning_rate": 3.725e-05, |
| "loss": 7.832, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.034013605442176874, |
| "grad_norm": 1.015873670578003, |
| "learning_rate": 4.975e-05, |
| "loss": 7.1484, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04251700680272109, |
| "grad_norm": 1.0678784847259521, |
| "learning_rate": 6.225e-05, |
| "loss": 6.6754, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.05102040816326531, |
| "grad_norm": 1.0898432731628418, |
| "learning_rate": 7.475e-05, |
| "loss": 6.2519, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.05952380952380952, |
| "grad_norm": 0.9323605298995972, |
| "learning_rate": 8.724999999999999e-05, |
| "loss": 5.8711, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.06802721088435375, |
| "grad_norm": 0.7210457921028137, |
| "learning_rate": 9.975000000000001e-05, |
| "loss": 5.5604, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.07653061224489796, |
| "grad_norm": 0.7947279214859009, |
| "learning_rate": 0.00011225, |
| "loss": 5.3079, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.08503401360544217, |
| "grad_norm": 0.8039622902870178, |
| "learning_rate": 0.00012475, |
| "loss": 5.1044, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.0935374149659864, |
| "grad_norm": 0.7226396799087524, |
| "learning_rate": 0.00013725, |
| "loss": 4.9271, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.10204081632653061, |
| "grad_norm": 0.7884791493415833, |
| "learning_rate": 0.00014975, |
| "loss": 4.7654, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.11054421768707483, |
| "grad_norm": 0.7507880330085754, |
| "learning_rate": 0.00016225000000000001, |
| "loss": 4.61, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.11904761904761904, |
| "grad_norm": 0.6950189471244812, |
| "learning_rate": 0.00017475, |
| "loss": 4.474, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.12755102040816327, |
| "grad_norm": 0.7212897539138794, |
| "learning_rate": 0.00018725, |
| "loss": 4.33, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.1360544217687075, |
| "grad_norm": 0.8659721612930298, |
| "learning_rate": 0.00019975, |
| "loss": 4.177, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.1445578231292517, |
| "grad_norm": 0.7730752229690552, |
| "learning_rate": 0.00021225, |
| "loss": 3.9974, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.15306122448979592, |
| "grad_norm": 0.8463068604469299, |
| "learning_rate": 0.00022475000000000001, |
| "loss": 3.814, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.16156462585034015, |
| "grad_norm": 0.8561194539070129, |
| "learning_rate": 0.00023725, |
| "loss": 3.6612, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.17006802721088435, |
| "grad_norm": 0.7385916709899902, |
| "learning_rate": 0.00024975, |
| "loss": 3.5319, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.17006802721088435, |
| "eval_loss": 3.6021459102630615, |
| "eval_runtime": 75.3501, |
| "eval_samples_per_second": 1245.374, |
| "eval_steps_per_second": 4.871, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.17857142857142858, |
| "grad_norm": 0.7785052061080933, |
| "learning_rate": 0.00026225, |
| "loss": 3.4196, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1870748299319728, |
| "grad_norm": 0.7699462175369263, |
| "learning_rate": 0.00027475, |
| "loss": 3.3248, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.195578231292517, |
| "grad_norm": 0.6943045258522034, |
| "learning_rate": 0.00028725, |
| "loss": 3.2327, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.20408163265306123, |
| "grad_norm": 0.7425329685211182, |
| "learning_rate": 0.00029975000000000005, |
| "loss": 3.1558, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.21258503401360543, |
| "grad_norm": 0.6596449613571167, |
| "learning_rate": 0.00031225000000000003, |
| "loss": 3.0896, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.22108843537414966, |
| "grad_norm": 0.6805656552314758, |
| "learning_rate": 0.00032475, |
| "loss": 3.0295, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.22959183673469388, |
| "grad_norm": 0.6616429686546326, |
| "learning_rate": 0.00033725, |
| "loss": 2.9716, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.23809523809523808, |
| "grad_norm": 0.699910581111908, |
| "learning_rate": 0.00034975, |
| "loss": 2.9154, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.2465986394557823, |
| "grad_norm": 0.6367819905281067, |
| "learning_rate": 0.00036225000000000005, |
| "loss": 2.88, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.25510204081632654, |
| "grad_norm": 0.633556067943573, |
| "learning_rate": 0.00037475000000000003, |
| "loss": 2.8288, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.26360544217687076, |
| "grad_norm": 0.6006012558937073, |
| "learning_rate": 0.00038725, |
| "loss": 2.8017, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.272108843537415, |
| "grad_norm": 0.6720077991485596, |
| "learning_rate": 0.00039975, |
| "loss": 2.762, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.28061224489795916, |
| "grad_norm": 0.6513054966926575, |
| "learning_rate": 0.00041225, |
| "loss": 2.7228, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.2891156462585034, |
| "grad_norm": 0.6120091676712036, |
| "learning_rate": 0.00042475000000000005, |
| "loss": 2.6978, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.2976190476190476, |
| "grad_norm": 0.6610727906227112, |
| "learning_rate": 0.00043725000000000003, |
| "loss": 2.6641, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.30612244897959184, |
| "grad_norm": 0.6299336552619934, |
| "learning_rate": 0.00044975, |
| "loss": 2.6343, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.31462585034013607, |
| "grad_norm": 0.6401428580284119, |
| "learning_rate": 0.00046225, |
| "loss": 2.6212, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.3231292517006803, |
| "grad_norm": 0.6018512845039368, |
| "learning_rate": 0.00047475, |
| "loss": 2.5919, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.33163265306122447, |
| "grad_norm": 0.63248610496521, |
| "learning_rate": 0.00048725000000000005, |
| "loss": 2.5739, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.3401360544217687, |
| "grad_norm": 0.646512508392334, |
| "learning_rate": 0.0004997500000000001, |
| "loss": 2.5493, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3401360544217687, |
| "eval_loss": 2.755571126937866, |
| "eval_runtime": 75.4052, |
| "eval_samples_per_second": 1244.463, |
| "eval_steps_per_second": 4.867, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3486394557823129, |
| "grad_norm": 0.6081250309944153, |
| "learning_rate": 0.000499568661971831, |
| "loss": 2.538, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.35714285714285715, |
| "grad_norm": 0.584749698638916, |
| "learning_rate": 0.0004991285211267605, |
| "loss": 2.5185, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3656462585034014, |
| "grad_norm": 0.6018593907356262, |
| "learning_rate": 0.0004986883802816902, |
| "loss": 2.4893, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.3741496598639456, |
| "grad_norm": 0.5787435173988342, |
| "learning_rate": 0.0004982482394366197, |
| "loss": 2.4763, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.3826530612244898, |
| "grad_norm": 0.5765504240989685, |
| "learning_rate": 0.0004978080985915493, |
| "loss": 2.4471, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.391156462585034, |
| "grad_norm": 0.539054811000824, |
| "learning_rate": 0.0004973679577464789, |
| "loss": 2.4272, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.39965986394557823, |
| "grad_norm": 0.5542412400245667, |
| "learning_rate": 0.0004969278169014085, |
| "loss": 2.4178, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.40816326530612246, |
| "grad_norm": 0.5709657073020935, |
| "learning_rate": 0.0004964876760563381, |
| "loss": 2.4105, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 0.5530266165733337, |
| "learning_rate": 0.0004960475352112676, |
| "loss": 2.3812, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.42517006802721086, |
| "grad_norm": 0.5575669407844543, |
| "learning_rate": 0.0004956073943661972, |
| "loss": 2.3696, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.4336734693877551, |
| "grad_norm": 0.5466365814208984, |
| "learning_rate": 0.0004951672535211268, |
| "loss": 2.36, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.4421768707482993, |
| "grad_norm": 0.5788463354110718, |
| "learning_rate": 0.0004947271126760563, |
| "loss": 2.3409, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.45068027210884354, |
| "grad_norm": 0.5266286730766296, |
| "learning_rate": 0.0004942869718309859, |
| "loss": 2.3284, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.45918367346938777, |
| "grad_norm": 0.5419022440910339, |
| "learning_rate": 0.0004938468309859155, |
| "loss": 2.3175, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.467687074829932, |
| "grad_norm": 0.5173165202140808, |
| "learning_rate": 0.0004934066901408451, |
| "loss": 2.3021, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 0.5213769674301147, |
| "learning_rate": 0.0004929665492957746, |
| "loss": 2.2974, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.4846938775510204, |
| "grad_norm": 0.5393424034118652, |
| "learning_rate": 0.0004925264084507042, |
| "loss": 2.2834, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.4931972789115646, |
| "grad_norm": 0.5283843874931335, |
| "learning_rate": 0.0004920862676056338, |
| "loss": 2.2894, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.5017006802721088, |
| "grad_norm": 0.5366395711898804, |
| "learning_rate": 0.0004916461267605634, |
| "loss": 2.2643, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.5102040816326531, |
| "grad_norm": 0.5475978851318359, |
| "learning_rate": 0.000491205985915493, |
| "loss": 2.2525, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5102040816326531, |
| "eval_loss": 2.454470634460449, |
| "eval_runtime": 75.3482, |
| "eval_samples_per_second": 1245.405, |
| "eval_steps_per_second": 4.871, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5187074829931972, |
| "grad_norm": 0.5384633541107178, |
| "learning_rate": 0.0004907658450704226, |
| "loss": 2.2469, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.5272108843537415, |
| "grad_norm": 0.538545548915863, |
| "learning_rate": 0.0004903257042253521, |
| "loss": 2.2378, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.5357142857142857, |
| "grad_norm": 0.5223479270935059, |
| "learning_rate": 0.0004898855633802817, |
| "loss": 2.2343, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.54421768707483, |
| "grad_norm": 0.531447172164917, |
| "learning_rate": 0.0004894454225352113, |
| "loss": 2.2201, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.5527210884353742, |
| "grad_norm": 0.5139949321746826, |
| "learning_rate": 0.0004890052816901409, |
| "loss": 2.2098, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.5612244897959183, |
| "grad_norm": 0.5428385138511658, |
| "learning_rate": 0.0004885651408450705, |
| "loss": 2.2115, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.5697278911564626, |
| "grad_norm": 0.5274862051010132, |
| "learning_rate": 0.000488125, |
| "loss": 2.1987, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.5782312925170068, |
| "grad_norm": 0.5110312104225159, |
| "learning_rate": 0.0004876848591549296, |
| "loss": 2.1843, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.5867346938775511, |
| "grad_norm": 0.49753338098526, |
| "learning_rate": 0.00048724471830985914, |
| "loss": 2.1755, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.5952380952380952, |
| "grad_norm": 0.5169926881790161, |
| "learning_rate": 0.00048680457746478874, |
| "loss": 2.1779, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.6037414965986394, |
| "grad_norm": 0.5181500911712646, |
| "learning_rate": 0.0004863644366197183, |
| "loss": 2.1658, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.6122448979591837, |
| "grad_norm": 0.5171290636062622, |
| "learning_rate": 0.00048592429577464793, |
| "loss": 2.1668, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.6207482993197279, |
| "grad_norm": 0.4980362355709076, |
| "learning_rate": 0.00048548415492957747, |
| "loss": 2.1545, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.6292517006802721, |
| "grad_norm": 0.5099490284919739, |
| "learning_rate": 0.00048504401408450707, |
| "loss": 2.1415, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.6377551020408163, |
| "grad_norm": 0.5084192752838135, |
| "learning_rate": 0.0004846038732394366, |
| "loss": 2.1417, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.6462585034013606, |
| "grad_norm": 0.48829177021980286, |
| "learning_rate": 0.00048416373239436626, |
| "loss": 2.1359, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.6547619047619048, |
| "grad_norm": 0.5111795663833618, |
| "learning_rate": 0.0004837235915492958, |
| "loss": 2.1348, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.6632653061224489, |
| "grad_norm": 0.49867865443229675, |
| "learning_rate": 0.00048328345070422534, |
| "loss": 2.1199, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.6717687074829932, |
| "grad_norm": 0.4912058711051941, |
| "learning_rate": 0.00048284330985915493, |
| "loss": 2.1169, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.6802721088435374, |
| "grad_norm": 0.5115543603897095, |
| "learning_rate": 0.00048240316901408453, |
| "loss": 2.1093, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6802721088435374, |
| "eval_loss": 2.320960521697998, |
| "eval_runtime": 75.9327, |
| "eval_samples_per_second": 1235.818, |
| "eval_steps_per_second": 4.833, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6887755102040817, |
| "grad_norm": 0.5165016651153564, |
| "learning_rate": 0.0004819630281690141, |
| "loss": 2.1103, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.6972789115646258, |
| "grad_norm": 0.49575796723365784, |
| "learning_rate": 0.00048152288732394367, |
| "loss": 2.1027, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.70578231292517, |
| "grad_norm": 0.5010895133018494, |
| "learning_rate": 0.0004810827464788732, |
| "loss": 2.0939, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.49885183572769165, |
| "learning_rate": 0.00048064260563380286, |
| "loss": 2.0951, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.7227891156462585, |
| "grad_norm": 0.47758999466896057, |
| "learning_rate": 0.0004802024647887324, |
| "loss": 2.0834, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.7312925170068028, |
| "grad_norm": 0.48634713888168335, |
| "learning_rate": 0.000479762323943662, |
| "loss": 2.0833, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.7397959183673469, |
| "grad_norm": 0.5028336644172668, |
| "learning_rate": 0.00047932218309859153, |
| "loss": 2.0804, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.7482993197278912, |
| "grad_norm": 0.4951624870300293, |
| "learning_rate": 0.00047888204225352113, |
| "loss": 2.0697, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.7568027210884354, |
| "grad_norm": 0.5072620511054993, |
| "learning_rate": 0.0004784419014084507, |
| "loss": 2.0745, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.7653061224489796, |
| "grad_norm": 0.4807872474193573, |
| "learning_rate": 0.0004780017605633803, |
| "loss": 2.0618, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.7738095238095238, |
| "grad_norm": 0.49827975034713745, |
| "learning_rate": 0.00047756161971830986, |
| "loss": 2.0648, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.782312925170068, |
| "grad_norm": 0.48257699608802795, |
| "learning_rate": 0.0004771214788732394, |
| "loss": 2.0542, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.7908163265306123, |
| "grad_norm": 0.4961060583591461, |
| "learning_rate": 0.00047668133802816905, |
| "loss": 2.0531, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.7993197278911565, |
| "grad_norm": 0.48454368114471436, |
| "learning_rate": 0.0004762411971830986, |
| "loss": 2.0497, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.8078231292517006, |
| "grad_norm": 0.48959559202194214, |
| "learning_rate": 0.0004758010563380282, |
| "loss": 2.0539, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.8163265306122449, |
| "grad_norm": 0.4989263415336609, |
| "learning_rate": 0.00047536091549295773, |
| "loss": 2.0436, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.8248299319727891, |
| "grad_norm": 0.491557776927948, |
| "learning_rate": 0.0004749207746478874, |
| "loss": 2.0347, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 0.4861533045768738, |
| "learning_rate": 0.0004744806338028169, |
| "loss": 2.0269, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.8418367346938775, |
| "grad_norm": 0.5007792711257935, |
| "learning_rate": 0.00047404049295774646, |
| "loss": 2.0277, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.8503401360544217, |
| "grad_norm": 0.48347318172454834, |
| "learning_rate": 0.00047360035211267606, |
| "loss": 2.0278, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.8503401360544217, |
| "eval_loss": 2.221730947494507, |
| "eval_runtime": 75.3978, |
| "eval_samples_per_second": 1244.585, |
| "eval_steps_per_second": 4.868, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.858843537414966, |
| "grad_norm": 0.4758061468601227, |
| "learning_rate": 0.00047316021126760565, |
| "loss": 2.0207, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.8673469387755102, |
| "grad_norm": 0.5015333890914917, |
| "learning_rate": 0.00047272007042253525, |
| "loss": 2.0213, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.8758503401360545, |
| "grad_norm": 0.46741312742233276, |
| "learning_rate": 0.0004722799295774648, |
| "loss": 2.0115, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.8843537414965986, |
| "grad_norm": 0.4716244339942932, |
| "learning_rate": 0.0004718397887323944, |
| "loss": 2.0041, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.8928571428571429, |
| "grad_norm": 0.470093309879303, |
| "learning_rate": 0.000471399647887324, |
| "loss": 2.0101, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.9013605442176871, |
| "grad_norm": 0.46333491802215576, |
| "learning_rate": 0.0004709595070422535, |
| "loss": 1.9985, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.9098639455782312, |
| "grad_norm": 0.4772166609764099, |
| "learning_rate": 0.0004705193661971831, |
| "loss": 1.9956, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.9183673469387755, |
| "grad_norm": 0.49403470754623413, |
| "learning_rate": 0.00047007922535211266, |
| "loss": 1.991, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.9268707482993197, |
| "grad_norm": 0.4855027198791504, |
| "learning_rate": 0.00046963908450704225, |
| "loss": 1.9972, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.935374149659864, |
| "grad_norm": 0.4754594564437866, |
| "learning_rate": 0.00046919894366197185, |
| "loss": 1.9902, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.9438775510204082, |
| "grad_norm": 0.4977014660835266, |
| "learning_rate": 0.00046875880281690144, |
| "loss": 1.9904, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.48129725456237793, |
| "learning_rate": 0.000468318661971831, |
| "loss": 1.9855, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.9608843537414966, |
| "grad_norm": 0.484291672706604, |
| "learning_rate": 0.00046787852112676053, |
| "loss": 1.9781, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.9693877551020408, |
| "grad_norm": 0.4759155809879303, |
| "learning_rate": 0.0004674383802816902, |
| "loss": 1.9766, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.9778911564625851, |
| "grad_norm": 0.48138466477394104, |
| "learning_rate": 0.0004669982394366197, |
| "loss": 1.9678, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.9863945578231292, |
| "grad_norm": 0.48765894770622253, |
| "learning_rate": 0.0004665580985915493, |
| "loss": 1.9746, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.9948979591836735, |
| "grad_norm": 0.47016099095344543, |
| "learning_rate": 0.00046611795774647885, |
| "loss": 1.9745, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.0034013605442176, |
| "grad_norm": 0.4722880721092224, |
| "learning_rate": 0.0004656778169014085, |
| "loss": 1.961, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.0119047619047619, |
| "grad_norm": 0.47606250643730164, |
| "learning_rate": 0.00046523767605633805, |
| "loss": 1.9548, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.0204081632653061, |
| "grad_norm": 0.4867871403694153, |
| "learning_rate": 0.0004647975352112676, |
| "loss": 1.9531, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.0204081632653061, |
| "eval_loss": 2.1581504344940186, |
| "eval_runtime": 75.3496, |
| "eval_samples_per_second": 1245.381, |
| "eval_steps_per_second": 4.871, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.0289115646258504, |
| "grad_norm": 0.4676497280597687, |
| "learning_rate": 0.0004643573943661972, |
| "loss": 1.9505, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.0374149659863945, |
| "grad_norm": 0.47663334012031555, |
| "learning_rate": 0.0004639172535211268, |
| "loss": 1.9509, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.0459183673469388, |
| "grad_norm": 0.46838682889938354, |
| "learning_rate": 0.0004634771126760564, |
| "loss": 1.9522, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.054421768707483, |
| "grad_norm": 0.47476112842559814, |
| "learning_rate": 0.0004630369718309859, |
| "loss": 1.9575, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.0629251700680271, |
| "grad_norm": 0.4738709330558777, |
| "learning_rate": 0.0004625968309859155, |
| "loss": 1.9535, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.0714285714285714, |
| "grad_norm": 0.46784505248069763, |
| "learning_rate": 0.0004621566901408451, |
| "loss": 1.95, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.0799319727891157, |
| "grad_norm": 0.4729381501674652, |
| "learning_rate": 0.00046171654929577465, |
| "loss": 1.9382, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.08843537414966, |
| "grad_norm": 0.48842012882232666, |
| "learning_rate": 0.00046127640845070424, |
| "loss": 1.9383, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.096938775510204, |
| "grad_norm": 0.47705018520355225, |
| "learning_rate": 0.0004608362676056338, |
| "loss": 1.9298, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.1054421768707483, |
| "grad_norm": 0.47480863332748413, |
| "learning_rate": 0.00046039612676056343, |
| "loss": 1.9356, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.1139455782312926, |
| "grad_norm": 0.4836580455303192, |
| "learning_rate": 0.000459955985915493, |
| "loss": 1.936, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.1224489795918366, |
| "grad_norm": 0.47330114245414734, |
| "learning_rate": 0.00045951584507042257, |
| "loss": 1.9217, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.130952380952381, |
| "grad_norm": 0.45560622215270996, |
| "learning_rate": 0.0004590757042253521, |
| "loss": 1.9278, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.1394557823129252, |
| "grad_norm": 0.48356184363365173, |
| "learning_rate": 0.00045863556338028165, |
| "loss": 1.9204, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.1479591836734695, |
| "grad_norm": 0.48046737909317017, |
| "learning_rate": 0.0004581954225352113, |
| "loss": 1.9185, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.1564625850340136, |
| "grad_norm": 0.47801148891448975, |
| "learning_rate": 0.00045775528169014084, |
| "loss": 1.9084, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.1649659863945578, |
| "grad_norm": 0.46705126762390137, |
| "learning_rate": 0.00045731514084507044, |
| "loss": 1.9138, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.1734693877551021, |
| "grad_norm": 0.47566279768943787, |
| "learning_rate": 0.000456875, |
| "loss": 1.9139, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.1819727891156462, |
| "grad_norm": 0.4746539294719696, |
| "learning_rate": 0.00045643485915492963, |
| "loss": 1.916, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "grad_norm": 0.49360892176628113, |
| "learning_rate": 0.00045599471830985917, |
| "loss": 1.9086, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.1904761904761905, |
| "eval_loss": 2.1105546951293945, |
| "eval_runtime": 75.3645, |
| "eval_samples_per_second": 1245.136, |
| "eval_steps_per_second": 4.87, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.1989795918367347, |
| "grad_norm": 0.49067452549934387, |
| "learning_rate": 0.0004555545774647887, |
| "loss": 1.9044, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.2074829931972788, |
| "grad_norm": 0.48024600744247437, |
| "learning_rate": 0.0004551144366197183, |
| "loss": 1.9143, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.215986394557823, |
| "grad_norm": 0.4815312922000885, |
| "learning_rate": 0.0004546742957746479, |
| "loss": 1.9038, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.2244897959183674, |
| "grad_norm": 0.47568100690841675, |
| "learning_rate": 0.0004542341549295775, |
| "loss": 1.9079, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.2329931972789117, |
| "grad_norm": 0.47568491101264954, |
| "learning_rate": 0.00045379401408450704, |
| "loss": 1.9024, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.2414965986394557, |
| "grad_norm": 0.46364548802375793, |
| "learning_rate": 0.00045335387323943663, |
| "loss": 1.9008, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.4672023355960846, |
| "learning_rate": 0.00045291373239436623, |
| "loss": 1.9022, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.2585034013605443, |
| "grad_norm": 0.4671889841556549, |
| "learning_rate": 0.00045247359154929577, |
| "loss": 1.894, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.2670068027210886, |
| "grad_norm": 0.4657513201236725, |
| "learning_rate": 0.00045203345070422537, |
| "loss": 1.8978, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.2755102040816326, |
| "grad_norm": 0.4698520004749298, |
| "learning_rate": 0.0004515933098591549, |
| "loss": 1.8945, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.284013605442177, |
| "grad_norm": 0.4804818332195282, |
| "learning_rate": 0.00045115316901408456, |
| "loss": 1.8921, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.2925170068027212, |
| "grad_norm": 0.4709530174732208, |
| "learning_rate": 0.0004507130281690141, |
| "loss": 1.889, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.3010204081632653, |
| "grad_norm": 0.47395414113998413, |
| "learning_rate": 0.0004502728873239437, |
| "loss": 1.8837, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.3095238095238095, |
| "grad_norm": 0.4659099876880646, |
| "learning_rate": 0.00044983274647887323, |
| "loss": 1.8836, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.3180272108843538, |
| "grad_norm": 0.4623669683933258, |
| "learning_rate": 0.00044939260563380283, |
| "loss": 1.8823, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.3265306122448979, |
| "grad_norm": 0.4751649498939514, |
| "learning_rate": 0.0004489524647887324, |
| "loss": 1.8703, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.3350340136054422, |
| "grad_norm": 0.4874952435493469, |
| "learning_rate": 0.00044851232394366197, |
| "loss": 1.8765, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.3435374149659864, |
| "grad_norm": 0.48088109493255615, |
| "learning_rate": 0.00044807218309859156, |
| "loss": 1.8787, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.3520408163265305, |
| "grad_norm": 0.48666709661483765, |
| "learning_rate": 0.0004476320422535211, |
| "loss": 1.8723, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.3605442176870748, |
| "grad_norm": 0.5058099031448364, |
| "learning_rate": 0.00044719190140845075, |
| "loss": 1.8682, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.3605442176870748, |
| "eval_loss": 2.066969156265259, |
| "eval_runtime": 75.2958, |
| "eval_samples_per_second": 1246.271, |
| "eval_steps_per_second": 4.874, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.369047619047619, |
| "grad_norm": 0.47028714418411255, |
| "learning_rate": 0.0004467517605633803, |
| "loss": 1.8619, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.3775510204081631, |
| "grad_norm": 0.468432754278183, |
| "learning_rate": 0.00044631161971830983, |
| "loss": 1.8669, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.3860544217687074, |
| "grad_norm": 0.46689048409461975, |
| "learning_rate": 0.00044587147887323943, |
| "loss": 1.8657, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.3945578231292517, |
| "grad_norm": 0.464141309261322, |
| "learning_rate": 0.000445431338028169, |
| "loss": 1.863, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.403061224489796, |
| "grad_norm": 0.46267494559288025, |
| "learning_rate": 0.0004449911971830986, |
| "loss": 1.8696, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.4115646258503403, |
| "grad_norm": 0.4653218984603882, |
| "learning_rate": 0.00044455105633802816, |
| "loss": 1.8554, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.4200680272108843, |
| "grad_norm": 0.4844844937324524, |
| "learning_rate": 0.00044411091549295776, |
| "loss": 1.8525, |
| "step": 8350 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.47869589924812317, |
| "learning_rate": 0.00044367077464788735, |
| "loss": 1.8584, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.4370748299319729, |
| "grad_norm": 0.4788654148578644, |
| "learning_rate": 0.0004432306338028169, |
| "loss": 1.8562, |
| "step": 8450 |
| }, |
| { |
| "epoch": 1.445578231292517, |
| "grad_norm": 0.4887782335281372, |
| "learning_rate": 0.0004427904929577465, |
| "loss": 1.8583, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.4540816326530612, |
| "grad_norm": 0.48299601674079895, |
| "learning_rate": 0.00044235035211267603, |
| "loss": 1.8557, |
| "step": 8550 |
| }, |
| { |
| "epoch": 1.4625850340136055, |
| "grad_norm": 0.4597221910953522, |
| "learning_rate": 0.0004419102112676057, |
| "loss": 1.847, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.4710884353741496, |
| "grad_norm": 0.4786689877510071, |
| "learning_rate": 0.0004414700704225352, |
| "loss": 1.8393, |
| "step": 8650 |
| }, |
| { |
| "epoch": 1.4795918367346939, |
| "grad_norm": 0.46319809556007385, |
| "learning_rate": 0.0004410299295774648, |
| "loss": 1.8468, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.4880952380952381, |
| "grad_norm": 0.46916770935058594, |
| "learning_rate": 0.00044058978873239436, |
| "loss": 1.8414, |
| "step": 8750 |
| }, |
| { |
| "epoch": 1.4965986394557822, |
| "grad_norm": 0.4812500774860382, |
| "learning_rate": 0.00044014964788732395, |
| "loss": 1.8496, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.5051020408163265, |
| "grad_norm": 0.47240397334098816, |
| "learning_rate": 0.00043970950704225355, |
| "loss": 1.8415, |
| "step": 8850 |
| }, |
| { |
| "epoch": 1.5136054421768708, |
| "grad_norm": 0.4675607979297638, |
| "learning_rate": 0.0004392693661971831, |
| "loss": 1.8395, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.5221088435374148, |
| "grad_norm": 0.4568563401699066, |
| "learning_rate": 0.0004388380281690141, |
| "loss": 1.8349, |
| "step": 8950 |
| }, |
| { |
| "epoch": 1.5306122448979593, |
| "grad_norm": 0.4698484539985657, |
| "learning_rate": 0.00043839788732394363, |
| "loss": 1.8388, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.5306122448979593, |
| "eval_loss": 2.031731128692627, |
| "eval_runtime": 75.3373, |
| "eval_samples_per_second": 1245.584, |
| "eval_steps_per_second": 4.871, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.5391156462585034, |
| "grad_norm": 0.4721793532371521, |
| "learning_rate": 0.0004379577464788733, |
| "loss": 1.8368, |
| "step": 9050 |
| }, |
| { |
| "epoch": 1.5476190476190477, |
| "grad_norm": 0.48082154989242554, |
| "learning_rate": 0.0004375176056338028, |
| "loss": 1.8367, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.556122448979592, |
| "grad_norm": 0.47875961661338806, |
| "learning_rate": 0.00043707746478873237, |
| "loss": 1.834, |
| "step": 9150 |
| }, |
| { |
| "epoch": 1.564625850340136, |
| "grad_norm": 0.4505126178264618, |
| "learning_rate": 0.00043663732394366196, |
| "loss": 1.8293, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.5731292517006803, |
| "grad_norm": 0.4831013083457947, |
| "learning_rate": 0.00043619718309859156, |
| "loss": 1.8325, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.5816326530612246, |
| "grad_norm": 0.46912670135498047, |
| "learning_rate": 0.00043575704225352115, |
| "loss": 1.8201, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.5901360544217686, |
| "grad_norm": 0.4775020480155945, |
| "learning_rate": 0.0004353169014084507, |
| "loss": 1.8324, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.598639455782313, |
| "grad_norm": 0.4624606668949127, |
| "learning_rate": 0.0004348767605633803, |
| "loss": 1.8328, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.6071428571428572, |
| "grad_norm": 0.47817811369895935, |
| "learning_rate": 0.0004344366197183099, |
| "loss": 1.8265, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.6156462585034013, |
| "grad_norm": 0.4682134985923767, |
| "learning_rate": 0.0004339964788732395, |
| "loss": 1.8193, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.6241496598639455, |
| "grad_norm": 0.4736348092556, |
| "learning_rate": 0.000433556338028169, |
| "loss": 1.8144, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.6326530612244898, |
| "grad_norm": 0.47171303629875183, |
| "learning_rate": 0.00043311619718309856, |
| "loss": 1.8156, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.641156462585034, |
| "grad_norm": 0.4707530736923218, |
| "learning_rate": 0.0004326760563380282, |
| "loss": 1.8146, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.6496598639455784, |
| "grad_norm": 0.47999903559684753, |
| "learning_rate": 0.00043223591549295775, |
| "loss": 1.822, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.6581632653061225, |
| "grad_norm": 0.4823855757713318, |
| "learning_rate": 0.00043179577464788735, |
| "loss": 1.8161, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.47148072719573975, |
| "learning_rate": 0.0004313556338028169, |
| "loss": 1.8117, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.675170068027211, |
| "grad_norm": 0.46029239892959595, |
| "learning_rate": 0.00043091549295774654, |
| "loss": 1.8132, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.683673469387755, |
| "grad_norm": 0.47228437662124634, |
| "learning_rate": 0.0004304753521126761, |
| "loss": 1.816, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.6921768707482994, |
| "grad_norm": 0.4653479754924774, |
| "learning_rate": 0.0004300352112676056, |
| "loss": 1.8084, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.7006802721088436, |
| "grad_norm": 0.4696131646633148, |
| "learning_rate": 0.0004295950704225352, |
| "loss": 1.8072, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.7006802721088436, |
| "eval_loss": 1.9987208843231201, |
| "eval_runtime": 75.2953, |
| "eval_samples_per_second": 1246.28, |
| "eval_steps_per_second": 4.874, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.7091836734693877, |
| "grad_norm": 0.4656902253627777, |
| "learning_rate": 0.00042915492957746476, |
| "loss": 1.8049, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.717687074829932, |
| "grad_norm": 0.47671329975128174, |
| "learning_rate": 0.0004287147887323944, |
| "loss": 1.7934, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.7261904761904763, |
| "grad_norm": 0.46021008491516113, |
| "learning_rate": 0.00042827464788732395, |
| "loss": 1.8059, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.7346938775510203, |
| "grad_norm": 0.46965909004211426, |
| "learning_rate": 0.00042783450704225354, |
| "loss": 1.7958, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.7431972789115646, |
| "grad_norm": 0.4655004143714905, |
| "learning_rate": 0.0004273943661971831, |
| "loss": 1.7989, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.751700680272109, |
| "grad_norm": 0.48593828082084656, |
| "learning_rate": 0.0004269542253521127, |
| "loss": 1.7904, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.760204081632653, |
| "grad_norm": 0.4698497951030731, |
| "learning_rate": 0.0004265140845070423, |
| "loss": 1.8005, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.7687074829931972, |
| "grad_norm": 0.4775938391685486, |
| "learning_rate": 0.0004260739436619718, |
| "loss": 1.7992, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.7772108843537415, |
| "grad_norm": 0.45465579628944397, |
| "learning_rate": 0.0004256338028169014, |
| "loss": 1.7973, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.7857142857142856, |
| "grad_norm": 0.4619791507720947, |
| "learning_rate": 0.000425193661971831, |
| "loss": 1.7913, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.79421768707483, |
| "grad_norm": 0.4738638699054718, |
| "learning_rate": 0.0004247535211267606, |
| "loss": 1.7922, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.8027210884353742, |
| "grad_norm": 0.46855342388153076, |
| "learning_rate": 0.00042431338028169015, |
| "loss": 1.7894, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.8112244897959182, |
| "grad_norm": 0.48508965969085693, |
| "learning_rate": 0.0004238732394366197, |
| "loss": 1.7815, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.8197278911564627, |
| "grad_norm": 0.47747498750686646, |
| "learning_rate": 0.00042343309859154934, |
| "loss": 1.7907, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.8282312925170068, |
| "grad_norm": 0.47017601132392883, |
| "learning_rate": 0.0004229929577464789, |
| "loss": 1.7855, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.836734693877551, |
| "grad_norm": 0.4648725092411041, |
| "learning_rate": 0.00042255281690140847, |
| "loss": 1.7878, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.8452380952380953, |
| "grad_norm": 0.4595974385738373, |
| "learning_rate": 0.000422112676056338, |
| "loss": 1.7808, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.8537414965986394, |
| "grad_norm": 0.5028242468833923, |
| "learning_rate": 0.00042167253521126766, |
| "loss": 1.7838, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.8622448979591837, |
| "grad_norm": 0.4698718190193176, |
| "learning_rate": 0.0004212323943661972, |
| "loss": 1.7838, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.870748299319728, |
| "grad_norm": 0.46658000349998474, |
| "learning_rate": 0.00042079225352112675, |
| "loss": 1.7884, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.870748299319728, |
| "eval_loss": 1.9652538299560547, |
| "eval_runtime": 75.3728, |
| "eval_samples_per_second": 1244.998, |
| "eval_steps_per_second": 4.869, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.879251700680272, |
| "grad_norm": 0.4587540924549103, |
| "learning_rate": 0.00042035211267605634, |
| "loss": 1.7795, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.8877551020408163, |
| "grad_norm": 0.48107513785362244, |
| "learning_rate": 0.00041991197183098594, |
| "loss": 1.7843, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.8962585034013606, |
| "grad_norm": 0.468262255191803, |
| "learning_rate": 0.00041947183098591553, |
| "loss": 1.7831, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 0.4687144458293915, |
| "learning_rate": 0.0004190316901408451, |
| "loss": 1.7761, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.913265306122449, |
| "grad_norm": 0.48110878467559814, |
| "learning_rate": 0.00041859154929577467, |
| "loss": 1.7658, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.9217687074829932, |
| "grad_norm": 0.47303488850593567, |
| "learning_rate": 0.0004181514084507042, |
| "loss": 1.7733, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.9302721088435373, |
| "grad_norm": 0.48464885354042053, |
| "learning_rate": 0.0004177112676056338, |
| "loss": 1.765, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.9387755102040818, |
| "grad_norm": 0.4908495247364044, |
| "learning_rate": 0.0004172799295774648, |
| "loss": 1.7662, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.9472789115646258, |
| "grad_norm": 0.47335630655288696, |
| "learning_rate": 0.00041683978873239435, |
| "loss": 1.7689, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.95578231292517, |
| "grad_norm": 0.4794527292251587, |
| "learning_rate": 0.00041639964788732395, |
| "loss": 1.7659, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.9642857142857144, |
| "grad_norm": 0.47013363242149353, |
| "learning_rate": 0.00041595950704225354, |
| "loss": 1.7683, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.9727891156462585, |
| "grad_norm": 0.4743632674217224, |
| "learning_rate": 0.00041551936619718314, |
| "loss": 1.7674, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.9812925170068028, |
| "grad_norm": 0.46521708369255066, |
| "learning_rate": 0.0004150792253521127, |
| "loss": 1.7686, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.989795918367347, |
| "grad_norm": 0.48240792751312256, |
| "learning_rate": 0.0004146390845070422, |
| "loss": 1.7688, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.998299319727891, |
| "grad_norm": 0.453975111246109, |
| "learning_rate": 0.00041419894366197187, |
| "loss": 1.7658, |
| "step": 11750 |
| }, |
| { |
| "epoch": 2.006802721088435, |
| "grad_norm": 0.45248886942863464, |
| "learning_rate": 0.0004137588028169014, |
| "loss": 1.7543, |
| "step": 11800 |
| }, |
| { |
| "epoch": 2.0153061224489797, |
| "grad_norm": 0.47196707129478455, |
| "learning_rate": 0.0004133274647887324, |
| "loss": 1.751, |
| "step": 11850 |
| }, |
| { |
| "epoch": 2.0238095238095237, |
| "grad_norm": 0.46333810687065125, |
| "learning_rate": 0.00041288732394366195, |
| "loss": 1.7544, |
| "step": 11900 |
| }, |
| { |
| "epoch": 2.0323129251700682, |
| "grad_norm": 0.46540361642837524, |
| "learning_rate": 0.0004124471830985916, |
| "loss": 1.7586, |
| "step": 11950 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "grad_norm": 0.45927169919013977, |
| "learning_rate": 0.00041200704225352114, |
| "loss": 1.753, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.0408163265306123, |
| "eval_loss": 1.948580265045166, |
| "eval_runtime": 75.3406, |
| "eval_samples_per_second": 1245.53, |
| "eval_steps_per_second": 4.871, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.0493197278911564, |
| "grad_norm": 0.4919711947441101, |
| "learning_rate": 0.0004115669014084507, |
| "loss": 1.7501, |
| "step": 12050 |
| }, |
| { |
| "epoch": 2.057823129251701, |
| "grad_norm": 0.47275787591934204, |
| "learning_rate": 0.0004111267605633803, |
| "loss": 1.7579, |
| "step": 12100 |
| }, |
| { |
| "epoch": 2.066326530612245, |
| "grad_norm": 0.4734228551387787, |
| "learning_rate": 0.0004106866197183099, |
| "loss": 1.7445, |
| "step": 12150 |
| }, |
| { |
| "epoch": 2.074829931972789, |
| "grad_norm": 0.4777526259422302, |
| "learning_rate": 0.00041024647887323947, |
| "loss": 1.7558, |
| "step": 12200 |
| }, |
| { |
| "epoch": 2.0833333333333335, |
| "grad_norm": 0.4509522020816803, |
| "learning_rate": 0.000409806338028169, |
| "loss": 1.754, |
| "step": 12250 |
| }, |
| { |
| "epoch": 2.0918367346938775, |
| "grad_norm": 0.4808884859085083, |
| "learning_rate": 0.0004093661971830986, |
| "loss": 1.7487, |
| "step": 12300 |
| }, |
| { |
| "epoch": 2.1003401360544216, |
| "grad_norm": 0.46674636006355286, |
| "learning_rate": 0.00040892605633802815, |
| "loss": 1.7428, |
| "step": 12350 |
| }, |
| { |
| "epoch": 2.108843537414966, |
| "grad_norm": 0.4768598973751068, |
| "learning_rate": 0.00040848591549295775, |
| "loss": 1.7372, |
| "step": 12400 |
| }, |
| { |
| "epoch": 2.11734693877551, |
| "grad_norm": 0.4708414673805237, |
| "learning_rate": 0.00040804577464788734, |
| "loss": 1.7458, |
| "step": 12450 |
| }, |
| { |
| "epoch": 2.1258503401360542, |
| "grad_norm": 0.47017523646354675, |
| "learning_rate": 0.0004076056338028169, |
| "loss": 1.7497, |
| "step": 12500 |
| }, |
| { |
| "epoch": 2.1343537414965987, |
| "grad_norm": 0.4945257306098938, |
| "learning_rate": 0.0004071654929577465, |
| "loss": 1.7451, |
| "step": 12550 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.4588756859302521, |
| "learning_rate": 0.00040672535211267607, |
| "loss": 1.7393, |
| "step": 12600 |
| }, |
| { |
| "epoch": 2.1513605442176873, |
| "grad_norm": 0.49352967739105225, |
| "learning_rate": 0.00040628521126760567, |
| "loss": 1.7366, |
| "step": 12650 |
| }, |
| { |
| "epoch": 2.1598639455782314, |
| "grad_norm": 0.47481125593185425, |
| "learning_rate": 0.0004058450704225352, |
| "loss": 1.7373, |
| "step": 12700 |
| }, |
| { |
| "epoch": 2.1683673469387754, |
| "grad_norm": 0.48250919580459595, |
| "learning_rate": 0.00040540492957746475, |
| "loss": 1.7409, |
| "step": 12750 |
| }, |
| { |
| "epoch": 2.17687074829932, |
| "grad_norm": 0.4601617157459259, |
| "learning_rate": 0.0004049647887323944, |
| "loss": 1.7413, |
| "step": 12800 |
| }, |
| { |
| "epoch": 2.185374149659864, |
| "grad_norm": 0.47254276275634766, |
| "learning_rate": 0.00040452464788732394, |
| "loss": 1.7495, |
| "step": 12850 |
| }, |
| { |
| "epoch": 2.193877551020408, |
| "grad_norm": 0.48082679510116577, |
| "learning_rate": 0.00040408450704225354, |
| "loss": 1.7429, |
| "step": 12900 |
| }, |
| { |
| "epoch": 2.2023809523809526, |
| "grad_norm": 0.46352913975715637, |
| "learning_rate": 0.0004036443661971831, |
| "loss": 1.7401, |
| "step": 12950 |
| }, |
| { |
| "epoch": 2.2108843537414966, |
| "grad_norm": 0.4566120207309723, |
| "learning_rate": 0.00040321302816901414, |
| "loss": 1.7337, |
| "step": 13000 |
| }, |
| { |
| "epoch": 2.2108843537414966, |
| "eval_loss": 1.921891689300537, |
| "eval_runtime": 75.3847, |
| "eval_samples_per_second": 1244.802, |
| "eval_steps_per_second": 4.868, |
| "step": 13000 |
| }, |
| { |
| "epoch": 2.2193877551020407, |
| "grad_norm": 0.5124084949493408, |
| "learning_rate": 0.0004027728873239437, |
| "loss": 1.741, |
| "step": 13050 |
| }, |
| { |
| "epoch": 2.227891156462585, |
| "grad_norm": 0.4604070782661438, |
| "learning_rate": 0.00040233274647887327, |
| "loss": 1.7372, |
| "step": 13100 |
| }, |
| { |
| "epoch": 2.2363945578231292, |
| "grad_norm": 0.49075254797935486, |
| "learning_rate": 0.0004019014084507042, |
| "loss": 1.7411, |
| "step": 13150 |
| }, |
| { |
| "epoch": 2.2448979591836733, |
| "grad_norm": 0.4861091077327728, |
| "learning_rate": 0.00040146126760563376, |
| "loss": 1.7418, |
| "step": 13200 |
| }, |
| { |
| "epoch": 2.253401360544218, |
| "grad_norm": 0.47723260521888733, |
| "learning_rate": 0.0004010211267605634, |
| "loss": 1.7375, |
| "step": 13250 |
| }, |
| { |
| "epoch": 2.261904761904762, |
| "grad_norm": 0.4777429401874542, |
| "learning_rate": 0.00040058098591549295, |
| "loss": 1.7305, |
| "step": 13300 |
| }, |
| { |
| "epoch": 2.270408163265306, |
| "grad_norm": 0.4744473993778229, |
| "learning_rate": 0.00040014084507042255, |
| "loss": 1.7291, |
| "step": 13350 |
| }, |
| { |
| "epoch": 2.2789115646258504, |
| "grad_norm": 0.46367913484573364, |
| "learning_rate": 0.0003997007042253521, |
| "loss": 1.7305, |
| "step": 13400 |
| }, |
| { |
| "epoch": 2.2874149659863945, |
| "grad_norm": 0.4830113649368286, |
| "learning_rate": 0.00039926056338028174, |
| "loss": 1.7298, |
| "step": 13450 |
| }, |
| { |
| "epoch": 2.295918367346939, |
| "grad_norm": 0.48178356885910034, |
| "learning_rate": 0.0003988204225352113, |
| "loss": 1.715, |
| "step": 13500 |
| }, |
| { |
| "epoch": 2.304421768707483, |
| "grad_norm": 0.46279290318489075, |
| "learning_rate": 0.0003983802816901408, |
| "loss": 1.7265, |
| "step": 13550 |
| }, |
| { |
| "epoch": 2.312925170068027, |
| "grad_norm": 0.4898684620857239, |
| "learning_rate": 0.0003979401408450704, |
| "loss": 1.7257, |
| "step": 13600 |
| }, |
| { |
| "epoch": 2.3214285714285716, |
| "grad_norm": 0.47215622663497925, |
| "learning_rate": 0.0003975, |
| "loss": 1.7291, |
| "step": 13650 |
| }, |
| { |
| "epoch": 2.3299319727891157, |
| "grad_norm": 0.4575498104095459, |
| "learning_rate": 0.0003970598591549296, |
| "loss": 1.7165, |
| "step": 13700 |
| }, |
| { |
| "epoch": 2.3384353741496597, |
| "grad_norm": 0.4656940996646881, |
| "learning_rate": 0.00039661971830985915, |
| "loss": 1.7183, |
| "step": 13750 |
| }, |
| { |
| "epoch": 2.3469387755102042, |
| "grad_norm": 0.4670725464820862, |
| "learning_rate": 0.00039617957746478874, |
| "loss": 1.7217, |
| "step": 13800 |
| }, |
| { |
| "epoch": 2.3554421768707483, |
| "grad_norm": 0.47170403599739075, |
| "learning_rate": 0.00039573943661971834, |
| "loss": 1.7255, |
| "step": 13850 |
| }, |
| { |
| "epoch": 2.3639455782312924, |
| "grad_norm": 0.49639126658439636, |
| "learning_rate": 0.0003952992957746479, |
| "loss": 1.7246, |
| "step": 13900 |
| }, |
| { |
| "epoch": 2.372448979591837, |
| "grad_norm": 0.4941042959690094, |
| "learning_rate": 0.0003948591549295775, |
| "loss": 1.7183, |
| "step": 13950 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.4859507381916046, |
| "learning_rate": 0.000394419014084507, |
| "loss": 1.7239, |
| "step": 14000 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "eval_loss": 1.9032775163650513, |
| "eval_runtime": 75.3175, |
| "eval_samples_per_second": 1245.912, |
| "eval_steps_per_second": 4.873, |
| "step": 14000 |
| }, |
| { |
| "epoch": 2.389455782312925, |
| "grad_norm": 0.48601582646369934, |
| "learning_rate": 0.00039397887323943667, |
| "loss": 1.721, |
| "step": 14050 |
| }, |
| { |
| "epoch": 2.3979591836734695, |
| "grad_norm": 0.4685114920139313, |
| "learning_rate": 0.0003935387323943662, |
| "loss": 1.7155, |
| "step": 14100 |
| }, |
| { |
| "epoch": 2.4064625850340136, |
| "grad_norm": 0.44943785667419434, |
| "learning_rate": 0.0003930985915492958, |
| "loss": 1.7154, |
| "step": 14150 |
| }, |
| { |
| "epoch": 2.4149659863945576, |
| "grad_norm": 0.48964107036590576, |
| "learning_rate": 0.00039265845070422535, |
| "loss": 1.7123, |
| "step": 14200 |
| }, |
| { |
| "epoch": 2.423469387755102, |
| "grad_norm": 0.457769513130188, |
| "learning_rate": 0.00039221830985915494, |
| "loss": 1.7084, |
| "step": 14250 |
| }, |
| { |
| "epoch": 2.431972789115646, |
| "grad_norm": 0.5480780005455017, |
| "learning_rate": 0.00039177816901408454, |
| "loss": 1.7129, |
| "step": 14300 |
| }, |
| { |
| "epoch": 2.4404761904761907, |
| "grad_norm": 0.48546379804611206, |
| "learning_rate": 0.0003913380281690141, |
| "loss": 1.7078, |
| "step": 14350 |
| }, |
| { |
| "epoch": 2.4489795918367347, |
| "grad_norm": 0.46493494510650635, |
| "learning_rate": 0.00039089788732394367, |
| "loss": 1.7104, |
| "step": 14400 |
| }, |
| { |
| "epoch": 2.457482993197279, |
| "grad_norm": 0.4606137275695801, |
| "learning_rate": 0.0003904577464788732, |
| "loss": 1.7045, |
| "step": 14450 |
| }, |
| { |
| "epoch": 2.4659863945578233, |
| "grad_norm": 0.46123144030570984, |
| "learning_rate": 0.00039001760563380286, |
| "loss": 1.7047, |
| "step": 14500 |
| }, |
| { |
| "epoch": 2.4744897959183674, |
| "grad_norm": 0.4724046289920807, |
| "learning_rate": 0.0003895774647887324, |
| "loss": 1.711, |
| "step": 14550 |
| }, |
| { |
| "epoch": 2.4829931972789114, |
| "grad_norm": 0.47036615014076233, |
| "learning_rate": 0.00038913732394366195, |
| "loss": 1.7077, |
| "step": 14600 |
| }, |
| { |
| "epoch": 2.491496598639456, |
| "grad_norm": 0.465732216835022, |
| "learning_rate": 0.00038869718309859154, |
| "loss": 1.7041, |
| "step": 14650 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.4794946610927582, |
| "learning_rate": 0.00038825704225352114, |
| "loss": 1.7037, |
| "step": 14700 |
| }, |
| { |
| "epoch": 2.508503401360544, |
| "grad_norm": 0.47816696763038635, |
| "learning_rate": 0.00038781690140845073, |
| "loss": 1.7056, |
| "step": 14750 |
| }, |
| { |
| "epoch": 2.5170068027210886, |
| "grad_norm": 0.47812485694885254, |
| "learning_rate": 0.0003873767605633803, |
| "loss": 1.7063, |
| "step": 14800 |
| }, |
| { |
| "epoch": 2.5255102040816326, |
| "grad_norm": 0.4651651084423065, |
| "learning_rate": 0.00038693661971830987, |
| "loss": 1.693, |
| "step": 14850 |
| }, |
| { |
| "epoch": 2.534013605442177, |
| "grad_norm": 0.48417726159095764, |
| "learning_rate": 0.00038649647887323946, |
| "loss": 1.697, |
| "step": 14900 |
| }, |
| { |
| "epoch": 2.542517006802721, |
| "grad_norm": 0.4502563178539276, |
| "learning_rate": 0.000386056338028169, |
| "loss": 1.703, |
| "step": 14950 |
| }, |
| { |
| "epoch": 2.5510204081632653, |
| "grad_norm": 0.45644158124923706, |
| "learning_rate": 0.0003856161971830986, |
| "loss": 1.6959, |
| "step": 15000 |
| }, |
| { |
| "epoch": 2.5510204081632653, |
| "eval_loss": 1.8780722618103027, |
| "eval_runtime": 75.398, |
| "eval_samples_per_second": 1244.582, |
| "eval_steps_per_second": 4.868, |
| "step": 15000 |
| }, |
| { |
| "epoch": 2.5595238095238093, |
| "grad_norm": 0.4715486764907837, |
| "learning_rate": 0.00038517605633802814, |
| "loss": 1.7015, |
| "step": 15050 |
| }, |
| { |
| "epoch": 2.568027210884354, |
| "grad_norm": 0.44938182830810547, |
| "learning_rate": 0.0003847359154929578, |
| "loss": 1.6954, |
| "step": 15100 |
| }, |
| { |
| "epoch": 2.576530612244898, |
| "grad_norm": 0.4778023660182953, |
| "learning_rate": 0.00038429577464788733, |
| "loss": 1.6973, |
| "step": 15150 |
| }, |
| { |
| "epoch": 2.5850340136054424, |
| "grad_norm": 0.4602490961551666, |
| "learning_rate": 0.00038385563380281693, |
| "loss": 1.7041, |
| "step": 15200 |
| }, |
| { |
| "epoch": 2.5935374149659864, |
| "grad_norm": 0.4905562400817871, |
| "learning_rate": 0.0003834242957746479, |
| "loss": 1.6929, |
| "step": 15250 |
| }, |
| { |
| "epoch": 2.6020408163265305, |
| "grad_norm": 0.47374477982521057, |
| "learning_rate": 0.00038298415492957747, |
| "loss": 1.6963, |
| "step": 15300 |
| }, |
| { |
| "epoch": 2.6105442176870746, |
| "grad_norm": 0.45640283823013306, |
| "learning_rate": 0.00038254401408450707, |
| "loss": 1.6941, |
| "step": 15350 |
| }, |
| { |
| "epoch": 2.619047619047619, |
| "grad_norm": 0.46338459849357605, |
| "learning_rate": 0.0003821038732394366, |
| "loss": 1.6898, |
| "step": 15400 |
| }, |
| { |
| "epoch": 2.627551020408163, |
| "grad_norm": 0.458769291639328, |
| "learning_rate": 0.0003816637323943662, |
| "loss": 1.6897, |
| "step": 15450 |
| }, |
| { |
| "epoch": 2.6360544217687076, |
| "grad_norm": 0.44556647539138794, |
| "learning_rate": 0.00038122359154929575, |
| "loss": 1.6918, |
| "step": 15500 |
| }, |
| { |
| "epoch": 2.6445578231292517, |
| "grad_norm": 0.48705628514289856, |
| "learning_rate": 0.0003807834507042254, |
| "loss": 1.6948, |
| "step": 15550 |
| }, |
| { |
| "epoch": 2.6530612244897958, |
| "grad_norm": 0.46732062101364136, |
| "learning_rate": 0.00038034330985915494, |
| "loss": 1.6884, |
| "step": 15600 |
| }, |
| { |
| "epoch": 2.6615646258503403, |
| "grad_norm": 0.44862720370292664, |
| "learning_rate": 0.0003799031690140845, |
| "loss": 1.6921, |
| "step": 15650 |
| }, |
| { |
| "epoch": 2.6700680272108843, |
| "grad_norm": 0.4507545828819275, |
| "learning_rate": 0.0003794630281690141, |
| "loss": 1.6841, |
| "step": 15700 |
| }, |
| { |
| "epoch": 2.678571428571429, |
| "grad_norm": 0.4554910659790039, |
| "learning_rate": 0.00037902288732394367, |
| "loss": 1.6898, |
| "step": 15750 |
| }, |
| { |
| "epoch": 2.687074829931973, |
| "grad_norm": 0.47400516271591187, |
| "learning_rate": 0.00037858274647887326, |
| "loss": 1.6875, |
| "step": 15800 |
| }, |
| { |
| "epoch": 2.695578231292517, |
| "grad_norm": 0.46053117513656616, |
| "learning_rate": 0.0003781426056338028, |
| "loss": 1.6791, |
| "step": 15850 |
| }, |
| { |
| "epoch": 2.704081632653061, |
| "grad_norm": 0.4834235608577728, |
| "learning_rate": 0.0003777024647887324, |
| "loss": 1.6883, |
| "step": 15900 |
| }, |
| { |
| "epoch": 2.7125850340136055, |
| "grad_norm": 0.648454487323761, |
| "learning_rate": 0.000377262323943662, |
| "loss": 1.686, |
| "step": 15950 |
| }, |
| { |
| "epoch": 2.7210884353741496, |
| "grad_norm": 0.46427688002586365, |
| "learning_rate": 0.0003768221830985916, |
| "loss": 1.6771, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.7210884353741496, |
| "eval_loss": 1.8668512105941772, |
| "eval_runtime": 75.2617, |
| "eval_samples_per_second": 1246.836, |
| "eval_steps_per_second": 4.876, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.729591836734694, |
| "grad_norm": 0.4748222827911377, |
| "learning_rate": 0.00037638204225352113, |
| "loss": 1.6826, |
| "step": 16050 |
| }, |
| { |
| "epoch": 2.738095238095238, |
| "grad_norm": 0.4745228588581085, |
| "learning_rate": 0.0003759419014084507, |
| "loss": 1.6837, |
| "step": 16100 |
| }, |
| { |
| "epoch": 2.746598639455782, |
| "grad_norm": 0.4836510419845581, |
| "learning_rate": 0.0003755017605633803, |
| "loss": 1.6803, |
| "step": 16150 |
| }, |
| { |
| "epoch": 2.7551020408163263, |
| "grad_norm": 0.48129406571388245, |
| "learning_rate": 0.00037506161971830986, |
| "loss": 1.6804, |
| "step": 16200 |
| }, |
| { |
| "epoch": 2.7636054421768708, |
| "grad_norm": 0.45295995473861694, |
| "learning_rate": 0.00037462147887323946, |
| "loss": 1.6763, |
| "step": 16250 |
| }, |
| { |
| "epoch": 2.772108843537415, |
| "grad_norm": 0.45548656582832336, |
| "learning_rate": 0.000374181338028169, |
| "loss": 1.6793, |
| "step": 16300 |
| }, |
| { |
| "epoch": 2.7806122448979593, |
| "grad_norm": 0.46522030234336853, |
| "learning_rate": 0.00037374119718309865, |
| "loss": 1.6792, |
| "step": 16350 |
| }, |
| { |
| "epoch": 2.7891156462585034, |
| "grad_norm": 0.46659553050994873, |
| "learning_rate": 0.0003733010563380282, |
| "loss": 1.6705, |
| "step": 16400 |
| }, |
| { |
| "epoch": 2.7976190476190474, |
| "grad_norm": 0.4808714985847473, |
| "learning_rate": 0.00037286091549295773, |
| "loss": 1.6751, |
| "step": 16450 |
| }, |
| { |
| "epoch": 2.806122448979592, |
| "grad_norm": 0.46006572246551514, |
| "learning_rate": 0.00037242077464788733, |
| "loss": 1.6708, |
| "step": 16500 |
| }, |
| { |
| "epoch": 2.814625850340136, |
| "grad_norm": 0.4552723467350006, |
| "learning_rate": 0.0003719806338028169, |
| "loss": 1.6765, |
| "step": 16550 |
| }, |
| { |
| "epoch": 2.8231292517006805, |
| "grad_norm": 0.4788212180137634, |
| "learning_rate": 0.0003715404929577465, |
| "loss": 1.6774, |
| "step": 16600 |
| }, |
| { |
| "epoch": 2.8316326530612246, |
| "grad_norm": 0.4770471453666687, |
| "learning_rate": 0.00037110035211267606, |
| "loss": 1.6703, |
| "step": 16650 |
| }, |
| { |
| "epoch": 2.8401360544217686, |
| "grad_norm": 0.4665678143501282, |
| "learning_rate": 0.00037066021126760566, |
| "loss": 1.6754, |
| "step": 16700 |
| }, |
| { |
| "epoch": 2.8486394557823127, |
| "grad_norm": 0.46680372953414917, |
| "learning_rate": 0.0003702200704225352, |
| "loss": 1.67, |
| "step": 16750 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.4522228240966797, |
| "learning_rate": 0.0003697799295774648, |
| "loss": 1.6718, |
| "step": 16800 |
| }, |
| { |
| "epoch": 2.8656462585034013, |
| "grad_norm": 0.4507662057876587, |
| "learning_rate": 0.0003693397887323944, |
| "loss": 1.6713, |
| "step": 16850 |
| }, |
| { |
| "epoch": 2.8741496598639458, |
| "grad_norm": 0.45078015327453613, |
| "learning_rate": 0.00036889964788732393, |
| "loss": 1.6641, |
| "step": 16900 |
| }, |
| { |
| "epoch": 2.88265306122449, |
| "grad_norm": 0.46813851594924927, |
| "learning_rate": 0.0003684595070422535, |
| "loss": 1.6684, |
| "step": 16950 |
| }, |
| { |
| "epoch": 2.891156462585034, |
| "grad_norm": 0.4676934480667114, |
| "learning_rate": 0.0003680193661971831, |
| "loss": 1.6678, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.891156462585034, |
| "eval_loss": 1.8514577150344849, |
| "eval_runtime": 75.2869, |
| "eval_samples_per_second": 1246.419, |
| "eval_steps_per_second": 4.875, |
| "step": 17000 |
| }, |
| { |
| "epoch": 2.8996598639455784, |
| "grad_norm": 0.4356513023376465, |
| "learning_rate": 0.0003675792253521127, |
| "loss": 1.6586, |
| "step": 17050 |
| }, |
| { |
| "epoch": 2.9081632653061225, |
| "grad_norm": 0.44688156247138977, |
| "learning_rate": 0.00036713908450704226, |
| "loss": 1.6596, |
| "step": 17100 |
| }, |
| { |
| "epoch": 2.9166666666666665, |
| "grad_norm": 0.46431487798690796, |
| "learning_rate": 0.0003666989436619718, |
| "loss": 1.6682, |
| "step": 17150 |
| }, |
| { |
| "epoch": 2.925170068027211, |
| "grad_norm": 0.44890233874320984, |
| "learning_rate": 0.00036625880281690145, |
| "loss": 1.6594, |
| "step": 17200 |
| }, |
| { |
| "epoch": 2.933673469387755, |
| "grad_norm": 0.45430788397789, |
| "learning_rate": 0.000365818661971831, |
| "loss": 1.6617, |
| "step": 17250 |
| }, |
| { |
| "epoch": 2.942176870748299, |
| "grad_norm": 0.4674071967601776, |
| "learning_rate": 0.0003653785211267606, |
| "loss": 1.6583, |
| "step": 17300 |
| }, |
| { |
| "epoch": 2.9506802721088436, |
| "grad_norm": 1.025103211402893, |
| "learning_rate": 0.0003649383802816901, |
| "loss": 1.6637, |
| "step": 17350 |
| }, |
| { |
| "epoch": 2.9591836734693877, |
| "grad_norm": 0.5043914318084717, |
| "learning_rate": 0.0003645070422535212, |
| "loss": 1.6673, |
| "step": 17400 |
| }, |
| { |
| "epoch": 2.967687074829932, |
| "grad_norm": 0.4789212942123413, |
| "learning_rate": 0.0003640669014084507, |
| "loss": 1.665, |
| "step": 17450 |
| }, |
| { |
| "epoch": 2.9761904761904763, |
| "grad_norm": 0.458146870136261, |
| "learning_rate": 0.00036362676056338027, |
| "loss": 1.6659, |
| "step": 17500 |
| }, |
| { |
| "epoch": 2.9846938775510203, |
| "grad_norm": 0.45948123931884766, |
| "learning_rate": 0.00036319542253521127, |
| "loss": 1.6559, |
| "step": 17550 |
| }, |
| { |
| "epoch": 2.9931972789115644, |
| "grad_norm": 0.44673070311546326, |
| "learning_rate": 0.0003627552816901408, |
| "loss": 1.6596, |
| "step": 17600 |
| }, |
| { |
| "epoch": 3.001700680272109, |
| "grad_norm": 0.46580860018730164, |
| "learning_rate": 0.00036231514084507046, |
| "loss": 1.6646, |
| "step": 17650 |
| }, |
| { |
| "epoch": 3.010204081632653, |
| "grad_norm": 0.48107218742370605, |
| "learning_rate": 0.000361875, |
| "loss": 1.6516, |
| "step": 17700 |
| }, |
| { |
| "epoch": 3.0187074829931975, |
| "grad_norm": 0.4705383777618408, |
| "learning_rate": 0.0003614348591549296, |
| "loss": 1.6506, |
| "step": 17750 |
| }, |
| { |
| "epoch": 3.0272108843537415, |
| "grad_norm": 0.4649271070957184, |
| "learning_rate": 0.00036099471830985914, |
| "loss": 1.6523, |
| "step": 17800 |
| }, |
| { |
| "epoch": 3.0357142857142856, |
| "grad_norm": 0.4697044789791107, |
| "learning_rate": 0.00036055457746478873, |
| "loss": 1.6556, |
| "step": 17850 |
| }, |
| { |
| "epoch": 3.04421768707483, |
| "grad_norm": 0.441441148519516, |
| "learning_rate": 0.00036011443661971833, |
| "loss": 1.6483, |
| "step": 17900 |
| }, |
| { |
| "epoch": 3.052721088435374, |
| "grad_norm": 0.4512791037559509, |
| "learning_rate": 0.00035967429577464787, |
| "loss": 1.655, |
| "step": 17950 |
| }, |
| { |
| "epoch": 3.061224489795918, |
| "grad_norm": 0.47064149379730225, |
| "learning_rate": 0.00035923415492957746, |
| "loss": 1.6493, |
| "step": 18000 |
| }, |
| { |
| "epoch": 3.061224489795918, |
| "eval_loss": 1.8354525566101074, |
| "eval_runtime": 75.3083, |
| "eval_samples_per_second": 1246.064, |
| "eval_steps_per_second": 4.873, |
| "step": 18000 |
| }, |
| { |
| "epoch": 3.0697278911564627, |
| "grad_norm": 0.4441589415073395, |
| "learning_rate": 0.00035879401408450706, |
| "loss": 1.6469, |
| "step": 18050 |
| }, |
| { |
| "epoch": 3.078231292517007, |
| "grad_norm": 0.46877768635749817, |
| "learning_rate": 0.00035835387323943666, |
| "loss": 1.6449, |
| "step": 18100 |
| }, |
| { |
| "epoch": 3.086734693877551, |
| "grad_norm": 0.9494704008102417, |
| "learning_rate": 0.0003579137323943662, |
| "loss": 1.6426, |
| "step": 18150 |
| }, |
| { |
| "epoch": 3.0952380952380953, |
| "grad_norm": 0.4520435631275177, |
| "learning_rate": 0.00035747359154929574, |
| "loss": 1.6521, |
| "step": 18200 |
| }, |
| { |
| "epoch": 3.1037414965986394, |
| "grad_norm": 0.46680590510368347, |
| "learning_rate": 0.0003570334507042254, |
| "loss": 1.655, |
| "step": 18250 |
| }, |
| { |
| "epoch": 3.1122448979591835, |
| "grad_norm": 0.47364234924316406, |
| "learning_rate": 0.00035659330985915493, |
| "loss": 1.6404, |
| "step": 18300 |
| }, |
| { |
| "epoch": 3.120748299319728, |
| "grad_norm": 0.47683462500572205, |
| "learning_rate": 0.0003561531690140845, |
| "loss": 1.6432, |
| "step": 18350 |
| }, |
| { |
| "epoch": 3.129251700680272, |
| "grad_norm": 0.4633096158504486, |
| "learning_rate": 0.00035571302816901407, |
| "loss": 1.6459, |
| "step": 18400 |
| }, |
| { |
| "epoch": 3.137755102040816, |
| "grad_norm": 0.4648244380950928, |
| "learning_rate": 0.0003552728873239437, |
| "loss": 1.6392, |
| "step": 18450 |
| }, |
| { |
| "epoch": 3.1462585034013606, |
| "grad_norm": 0.46435174345970154, |
| "learning_rate": 0.00035483274647887326, |
| "loss": 1.6396, |
| "step": 18500 |
| }, |
| { |
| "epoch": 3.1547619047619047, |
| "grad_norm": 0.45293596386909485, |
| "learning_rate": 0.0003543926056338028, |
| "loss": 1.6413, |
| "step": 18550 |
| }, |
| { |
| "epoch": 3.163265306122449, |
| "grad_norm": 0.46072807908058167, |
| "learning_rate": 0.0003539524647887324, |
| "loss": 1.6379, |
| "step": 18600 |
| }, |
| { |
| "epoch": 3.171768707482993, |
| "grad_norm": 0.45269516110420227, |
| "learning_rate": 0.000353512323943662, |
| "loss": 1.6409, |
| "step": 18650 |
| }, |
| { |
| "epoch": 3.1802721088435373, |
| "grad_norm": 0.4461529850959778, |
| "learning_rate": 0.0003530721830985916, |
| "loss": 1.6373, |
| "step": 18700 |
| }, |
| { |
| "epoch": 3.188775510204082, |
| "grad_norm": 0.45222413539886475, |
| "learning_rate": 0.0003526320422535211, |
| "loss": 1.6317, |
| "step": 18750 |
| }, |
| { |
| "epoch": 3.197278911564626, |
| "grad_norm": 0.44513842463493347, |
| "learning_rate": 0.0003521919014084507, |
| "loss": 1.6365, |
| "step": 18800 |
| }, |
| { |
| "epoch": 3.20578231292517, |
| "grad_norm": 0.4705728590488434, |
| "learning_rate": 0.00035175176056338026, |
| "loss": 1.6434, |
| "step": 18850 |
| }, |
| { |
| "epoch": 3.2142857142857144, |
| "grad_norm": 0.47750329971313477, |
| "learning_rate": 0.00035131161971830986, |
| "loss": 1.6337, |
| "step": 18900 |
| }, |
| { |
| "epoch": 3.2227891156462585, |
| "grad_norm": 0.4608128070831299, |
| "learning_rate": 0.00035087147887323945, |
| "loss": 1.642, |
| "step": 18950 |
| }, |
| { |
| "epoch": 3.2312925170068025, |
| "grad_norm": 0.46269285678863525, |
| "learning_rate": 0.000350431338028169, |
| "loss": 1.6326, |
| "step": 19000 |
| }, |
| { |
| "epoch": 3.2312925170068025, |
| "eval_loss": 1.8231135606765747, |
| "eval_runtime": 75.3579, |
| "eval_samples_per_second": 1245.245, |
| "eval_steps_per_second": 4.87, |
| "step": 19000 |
| }, |
| { |
| "epoch": 3.239795918367347, |
| "grad_norm": 0.44600149989128113, |
| "learning_rate": 0.0003499911971830986, |
| "loss": 1.6372, |
| "step": 19050 |
| }, |
| { |
| "epoch": 3.248299319727891, |
| "grad_norm": 0.45678427815437317, |
| "learning_rate": 0.0003495510563380282, |
| "loss": 1.6348, |
| "step": 19100 |
| }, |
| { |
| "epoch": 3.2568027210884356, |
| "grad_norm": 0.45453667640686035, |
| "learning_rate": 0.0003491109154929578, |
| "loss": 1.6422, |
| "step": 19150 |
| }, |
| { |
| "epoch": 3.2653061224489797, |
| "grad_norm": 0.4784705936908722, |
| "learning_rate": 0.0003486707746478873, |
| "loss": 1.6382, |
| "step": 19200 |
| }, |
| { |
| "epoch": 3.2738095238095237, |
| "grad_norm": 0.4635883569717407, |
| "learning_rate": 0.00034823063380281686, |
| "loss": 1.6388, |
| "step": 19250 |
| }, |
| { |
| "epoch": 3.282312925170068, |
| "grad_norm": 0.45983946323394775, |
| "learning_rate": 0.0003477904929577465, |
| "loss": 1.6408, |
| "step": 19300 |
| }, |
| { |
| "epoch": 3.2908163265306123, |
| "grad_norm": 0.4456377923488617, |
| "learning_rate": 0.00034735035211267605, |
| "loss": 1.6388, |
| "step": 19350 |
| }, |
| { |
| "epoch": 3.2993197278911564, |
| "grad_norm": 0.4579452872276306, |
| "learning_rate": 0.00034691021126760565, |
| "loss": 1.6283, |
| "step": 19400 |
| }, |
| { |
| "epoch": 3.307823129251701, |
| "grad_norm": 0.43990224599838257, |
| "learning_rate": 0.0003464700704225352, |
| "loss": 1.6327, |
| "step": 19450 |
| }, |
| { |
| "epoch": 3.316326530612245, |
| "grad_norm": 0.4531456530094147, |
| "learning_rate": 0.00034602992957746484, |
| "loss": 1.6302, |
| "step": 19500 |
| }, |
| { |
| "epoch": 3.324829931972789, |
| "grad_norm": 0.4910786747932434, |
| "learning_rate": 0.0003455897887323944, |
| "loss": 1.6275, |
| "step": 19550 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.46175333857536316, |
| "learning_rate": 0.0003451496478873239, |
| "loss": 1.6349, |
| "step": 19600 |
| }, |
| { |
| "epoch": 3.3418367346938775, |
| "grad_norm": 0.4583819508552551, |
| "learning_rate": 0.0003447095070422535, |
| "loss": 1.6279, |
| "step": 19650 |
| }, |
| { |
| "epoch": 3.3503401360544216, |
| "grad_norm": 0.4596096873283386, |
| "learning_rate": 0.0003442693661971831, |
| "loss": 1.6203, |
| "step": 19700 |
| }, |
| { |
| "epoch": 3.358843537414966, |
| "grad_norm": 0.476151704788208, |
| "learning_rate": 0.0003438292253521127, |
| "loss": 1.63, |
| "step": 19750 |
| }, |
| { |
| "epoch": 3.36734693877551, |
| "grad_norm": 0.45185205340385437, |
| "learning_rate": 0.00034338908450704225, |
| "loss": 1.6228, |
| "step": 19800 |
| }, |
| { |
| "epoch": 3.3758503401360542, |
| "grad_norm": 0.4486692249774933, |
| "learning_rate": 0.00034294894366197184, |
| "loss": 1.623, |
| "step": 19850 |
| }, |
| { |
| "epoch": 3.3843537414965987, |
| "grad_norm": 0.4323517084121704, |
| "learning_rate": 0.0003425088028169014, |
| "loss": 1.6203, |
| "step": 19900 |
| }, |
| { |
| "epoch": 3.392857142857143, |
| "grad_norm": 0.4673045575618744, |
| "learning_rate": 0.00034206866197183103, |
| "loss": 1.6229, |
| "step": 19950 |
| }, |
| { |
| "epoch": 3.4013605442176873, |
| "grad_norm": 0.5606664419174194, |
| "learning_rate": 0.0003416285211267606, |
| "loss": 1.6248, |
| "step": 20000 |
| }, |
| { |
| "epoch": 3.4013605442176873, |
| "eval_loss": 1.8098527193069458, |
| "eval_runtime": 75.3357, |
| "eval_samples_per_second": 1245.611, |
| "eval_steps_per_second": 4.872, |
| "step": 20000 |
| }, |
| { |
| "epoch": 3.4098639455782314, |
| "grad_norm": 0.4416612386703491, |
| "learning_rate": 0.0003411883802816901, |
| "loss": 1.6244, |
| "step": 20050 |
| }, |
| { |
| "epoch": 3.4183673469387754, |
| "grad_norm": 0.46314534544944763, |
| "learning_rate": 0.0003407482394366197, |
| "loss": 1.619, |
| "step": 20100 |
| }, |
| { |
| "epoch": 3.4268707482993195, |
| "grad_norm": 0.46182000637054443, |
| "learning_rate": 0.0003403080985915493, |
| "loss": 1.6198, |
| "step": 20150 |
| }, |
| { |
| "epoch": 3.435374149659864, |
| "grad_norm": 0.46425795555114746, |
| "learning_rate": 0.0003398679577464789, |
| "loss": 1.6198, |
| "step": 20200 |
| }, |
| { |
| "epoch": 3.443877551020408, |
| "grad_norm": 0.4472617208957672, |
| "learning_rate": 0.00033942781690140844, |
| "loss": 1.6208, |
| "step": 20250 |
| }, |
| { |
| "epoch": 3.4523809523809526, |
| "grad_norm": 0.4484054446220398, |
| "learning_rate": 0.000338987676056338, |
| "loss": 1.617, |
| "step": 20300 |
| }, |
| { |
| "epoch": 3.4608843537414966, |
| "grad_norm": 0.46471381187438965, |
| "learning_rate": 0.00033855633802816904, |
| "loss": 1.6235, |
| "step": 20350 |
| }, |
| { |
| "epoch": 3.4693877551020407, |
| "grad_norm": 0.471675306558609, |
| "learning_rate": 0.0003381161971830986, |
| "loss": 1.6211, |
| "step": 20400 |
| }, |
| { |
| "epoch": 3.477891156462585, |
| "grad_norm": 0.4538419246673584, |
| "learning_rate": 0.0003376760563380282, |
| "loss": 1.6159, |
| "step": 20450 |
| }, |
| { |
| "epoch": 3.4863945578231292, |
| "grad_norm": 0.4715994894504547, |
| "learning_rate": 0.0003372359154929577, |
| "loss": 1.6155, |
| "step": 20500 |
| }, |
| { |
| "epoch": 3.4948979591836733, |
| "grad_norm": 0.44585472345352173, |
| "learning_rate": 0.00033679577464788737, |
| "loss": 1.6169, |
| "step": 20550 |
| }, |
| { |
| "epoch": 3.503401360544218, |
| "grad_norm": 0.4732266962528229, |
| "learning_rate": 0.0003363556338028169, |
| "loss": 1.6175, |
| "step": 20600 |
| }, |
| { |
| "epoch": 3.511904761904762, |
| "grad_norm": 0.4438916742801666, |
| "learning_rate": 0.0003359154929577465, |
| "loss": 1.6179, |
| "step": 20650 |
| }, |
| { |
| "epoch": 3.520408163265306, |
| "grad_norm": 0.46406659483909607, |
| "learning_rate": 0.00033547535211267605, |
| "loss": 1.6169, |
| "step": 20700 |
| }, |
| { |
| "epoch": 3.5289115646258504, |
| "grad_norm": 0.438435435295105, |
| "learning_rate": 0.00033503521126760564, |
| "loss": 1.6155, |
| "step": 20750 |
| }, |
| { |
| "epoch": 3.5374149659863945, |
| "grad_norm": 0.4469165503978729, |
| "learning_rate": 0.00033459507042253524, |
| "loss": 1.6124, |
| "step": 20800 |
| }, |
| { |
| "epoch": 3.545918367346939, |
| "grad_norm": 0.4506942927837372, |
| "learning_rate": 0.0003341549295774648, |
| "loss": 1.6128, |
| "step": 20850 |
| }, |
| { |
| "epoch": 3.554421768707483, |
| "grad_norm": 0.4475231468677521, |
| "learning_rate": 0.0003337147887323944, |
| "loss": 1.6118, |
| "step": 20900 |
| }, |
| { |
| "epoch": 3.562925170068027, |
| "grad_norm": 0.45778709650039673, |
| "learning_rate": 0.0003332746478873239, |
| "loss": 1.6098, |
| "step": 20950 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 0.44058629870414734, |
| "learning_rate": 0.00033283450704225357, |
| "loss": 1.6153, |
| "step": 21000 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "eval_loss": 1.791085124015808, |
| "eval_runtime": 75.3212, |
| "eval_samples_per_second": 1245.85, |
| "eval_steps_per_second": 4.872, |
| "step": 21000 |
| }, |
| { |
| "epoch": 3.5799319727891157, |
| "grad_norm": 0.44585561752319336, |
| "learning_rate": 0.0003323943661971831, |
| "loss": 1.6093, |
| "step": 21050 |
| }, |
| { |
| "epoch": 3.5884353741496597, |
| "grad_norm": 0.4633313715457916, |
| "learning_rate": 0.00033195422535211265, |
| "loss": 1.608, |
| "step": 21100 |
| }, |
| { |
| "epoch": 3.5969387755102042, |
| "grad_norm": 0.4513951539993286, |
| "learning_rate": 0.00033151408450704224, |
| "loss": 1.6066, |
| "step": 21150 |
| }, |
| { |
| "epoch": 3.6054421768707483, |
| "grad_norm": 0.44720685482025146, |
| "learning_rate": 0.00033107394366197184, |
| "loss": 1.6104, |
| "step": 21200 |
| }, |
| { |
| "epoch": 3.6139455782312924, |
| "grad_norm": 0.4755708873271942, |
| "learning_rate": 0.00033063380281690144, |
| "loss": 1.6138, |
| "step": 21250 |
| }, |
| { |
| "epoch": 3.622448979591837, |
| "grad_norm": 0.46023187041282654, |
| "learning_rate": 0.000330193661971831, |
| "loss": 1.6087, |
| "step": 21300 |
| }, |
| { |
| "epoch": 3.630952380952381, |
| "grad_norm": 0.45014435052871704, |
| "learning_rate": 0.00032975352112676057, |
| "loss": 1.606, |
| "step": 21350 |
| }, |
| { |
| "epoch": 3.6394557823129254, |
| "grad_norm": 0.440222829580307, |
| "learning_rate": 0.00032931338028169017, |
| "loss": 1.606, |
| "step": 21400 |
| }, |
| { |
| "epoch": 3.6479591836734695, |
| "grad_norm": 0.46397948265075684, |
| "learning_rate": 0.0003288732394366197, |
| "loss": 1.6085, |
| "step": 21450 |
| }, |
| { |
| "epoch": 3.6564625850340136, |
| "grad_norm": 0.5181043744087219, |
| "learning_rate": 0.0003284330985915493, |
| "loss": 1.6098, |
| "step": 21500 |
| }, |
| { |
| "epoch": 3.6649659863945576, |
| "grad_norm": 0.4351687431335449, |
| "learning_rate": 0.00032799295774647884, |
| "loss": 1.6087, |
| "step": 21550 |
| }, |
| { |
| "epoch": 3.673469387755102, |
| "grad_norm": 0.4317842721939087, |
| "learning_rate": 0.0003275528169014085, |
| "loss": 1.6068, |
| "step": 21600 |
| }, |
| { |
| "epoch": 3.681972789115646, |
| "grad_norm": 0.5115000009536743, |
| "learning_rate": 0.00032711267605633804, |
| "loss": 1.5992, |
| "step": 21650 |
| }, |
| { |
| "epoch": 3.6904761904761907, |
| "grad_norm": 0.4375057816505432, |
| "learning_rate": 0.00032667253521126763, |
| "loss": 1.6026, |
| "step": 21700 |
| }, |
| { |
| "epoch": 3.6989795918367347, |
| "grad_norm": 0.4488801956176758, |
| "learning_rate": 0.00032623239436619717, |
| "loss": 1.6057, |
| "step": 21750 |
| }, |
| { |
| "epoch": 3.707482993197279, |
| "grad_norm": 0.46254992485046387, |
| "learning_rate": 0.00032579225352112677, |
| "loss": 1.6031, |
| "step": 21800 |
| }, |
| { |
| "epoch": 3.715986394557823, |
| "grad_norm": 0.43931296467781067, |
| "learning_rate": 0.00032535211267605636, |
| "loss": 1.6035, |
| "step": 21850 |
| }, |
| { |
| "epoch": 3.7244897959183674, |
| "grad_norm": 0.4594497084617615, |
| "learning_rate": 0.0003249119718309859, |
| "loss": 1.6051, |
| "step": 21900 |
| }, |
| { |
| "epoch": 3.7329931972789114, |
| "grad_norm": 0.45316246151924133, |
| "learning_rate": 0.0003244718309859155, |
| "loss": 1.5937, |
| "step": 21950 |
| }, |
| { |
| "epoch": 3.741496598639456, |
| "grad_norm": 0.4681023359298706, |
| "learning_rate": 0.0003240316901408451, |
| "loss": 1.6015, |
| "step": 22000 |
| }, |
| { |
| "epoch": 3.741496598639456, |
| "eval_loss": 1.7835361957550049, |
| "eval_runtime": 75.3121, |
| "eval_samples_per_second": 1246.002, |
| "eval_steps_per_second": 4.873, |
| "step": 22000 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 0.5095068216323853, |
| "learning_rate": 0.0003235915492957747, |
| "loss": 1.6031, |
| "step": 22050 |
| }, |
| { |
| "epoch": 3.758503401360544, |
| "grad_norm": 0.43521663546562195, |
| "learning_rate": 0.00032315140845070423, |
| "loss": 1.6038, |
| "step": 22100 |
| }, |
| { |
| "epoch": 3.7670068027210886, |
| "grad_norm": 0.4796842932701111, |
| "learning_rate": 0.00032271126760563377, |
| "loss": 1.6061, |
| "step": 22150 |
| }, |
| { |
| "epoch": 3.7755102040816326, |
| "grad_norm": 0.45405638217926025, |
| "learning_rate": 0.00032227112676056337, |
| "loss": 1.5938, |
| "step": 22200 |
| }, |
| { |
| "epoch": 3.784013605442177, |
| "grad_norm": 0.46786293387413025, |
| "learning_rate": 0.00032183098591549296, |
| "loss": 1.6031, |
| "step": 22250 |
| }, |
| { |
| "epoch": 3.792517006802721, |
| "grad_norm": 0.43882298469543457, |
| "learning_rate": 0.00032139084507042256, |
| "loss": 1.6018, |
| "step": 22300 |
| }, |
| { |
| "epoch": 3.8010204081632653, |
| "grad_norm": 0.45325416326522827, |
| "learning_rate": 0.0003209507042253521, |
| "loss": 1.5905, |
| "step": 22350 |
| }, |
| { |
| "epoch": 3.8095238095238093, |
| "grad_norm": 0.4605984687805176, |
| "learning_rate": 0.0003205105633802817, |
| "loss": 1.596, |
| "step": 22400 |
| }, |
| { |
| "epoch": 3.818027210884354, |
| "grad_norm": 0.5207810401916504, |
| "learning_rate": 0.0003200704225352113, |
| "loss": 1.6048, |
| "step": 22450 |
| }, |
| { |
| "epoch": 3.826530612244898, |
| "grad_norm": 0.45419174432754517, |
| "learning_rate": 0.00031963028169014083, |
| "loss": 1.5972, |
| "step": 22500 |
| }, |
| { |
| "epoch": 3.8350340136054424, |
| "grad_norm": 0.46196499466896057, |
| "learning_rate": 0.00031919014084507043, |
| "loss": 1.5952, |
| "step": 22550 |
| }, |
| { |
| "epoch": 3.8435374149659864, |
| "grad_norm": 0.48807084560394287, |
| "learning_rate": 0.00031874999999999997, |
| "loss": 1.6006, |
| "step": 22600 |
| }, |
| { |
| "epoch": 3.8520408163265305, |
| "grad_norm": 0.44178763031959534, |
| "learning_rate": 0.000318318661971831, |
| "loss": 1.6006, |
| "step": 22650 |
| }, |
| { |
| "epoch": 3.8605442176870746, |
| "grad_norm": 0.4443242847919464, |
| "learning_rate": 0.00031787852112676057, |
| "loss": 1.5875, |
| "step": 22700 |
| }, |
| { |
| "epoch": 3.869047619047619, |
| "grad_norm": 0.45137760043144226, |
| "learning_rate": 0.00031743838028169016, |
| "loss": 1.5874, |
| "step": 22750 |
| }, |
| { |
| "epoch": 3.877551020408163, |
| "grad_norm": 0.44187167286872864, |
| "learning_rate": 0.0003169982394366197, |
| "loss": 1.5935, |
| "step": 22800 |
| }, |
| { |
| "epoch": 3.8860544217687076, |
| "grad_norm": 0.46749913692474365, |
| "learning_rate": 0.0003165580985915493, |
| "loss": 1.5918, |
| "step": 22850 |
| }, |
| { |
| "epoch": 3.8945578231292517, |
| "grad_norm": 0.45157769322395325, |
| "learning_rate": 0.0003161179577464789, |
| "loss": 1.5965, |
| "step": 22900 |
| }, |
| { |
| "epoch": 3.9030612244897958, |
| "grad_norm": 0.6856850981712341, |
| "learning_rate": 0.00031567781690140844, |
| "loss": 1.5929, |
| "step": 22950 |
| }, |
| { |
| "epoch": 3.9115646258503403, |
| "grad_norm": 0.45894768834114075, |
| "learning_rate": 0.00031523767605633803, |
| "loss": 1.5915, |
| "step": 23000 |
| }, |
| { |
| "epoch": 3.9115646258503403, |
| "eval_loss": 1.7726092338562012, |
| "eval_runtime": 75.3291, |
| "eval_samples_per_second": 1245.72, |
| "eval_steps_per_second": 4.872, |
| "step": 23000 |
| }, |
| { |
| "epoch": 3.9200680272108843, |
| "grad_norm": 0.44938963651657104, |
| "learning_rate": 0.0003147975352112676, |
| "loss": 1.5913, |
| "step": 23050 |
| }, |
| { |
| "epoch": 3.928571428571429, |
| "grad_norm": 0.4493829309940338, |
| "learning_rate": 0.0003143573943661972, |
| "loss": 1.5999, |
| "step": 23100 |
| }, |
| { |
| "epoch": 3.937074829931973, |
| "grad_norm": 0.44146212935447693, |
| "learning_rate": 0.00031391725352112676, |
| "loss": 1.5937, |
| "step": 23150 |
| }, |
| { |
| "epoch": 3.945578231292517, |
| "grad_norm": 0.4436129927635193, |
| "learning_rate": 0.0003134771126760563, |
| "loss": 1.5937, |
| "step": 23200 |
| }, |
| { |
| "epoch": 3.954081632653061, |
| "grad_norm": 0.4842548668384552, |
| "learning_rate": 0.0003130369718309859, |
| "loss": 1.5921, |
| "step": 23250 |
| }, |
| { |
| "epoch": 3.9625850340136055, |
| "grad_norm": 0.4605764150619507, |
| "learning_rate": 0.0003125968309859155, |
| "loss": 1.585, |
| "step": 23300 |
| }, |
| { |
| "epoch": 3.9710884353741496, |
| "grad_norm": 0.446140319108963, |
| "learning_rate": 0.0003121566901408451, |
| "loss": 1.5773, |
| "step": 23350 |
| }, |
| { |
| "epoch": 3.979591836734694, |
| "grad_norm": 0.4490399658679962, |
| "learning_rate": 0.00031171654929577463, |
| "loss": 1.5862, |
| "step": 23400 |
| }, |
| { |
| "epoch": 3.988095238095238, |
| "grad_norm": 0.43997710943222046, |
| "learning_rate": 0.00031128521126760564, |
| "loss": 1.5873, |
| "step": 23450 |
| }, |
| { |
| "epoch": 3.996598639455782, |
| "grad_norm": 0.4559916853904724, |
| "learning_rate": 0.00031084507042253523, |
| "loss": 1.5936, |
| "step": 23500 |
| }, |
| { |
| "epoch": 4.005102040816326, |
| "grad_norm": 0.46398648619651794, |
| "learning_rate": 0.0003104049295774648, |
| "loss": 1.5861, |
| "step": 23550 |
| }, |
| { |
| "epoch": 4.01360544217687, |
| "grad_norm": 0.4367043972015381, |
| "learning_rate": 0.00030996478873239437, |
| "loss": 1.5814, |
| "step": 23600 |
| }, |
| { |
| "epoch": 4.022108843537415, |
| "grad_norm": 0.4262286126613617, |
| "learning_rate": 0.0003095246478873239, |
| "loss": 1.5831, |
| "step": 23650 |
| }, |
| { |
| "epoch": 4.030612244897959, |
| "grad_norm": 1.598874568939209, |
| "learning_rate": 0.00030908450704225356, |
| "loss": 1.5863, |
| "step": 23700 |
| }, |
| { |
| "epoch": 4.039115646258503, |
| "grad_norm": 0.44894129037857056, |
| "learning_rate": 0.0003086443661971831, |
| "loss": 1.5852, |
| "step": 23750 |
| }, |
| { |
| "epoch": 4.0476190476190474, |
| "grad_norm": 0.4517632722854614, |
| "learning_rate": 0.0003082042253521127, |
| "loss": 1.581, |
| "step": 23800 |
| }, |
| { |
| "epoch": 4.0561224489795915, |
| "grad_norm": 0.4445061683654785, |
| "learning_rate": 0.00030776408450704224, |
| "loss": 1.5777, |
| "step": 23850 |
| }, |
| { |
| "epoch": 4.0646258503401365, |
| "grad_norm": 0.4282425343990326, |
| "learning_rate": 0.0003073239436619719, |
| "loss": 1.579, |
| "step": 23900 |
| }, |
| { |
| "epoch": 4.0731292517006805, |
| "grad_norm": 0.48686137795448303, |
| "learning_rate": 0.0003068838028169014, |
| "loss": 1.5766, |
| "step": 23950 |
| }, |
| { |
| "epoch": 4.081632653061225, |
| "grad_norm": 0.4572753310203552, |
| "learning_rate": 0.00030644366197183097, |
| "loss": 1.5751, |
| "step": 24000 |
| }, |
| { |
| "epoch": 4.081632653061225, |
| "eval_loss": 1.7599811553955078, |
| "eval_runtime": 75.3428, |
| "eval_samples_per_second": 1245.494, |
| "eval_steps_per_second": 4.871, |
| "step": 24000 |
| }, |
| { |
| "epoch": 4.090136054421769, |
| "grad_norm": 0.44744428992271423, |
| "learning_rate": 0.00030600352112676056, |
| "loss": 1.573, |
| "step": 24050 |
| }, |
| { |
| "epoch": 4.098639455782313, |
| "grad_norm": 0.45439615845680237, |
| "learning_rate": 0.00030556338028169016, |
| "loss": 1.5717, |
| "step": 24100 |
| }, |
| { |
| "epoch": 4.107142857142857, |
| "grad_norm": 0.46508198976516724, |
| "learning_rate": 0.00030512323943661975, |
| "loss": 1.5716, |
| "step": 24150 |
| }, |
| { |
| "epoch": 4.115646258503402, |
| "grad_norm": 0.45242032408714294, |
| "learning_rate": 0.0003046830985915493, |
| "loss": 1.5692, |
| "step": 24200 |
| }, |
| { |
| "epoch": 4.124149659863946, |
| "grad_norm": 0.441568523645401, |
| "learning_rate": 0.0003042429577464789, |
| "loss": 1.569, |
| "step": 24250 |
| }, |
| { |
| "epoch": 4.13265306122449, |
| "grad_norm": 0.43780016899108887, |
| "learning_rate": 0.00030380281690140843, |
| "loss": 1.5751, |
| "step": 24300 |
| }, |
| { |
| "epoch": 4.141156462585034, |
| "grad_norm": 0.47256138920783997, |
| "learning_rate": 0.00030336267605633803, |
| "loss": 1.5711, |
| "step": 24350 |
| }, |
| { |
| "epoch": 4.149659863945578, |
| "grad_norm": 0.4236377477645874, |
| "learning_rate": 0.0003029225352112676, |
| "loss": 1.5737, |
| "step": 24400 |
| }, |
| { |
| "epoch": 4.158163265306122, |
| "grad_norm": 0.4356014132499695, |
| "learning_rate": 0.00030248239436619716, |
| "loss": 1.5683, |
| "step": 24450 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 0.44677263498306274, |
| "learning_rate": 0.00030204225352112676, |
| "loss": 1.5661, |
| "step": 24500 |
| }, |
| { |
| "epoch": 4.175170068027211, |
| "grad_norm": 0.45358529686927795, |
| "learning_rate": 0.00030160211267605635, |
| "loss": 1.5718, |
| "step": 24550 |
| }, |
| { |
| "epoch": 4.183673469387755, |
| "grad_norm": 0.46313029527664185, |
| "learning_rate": 0.00030116197183098595, |
| "loss": 1.5726, |
| "step": 24600 |
| }, |
| { |
| "epoch": 4.192176870748299, |
| "grad_norm": 0.4524216055870056, |
| "learning_rate": 0.0003007218309859155, |
| "loss": 1.5659, |
| "step": 24650 |
| }, |
| { |
| "epoch": 4.200680272108843, |
| "grad_norm": 0.46442392468452454, |
| "learning_rate": 0.00030028169014084503, |
| "loss": 1.57, |
| "step": 24700 |
| }, |
| { |
| "epoch": 4.209183673469388, |
| "grad_norm": 0.43893301486968994, |
| "learning_rate": 0.0002998415492957747, |
| "loss": 1.5646, |
| "step": 24750 |
| }, |
| { |
| "epoch": 4.217687074829932, |
| "grad_norm": 0.4545600116252899, |
| "learning_rate": 0.0002994014084507042, |
| "loss": 1.5656, |
| "step": 24800 |
| }, |
| { |
| "epoch": 4.226190476190476, |
| "grad_norm": 0.43572577834129333, |
| "learning_rate": 0.0002989612676056338, |
| "loss": 1.5619, |
| "step": 24850 |
| }, |
| { |
| "epoch": 4.23469387755102, |
| "grad_norm": 0.4401288330554962, |
| "learning_rate": 0.00029852112676056336, |
| "loss": 1.5626, |
| "step": 24900 |
| }, |
| { |
| "epoch": 4.243197278911564, |
| "grad_norm": 0.4619799554347992, |
| "learning_rate": 0.000298080985915493, |
| "loss": 1.5679, |
| "step": 24950 |
| }, |
| { |
| "epoch": 4.2517006802721085, |
| "grad_norm": 0.43690764904022217, |
| "learning_rate": 0.00029764084507042255, |
| "loss": 1.5602, |
| "step": 25000 |
| }, |
| { |
| "epoch": 4.2517006802721085, |
| "eval_loss": 1.7416114807128906, |
| "eval_runtime": 75.3172, |
| "eval_samples_per_second": 1245.918, |
| "eval_steps_per_second": 4.873, |
| "step": 25000 |
| }, |
| { |
| "epoch": 4.260204081632653, |
| "grad_norm": 0.48442286252975464, |
| "learning_rate": 0.0002972007042253521, |
| "loss": 1.5592, |
| "step": 25050 |
| }, |
| { |
| "epoch": 4.2687074829931975, |
| "grad_norm": 0.44272610545158386, |
| "learning_rate": 0.0002967605633802817, |
| "loss": 1.5612, |
| "step": 25100 |
| }, |
| { |
| "epoch": 4.2772108843537415, |
| "grad_norm": 0.44438502192497253, |
| "learning_rate": 0.0002963204225352113, |
| "loss": 1.5622, |
| "step": 25150 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.701813280582428, |
| "learning_rate": 0.0002958802816901409, |
| "loss": 1.5639, |
| "step": 25200 |
| }, |
| { |
| "epoch": 4.29421768707483, |
| "grad_norm": 0.4430041015148163, |
| "learning_rate": 0.0002954401408450704, |
| "loss": 1.5655, |
| "step": 25250 |
| }, |
| { |
| "epoch": 4.302721088435375, |
| "grad_norm": 0.4703254699707031, |
| "learning_rate": 0.000295, |
| "loss": 1.5585, |
| "step": 25300 |
| }, |
| { |
| "epoch": 4.311224489795919, |
| "grad_norm": 0.4456980228424072, |
| "learning_rate": 0.0002945598591549296, |
| "loss": 1.5602, |
| "step": 25350 |
| }, |
| { |
| "epoch": 4.319727891156463, |
| "grad_norm": 0.44341742992401123, |
| "learning_rate": 0.00029411971830985915, |
| "loss": 1.565, |
| "step": 25400 |
| }, |
| { |
| "epoch": 4.328231292517007, |
| "grad_norm": 0.6620036959648132, |
| "learning_rate": 0.00029367957746478875, |
| "loss": 1.563, |
| "step": 25450 |
| }, |
| { |
| "epoch": 4.336734693877551, |
| "grad_norm": 0.4276537597179413, |
| "learning_rate": 0.0002932482394366197, |
| "loss": 1.556, |
| "step": 25500 |
| }, |
| { |
| "epoch": 4.345238095238095, |
| "grad_norm": 0.5484892129898071, |
| "learning_rate": 0.0002928080985915493, |
| "loss": 1.5683, |
| "step": 25550 |
| }, |
| { |
| "epoch": 4.35374149659864, |
| "grad_norm": 0.4457222819328308, |
| "learning_rate": 0.0002923679577464789, |
| "loss": 1.5626, |
| "step": 25600 |
| }, |
| { |
| "epoch": 4.362244897959184, |
| "grad_norm": 0.4299587905406952, |
| "learning_rate": 0.0002919278169014085, |
| "loss": 1.5617, |
| "step": 25650 |
| }, |
| { |
| "epoch": 4.370748299319728, |
| "grad_norm": 0.6894689202308655, |
| "learning_rate": 0.000291487676056338, |
| "loss": 1.556, |
| "step": 25700 |
| }, |
| { |
| "epoch": 4.379251700680272, |
| "grad_norm": 0.4507542550563812, |
| "learning_rate": 0.00029104753521126756, |
| "loss": 1.5605, |
| "step": 25750 |
| }, |
| { |
| "epoch": 4.387755102040816, |
| "grad_norm": 0.45560920238494873, |
| "learning_rate": 0.0002906073943661972, |
| "loss": 1.5631, |
| "step": 25800 |
| }, |
| { |
| "epoch": 4.39625850340136, |
| "grad_norm": 0.4419881999492645, |
| "learning_rate": 0.00029016725352112676, |
| "loss": 1.558, |
| "step": 25850 |
| }, |
| { |
| "epoch": 4.404761904761905, |
| "grad_norm": 0.42712947726249695, |
| "learning_rate": 0.00028972711267605635, |
| "loss": 1.5571, |
| "step": 25900 |
| }, |
| { |
| "epoch": 4.413265306122449, |
| "grad_norm": 0.4276752769947052, |
| "learning_rate": 0.0002892869718309859, |
| "loss": 1.5517, |
| "step": 25950 |
| }, |
| { |
| "epoch": 4.421768707482993, |
| "grad_norm": 0.4421616494655609, |
| "learning_rate": 0.00028884683098591554, |
| "loss": 1.5549, |
| "step": 26000 |
| }, |
| { |
| "epoch": 4.421768707482993, |
| "eval_loss": 1.7275468111038208, |
| "eval_runtime": 75.3579, |
| "eval_samples_per_second": 1245.244, |
| "eval_steps_per_second": 4.87, |
| "step": 26000 |
| }, |
| { |
| "epoch": 4.430272108843537, |
| "grad_norm": 0.43331626057624817, |
| "learning_rate": 0.0002884066901408451, |
| "loss": 1.553, |
| "step": 26050 |
| }, |
| { |
| "epoch": 4.438775510204081, |
| "grad_norm": 0.42342087626457214, |
| "learning_rate": 0.0002879665492957746, |
| "loss": 1.5508, |
| "step": 26100 |
| }, |
| { |
| "epoch": 4.447278911564625, |
| "grad_norm": 0.4396851062774658, |
| "learning_rate": 0.0002875264084507042, |
| "loss": 1.5496, |
| "step": 26150 |
| }, |
| { |
| "epoch": 4.45578231292517, |
| "grad_norm": 0.4484754204750061, |
| "learning_rate": 0.0002870862676056338, |
| "loss": 1.559, |
| "step": 26200 |
| }, |
| { |
| "epoch": 4.464285714285714, |
| "grad_norm": 0.4413243532180786, |
| "learning_rate": 0.0002866461267605634, |
| "loss": 1.5497, |
| "step": 26250 |
| }, |
| { |
| "epoch": 4.4727891156462585, |
| "grad_norm": 0.509044885635376, |
| "learning_rate": 0.00028620598591549295, |
| "loss": 1.5507, |
| "step": 26300 |
| }, |
| { |
| "epoch": 4.4812925170068025, |
| "grad_norm": 0.4461870491504669, |
| "learning_rate": 0.00028576584507042255, |
| "loss": 1.5516, |
| "step": 26350 |
| }, |
| { |
| "epoch": 4.489795918367347, |
| "grad_norm": 0.48340821266174316, |
| "learning_rate": 0.00028532570422535214, |
| "loss": 1.5542, |
| "step": 26400 |
| }, |
| { |
| "epoch": 4.4982993197278915, |
| "grad_norm": 0.4417915344238281, |
| "learning_rate": 0.0002848855633802817, |
| "loss": 1.5539, |
| "step": 26450 |
| }, |
| { |
| "epoch": 4.506802721088436, |
| "grad_norm": 0.47127747535705566, |
| "learning_rate": 0.0002844454225352113, |
| "loss": 1.547, |
| "step": 26500 |
| }, |
| { |
| "epoch": 4.51530612244898, |
| "grad_norm": 0.7486183047294617, |
| "learning_rate": 0.0002840052816901408, |
| "loss": 1.5534, |
| "step": 26550 |
| }, |
| { |
| "epoch": 4.523809523809524, |
| "grad_norm": 0.4753170907497406, |
| "learning_rate": 0.0002835651408450704, |
| "loss": 1.5568, |
| "step": 26600 |
| }, |
| { |
| "epoch": 4.532312925170068, |
| "grad_norm": 0.43808746337890625, |
| "learning_rate": 0.000283125, |
| "loss": 1.5518, |
| "step": 26650 |
| }, |
| { |
| "epoch": 4.540816326530612, |
| "grad_norm": 0.4334624707698822, |
| "learning_rate": 0.0002826848591549296, |
| "loss": 1.5507, |
| "step": 26700 |
| }, |
| { |
| "epoch": 4.549319727891157, |
| "grad_norm": 0.4278333783149719, |
| "learning_rate": 0.00028224471830985915, |
| "loss": 1.5543, |
| "step": 26750 |
| }, |
| { |
| "epoch": 4.557823129251701, |
| "grad_norm": 0.44058939814567566, |
| "learning_rate": 0.0002818045774647887, |
| "loss": 1.5462, |
| "step": 26800 |
| }, |
| { |
| "epoch": 4.566326530612245, |
| "grad_norm": 0.42488718032836914, |
| "learning_rate": 0.00028136443661971834, |
| "loss": 1.5555, |
| "step": 26850 |
| }, |
| { |
| "epoch": 4.574829931972789, |
| "grad_norm": 0.4430455267429352, |
| "learning_rate": 0.0002809242957746479, |
| "loss": 1.5481, |
| "step": 26900 |
| }, |
| { |
| "epoch": 4.583333333333333, |
| "grad_norm": 1.010523796081543, |
| "learning_rate": 0.0002804841549295775, |
| "loss": 1.5446, |
| "step": 26950 |
| }, |
| { |
| "epoch": 4.591836734693878, |
| "grad_norm": 0.42729562520980835, |
| "learning_rate": 0.000280044014084507, |
| "loss": 1.5464, |
| "step": 27000 |
| }, |
| { |
| "epoch": 4.591836734693878, |
| "eval_loss": 1.7172296047210693, |
| "eval_runtime": 75.3539, |
| "eval_samples_per_second": 1245.31, |
| "eval_steps_per_second": 4.87, |
| "step": 27000 |
| }, |
| { |
| "epoch": 4.600340136054422, |
| "grad_norm": 0.4399702847003937, |
| "learning_rate": 0.00027960387323943667, |
| "loss": 1.5452, |
| "step": 27050 |
| }, |
| { |
| "epoch": 4.608843537414966, |
| "grad_norm": 0.4524286687374115, |
| "learning_rate": 0.0002791637323943662, |
| "loss": 1.5534, |
| "step": 27100 |
| }, |
| { |
| "epoch": 4.61734693877551, |
| "grad_norm": 0.43100252747535706, |
| "learning_rate": 0.00027872359154929575, |
| "loss": 1.5479, |
| "step": 27150 |
| }, |
| { |
| "epoch": 4.625850340136054, |
| "grad_norm": 0.43541115522384644, |
| "learning_rate": 0.00027828345070422534, |
| "loss": 1.542, |
| "step": 27200 |
| }, |
| { |
| "epoch": 4.634353741496598, |
| "grad_norm": 0.454149454832077, |
| "learning_rate": 0.00027784330985915494, |
| "loss": 1.5442, |
| "step": 27250 |
| }, |
| { |
| "epoch": 4.642857142857143, |
| "grad_norm": 0.4360577464103699, |
| "learning_rate": 0.00027740316901408453, |
| "loss": 1.541, |
| "step": 27300 |
| }, |
| { |
| "epoch": 4.651360544217687, |
| "grad_norm": 0.45018696784973145, |
| "learning_rate": 0.0002769630281690141, |
| "loss": 1.5522, |
| "step": 27350 |
| }, |
| { |
| "epoch": 4.659863945578231, |
| "grad_norm": 0.4641735851764679, |
| "learning_rate": 0.00027652288732394367, |
| "loss": 1.5409, |
| "step": 27400 |
| }, |
| { |
| "epoch": 4.668367346938775, |
| "grad_norm": 0.4704575836658478, |
| "learning_rate": 0.00027608274647887327, |
| "loss": 1.5386, |
| "step": 27450 |
| }, |
| { |
| "epoch": 4.6768707482993195, |
| "grad_norm": 0.4325176775455475, |
| "learning_rate": 0.0002756426056338028, |
| "loss": 1.5432, |
| "step": 27500 |
| }, |
| { |
| "epoch": 4.685374149659864, |
| "grad_norm": 0.5194480419158936, |
| "learning_rate": 0.0002752024647887324, |
| "loss": 1.5409, |
| "step": 27550 |
| }, |
| { |
| "epoch": 4.6938775510204085, |
| "grad_norm": 0.6043953895568848, |
| "learning_rate": 0.00027476232394366194, |
| "loss": 1.5412, |
| "step": 27600 |
| }, |
| { |
| "epoch": 4.7023809523809526, |
| "grad_norm": 0.49982741475105286, |
| "learning_rate": 0.0002743221830985916, |
| "loss": 1.544, |
| "step": 27650 |
| }, |
| { |
| "epoch": 4.710884353741497, |
| "grad_norm": 0.45368969440460205, |
| "learning_rate": 0.00027388204225352113, |
| "loss": 1.5452, |
| "step": 27700 |
| }, |
| { |
| "epoch": 4.719387755102041, |
| "grad_norm": 0.42949676513671875, |
| "learning_rate": 0.00027344190140845073, |
| "loss": 1.5431, |
| "step": 27750 |
| }, |
| { |
| "epoch": 4.727891156462585, |
| "grad_norm": 0.4460367262363434, |
| "learning_rate": 0.00027300176056338027, |
| "loss": 1.5379, |
| "step": 27800 |
| }, |
| { |
| "epoch": 4.736394557823129, |
| "grad_norm": 0.43941059708595276, |
| "learning_rate": 0.0002725616197183098, |
| "loss": 1.5438, |
| "step": 27850 |
| }, |
| { |
| "epoch": 4.744897959183674, |
| "grad_norm": 0.44306865334510803, |
| "learning_rate": 0.00027212147887323946, |
| "loss": 1.5403, |
| "step": 27900 |
| }, |
| { |
| "epoch": 4.753401360544218, |
| "grad_norm": 0.4779192805290222, |
| "learning_rate": 0.000271681338028169, |
| "loss": 1.5374, |
| "step": 27950 |
| }, |
| { |
| "epoch": 4.761904761904762, |
| "grad_norm": 0.44125354290008545, |
| "learning_rate": 0.0002712411971830986, |
| "loss": 1.5386, |
| "step": 28000 |
| }, |
| { |
| "epoch": 4.761904761904762, |
| "eval_loss": 1.7089167833328247, |
| "eval_runtime": 75.3103, |
| "eval_samples_per_second": 1246.032, |
| "eval_steps_per_second": 4.873, |
| "step": 28000 |
| }, |
| { |
| "epoch": 4.770408163265306, |
| "grad_norm": 0.47051918506622314, |
| "learning_rate": 0.00027080105633802814, |
| "loss": 1.5367, |
| "step": 28050 |
| }, |
| { |
| "epoch": 4.77891156462585, |
| "grad_norm": 0.44151777029037476, |
| "learning_rate": 0.0002703609154929578, |
| "loss": 1.5363, |
| "step": 28100 |
| }, |
| { |
| "epoch": 4.787414965986395, |
| "grad_norm": 0.43706709146499634, |
| "learning_rate": 0.00026992077464788733, |
| "loss": 1.5396, |
| "step": 28150 |
| }, |
| { |
| "epoch": 4.795918367346939, |
| "grad_norm": 0.4319562017917633, |
| "learning_rate": 0.0002694806338028169, |
| "loss": 1.5339, |
| "step": 28200 |
| }, |
| { |
| "epoch": 4.804421768707483, |
| "grad_norm": 0.4437416195869446, |
| "learning_rate": 0.00026904049295774647, |
| "loss": 1.5259, |
| "step": 28250 |
| }, |
| { |
| "epoch": 4.812925170068027, |
| "grad_norm": 0.45064008235931396, |
| "learning_rate": 0.00026860035211267606, |
| "loss": 1.5305, |
| "step": 28300 |
| }, |
| { |
| "epoch": 4.821428571428571, |
| "grad_norm": 0.4223947823047638, |
| "learning_rate": 0.00026816021126760566, |
| "loss": 1.5367, |
| "step": 28350 |
| }, |
| { |
| "epoch": 4.829931972789115, |
| "grad_norm": 0.41959166526794434, |
| "learning_rate": 0.0002677200704225352, |
| "loss": 1.5321, |
| "step": 28400 |
| }, |
| { |
| "epoch": 4.83843537414966, |
| "grad_norm": 0.4325518310070038, |
| "learning_rate": 0.0002672799295774648, |
| "loss": 1.5326, |
| "step": 28450 |
| }, |
| { |
| "epoch": 4.846938775510204, |
| "grad_norm": 0.43113574385643005, |
| "learning_rate": 0.0002668397887323944, |
| "loss": 1.5364, |
| "step": 28500 |
| }, |
| { |
| "epoch": 4.855442176870748, |
| "grad_norm": 0.4148525297641754, |
| "learning_rate": 0.000266399647887324, |
| "loss": 1.535, |
| "step": 28550 |
| }, |
| { |
| "epoch": 4.863945578231292, |
| "grad_norm": 0.4255242943763733, |
| "learning_rate": 0.0002659595070422535, |
| "loss": 1.5307, |
| "step": 28600 |
| }, |
| { |
| "epoch": 4.872448979591836, |
| "grad_norm": 0.4426940679550171, |
| "learning_rate": 0.00026551936619718307, |
| "loss": 1.5298, |
| "step": 28650 |
| }, |
| { |
| "epoch": 4.880952380952381, |
| "grad_norm": 0.4304163157939911, |
| "learning_rate": 0.0002650792253521127, |
| "loss": 1.5296, |
| "step": 28700 |
| }, |
| { |
| "epoch": 4.889455782312925, |
| "grad_norm": 0.43492448329925537, |
| "learning_rate": 0.00026463908450704226, |
| "loss": 1.5295, |
| "step": 28750 |
| }, |
| { |
| "epoch": 4.8979591836734695, |
| "grad_norm": 0.4191015660762787, |
| "learning_rate": 0.00026420774647887326, |
| "loss": 1.5275, |
| "step": 28800 |
| }, |
| { |
| "epoch": 4.906462585034014, |
| "grad_norm": 0.4442043602466583, |
| "learning_rate": 0.0002637676056338028, |
| "loss": 1.5241, |
| "step": 28850 |
| }, |
| { |
| "epoch": 4.914965986394558, |
| "grad_norm": 0.41631436347961426, |
| "learning_rate": 0.0002633274647887324, |
| "loss": 1.5296, |
| "step": 28900 |
| }, |
| { |
| "epoch": 4.923469387755102, |
| "grad_norm": 0.4311072528362274, |
| "learning_rate": 0.000262887323943662, |
| "loss": 1.5343, |
| "step": 28950 |
| }, |
| { |
| "epoch": 4.931972789115647, |
| "grad_norm": 0.4260921776294708, |
| "learning_rate": 0.00026244718309859154, |
| "loss": 1.528, |
| "step": 29000 |
| }, |
| { |
| "epoch": 4.931972789115647, |
| "eval_loss": 1.695530652999878, |
| "eval_runtime": 75.7527, |
| "eval_samples_per_second": 1238.755, |
| "eval_steps_per_second": 4.845, |
| "step": 29000 |
| }, |
| { |
| "epoch": 4.940476190476191, |
| "grad_norm": 0.43122732639312744, |
| "learning_rate": 0.00026200704225352113, |
| "loss": 1.5345, |
| "step": 29050 |
| }, |
| { |
| "epoch": 4.948979591836735, |
| "grad_norm": 0.44380810856819153, |
| "learning_rate": 0.00026156690140845067, |
| "loss": 1.5284, |
| "step": 29100 |
| }, |
| { |
| "epoch": 4.957482993197279, |
| "grad_norm": 0.43009868264198303, |
| "learning_rate": 0.0002611267605633803, |
| "loss": 1.5315, |
| "step": 29150 |
| }, |
| { |
| "epoch": 4.965986394557823, |
| "grad_norm": 0.4425157904624939, |
| "learning_rate": 0.00026068661971830986, |
| "loss": 1.5301, |
| "step": 29200 |
| }, |
| { |
| "epoch": 4.974489795918368, |
| "grad_norm": 0.41929659247398376, |
| "learning_rate": 0.00026024647887323946, |
| "loss": 1.533, |
| "step": 29250 |
| }, |
| { |
| "epoch": 4.982993197278912, |
| "grad_norm": 0.43218111991882324, |
| "learning_rate": 0.000259806338028169, |
| "loss": 1.5215, |
| "step": 29300 |
| }, |
| { |
| "epoch": 4.991496598639456, |
| "grad_norm": 0.4425402283668518, |
| "learning_rate": 0.0002593661971830986, |
| "loss": 1.5312, |
| "step": 29350 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.44686195254325867, |
| "learning_rate": 0.0002589260563380282, |
| "loss": 1.5262, |
| "step": 29400 |
| }, |
| { |
| "epoch": 5.008503401360544, |
| "grad_norm": 0.42469632625579834, |
| "learning_rate": 0.00025848591549295773, |
| "loss": 1.5216, |
| "step": 29450 |
| }, |
| { |
| "epoch": 5.017006802721088, |
| "grad_norm": 0.44917765259742737, |
| "learning_rate": 0.0002580457746478873, |
| "loss": 1.5148, |
| "step": 29500 |
| }, |
| { |
| "epoch": 5.025510204081633, |
| "grad_norm": 0.42792636156082153, |
| "learning_rate": 0.0002576056338028169, |
| "loss": 1.5236, |
| "step": 29550 |
| }, |
| { |
| "epoch": 5.034013605442177, |
| "grad_norm": 0.42785459756851196, |
| "learning_rate": 0.0002571654929577465, |
| "loss": 1.5185, |
| "step": 29600 |
| }, |
| { |
| "epoch": 5.042517006802721, |
| "grad_norm": 0.4389972388744354, |
| "learning_rate": 0.00025672535211267606, |
| "loss": 1.5152, |
| "step": 29650 |
| }, |
| { |
| "epoch": 5.051020408163265, |
| "grad_norm": 0.4294348657131195, |
| "learning_rate": 0.0002562852112676056, |
| "loss": 1.5164, |
| "step": 29700 |
| }, |
| { |
| "epoch": 5.059523809523809, |
| "grad_norm": 0.4204474985599518, |
| "learning_rate": 0.00025584507042253525, |
| "loss": 1.5126, |
| "step": 29750 |
| }, |
| { |
| "epoch": 5.068027210884353, |
| "grad_norm": 0.4308267831802368, |
| "learning_rate": 0.0002554049295774648, |
| "loss": 1.5138, |
| "step": 29800 |
| }, |
| { |
| "epoch": 5.076530612244898, |
| "grad_norm": 0.43260395526885986, |
| "learning_rate": 0.0002549647887323944, |
| "loss": 1.5211, |
| "step": 29850 |
| }, |
| { |
| "epoch": 5.085034013605442, |
| "grad_norm": 0.4192730486392975, |
| "learning_rate": 0.00025452464788732393, |
| "loss": 1.5165, |
| "step": 29900 |
| }, |
| { |
| "epoch": 5.093537414965986, |
| "grad_norm": 0.418124794960022, |
| "learning_rate": 0.0002540845070422535, |
| "loss": 1.5152, |
| "step": 29950 |
| }, |
| { |
| "epoch": 5.1020408163265305, |
| "grad_norm": 0.4320596158504486, |
| "learning_rate": 0.0002536443661971831, |
| "loss": 1.5071, |
| "step": 30000 |
| }, |
| { |
| "epoch": 5.1020408163265305, |
| "eval_loss": 1.6892961263656616, |
| "eval_runtime": 75.4228, |
| "eval_samples_per_second": 1244.173, |
| "eval_steps_per_second": 4.866, |
| "step": 30000 |
| }, |
| { |
| "epoch": 5.110544217687075, |
| "grad_norm": 0.43125247955322266, |
| "learning_rate": 0.00025320422535211266, |
| "loss": 1.5116, |
| "step": 30050 |
| }, |
| { |
| "epoch": 5.119047619047619, |
| "grad_norm": 0.4130070209503174, |
| "learning_rate": 0.00025276408450704225, |
| "loss": 1.5125, |
| "step": 30100 |
| }, |
| { |
| "epoch": 5.127551020408164, |
| "grad_norm": 0.42717909812927246, |
| "learning_rate": 0.0002523239436619718, |
| "loss": 1.5122, |
| "step": 30150 |
| }, |
| { |
| "epoch": 5.136054421768708, |
| "grad_norm": 0.4337432086467743, |
| "learning_rate": 0.00025188380281690145, |
| "loss": 1.5103, |
| "step": 30200 |
| }, |
| { |
| "epoch": 5.144557823129252, |
| "grad_norm": 0.4125542938709259, |
| "learning_rate": 0.000251443661971831, |
| "loss": 1.5218, |
| "step": 30250 |
| }, |
| { |
| "epoch": 5.153061224489796, |
| "grad_norm": 0.428830623626709, |
| "learning_rate": 0.0002510035211267606, |
| "loss": 1.5068, |
| "step": 30300 |
| }, |
| { |
| "epoch": 5.16156462585034, |
| "grad_norm": 0.41956421732902527, |
| "learning_rate": 0.0002505633802816901, |
| "loss": 1.5148, |
| "step": 30350 |
| }, |
| { |
| "epoch": 5.170068027210885, |
| "grad_norm": 0.43243858218193054, |
| "learning_rate": 0.0002501232394366197, |
| "loss": 1.5153, |
| "step": 30400 |
| }, |
| { |
| "epoch": 5.178571428571429, |
| "grad_norm": 0.41967883706092834, |
| "learning_rate": 0.0002496830985915493, |
| "loss": 1.517, |
| "step": 30450 |
| }, |
| { |
| "epoch": 5.187074829931973, |
| "grad_norm": 0.43599751591682434, |
| "learning_rate": 0.00024924295774647886, |
| "loss": 1.5097, |
| "step": 30500 |
| }, |
| { |
| "epoch": 5.195578231292517, |
| "grad_norm": 0.4145863950252533, |
| "learning_rate": 0.00024880281690140845, |
| "loss": 1.5054, |
| "step": 30550 |
| }, |
| { |
| "epoch": 5.204081632653061, |
| "grad_norm": 0.44736286997795105, |
| "learning_rate": 0.00024836267605633805, |
| "loss": 1.512, |
| "step": 30600 |
| }, |
| { |
| "epoch": 5.212585034013605, |
| "grad_norm": 0.4339098036289215, |
| "learning_rate": 0.00024792253521126764, |
| "loss": 1.5119, |
| "step": 30650 |
| }, |
| { |
| "epoch": 5.22108843537415, |
| "grad_norm": 0.4354366064071655, |
| "learning_rate": 0.0002474823943661972, |
| "loss": 1.5134, |
| "step": 30700 |
| }, |
| { |
| "epoch": 5.229591836734694, |
| "grad_norm": 0.4405953288078308, |
| "learning_rate": 0.0002470422535211268, |
| "loss": 1.5117, |
| "step": 30750 |
| }, |
| { |
| "epoch": 5.238095238095238, |
| "grad_norm": 0.4392901659011841, |
| "learning_rate": 0.0002466021126760563, |
| "loss": 1.5097, |
| "step": 30800 |
| }, |
| { |
| "epoch": 5.246598639455782, |
| "grad_norm": 0.4320152699947357, |
| "learning_rate": 0.0002461619718309859, |
| "loss": 1.5185, |
| "step": 30850 |
| }, |
| { |
| "epoch": 5.255102040816326, |
| "grad_norm": 0.4348084628582001, |
| "learning_rate": 0.0002457218309859155, |
| "loss": 1.5073, |
| "step": 30900 |
| }, |
| { |
| "epoch": 5.263605442176871, |
| "grad_norm": 0.4216504693031311, |
| "learning_rate": 0.0002452816901408451, |
| "loss": 1.5055, |
| "step": 30950 |
| }, |
| { |
| "epoch": 5.272108843537415, |
| "grad_norm": 0.417338103055954, |
| "learning_rate": 0.00024484154929577465, |
| "loss": 1.5102, |
| "step": 31000 |
| }, |
| { |
| "epoch": 5.272108843537415, |
| "eval_loss": 1.6826726198196411, |
| "eval_runtime": 75.3283, |
| "eval_samples_per_second": 1245.734, |
| "eval_steps_per_second": 4.872, |
| "step": 31000 |
| }, |
| { |
| "epoch": 5.280612244897959, |
| "grad_norm": 0.425647109746933, |
| "learning_rate": 0.00024440140845070424, |
| "loss": 1.51, |
| "step": 31050 |
| }, |
| { |
| "epoch": 5.289115646258503, |
| "grad_norm": 0.4205061197280884, |
| "learning_rate": 0.0002439612676056338, |
| "loss": 1.5071, |
| "step": 31100 |
| }, |
| { |
| "epoch": 5.2976190476190474, |
| "grad_norm": 0.4191541075706482, |
| "learning_rate": 0.00024352112676056338, |
| "loss": 1.5072, |
| "step": 31150 |
| }, |
| { |
| "epoch": 5.3061224489795915, |
| "grad_norm": 0.4266960024833679, |
| "learning_rate": 0.00024308098591549297, |
| "loss": 1.5024, |
| "step": 31200 |
| }, |
| { |
| "epoch": 5.3146258503401365, |
| "grad_norm": 0.43113160133361816, |
| "learning_rate": 0.00024264084507042252, |
| "loss": 1.5045, |
| "step": 31250 |
| }, |
| { |
| "epoch": 5.3231292517006805, |
| "grad_norm": 0.46872562170028687, |
| "learning_rate": 0.0002422007042253521, |
| "loss": 1.5043, |
| "step": 31300 |
| }, |
| { |
| "epoch": 5.331632653061225, |
| "grad_norm": 0.444063663482666, |
| "learning_rate": 0.00024176056338028168, |
| "loss": 1.503, |
| "step": 31350 |
| }, |
| { |
| "epoch": 5.340136054421769, |
| "grad_norm": 0.46224623918533325, |
| "learning_rate": 0.00024132042253521127, |
| "loss": 1.5001, |
| "step": 31400 |
| }, |
| { |
| "epoch": 5.348639455782313, |
| "grad_norm": 0.4229288697242737, |
| "learning_rate": 0.00024088028169014084, |
| "loss": 1.5051, |
| "step": 31450 |
| }, |
| { |
| "epoch": 5.357142857142857, |
| "grad_norm": 0.4342740476131439, |
| "learning_rate": 0.00024044014084507044, |
| "loss": 1.5043, |
| "step": 31500 |
| }, |
| { |
| "epoch": 5.365646258503402, |
| "grad_norm": 0.4987669885158539, |
| "learning_rate": 0.00024, |
| "loss": 1.5067, |
| "step": 31550 |
| }, |
| { |
| "epoch": 5.374149659863946, |
| "grad_norm": 0.4277132451534271, |
| "learning_rate": 0.0002395598591549296, |
| "loss": 1.5005, |
| "step": 31600 |
| }, |
| { |
| "epoch": 5.38265306122449, |
| "grad_norm": 0.4543018639087677, |
| "learning_rate": 0.00023911971830985914, |
| "loss": 1.5081, |
| "step": 31650 |
| }, |
| { |
| "epoch": 5.391156462585034, |
| "grad_norm": 0.4164925217628479, |
| "learning_rate": 0.00023867957746478874, |
| "loss": 1.4998, |
| "step": 31700 |
| }, |
| { |
| "epoch": 5.399659863945578, |
| "grad_norm": 0.4165375828742981, |
| "learning_rate": 0.0002382394366197183, |
| "loss": 1.5047, |
| "step": 31750 |
| }, |
| { |
| "epoch": 5.408163265306122, |
| "grad_norm": 0.43289613723754883, |
| "learning_rate": 0.0002377992957746479, |
| "loss": 1.499, |
| "step": 31800 |
| }, |
| { |
| "epoch": 5.416666666666667, |
| "grad_norm": 0.4113534688949585, |
| "learning_rate": 0.00023735915492957747, |
| "loss": 1.4992, |
| "step": 31850 |
| }, |
| { |
| "epoch": 5.425170068027211, |
| "grad_norm": 0.4257815480232239, |
| "learning_rate": 0.00023691901408450707, |
| "loss": 1.5033, |
| "step": 31900 |
| }, |
| { |
| "epoch": 5.433673469387755, |
| "grad_norm": 0.423606812953949, |
| "learning_rate": 0.00023647887323943663, |
| "loss": 1.4985, |
| "step": 31950 |
| }, |
| { |
| "epoch": 5.442176870748299, |
| "grad_norm": 0.5088992118835449, |
| "learning_rate": 0.0002360387323943662, |
| "loss": 1.5025, |
| "step": 32000 |
| }, |
| { |
| "epoch": 5.442176870748299, |
| "eval_loss": 1.6751487255096436, |
| "eval_runtime": 75.6944, |
| "eval_samples_per_second": 1239.708, |
| "eval_steps_per_second": 4.848, |
| "step": 32000 |
| }, |
| { |
| "epoch": 5.450680272108843, |
| "grad_norm": 0.42682725191116333, |
| "learning_rate": 0.00023559859154929577, |
| "loss": 1.5012, |
| "step": 32050 |
| }, |
| { |
| "epoch": 5.459183673469388, |
| "grad_norm": 0.4484409689903259, |
| "learning_rate": 0.00023515845070422537, |
| "loss": 1.5056, |
| "step": 32100 |
| }, |
| { |
| "epoch": 5.467687074829932, |
| "grad_norm": 0.4330119788646698, |
| "learning_rate": 0.00023471830985915493, |
| "loss": 1.4997, |
| "step": 32150 |
| }, |
| { |
| "epoch": 5.476190476190476, |
| "grad_norm": 0.4262082874774933, |
| "learning_rate": 0.00023427816901408453, |
| "loss": 1.4992, |
| "step": 32200 |
| }, |
| { |
| "epoch": 5.48469387755102, |
| "grad_norm": 0.4434804320335388, |
| "learning_rate": 0.0002338380281690141, |
| "loss": 1.4975, |
| "step": 32250 |
| }, |
| { |
| "epoch": 5.493197278911564, |
| "grad_norm": 0.42507055401802063, |
| "learning_rate": 0.00023340669014084507, |
| "loss": 1.4923, |
| "step": 32300 |
| }, |
| { |
| "epoch": 5.5017006802721085, |
| "grad_norm": 0.415923535823822, |
| "learning_rate": 0.00023296654929577464, |
| "loss": 1.4899, |
| "step": 32350 |
| }, |
| { |
| "epoch": 5.510204081632653, |
| "grad_norm": 0.43220096826553345, |
| "learning_rate": 0.0002325264084507042, |
| "loss": 1.4947, |
| "step": 32400 |
| }, |
| { |
| "epoch": 5.5187074829931975, |
| "grad_norm": 0.4283994436264038, |
| "learning_rate": 0.0002320862676056338, |
| "loss": 1.4921, |
| "step": 32450 |
| }, |
| { |
| "epoch": 5.5272108843537415, |
| "grad_norm": 0.439207524061203, |
| "learning_rate": 0.00023164612676056337, |
| "loss": 1.4859, |
| "step": 32500 |
| }, |
| { |
| "epoch": 5.535714285714286, |
| "grad_norm": 0.4383983612060547, |
| "learning_rate": 0.00023120598591549297, |
| "loss": 1.4933, |
| "step": 32550 |
| }, |
| { |
| "epoch": 5.54421768707483, |
| "grad_norm": 0.44482484459877014, |
| "learning_rate": 0.00023076584507042254, |
| "loss": 1.4957, |
| "step": 32600 |
| }, |
| { |
| "epoch": 5.552721088435375, |
| "grad_norm": 0.4300253987312317, |
| "learning_rate": 0.00023032570422535213, |
| "loss": 1.49, |
| "step": 32650 |
| }, |
| { |
| "epoch": 5.561224489795919, |
| "grad_norm": 0.46058133244514465, |
| "learning_rate": 0.00022988556338028167, |
| "loss": 1.4872, |
| "step": 32700 |
| }, |
| { |
| "epoch": 5.569727891156463, |
| "grad_norm": 0.4194552004337311, |
| "learning_rate": 0.00022944542253521127, |
| "loss": 1.4911, |
| "step": 32750 |
| }, |
| { |
| "epoch": 5.578231292517007, |
| "grad_norm": 0.42726796865463257, |
| "learning_rate": 0.00022900528169014084, |
| "loss": 1.4926, |
| "step": 32800 |
| }, |
| { |
| "epoch": 5.586734693877551, |
| "grad_norm": 0.42779192328453064, |
| "learning_rate": 0.00022856514084507043, |
| "loss": 1.4917, |
| "step": 32850 |
| }, |
| { |
| "epoch": 5.595238095238095, |
| "grad_norm": 0.42153412103652954, |
| "learning_rate": 0.000228125, |
| "loss": 1.4824, |
| "step": 32900 |
| }, |
| { |
| "epoch": 5.603741496598639, |
| "grad_norm": 0.4386555254459381, |
| "learning_rate": 0.0002276848591549296, |
| "loss": 1.4916, |
| "step": 32950 |
| }, |
| { |
| "epoch": 5.612244897959184, |
| "grad_norm": 0.4412122964859009, |
| "learning_rate": 0.00022724471830985917, |
| "loss": 1.4908, |
| "step": 33000 |
| }, |
| { |
| "epoch": 5.612244897959184, |
| "eval_loss": 1.6691830158233643, |
| "eval_runtime": 75.3357, |
| "eval_samples_per_second": 1245.611, |
| "eval_steps_per_second": 4.872, |
| "step": 33000 |
| }, |
| { |
| "epoch": 5.620748299319728, |
| "grad_norm": 0.42562374472618103, |
| "learning_rate": 0.00022680457746478873, |
| "loss": 1.4867, |
| "step": 33050 |
| }, |
| { |
| "epoch": 5.629251700680272, |
| "grad_norm": 0.4305470585823059, |
| "learning_rate": 0.0002263644366197183, |
| "loss": 1.4874, |
| "step": 33100 |
| }, |
| { |
| "epoch": 5.637755102040816, |
| "grad_norm": 0.42887523770332336, |
| "learning_rate": 0.0002259242957746479, |
| "loss": 1.4993, |
| "step": 33150 |
| }, |
| { |
| "epoch": 5.646258503401361, |
| "grad_norm": 0.5242183804512024, |
| "learning_rate": 0.00022548415492957747, |
| "loss": 1.4934, |
| "step": 33200 |
| }, |
| { |
| "epoch": 5.654761904761905, |
| "grad_norm": 0.4118496775627136, |
| "learning_rate": 0.00022504401408450706, |
| "loss": 1.4872, |
| "step": 33250 |
| }, |
| { |
| "epoch": 5.663265306122449, |
| "grad_norm": 0.4159320890903473, |
| "learning_rate": 0.00022460387323943663, |
| "loss": 1.4782, |
| "step": 33300 |
| }, |
| { |
| "epoch": 5.671768707482993, |
| "grad_norm": 0.4275069534778595, |
| "learning_rate": 0.00022416373239436623, |
| "loss": 1.4857, |
| "step": 33350 |
| }, |
| { |
| "epoch": 5.680272108843537, |
| "grad_norm": 0.41769281029701233, |
| "learning_rate": 0.00022372359154929577, |
| "loss": 1.479, |
| "step": 33400 |
| }, |
| { |
| "epoch": 5.688775510204081, |
| "grad_norm": 0.41575032472610474, |
| "learning_rate": 0.00022328345070422533, |
| "loss": 1.4849, |
| "step": 33450 |
| }, |
| { |
| "epoch": 5.697278911564625, |
| "grad_norm": 0.4337887763977051, |
| "learning_rate": 0.00022284330985915493, |
| "loss": 1.4877, |
| "step": 33500 |
| }, |
| { |
| "epoch": 5.70578231292517, |
| "grad_norm": 0.41992053389549255, |
| "learning_rate": 0.0002224031690140845, |
| "loss": 1.4809, |
| "step": 33550 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 0.42422786355018616, |
| "learning_rate": 0.0002219630281690141, |
| "loss": 1.4877, |
| "step": 33600 |
| }, |
| { |
| "epoch": 5.7227891156462585, |
| "grad_norm": 0.415855348110199, |
| "learning_rate": 0.00022152288732394366, |
| "loss": 1.4863, |
| "step": 33650 |
| }, |
| { |
| "epoch": 5.7312925170068025, |
| "grad_norm": 0.4512421488761902, |
| "learning_rate": 0.00022108274647887326, |
| "loss": 1.4894, |
| "step": 33700 |
| }, |
| { |
| "epoch": 5.739795918367347, |
| "grad_norm": 0.42408186197280884, |
| "learning_rate": 0.0002206426056338028, |
| "loss": 1.4767, |
| "step": 33750 |
| }, |
| { |
| "epoch": 5.7482993197278915, |
| "grad_norm": 0.42864853143692017, |
| "learning_rate": 0.0002202024647887324, |
| "loss": 1.4825, |
| "step": 33800 |
| }, |
| { |
| "epoch": 5.756802721088436, |
| "grad_norm": 0.4149647355079651, |
| "learning_rate": 0.00021976232394366196, |
| "loss": 1.4819, |
| "step": 33850 |
| }, |
| { |
| "epoch": 5.76530612244898, |
| "grad_norm": 0.45217257738113403, |
| "learning_rate": 0.00021932218309859156, |
| "loss": 1.487, |
| "step": 33900 |
| }, |
| { |
| "epoch": 5.773809523809524, |
| "grad_norm": 0.4308101236820221, |
| "learning_rate": 0.00021888204225352113, |
| "loss": 1.4842, |
| "step": 33950 |
| }, |
| { |
| "epoch": 5.782312925170068, |
| "grad_norm": 0.41859814524650574, |
| "learning_rate": 0.00021844190140845072, |
| "loss": 1.4856, |
| "step": 34000 |
| }, |
| { |
| "epoch": 5.782312925170068, |
| "eval_loss": 1.6499308347702026, |
| "eval_runtime": 75.302, |
| "eval_samples_per_second": 1246.169, |
| "eval_steps_per_second": 4.874, |
| "step": 34000 |
| }, |
| { |
| "epoch": 5.790816326530612, |
| "grad_norm": 0.4088199734687805, |
| "learning_rate": 0.0002180017605633803, |
| "loss": 1.4838, |
| "step": 34050 |
| }, |
| { |
| "epoch": 5.799319727891157, |
| "grad_norm": 0.8137006163597107, |
| "learning_rate": 0.00021756161971830989, |
| "loss": 1.4826, |
| "step": 34100 |
| }, |
| { |
| "epoch": 5.807823129251701, |
| "grad_norm": 0.4124939441680908, |
| "learning_rate": 0.00021712147887323943, |
| "loss": 1.4916, |
| "step": 34150 |
| }, |
| { |
| "epoch": 5.816326530612245, |
| "grad_norm": 0.4147464334964752, |
| "learning_rate": 0.00021668133802816902, |
| "loss": 1.485, |
| "step": 34200 |
| }, |
| { |
| "epoch": 5.824829931972789, |
| "grad_norm": 0.42168349027633667, |
| "learning_rate": 0.0002162411971830986, |
| "loss": 1.4787, |
| "step": 34250 |
| }, |
| { |
| "epoch": 5.833333333333333, |
| "grad_norm": 0.43687647581100464, |
| "learning_rate": 0.00021580105633802819, |
| "loss": 1.4859, |
| "step": 34300 |
| }, |
| { |
| "epoch": 5.841836734693878, |
| "grad_norm": 0.4150829017162323, |
| "learning_rate": 0.00021536091549295775, |
| "loss": 1.4806, |
| "step": 34350 |
| }, |
| { |
| "epoch": 5.850340136054422, |
| "grad_norm": 0.42533501982688904, |
| "learning_rate": 0.00021492077464788735, |
| "loss": 1.4774, |
| "step": 34400 |
| }, |
| { |
| "epoch": 5.858843537414966, |
| "grad_norm": 0.42881202697753906, |
| "learning_rate": 0.00021448063380281692, |
| "loss": 1.4793, |
| "step": 34450 |
| }, |
| { |
| "epoch": 5.86734693877551, |
| "grad_norm": 0.41647574305534363, |
| "learning_rate": 0.00021404049295774649, |
| "loss": 1.4844, |
| "step": 34500 |
| }, |
| { |
| "epoch": 5.875850340136054, |
| "grad_norm": 0.45438751578330994, |
| "learning_rate": 0.00021360035211267605, |
| "loss": 1.476, |
| "step": 34550 |
| }, |
| { |
| "epoch": 5.884353741496598, |
| "grad_norm": 0.4157373309135437, |
| "learning_rate": 0.00021316021126760562, |
| "loss": 1.4783, |
| "step": 34600 |
| }, |
| { |
| "epoch": 5.892857142857143, |
| "grad_norm": 0.4178527593612671, |
| "learning_rate": 0.00021272007042253522, |
| "loss": 1.477, |
| "step": 34650 |
| }, |
| { |
| "epoch": 5.901360544217687, |
| "grad_norm": 0.4168199896812439, |
| "learning_rate": 0.00021227992957746479, |
| "loss": 1.4685, |
| "step": 34700 |
| }, |
| { |
| "epoch": 5.909863945578231, |
| "grad_norm": 0.4487851858139038, |
| "learning_rate": 0.00021183978873239438, |
| "loss": 1.4782, |
| "step": 34750 |
| }, |
| { |
| "epoch": 5.918367346938775, |
| "grad_norm": 0.43783414363861084, |
| "learning_rate": 0.00021139964788732395, |
| "loss": 1.4727, |
| "step": 34800 |
| }, |
| { |
| "epoch": 5.9268707482993195, |
| "grad_norm": 0.4123859107494354, |
| "learning_rate": 0.00021095950704225352, |
| "loss": 1.4711, |
| "step": 34850 |
| }, |
| { |
| "epoch": 5.935374149659864, |
| "grad_norm": 0.41514667868614197, |
| "learning_rate": 0.00021051936619718309, |
| "loss": 1.4776, |
| "step": 34900 |
| }, |
| { |
| "epoch": 5.9438775510204085, |
| "grad_norm": 0.41285404562950134, |
| "learning_rate": 0.00021007922535211268, |
| "loss": 1.4796, |
| "step": 34950 |
| }, |
| { |
| "epoch": 5.9523809523809526, |
| "grad_norm": 0.407366544008255, |
| "learning_rate": 0.00020963908450704225, |
| "loss": 1.4687, |
| "step": 35000 |
| }, |
| { |
| "epoch": 5.9523809523809526, |
| "eval_loss": 1.6382627487182617, |
| "eval_runtime": 75.3966, |
| "eval_samples_per_second": 1244.606, |
| "eval_steps_per_second": 4.868, |
| "step": 35000 |
| }, |
| { |
| "epoch": 5.960884353741497, |
| "grad_norm": 0.4154813885688782, |
| "learning_rate": 0.00020919894366197185, |
| "loss": 1.4731, |
| "step": 35050 |
| }, |
| { |
| "epoch": 5.969387755102041, |
| "grad_norm": 0.40901270508766174, |
| "learning_rate": 0.0002087588028169014, |
| "loss": 1.4767, |
| "step": 35100 |
| }, |
| { |
| "epoch": 5.977891156462585, |
| "grad_norm": 0.41301828622817993, |
| "learning_rate": 0.000208318661971831, |
| "loss": 1.4688, |
| "step": 35150 |
| }, |
| { |
| "epoch": 5.986394557823129, |
| "grad_norm": 0.43283048272132874, |
| "learning_rate": 0.00020787852112676055, |
| "loss": 1.4673, |
| "step": 35200 |
| }, |
| { |
| "epoch": 5.994897959183674, |
| "grad_norm": 0.4242665469646454, |
| "learning_rate": 0.00020743838028169015, |
| "loss": 1.4749, |
| "step": 35250 |
| }, |
| { |
| "epoch": 6.003401360544218, |
| "grad_norm": 0.4176988899707794, |
| "learning_rate": 0.00020699823943661971, |
| "loss": 1.4713, |
| "step": 35300 |
| }, |
| { |
| "epoch": 6.011904761904762, |
| "grad_norm": 0.4250063896179199, |
| "learning_rate": 0.0002065580985915493, |
| "loss": 1.4688, |
| "step": 35350 |
| }, |
| { |
| "epoch": 6.020408163265306, |
| "grad_norm": 0.4247675836086273, |
| "learning_rate": 0.00020611795774647888, |
| "loss": 1.4661, |
| "step": 35400 |
| }, |
| { |
| "epoch": 6.02891156462585, |
| "grad_norm": 0.47026437520980835, |
| "learning_rate": 0.00020567781690140847, |
| "loss": 1.4641, |
| "step": 35450 |
| }, |
| { |
| "epoch": 6.037414965986395, |
| "grad_norm": 0.41622287034988403, |
| "learning_rate": 0.00020523767605633804, |
| "loss": 1.4676, |
| "step": 35500 |
| }, |
| { |
| "epoch": 6.045918367346939, |
| "grad_norm": 0.4315873980522156, |
| "learning_rate": 0.0002047975352112676, |
| "loss": 1.4701, |
| "step": 35550 |
| }, |
| { |
| "epoch": 6.054421768707483, |
| "grad_norm": 0.4140373170375824, |
| "learning_rate": 0.00020435739436619718, |
| "loss": 1.4625, |
| "step": 35600 |
| }, |
| { |
| "epoch": 6.062925170068027, |
| "grad_norm": 0.41319021582603455, |
| "learning_rate": 0.00020391725352112677, |
| "loss": 1.4674, |
| "step": 35650 |
| }, |
| { |
| "epoch": 6.071428571428571, |
| "grad_norm": 0.4266904294490814, |
| "learning_rate": 0.00020347711267605634, |
| "loss": 1.4651, |
| "step": 35700 |
| }, |
| { |
| "epoch": 6.079931972789115, |
| "grad_norm": 0.422049880027771, |
| "learning_rate": 0.00020304577464788732, |
| "loss": 1.4602, |
| "step": 35750 |
| }, |
| { |
| "epoch": 6.08843537414966, |
| "grad_norm": 0.4264945089817047, |
| "learning_rate": 0.0002026056338028169, |
| "loss": 1.4629, |
| "step": 35800 |
| }, |
| { |
| "epoch": 6.096938775510204, |
| "grad_norm": 0.47240814566612244, |
| "learning_rate": 0.00020216549295774648, |
| "loss": 1.4675, |
| "step": 35850 |
| }, |
| { |
| "epoch": 6.105442176870748, |
| "grad_norm": 0.4204316735267639, |
| "learning_rate": 0.00020172535211267608, |
| "loss": 1.4627, |
| "step": 35900 |
| }, |
| { |
| "epoch": 6.113945578231292, |
| "grad_norm": 0.41577088832855225, |
| "learning_rate": 0.00020128521126760562, |
| "loss": 1.4605, |
| "step": 35950 |
| }, |
| { |
| "epoch": 6.122448979591836, |
| "grad_norm": 0.4297136962413788, |
| "learning_rate": 0.0002008450704225352, |
| "loss": 1.4685, |
| "step": 36000 |
| }, |
| { |
| "epoch": 6.122448979591836, |
| "eval_loss": 1.6419459581375122, |
| "eval_runtime": 75.3644, |
| "eval_samples_per_second": 1245.137, |
| "eval_steps_per_second": 4.87, |
| "step": 36000 |
| }, |
| { |
| "epoch": 6.130952380952381, |
| "grad_norm": 0.408674418926239, |
| "learning_rate": 0.00020040492957746478, |
| "loss": 1.4666, |
| "step": 36050 |
| }, |
| { |
| "epoch": 6.139455782312925, |
| "grad_norm": 0.41451412439346313, |
| "learning_rate": 0.00019996478873239438, |
| "loss": 1.4611, |
| "step": 36100 |
| }, |
| { |
| "epoch": 6.1479591836734695, |
| "grad_norm": 0.4128754734992981, |
| "learning_rate": 0.00019952464788732395, |
| "loss": 1.4589, |
| "step": 36150 |
| }, |
| { |
| "epoch": 6.156462585034014, |
| "grad_norm": 0.4213225543498993, |
| "learning_rate": 0.00019908450704225354, |
| "loss": 1.462, |
| "step": 36200 |
| }, |
| { |
| "epoch": 6.164965986394558, |
| "grad_norm": 0.4226750433444977, |
| "learning_rate": 0.0001986443661971831, |
| "loss": 1.4605, |
| "step": 36250 |
| }, |
| { |
| "epoch": 6.173469387755102, |
| "grad_norm": 0.4176504611968994, |
| "learning_rate": 0.00019820422535211268, |
| "loss": 1.459, |
| "step": 36300 |
| }, |
| { |
| "epoch": 6.181972789115647, |
| "grad_norm": 0.42486241459846497, |
| "learning_rate": 0.00019776408450704225, |
| "loss": 1.4656, |
| "step": 36350 |
| }, |
| { |
| "epoch": 6.190476190476191, |
| "grad_norm": 0.40891167521476746, |
| "learning_rate": 0.00019732394366197184, |
| "loss": 1.4601, |
| "step": 36400 |
| }, |
| { |
| "epoch": 6.198979591836735, |
| "grad_norm": 0.4230293035507202, |
| "learning_rate": 0.0001968838028169014, |
| "loss": 1.4537, |
| "step": 36450 |
| }, |
| { |
| "epoch": 6.207482993197279, |
| "grad_norm": 0.4342026710510254, |
| "learning_rate": 0.000196443661971831, |
| "loss": 1.4575, |
| "step": 36500 |
| }, |
| { |
| "epoch": 6.215986394557823, |
| "grad_norm": 0.417348176240921, |
| "learning_rate": 0.00019600352112676057, |
| "loss": 1.4618, |
| "step": 36550 |
| }, |
| { |
| "epoch": 6.224489795918367, |
| "grad_norm": 0.4206349551677704, |
| "learning_rate": 0.00019556338028169017, |
| "loss": 1.4596, |
| "step": 36600 |
| }, |
| { |
| "epoch": 6.232993197278912, |
| "grad_norm": 0.42500296235084534, |
| "learning_rate": 0.0001951232394366197, |
| "loss": 1.4534, |
| "step": 36650 |
| }, |
| { |
| "epoch": 6.241496598639456, |
| "grad_norm": 0.42607611417770386, |
| "learning_rate": 0.0001946830985915493, |
| "loss": 1.4621, |
| "step": 36700 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 0.4912338852882385, |
| "learning_rate": 0.00019424295774647887, |
| "loss": 1.4629, |
| "step": 36750 |
| }, |
| { |
| "epoch": 6.258503401360544, |
| "grad_norm": 0.40748193860054016, |
| "learning_rate": 0.00019380281690140847, |
| "loss": 1.4639, |
| "step": 36800 |
| }, |
| { |
| "epoch": 6.267006802721088, |
| "grad_norm": 0.41081297397613525, |
| "learning_rate": 0.00019336267605633804, |
| "loss": 1.4505, |
| "step": 36850 |
| }, |
| { |
| "epoch": 6.275510204081632, |
| "grad_norm": 0.41694825887680054, |
| "learning_rate": 0.0001929225352112676, |
| "loss": 1.4573, |
| "step": 36900 |
| }, |
| { |
| "epoch": 6.284013605442177, |
| "grad_norm": 0.4185777008533478, |
| "learning_rate": 0.0001924823943661972, |
| "loss": 1.4621, |
| "step": 36950 |
| }, |
| { |
| "epoch": 6.292517006802721, |
| "grad_norm": 0.6816411018371582, |
| "learning_rate": 0.00019204225352112674, |
| "loss": 1.4526, |
| "step": 37000 |
| }, |
| { |
| "epoch": 6.292517006802721, |
| "eval_loss": 1.6315878629684448, |
| "eval_runtime": 75.3702, |
| "eval_samples_per_second": 1245.041, |
| "eval_steps_per_second": 4.869, |
| "step": 37000 |
| }, |
| { |
| "epoch": 6.301020408163265, |
| "grad_norm": 0.4173048436641693, |
| "learning_rate": 0.00019160211267605634, |
| "loss": 1.4542, |
| "step": 37050 |
| }, |
| { |
| "epoch": 6.309523809523809, |
| "grad_norm": 0.4043162167072296, |
| "learning_rate": 0.0001911619718309859, |
| "loss": 1.4541, |
| "step": 37100 |
| }, |
| { |
| "epoch": 6.318027210884353, |
| "grad_norm": 0.4198301136493683, |
| "learning_rate": 0.0001907218309859155, |
| "loss": 1.4526, |
| "step": 37150 |
| }, |
| { |
| "epoch": 6.326530612244898, |
| "grad_norm": 0.4946387708187103, |
| "learning_rate": 0.00019028169014084507, |
| "loss": 1.4631, |
| "step": 37200 |
| }, |
| { |
| "epoch": 6.335034013605442, |
| "grad_norm": 0.431738018989563, |
| "learning_rate": 0.00018984154929577466, |
| "loss": 1.449, |
| "step": 37250 |
| }, |
| { |
| "epoch": 6.343537414965986, |
| "grad_norm": 0.41441062092781067, |
| "learning_rate": 0.00018940140845070423, |
| "loss": 1.4501, |
| "step": 37300 |
| }, |
| { |
| "epoch": 6.3520408163265305, |
| "grad_norm": 0.41452255845069885, |
| "learning_rate": 0.0001889612676056338, |
| "loss": 1.445, |
| "step": 37350 |
| }, |
| { |
| "epoch": 6.360544217687075, |
| "grad_norm": 0.4315445125102997, |
| "learning_rate": 0.00018852112676056337, |
| "loss": 1.4556, |
| "step": 37400 |
| }, |
| { |
| "epoch": 6.369047619047619, |
| "grad_norm": 0.41262757778167725, |
| "learning_rate": 0.00018808098591549297, |
| "loss": 1.4531, |
| "step": 37450 |
| }, |
| { |
| "epoch": 6.377551020408164, |
| "grad_norm": 0.42860937118530273, |
| "learning_rate": 0.00018764084507042253, |
| "loss": 1.4612, |
| "step": 37500 |
| }, |
| { |
| "epoch": 6.386054421768708, |
| "grad_norm": 0.4128934442996979, |
| "learning_rate": 0.00018720070422535213, |
| "loss": 1.4467, |
| "step": 37550 |
| }, |
| { |
| "epoch": 6.394557823129252, |
| "grad_norm": 0.4122573137283325, |
| "learning_rate": 0.0001867605633802817, |
| "loss": 1.446, |
| "step": 37600 |
| }, |
| { |
| "epoch": 6.403061224489796, |
| "grad_norm": 0.41068488359451294, |
| "learning_rate": 0.0001863204225352113, |
| "loss": 1.4515, |
| "step": 37650 |
| }, |
| { |
| "epoch": 6.41156462585034, |
| "grad_norm": 0.4696698784828186, |
| "learning_rate": 0.00018588028169014083, |
| "loss": 1.4508, |
| "step": 37700 |
| }, |
| { |
| "epoch": 6.420068027210885, |
| "grad_norm": 0.41070887446403503, |
| "learning_rate": 0.00018544014084507043, |
| "loss": 1.4496, |
| "step": 37750 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 0.4017719626426697, |
| "learning_rate": 0.000185, |
| "loss": 1.4457, |
| "step": 37800 |
| }, |
| { |
| "epoch": 6.437074829931973, |
| "grad_norm": 0.420674592256546, |
| "learning_rate": 0.0001845598591549296, |
| "loss": 1.4524, |
| "step": 37850 |
| }, |
| { |
| "epoch": 6.445578231292517, |
| "grad_norm": 0.4091247618198395, |
| "learning_rate": 0.00018411971830985916, |
| "loss": 1.4454, |
| "step": 37900 |
| }, |
| { |
| "epoch": 6.454081632653061, |
| "grad_norm": 0.45321521162986755, |
| "learning_rate": 0.00018367957746478876, |
| "loss": 1.4511, |
| "step": 37950 |
| }, |
| { |
| "epoch": 6.462585034013605, |
| "grad_norm": 0.4094868004322052, |
| "learning_rate": 0.00018323943661971832, |
| "loss": 1.4523, |
| "step": 38000 |
| }, |
| { |
| "epoch": 6.462585034013605, |
| "eval_loss": 1.6195621490478516, |
| "eval_runtime": 75.373, |
| "eval_samples_per_second": 1244.995, |
| "eval_steps_per_second": 4.869, |
| "step": 38000 |
| }, |
| { |
| "epoch": 6.47108843537415, |
| "grad_norm": 0.41573402285575867, |
| "learning_rate": 0.00018279929577464787, |
| "loss": 1.4493, |
| "step": 38050 |
| }, |
| { |
| "epoch": 6.479591836734694, |
| "grad_norm": 0.4076358675956726, |
| "learning_rate": 0.00018235915492957746, |
| "loss": 1.4479, |
| "step": 38100 |
| }, |
| { |
| "epoch": 6.488095238095238, |
| "grad_norm": 0.41350802779197693, |
| "learning_rate": 0.00018192781690140844, |
| "loss": 1.4474, |
| "step": 38150 |
| }, |
| { |
| "epoch": 6.496598639455782, |
| "grad_norm": 0.42011725902557373, |
| "learning_rate": 0.00018149647887323944, |
| "loss": 1.4579, |
| "step": 38200 |
| }, |
| { |
| "epoch": 6.505102040816326, |
| "grad_norm": 0.4202333688735962, |
| "learning_rate": 0.000181056338028169, |
| "loss": 1.448, |
| "step": 38250 |
| }, |
| { |
| "epoch": 6.513605442176871, |
| "grad_norm": 0.41736331582069397, |
| "learning_rate": 0.0001806161971830986, |
| "loss": 1.4421, |
| "step": 38300 |
| }, |
| { |
| "epoch": 6.522108843537415, |
| "grad_norm": 0.4162420332431793, |
| "learning_rate": 0.00018017605633802817, |
| "loss": 1.4401, |
| "step": 38350 |
| }, |
| { |
| "epoch": 6.530612244897959, |
| "grad_norm": 0.40687423944473267, |
| "learning_rate": 0.00017973591549295777, |
| "loss": 1.4439, |
| "step": 38400 |
| }, |
| { |
| "epoch": 6.539115646258503, |
| "grad_norm": 0.42472130060195923, |
| "learning_rate": 0.0001792957746478873, |
| "loss": 1.4468, |
| "step": 38450 |
| }, |
| { |
| "epoch": 6.5476190476190474, |
| "grad_norm": 0.4216206669807434, |
| "learning_rate": 0.0001788556338028169, |
| "loss": 1.4454, |
| "step": 38500 |
| }, |
| { |
| "epoch": 6.5561224489795915, |
| "grad_norm": 0.4373023211956024, |
| "learning_rate": 0.00017841549295774647, |
| "loss": 1.4441, |
| "step": 38550 |
| }, |
| { |
| "epoch": 6.564625850340136, |
| "grad_norm": 0.4680120050907135, |
| "learning_rate": 0.00017797535211267607, |
| "loss": 1.4383, |
| "step": 38600 |
| }, |
| { |
| "epoch": 6.5731292517006805, |
| "grad_norm": 0.4241814911365509, |
| "learning_rate": 0.00017753521126760564, |
| "loss": 1.4413, |
| "step": 38650 |
| }, |
| { |
| "epoch": 6.581632653061225, |
| "grad_norm": 0.449913889169693, |
| "learning_rate": 0.00017709507042253523, |
| "loss": 1.4427, |
| "step": 38700 |
| }, |
| { |
| "epoch": 6.590136054421769, |
| "grad_norm": 0.4140756130218506, |
| "learning_rate": 0.0001766549295774648, |
| "loss": 1.4429, |
| "step": 38750 |
| }, |
| { |
| "epoch": 6.598639455782313, |
| "grad_norm": 0.4235837161540985, |
| "learning_rate": 0.00017621478873239437, |
| "loss": 1.4425, |
| "step": 38800 |
| }, |
| { |
| "epoch": 6.607142857142857, |
| "grad_norm": 0.4102327227592468, |
| "learning_rate": 0.00017577464788732394, |
| "loss": 1.4428, |
| "step": 38850 |
| }, |
| { |
| "epoch": 6.615646258503402, |
| "grad_norm": 0.41676509380340576, |
| "learning_rate": 0.00017533450704225353, |
| "loss": 1.4515, |
| "step": 38900 |
| }, |
| { |
| "epoch": 6.624149659863946, |
| "grad_norm": 0.41074737906455994, |
| "learning_rate": 0.0001748943661971831, |
| "loss": 1.4376, |
| "step": 38950 |
| }, |
| { |
| "epoch": 6.63265306122449, |
| "grad_norm": 0.4003889262676239, |
| "learning_rate": 0.00017445422535211267, |
| "loss": 1.4425, |
| "step": 39000 |
| }, |
| { |
| "epoch": 6.63265306122449, |
| "eval_loss": 1.6100125312805176, |
| "eval_runtime": 75.3679, |
| "eval_samples_per_second": 1245.078, |
| "eval_steps_per_second": 4.869, |
| "step": 39000 |
| }, |
| { |
| "epoch": 6.641156462585034, |
| "grad_norm": 0.4165302813053131, |
| "learning_rate": 0.00017401408450704226, |
| "loss": 1.4366, |
| "step": 39050 |
| }, |
| { |
| "epoch": 6.649659863945578, |
| "grad_norm": 0.4187438488006592, |
| "learning_rate": 0.00017357394366197183, |
| "loss": 1.4403, |
| "step": 39100 |
| }, |
| { |
| "epoch": 6.658163265306122, |
| "grad_norm": 0.41520532965660095, |
| "learning_rate": 0.0001731338028169014, |
| "loss": 1.4465, |
| "step": 39150 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.47344011068344116, |
| "learning_rate": 0.00017269366197183097, |
| "loss": 1.4451, |
| "step": 39200 |
| }, |
| { |
| "epoch": 6.675170068027211, |
| "grad_norm": 0.4409021735191345, |
| "learning_rate": 0.00017225352112676057, |
| "loss": 1.4367, |
| "step": 39250 |
| }, |
| { |
| "epoch": 6.683673469387755, |
| "grad_norm": 0.4071614742279053, |
| "learning_rate": 0.00017181338028169013, |
| "loss": 1.4343, |
| "step": 39300 |
| }, |
| { |
| "epoch": 6.692176870748299, |
| "grad_norm": 0.43377685546875, |
| "learning_rate": 0.00017137323943661973, |
| "loss": 1.4387, |
| "step": 39350 |
| }, |
| { |
| "epoch": 6.700680272108843, |
| "grad_norm": 0.41328275203704834, |
| "learning_rate": 0.0001709330985915493, |
| "loss": 1.4308, |
| "step": 39400 |
| }, |
| { |
| "epoch": 6.709183673469388, |
| "grad_norm": 0.4269335865974426, |
| "learning_rate": 0.0001704929577464789, |
| "loss": 1.4329, |
| "step": 39450 |
| }, |
| { |
| "epoch": 6.717687074829932, |
| "grad_norm": 0.4136207401752472, |
| "learning_rate": 0.00017005281690140843, |
| "loss": 1.4386, |
| "step": 39500 |
| }, |
| { |
| "epoch": 6.726190476190476, |
| "grad_norm": 0.41490909457206726, |
| "learning_rate": 0.00016961267605633803, |
| "loss": 1.4348, |
| "step": 39550 |
| }, |
| { |
| "epoch": 6.73469387755102, |
| "grad_norm": 0.41925111413002014, |
| "learning_rate": 0.0001691725352112676, |
| "loss": 1.4329, |
| "step": 39600 |
| }, |
| { |
| "epoch": 6.743197278911564, |
| "grad_norm": 0.4161663353443146, |
| "learning_rate": 0.0001687323943661972, |
| "loss": 1.4325, |
| "step": 39650 |
| }, |
| { |
| "epoch": 6.7517006802721085, |
| "grad_norm": 0.4175470769405365, |
| "learning_rate": 0.00016829225352112676, |
| "loss": 1.4331, |
| "step": 39700 |
| }, |
| { |
| "epoch": 6.760204081632653, |
| "grad_norm": 0.40980637073516846, |
| "learning_rate": 0.00016785211267605636, |
| "loss": 1.436, |
| "step": 39750 |
| }, |
| { |
| "epoch": 6.7687074829931975, |
| "grad_norm": 0.4164717197418213, |
| "learning_rate": 0.00016741197183098592, |
| "loss": 1.4354, |
| "step": 39800 |
| }, |
| { |
| "epoch": 6.7772108843537415, |
| "grad_norm": 0.41800665855407715, |
| "learning_rate": 0.00016697183098591552, |
| "loss": 1.4321, |
| "step": 39850 |
| }, |
| { |
| "epoch": 6.785714285714286, |
| "grad_norm": 0.4796483814716339, |
| "learning_rate": 0.00016653169014084506, |
| "loss": 1.428, |
| "step": 39900 |
| }, |
| { |
| "epoch": 6.79421768707483, |
| "grad_norm": 0.40372607111930847, |
| "learning_rate": 0.00016609154929577466, |
| "loss": 1.4335, |
| "step": 39950 |
| }, |
| { |
| "epoch": 6.802721088435375, |
| "grad_norm": 0.42244842648506165, |
| "learning_rate": 0.00016565140845070423, |
| "loss": 1.4391, |
| "step": 40000 |
| }, |
| { |
| "epoch": 6.802721088435375, |
| "eval_loss": 1.6078286170959473, |
| "eval_runtime": 75.3844, |
| "eval_samples_per_second": 1244.806, |
| "eval_steps_per_second": 4.868, |
| "step": 40000 |
| }, |
| { |
| "epoch": 6.811224489795919, |
| "grad_norm": 0.41039007902145386, |
| "learning_rate": 0.00016521126760563382, |
| "loss": 1.4388, |
| "step": 40050 |
| }, |
| { |
| "epoch": 6.819727891156463, |
| "grad_norm": 0.4182026982307434, |
| "learning_rate": 0.0001647711267605634, |
| "loss": 1.4335, |
| "step": 40100 |
| }, |
| { |
| "epoch": 6.828231292517007, |
| "grad_norm": 0.4472804665565491, |
| "learning_rate": 0.00016433098591549296, |
| "loss": 1.4357, |
| "step": 40150 |
| }, |
| { |
| "epoch": 6.836734693877551, |
| "grad_norm": 0.4294123649597168, |
| "learning_rate": 0.00016389084507042255, |
| "loss": 1.4355, |
| "step": 40200 |
| }, |
| { |
| "epoch": 6.845238095238095, |
| "grad_norm": 0.41815003752708435, |
| "learning_rate": 0.0001634507042253521, |
| "loss": 1.4324, |
| "step": 40250 |
| }, |
| { |
| "epoch": 6.853741496598639, |
| "grad_norm": 0.4064941704273224, |
| "learning_rate": 0.0001630105633802817, |
| "loss": 1.4366, |
| "step": 40300 |
| }, |
| { |
| "epoch": 6.862244897959184, |
| "grad_norm": 0.42009902000427246, |
| "learning_rate": 0.00016257922535211267, |
| "loss": 1.4325, |
| "step": 40350 |
| }, |
| { |
| "epoch": 6.870748299319728, |
| "grad_norm": 0.3989739716053009, |
| "learning_rate": 0.00016213908450704226, |
| "loss": 1.4238, |
| "step": 40400 |
| }, |
| { |
| "epoch": 6.879251700680272, |
| "grad_norm": 0.4133879840373993, |
| "learning_rate": 0.00016169894366197183, |
| "loss": 1.4281, |
| "step": 40450 |
| }, |
| { |
| "epoch": 6.887755102040816, |
| "grad_norm": 0.42547741532325745, |
| "learning_rate": 0.00016125880281690142, |
| "loss": 1.4279, |
| "step": 40500 |
| }, |
| { |
| "epoch": 6.896258503401361, |
| "grad_norm": 0.3915853202342987, |
| "learning_rate": 0.000160818661971831, |
| "loss": 1.4351, |
| "step": 40550 |
| }, |
| { |
| "epoch": 6.904761904761905, |
| "grad_norm": 0.4148283302783966, |
| "learning_rate": 0.00016037852112676056, |
| "loss": 1.4332, |
| "step": 40600 |
| }, |
| { |
| "epoch": 6.913265306122449, |
| "grad_norm": 0.4044345021247864, |
| "learning_rate": 0.00015993838028169013, |
| "loss": 1.4304, |
| "step": 40650 |
| }, |
| { |
| "epoch": 6.921768707482993, |
| "grad_norm": 0.4137207269668579, |
| "learning_rate": 0.00015949823943661972, |
| "loss": 1.4342, |
| "step": 40700 |
| }, |
| { |
| "epoch": 6.930272108843537, |
| "grad_norm": 0.40803512930870056, |
| "learning_rate": 0.0001590580985915493, |
| "loss": 1.4287, |
| "step": 40750 |
| }, |
| { |
| "epoch": 6.938775510204081, |
| "grad_norm": 0.4186759889125824, |
| "learning_rate": 0.0001586179577464789, |
| "loss": 1.4269, |
| "step": 40800 |
| }, |
| { |
| "epoch": 6.947278911564625, |
| "grad_norm": 0.5999934077262878, |
| "learning_rate": 0.00015817781690140846, |
| "loss": 1.4267, |
| "step": 40850 |
| }, |
| { |
| "epoch": 6.95578231292517, |
| "grad_norm": 3.3650548458099365, |
| "learning_rate": 0.00015773767605633805, |
| "loss": 1.4273, |
| "step": 40900 |
| }, |
| { |
| "epoch": 6.964285714285714, |
| "grad_norm": 0.4189580976963043, |
| "learning_rate": 0.0001572975352112676, |
| "loss": 1.4246, |
| "step": 40950 |
| }, |
| { |
| "epoch": 6.9727891156462585, |
| "grad_norm": 0.4199207127094269, |
| "learning_rate": 0.0001568573943661972, |
| "loss": 1.4264, |
| "step": 41000 |
| }, |
| { |
| "epoch": 6.9727891156462585, |
| "eval_loss": 1.6002153158187866, |
| "eval_runtime": 75.3647, |
| "eval_samples_per_second": 1245.132, |
| "eval_steps_per_second": 4.87, |
| "step": 41000 |
| }, |
| { |
| "epoch": 6.9812925170068025, |
| "grad_norm": 0.4282417595386505, |
| "learning_rate": 0.00015641725352112676, |
| "loss": 1.4302, |
| "step": 41050 |
| }, |
| { |
| "epoch": 6.989795918367347, |
| "grad_norm": 0.4040624797344208, |
| "learning_rate": 0.00015597711267605635, |
| "loss": 1.4289, |
| "step": 41100 |
| }, |
| { |
| "epoch": 6.9982993197278915, |
| "grad_norm": 0.4131171703338623, |
| "learning_rate": 0.00015553697183098592, |
| "loss": 1.4265, |
| "step": 41150 |
| }, |
| { |
| "epoch": 7.006802721088436, |
| "grad_norm": 0.41220322251319885, |
| "learning_rate": 0.0001550968309859155, |
| "loss": 1.421, |
| "step": 41200 |
| }, |
| { |
| "epoch": 7.01530612244898, |
| "grad_norm": 0.41320499777793884, |
| "learning_rate": 0.00015465669014084508, |
| "loss": 1.4126, |
| "step": 41250 |
| }, |
| { |
| "epoch": 7.023809523809524, |
| "grad_norm": 0.43131372332572937, |
| "learning_rate": 0.00015421654929577463, |
| "loss": 1.4185, |
| "step": 41300 |
| }, |
| { |
| "epoch": 7.032312925170068, |
| "grad_norm": 0.41249164938926697, |
| "learning_rate": 0.00015377640845070422, |
| "loss": 1.4225, |
| "step": 41350 |
| }, |
| { |
| "epoch": 7.040816326530612, |
| "grad_norm": 0.41785308718681335, |
| "learning_rate": 0.0001533362676056338, |
| "loss": 1.4193, |
| "step": 41400 |
| }, |
| { |
| "epoch": 7.049319727891157, |
| "grad_norm": 0.4036597013473511, |
| "learning_rate": 0.00015289612676056338, |
| "loss": 1.417, |
| "step": 41450 |
| }, |
| { |
| "epoch": 7.057823129251701, |
| "grad_norm": 1.470516562461853, |
| "learning_rate": 0.00015245598591549295, |
| "loss": 1.42, |
| "step": 41500 |
| }, |
| { |
| "epoch": 7.066326530612245, |
| "grad_norm": 0.4129942059516907, |
| "learning_rate": 0.00015201584507042255, |
| "loss": 1.4191, |
| "step": 41550 |
| }, |
| { |
| "epoch": 7.074829931972789, |
| "grad_norm": 0.41847845911979675, |
| "learning_rate": 0.00015157570422535212, |
| "loss": 1.4146, |
| "step": 41600 |
| }, |
| { |
| "epoch": 7.083333333333333, |
| "grad_norm": 0.4097050428390503, |
| "learning_rate": 0.0001511355633802817, |
| "loss": 1.4202, |
| "step": 41650 |
| }, |
| { |
| "epoch": 7.091836734693878, |
| "grad_norm": 0.42190149426460266, |
| "learning_rate": 0.00015069542253521125, |
| "loss": 1.4191, |
| "step": 41700 |
| }, |
| { |
| "epoch": 7.100340136054422, |
| "grad_norm": 0.4126221835613251, |
| "learning_rate": 0.00015025528169014085, |
| "loss": 1.4157, |
| "step": 41750 |
| }, |
| { |
| "epoch": 7.108843537414966, |
| "grad_norm": 0.42145752906799316, |
| "learning_rate": 0.00014981514084507042, |
| "loss": 1.4152, |
| "step": 41800 |
| }, |
| { |
| "epoch": 7.11734693877551, |
| "grad_norm": 0.41415074467658997, |
| "learning_rate": 0.000149375, |
| "loss": 1.4233, |
| "step": 41850 |
| }, |
| { |
| "epoch": 7.125850340136054, |
| "grad_norm": 0.4122840166091919, |
| "learning_rate": 0.00014893485915492958, |
| "loss": 1.4194, |
| "step": 41900 |
| }, |
| { |
| "epoch": 7.134353741496598, |
| "grad_norm": 0.4208846986293793, |
| "learning_rate": 0.00014849471830985918, |
| "loss": 1.417, |
| "step": 41950 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 0.41025856137275696, |
| "learning_rate": 0.00014805457746478874, |
| "loss": 1.4138, |
| "step": 42000 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "eval_loss": 1.5856564044952393, |
| "eval_runtime": 75.3402, |
| "eval_samples_per_second": 1245.536, |
| "eval_steps_per_second": 4.871, |
| "step": 42000 |
| }, |
| { |
| "epoch": 7.151360544217687, |
| "grad_norm": 0.4345760941505432, |
| "learning_rate": 0.0001476144366197183, |
| "loss": 1.4162, |
| "step": 42050 |
| }, |
| { |
| "epoch": 7.159863945578231, |
| "grad_norm": 0.42329126596450806, |
| "learning_rate": 0.00014717429577464788, |
| "loss": 1.4183, |
| "step": 42100 |
| }, |
| { |
| "epoch": 7.168367346938775, |
| "grad_norm": 0.45433273911476135, |
| "learning_rate": 0.00014673415492957748, |
| "loss": 1.4209, |
| "step": 42150 |
| }, |
| { |
| "epoch": 7.1768707482993195, |
| "grad_norm": 0.401620477437973, |
| "learning_rate": 0.00014629401408450704, |
| "loss": 1.4157, |
| "step": 42200 |
| }, |
| { |
| "epoch": 7.1853741496598635, |
| "grad_norm": 0.4111977219581604, |
| "learning_rate": 0.00014585387323943664, |
| "loss": 1.4176, |
| "step": 42250 |
| }, |
| { |
| "epoch": 7.1938775510204085, |
| "grad_norm": 0.4081551134586334, |
| "learning_rate": 0.0001454137323943662, |
| "loss": 1.4175, |
| "step": 42300 |
| }, |
| { |
| "epoch": 7.2023809523809526, |
| "grad_norm": 0.41629016399383545, |
| "learning_rate": 0.0001449735915492958, |
| "loss": 1.4156, |
| "step": 42350 |
| }, |
| { |
| "epoch": 7.210884353741497, |
| "grad_norm": 0.4202435314655304, |
| "learning_rate": 0.00014453345070422534, |
| "loss": 1.4158, |
| "step": 42400 |
| }, |
| { |
| "epoch": 7.219387755102041, |
| "grad_norm": 0.4117106795310974, |
| "learning_rate": 0.0001440933098591549, |
| "loss": 1.4128, |
| "step": 42450 |
| }, |
| { |
| "epoch": 7.227891156462585, |
| "grad_norm": 0.40955278277397156, |
| "learning_rate": 0.0001436531690140845, |
| "loss": 1.4101, |
| "step": 42500 |
| }, |
| { |
| "epoch": 7.23639455782313, |
| "grad_norm": 0.4062648415565491, |
| "learning_rate": 0.00014321302816901408, |
| "loss": 1.4123, |
| "step": 42550 |
| }, |
| { |
| "epoch": 7.244897959183674, |
| "grad_norm": 0.39645469188690186, |
| "learning_rate": 0.00014277288732394367, |
| "loss": 1.4122, |
| "step": 42600 |
| }, |
| { |
| "epoch": 7.253401360544218, |
| "grad_norm": 0.4102307856082916, |
| "learning_rate": 0.00014233274647887324, |
| "loss": 1.4142, |
| "step": 42650 |
| }, |
| { |
| "epoch": 7.261904761904762, |
| "grad_norm": 0.39169833064079285, |
| "learning_rate": 0.00014189260563380284, |
| "loss": 1.4123, |
| "step": 42700 |
| }, |
| { |
| "epoch": 7.270408163265306, |
| "grad_norm": 0.40885522961616516, |
| "learning_rate": 0.00014145246478873238, |
| "loss": 1.4099, |
| "step": 42750 |
| }, |
| { |
| "epoch": 7.27891156462585, |
| "grad_norm": 0.40849608182907104, |
| "learning_rate": 0.00014101232394366197, |
| "loss": 1.4112, |
| "step": 42800 |
| }, |
| { |
| "epoch": 7.287414965986395, |
| "grad_norm": 0.416453093290329, |
| "learning_rate": 0.00014057218309859154, |
| "loss": 1.4139, |
| "step": 42850 |
| }, |
| { |
| "epoch": 7.295918367346939, |
| "grad_norm": 0.40005016326904297, |
| "learning_rate": 0.00014013204225352114, |
| "loss": 1.4122, |
| "step": 42900 |
| }, |
| { |
| "epoch": 7.304421768707483, |
| "grad_norm": 0.40970712900161743, |
| "learning_rate": 0.0001396919014084507, |
| "loss": 1.4071, |
| "step": 42950 |
| }, |
| { |
| "epoch": 7.312925170068027, |
| "grad_norm": 0.3983543813228607, |
| "learning_rate": 0.0001392517605633803, |
| "loss": 1.4095, |
| "step": 43000 |
| }, |
| { |
| "epoch": 7.312925170068027, |
| "eval_loss": 1.581950068473816, |
| "eval_runtime": 75.8081, |
| "eval_samples_per_second": 1237.849, |
| "eval_steps_per_second": 4.841, |
| "step": 43000 |
| }, |
| { |
| "epoch": 7.321428571428571, |
| "grad_norm": 0.42998814582824707, |
| "learning_rate": 0.00013881161971830987, |
| "loss": 1.4165, |
| "step": 43050 |
| }, |
| { |
| "epoch": 7.329931972789115, |
| "grad_norm": 0.40602609515190125, |
| "learning_rate": 0.00013837147887323944, |
| "loss": 1.4079, |
| "step": 43100 |
| }, |
| { |
| "epoch": 7.33843537414966, |
| "grad_norm": 0.4095648229122162, |
| "learning_rate": 0.000137931338028169, |
| "loss": 1.4155, |
| "step": 43150 |
| }, |
| { |
| "epoch": 7.346938775510204, |
| "grad_norm": 0.5845966339111328, |
| "learning_rate": 0.0001374911971830986, |
| "loss": 1.3978, |
| "step": 43200 |
| }, |
| { |
| "epoch": 7.355442176870748, |
| "grad_norm": 0.39307621121406555, |
| "learning_rate": 0.00013705105633802817, |
| "loss": 1.4063, |
| "step": 43250 |
| }, |
| { |
| "epoch": 7.363945578231292, |
| "grad_norm": 0.41059941053390503, |
| "learning_rate": 0.00013661091549295776, |
| "loss": 1.4111, |
| "step": 43300 |
| }, |
| { |
| "epoch": 7.372448979591836, |
| "grad_norm": 0.4473901689052582, |
| "learning_rate": 0.00013617077464788733, |
| "loss": 1.4107, |
| "step": 43350 |
| }, |
| { |
| "epoch": 7.380952380952381, |
| "grad_norm": 0.4176762104034424, |
| "learning_rate": 0.00013573063380281693, |
| "loss": 1.4074, |
| "step": 43400 |
| }, |
| { |
| "epoch": 7.389455782312925, |
| "grad_norm": 0.40306830406188965, |
| "learning_rate": 0.00013529049295774647, |
| "loss": 1.4018, |
| "step": 43450 |
| }, |
| { |
| "epoch": 7.3979591836734695, |
| "grad_norm": 0.40745970606803894, |
| "learning_rate": 0.00013485035211267606, |
| "loss": 1.4094, |
| "step": 43500 |
| }, |
| { |
| "epoch": 7.406462585034014, |
| "grad_norm": 0.4084283411502838, |
| "learning_rate": 0.00013441021126760563, |
| "loss": 1.4065, |
| "step": 43550 |
| }, |
| { |
| "epoch": 7.414965986394558, |
| "grad_norm": 0.3982578217983246, |
| "learning_rate": 0.0001339700704225352, |
| "loss": 1.4058, |
| "step": 43600 |
| }, |
| { |
| "epoch": 7.423469387755102, |
| "grad_norm": 0.40555986762046814, |
| "learning_rate": 0.0001335299295774648, |
| "loss": 1.4071, |
| "step": 43650 |
| }, |
| { |
| "epoch": 7.431972789115647, |
| "grad_norm": 0.41015100479125977, |
| "learning_rate": 0.00013308978873239436, |
| "loss": 1.4055, |
| "step": 43700 |
| }, |
| { |
| "epoch": 7.440476190476191, |
| "grad_norm": 0.39785054326057434, |
| "learning_rate": 0.00013264964788732396, |
| "loss": 1.4052, |
| "step": 43750 |
| }, |
| { |
| "epoch": 7.448979591836735, |
| "grad_norm": 0.4046226441860199, |
| "learning_rate": 0.0001322095070422535, |
| "loss": 1.4099, |
| "step": 43800 |
| }, |
| { |
| "epoch": 7.457482993197279, |
| "grad_norm": 0.4013233482837677, |
| "learning_rate": 0.0001317693661971831, |
| "loss": 1.4105, |
| "step": 43850 |
| }, |
| { |
| "epoch": 7.465986394557823, |
| "grad_norm": 0.42137226462364197, |
| "learning_rate": 0.00013132922535211266, |
| "loss": 1.4074, |
| "step": 43900 |
| }, |
| { |
| "epoch": 7.474489795918368, |
| "grad_norm": 0.4147741496562958, |
| "learning_rate": 0.00013088908450704226, |
| "loss": 1.4055, |
| "step": 43950 |
| }, |
| { |
| "epoch": 7.482993197278912, |
| "grad_norm": 0.43600887060165405, |
| "learning_rate": 0.00013044894366197183, |
| "loss": 1.4049, |
| "step": 44000 |
| }, |
| { |
| "epoch": 7.482993197278912, |
| "eval_loss": 1.575681209564209, |
| "eval_runtime": 75.3426, |
| "eval_samples_per_second": 1245.498, |
| "eval_steps_per_second": 4.871, |
| "step": 44000 |
| }, |
| { |
| "epoch": 7.491496598639456, |
| "grad_norm": 0.40329524874687195, |
| "learning_rate": 0.00013000880281690142, |
| "loss": 1.406, |
| "step": 44050 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.39978840947151184, |
| "learning_rate": 0.000129568661971831, |
| "loss": 1.4051, |
| "step": 44100 |
| }, |
| { |
| "epoch": 7.508503401360544, |
| "grad_norm": 0.4256359934806824, |
| "learning_rate": 0.00012912852112676056, |
| "loss": 1.4096, |
| "step": 44150 |
| }, |
| { |
| "epoch": 7.517006802721088, |
| "grad_norm": 0.41504326462745667, |
| "learning_rate": 0.00012868838028169013, |
| "loss": 1.411, |
| "step": 44200 |
| }, |
| { |
| "epoch": 7.525510204081632, |
| "grad_norm": 0.41557520627975464, |
| "learning_rate": 0.00012824823943661972, |
| "loss": 1.3988, |
| "step": 44250 |
| }, |
| { |
| "epoch": 7.534013605442177, |
| "grad_norm": 0.654144287109375, |
| "learning_rate": 0.0001278080985915493, |
| "loss": 1.4002, |
| "step": 44300 |
| }, |
| { |
| "epoch": 7.542517006802721, |
| "grad_norm": 0.40340688824653625, |
| "learning_rate": 0.0001273679577464789, |
| "loss": 1.407, |
| "step": 44350 |
| }, |
| { |
| "epoch": 7.551020408163265, |
| "grad_norm": 0.4111216068267822, |
| "learning_rate": 0.00012692781690140846, |
| "loss": 1.4051, |
| "step": 44400 |
| }, |
| { |
| "epoch": 7.559523809523809, |
| "grad_norm": 0.40301740169525146, |
| "learning_rate": 0.00012648767605633805, |
| "loss": 1.4063, |
| "step": 44450 |
| }, |
| { |
| "epoch": 7.568027210884353, |
| "grad_norm": 0.40716397762298584, |
| "learning_rate": 0.0001260475352112676, |
| "loss": 1.4026, |
| "step": 44500 |
| }, |
| { |
| "epoch": 7.576530612244898, |
| "grad_norm": 0.40946149826049805, |
| "learning_rate": 0.0001256073943661972, |
| "loss": 1.3999, |
| "step": 44550 |
| }, |
| { |
| "epoch": 7.585034013605442, |
| "grad_norm": 0.4165254533290863, |
| "learning_rate": 0.00012516725352112676, |
| "loss": 1.4061, |
| "step": 44600 |
| }, |
| { |
| "epoch": 7.593537414965986, |
| "grad_norm": 0.4061264097690582, |
| "learning_rate": 0.00012472711267605635, |
| "loss": 1.3952, |
| "step": 44650 |
| }, |
| { |
| "epoch": 7.6020408163265305, |
| "grad_norm": 0.4414292573928833, |
| "learning_rate": 0.00012428697183098592, |
| "loss": 1.4061, |
| "step": 44700 |
| }, |
| { |
| "epoch": 7.610544217687075, |
| "grad_norm": 0.39658012986183167, |
| "learning_rate": 0.0001238468309859155, |
| "loss": 1.3985, |
| "step": 44750 |
| }, |
| { |
| "epoch": 7.619047619047619, |
| "grad_norm": 0.7822299599647522, |
| "learning_rate": 0.00012340669014084508, |
| "loss": 1.3993, |
| "step": 44800 |
| }, |
| { |
| "epoch": 7.627551020408164, |
| "grad_norm": 0.3982444703578949, |
| "learning_rate": 0.00012296654929577465, |
| "loss": 1.4011, |
| "step": 44850 |
| }, |
| { |
| "epoch": 7.636054421768708, |
| "grad_norm": 0.40807053446769714, |
| "learning_rate": 0.00012252640845070422, |
| "loss": 1.3984, |
| "step": 44900 |
| }, |
| { |
| "epoch": 7.644557823129252, |
| "grad_norm": 0.4086779057979584, |
| "learning_rate": 0.00012208626760563382, |
| "loss": 1.403, |
| "step": 44950 |
| }, |
| { |
| "epoch": 7.653061224489796, |
| "grad_norm": 0.4078817367553711, |
| "learning_rate": 0.00012164612676056338, |
| "loss": 1.3974, |
| "step": 45000 |
| }, |
| { |
| "epoch": 7.653061224489796, |
| "eval_loss": 1.564176082611084, |
| "eval_runtime": 75.8799, |
| "eval_samples_per_second": 1236.678, |
| "eval_steps_per_second": 4.837, |
| "step": 45000 |
| }, |
| { |
| "epoch": 7.66156462585034, |
| "grad_norm": 0.4033880829811096, |
| "learning_rate": 0.00012120598591549297, |
| "loss": 1.396, |
| "step": 45050 |
| }, |
| { |
| "epoch": 7.670068027210885, |
| "grad_norm": 0.41422680020332336, |
| "learning_rate": 0.00012076584507042255, |
| "loss": 1.3999, |
| "step": 45100 |
| }, |
| { |
| "epoch": 7.678571428571429, |
| "grad_norm": 0.41265836358070374, |
| "learning_rate": 0.00012032570422535212, |
| "loss": 1.3974, |
| "step": 45150 |
| }, |
| { |
| "epoch": 7.687074829931973, |
| "grad_norm": 0.4050711989402771, |
| "learning_rate": 0.0001198855633802817, |
| "loss": 1.3992, |
| "step": 45200 |
| }, |
| { |
| "epoch": 7.695578231292517, |
| "grad_norm": 0.4541753828525543, |
| "learning_rate": 0.00011944542253521128, |
| "loss": 1.3956, |
| "step": 45250 |
| }, |
| { |
| "epoch": 7.704081632653061, |
| "grad_norm": 0.469691663980484, |
| "learning_rate": 0.00011900528169014085, |
| "loss": 1.3941, |
| "step": 45300 |
| }, |
| { |
| "epoch": 7.712585034013605, |
| "grad_norm": 0.4149336516857147, |
| "learning_rate": 0.00011856514084507042, |
| "loss": 1.3926, |
| "step": 45350 |
| }, |
| { |
| "epoch": 7.72108843537415, |
| "grad_norm": 0.3968436121940613, |
| "learning_rate": 0.000118125, |
| "loss": 1.3945, |
| "step": 45400 |
| }, |
| { |
| "epoch": 7.729591836734694, |
| "grad_norm": 0.4118001461029053, |
| "learning_rate": 0.00011768485915492958, |
| "loss": 1.3945, |
| "step": 45450 |
| }, |
| { |
| "epoch": 7.738095238095238, |
| "grad_norm": 0.4041975438594818, |
| "learning_rate": 0.00011724471830985915, |
| "loss": 1.3958, |
| "step": 45500 |
| }, |
| { |
| "epoch": 7.746598639455782, |
| "grad_norm": 0.4147978723049164, |
| "learning_rate": 0.00011680457746478873, |
| "loss": 1.3885, |
| "step": 45550 |
| }, |
| { |
| "epoch": 7.755102040816326, |
| "grad_norm": 0.4063569903373718, |
| "learning_rate": 0.00011636443661971831, |
| "loss": 1.3955, |
| "step": 45600 |
| }, |
| { |
| "epoch": 7.763605442176871, |
| "grad_norm": 0.4077317416667938, |
| "learning_rate": 0.00011592429577464788, |
| "loss": 1.3969, |
| "step": 45650 |
| }, |
| { |
| "epoch": 7.772108843537415, |
| "grad_norm": 0.4094686508178711, |
| "learning_rate": 0.00011548415492957746, |
| "loss": 1.3956, |
| "step": 45700 |
| }, |
| { |
| "epoch": 7.780612244897959, |
| "grad_norm": 0.46225419640541077, |
| "learning_rate": 0.00011504401408450704, |
| "loss": 1.3964, |
| "step": 45750 |
| }, |
| { |
| "epoch": 7.789115646258503, |
| "grad_norm": 0.40823882818222046, |
| "learning_rate": 0.00011460387323943663, |
| "loss": 1.3865, |
| "step": 45800 |
| }, |
| { |
| "epoch": 7.7976190476190474, |
| "grad_norm": 0.4115474224090576, |
| "learning_rate": 0.0001141637323943662, |
| "loss": 1.3946, |
| "step": 45850 |
| }, |
| { |
| "epoch": 7.8061224489795915, |
| "grad_norm": 0.4270274043083191, |
| "learning_rate": 0.00011372359154929578, |
| "loss": 1.3961, |
| "step": 45900 |
| }, |
| { |
| "epoch": 7.814625850340136, |
| "grad_norm": 0.4096102714538574, |
| "learning_rate": 0.00011328345070422536, |
| "loss": 1.3909, |
| "step": 45950 |
| }, |
| { |
| "epoch": 7.8231292517006805, |
| "grad_norm": 0.41915324330329895, |
| "learning_rate": 0.00011284330985915494, |
| "loss": 1.3942, |
| "step": 46000 |
| }, |
| { |
| "epoch": 7.8231292517006805, |
| "eval_loss": 1.5574978590011597, |
| "eval_runtime": 75.3679, |
| "eval_samples_per_second": 1245.08, |
| "eval_steps_per_second": 4.869, |
| "step": 46000 |
| }, |
| { |
| "epoch": 7.831632653061225, |
| "grad_norm": 0.42200446128845215, |
| "learning_rate": 0.00011241197183098592, |
| "loss": 1.3898, |
| "step": 46050 |
| }, |
| { |
| "epoch": 7.840136054421769, |
| "grad_norm": 0.40634581446647644, |
| "learning_rate": 0.0001119718309859155, |
| "loss": 1.3872, |
| "step": 46100 |
| }, |
| { |
| "epoch": 7.848639455782313, |
| "grad_norm": 0.40481993556022644, |
| "learning_rate": 0.00011153169014084508, |
| "loss": 1.394, |
| "step": 46150 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 0.8925232887268066, |
| "learning_rate": 0.00011109154929577466, |
| "loss": 1.3921, |
| "step": 46200 |
| }, |
| { |
| "epoch": 7.865646258503402, |
| "grad_norm": 0.5260804295539856, |
| "learning_rate": 0.00011065140845070423, |
| "loss": 1.3892, |
| "step": 46250 |
| }, |
| { |
| "epoch": 7.874149659863946, |
| "grad_norm": 0.4104487895965576, |
| "learning_rate": 0.00011021126760563381, |
| "loss": 1.3966, |
| "step": 46300 |
| }, |
| { |
| "epoch": 7.88265306122449, |
| "grad_norm": 0.4222128391265869, |
| "learning_rate": 0.0001097711267605634, |
| "loss": 1.3911, |
| "step": 46350 |
| }, |
| { |
| "epoch": 7.891156462585034, |
| "grad_norm": 0.4017203748226166, |
| "learning_rate": 0.00010933098591549295, |
| "loss": 1.3858, |
| "step": 46400 |
| }, |
| { |
| "epoch": 7.899659863945578, |
| "grad_norm": 0.41633373498916626, |
| "learning_rate": 0.00010889084507042253, |
| "loss": 1.3888, |
| "step": 46450 |
| }, |
| { |
| "epoch": 7.908163265306122, |
| "grad_norm": 0.41837745904922485, |
| "learning_rate": 0.00010845070422535211, |
| "loss": 1.3827, |
| "step": 46500 |
| }, |
| { |
| "epoch": 7.916666666666667, |
| "grad_norm": 0.4108099341392517, |
| "learning_rate": 0.0001080105633802817, |
| "loss": 1.3867, |
| "step": 46550 |
| }, |
| { |
| "epoch": 7.925170068027211, |
| "grad_norm": 0.4089981019496918, |
| "learning_rate": 0.00010757042253521126, |
| "loss": 1.3883, |
| "step": 46600 |
| }, |
| { |
| "epoch": 7.933673469387755, |
| "grad_norm": 0.40665385127067566, |
| "learning_rate": 0.00010713028169014084, |
| "loss": 1.3867, |
| "step": 46650 |
| }, |
| { |
| "epoch": 7.942176870748299, |
| "grad_norm": 0.4587544798851013, |
| "learning_rate": 0.00010669014084507043, |
| "loss": 1.3887, |
| "step": 46700 |
| }, |
| { |
| "epoch": 7.950680272108843, |
| "grad_norm": 0.41734760999679565, |
| "learning_rate": 0.00010625, |
| "loss": 1.3865, |
| "step": 46750 |
| }, |
| { |
| "epoch": 7.959183673469388, |
| "grad_norm": 0.3909563422203064, |
| "learning_rate": 0.00010580985915492958, |
| "loss": 1.3826, |
| "step": 46800 |
| }, |
| { |
| "epoch": 7.967687074829932, |
| "grad_norm": 0.4237678647041321, |
| "learning_rate": 0.00010536971830985916, |
| "loss": 1.3844, |
| "step": 46850 |
| }, |
| { |
| "epoch": 7.976190476190476, |
| "grad_norm": 0.3990273177623749, |
| "learning_rate": 0.00010492957746478874, |
| "loss": 1.3834, |
| "step": 46900 |
| }, |
| { |
| "epoch": 7.98469387755102, |
| "grad_norm": 0.4431270360946655, |
| "learning_rate": 0.00010448943661971831, |
| "loss": 1.3829, |
| "step": 46950 |
| }, |
| { |
| "epoch": 7.993197278911564, |
| "grad_norm": 0.423221230506897, |
| "learning_rate": 0.00010404929577464789, |
| "loss": 1.3861, |
| "step": 47000 |
| }, |
| { |
| "epoch": 7.993197278911564, |
| "eval_loss": 1.5540677309036255, |
| "eval_runtime": 75.3265, |
| "eval_samples_per_second": 1245.764, |
| "eval_steps_per_second": 4.872, |
| "step": 47000 |
| }, |
| { |
| "epoch": 8.001700680272108, |
| "grad_norm": 0.4330544173717499, |
| "learning_rate": 0.00010360915492957747, |
| "loss": 1.3834, |
| "step": 47050 |
| }, |
| { |
| "epoch": 8.010204081632653, |
| "grad_norm": 0.4099094867706299, |
| "learning_rate": 0.00010316901408450704, |
| "loss": 1.3826, |
| "step": 47100 |
| }, |
| { |
| "epoch": 8.018707482993197, |
| "grad_norm": 0.41134965419769287, |
| "learning_rate": 0.00010272887323943662, |
| "loss": 1.3846, |
| "step": 47150 |
| }, |
| { |
| "epoch": 8.02721088435374, |
| "grad_norm": 0.4183215796947479, |
| "learning_rate": 0.0001022887323943662, |
| "loss": 1.379, |
| "step": 47200 |
| }, |
| { |
| "epoch": 8.035714285714286, |
| "grad_norm": 0.39894890785217285, |
| "learning_rate": 0.00010184859154929579, |
| "loss": 1.3839, |
| "step": 47250 |
| }, |
| { |
| "epoch": 8.04421768707483, |
| "grad_norm": 0.4777671992778778, |
| "learning_rate": 0.00010140845070422535, |
| "loss": 1.3789, |
| "step": 47300 |
| }, |
| { |
| "epoch": 8.052721088435375, |
| "grad_norm": 0.41188567876815796, |
| "learning_rate": 0.00010096830985915494, |
| "loss": 1.3848, |
| "step": 47350 |
| }, |
| { |
| "epoch": 8.061224489795919, |
| "grad_norm": 0.4015198349952698, |
| "learning_rate": 0.00010052816901408452, |
| "loss": 1.3883, |
| "step": 47400 |
| }, |
| { |
| "epoch": 8.069727891156463, |
| "grad_norm": 0.401507169008255, |
| "learning_rate": 0.00010008802816901409, |
| "loss": 1.381, |
| "step": 47450 |
| }, |
| { |
| "epoch": 8.078231292517007, |
| "grad_norm": 0.40761667490005493, |
| "learning_rate": 9.964788732394367e-05, |
| "loss": 1.3714, |
| "step": 47500 |
| }, |
| { |
| "epoch": 8.08673469387755, |
| "grad_norm": 0.427236407995224, |
| "learning_rate": 9.920774647887325e-05, |
| "loss": 1.3799, |
| "step": 47550 |
| }, |
| { |
| "epoch": 8.095238095238095, |
| "grad_norm": 0.40780577063560486, |
| "learning_rate": 9.876760563380282e-05, |
| "loss": 1.3762, |
| "step": 47600 |
| }, |
| { |
| "epoch": 8.103741496598639, |
| "grad_norm": 0.4071747064590454, |
| "learning_rate": 9.832746478873239e-05, |
| "loss": 1.3815, |
| "step": 47650 |
| }, |
| { |
| "epoch": 8.112244897959183, |
| "grad_norm": 0.40746966004371643, |
| "learning_rate": 9.788732394366197e-05, |
| "loss": 1.3794, |
| "step": 47700 |
| }, |
| { |
| "epoch": 8.120748299319727, |
| "grad_norm": 0.39780792593955994, |
| "learning_rate": 9.744718309859155e-05, |
| "loss": 1.3804, |
| "step": 47750 |
| }, |
| { |
| "epoch": 8.129251700680273, |
| "grad_norm": 0.4328888952732086, |
| "learning_rate": 9.700704225352112e-05, |
| "loss": 1.3792, |
| "step": 47800 |
| }, |
| { |
| "epoch": 8.137755102040817, |
| "grad_norm": 0.40212351083755493, |
| "learning_rate": 9.65669014084507e-05, |
| "loss": 1.3702, |
| "step": 47850 |
| }, |
| { |
| "epoch": 8.146258503401361, |
| "grad_norm": 0.3966144919395447, |
| "learning_rate": 9.612676056338028e-05, |
| "loss": 1.3761, |
| "step": 47900 |
| }, |
| { |
| "epoch": 8.154761904761905, |
| "grad_norm": 0.41284072399139404, |
| "learning_rate": 9.568661971830986e-05, |
| "loss": 1.3774, |
| "step": 47950 |
| }, |
| { |
| "epoch": 8.16326530612245, |
| "grad_norm": 0.41598987579345703, |
| "learning_rate": 9.524647887323943e-05, |
| "loss": 1.3746, |
| "step": 48000 |
| }, |
| { |
| "epoch": 8.16326530612245, |
| "eval_loss": 1.544783115386963, |
| "eval_runtime": 75.374, |
| "eval_samples_per_second": 1244.979, |
| "eval_steps_per_second": 4.869, |
| "step": 48000 |
| }, |
| { |
| "epoch": 8.171768707482993, |
| "grad_norm": 0.7141363024711609, |
| "learning_rate": 9.480633802816901e-05, |
| "loss": 1.378, |
| "step": 48050 |
| }, |
| { |
| "epoch": 8.180272108843537, |
| "grad_norm": 0.400291383266449, |
| "learning_rate": 9.43661971830986e-05, |
| "loss": 1.3805, |
| "step": 48100 |
| }, |
| { |
| "epoch": 8.188775510204081, |
| "grad_norm": 0.40866729617118835, |
| "learning_rate": 9.392605633802818e-05, |
| "loss": 1.3765, |
| "step": 48150 |
| }, |
| { |
| "epoch": 8.197278911564625, |
| "grad_norm": 0.39690279960632324, |
| "learning_rate": 9.348591549295775e-05, |
| "loss": 1.3747, |
| "step": 48200 |
| }, |
| { |
| "epoch": 8.20578231292517, |
| "grad_norm": 0.41751721501350403, |
| "learning_rate": 9.304577464788733e-05, |
| "loss": 1.376, |
| "step": 48250 |
| }, |
| { |
| "epoch": 8.214285714285714, |
| "grad_norm": 0.4105045795440674, |
| "learning_rate": 9.260563380281691e-05, |
| "loss": 1.383, |
| "step": 48300 |
| }, |
| { |
| "epoch": 8.22278911564626, |
| "grad_norm": 0.40793538093566895, |
| "learning_rate": 9.216549295774648e-05, |
| "loss": 1.3716, |
| "step": 48350 |
| }, |
| { |
| "epoch": 8.231292517006803, |
| "grad_norm": 0.746793270111084, |
| "learning_rate": 9.172535211267606e-05, |
| "loss": 1.3736, |
| "step": 48400 |
| }, |
| { |
| "epoch": 8.239795918367347, |
| "grad_norm": 0.41186049580574036, |
| "learning_rate": 9.128521126760564e-05, |
| "loss": 1.3751, |
| "step": 48450 |
| }, |
| { |
| "epoch": 8.248299319727892, |
| "grad_norm": 0.4152040481567383, |
| "learning_rate": 9.084507042253522e-05, |
| "loss": 1.3716, |
| "step": 48500 |
| }, |
| { |
| "epoch": 8.256802721088436, |
| "grad_norm": 0.4200298488140106, |
| "learning_rate": 9.040492957746479e-05, |
| "loss": 1.3762, |
| "step": 48550 |
| }, |
| { |
| "epoch": 8.26530612244898, |
| "grad_norm": 0.40081730484962463, |
| "learning_rate": 8.996478873239437e-05, |
| "loss": 1.371, |
| "step": 48600 |
| }, |
| { |
| "epoch": 8.273809523809524, |
| "grad_norm": 0.4015451967716217, |
| "learning_rate": 8.952464788732396e-05, |
| "loss": 1.3717, |
| "step": 48650 |
| }, |
| { |
| "epoch": 8.282312925170068, |
| "grad_norm": 0.41303500533103943, |
| "learning_rate": 8.908450704225352e-05, |
| "loss": 1.3793, |
| "step": 48700 |
| }, |
| { |
| "epoch": 8.290816326530612, |
| "grad_norm": 0.41865503787994385, |
| "learning_rate": 8.86443661971831e-05, |
| "loss": 1.3733, |
| "step": 48750 |
| }, |
| { |
| "epoch": 8.299319727891156, |
| "grad_norm": 0.39655590057373047, |
| "learning_rate": 8.820422535211267e-05, |
| "loss": 1.371, |
| "step": 48800 |
| }, |
| { |
| "epoch": 8.3078231292517, |
| "grad_norm": 0.4100252389907837, |
| "learning_rate": 8.776408450704226e-05, |
| "loss": 1.3778, |
| "step": 48850 |
| }, |
| { |
| "epoch": 8.316326530612244, |
| "grad_norm": 0.4039982259273529, |
| "learning_rate": 8.732394366197182e-05, |
| "loss": 1.3737, |
| "step": 48900 |
| }, |
| { |
| "epoch": 8.32482993197279, |
| "grad_norm": 0.40856197476387024, |
| "learning_rate": 8.68838028169014e-05, |
| "loss": 1.3687, |
| "step": 48950 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.40824389457702637, |
| "learning_rate": 8.644366197183099e-05, |
| "loss": 1.3726, |
| "step": 49000 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "eval_loss": 1.5451842546463013, |
| "eval_runtime": 75.3658, |
| "eval_samples_per_second": 1245.114, |
| "eval_steps_per_second": 4.87, |
| "step": 49000 |
| }, |
| { |
| "epoch": 8.341836734693878, |
| "grad_norm": 0.40559035539627075, |
| "learning_rate": 8.600352112676056e-05, |
| "loss": 1.3714, |
| "step": 49050 |
| }, |
| { |
| "epoch": 8.350340136054422, |
| "grad_norm": 0.40669360756874084, |
| "learning_rate": 8.556338028169014e-05, |
| "loss": 1.3716, |
| "step": 49100 |
| }, |
| { |
| "epoch": 8.358843537414966, |
| "grad_norm": 0.40370744466781616, |
| "learning_rate": 8.513204225352113e-05, |
| "loss": 1.37, |
| "step": 49150 |
| }, |
| { |
| "epoch": 8.36734693877551, |
| "grad_norm": 0.5396425127983093, |
| "learning_rate": 8.469190140845071e-05, |
| "loss": 1.3769, |
| "step": 49200 |
| }, |
| { |
| "epoch": 8.375850340136054, |
| "grad_norm": 0.401902973651886, |
| "learning_rate": 8.425176056338028e-05, |
| "loss": 1.364, |
| "step": 49250 |
| }, |
| { |
| "epoch": 8.384353741496598, |
| "grad_norm": 0.7124184966087341, |
| "learning_rate": 8.381161971830986e-05, |
| "loss": 1.3658, |
| "step": 49300 |
| }, |
| { |
| "epoch": 8.392857142857142, |
| "grad_norm": 0.4373975396156311, |
| "learning_rate": 8.337147887323944e-05, |
| "loss": 1.3658, |
| "step": 49350 |
| }, |
| { |
| "epoch": 8.401360544217686, |
| "grad_norm": 0.40361514687538147, |
| "learning_rate": 8.293133802816902e-05, |
| "loss": 1.366, |
| "step": 49400 |
| }, |
| { |
| "epoch": 8.40986394557823, |
| "grad_norm": 0.4027824103832245, |
| "learning_rate": 8.249119718309859e-05, |
| "loss": 1.3726, |
| "step": 49450 |
| }, |
| { |
| "epoch": 8.418367346938776, |
| "grad_norm": 0.41340282559394836, |
| "learning_rate": 8.205105633802817e-05, |
| "loss": 1.3691, |
| "step": 49500 |
| }, |
| { |
| "epoch": 8.42687074829932, |
| "grad_norm": 0.41044774651527405, |
| "learning_rate": 8.161091549295776e-05, |
| "loss": 1.3715, |
| "step": 49550 |
| }, |
| { |
| "epoch": 8.435374149659864, |
| "grad_norm": 0.7769300937652588, |
| "learning_rate": 8.117077464788732e-05, |
| "loss": 1.3604, |
| "step": 49600 |
| }, |
| { |
| "epoch": 8.443877551020408, |
| "grad_norm": 0.3973638713359833, |
| "learning_rate": 8.07306338028169e-05, |
| "loss": 1.3692, |
| "step": 49650 |
| }, |
| { |
| "epoch": 8.452380952380953, |
| "grad_norm": 0.3998255133628845, |
| "learning_rate": 8.029049295774649e-05, |
| "loss": 1.3758, |
| "step": 49700 |
| }, |
| { |
| "epoch": 8.460884353741497, |
| "grad_norm": 0.4052984416484833, |
| "learning_rate": 7.985035211267607e-05, |
| "loss": 1.3695, |
| "step": 49750 |
| }, |
| { |
| "epoch": 8.46938775510204, |
| "grad_norm": 0.40513521432876587, |
| "learning_rate": 7.941021126760564e-05, |
| "loss": 1.365, |
| "step": 49800 |
| }, |
| { |
| "epoch": 8.477891156462585, |
| "grad_norm": 0.41850411891937256, |
| "learning_rate": 7.89700704225352e-05, |
| "loss": 1.371, |
| "step": 49850 |
| }, |
| { |
| "epoch": 8.486394557823129, |
| "grad_norm": 0.40076538920402527, |
| "learning_rate": 7.852992957746479e-05, |
| "loss": 1.3583, |
| "step": 49900 |
| }, |
| { |
| "epoch": 8.494897959183673, |
| "grad_norm": 0.40688443183898926, |
| "learning_rate": 7.808978873239436e-05, |
| "loss": 1.368, |
| "step": 49950 |
| }, |
| { |
| "epoch": 8.503401360544217, |
| "grad_norm": 0.399404913187027, |
| "learning_rate": 7.764964788732394e-05, |
| "loss": 1.3683, |
| "step": 50000 |
| }, |
| { |
| "epoch": 8.503401360544217, |
| "eval_loss": 1.5378692150115967, |
| "eval_runtime": 75.368, |
| "eval_samples_per_second": 1245.078, |
| "eval_steps_per_second": 4.869, |
| "step": 50000 |
| }, |
| { |
| "epoch": 8.511904761904763, |
| "grad_norm": 0.40362656116485596, |
| "learning_rate": 7.720950704225352e-05, |
| "loss": 1.3642, |
| "step": 50050 |
| }, |
| { |
| "epoch": 8.520408163265307, |
| "grad_norm": 0.41159501671791077, |
| "learning_rate": 7.67693661971831e-05, |
| "loss": 1.3559, |
| "step": 50100 |
| }, |
| { |
| "epoch": 8.52891156462585, |
| "grad_norm": 0.4064503014087677, |
| "learning_rate": 7.632922535211267e-05, |
| "loss": 1.3689, |
| "step": 50150 |
| }, |
| { |
| "epoch": 8.537414965986395, |
| "grad_norm": 0.3967163562774658, |
| "learning_rate": 7.588908450704225e-05, |
| "loss": 1.3645, |
| "step": 50200 |
| }, |
| { |
| "epoch": 8.545918367346939, |
| "grad_norm": 0.40805837512016296, |
| "learning_rate": 7.544894366197183e-05, |
| "loss": 1.3617, |
| "step": 50250 |
| }, |
| { |
| "epoch": 8.554421768707483, |
| "grad_norm": 0.39720582962036133, |
| "learning_rate": 7.500880281690142e-05, |
| "loss": 1.3606, |
| "step": 50300 |
| }, |
| { |
| "epoch": 8.562925170068027, |
| "grad_norm": 0.3969172537326813, |
| "learning_rate": 7.456866197183098e-05, |
| "loss": 1.3661, |
| "step": 50350 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "grad_norm": 0.40754255652427673, |
| "learning_rate": 7.412852112676057e-05, |
| "loss": 1.3674, |
| "step": 50400 |
| }, |
| { |
| "epoch": 8.579931972789115, |
| "grad_norm": 0.40162739157676697, |
| "learning_rate": 7.368838028169015e-05, |
| "loss": 1.3654, |
| "step": 50450 |
| }, |
| { |
| "epoch": 8.58843537414966, |
| "grad_norm": 0.4004862606525421, |
| "learning_rate": 7.324823943661972e-05, |
| "loss": 1.3567, |
| "step": 50500 |
| }, |
| { |
| "epoch": 8.596938775510203, |
| "grad_norm": 0.4237268269062042, |
| "learning_rate": 7.28080985915493e-05, |
| "loss": 1.3617, |
| "step": 50550 |
| }, |
| { |
| "epoch": 8.60544217687075, |
| "grad_norm": 0.4031754434108734, |
| "learning_rate": 7.236795774647888e-05, |
| "loss": 1.3597, |
| "step": 50600 |
| }, |
| { |
| "epoch": 8.613945578231293, |
| "grad_norm": 0.39399534463882446, |
| "learning_rate": 7.192781690140846e-05, |
| "loss": 1.3633, |
| "step": 50650 |
| }, |
| { |
| "epoch": 8.622448979591837, |
| "grad_norm": 0.4071154296398163, |
| "learning_rate": 7.148767605633803e-05, |
| "loss": 1.3634, |
| "step": 50700 |
| }, |
| { |
| "epoch": 8.630952380952381, |
| "grad_norm": 0.40059253573417664, |
| "learning_rate": 7.104753521126761e-05, |
| "loss": 1.3674, |
| "step": 50750 |
| }, |
| { |
| "epoch": 8.639455782312925, |
| "grad_norm": 0.4087289571762085, |
| "learning_rate": 7.060739436619719e-05, |
| "loss": 1.3646, |
| "step": 50800 |
| }, |
| { |
| "epoch": 8.64795918367347, |
| "grad_norm": 0.42607581615448, |
| "learning_rate": 7.016725352112676e-05, |
| "loss": 1.3645, |
| "step": 50850 |
| }, |
| { |
| "epoch": 8.656462585034014, |
| "grad_norm": 0.40754997730255127, |
| "learning_rate": 6.972711267605634e-05, |
| "loss": 1.3582, |
| "step": 50900 |
| }, |
| { |
| "epoch": 8.664965986394558, |
| "grad_norm": 0.4103423058986664, |
| "learning_rate": 6.928697183098592e-05, |
| "loss": 1.3635, |
| "step": 50950 |
| }, |
| { |
| "epoch": 8.673469387755102, |
| "grad_norm": 0.4093490540981293, |
| "learning_rate": 6.88468309859155e-05, |
| "loss": 1.3593, |
| "step": 51000 |
| }, |
| { |
| "epoch": 8.673469387755102, |
| "eval_loss": 1.5301626920700073, |
| "eval_runtime": 75.3714, |
| "eval_samples_per_second": 1245.021, |
| "eval_steps_per_second": 4.869, |
| "step": 51000 |
| }, |
| { |
| "epoch": 8.681972789115646, |
| "grad_norm": 0.403463751077652, |
| "learning_rate": 6.840669014084506e-05, |
| "loss": 1.3596, |
| "step": 51050 |
| }, |
| { |
| "epoch": 8.69047619047619, |
| "grad_norm": 0.406222403049469, |
| "learning_rate": 6.796654929577464e-05, |
| "loss": 1.3592, |
| "step": 51100 |
| }, |
| { |
| "epoch": 8.698979591836736, |
| "grad_norm": 0.41207653284072876, |
| "learning_rate": 6.752640845070423e-05, |
| "loss": 1.3574, |
| "step": 51150 |
| }, |
| { |
| "epoch": 8.70748299319728, |
| "grad_norm": 0.4042452573776245, |
| "learning_rate": 6.70862676056338e-05, |
| "loss": 1.3714, |
| "step": 51200 |
| }, |
| { |
| "epoch": 8.715986394557824, |
| "grad_norm": 0.3971654176712036, |
| "learning_rate": 6.664612676056338e-05, |
| "loss": 1.3557, |
| "step": 51250 |
| }, |
| { |
| "epoch": 8.724489795918368, |
| "grad_norm": 0.40285438299179077, |
| "learning_rate": 6.620598591549296e-05, |
| "loss": 1.3587, |
| "step": 51300 |
| }, |
| { |
| "epoch": 8.732993197278912, |
| "grad_norm": 0.4234530031681061, |
| "learning_rate": 6.576584507042254e-05, |
| "loss": 1.3682, |
| "step": 51350 |
| }, |
| { |
| "epoch": 8.741496598639456, |
| "grad_norm": 0.3918672502040863, |
| "learning_rate": 6.532570422535211e-05, |
| "loss": 1.3545, |
| "step": 51400 |
| }, |
| { |
| "epoch": 8.75, |
| "grad_norm": 0.4000791013240814, |
| "learning_rate": 6.488556338028169e-05, |
| "loss": 1.361, |
| "step": 51450 |
| }, |
| { |
| "epoch": 8.758503401360544, |
| "grad_norm": 0.41339510679244995, |
| "learning_rate": 6.444542253521127e-05, |
| "loss": 1.3595, |
| "step": 51500 |
| }, |
| { |
| "epoch": 8.767006802721088, |
| "grad_norm": 0.4353317618370056, |
| "learning_rate": 6.400528169014084e-05, |
| "loss": 1.3601, |
| "step": 51550 |
| }, |
| { |
| "epoch": 8.775510204081632, |
| "grad_norm": 0.39730769395828247, |
| "learning_rate": 6.356514084507042e-05, |
| "loss": 1.3547, |
| "step": 51600 |
| }, |
| { |
| "epoch": 8.784013605442176, |
| "grad_norm": 0.41856640577316284, |
| "learning_rate": 6.3125e-05, |
| "loss": 1.354, |
| "step": 51650 |
| }, |
| { |
| "epoch": 8.79251700680272, |
| "grad_norm": 0.40725892782211304, |
| "learning_rate": 6.268485915492958e-05, |
| "loss": 1.3609, |
| "step": 51700 |
| }, |
| { |
| "epoch": 8.801020408163264, |
| "grad_norm": 0.4020816385746002, |
| "learning_rate": 6.224471830985915e-05, |
| "loss": 1.3507, |
| "step": 51750 |
| }, |
| { |
| "epoch": 8.80952380952381, |
| "grad_norm": 0.4009547233581543, |
| "learning_rate": 6.180457746478873e-05, |
| "loss": 1.3588, |
| "step": 51800 |
| }, |
| { |
| "epoch": 8.818027210884354, |
| "grad_norm": 0.42511284351348877, |
| "learning_rate": 6.136443661971832e-05, |
| "loss": 1.3567, |
| "step": 51850 |
| }, |
| { |
| "epoch": 8.826530612244898, |
| "grad_norm": 0.4135643541812897, |
| "learning_rate": 6.0924295774647885e-05, |
| "loss": 1.3536, |
| "step": 51900 |
| }, |
| { |
| "epoch": 8.835034013605442, |
| "grad_norm": 0.39955195784568787, |
| "learning_rate": 6.048415492957746e-05, |
| "loss": 1.3564, |
| "step": 51950 |
| }, |
| { |
| "epoch": 8.843537414965986, |
| "grad_norm": 0.4043436348438263, |
| "learning_rate": 6.004401408450704e-05, |
| "loss": 1.3527, |
| "step": 52000 |
| }, |
| { |
| "epoch": 8.843537414965986, |
| "eval_loss": 1.521606206893921, |
| "eval_runtime": 75.3981, |
| "eval_samples_per_second": 1244.58, |
| "eval_steps_per_second": 4.867, |
| "step": 52000 |
| }, |
| { |
| "epoch": 8.85204081632653, |
| "grad_norm": 0.40607118606567383, |
| "learning_rate": 5.960387323943662e-05, |
| "loss": 1.3567, |
| "step": 52050 |
| }, |
| { |
| "epoch": 8.860544217687075, |
| "grad_norm": 0.4122581481933594, |
| "learning_rate": 5.91637323943662e-05, |
| "loss": 1.355, |
| "step": 52100 |
| }, |
| { |
| "epoch": 8.869047619047619, |
| "grad_norm": 0.4980255961418152, |
| "learning_rate": 5.8723591549295774e-05, |
| "loss": 1.3523, |
| "step": 52150 |
| }, |
| { |
| "epoch": 8.877551020408163, |
| "grad_norm": 0.4053143262863159, |
| "learning_rate": 5.8283450704225356e-05, |
| "loss": 1.3556, |
| "step": 52200 |
| }, |
| { |
| "epoch": 8.886054421768707, |
| "grad_norm": 0.40937647223472595, |
| "learning_rate": 5.785211267605634e-05, |
| "loss": 1.3597, |
| "step": 52250 |
| }, |
| { |
| "epoch": 8.89455782312925, |
| "grad_norm": 0.4028383195400238, |
| "learning_rate": 5.741197183098592e-05, |
| "loss": 1.3548, |
| "step": 52300 |
| }, |
| { |
| "epoch": 8.903061224489797, |
| "grad_norm": 0.40696024894714355, |
| "learning_rate": 5.6971830985915496e-05, |
| "loss": 1.3553, |
| "step": 52350 |
| }, |
| { |
| "epoch": 8.91156462585034, |
| "grad_norm": 0.4004703164100647, |
| "learning_rate": 5.653169014084507e-05, |
| "loss": 1.3515, |
| "step": 52400 |
| }, |
| { |
| "epoch": 8.920068027210885, |
| "grad_norm": 0.4049835503101349, |
| "learning_rate": 5.6091549295774646e-05, |
| "loss": 1.3529, |
| "step": 52450 |
| }, |
| { |
| "epoch": 8.928571428571429, |
| "grad_norm": 0.4118352234363556, |
| "learning_rate": 5.565140845070423e-05, |
| "loss": 1.3606, |
| "step": 52500 |
| }, |
| { |
| "epoch": 8.937074829931973, |
| "grad_norm": 0.40466126799583435, |
| "learning_rate": 5.52112676056338e-05, |
| "loss": 1.3509, |
| "step": 52550 |
| }, |
| { |
| "epoch": 8.945578231292517, |
| "grad_norm": 0.39501574635505676, |
| "learning_rate": 5.477112676056338e-05, |
| "loss": 1.3515, |
| "step": 52600 |
| }, |
| { |
| "epoch": 8.954081632653061, |
| "grad_norm": 0.4046393036842346, |
| "learning_rate": 5.433098591549296e-05, |
| "loss": 1.3584, |
| "step": 52650 |
| }, |
| { |
| "epoch": 8.962585034013605, |
| "grad_norm": 0.4039493203163147, |
| "learning_rate": 5.3890845070422535e-05, |
| "loss": 1.3513, |
| "step": 52700 |
| }, |
| { |
| "epoch": 8.97108843537415, |
| "grad_norm": 0.4195407032966614, |
| "learning_rate": 5.345070422535212e-05, |
| "loss": 1.3536, |
| "step": 52750 |
| }, |
| { |
| "epoch": 8.979591836734693, |
| "grad_norm": 0.39963847398757935, |
| "learning_rate": 5.301056338028169e-05, |
| "loss": 1.3511, |
| "step": 52800 |
| }, |
| { |
| "epoch": 8.988095238095237, |
| "grad_norm": 0.41947850584983826, |
| "learning_rate": 5.2570422535211274e-05, |
| "loss": 1.3558, |
| "step": 52850 |
| }, |
| { |
| "epoch": 8.996598639455783, |
| "grad_norm": 0.4046323597431183, |
| "learning_rate": 5.213028169014085e-05, |
| "loss": 1.3511, |
| "step": 52900 |
| }, |
| { |
| "epoch": 9.005102040816327, |
| "grad_norm": 0.4054872393608093, |
| "learning_rate": 5.1690140845070424e-05, |
| "loss": 1.3428, |
| "step": 52950 |
| }, |
| { |
| "epoch": 9.013605442176871, |
| "grad_norm": 0.4009184241294861, |
| "learning_rate": 5.125e-05, |
| "loss": 1.3434, |
| "step": 53000 |
| }, |
| { |
| "epoch": 9.013605442176871, |
| "eval_loss": 1.5165001153945923, |
| "eval_runtime": 75.3865, |
| "eval_samples_per_second": 1244.772, |
| "eval_steps_per_second": 4.868, |
| "step": 53000 |
| }, |
| { |
| "epoch": 9.022108843537415, |
| "grad_norm": 0.4195316433906555, |
| "learning_rate": 5.0809859154929574e-05, |
| "loss": 1.358, |
| "step": 53050 |
| }, |
| { |
| "epoch": 9.03061224489796, |
| "grad_norm": 0.40940186381340027, |
| "learning_rate": 5.0369718309859156e-05, |
| "loss": 1.3453, |
| "step": 53100 |
| }, |
| { |
| "epoch": 9.039115646258503, |
| "grad_norm": 0.4012869596481323, |
| "learning_rate": 4.992957746478873e-05, |
| "loss": 1.3531, |
| "step": 53150 |
| }, |
| { |
| "epoch": 9.047619047619047, |
| "grad_norm": 0.4076648950576782, |
| "learning_rate": 4.948943661971831e-05, |
| "loss": 1.3433, |
| "step": 53200 |
| }, |
| { |
| "epoch": 9.056122448979592, |
| "grad_norm": 0.40203115344047546, |
| "learning_rate": 4.904929577464789e-05, |
| "loss": 1.3491, |
| "step": 53250 |
| }, |
| { |
| "epoch": 9.064625850340136, |
| "grad_norm": 0.40481826663017273, |
| "learning_rate": 4.860915492957747e-05, |
| "loss": 1.3509, |
| "step": 53300 |
| }, |
| { |
| "epoch": 9.07312925170068, |
| "grad_norm": 0.4018038213253021, |
| "learning_rate": 4.8169014084507045e-05, |
| "loss": 1.3485, |
| "step": 53350 |
| }, |
| { |
| "epoch": 9.081632653061224, |
| "grad_norm": 0.4100002646446228, |
| "learning_rate": 4.772887323943662e-05, |
| "loss": 1.3428, |
| "step": 53400 |
| }, |
| { |
| "epoch": 9.09013605442177, |
| "grad_norm": 1.7801686525344849, |
| "learning_rate": 4.72887323943662e-05, |
| "loss": 1.3445, |
| "step": 53450 |
| }, |
| { |
| "epoch": 9.098639455782314, |
| "grad_norm": 0.39769893884658813, |
| "learning_rate": 4.684859154929578e-05, |
| "loss": 1.3467, |
| "step": 53500 |
| }, |
| { |
| "epoch": 9.107142857142858, |
| "grad_norm": 0.4195043444633484, |
| "learning_rate": 4.640845070422536e-05, |
| "loss": 1.348, |
| "step": 53550 |
| }, |
| { |
| "epoch": 9.115646258503402, |
| "grad_norm": 0.4053761959075928, |
| "learning_rate": 4.596830985915493e-05, |
| "loss": 1.3465, |
| "step": 53600 |
| }, |
| { |
| "epoch": 9.124149659863946, |
| "grad_norm": 0.4017139673233032, |
| "learning_rate": 4.552816901408451e-05, |
| "loss": 1.3448, |
| "step": 53650 |
| }, |
| { |
| "epoch": 9.13265306122449, |
| "grad_norm": 0.4139016270637512, |
| "learning_rate": 4.5088028169014084e-05, |
| "loss": 1.349, |
| "step": 53700 |
| }, |
| { |
| "epoch": 9.141156462585034, |
| "grad_norm": 0.39910098910331726, |
| "learning_rate": 4.464788732394366e-05, |
| "loss": 1.3379, |
| "step": 53750 |
| }, |
| { |
| "epoch": 9.149659863945578, |
| "grad_norm": 0.39038217067718506, |
| "learning_rate": 4.420774647887324e-05, |
| "loss": 1.3422, |
| "step": 53800 |
| }, |
| { |
| "epoch": 9.158163265306122, |
| "grad_norm": 0.4174290895462036, |
| "learning_rate": 4.3767605633802816e-05, |
| "loss": 1.3417, |
| "step": 53850 |
| }, |
| { |
| "epoch": 9.166666666666666, |
| "grad_norm": 0.399420827627182, |
| "learning_rate": 4.33274647887324e-05, |
| "loss": 1.3414, |
| "step": 53900 |
| }, |
| { |
| "epoch": 9.17517006802721, |
| "grad_norm": 0.3991691470146179, |
| "learning_rate": 4.288732394366197e-05, |
| "loss": 1.3457, |
| "step": 53950 |
| }, |
| { |
| "epoch": 9.183673469387756, |
| "grad_norm": 0.4257224500179291, |
| "learning_rate": 4.2447183098591555e-05, |
| "loss": 1.3389, |
| "step": 54000 |
| }, |
| { |
| "epoch": 9.183673469387756, |
| "eval_loss": 1.514726996421814, |
| "eval_runtime": 75.3819, |
| "eval_samples_per_second": 1244.848, |
| "eval_steps_per_second": 4.869, |
| "step": 54000 |
| }, |
| { |
| "epoch": 9.1921768707483, |
| "grad_norm": 0.40039774775505066, |
| "learning_rate": 4.200704225352113e-05, |
| "loss": 1.3415, |
| "step": 54050 |
| }, |
| { |
| "epoch": 9.200680272108844, |
| "grad_norm": 0.3933996558189392, |
| "learning_rate": 4.1566901408450705e-05, |
| "loss": 1.3426, |
| "step": 54100 |
| }, |
| { |
| "epoch": 9.209183673469388, |
| "grad_norm": 0.4128095805644989, |
| "learning_rate": 4.112676056338028e-05, |
| "loss": 1.3471, |
| "step": 54150 |
| }, |
| { |
| "epoch": 9.217687074829932, |
| "grad_norm": 0.4024725556373596, |
| "learning_rate": 4.0686619718309855e-05, |
| "loss": 1.3454, |
| "step": 54200 |
| }, |
| { |
| "epoch": 9.226190476190476, |
| "grad_norm": 0.4105578064918518, |
| "learning_rate": 4.024647887323944e-05, |
| "loss": 1.3399, |
| "step": 54250 |
| }, |
| { |
| "epoch": 9.23469387755102, |
| "grad_norm": 0.4047715663909912, |
| "learning_rate": 3.980633802816901e-05, |
| "loss": 1.3442, |
| "step": 54300 |
| }, |
| { |
| "epoch": 9.243197278911564, |
| "grad_norm": 0.3958626687526703, |
| "learning_rate": 3.9366197183098594e-05, |
| "loss": 1.3441, |
| "step": 54350 |
| }, |
| { |
| "epoch": 9.251700680272108, |
| "grad_norm": 0.40880346298217773, |
| "learning_rate": 3.892605633802817e-05, |
| "loss": 1.3416, |
| "step": 54400 |
| }, |
| { |
| "epoch": 9.260204081632653, |
| "grad_norm": 0.41129016876220703, |
| "learning_rate": 3.848591549295775e-05, |
| "loss": 1.3428, |
| "step": 54450 |
| }, |
| { |
| "epoch": 9.268707482993197, |
| "grad_norm": 0.39472246170043945, |
| "learning_rate": 3.8045774647887326e-05, |
| "loss": 1.3396, |
| "step": 54500 |
| }, |
| { |
| "epoch": 9.27721088435374, |
| "grad_norm": 0.40167415142059326, |
| "learning_rate": 3.76056338028169e-05, |
| "loss": 1.3425, |
| "step": 54550 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "grad_norm": 0.40154701471328735, |
| "learning_rate": 3.716549295774648e-05, |
| "loss": 1.3423, |
| "step": 54600 |
| }, |
| { |
| "epoch": 9.29421768707483, |
| "grad_norm": 0.40191853046417236, |
| "learning_rate": 3.672535211267606e-05, |
| "loss": 1.3397, |
| "step": 54650 |
| }, |
| { |
| "epoch": 9.302721088435375, |
| "grad_norm": 0.4014011025428772, |
| "learning_rate": 3.628521126760564e-05, |
| "loss": 1.3431, |
| "step": 54700 |
| }, |
| { |
| "epoch": 9.311224489795919, |
| "grad_norm": 0.4313969910144806, |
| "learning_rate": 3.584507042253521e-05, |
| "loss": 1.3413, |
| "step": 54750 |
| }, |
| { |
| "epoch": 9.319727891156463, |
| "grad_norm": 0.4143257141113281, |
| "learning_rate": 3.540492957746479e-05, |
| "loss": 1.3424, |
| "step": 54800 |
| }, |
| { |
| "epoch": 9.328231292517007, |
| "grad_norm": 0.40343379974365234, |
| "learning_rate": 3.4964788732394365e-05, |
| "loss": 1.3377, |
| "step": 54850 |
| }, |
| { |
| "epoch": 9.33673469387755, |
| "grad_norm": 0.4093693792819977, |
| "learning_rate": 3.452464788732394e-05, |
| "loss": 1.3426, |
| "step": 54900 |
| }, |
| { |
| "epoch": 9.345238095238095, |
| "grad_norm": 0.40565499663352966, |
| "learning_rate": 3.408450704225352e-05, |
| "loss": 1.3347, |
| "step": 54950 |
| }, |
| { |
| "epoch": 9.353741496598639, |
| "grad_norm": 0.40517303347587585, |
| "learning_rate": 3.3644366197183097e-05, |
| "loss": 1.3411, |
| "step": 55000 |
| }, |
| { |
| "epoch": 9.353741496598639, |
| "eval_loss": 1.5079046487808228, |
| "eval_runtime": 75.3606, |
| "eval_samples_per_second": 1245.199, |
| "eval_steps_per_second": 4.87, |
| "step": 55000 |
| }, |
| { |
| "epoch": 9.362244897959183, |
| "grad_norm": 0.4189765155315399, |
| "learning_rate": 3.320422535211268e-05, |
| "loss": 1.3391, |
| "step": 55050 |
| }, |
| { |
| "epoch": 9.370748299319727, |
| "grad_norm": 0.41205212473869324, |
| "learning_rate": 3.2764084507042253e-05, |
| "loss": 1.3335, |
| "step": 55100 |
| }, |
| { |
| "epoch": 9.379251700680273, |
| "grad_norm": 0.4138495624065399, |
| "learning_rate": 3.2323943661971835e-05, |
| "loss": 1.3385, |
| "step": 55150 |
| }, |
| { |
| "epoch": 9.387755102040817, |
| "grad_norm": 0.40465956926345825, |
| "learning_rate": 3.188380281690141e-05, |
| "loss": 1.3366, |
| "step": 55200 |
| }, |
| { |
| "epoch": 9.396258503401361, |
| "grad_norm": 0.41961055994033813, |
| "learning_rate": 3.144366197183099e-05, |
| "loss": 1.3386, |
| "step": 55250 |
| }, |
| { |
| "epoch": 9.404761904761905, |
| "grad_norm": 0.4017610251903534, |
| "learning_rate": 3.100352112676056e-05, |
| "loss": 1.3389, |
| "step": 55300 |
| }, |
| { |
| "epoch": 9.41326530612245, |
| "grad_norm": 0.3947182893753052, |
| "learning_rate": 3.056338028169014e-05, |
| "loss": 1.3366, |
| "step": 55350 |
| }, |
| { |
| "epoch": 9.421768707482993, |
| "grad_norm": 0.41199058294296265, |
| "learning_rate": 3.012323943661972e-05, |
| "loss": 1.3319, |
| "step": 55400 |
| }, |
| { |
| "epoch": 9.430272108843537, |
| "grad_norm": 0.44556739926338196, |
| "learning_rate": 2.96830985915493e-05, |
| "loss": 1.3386, |
| "step": 55450 |
| }, |
| { |
| "epoch": 9.438775510204081, |
| "grad_norm": 0.4042491316795349, |
| "learning_rate": 2.924295774647887e-05, |
| "loss": 1.3352, |
| "step": 55500 |
| }, |
| { |
| "epoch": 9.447278911564625, |
| "grad_norm": 0.4033842384815216, |
| "learning_rate": 2.880281690140845e-05, |
| "loss": 1.3333, |
| "step": 55550 |
| }, |
| { |
| "epoch": 9.45578231292517, |
| "grad_norm": 0.409463495016098, |
| "learning_rate": 2.8362676056338028e-05, |
| "loss": 1.3359, |
| "step": 55600 |
| }, |
| { |
| "epoch": 9.464285714285714, |
| "grad_norm": 0.4049001634120941, |
| "learning_rate": 2.7922535211267606e-05, |
| "loss": 1.3362, |
| "step": 55650 |
| }, |
| { |
| "epoch": 9.47278911564626, |
| "grad_norm": 0.4021693766117096, |
| "learning_rate": 2.7482394366197185e-05, |
| "loss": 1.3334, |
| "step": 55700 |
| }, |
| { |
| "epoch": 9.481292517006803, |
| "grad_norm": 0.3943755030632019, |
| "learning_rate": 2.7042253521126763e-05, |
| "loss": 1.3356, |
| "step": 55750 |
| }, |
| { |
| "epoch": 9.489795918367347, |
| "grad_norm": 0.4090036153793335, |
| "learning_rate": 2.660211267605634e-05, |
| "loss": 1.3359, |
| "step": 55800 |
| }, |
| { |
| "epoch": 9.498299319727892, |
| "grad_norm": 0.3975585103034973, |
| "learning_rate": 2.6161971830985917e-05, |
| "loss": 1.3356, |
| "step": 55850 |
| }, |
| { |
| "epoch": 9.506802721088436, |
| "grad_norm": 0.3984331488609314, |
| "learning_rate": 2.5721830985915492e-05, |
| "loss": 1.3334, |
| "step": 55900 |
| }, |
| { |
| "epoch": 9.51530612244898, |
| "grad_norm": 0.3891252279281616, |
| "learning_rate": 2.528169014084507e-05, |
| "loss": 1.3297, |
| "step": 55950 |
| }, |
| { |
| "epoch": 9.523809523809524, |
| "grad_norm": 0.4065133333206177, |
| "learning_rate": 2.484154929577465e-05, |
| "loss": 1.3357, |
| "step": 56000 |
| }, |
| { |
| "epoch": 9.523809523809524, |
| "eval_loss": 1.4985688924789429, |
| "eval_runtime": 75.3846, |
| "eval_samples_per_second": 1244.803, |
| "eval_steps_per_second": 4.868, |
| "step": 56000 |
| }, |
| { |
| "epoch": 9.532312925170068, |
| "grad_norm": 0.403832346200943, |
| "learning_rate": 2.4401408450704227e-05, |
| "loss": 1.3389, |
| "step": 56050 |
| }, |
| { |
| "epoch": 9.540816326530612, |
| "grad_norm": 0.39873868227005005, |
| "learning_rate": 2.3961267605633802e-05, |
| "loss": 1.3344, |
| "step": 56100 |
| }, |
| { |
| "epoch": 9.549319727891156, |
| "grad_norm": 0.40277689695358276, |
| "learning_rate": 2.352112676056338e-05, |
| "loss": 1.3335, |
| "step": 56150 |
| }, |
| { |
| "epoch": 9.5578231292517, |
| "grad_norm": 0.39874860644340515, |
| "learning_rate": 2.308098591549296e-05, |
| "loss": 1.3377, |
| "step": 56200 |
| }, |
| { |
| "epoch": 9.566326530612244, |
| "grad_norm": 0.4036062955856323, |
| "learning_rate": 2.2640845070422538e-05, |
| "loss": 1.335, |
| "step": 56250 |
| }, |
| { |
| "epoch": 9.57482993197279, |
| "grad_norm": 0.41648924350738525, |
| "learning_rate": 2.2200704225352113e-05, |
| "loss": 1.3341, |
| "step": 56300 |
| }, |
| { |
| "epoch": 9.583333333333334, |
| "grad_norm": 0.39397549629211426, |
| "learning_rate": 2.176056338028169e-05, |
| "loss": 1.3366, |
| "step": 56350 |
| }, |
| { |
| "epoch": 9.591836734693878, |
| "grad_norm": 0.39437320828437805, |
| "learning_rate": 2.1320422535211266e-05, |
| "loss": 1.335, |
| "step": 56400 |
| }, |
| { |
| "epoch": 9.600340136054422, |
| "grad_norm": 0.39650458097457886, |
| "learning_rate": 2.0880281690140845e-05, |
| "loss": 1.3385, |
| "step": 56450 |
| }, |
| { |
| "epoch": 9.608843537414966, |
| "grad_norm": 0.4140496850013733, |
| "learning_rate": 2.0440140845070423e-05, |
| "loss": 1.3284, |
| "step": 56500 |
| }, |
| { |
| "epoch": 9.61734693877551, |
| "grad_norm": 0.39376911520957947, |
| "learning_rate": 2e-05, |
| "loss": 1.3307, |
| "step": 56550 |
| }, |
| { |
| "epoch": 9.625850340136054, |
| "grad_norm": 0.4171138405799866, |
| "learning_rate": 1.955985915492958e-05, |
| "loss": 1.333, |
| "step": 56600 |
| }, |
| { |
| "epoch": 9.634353741496598, |
| "grad_norm": 0.41457417607307434, |
| "learning_rate": 1.911971830985916e-05, |
| "loss": 1.3366, |
| "step": 56650 |
| }, |
| { |
| "epoch": 9.642857142857142, |
| "grad_norm": 0.3986314535140991, |
| "learning_rate": 1.867957746478873e-05, |
| "loss": 1.3382, |
| "step": 56700 |
| }, |
| { |
| "epoch": 9.651360544217686, |
| "grad_norm": 0.4179774820804596, |
| "learning_rate": 1.823943661971831e-05, |
| "loss": 1.3332, |
| "step": 56750 |
| }, |
| { |
| "epoch": 9.65986394557823, |
| "grad_norm": 0.4044124186038971, |
| "learning_rate": 1.7799295774647887e-05, |
| "loss": 1.3273, |
| "step": 56800 |
| }, |
| { |
| "epoch": 9.668367346938776, |
| "grad_norm": 0.4135982096195221, |
| "learning_rate": 1.7359154929577466e-05, |
| "loss": 1.3311, |
| "step": 56850 |
| }, |
| { |
| "epoch": 9.67687074829932, |
| "grad_norm": 0.39998510479927063, |
| "learning_rate": 1.6919014084507044e-05, |
| "loss": 1.3366, |
| "step": 56900 |
| }, |
| { |
| "epoch": 9.685374149659864, |
| "grad_norm": 0.40354451537132263, |
| "learning_rate": 1.6478873239436623e-05, |
| "loss": 1.3326, |
| "step": 56950 |
| }, |
| { |
| "epoch": 9.693877551020408, |
| "grad_norm": 0.4005592167377472, |
| "learning_rate": 1.6038732394366198e-05, |
| "loss": 1.3319, |
| "step": 57000 |
| }, |
| { |
| "epoch": 9.693877551020408, |
| "eval_loss": 1.5001171827316284, |
| "eval_runtime": 75.6646, |
| "eval_samples_per_second": 1240.196, |
| "eval_steps_per_second": 4.85, |
| "step": 57000 |
| }, |
| { |
| "epoch": 9.702380952380953, |
| "grad_norm": 0.3942902386188507, |
| "learning_rate": 1.5598591549295773e-05, |
| "loss": 1.3331, |
| "step": 57050 |
| }, |
| { |
| "epoch": 9.710884353741497, |
| "grad_norm": 0.4057478606700897, |
| "learning_rate": 1.5158450704225353e-05, |
| "loss": 1.3295, |
| "step": 57100 |
| }, |
| { |
| "epoch": 9.71938775510204, |
| "grad_norm": 0.40495890378952026, |
| "learning_rate": 1.471830985915493e-05, |
| "loss": 1.3419, |
| "step": 57150 |
| }, |
| { |
| "epoch": 9.727891156462585, |
| "grad_norm": 0.3950183093547821, |
| "learning_rate": 1.4278169014084506e-05, |
| "loss": 1.3316, |
| "step": 57200 |
| }, |
| { |
| "epoch": 9.736394557823129, |
| "grad_norm": 0.4021071195602417, |
| "learning_rate": 1.3838028169014085e-05, |
| "loss": 1.3297, |
| "step": 57250 |
| }, |
| { |
| "epoch": 9.744897959183673, |
| "grad_norm": 0.40653911232948303, |
| "learning_rate": 1.3397887323943663e-05, |
| "loss": 1.3329, |
| "step": 57300 |
| }, |
| { |
| "epoch": 9.753401360544217, |
| "grad_norm": 0.40104779601097107, |
| "learning_rate": 1.2957746478873238e-05, |
| "loss": 1.3234, |
| "step": 57350 |
| }, |
| { |
| "epoch": 9.761904761904763, |
| "grad_norm": 0.408331036567688, |
| "learning_rate": 1.2517605633802817e-05, |
| "loss": 1.3301, |
| "step": 57400 |
| }, |
| { |
| "epoch": 9.770408163265307, |
| "grad_norm": 0.4001865088939667, |
| "learning_rate": 1.2077464788732395e-05, |
| "loss": 1.3282, |
| "step": 57450 |
| }, |
| { |
| "epoch": 9.77891156462585, |
| "grad_norm": 0.4088568389415741, |
| "learning_rate": 1.1637323943661972e-05, |
| "loss": 1.3292, |
| "step": 57500 |
| }, |
| { |
| "epoch": 9.787414965986395, |
| "grad_norm": 0.42663103342056274, |
| "learning_rate": 1.1197183098591549e-05, |
| "loss": 1.3327, |
| "step": 57550 |
| }, |
| { |
| "epoch": 9.795918367346939, |
| "grad_norm": 0.3960348963737488, |
| "learning_rate": 1.0757042253521127e-05, |
| "loss": 1.3326, |
| "step": 57600 |
| }, |
| { |
| "epoch": 9.804421768707483, |
| "grad_norm": 0.39671510457992554, |
| "learning_rate": 1.0316901408450704e-05, |
| "loss": 1.3306, |
| "step": 57650 |
| }, |
| { |
| "epoch": 9.812925170068027, |
| "grad_norm": 0.39632678031921387, |
| "learning_rate": 9.876760563380282e-06, |
| "loss": 1.3311, |
| "step": 57700 |
| }, |
| { |
| "epoch": 9.821428571428571, |
| "grad_norm": 0.4076337516307831, |
| "learning_rate": 9.43661971830986e-06, |
| "loss": 1.3348, |
| "step": 57750 |
| }, |
| { |
| "epoch": 9.829931972789115, |
| "grad_norm": 0.40061214566230774, |
| "learning_rate": 8.996478873239436e-06, |
| "loss": 1.3323, |
| "step": 57800 |
| }, |
| { |
| "epoch": 9.83843537414966, |
| "grad_norm": 0.3868488371372223, |
| "learning_rate": 8.556338028169014e-06, |
| "loss": 1.3274, |
| "step": 57850 |
| }, |
| { |
| "epoch": 9.846938775510203, |
| "grad_norm": 0.43311014771461487, |
| "learning_rate": 8.116197183098593e-06, |
| "loss": 1.3316, |
| "step": 57900 |
| }, |
| { |
| "epoch": 9.85544217687075, |
| "grad_norm": 0.39683276414871216, |
| "learning_rate": 7.67605633802817e-06, |
| "loss": 1.3302, |
| "step": 57950 |
| }, |
| { |
| "epoch": 9.863945578231293, |
| "grad_norm": 0.3994409143924713, |
| "learning_rate": 7.235915492957746e-06, |
| "loss": 1.3311, |
| "step": 58000 |
| }, |
| { |
| "epoch": 9.863945578231293, |
| "eval_loss": 1.499453067779541, |
| "eval_runtime": 75.4095, |
| "eval_samples_per_second": 1244.392, |
| "eval_steps_per_second": 4.867, |
| "step": 58000 |
| }, |
| { |
| "epoch": 9.872448979591837, |
| "grad_norm": 0.40057429671287537, |
| "learning_rate": 6.795774647887324e-06, |
| "loss": 1.331, |
| "step": 58050 |
| }, |
| { |
| "epoch": 9.880952380952381, |
| "grad_norm": 0.39245638251304626, |
| "learning_rate": 6.355633802816902e-06, |
| "loss": 1.334, |
| "step": 58100 |
| }, |
| { |
| "epoch": 9.889455782312925, |
| "grad_norm": 0.3993644714355469, |
| "learning_rate": 5.915492957746479e-06, |
| "loss": 1.3267, |
| "step": 58150 |
| }, |
| { |
| "epoch": 9.89795918367347, |
| "grad_norm": 0.39710524678230286, |
| "learning_rate": 5.475352112676056e-06, |
| "loss": 1.3201, |
| "step": 58200 |
| }, |
| { |
| "epoch": 9.906462585034014, |
| "grad_norm": 0.3986225426197052, |
| "learning_rate": 5.0352112676056345e-06, |
| "loss": 1.3275, |
| "step": 58250 |
| }, |
| { |
| "epoch": 9.914965986394558, |
| "grad_norm": 0.4060732126235962, |
| "learning_rate": 4.595070422535211e-06, |
| "loss": 1.3274, |
| "step": 58300 |
| }, |
| { |
| "epoch": 9.923469387755102, |
| "grad_norm": 0.3890203833580017, |
| "learning_rate": 4.154929577464789e-06, |
| "loss": 1.3259, |
| "step": 58350 |
| }, |
| { |
| "epoch": 9.931972789115646, |
| "grad_norm": 0.39963120222091675, |
| "learning_rate": 3.7147887323943665e-06, |
| "loss": 1.3343, |
| "step": 58400 |
| }, |
| { |
| "epoch": 9.94047619047619, |
| "grad_norm": 0.4122091233730316, |
| "learning_rate": 3.274647887323944e-06, |
| "loss": 1.3313, |
| "step": 58450 |
| }, |
| { |
| "epoch": 9.948979591836736, |
| "grad_norm": 0.40625423192977905, |
| "learning_rate": 2.8433098591549298e-06, |
| "loss": 1.329, |
| "step": 58500 |
| }, |
| { |
| "epoch": 9.95748299319728, |
| "grad_norm": 0.39917826652526855, |
| "learning_rate": 2.4031690140845074e-06, |
| "loss": 1.329, |
| "step": 58550 |
| }, |
| { |
| "epoch": 9.965986394557824, |
| "grad_norm": 0.4005703330039978, |
| "learning_rate": 1.9630281690140846e-06, |
| "loss": 1.3286, |
| "step": 58600 |
| }, |
| { |
| "epoch": 9.974489795918368, |
| "grad_norm": 0.4052295386791229, |
| "learning_rate": 1.522887323943662e-06, |
| "loss": 1.3334, |
| "step": 58650 |
| }, |
| { |
| "epoch": 9.982993197278912, |
| "grad_norm": 0.3987277150154114, |
| "learning_rate": 1.0827464788732394e-06, |
| "loss": 1.3279, |
| "step": 58700 |
| }, |
| { |
| "epoch": 9.991496598639456, |
| "grad_norm": 0.39191877841949463, |
| "learning_rate": 6.426056338028169e-07, |
| "loss": 1.322, |
| "step": 58750 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.39657750725746155, |
| "learning_rate": 2.0246478873239435e-07, |
| "loss": 1.3269, |
| "step": 58800 |
| }, |
| { |
| "epoch": 10.008503401360544, |
| "grad_norm": 0.4391802251338959, |
| "learning_rate": 0.000170377030162413, |
| "loss": 1.3375, |
| "step": 58850 |
| }, |
| { |
| "epoch": 10.017006802721088, |
| "grad_norm": 0.4108564257621765, |
| "learning_rate": 0.00017008700696055683, |
| "loss": 1.3625, |
| "step": 58900 |
| }, |
| { |
| "epoch": 10.025510204081632, |
| "grad_norm": 0.4225695729255676, |
| "learning_rate": 0.0001697969837587007, |
| "loss": 1.3704, |
| "step": 58950 |
| }, |
| { |
| "epoch": 10.034013605442176, |
| "grad_norm": 0.41939666867256165, |
| "learning_rate": 0.00016950696055684455, |
| "loss": 1.3693, |
| "step": 59000 |
| }, |
| { |
| "epoch": 10.034013605442176, |
| "eval_loss": 1.5424796342849731, |
| "eval_runtime": 75.2733, |
| "eval_samples_per_second": 1246.644, |
| "eval_steps_per_second": 4.876, |
| "step": 59000 |
| }, |
| { |
| "epoch": 10.04251700680272, |
| "grad_norm": 0.925662100315094, |
| "learning_rate": 0.00016921693735498842, |
| "loss": 1.3652, |
| "step": 59050 |
| }, |
| { |
| "epoch": 10.051020408163266, |
| "grad_norm": 0.42733487486839294, |
| "learning_rate": 0.00016892691415313225, |
| "loss": 1.3702, |
| "step": 59100 |
| }, |
| { |
| "epoch": 10.05952380952381, |
| "grad_norm": 0.43742504715919495, |
| "learning_rate": 0.0001686368909512761, |
| "loss": 1.3748, |
| "step": 59150 |
| }, |
| { |
| "epoch": 10.068027210884354, |
| "grad_norm": 0.5494563579559326, |
| "learning_rate": 0.00016834686774941994, |
| "loss": 1.3775, |
| "step": 59200 |
| }, |
| { |
| "epoch": 10.076530612244898, |
| "grad_norm": 0.43776175379753113, |
| "learning_rate": 0.0001680568445475638, |
| "loss": 1.3784, |
| "step": 59250 |
| }, |
| { |
| "epoch": 10.085034013605442, |
| "grad_norm": 0.4281368851661682, |
| "learning_rate": 0.00016776682134570767, |
| "loss": 1.3695, |
| "step": 59300 |
| }, |
| { |
| "epoch": 10.093537414965986, |
| "grad_norm": 0.5244280695915222, |
| "learning_rate": 0.0001674767981438515, |
| "loss": 1.3764, |
| "step": 59350 |
| }, |
| { |
| "epoch": 10.10204081632653, |
| "grad_norm": 0.40682801604270935, |
| "learning_rate": 0.00016718677494199536, |
| "loss": 1.3829, |
| "step": 59400 |
| }, |
| { |
| "epoch": 10.110544217687075, |
| "grad_norm": 0.42796218395233154, |
| "learning_rate": 0.00016690255220417631, |
| "loss": 1.376, |
| "step": 59450 |
| }, |
| { |
| "epoch": 10.119047619047619, |
| "grad_norm": 0.43184956908226013, |
| "learning_rate": 0.00016661252900232018, |
| "loss": 1.3794, |
| "step": 59500 |
| }, |
| { |
| "epoch": 10.127551020408163, |
| "grad_norm": 0.4699791371822357, |
| "learning_rate": 0.00016632250580046404, |
| "loss": 1.3849, |
| "step": 59550 |
| }, |
| { |
| "epoch": 10.136054421768707, |
| "grad_norm": 0.4018096923828125, |
| "learning_rate": 0.0001660324825986079, |
| "loss": 1.3765, |
| "step": 59600 |
| }, |
| { |
| "epoch": 10.14455782312925, |
| "grad_norm": 0.42693692445755005, |
| "learning_rate": 0.00016574245939675176, |
| "loss": 1.3726, |
| "step": 59650 |
| }, |
| { |
| "epoch": 10.153061224489797, |
| "grad_norm": 0.42560431361198425, |
| "learning_rate": 0.0001654524361948956, |
| "loss": 1.376, |
| "step": 59700 |
| }, |
| { |
| "epoch": 10.16156462585034, |
| "grad_norm": 0.42239609360694885, |
| "learning_rate": 0.00016516241299303943, |
| "loss": 1.3863, |
| "step": 59750 |
| }, |
| { |
| "epoch": 10.170068027210885, |
| "grad_norm": 0.41921770572662354, |
| "learning_rate": 0.0001648723897911833, |
| "loss": 1.3819, |
| "step": 59800 |
| }, |
| { |
| "epoch": 10.178571428571429, |
| "grad_norm": 0.41046616435050964, |
| "learning_rate": 0.00016458236658932715, |
| "loss": 1.3748, |
| "step": 59850 |
| }, |
| { |
| "epoch": 10.187074829931973, |
| "grad_norm": 0.41564711928367615, |
| "learning_rate": 0.000164292343387471, |
| "loss": 1.3802, |
| "step": 59900 |
| }, |
| { |
| "epoch": 10.195578231292517, |
| "grad_norm": 0.4275333881378174, |
| "learning_rate": 0.00016400232018561484, |
| "loss": 1.3757, |
| "step": 59950 |
| }, |
| { |
| "epoch": 10.204081632653061, |
| "grad_norm": 0.47480013966560364, |
| "learning_rate": 0.0001637122969837587, |
| "loss": 1.3858, |
| "step": 60000 |
| }, |
| { |
| "epoch": 10.204081632653061, |
| "eval_loss": 1.5496501922607422, |
| "eval_runtime": 75.245, |
| "eval_samples_per_second": 1247.113, |
| "eval_steps_per_second": 4.877, |
| "step": 60000 |
| }, |
| { |
| "epoch": 10.212585034013605, |
| "grad_norm": 2.0938565731048584, |
| "learning_rate": 0.00016342227378190257, |
| "loss": 1.3816, |
| "step": 60050 |
| }, |
| { |
| "epoch": 10.22108843537415, |
| "grad_norm": 0.4088470935821533, |
| "learning_rate": 0.0001631322505800464, |
| "loss": 1.3806, |
| "step": 60100 |
| }, |
| { |
| "epoch": 10.229591836734693, |
| "grad_norm": 0.43842536211013794, |
| "learning_rate": 0.00016284222737819026, |
| "loss": 1.3796, |
| "step": 60150 |
| }, |
| { |
| "epoch": 10.238095238095237, |
| "grad_norm": 0.41963255405426025, |
| "learning_rate": 0.00016255220417633412, |
| "loss": 1.3798, |
| "step": 60200 |
| }, |
| { |
| "epoch": 10.246598639455783, |
| "grad_norm": 0.4422262907028198, |
| "learning_rate": 0.00016226218097447796, |
| "loss": 1.3778, |
| "step": 60250 |
| }, |
| { |
| "epoch": 10.255102040816327, |
| "grad_norm": 0.43372678756713867, |
| "learning_rate": 0.00016197215777262182, |
| "loss": 1.3779, |
| "step": 60300 |
| }, |
| { |
| "epoch": 10.263605442176871, |
| "grad_norm": 0.42680031061172485, |
| "learning_rate": 0.00016168213457076568, |
| "loss": 1.3771, |
| "step": 60350 |
| }, |
| { |
| "epoch": 10.272108843537415, |
| "grad_norm": 0.4302733540534973, |
| "learning_rate": 0.0001613921113689095, |
| "loss": 1.3848, |
| "step": 60400 |
| }, |
| { |
| "epoch": 10.28061224489796, |
| "grad_norm": 0.4111240804195404, |
| "learning_rate": 0.00016110208816705337, |
| "loss": 1.3784, |
| "step": 60450 |
| }, |
| { |
| "epoch": 10.289115646258503, |
| "grad_norm": 0.43267524242401123, |
| "learning_rate": 0.0001608120649651972, |
| "loss": 1.3828, |
| "step": 60500 |
| }, |
| { |
| "epoch": 10.297619047619047, |
| "grad_norm": 0.4088628888130188, |
| "learning_rate": 0.00016052204176334107, |
| "loss": 1.3809, |
| "step": 60550 |
| }, |
| { |
| "epoch": 10.306122448979592, |
| "grad_norm": 0.4188777208328247, |
| "learning_rate": 0.00016023201856148493, |
| "loss": 1.3772, |
| "step": 60600 |
| }, |
| { |
| "epoch": 10.314625850340136, |
| "grad_norm": 0.4147690534591675, |
| "learning_rate": 0.0001599419953596288, |
| "loss": 1.3788, |
| "step": 60650 |
| }, |
| { |
| "epoch": 10.32312925170068, |
| "grad_norm": 0.48966458439826965, |
| "learning_rate": 0.00015965777262180974, |
| "loss": 1.378, |
| "step": 60700 |
| }, |
| { |
| "epoch": 10.331632653061224, |
| "grad_norm": 0.40553244948387146, |
| "learning_rate": 0.0001593677494199536, |
| "loss": 1.3879, |
| "step": 60750 |
| }, |
| { |
| "epoch": 10.34013605442177, |
| "grad_norm": 0.41595011949539185, |
| "learning_rate": 0.00015907772621809747, |
| "loss": 1.3801, |
| "step": 60800 |
| }, |
| { |
| "epoch": 10.348639455782314, |
| "grad_norm": 0.41190576553344727, |
| "learning_rate": 0.0001587877030162413, |
| "loss": 1.3838, |
| "step": 60850 |
| }, |
| { |
| "epoch": 10.357142857142858, |
| "grad_norm": 0.4079788625240326, |
| "learning_rate": 0.00015849767981438516, |
| "loss": 1.3885, |
| "step": 60900 |
| }, |
| { |
| "epoch": 10.365646258503402, |
| "grad_norm": 0.41605037450790405, |
| "learning_rate": 0.000158207656612529, |
| "loss": 1.3869, |
| "step": 60950 |
| }, |
| { |
| "epoch": 10.374149659863946, |
| "grad_norm": 0.41318464279174805, |
| "learning_rate": 0.00015791763341067286, |
| "loss": 1.3789, |
| "step": 61000 |
| }, |
| { |
| "epoch": 10.374149659863946, |
| "eval_loss": 1.549613356590271, |
| "eval_runtime": 75.299, |
| "eval_samples_per_second": 1246.219, |
| "eval_steps_per_second": 4.874, |
| "step": 61000 |
| }, |
| { |
| "epoch": 10.38265306122449, |
| "grad_norm": 0.42197248339653015, |
| "learning_rate": 0.00015762761020881672, |
| "loss": 1.3862, |
| "step": 61050 |
| }, |
| { |
| "epoch": 10.391156462585034, |
| "grad_norm": 0.423498272895813, |
| "learning_rate": 0.00015733758700696055, |
| "loss": 1.3831, |
| "step": 61100 |
| }, |
| { |
| "epoch": 10.399659863945578, |
| "grad_norm": 0.41805723309516907, |
| "learning_rate": 0.0001570475638051044, |
| "loss": 1.3807, |
| "step": 61150 |
| }, |
| { |
| "epoch": 10.408163265306122, |
| "grad_norm": 0.40790635347366333, |
| "learning_rate": 0.00015675754060324827, |
| "loss": 1.3818, |
| "step": 61200 |
| }, |
| { |
| "epoch": 10.416666666666666, |
| "grad_norm": 0.42006179690361023, |
| "learning_rate": 0.0001564675174013921, |
| "loss": 1.3775, |
| "step": 61250 |
| }, |
| { |
| "epoch": 10.42517006802721, |
| "grad_norm": 0.41415610909461975, |
| "learning_rate": 0.00015617749419953597, |
| "loss": 1.391, |
| "step": 61300 |
| }, |
| { |
| "epoch": 10.433673469387756, |
| "grad_norm": 0.44713294506073, |
| "learning_rate": 0.00015588747099767983, |
| "loss": 1.383, |
| "step": 61350 |
| }, |
| { |
| "epoch": 10.4421768707483, |
| "grad_norm": 0.44019782543182373, |
| "learning_rate": 0.00015559744779582366, |
| "loss": 1.3904, |
| "step": 61400 |
| }, |
| { |
| "epoch": 10.450680272108844, |
| "grad_norm": 0.4164921045303345, |
| "learning_rate": 0.00015530742459396752, |
| "loss": 1.3821, |
| "step": 61450 |
| }, |
| { |
| "epoch": 10.459183673469388, |
| "grad_norm": 0.4174480438232422, |
| "learning_rate": 0.00015501740139211139, |
| "loss": 1.3791, |
| "step": 61500 |
| }, |
| { |
| "epoch": 10.467687074829932, |
| "grad_norm": 0.49047085642814636, |
| "learning_rate": 0.00015472737819025522, |
| "loss": 1.3818, |
| "step": 61550 |
| }, |
| { |
| "epoch": 10.476190476190476, |
| "grad_norm": 0.405862957239151, |
| "learning_rate": 0.00015443735498839908, |
| "loss": 1.3817, |
| "step": 61600 |
| }, |
| { |
| "epoch": 10.48469387755102, |
| "grad_norm": 0.4351731538772583, |
| "learning_rate": 0.00015414733178654291, |
| "loss": 1.3806, |
| "step": 61650 |
| }, |
| { |
| "epoch": 10.493197278911564, |
| "grad_norm": 0.4245615303516388, |
| "learning_rate": 0.00015385730858468678, |
| "loss": 1.383, |
| "step": 61700 |
| }, |
| { |
| "epoch": 10.501700680272108, |
| "grad_norm": 0.39436227083206177, |
| "learning_rate": 0.00015356728538283064, |
| "loss": 1.3805, |
| "step": 61750 |
| }, |
| { |
| "epoch": 10.510204081632653, |
| "grad_norm": 0.41128745675086975, |
| "learning_rate": 0.0001532772621809745, |
| "loss": 1.386, |
| "step": 61800 |
| }, |
| { |
| "epoch": 10.518707482993197, |
| "grad_norm": 0.42046472430229187, |
| "learning_rate": 0.00015298723897911833, |
| "loss": 1.3897, |
| "step": 61850 |
| }, |
| { |
| "epoch": 10.527210884353742, |
| "grad_norm": 0.4486936926841736, |
| "learning_rate": 0.00015269721577726217, |
| "loss": 1.383, |
| "step": 61900 |
| }, |
| { |
| "epoch": 10.535714285714286, |
| "grad_norm": 0.4166390001773834, |
| "learning_rate": 0.00015240719257540603, |
| "loss": 1.3793, |
| "step": 61950 |
| }, |
| { |
| "epoch": 10.54421768707483, |
| "grad_norm": 0.42586439847946167, |
| "learning_rate": 0.0001521171693735499, |
| "loss": 1.3778, |
| "step": 62000 |
| }, |
| { |
| "epoch": 10.54421768707483, |
| "eval_loss": 1.5420804023742676, |
| "eval_runtime": 75.2999, |
| "eval_samples_per_second": 1246.204, |
| "eval_steps_per_second": 4.874, |
| "step": 62000 |
| }, |
| { |
| "epoch": 10.552721088435375, |
| "grad_norm": 0.39667069911956787, |
| "learning_rate": 0.00015182714617169375, |
| "loss": 1.3749, |
| "step": 62050 |
| }, |
| { |
| "epoch": 10.561224489795919, |
| "grad_norm": 0.40961819887161255, |
| "learning_rate": 0.0001515371229698376, |
| "loss": 1.3797, |
| "step": 62100 |
| }, |
| { |
| "epoch": 10.569727891156463, |
| "grad_norm": 0.4282205104827881, |
| "learning_rate": 0.00015124709976798144, |
| "loss": 1.3803, |
| "step": 62150 |
| }, |
| { |
| "epoch": 10.578231292517007, |
| "grad_norm": 0.42050084471702576, |
| "learning_rate": 0.00015095707656612528, |
| "loss": 1.3812, |
| "step": 62200 |
| }, |
| { |
| "epoch": 10.58673469387755, |
| "grad_norm": 0.4289666712284088, |
| "learning_rate": 0.00015066705336426914, |
| "loss": 1.3824, |
| "step": 62250 |
| }, |
| { |
| "epoch": 10.595238095238095, |
| "grad_norm": 0.4385221600532532, |
| "learning_rate": 0.000150377030162413, |
| "loss": 1.3733, |
| "step": 62300 |
| }, |
| { |
| "epoch": 10.603741496598639, |
| "grad_norm": 2.321608781814575, |
| "learning_rate": 0.00015008700696055686, |
| "loss": 1.3793, |
| "step": 62350 |
| }, |
| { |
| "epoch": 10.612244897959183, |
| "grad_norm": 0.4276087284088135, |
| "learning_rate": 0.00014979698375870072, |
| "loss": 1.382, |
| "step": 62400 |
| }, |
| { |
| "epoch": 10.620748299319727, |
| "grad_norm": 0.41472572088241577, |
| "learning_rate": 0.00014950696055684453, |
| "loss": 1.3772, |
| "step": 62450 |
| }, |
| { |
| "epoch": 10.629251700680273, |
| "grad_norm": 0.40703973174095154, |
| "learning_rate": 0.0001492169373549884, |
| "loss": 1.381, |
| "step": 62500 |
| }, |
| { |
| "epoch": 10.637755102040817, |
| "grad_norm": 0.41853588819503784, |
| "learning_rate": 0.00014892691415313225, |
| "loss": 1.377, |
| "step": 62550 |
| }, |
| { |
| "epoch": 10.646258503401361, |
| "grad_norm": 0.4194204807281494, |
| "learning_rate": 0.0001486368909512761, |
| "loss": 1.3803, |
| "step": 62600 |
| }, |
| { |
| "epoch": 10.654761904761905, |
| "grad_norm": 0.41111990809440613, |
| "learning_rate": 0.00014834686774941997, |
| "loss": 1.3778, |
| "step": 62650 |
| }, |
| { |
| "epoch": 10.66326530612245, |
| "grad_norm": 0.43283921480178833, |
| "learning_rate": 0.0001480568445475638, |
| "loss": 1.3759, |
| "step": 62700 |
| }, |
| { |
| "epoch": 10.671768707482993, |
| "grad_norm": 0.4069305956363678, |
| "learning_rate": 0.00014776682134570764, |
| "loss": 1.3779, |
| "step": 62750 |
| }, |
| { |
| "epoch": 10.680272108843537, |
| "grad_norm": 0.4265955984592438, |
| "learning_rate": 0.0001474767981438515, |
| "loss": 1.3818, |
| "step": 62800 |
| }, |
| { |
| "epoch": 10.688775510204081, |
| "grad_norm": 0.43886256217956543, |
| "learning_rate": 0.00014718677494199536, |
| "loss": 1.3739, |
| "step": 62850 |
| }, |
| { |
| "epoch": 10.697278911564625, |
| "grad_norm": 0.4144219756126404, |
| "learning_rate": 0.00014689675174013922, |
| "loss": 1.3748, |
| "step": 62900 |
| }, |
| { |
| "epoch": 10.70578231292517, |
| "grad_norm": 0.41580018401145935, |
| "learning_rate": 0.00014660672853828309, |
| "loss": 1.379, |
| "step": 62950 |
| }, |
| { |
| "epoch": 10.714285714285714, |
| "grad_norm": 0.41576939821243286, |
| "learning_rate": 0.00014631670533642692, |
| "loss": 1.3789, |
| "step": 63000 |
| }, |
| { |
| "epoch": 10.714285714285714, |
| "eval_loss": 1.5434892177581787, |
| "eval_runtime": 75.2563, |
| "eval_samples_per_second": 1246.925, |
| "eval_steps_per_second": 4.877, |
| "step": 63000 |
| }, |
| { |
| "epoch": 10.722789115646258, |
| "grad_norm": 0.44234171509742737, |
| "learning_rate": 0.00014602668213457075, |
| "loss": 1.3822, |
| "step": 63050 |
| }, |
| { |
| "epoch": 10.731292517006803, |
| "grad_norm": 0.4127776026725769, |
| "learning_rate": 0.00014573665893271461, |
| "loss": 1.3789, |
| "step": 63100 |
| }, |
| { |
| "epoch": 10.739795918367347, |
| "grad_norm": 0.4461323320865631, |
| "learning_rate": 0.00014544663573085848, |
| "loss": 1.3744, |
| "step": 63150 |
| }, |
| { |
| "epoch": 10.748299319727892, |
| "grad_norm": 0.5485215187072754, |
| "learning_rate": 0.00014515661252900234, |
| "loss": 1.3762, |
| "step": 63200 |
| }, |
| { |
| "epoch": 10.756802721088436, |
| "grad_norm": 0.4205337464809418, |
| "learning_rate": 0.00014486658932714617, |
| "loss": 1.3753, |
| "step": 63250 |
| }, |
| { |
| "epoch": 10.76530612244898, |
| "grad_norm": 0.4096565246582031, |
| "learning_rate": 0.00014457656612529003, |
| "loss": 1.3752, |
| "step": 63300 |
| }, |
| { |
| "epoch": 10.773809523809524, |
| "grad_norm": 0.44153106212615967, |
| "learning_rate": 0.00014428654292343387, |
| "loss": 1.3784, |
| "step": 63350 |
| }, |
| { |
| "epoch": 10.782312925170068, |
| "grad_norm": 0.41935479640960693, |
| "learning_rate": 0.00014399651972157773, |
| "loss": 1.3803, |
| "step": 63400 |
| }, |
| { |
| "epoch": 10.790816326530612, |
| "grad_norm": 0.40989023447036743, |
| "learning_rate": 0.0001437064965197216, |
| "loss": 1.3779, |
| "step": 63450 |
| }, |
| { |
| "epoch": 10.799319727891156, |
| "grad_norm": 0.41036033630371094, |
| "learning_rate": 0.00014341647331786542, |
| "loss": 1.38, |
| "step": 63500 |
| }, |
| { |
| "epoch": 10.8078231292517, |
| "grad_norm": 0.4159608781337738, |
| "learning_rate": 0.00014312645011600928, |
| "loss": 1.3775, |
| "step": 63550 |
| }, |
| { |
| "epoch": 10.816326530612244, |
| "grad_norm": 0.4197967052459717, |
| "learning_rate": 0.00014283642691415314, |
| "loss": 1.3763, |
| "step": 63600 |
| }, |
| { |
| "epoch": 10.82482993197279, |
| "grad_norm": 0.9038332104682922, |
| "learning_rate": 0.00014254640371229698, |
| "loss": 1.3774, |
| "step": 63650 |
| }, |
| { |
| "epoch": 10.833333333333334, |
| "grad_norm": 0.41402915120124817, |
| "learning_rate": 0.00014225638051044084, |
| "loss": 1.3699, |
| "step": 63700 |
| }, |
| { |
| "epoch": 10.841836734693878, |
| "grad_norm": 0.4222056567668915, |
| "learning_rate": 0.00014197215777262182, |
| "loss": 1.3716, |
| "step": 63750 |
| }, |
| { |
| "epoch": 10.850340136054422, |
| "grad_norm": 0.4178094267845154, |
| "learning_rate": 0.00014168213457076568, |
| "loss": 1.3776, |
| "step": 63800 |
| }, |
| { |
| "epoch": 10.858843537414966, |
| "grad_norm": 0.42892855405807495, |
| "learning_rate": 0.00014139211136890951, |
| "loss": 1.3747, |
| "step": 63850 |
| }, |
| { |
| "epoch": 10.86734693877551, |
| "grad_norm": 0.4110173285007477, |
| "learning_rate": 0.00014110208816705335, |
| "loss": 1.3807, |
| "step": 63900 |
| }, |
| { |
| "epoch": 10.875850340136054, |
| "grad_norm": 0.4244990050792694, |
| "learning_rate": 0.0001408120649651972, |
| "loss": 1.375, |
| "step": 63950 |
| }, |
| { |
| "epoch": 10.884353741496598, |
| "grad_norm": 0.5068066716194153, |
| "learning_rate": 0.00014052204176334107, |
| "loss": 1.3742, |
| "step": 64000 |
| }, |
| { |
| "epoch": 10.884353741496598, |
| "eval_loss": 1.5442039966583252, |
| "eval_runtime": 75.3144, |
| "eval_samples_per_second": 1245.964, |
| "eval_steps_per_second": 4.873, |
| "step": 64000 |
| }, |
| { |
| "epoch": 10.892857142857142, |
| "grad_norm": 0.4364668130874634, |
| "learning_rate": 0.00014023201856148493, |
| "loss": 1.3739, |
| "step": 64050 |
| }, |
| { |
| "epoch": 10.901360544217686, |
| "grad_norm": 0.4433183968067169, |
| "learning_rate": 0.00013994199535962877, |
| "loss": 1.3756, |
| "step": 64100 |
| }, |
| { |
| "epoch": 10.90986394557823, |
| "grad_norm": 0.41508781909942627, |
| "learning_rate": 0.00013965197215777263, |
| "loss": 1.3792, |
| "step": 64150 |
| }, |
| { |
| "epoch": 10.918367346938776, |
| "grad_norm": 0.4308226406574249, |
| "learning_rate": 0.00013936194895591646, |
| "loss": 1.3818, |
| "step": 64200 |
| }, |
| { |
| "epoch": 10.92687074829932, |
| "grad_norm": 0.43690988421440125, |
| "learning_rate": 0.00013907192575406032, |
| "loss": 1.3723, |
| "step": 64250 |
| }, |
| { |
| "epoch": 10.935374149659864, |
| "grad_norm": 0.9234839081764221, |
| "learning_rate": 0.00013878190255220418, |
| "loss": 1.3769, |
| "step": 64300 |
| }, |
| { |
| "epoch": 10.943877551020408, |
| "grad_norm": 0.4184640049934387, |
| "learning_rate": 0.00013849187935034804, |
| "loss": 1.3783, |
| "step": 64350 |
| }, |
| { |
| "epoch": 10.952380952380953, |
| "grad_norm": 0.418977826833725, |
| "learning_rate": 0.00013820185614849188, |
| "loss": 1.3708, |
| "step": 64400 |
| }, |
| { |
| "epoch": 10.960884353741497, |
| "grad_norm": 0.42863503098487854, |
| "learning_rate": 0.00013791183294663574, |
| "loss": 1.3643, |
| "step": 64450 |
| }, |
| { |
| "epoch": 10.96938775510204, |
| "grad_norm": 0.41124334931373596, |
| "learning_rate": 0.00013762180974477957, |
| "loss": 1.3788, |
| "step": 64500 |
| }, |
| { |
| "epoch": 10.977891156462585, |
| "grad_norm": 0.46177518367767334, |
| "learning_rate": 0.00013733178654292343, |
| "loss": 1.376, |
| "step": 64550 |
| }, |
| { |
| "epoch": 10.986394557823129, |
| "grad_norm": 0.4240172207355499, |
| "learning_rate": 0.0001370417633410673, |
| "loss": 1.3674, |
| "step": 64600 |
| }, |
| { |
| "epoch": 10.994897959183673, |
| "grad_norm": 0.4008079767227173, |
| "learning_rate": 0.00013675174013921113, |
| "loss": 1.3676, |
| "step": 64650 |
| }, |
| { |
| "epoch": 11.003401360544217, |
| "grad_norm": 0.40654611587524414, |
| "learning_rate": 0.000136461716937355, |
| "loss": 1.3716, |
| "step": 64700 |
| }, |
| { |
| "epoch": 11.011904761904763, |
| "grad_norm": 0.4086385667324066, |
| "learning_rate": 0.00013617169373549885, |
| "loss": 1.3692, |
| "step": 64750 |
| }, |
| { |
| "epoch": 11.020408163265307, |
| "grad_norm": 0.43329235911369324, |
| "learning_rate": 0.0001358816705336427, |
| "loss": 1.3694, |
| "step": 64800 |
| }, |
| { |
| "epoch": 11.02891156462585, |
| "grad_norm": 0.43725255131721497, |
| "learning_rate": 0.00013559164733178655, |
| "loss": 1.3648, |
| "step": 64850 |
| }, |
| { |
| "epoch": 11.037414965986395, |
| "grad_norm": 0.4894959628582001, |
| "learning_rate": 0.00013530162412993038, |
| "loss": 1.3681, |
| "step": 64900 |
| }, |
| { |
| "epoch": 11.045918367346939, |
| "grad_norm": 0.4180058538913727, |
| "learning_rate": 0.00013501160092807424, |
| "loss": 1.3637, |
| "step": 64950 |
| }, |
| { |
| "epoch": 11.054421768707483, |
| "grad_norm": 0.42746037244796753, |
| "learning_rate": 0.0001347215777262181, |
| "loss": 1.3674, |
| "step": 65000 |
| }, |
| { |
| "epoch": 11.054421768707483, |
| "eval_loss": 1.5379548072814941, |
| "eval_runtime": 75.1076, |
| "eval_samples_per_second": 1249.394, |
| "eval_steps_per_second": 4.886, |
| "step": 65000 |
| }, |
| { |
| "epoch": 11.062925170068027, |
| "grad_norm": 0.4063352644443512, |
| "learning_rate": 0.00013443155452436196, |
| "loss": 1.3644, |
| "step": 65050 |
| }, |
| { |
| "epoch": 11.071428571428571, |
| "grad_norm": 0.41466256976127625, |
| "learning_rate": 0.00013414153132250582, |
| "loss": 1.367, |
| "step": 65100 |
| }, |
| { |
| "epoch": 11.079931972789115, |
| "grad_norm": 0.40846332907676697, |
| "learning_rate": 0.00013385150812064966, |
| "loss": 1.3586, |
| "step": 65150 |
| }, |
| { |
| "epoch": 11.08843537414966, |
| "grad_norm": 0.4066697955131531, |
| "learning_rate": 0.0001335614849187935, |
| "loss": 1.3681, |
| "step": 65200 |
| }, |
| { |
| "epoch": 11.096938775510203, |
| "grad_norm": 0.4276025891304016, |
| "learning_rate": 0.00013327146171693735, |
| "loss": 1.3638, |
| "step": 65250 |
| }, |
| { |
| "epoch": 11.10544217687075, |
| "grad_norm": 0.4069554805755615, |
| "learning_rate": 0.00013298143851508121, |
| "loss": 1.3661, |
| "step": 65300 |
| }, |
| { |
| "epoch": 11.113945578231293, |
| "grad_norm": 0.5378205180168152, |
| "learning_rate": 0.00013269141531322508, |
| "loss": 1.365, |
| "step": 65350 |
| }, |
| { |
| "epoch": 11.122448979591837, |
| "grad_norm": 0.4328392744064331, |
| "learning_rate": 0.00013240139211136894, |
| "loss": 1.3681, |
| "step": 65400 |
| }, |
| { |
| "epoch": 11.130952380952381, |
| "grad_norm": 1.8447155952453613, |
| "learning_rate": 0.00013211136890951274, |
| "loss": 1.3643, |
| "step": 65450 |
| }, |
| { |
| "epoch": 11.139455782312925, |
| "grad_norm": 0.4180423319339752, |
| "learning_rate": 0.0001318213457076566, |
| "loss": 1.3644, |
| "step": 65500 |
| }, |
| { |
| "epoch": 11.14795918367347, |
| "grad_norm": 0.4122379720211029, |
| "learning_rate": 0.00013153132250580047, |
| "loss": 1.3626, |
| "step": 65550 |
| }, |
| { |
| "epoch": 11.156462585034014, |
| "grad_norm": 0.431257963180542, |
| "learning_rate": 0.00013124129930394433, |
| "loss": 1.3653, |
| "step": 65600 |
| }, |
| { |
| "epoch": 11.164965986394558, |
| "grad_norm": 0.39916712045669556, |
| "learning_rate": 0.0001309512761020882, |
| "loss": 1.3615, |
| "step": 65650 |
| }, |
| { |
| "epoch": 11.173469387755102, |
| "grad_norm": 0.4114948809146881, |
| "learning_rate": 0.00013066125290023202, |
| "loss": 1.37, |
| "step": 65700 |
| }, |
| { |
| "epoch": 11.181972789115646, |
| "grad_norm": 0.42501741647720337, |
| "learning_rate": 0.00013037122969837586, |
| "loss": 1.369, |
| "step": 65750 |
| }, |
| { |
| "epoch": 11.19047619047619, |
| "grad_norm": 0.4058555066585541, |
| "learning_rate": 0.00013008120649651972, |
| "loss": 1.3609, |
| "step": 65800 |
| }, |
| { |
| "epoch": 11.198979591836734, |
| "grad_norm": 0.40838614106178284, |
| "learning_rate": 0.00012979118329466358, |
| "loss": 1.3605, |
| "step": 65850 |
| }, |
| { |
| "epoch": 11.20748299319728, |
| "grad_norm": 0.4708266854286194, |
| "learning_rate": 0.00012950116009280744, |
| "loss": 1.3606, |
| "step": 65900 |
| }, |
| { |
| "epoch": 11.215986394557824, |
| "grad_norm": 0.40549665689468384, |
| "learning_rate": 0.0001292111368909513, |
| "loss": 1.3652, |
| "step": 65950 |
| }, |
| { |
| "epoch": 11.224489795918368, |
| "grad_norm": 0.4081163704395294, |
| "learning_rate": 0.00012892111368909513, |
| "loss": 1.3609, |
| "step": 66000 |
| }, |
| { |
| "epoch": 11.224489795918368, |
| "eval_loss": 1.5325791835784912, |
| "eval_runtime": 75.302, |
| "eval_samples_per_second": 1246.169, |
| "eval_steps_per_second": 4.874, |
| "step": 66000 |
| }, |
| { |
| "epoch": 11.232993197278912, |
| "grad_norm": 0.42392808198928833, |
| "learning_rate": 0.00012863109048723897, |
| "loss": 1.3617, |
| "step": 66050 |
| }, |
| { |
| "epoch": 11.241496598639456, |
| "grad_norm": 0.4850771725177765, |
| "learning_rate": 0.00012834106728538283, |
| "loss": 1.3733, |
| "step": 66100 |
| }, |
| { |
| "epoch": 11.25, |
| "grad_norm": 0.42814481258392334, |
| "learning_rate": 0.0001280510440835267, |
| "loss": 1.3652, |
| "step": 66150 |
| }, |
| { |
| "epoch": 11.258503401360544, |
| "grad_norm": 0.40000179409980774, |
| "learning_rate": 0.00012776102088167055, |
| "loss": 1.3717, |
| "step": 66200 |
| }, |
| { |
| "epoch": 11.267006802721088, |
| "grad_norm": 0.4144078493118286, |
| "learning_rate": 0.00012747099767981438, |
| "loss": 1.3645, |
| "step": 66250 |
| }, |
| { |
| "epoch": 11.275510204081632, |
| "grad_norm": 0.49152231216430664, |
| "learning_rate": 0.00012718097447795825, |
| "loss": 1.3661, |
| "step": 66300 |
| }, |
| { |
| "epoch": 11.284013605442176, |
| "grad_norm": 0.4072152078151703, |
| "learning_rate": 0.00012689095127610208, |
| "loss": 1.3671, |
| "step": 66350 |
| }, |
| { |
| "epoch": 11.29251700680272, |
| "grad_norm": 0.40859153866767883, |
| "learning_rate": 0.00012660092807424594, |
| "loss": 1.3667, |
| "step": 66400 |
| }, |
| { |
| "epoch": 11.301020408163266, |
| "grad_norm": 0.4389115869998932, |
| "learning_rate": 0.0001263109048723898, |
| "loss": 1.3648, |
| "step": 66450 |
| }, |
| { |
| "epoch": 11.30952380952381, |
| "grad_norm": 0.42225921154022217, |
| "learning_rate": 0.00012602088167053364, |
| "loss": 1.3584, |
| "step": 66500 |
| }, |
| { |
| "epoch": 11.318027210884354, |
| "grad_norm": 0.40919360518455505, |
| "learning_rate": 0.0001257308584686775, |
| "loss": 1.3613, |
| "step": 66550 |
| }, |
| { |
| "epoch": 11.326530612244898, |
| "grad_norm": 0.41595298051834106, |
| "learning_rate": 0.00012544083526682136, |
| "loss": 1.359, |
| "step": 66600 |
| }, |
| { |
| "epoch": 11.335034013605442, |
| "grad_norm": 0.405838280916214, |
| "learning_rate": 0.0001251508120649652, |
| "loss": 1.3581, |
| "step": 66650 |
| }, |
| { |
| "epoch": 11.343537414965986, |
| "grad_norm": 0.4283597767353058, |
| "learning_rate": 0.00012486078886310905, |
| "loss": 1.3653, |
| "step": 66700 |
| }, |
| { |
| "epoch": 11.35204081632653, |
| "grad_norm": 0.42228442430496216, |
| "learning_rate": 0.00012457076566125291, |
| "loss": 1.3666, |
| "step": 66750 |
| }, |
| { |
| "epoch": 11.360544217687075, |
| "grad_norm": 0.4238821566104889, |
| "learning_rate": 0.00012428074245939675, |
| "loss": 1.3621, |
| "step": 66800 |
| }, |
| { |
| "epoch": 11.369047619047619, |
| "grad_norm": 0.40990692377090454, |
| "learning_rate": 0.0001239907192575406, |
| "loss": 1.3663, |
| "step": 66850 |
| }, |
| { |
| "epoch": 11.377551020408163, |
| "grad_norm": 0.43895843625068665, |
| "learning_rate": 0.00012370069605568447, |
| "loss": 1.3581, |
| "step": 66900 |
| }, |
| { |
| "epoch": 11.386054421768707, |
| "grad_norm": 0.42183375358581543, |
| "learning_rate": 0.0001234106728538283, |
| "loss": 1.3655, |
| "step": 66950 |
| }, |
| { |
| "epoch": 11.39455782312925, |
| "grad_norm": 0.41603147983551025, |
| "learning_rate": 0.00012312064965197216, |
| "loss": 1.353, |
| "step": 67000 |
| }, |
| { |
| "epoch": 11.39455782312925, |
| "eval_loss": 1.5269132852554321, |
| "eval_runtime": 75.2762, |
| "eval_samples_per_second": 1246.596, |
| "eval_steps_per_second": 4.875, |
| "step": 67000 |
| }, |
| { |
| "epoch": 11.403061224489797, |
| "grad_norm": 0.42194870114326477, |
| "learning_rate": 0.00012283062645011603, |
| "loss": 1.3639, |
| "step": 67050 |
| }, |
| { |
| "epoch": 11.41156462585034, |
| "grad_norm": 0.42385604977607727, |
| "learning_rate": 0.00012254060324825986, |
| "loss": 1.3605, |
| "step": 67100 |
| }, |
| { |
| "epoch": 11.420068027210885, |
| "grad_norm": 0.4089812636375427, |
| "learning_rate": 0.00012225058004640372, |
| "loss": 1.3553, |
| "step": 67150 |
| }, |
| { |
| "epoch": 11.428571428571429, |
| "grad_norm": 0.402220219373703, |
| "learning_rate": 0.00012196055684454757, |
| "loss": 1.3643, |
| "step": 67200 |
| }, |
| { |
| "epoch": 11.437074829931973, |
| "grad_norm": 0.4062860310077667, |
| "learning_rate": 0.00012167053364269142, |
| "loss": 1.3615, |
| "step": 67250 |
| }, |
| { |
| "epoch": 11.445578231292517, |
| "grad_norm": 0.4270987808704376, |
| "learning_rate": 0.00012138051044083528, |
| "loss": 1.3586, |
| "step": 67300 |
| }, |
| { |
| "epoch": 11.454081632653061, |
| "grad_norm": 0.39737579226493835, |
| "learning_rate": 0.00012109048723897912, |
| "loss": 1.3566, |
| "step": 67350 |
| }, |
| { |
| "epoch": 11.462585034013605, |
| "grad_norm": 0.4278363287448883, |
| "learning_rate": 0.00012080046403712297, |
| "loss": 1.3597, |
| "step": 67400 |
| }, |
| { |
| "epoch": 11.47108843537415, |
| "grad_norm": 0.4197821319103241, |
| "learning_rate": 0.00012051044083526683, |
| "loss": 1.3627, |
| "step": 67450 |
| }, |
| { |
| "epoch": 11.479591836734693, |
| "grad_norm": 0.4141803979873657, |
| "learning_rate": 0.00012022041763341067, |
| "loss": 1.3576, |
| "step": 67500 |
| }, |
| { |
| "epoch": 11.488095238095237, |
| "grad_norm": 0.41713324189186096, |
| "learning_rate": 0.00011993039443155453, |
| "loss": 1.3544, |
| "step": 67550 |
| }, |
| { |
| "epoch": 11.496598639455783, |
| "grad_norm": 0.4070891737937927, |
| "learning_rate": 0.00011964037122969839, |
| "loss": 1.364, |
| "step": 67600 |
| }, |
| { |
| "epoch": 11.505102040816327, |
| "grad_norm": 0.4234033524990082, |
| "learning_rate": 0.00011935034802784222, |
| "loss": 1.3549, |
| "step": 67650 |
| }, |
| { |
| "epoch": 11.513605442176871, |
| "grad_norm": 0.4253610372543335, |
| "learning_rate": 0.00011906032482598608, |
| "loss": 1.36, |
| "step": 67700 |
| }, |
| { |
| "epoch": 11.522108843537415, |
| "grad_norm": 0.4099927246570587, |
| "learning_rate": 0.00011877030162412995, |
| "loss": 1.3639, |
| "step": 67750 |
| }, |
| { |
| "epoch": 11.53061224489796, |
| "grad_norm": 0.4126720726490021, |
| "learning_rate": 0.00011848607888631091, |
| "loss": 1.3629, |
| "step": 67800 |
| }, |
| { |
| "epoch": 11.539115646258503, |
| "grad_norm": 0.42233848571777344, |
| "learning_rate": 0.00011819605568445476, |
| "loss": 1.358, |
| "step": 67850 |
| }, |
| { |
| "epoch": 11.547619047619047, |
| "grad_norm": 0.42125800251960754, |
| "learning_rate": 0.00011790603248259861, |
| "loss": 1.3552, |
| "step": 67900 |
| }, |
| { |
| "epoch": 11.556122448979592, |
| "grad_norm": 0.4108964204788208, |
| "learning_rate": 0.00011761600928074247, |
| "loss": 1.3603, |
| "step": 67950 |
| }, |
| { |
| "epoch": 11.564625850340136, |
| "grad_norm": 0.4113200902938843, |
| "learning_rate": 0.00011732598607888632, |
| "loss": 1.3627, |
| "step": 68000 |
| }, |
| { |
| "epoch": 11.564625850340136, |
| "eval_loss": 1.5228930711746216, |
| "eval_runtime": 75.2543, |
| "eval_samples_per_second": 1246.958, |
| "eval_steps_per_second": 4.877, |
| "step": 68000 |
| }, |
| { |
| "epoch": 11.57312925170068, |
| "grad_norm": 0.43190208077430725, |
| "learning_rate": 0.00011703596287703016, |
| "loss": 1.354, |
| "step": 68050 |
| }, |
| { |
| "epoch": 11.581632653061224, |
| "grad_norm": 0.41988831758499146, |
| "learning_rate": 0.00011674593967517402, |
| "loss": 1.3598, |
| "step": 68100 |
| }, |
| { |
| "epoch": 11.59013605442177, |
| "grad_norm": 0.41497015953063965, |
| "learning_rate": 0.00011645591647331787, |
| "loss": 1.3432, |
| "step": 68150 |
| }, |
| { |
| "epoch": 11.598639455782314, |
| "grad_norm": 0.42652803659439087, |
| "learning_rate": 0.00011616589327146172, |
| "loss": 1.3565, |
| "step": 68200 |
| }, |
| { |
| "epoch": 11.607142857142858, |
| "grad_norm": 0.43378782272338867, |
| "learning_rate": 0.00011587587006960557, |
| "loss": 1.3538, |
| "step": 68250 |
| }, |
| { |
| "epoch": 11.615646258503402, |
| "grad_norm": 0.4106690287590027, |
| "learning_rate": 0.00011558584686774943, |
| "loss": 1.3514, |
| "step": 68300 |
| }, |
| { |
| "epoch": 11.624149659863946, |
| "grad_norm": 0.4098680913448334, |
| "learning_rate": 0.00011529582366589328, |
| "loss": 1.3628, |
| "step": 68350 |
| }, |
| { |
| "epoch": 11.63265306122449, |
| "grad_norm": 0.4068293869495392, |
| "learning_rate": 0.00011500580046403712, |
| "loss": 1.3561, |
| "step": 68400 |
| }, |
| { |
| "epoch": 11.641156462585034, |
| "grad_norm": 0.4472546875476837, |
| "learning_rate": 0.00011471577726218098, |
| "loss": 1.3565, |
| "step": 68450 |
| }, |
| { |
| "epoch": 11.649659863945578, |
| "grad_norm": 0.4033961594104767, |
| "learning_rate": 0.00011442575406032482, |
| "loss": 1.3545, |
| "step": 68500 |
| }, |
| { |
| "epoch": 11.658163265306122, |
| "grad_norm": 0.4254186749458313, |
| "learning_rate": 0.00011413573085846868, |
| "loss": 1.3571, |
| "step": 68550 |
| }, |
| { |
| "epoch": 11.666666666666666, |
| "grad_norm": 0.412675142288208, |
| "learning_rate": 0.00011384570765661254, |
| "loss": 1.3513, |
| "step": 68600 |
| }, |
| { |
| "epoch": 11.67517006802721, |
| "grad_norm": 0.4190445840358734, |
| "learning_rate": 0.00011355568445475637, |
| "loss": 1.3549, |
| "step": 68650 |
| }, |
| { |
| "epoch": 11.683673469387756, |
| "grad_norm": 0.4222116470336914, |
| "learning_rate": 0.00011326566125290024, |
| "loss": 1.3561, |
| "step": 68700 |
| }, |
| { |
| "epoch": 11.6921768707483, |
| "grad_norm": 0.41736775636672974, |
| "learning_rate": 0.0001129756380510441, |
| "loss": 1.3541, |
| "step": 68750 |
| }, |
| { |
| "epoch": 11.700680272108844, |
| "grad_norm": 0.40362343192100525, |
| "learning_rate": 0.00011268561484918793, |
| "loss": 1.3489, |
| "step": 68800 |
| }, |
| { |
| "epoch": 11.709183673469388, |
| "grad_norm": 0.40062522888183594, |
| "learning_rate": 0.00011239559164733179, |
| "loss": 1.355, |
| "step": 68850 |
| }, |
| { |
| "epoch": 11.717687074829932, |
| "grad_norm": 0.40170249342918396, |
| "learning_rate": 0.00011210556844547564, |
| "loss": 1.3549, |
| "step": 68900 |
| }, |
| { |
| "epoch": 11.726190476190476, |
| "grad_norm": 0.4284593164920807, |
| "learning_rate": 0.00011181554524361949, |
| "loss": 1.3502, |
| "step": 68950 |
| }, |
| { |
| "epoch": 11.73469387755102, |
| "grad_norm": 0.4220130145549774, |
| "learning_rate": 0.00011152552204176335, |
| "loss": 1.3508, |
| "step": 69000 |
| }, |
| { |
| "epoch": 11.73469387755102, |
| "eval_loss": 1.517913818359375, |
| "eval_runtime": 75.269, |
| "eval_samples_per_second": 1246.716, |
| "eval_steps_per_second": 4.876, |
| "step": 69000 |
| }, |
| { |
| "epoch": 11.743197278911564, |
| "grad_norm": 0.41656509041786194, |
| "learning_rate": 0.0001112354988399072, |
| "loss": 1.3573, |
| "step": 69050 |
| }, |
| { |
| "epoch": 11.751700680272108, |
| "grad_norm": 0.4296277165412903, |
| "learning_rate": 0.00011094547563805104, |
| "loss": 1.3586, |
| "step": 69100 |
| }, |
| { |
| "epoch": 11.760204081632653, |
| "grad_norm": 0.5169918537139893, |
| "learning_rate": 0.0001106554524361949, |
| "loss": 1.3502, |
| "step": 69150 |
| }, |
| { |
| "epoch": 11.768707482993197, |
| "grad_norm": 0.40869855880737305, |
| "learning_rate": 0.00011036542923433875, |
| "loss": 1.3586, |
| "step": 69200 |
| }, |
| { |
| "epoch": 11.777210884353742, |
| "grad_norm": 0.41961830854415894, |
| "learning_rate": 0.0001100754060324826, |
| "loss": 1.3563, |
| "step": 69250 |
| }, |
| { |
| "epoch": 11.785714285714286, |
| "grad_norm": 0.40917539596557617, |
| "learning_rate": 0.00010978538283062645, |
| "loss": 1.3554, |
| "step": 69300 |
| }, |
| { |
| "epoch": 11.79421768707483, |
| "grad_norm": 0.4206612706184387, |
| "learning_rate": 0.00010949535962877031, |
| "loss": 1.3568, |
| "step": 69350 |
| }, |
| { |
| "epoch": 11.802721088435375, |
| "grad_norm": 0.41846826672554016, |
| "learning_rate": 0.00010920533642691415, |
| "loss": 1.3518, |
| "step": 69400 |
| }, |
| { |
| "epoch": 11.811224489795919, |
| "grad_norm": 0.4007967412471771, |
| "learning_rate": 0.000108915313225058, |
| "loss": 1.3488, |
| "step": 69450 |
| }, |
| { |
| "epoch": 11.819727891156463, |
| "grad_norm": 0.45398712158203125, |
| "learning_rate": 0.00010862529002320186, |
| "loss": 1.3522, |
| "step": 69500 |
| }, |
| { |
| "epoch": 11.828231292517007, |
| "grad_norm": 0.4125469923019409, |
| "learning_rate": 0.00010833526682134571, |
| "loss": 1.3475, |
| "step": 69550 |
| }, |
| { |
| "epoch": 11.83673469387755, |
| "grad_norm": 0.4218948781490326, |
| "learning_rate": 0.00010804524361948956, |
| "loss": 1.3476, |
| "step": 69600 |
| }, |
| { |
| "epoch": 11.845238095238095, |
| "grad_norm": 0.4130956828594208, |
| "learning_rate": 0.00010775522041763342, |
| "loss": 1.3406, |
| "step": 69650 |
| }, |
| { |
| "epoch": 11.853741496598639, |
| "grad_norm": 0.4252954125404358, |
| "learning_rate": 0.00010746519721577725, |
| "loss": 1.3497, |
| "step": 69700 |
| }, |
| { |
| "epoch": 11.862244897959183, |
| "grad_norm": 0.4810144603252411, |
| "learning_rate": 0.00010717517401392111, |
| "loss": 1.3513, |
| "step": 69750 |
| }, |
| { |
| "epoch": 11.870748299319727, |
| "grad_norm": 0.4183041453361511, |
| "learning_rate": 0.00010688515081206498, |
| "loss": 1.3464, |
| "step": 69800 |
| }, |
| { |
| "epoch": 11.879251700680273, |
| "grad_norm": 0.4231345057487488, |
| "learning_rate": 0.00010659512761020881, |
| "loss": 1.348, |
| "step": 69850 |
| }, |
| { |
| "epoch": 11.887755102040817, |
| "grad_norm": 0.4193786084651947, |
| "learning_rate": 0.00010630510440835267, |
| "loss": 1.3494, |
| "step": 69900 |
| }, |
| { |
| "epoch": 11.896258503401361, |
| "grad_norm": 0.41639646887779236, |
| "learning_rate": 0.00010601508120649653, |
| "loss": 1.3489, |
| "step": 69950 |
| }, |
| { |
| "epoch": 11.904761904761905, |
| "grad_norm": 0.4062506854534149, |
| "learning_rate": 0.00010572505800464037, |
| "loss": 1.3478, |
| "step": 70000 |
| }, |
| { |
| "epoch": 11.904761904761905, |
| "eval_loss": 1.5139620304107666, |
| "eval_runtime": 75.2979, |
| "eval_samples_per_second": 1246.237, |
| "eval_steps_per_second": 4.874, |
| "step": 70000 |
| }, |
| { |
| "epoch": 11.91326530612245, |
| "grad_norm": 0.49928614497184753, |
| "learning_rate": 0.00010543503480278423, |
| "loss": 1.3525, |
| "step": 70050 |
| }, |
| { |
| "epoch": 11.921768707482993, |
| "grad_norm": 0.4152166247367859, |
| "learning_rate": 0.0001051508120649652, |
| "loss": 1.3526, |
| "step": 70100 |
| }, |
| { |
| "epoch": 11.930272108843537, |
| "grad_norm": 0.42939451336860657, |
| "learning_rate": 0.00010486078886310905, |
| "loss": 1.352, |
| "step": 70150 |
| }, |
| { |
| "epoch": 11.938775510204081, |
| "grad_norm": 0.40568697452545166, |
| "learning_rate": 0.0001045707656612529, |
| "loss": 1.3493, |
| "step": 70200 |
| }, |
| { |
| "epoch": 11.947278911564625, |
| "grad_norm": 0.43555957078933716, |
| "learning_rate": 0.00010428074245939675, |
| "loss": 1.3425, |
| "step": 70250 |
| }, |
| { |
| "epoch": 11.95578231292517, |
| "grad_norm": 0.4200745224952698, |
| "learning_rate": 0.00010399071925754061, |
| "loss": 1.3477, |
| "step": 70300 |
| }, |
| { |
| "epoch": 11.964285714285714, |
| "grad_norm": 0.43165090680122375, |
| "learning_rate": 0.00010370069605568446, |
| "loss": 1.3522, |
| "step": 70350 |
| }, |
| { |
| "epoch": 11.972789115646258, |
| "grad_norm": 0.41488534212112427, |
| "learning_rate": 0.0001034106728538283, |
| "loss": 1.3509, |
| "step": 70400 |
| }, |
| { |
| "epoch": 11.981292517006803, |
| "grad_norm": 0.4268278479576111, |
| "learning_rate": 0.00010312064965197215, |
| "loss": 1.3567, |
| "step": 70450 |
| }, |
| { |
| "epoch": 11.989795918367347, |
| "grad_norm": 0.4317072927951813, |
| "learning_rate": 0.00010283062645011601, |
| "loss": 1.3441, |
| "step": 70500 |
| }, |
| { |
| "epoch": 11.998299319727892, |
| "grad_norm": 0.39999693632125854, |
| "learning_rate": 0.00010254060324825986, |
| "loss": 1.3486, |
| "step": 70550 |
| }, |
| { |
| "epoch": 12.006802721088436, |
| "grad_norm": 0.39868417382240295, |
| "learning_rate": 0.00010225058004640371, |
| "loss": 1.3422, |
| "step": 70600 |
| }, |
| { |
| "epoch": 12.01530612244898, |
| "grad_norm": 0.4184889793395996, |
| "learning_rate": 0.00010196055684454757, |
| "loss": 1.342, |
| "step": 70650 |
| }, |
| { |
| "epoch": 12.023809523809524, |
| "grad_norm": 0.4223806858062744, |
| "learning_rate": 0.00010167053364269142, |
| "loss": 1.339, |
| "step": 70700 |
| }, |
| { |
| "epoch": 12.032312925170068, |
| "grad_norm": 0.413501113653183, |
| "learning_rate": 0.00010138051044083527, |
| "loss": 1.3458, |
| "step": 70750 |
| }, |
| { |
| "epoch": 12.040816326530612, |
| "grad_norm": 0.41243672370910645, |
| "learning_rate": 0.00010109048723897913, |
| "loss": 1.3497, |
| "step": 70800 |
| }, |
| { |
| "epoch": 12.049319727891156, |
| "grad_norm": 0.4220627546310425, |
| "learning_rate": 0.00010080046403712296, |
| "loss": 1.3465, |
| "step": 70850 |
| }, |
| { |
| "epoch": 12.0578231292517, |
| "grad_norm": 0.4198216199874878, |
| "learning_rate": 0.00010051044083526682, |
| "loss": 1.3477, |
| "step": 70900 |
| }, |
| { |
| "epoch": 12.066326530612244, |
| "grad_norm": 0.4057486951351166, |
| "learning_rate": 0.00010022041763341068, |
| "loss": 1.3406, |
| "step": 70950 |
| }, |
| { |
| "epoch": 12.07482993197279, |
| "grad_norm": 0.4336649179458618, |
| "learning_rate": 9.993039443155452e-05, |
| "loss": 1.342, |
| "step": 71000 |
| }, |
| { |
| "epoch": 12.07482993197279, |
| "eval_loss": 1.5144281387329102, |
| "eval_runtime": 75.2698, |
| "eval_samples_per_second": 1246.702, |
| "eval_steps_per_second": 4.876, |
| "step": 71000 |
| }, |
| { |
| "epoch": 12.083333333333334, |
| "grad_norm": 0.4182901680469513, |
| "learning_rate": 9.964037122969838e-05, |
| "loss": 1.3432, |
| "step": 71050 |
| }, |
| { |
| "epoch": 12.091836734693878, |
| "grad_norm": 0.39864903688430786, |
| "learning_rate": 9.935034802784224e-05, |
| "loss": 1.3367, |
| "step": 71100 |
| }, |
| { |
| "epoch": 12.100340136054422, |
| "grad_norm": 0.40879112482070923, |
| "learning_rate": 9.906032482598607e-05, |
| "loss": 1.3431, |
| "step": 71150 |
| }, |
| { |
| "epoch": 12.108843537414966, |
| "grad_norm": 0.412240594625473, |
| "learning_rate": 9.877030162412993e-05, |
| "loss": 1.3406, |
| "step": 71200 |
| }, |
| { |
| "epoch": 12.11734693877551, |
| "grad_norm": 0.4245721995830536, |
| "learning_rate": 9.848027842227378e-05, |
| "loss": 1.3453, |
| "step": 71250 |
| }, |
| { |
| "epoch": 12.125850340136054, |
| "grad_norm": 0.40284958481788635, |
| "learning_rate": 9.819025522041763e-05, |
| "loss": 1.3383, |
| "step": 71300 |
| }, |
| { |
| "epoch": 12.134353741496598, |
| "grad_norm": 0.57562255859375, |
| "learning_rate": 9.790023201856149e-05, |
| "loss": 1.3417, |
| "step": 71350 |
| }, |
| { |
| "epoch": 12.142857142857142, |
| "grad_norm": 0.4117988348007202, |
| "learning_rate": 9.761020881670534e-05, |
| "loss": 1.3371, |
| "step": 71400 |
| }, |
| { |
| "epoch": 12.151360544217686, |
| "grad_norm": 0.4112182557582855, |
| "learning_rate": 9.732018561484918e-05, |
| "loss": 1.3447, |
| "step": 71450 |
| }, |
| { |
| "epoch": 12.15986394557823, |
| "grad_norm": 0.41666939854621887, |
| "learning_rate": 9.703016241299305e-05, |
| "loss": 1.3408, |
| "step": 71500 |
| }, |
| { |
| "epoch": 12.168367346938776, |
| "grad_norm": 0.4010598063468933, |
| "learning_rate": 9.674013921113689e-05, |
| "loss": 1.3443, |
| "step": 71550 |
| }, |
| { |
| "epoch": 12.17687074829932, |
| "grad_norm": 0.41796889901161194, |
| "learning_rate": 9.645011600928074e-05, |
| "loss": 1.3343, |
| "step": 71600 |
| }, |
| { |
| "epoch": 12.185374149659864, |
| "grad_norm": 0.42123886942863464, |
| "learning_rate": 9.616009280742459e-05, |
| "loss": 1.3401, |
| "step": 71650 |
| }, |
| { |
| "epoch": 12.193877551020408, |
| "grad_norm": 0.40508341789245605, |
| "learning_rate": 9.587006960556845e-05, |
| "loss": 1.3437, |
| "step": 71700 |
| }, |
| { |
| "epoch": 12.202380952380953, |
| "grad_norm": 0.44286808371543884, |
| "learning_rate": 9.558004640371231e-05, |
| "loss": 1.3419, |
| "step": 71750 |
| }, |
| { |
| "epoch": 12.210884353741497, |
| "grad_norm": 0.40285545587539673, |
| "learning_rate": 9.529002320185614e-05, |
| "loss": 1.333, |
| "step": 71800 |
| }, |
| { |
| "epoch": 12.21938775510204, |
| "grad_norm": 0.39993083477020264, |
| "learning_rate": 9.5e-05, |
| "loss": 1.3402, |
| "step": 71850 |
| }, |
| { |
| "epoch": 12.227891156462585, |
| "grad_norm": 0.4155488610267639, |
| "learning_rate": 9.470997679814387e-05, |
| "loss": 1.3378, |
| "step": 71900 |
| }, |
| { |
| "epoch": 12.236394557823129, |
| "grad_norm": 0.5050736665725708, |
| "learning_rate": 9.44199535962877e-05, |
| "loss": 1.3354, |
| "step": 71950 |
| }, |
| { |
| "epoch": 12.244897959183673, |
| "grad_norm": 0.4185927212238312, |
| "learning_rate": 9.412993039443156e-05, |
| "loss": 1.3357, |
| "step": 72000 |
| }, |
| { |
| "epoch": 12.244897959183673, |
| "eval_loss": 1.5064618587493896, |
| "eval_runtime": 75.3162, |
| "eval_samples_per_second": 1245.933, |
| "eval_steps_per_second": 4.873, |
| "step": 72000 |
| }, |
| { |
| "epoch": 12.253401360544217, |
| "grad_norm": 0.41753119230270386, |
| "learning_rate": 9.383990719257541e-05, |
| "loss": 1.3352, |
| "step": 72050 |
| }, |
| { |
| "epoch": 12.261904761904763, |
| "grad_norm": 0.4204673171043396, |
| "learning_rate": 9.354988399071926e-05, |
| "loss": 1.3436, |
| "step": 72100 |
| }, |
| { |
| "epoch": 12.270408163265307, |
| "grad_norm": 0.4101797044277191, |
| "learning_rate": 9.325986078886312e-05, |
| "loss": 1.3385, |
| "step": 72150 |
| }, |
| { |
| "epoch": 12.27891156462585, |
| "grad_norm": 0.4880974292755127, |
| "learning_rate": 9.296983758700696e-05, |
| "loss": 1.3322, |
| "step": 72200 |
| }, |
| { |
| "epoch": 12.287414965986395, |
| "grad_norm": 0.415211945772171, |
| "learning_rate": 9.267981438515081e-05, |
| "loss": 1.3409, |
| "step": 72250 |
| }, |
| { |
| "epoch": 12.295918367346939, |
| "grad_norm": 0.41376402974128723, |
| "learning_rate": 9.238979118329467e-05, |
| "loss": 1.3392, |
| "step": 72300 |
| }, |
| { |
| "epoch": 12.304421768707483, |
| "grad_norm": 0.43394094705581665, |
| "learning_rate": 9.209976798143852e-05, |
| "loss": 1.3345, |
| "step": 72350 |
| }, |
| { |
| "epoch": 12.312925170068027, |
| "grad_norm": 0.43497174978256226, |
| "learning_rate": 9.180974477958237e-05, |
| "loss": 1.3381, |
| "step": 72400 |
| }, |
| { |
| "epoch": 12.321428571428571, |
| "grad_norm": 0.4110974371433258, |
| "learning_rate": 9.151972157772622e-05, |
| "loss": 1.3375, |
| "step": 72450 |
| }, |
| { |
| "epoch": 12.329931972789115, |
| "grad_norm": 0.4111982583999634, |
| "learning_rate": 9.12354988399072e-05, |
| "loss": 1.3352, |
| "step": 72500 |
| }, |
| { |
| "epoch": 12.33843537414966, |
| "grad_norm": 0.4248039126396179, |
| "learning_rate": 9.094547563805104e-05, |
| "loss": 1.3353, |
| "step": 72550 |
| }, |
| { |
| "epoch": 12.346938775510203, |
| "grad_norm": 0.4378061294555664, |
| "learning_rate": 9.06554524361949e-05, |
| "loss": 1.3397, |
| "step": 72600 |
| }, |
| { |
| "epoch": 12.35544217687075, |
| "grad_norm": 0.41765543818473816, |
| "learning_rate": 9.036542923433875e-05, |
| "loss": 1.3372, |
| "step": 72650 |
| }, |
| { |
| "epoch": 12.363945578231293, |
| "grad_norm": 0.4655171036720276, |
| "learning_rate": 9.00754060324826e-05, |
| "loss": 1.337, |
| "step": 72700 |
| }, |
| { |
| "epoch": 12.372448979591837, |
| "grad_norm": 0.4178127348423004, |
| "learning_rate": 8.978538283062646e-05, |
| "loss": 1.3362, |
| "step": 72750 |
| }, |
| { |
| "epoch": 12.380952380952381, |
| "grad_norm": 0.4332452118396759, |
| "learning_rate": 8.94953596287703e-05, |
| "loss": 1.3379, |
| "step": 72800 |
| }, |
| { |
| "epoch": 12.389455782312925, |
| "grad_norm": 0.44955676794052124, |
| "learning_rate": 8.920533642691416e-05, |
| "loss": 1.3384, |
| "step": 72850 |
| }, |
| { |
| "epoch": 12.39795918367347, |
| "grad_norm": 0.42252740263938904, |
| "learning_rate": 8.891531322505802e-05, |
| "loss": 1.3398, |
| "step": 72900 |
| }, |
| { |
| "epoch": 12.406462585034014, |
| "grad_norm": 0.3996742069721222, |
| "learning_rate": 8.862529002320185e-05, |
| "loss": 1.3294, |
| "step": 72950 |
| }, |
| { |
| "epoch": 12.414965986394558, |
| "grad_norm": 0.5868241786956787, |
| "learning_rate": 8.833526682134571e-05, |
| "loss": 1.3347, |
| "step": 73000 |
| }, |
| { |
| "epoch": 12.414965986394558, |
| "eval_loss": 1.4998681545257568, |
| "eval_runtime": 75.3074, |
| "eval_samples_per_second": 1246.079, |
| "eval_steps_per_second": 4.873, |
| "step": 73000 |
| }, |
| { |
| "epoch": 12.423469387755102, |
| "grad_norm": 0.4118203818798065, |
| "learning_rate": 8.804524361948957e-05, |
| "loss": 1.3422, |
| "step": 73050 |
| }, |
| { |
| "epoch": 12.431972789115646, |
| "grad_norm": 0.5292280316352844, |
| "learning_rate": 8.775522041763341e-05, |
| "loss": 1.3353, |
| "step": 73100 |
| }, |
| { |
| "epoch": 12.44047619047619, |
| "grad_norm": 0.42021283507347107, |
| "learning_rate": 8.747099767981439e-05, |
| "loss": 1.3379, |
| "step": 73150 |
| }, |
| { |
| "epoch": 12.448979591836734, |
| "grad_norm": 0.42471328377723694, |
| "learning_rate": 8.718097447795824e-05, |
| "loss": 1.3377, |
| "step": 73200 |
| }, |
| { |
| "epoch": 12.45748299319728, |
| "grad_norm": 0.4189905524253845, |
| "learning_rate": 8.68909512761021e-05, |
| "loss": 1.3338, |
| "step": 73250 |
| }, |
| { |
| "epoch": 12.465986394557824, |
| "grad_norm": 0.4137709438800812, |
| "learning_rate": 8.660092807424594e-05, |
| "loss": 1.3341, |
| "step": 73300 |
| }, |
| { |
| "epoch": 12.474489795918368, |
| "grad_norm": 0.41046592593193054, |
| "learning_rate": 8.631090487238979e-05, |
| "loss": 1.3325, |
| "step": 73350 |
| }, |
| { |
| "epoch": 12.482993197278912, |
| "grad_norm": 0.40613409876823425, |
| "learning_rate": 8.602088167053364e-05, |
| "loss": 1.334, |
| "step": 73400 |
| }, |
| { |
| "epoch": 12.491496598639456, |
| "grad_norm": 0.4053729474544525, |
| "learning_rate": 8.57308584686775e-05, |
| "loss": 1.3297, |
| "step": 73450 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.4099692404270172, |
| "learning_rate": 8.544083526682135e-05, |
| "loss": 1.3303, |
| "step": 73500 |
| }, |
| { |
| "epoch": 12.508503401360544, |
| "grad_norm": 0.40552228689193726, |
| "learning_rate": 8.51508120649652e-05, |
| "loss": 1.3286, |
| "step": 73550 |
| }, |
| { |
| "epoch": 12.517006802721088, |
| "grad_norm": 0.41349565982818604, |
| "learning_rate": 8.486078886310906e-05, |
| "loss": 1.327, |
| "step": 73600 |
| }, |
| { |
| "epoch": 12.525510204081632, |
| "grad_norm": 0.40340515971183777, |
| "learning_rate": 8.45707656612529e-05, |
| "loss": 1.3282, |
| "step": 73650 |
| }, |
| { |
| "epoch": 12.534013605442176, |
| "grad_norm": 0.4043871760368347, |
| "learning_rate": 8.428074245939675e-05, |
| "loss": 1.3316, |
| "step": 73700 |
| }, |
| { |
| "epoch": 12.54251700680272, |
| "grad_norm": 0.41814517974853516, |
| "learning_rate": 8.399071925754061e-05, |
| "loss": 1.3278, |
| "step": 73750 |
| }, |
| { |
| "epoch": 12.551020408163264, |
| "grad_norm": 0.41396060585975647, |
| "learning_rate": 8.370069605568445e-05, |
| "loss": 1.3283, |
| "step": 73800 |
| }, |
| { |
| "epoch": 12.55952380952381, |
| "grad_norm": 0.4073675572872162, |
| "learning_rate": 8.341067285382831e-05, |
| "loss": 1.3274, |
| "step": 73850 |
| }, |
| { |
| "epoch": 12.568027210884354, |
| "grad_norm": 0.40986111760139465, |
| "learning_rate": 8.312064965197217e-05, |
| "loss": 1.3252, |
| "step": 73900 |
| }, |
| { |
| "epoch": 12.576530612244898, |
| "grad_norm": 0.3996681869029999, |
| "learning_rate": 8.2830626450116e-05, |
| "loss": 1.3308, |
| "step": 73950 |
| }, |
| { |
| "epoch": 12.585034013605442, |
| "grad_norm": 0.4171459376811981, |
| "learning_rate": 8.254060324825986e-05, |
| "loss": 1.3263, |
| "step": 74000 |
| }, |
| { |
| "epoch": 12.585034013605442, |
| "eval_loss": 1.4951106309890747, |
| "eval_runtime": 75.2691, |
| "eval_samples_per_second": 1246.714, |
| "eval_steps_per_second": 4.876, |
| "step": 74000 |
| }, |
| { |
| "epoch": 12.593537414965986, |
| "grad_norm": 0.4198111593723297, |
| "learning_rate": 8.225058004640372e-05, |
| "loss": 1.3306, |
| "step": 74050 |
| }, |
| { |
| "epoch": 12.60204081632653, |
| "grad_norm": 0.4122917056083679, |
| "learning_rate": 8.196055684454756e-05, |
| "loss": 1.3325, |
| "step": 74100 |
| }, |
| { |
| "epoch": 12.610544217687075, |
| "grad_norm": 0.43273040652275085, |
| "learning_rate": 8.167053364269142e-05, |
| "loss": 1.3385, |
| "step": 74150 |
| }, |
| { |
| "epoch": 12.619047619047619, |
| "grad_norm": 0.4447864294052124, |
| "learning_rate": 8.138051044083527e-05, |
| "loss": 1.3386, |
| "step": 74200 |
| }, |
| { |
| "epoch": 12.627551020408163, |
| "grad_norm": 0.41947802901268005, |
| "learning_rate": 8.109048723897911e-05, |
| "loss": 1.3313, |
| "step": 74250 |
| }, |
| { |
| "epoch": 12.636054421768707, |
| "grad_norm": 0.41405126452445984, |
| "learning_rate": 8.080046403712298e-05, |
| "loss": 1.3331, |
| "step": 74300 |
| }, |
| { |
| "epoch": 12.64455782312925, |
| "grad_norm": 0.407975971698761, |
| "learning_rate": 8.051044083526682e-05, |
| "loss": 1.3243, |
| "step": 74350 |
| }, |
| { |
| "epoch": 12.653061224489797, |
| "grad_norm": 0.3966875374317169, |
| "learning_rate": 8.022041763341067e-05, |
| "loss": 1.3276, |
| "step": 74400 |
| }, |
| { |
| "epoch": 12.66156462585034, |
| "grad_norm": 0.43828269839286804, |
| "learning_rate": 7.993619489559165e-05, |
| "loss": 1.335, |
| "step": 74450 |
| }, |
| { |
| "epoch": 12.670068027210885, |
| "grad_norm": 0.40009811520576477, |
| "learning_rate": 7.96461716937355e-05, |
| "loss": 1.3217, |
| "step": 74500 |
| }, |
| { |
| "epoch": 12.678571428571429, |
| "grad_norm": 0.40851446986198425, |
| "learning_rate": 7.935614849187935e-05, |
| "loss": 1.3338, |
| "step": 74550 |
| }, |
| { |
| "epoch": 12.687074829931973, |
| "grad_norm": 0.4100090265274048, |
| "learning_rate": 7.906612529002321e-05, |
| "loss": 1.334, |
| "step": 74600 |
| }, |
| { |
| "epoch": 12.695578231292517, |
| "grad_norm": 0.41323432326316833, |
| "learning_rate": 7.877610208816705e-05, |
| "loss": 1.3279, |
| "step": 74650 |
| }, |
| { |
| "epoch": 12.704081632653061, |
| "grad_norm": 0.4219471514225006, |
| "learning_rate": 7.84860788863109e-05, |
| "loss": 1.3265, |
| "step": 74700 |
| }, |
| { |
| "epoch": 12.712585034013605, |
| "grad_norm": 0.4054030478000641, |
| "learning_rate": 7.819605568445476e-05, |
| "loss": 1.3275, |
| "step": 74750 |
| }, |
| { |
| "epoch": 12.72108843537415, |
| "grad_norm": 0.4156668782234192, |
| "learning_rate": 7.790603248259861e-05, |
| "loss": 1.3246, |
| "step": 74800 |
| }, |
| { |
| "epoch": 12.729591836734693, |
| "grad_norm": 0.4128476083278656, |
| "learning_rate": 7.761600928074246e-05, |
| "loss": 1.325, |
| "step": 74850 |
| }, |
| { |
| "epoch": 12.738095238095237, |
| "grad_norm": 0.4204627275466919, |
| "learning_rate": 7.732598607888632e-05, |
| "loss": 1.3303, |
| "step": 74900 |
| }, |
| { |
| "epoch": 12.746598639455783, |
| "grad_norm": 0.4113794267177582, |
| "learning_rate": 7.703596287703015e-05, |
| "loss": 1.3297, |
| "step": 74950 |
| }, |
| { |
| "epoch": 12.755102040816327, |
| "grad_norm": 0.4143586754798889, |
| "learning_rate": 7.674593967517401e-05, |
| "loss": 1.323, |
| "step": 75000 |
| }, |
| { |
| "epoch": 12.755102040816327, |
| "eval_loss": 1.4961031675338745, |
| "eval_runtime": 75.2707, |
| "eval_samples_per_second": 1246.687, |
| "eval_steps_per_second": 4.876, |
| "step": 75000 |
| }, |
| { |
| "epoch": 12.763605442176871, |
| "grad_norm": 0.4443499743938446, |
| "learning_rate": 7.645591647331788e-05, |
| "loss": 1.3285, |
| "step": 75050 |
| }, |
| { |
| "epoch": 12.772108843537415, |
| "grad_norm": 0.4180935025215149, |
| "learning_rate": 7.616589327146171e-05, |
| "loss": 1.3299, |
| "step": 75100 |
| }, |
| { |
| "epoch": 12.78061224489796, |
| "grad_norm": 0.41669797897338867, |
| "learning_rate": 7.587587006960557e-05, |
| "loss": 1.3314, |
| "step": 75150 |
| }, |
| { |
| "epoch": 12.789115646258503, |
| "grad_norm": 0.40955501794815063, |
| "learning_rate": 7.558584686774943e-05, |
| "loss": 1.3285, |
| "step": 75200 |
| }, |
| { |
| "epoch": 12.797619047619047, |
| "grad_norm": 0.4140739142894745, |
| "learning_rate": 7.529582366589327e-05, |
| "loss": 1.331, |
| "step": 75250 |
| }, |
| { |
| "epoch": 12.806122448979592, |
| "grad_norm": 0.4092477858066559, |
| "learning_rate": 7.500580046403713e-05, |
| "loss": 1.3268, |
| "step": 75300 |
| }, |
| { |
| "epoch": 12.814625850340136, |
| "grad_norm": 0.412955105304718, |
| "learning_rate": 7.471577726218097e-05, |
| "loss": 1.324, |
| "step": 75350 |
| }, |
| { |
| "epoch": 12.82312925170068, |
| "grad_norm": 0.4366496205329895, |
| "learning_rate": 7.442575406032482e-05, |
| "loss": 1.3257, |
| "step": 75400 |
| }, |
| { |
| "epoch": 12.831632653061224, |
| "grad_norm": 0.4090063273906708, |
| "learning_rate": 7.413573085846868e-05, |
| "loss": 1.3261, |
| "step": 75450 |
| }, |
| { |
| "epoch": 12.84013605442177, |
| "grad_norm": 0.41486817598342896, |
| "learning_rate": 7.384570765661253e-05, |
| "loss": 1.3224, |
| "step": 75500 |
| }, |
| { |
| "epoch": 12.848639455782314, |
| "grad_norm": 0.40485501289367676, |
| "learning_rate": 7.355568445475638e-05, |
| "loss": 1.3285, |
| "step": 75550 |
| }, |
| { |
| "epoch": 12.857142857142858, |
| "grad_norm": 0.43988388776779175, |
| "learning_rate": 7.326566125290024e-05, |
| "loss": 1.3245, |
| "step": 75600 |
| }, |
| { |
| "epoch": 12.865646258503402, |
| "grad_norm": 0.43231526017189026, |
| "learning_rate": 7.297563805104409e-05, |
| "loss": 1.3266, |
| "step": 75650 |
| }, |
| { |
| "epoch": 12.874149659863946, |
| "grad_norm": 0.40446582436561584, |
| "learning_rate": 7.268561484918793e-05, |
| "loss": 1.3248, |
| "step": 75700 |
| }, |
| { |
| "epoch": 12.88265306122449, |
| "grad_norm": 0.40424802899360657, |
| "learning_rate": 7.239559164733178e-05, |
| "loss": 1.3187, |
| "step": 75750 |
| }, |
| { |
| "epoch": 12.891156462585034, |
| "grad_norm": 0.410151869058609, |
| "learning_rate": 7.210556844547564e-05, |
| "loss": 1.3212, |
| "step": 75800 |
| }, |
| { |
| "epoch": 12.899659863945578, |
| "grad_norm": 0.41777828335762024, |
| "learning_rate": 7.181554524361949e-05, |
| "loss": 1.3204, |
| "step": 75850 |
| }, |
| { |
| "epoch": 12.908163265306122, |
| "grad_norm": 0.4275606870651245, |
| "learning_rate": 7.152552204176334e-05, |
| "loss": 1.3239, |
| "step": 75900 |
| }, |
| { |
| "epoch": 12.916666666666666, |
| "grad_norm": 0.4166664481163025, |
| "learning_rate": 7.12354988399072e-05, |
| "loss": 1.3174, |
| "step": 75950 |
| }, |
| { |
| "epoch": 12.92517006802721, |
| "grad_norm": 0.3995276391506195, |
| "learning_rate": 7.094547563805105e-05, |
| "loss": 1.3231, |
| "step": 76000 |
| }, |
| { |
| "epoch": 12.92517006802721, |
| "eval_loss": 1.490061640739441, |
| "eval_runtime": 75.2857, |
| "eval_samples_per_second": 1246.439, |
| "eval_steps_per_second": 4.875, |
| "step": 76000 |
| }, |
| { |
| "epoch": 12.933673469387756, |
| "grad_norm": 0.4179072082042694, |
| "learning_rate": 7.06554524361949e-05, |
| "loss": 1.3217, |
| "step": 76050 |
| }, |
| { |
| "epoch": 12.9421768707483, |
| "grad_norm": 0.43183988332748413, |
| "learning_rate": 7.036542923433875e-05, |
| "loss": 1.3218, |
| "step": 76100 |
| }, |
| { |
| "epoch": 12.950680272108844, |
| "grad_norm": 0.4066503345966339, |
| "learning_rate": 7.007540603248259e-05, |
| "loss": 1.326, |
| "step": 76150 |
| }, |
| { |
| "epoch": 12.959183673469388, |
| "grad_norm": 0.4215805232524872, |
| "learning_rate": 6.978538283062645e-05, |
| "loss": 1.325, |
| "step": 76200 |
| }, |
| { |
| "epoch": 12.967687074829932, |
| "grad_norm": 0.4248071312904358, |
| "learning_rate": 6.949535962877031e-05, |
| "loss": 1.3179, |
| "step": 76250 |
| }, |
| { |
| "epoch": 12.976190476190476, |
| "grad_norm": 0.42110559344291687, |
| "learning_rate": 6.920533642691414e-05, |
| "loss": 1.3292, |
| "step": 76300 |
| }, |
| { |
| "epoch": 12.98469387755102, |
| "grad_norm": 0.4082868695259094, |
| "learning_rate": 6.8915313225058e-05, |
| "loss": 1.3254, |
| "step": 76350 |
| }, |
| { |
| "epoch": 12.993197278911564, |
| "grad_norm": 0.4124908447265625, |
| "learning_rate": 6.862529002320187e-05, |
| "loss": 1.3227, |
| "step": 76400 |
| }, |
| { |
| "epoch": 13.001700680272108, |
| "grad_norm": 0.5426220297813416, |
| "learning_rate": 6.83352668213457e-05, |
| "loss": 1.3198, |
| "step": 76450 |
| }, |
| { |
| "epoch": 13.010204081632653, |
| "grad_norm": 0.4062071442604065, |
| "learning_rate": 6.804524361948956e-05, |
| "loss": 1.3183, |
| "step": 76500 |
| }, |
| { |
| "epoch": 13.018707482993197, |
| "grad_norm": 0.40301281213760376, |
| "learning_rate": 6.775522041763341e-05, |
| "loss": 1.3232, |
| "step": 76550 |
| }, |
| { |
| "epoch": 13.02721088435374, |
| "grad_norm": 0.44012153148651123, |
| "learning_rate": 6.746519721577726e-05, |
| "loss": 1.318, |
| "step": 76600 |
| }, |
| { |
| "epoch": 13.035714285714286, |
| "grad_norm": 0.41154050827026367, |
| "learning_rate": 6.717517401392112e-05, |
| "loss": 1.3176, |
| "step": 76650 |
| }, |
| { |
| "epoch": 13.04421768707483, |
| "grad_norm": 0.42961668968200684, |
| "learning_rate": 6.688515081206497e-05, |
| "loss": 1.3188, |
| "step": 76700 |
| }, |
| { |
| "epoch": 13.052721088435375, |
| "grad_norm": 0.41780275106430054, |
| "learning_rate": 6.659512761020881e-05, |
| "loss": 1.3191, |
| "step": 76750 |
| }, |
| { |
| "epoch": 13.061224489795919, |
| "grad_norm": 0.4239339828491211, |
| "learning_rate": 6.630510440835267e-05, |
| "loss": 1.3225, |
| "step": 76800 |
| }, |
| { |
| "epoch": 13.069727891156463, |
| "grad_norm": 0.40867921710014343, |
| "learning_rate": 6.601508120649652e-05, |
| "loss": 1.3254, |
| "step": 76850 |
| }, |
| { |
| "epoch": 13.078231292517007, |
| "grad_norm": 0.4113767743110657, |
| "learning_rate": 6.572505800464038e-05, |
| "loss": 1.3151, |
| "step": 76900 |
| }, |
| { |
| "epoch": 13.08673469387755, |
| "grad_norm": 0.4053804874420166, |
| "learning_rate": 6.543503480278422e-05, |
| "loss": 1.3196, |
| "step": 76950 |
| }, |
| { |
| "epoch": 13.095238095238095, |
| "grad_norm": 0.40238648653030396, |
| "learning_rate": 6.514501160092808e-05, |
| "loss": 1.3119, |
| "step": 77000 |
| }, |
| { |
| "epoch": 13.095238095238095, |
| "eval_loss": 1.486020803451538, |
| "eval_runtime": 75.2657, |
| "eval_samples_per_second": 1246.769, |
| "eval_steps_per_second": 4.876, |
| "step": 77000 |
| }, |
| { |
| "epoch": 13.103741496598639, |
| "grad_norm": 0.4077170193195343, |
| "learning_rate": 6.485498839907194e-05, |
| "loss": 1.3185, |
| "step": 77050 |
| }, |
| { |
| "epoch": 13.112244897959183, |
| "grad_norm": 0.40546542406082153, |
| "learning_rate": 6.456496519721577e-05, |
| "loss": 1.3221, |
| "step": 77100 |
| }, |
| { |
| "epoch": 13.120748299319727, |
| "grad_norm": 0.4073767364025116, |
| "learning_rate": 6.427494199535963e-05, |
| "loss": 1.3168, |
| "step": 77150 |
| }, |
| { |
| "epoch": 13.129251700680273, |
| "grad_norm": 0.4032026529312134, |
| "learning_rate": 6.39849187935035e-05, |
| "loss": 1.3162, |
| "step": 77200 |
| }, |
| { |
| "epoch": 13.137755102040817, |
| "grad_norm": 0.4208144545555115, |
| "learning_rate": 6.369489559164733e-05, |
| "loss": 1.3218, |
| "step": 77250 |
| }, |
| { |
| "epoch": 13.146258503401361, |
| "grad_norm": 0.4274178743362427, |
| "learning_rate": 6.340487238979119e-05, |
| "loss": 1.3165, |
| "step": 77300 |
| }, |
| { |
| "epoch": 13.154761904761905, |
| "grad_norm": 0.41824012994766235, |
| "learning_rate": 6.311484918793504e-05, |
| "loss": 1.3185, |
| "step": 77350 |
| }, |
| { |
| "epoch": 13.16326530612245, |
| "grad_norm": 0.395487517118454, |
| "learning_rate": 6.282482598607888e-05, |
| "loss": 1.3115, |
| "step": 77400 |
| }, |
| { |
| "epoch": 13.171768707482993, |
| "grad_norm": 0.4073810577392578, |
| "learning_rate": 6.253480278422275e-05, |
| "loss": 1.3156, |
| "step": 77450 |
| }, |
| { |
| "epoch": 13.180272108843537, |
| "grad_norm": 0.4188859760761261, |
| "learning_rate": 6.224477958236659e-05, |
| "loss": 1.3164, |
| "step": 77500 |
| }, |
| { |
| "epoch": 13.188775510204081, |
| "grad_norm": 0.4151723086833954, |
| "learning_rate": 6.195475638051044e-05, |
| "loss": 1.316, |
| "step": 77550 |
| }, |
| { |
| "epoch": 13.197278911564625, |
| "grad_norm": 0.42075613141059875, |
| "learning_rate": 6.166473317865429e-05, |
| "loss": 1.3136, |
| "step": 77600 |
| }, |
| { |
| "epoch": 13.20578231292517, |
| "grad_norm": 0.41158735752105713, |
| "learning_rate": 6.137470997679815e-05, |
| "loss": 1.3157, |
| "step": 77650 |
| }, |
| { |
| "epoch": 13.214285714285714, |
| "grad_norm": 0.4071219563484192, |
| "learning_rate": 6.1084686774942e-05, |
| "loss": 1.3167, |
| "step": 77700 |
| }, |
| { |
| "epoch": 13.22278911564626, |
| "grad_norm": 0.430393248796463, |
| "learning_rate": 6.079466357308585e-05, |
| "loss": 1.3142, |
| "step": 77750 |
| }, |
| { |
| "epoch": 13.231292517006803, |
| "grad_norm": 0.402025043964386, |
| "learning_rate": 6.05046403712297e-05, |
| "loss": 1.3149, |
| "step": 77800 |
| }, |
| { |
| "epoch": 13.239795918367347, |
| "grad_norm": 0.39697858691215515, |
| "learning_rate": 6.0214617169373546e-05, |
| "loss": 1.3163, |
| "step": 77850 |
| }, |
| { |
| "epoch": 13.248299319727892, |
| "grad_norm": 0.45332589745521545, |
| "learning_rate": 5.992459396751741e-05, |
| "loss": 1.31, |
| "step": 77900 |
| }, |
| { |
| "epoch": 13.256802721088436, |
| "grad_norm": 0.42384395003318787, |
| "learning_rate": 5.9634570765661255e-05, |
| "loss": 1.3102, |
| "step": 77950 |
| }, |
| { |
| "epoch": 13.26530612244898, |
| "grad_norm": 0.42941874265670776, |
| "learning_rate": 5.934454756380511e-05, |
| "loss": 1.3184, |
| "step": 78000 |
| }, |
| { |
| "epoch": 13.26530612244898, |
| "eval_loss": 1.4811251163482666, |
| "eval_runtime": 75.2963, |
| "eval_samples_per_second": 1246.264, |
| "eval_steps_per_second": 4.874, |
| "step": 78000 |
| }, |
| { |
| "epoch": 13.273809523809524, |
| "grad_norm": 0.41413602232933044, |
| "learning_rate": 5.9054524361948956e-05, |
| "loss": 1.3109, |
| "step": 78050 |
| }, |
| { |
| "epoch": 13.282312925170068, |
| "grad_norm": 0.4183179438114166, |
| "learning_rate": 5.876450116009281e-05, |
| "loss": 1.3129, |
| "step": 78100 |
| }, |
| { |
| "epoch": 13.290816326530612, |
| "grad_norm": 0.41364216804504395, |
| "learning_rate": 5.8474477958236665e-05, |
| "loss": 1.3132, |
| "step": 78150 |
| }, |
| { |
| "epoch": 13.299319727891156, |
| "grad_norm": 0.41207000613212585, |
| "learning_rate": 5.818445475638051e-05, |
| "loss": 1.3238, |
| "step": 78200 |
| }, |
| { |
| "epoch": 13.3078231292517, |
| "grad_norm": 0.42450812458992004, |
| "learning_rate": 5.789443155452436e-05, |
| "loss": 1.3113, |
| "step": 78250 |
| }, |
| { |
| "epoch": 13.316326530612244, |
| "grad_norm": 0.4294716715812683, |
| "learning_rate": 5.760440835266822e-05, |
| "loss": 1.3195, |
| "step": 78300 |
| }, |
| { |
| "epoch": 13.32482993197279, |
| "grad_norm": 0.4036255478858948, |
| "learning_rate": 5.731438515081207e-05, |
| "loss": 1.3148, |
| "step": 78350 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 0.3960818946361542, |
| "learning_rate": 5.7024361948955916e-05, |
| "loss": 1.3098, |
| "step": 78400 |
| }, |
| { |
| "epoch": 13.341836734693878, |
| "grad_norm": 0.40689873695373535, |
| "learning_rate": 5.673433874709977e-05, |
| "loss": 1.3136, |
| "step": 78450 |
| }, |
| { |
| "epoch": 13.350340136054422, |
| "grad_norm": 0.4078476130962372, |
| "learning_rate": 5.6444315545243625e-05, |
| "loss": 1.3111, |
| "step": 78500 |
| }, |
| { |
| "epoch": 13.358843537414966, |
| "grad_norm": 0.4066517651081085, |
| "learning_rate": 5.615429234338747e-05, |
| "loss": 1.3117, |
| "step": 78550 |
| }, |
| { |
| "epoch": 13.36734693877551, |
| "grad_norm": 0.4128108024597168, |
| "learning_rate": 5.5864269141531326e-05, |
| "loss": 1.3108, |
| "step": 78600 |
| }, |
| { |
| "epoch": 13.375850340136054, |
| "grad_norm": 0.40743228793144226, |
| "learning_rate": 5.5574245939675174e-05, |
| "loss": 1.3099, |
| "step": 78650 |
| }, |
| { |
| "epoch": 13.384353741496598, |
| "grad_norm": 0.43066033720970154, |
| "learning_rate": 5.528422273781902e-05, |
| "loss": 1.312, |
| "step": 78700 |
| }, |
| { |
| "epoch": 13.392857142857142, |
| "grad_norm": 0.3970576226711273, |
| "learning_rate": 5.499419953596288e-05, |
| "loss": 1.3128, |
| "step": 78750 |
| }, |
| { |
| "epoch": 13.401360544217686, |
| "grad_norm": 0.42503124475479126, |
| "learning_rate": 5.470417633410673e-05, |
| "loss": 1.3057, |
| "step": 78800 |
| }, |
| { |
| "epoch": 13.40986394557823, |
| "grad_norm": 0.4038301408290863, |
| "learning_rate": 5.441415313225058e-05, |
| "loss": 1.3093, |
| "step": 78850 |
| }, |
| { |
| "epoch": 13.418367346938776, |
| "grad_norm": 0.4250807762145996, |
| "learning_rate": 5.412412993039443e-05, |
| "loss": 1.3086, |
| "step": 78900 |
| }, |
| { |
| "epoch": 13.42687074829932, |
| "grad_norm": 0.4134768843650818, |
| "learning_rate": 5.3834106728538286e-05, |
| "loss": 1.3135, |
| "step": 78950 |
| }, |
| { |
| "epoch": 13.435374149659864, |
| "grad_norm": 0.39782798290252686, |
| "learning_rate": 5.3544083526682134e-05, |
| "loss": 1.305, |
| "step": 79000 |
| }, |
| { |
| "epoch": 13.435374149659864, |
| "eval_loss": 1.4755269289016724, |
| "eval_runtime": 75.3173, |
| "eval_samples_per_second": 1245.915, |
| "eval_steps_per_second": 4.873, |
| "step": 79000 |
| }, |
| { |
| "epoch": 13.443877551020408, |
| "grad_norm": 0.42068231105804443, |
| "learning_rate": 5.325406032482599e-05, |
| "loss": 1.3107, |
| "step": 79050 |
| }, |
| { |
| "epoch": 13.452380952380953, |
| "grad_norm": 0.40822988748550415, |
| "learning_rate": 5.2964037122969835e-05, |
| "loss": 1.3076, |
| "step": 79100 |
| }, |
| { |
| "epoch": 13.460884353741497, |
| "grad_norm": 0.42288413643836975, |
| "learning_rate": 5.267401392111369e-05, |
| "loss": 1.3128, |
| "step": 79150 |
| }, |
| { |
| "epoch": 13.46938775510204, |
| "grad_norm": 0.41481590270996094, |
| "learning_rate": 5.2383990719257544e-05, |
| "loss": 1.3083, |
| "step": 79200 |
| }, |
| { |
| "epoch": 13.477891156462585, |
| "grad_norm": 0.4217277467250824, |
| "learning_rate": 5.209396751740139e-05, |
| "loss": 1.2985, |
| "step": 79250 |
| }, |
| { |
| "epoch": 13.486394557823129, |
| "grad_norm": 0.4183300733566284, |
| "learning_rate": 5.180394431554524e-05, |
| "loss": 1.3067, |
| "step": 79300 |
| }, |
| { |
| "epoch": 13.494897959183673, |
| "grad_norm": 0.44343501329421997, |
| "learning_rate": 5.15139211136891e-05, |
| "loss": 1.3101, |
| "step": 79350 |
| }, |
| { |
| "epoch": 13.503401360544217, |
| "grad_norm": 0.47200411558151245, |
| "learning_rate": 5.122389791183295e-05, |
| "loss": 1.3113, |
| "step": 79400 |
| }, |
| { |
| "epoch": 13.511904761904763, |
| "grad_norm": 0.40812692046165466, |
| "learning_rate": 5.0933874709976795e-05, |
| "loss": 1.3085, |
| "step": 79450 |
| }, |
| { |
| "epoch": 13.520408163265307, |
| "grad_norm": 0.4125157594680786, |
| "learning_rate": 5.064385150812065e-05, |
| "loss": 1.3117, |
| "step": 79500 |
| }, |
| { |
| "epoch": 13.52891156462585, |
| "grad_norm": 0.42148134112358093, |
| "learning_rate": 5.0353828306264504e-05, |
| "loss": 1.3096, |
| "step": 79550 |
| }, |
| { |
| "epoch": 13.537414965986395, |
| "grad_norm": 0.4138471782207489, |
| "learning_rate": 5.006380510440835e-05, |
| "loss": 1.3099, |
| "step": 79600 |
| }, |
| { |
| "epoch": 13.545918367346939, |
| "grad_norm": 0.7425023317337036, |
| "learning_rate": 4.9773781902552205e-05, |
| "loss": 1.3098, |
| "step": 79650 |
| }, |
| { |
| "epoch": 13.554421768707483, |
| "grad_norm": 0.4228558838367462, |
| "learning_rate": 4.948375870069605e-05, |
| "loss": 1.3055, |
| "step": 79700 |
| }, |
| { |
| "epoch": 13.562925170068027, |
| "grad_norm": 0.39013656973838806, |
| "learning_rate": 4.919373549883991e-05, |
| "loss": 1.3109, |
| "step": 79750 |
| }, |
| { |
| "epoch": 13.571428571428571, |
| "grad_norm": 0.4059963822364807, |
| "learning_rate": 4.890371229698376e-05, |
| "loss": 1.3062, |
| "step": 79800 |
| }, |
| { |
| "epoch": 13.579931972789115, |
| "grad_norm": 0.41088271141052246, |
| "learning_rate": 4.861368909512761e-05, |
| "loss": 1.3104, |
| "step": 79850 |
| }, |
| { |
| "epoch": 13.58843537414966, |
| "grad_norm": 0.4267512857913971, |
| "learning_rate": 4.8323665893271456e-05, |
| "loss": 1.3094, |
| "step": 79900 |
| }, |
| { |
| "epoch": 13.596938775510203, |
| "grad_norm": 0.4166390001773834, |
| "learning_rate": 4.803364269141532e-05, |
| "loss": 1.3063, |
| "step": 79950 |
| }, |
| { |
| "epoch": 13.60544217687075, |
| "grad_norm": 0.43058088421821594, |
| "learning_rate": 4.7743619489559165e-05, |
| "loss": 1.3092, |
| "step": 80000 |
| }, |
| { |
| "epoch": 13.60544217687075, |
| "eval_loss": 1.4739412069320679, |
| "eval_runtime": 75.2973, |
| "eval_samples_per_second": 1246.247, |
| "eval_steps_per_second": 4.874, |
| "step": 80000 |
| }, |
| { |
| "epoch": 13.613945578231293, |
| "grad_norm": 0.4472084939479828, |
| "learning_rate": 4.745359628770301e-05, |
| "loss": 1.3092, |
| "step": 80050 |
| }, |
| { |
| "epoch": 13.622448979591837, |
| "grad_norm": 0.40930888056755066, |
| "learning_rate": 4.716357308584687e-05, |
| "loss": 1.2986, |
| "step": 80100 |
| }, |
| { |
| "epoch": 13.630952380952381, |
| "grad_norm": 0.4269891083240509, |
| "learning_rate": 4.687354988399072e-05, |
| "loss": 1.3085, |
| "step": 80150 |
| }, |
| { |
| "epoch": 13.639455782312925, |
| "grad_norm": 0.4400569498538971, |
| "learning_rate": 4.6583526682134575e-05, |
| "loss": 1.3091, |
| "step": 80200 |
| }, |
| { |
| "epoch": 13.64795918367347, |
| "grad_norm": 0.40618574619293213, |
| "learning_rate": 4.629350348027842e-05, |
| "loss": 1.3053, |
| "step": 80250 |
| }, |
| { |
| "epoch": 13.656462585034014, |
| "grad_norm": 0.40548551082611084, |
| "learning_rate": 4.600348027842227e-05, |
| "loss": 1.3097, |
| "step": 80300 |
| }, |
| { |
| "epoch": 13.664965986394558, |
| "grad_norm": 0.4029221534729004, |
| "learning_rate": 4.571345707656613e-05, |
| "loss": 1.3051, |
| "step": 80350 |
| }, |
| { |
| "epoch": 13.673469387755102, |
| "grad_norm": 0.4272826015949249, |
| "learning_rate": 4.542343387470998e-05, |
| "loss": 1.3078, |
| "step": 80400 |
| }, |
| { |
| "epoch": 13.681972789115646, |
| "grad_norm": 0.41242125630378723, |
| "learning_rate": 4.5133410672853826e-05, |
| "loss": 1.3068, |
| "step": 80450 |
| }, |
| { |
| "epoch": 13.69047619047619, |
| "grad_norm": 0.405517578125, |
| "learning_rate": 4.484338747099768e-05, |
| "loss": 1.3061, |
| "step": 80500 |
| }, |
| { |
| "epoch": 13.698979591836736, |
| "grad_norm": 0.4161379635334015, |
| "learning_rate": 4.4553364269141535e-05, |
| "loss": 1.3053, |
| "step": 80550 |
| }, |
| { |
| "epoch": 13.70748299319728, |
| "grad_norm": 0.4203460216522217, |
| "learning_rate": 4.426334106728538e-05, |
| "loss": 1.3043, |
| "step": 80600 |
| }, |
| { |
| "epoch": 13.715986394557824, |
| "grad_norm": 0.4214410185813904, |
| "learning_rate": 4.397331786542924e-05, |
| "loss": 1.3066, |
| "step": 80650 |
| }, |
| { |
| "epoch": 13.724489795918368, |
| "grad_norm": 0.42133694887161255, |
| "learning_rate": 4.3683294663573084e-05, |
| "loss": 1.3143, |
| "step": 80700 |
| }, |
| { |
| "epoch": 13.732993197278912, |
| "grad_norm": 0.39356616139411926, |
| "learning_rate": 4.339327146171694e-05, |
| "loss": 1.3078, |
| "step": 80750 |
| }, |
| { |
| "epoch": 13.741496598639456, |
| "grad_norm": 0.41399866342544556, |
| "learning_rate": 4.310324825986079e-05, |
| "loss": 1.3112, |
| "step": 80800 |
| }, |
| { |
| "epoch": 13.75, |
| "grad_norm": 0.41746941208839417, |
| "learning_rate": 4.281322505800464e-05, |
| "loss": 1.3019, |
| "step": 80850 |
| }, |
| { |
| "epoch": 13.758503401360544, |
| "grad_norm": 0.4164547324180603, |
| "learning_rate": 4.252320185614849e-05, |
| "loss": 1.309, |
| "step": 80900 |
| }, |
| { |
| "epoch": 13.767006802721088, |
| "grad_norm": 0.4242299795150757, |
| "learning_rate": 4.223317865429235e-05, |
| "loss": 1.3023, |
| "step": 80950 |
| }, |
| { |
| "epoch": 13.775510204081632, |
| "grad_norm": 0.4084872603416443, |
| "learning_rate": 4.1943155452436197e-05, |
| "loss": 1.3039, |
| "step": 81000 |
| }, |
| { |
| "epoch": 13.775510204081632, |
| "eval_loss": 1.4660990238189697, |
| "eval_runtime": 75.2655, |
| "eval_samples_per_second": 1246.773, |
| "eval_steps_per_second": 4.876, |
| "step": 81000 |
| }, |
| { |
| "epoch": 13.784013605442176, |
| "grad_norm": 0.4138684868812561, |
| "learning_rate": 4.1653132250580044e-05, |
| "loss": 1.3075, |
| "step": 81050 |
| }, |
| { |
| "epoch": 13.79251700680272, |
| "grad_norm": 0.40258848667144775, |
| "learning_rate": 4.13631090487239e-05, |
| "loss": 1.3068, |
| "step": 81100 |
| }, |
| { |
| "epoch": 13.801020408163264, |
| "grad_norm": 0.4167514145374298, |
| "learning_rate": 4.107308584686775e-05, |
| "loss": 1.305, |
| "step": 81150 |
| }, |
| { |
| "epoch": 13.80952380952381, |
| "grad_norm": 0.41513994336128235, |
| "learning_rate": 4.07830626450116e-05, |
| "loss": 1.3087, |
| "step": 81200 |
| }, |
| { |
| "epoch": 13.818027210884354, |
| "grad_norm": 0.42223405838012695, |
| "learning_rate": 4.0493039443155454e-05, |
| "loss": 1.3048, |
| "step": 81250 |
| }, |
| { |
| "epoch": 13.826530612244898, |
| "grad_norm": 0.3972738981246948, |
| "learning_rate": 4.02030162412993e-05, |
| "loss": 1.2986, |
| "step": 81300 |
| }, |
| { |
| "epoch": 13.835034013605442, |
| "grad_norm": 0.40687882900238037, |
| "learning_rate": 3.9912993039443156e-05, |
| "loss": 1.3106, |
| "step": 81350 |
| }, |
| { |
| "epoch": 13.843537414965986, |
| "grad_norm": 0.40394240617752075, |
| "learning_rate": 3.962296983758701e-05, |
| "loss": 1.309, |
| "step": 81400 |
| }, |
| { |
| "epoch": 13.85204081632653, |
| "grad_norm": 0.40160682797431946, |
| "learning_rate": 3.933294663573086e-05, |
| "loss": 1.3063, |
| "step": 81450 |
| }, |
| { |
| "epoch": 13.860544217687075, |
| "grad_norm": 0.401882529258728, |
| "learning_rate": 3.9042923433874705e-05, |
| "loss": 1.3085, |
| "step": 81500 |
| }, |
| { |
| "epoch": 13.869047619047619, |
| "grad_norm": 0.40280336141586304, |
| "learning_rate": 3.8758700696055686e-05, |
| "loss": 1.2966, |
| "step": 81550 |
| }, |
| { |
| "epoch": 13.877551020408163, |
| "grad_norm": 0.43757128715515137, |
| "learning_rate": 3.8468677494199534e-05, |
| "loss": 1.301, |
| "step": 81600 |
| }, |
| { |
| "epoch": 13.886054421768707, |
| "grad_norm": 0.40991008281707764, |
| "learning_rate": 3.817865429234339e-05, |
| "loss": 1.3037, |
| "step": 81650 |
| }, |
| { |
| "epoch": 13.89455782312925, |
| "grad_norm": 0.413152277469635, |
| "learning_rate": 3.788863109048724e-05, |
| "loss": 1.303, |
| "step": 81700 |
| }, |
| { |
| "epoch": 13.903061224489797, |
| "grad_norm": 0.40487709641456604, |
| "learning_rate": 3.759860788863109e-05, |
| "loss": 1.3071, |
| "step": 81750 |
| }, |
| { |
| "epoch": 13.91156462585034, |
| "grad_norm": 0.4063573181629181, |
| "learning_rate": 3.7308584686774944e-05, |
| "loss": 1.3037, |
| "step": 81800 |
| }, |
| { |
| "epoch": 13.920068027210885, |
| "grad_norm": 0.4249866306781769, |
| "learning_rate": 3.701856148491879e-05, |
| "loss": 1.3005, |
| "step": 81850 |
| }, |
| { |
| "epoch": 13.928571428571429, |
| "grad_norm": 0.41051676869392395, |
| "learning_rate": 3.6728538283062646e-05, |
| "loss": 1.3071, |
| "step": 81900 |
| }, |
| { |
| "epoch": 13.937074829931973, |
| "grad_norm": 0.39749205112457275, |
| "learning_rate": 3.64385150812065e-05, |
| "loss": 1.3049, |
| "step": 81950 |
| }, |
| { |
| "epoch": 13.945578231292517, |
| "grad_norm": 0.4107550382614136, |
| "learning_rate": 3.614849187935035e-05, |
| "loss": 1.3005, |
| "step": 82000 |
| }, |
| { |
| "epoch": 13.945578231292517, |
| "eval_loss": 1.4647165536880493, |
| "eval_runtime": 75.3191, |
| "eval_samples_per_second": 1245.886, |
| "eval_steps_per_second": 4.873, |
| "step": 82000 |
| }, |
| { |
| "epoch": 13.954081632653061, |
| "grad_norm": 0.4149150848388672, |
| "learning_rate": 3.5858468677494195e-05, |
| "loss": 1.3065, |
| "step": 82050 |
| }, |
| { |
| "epoch": 13.962585034013605, |
| "grad_norm": 0.4155055284500122, |
| "learning_rate": 3.5568445475638056e-05, |
| "loss": 1.2891, |
| "step": 82100 |
| }, |
| { |
| "epoch": 13.97108843537415, |
| "grad_norm": 0.40817248821258545, |
| "learning_rate": 3.5278422273781904e-05, |
| "loss": 1.303, |
| "step": 82150 |
| }, |
| { |
| "epoch": 13.979591836734693, |
| "grad_norm": 0.41482388973236084, |
| "learning_rate": 3.498839907192575e-05, |
| "loss": 1.3021, |
| "step": 82200 |
| }, |
| { |
| "epoch": 13.988095238095237, |
| "grad_norm": 0.41232186555862427, |
| "learning_rate": 3.4698375870069605e-05, |
| "loss": 1.2998, |
| "step": 82250 |
| }, |
| { |
| "epoch": 13.996598639455783, |
| "grad_norm": 0.40600040555000305, |
| "learning_rate": 3.440835266821346e-05, |
| "loss": 1.2983, |
| "step": 82300 |
| }, |
| { |
| "epoch": 14.005102040816327, |
| "grad_norm": 0.40963810682296753, |
| "learning_rate": 3.411832946635731e-05, |
| "loss": 1.2898, |
| "step": 82350 |
| }, |
| { |
| "epoch": 14.013605442176871, |
| "grad_norm": 0.41621676087379456, |
| "learning_rate": 3.382830626450116e-05, |
| "loss": 1.2984, |
| "step": 82400 |
| }, |
| { |
| "epoch": 14.022108843537415, |
| "grad_norm": 0.3934304118156433, |
| "learning_rate": 3.353828306264501e-05, |
| "loss": 1.2949, |
| "step": 82450 |
| }, |
| { |
| "epoch": 14.03061224489796, |
| "grad_norm": 0.4079570770263672, |
| "learning_rate": 3.324825986078887e-05, |
| "loss": 1.2966, |
| "step": 82500 |
| }, |
| { |
| "epoch": 14.039115646258503, |
| "grad_norm": 0.41960811614990234, |
| "learning_rate": 3.295823665893272e-05, |
| "loss": 1.3005, |
| "step": 82550 |
| }, |
| { |
| "epoch": 14.047619047619047, |
| "grad_norm": 0.40414050221443176, |
| "learning_rate": 3.2668213457076565e-05, |
| "loss": 1.299, |
| "step": 82600 |
| }, |
| { |
| "epoch": 14.056122448979592, |
| "grad_norm": 0.4004068970680237, |
| "learning_rate": 3.237819025522042e-05, |
| "loss": 1.2898, |
| "step": 82650 |
| }, |
| { |
| "epoch": 14.064625850340136, |
| "grad_norm": 0.41004568338394165, |
| "learning_rate": 3.2088167053364274e-05, |
| "loss": 1.2961, |
| "step": 82700 |
| }, |
| { |
| "epoch": 14.07312925170068, |
| "grad_norm": 0.4094617962837219, |
| "learning_rate": 3.179814385150812e-05, |
| "loss": 1.2972, |
| "step": 82750 |
| }, |
| { |
| "epoch": 14.081632653061224, |
| "grad_norm": 0.422753244638443, |
| "learning_rate": 3.1508120649651975e-05, |
| "loss": 1.3021, |
| "step": 82800 |
| }, |
| { |
| "epoch": 14.09013605442177, |
| "grad_norm": 0.41975682973861694, |
| "learning_rate": 3.121809744779582e-05, |
| "loss": 1.2996, |
| "step": 82850 |
| }, |
| { |
| "epoch": 14.098639455782314, |
| "grad_norm": 0.3985610008239746, |
| "learning_rate": 3.092807424593968e-05, |
| "loss": 1.3014, |
| "step": 82900 |
| }, |
| { |
| "epoch": 14.107142857142858, |
| "grad_norm": 0.4374600350856781, |
| "learning_rate": 3.063805104408353e-05, |
| "loss": 1.2963, |
| "step": 82950 |
| }, |
| { |
| "epoch": 14.115646258503402, |
| "grad_norm": 0.4096509516239166, |
| "learning_rate": 3.034802784222738e-05, |
| "loss": 1.2925, |
| "step": 83000 |
| }, |
| { |
| "epoch": 14.115646258503402, |
| "eval_loss": 1.4645365476608276, |
| "eval_runtime": 75.2852, |
| "eval_samples_per_second": 1246.447, |
| "eval_steps_per_second": 4.875, |
| "step": 83000 |
| }, |
| { |
| "epoch": 14.124149659863946, |
| "grad_norm": 0.6984773874282837, |
| "learning_rate": 3.0058004640371233e-05, |
| "loss": 1.2946, |
| "step": 83050 |
| }, |
| { |
| "epoch": 14.13265306122449, |
| "grad_norm": 0.4192226827144623, |
| "learning_rate": 2.976798143851508e-05, |
| "loss": 1.2906, |
| "step": 83100 |
| }, |
| { |
| "epoch": 14.141156462585034, |
| "grad_norm": 0.4226321280002594, |
| "learning_rate": 2.9477958236658935e-05, |
| "loss": 1.304, |
| "step": 83150 |
| }, |
| { |
| "epoch": 14.149659863945578, |
| "grad_norm": 0.4252030849456787, |
| "learning_rate": 2.9187935034802786e-05, |
| "loss": 1.2966, |
| "step": 83200 |
| }, |
| { |
| "epoch": 14.158163265306122, |
| "grad_norm": 0.3973104953765869, |
| "learning_rate": 2.8897911832946637e-05, |
| "loss": 1.2916, |
| "step": 83250 |
| }, |
| { |
| "epoch": 14.166666666666666, |
| "grad_norm": 0.42370930314064026, |
| "learning_rate": 2.8607888631090488e-05, |
| "loss": 1.2924, |
| "step": 83300 |
| }, |
| { |
| "epoch": 14.17517006802721, |
| "grad_norm": 1.85109543800354, |
| "learning_rate": 2.8317865429234342e-05, |
| "loss": 1.2927, |
| "step": 83350 |
| }, |
| { |
| "epoch": 14.183673469387756, |
| "grad_norm": 0.41108080744743347, |
| "learning_rate": 2.802784222737819e-05, |
| "loss": 1.2924, |
| "step": 83400 |
| }, |
| { |
| "epoch": 14.1921768707483, |
| "grad_norm": 0.45318904519081116, |
| "learning_rate": 2.7737819025522044e-05, |
| "loss": 1.2936, |
| "step": 83450 |
| }, |
| { |
| "epoch": 14.200680272108844, |
| "grad_norm": 0.402097225189209, |
| "learning_rate": 2.7447795823665895e-05, |
| "loss": 1.2966, |
| "step": 83500 |
| }, |
| { |
| "epoch": 14.209183673469388, |
| "grad_norm": 0.40355297923088074, |
| "learning_rate": 2.7157772621809746e-05, |
| "loss": 1.2926, |
| "step": 83550 |
| }, |
| { |
| "epoch": 14.217687074829932, |
| "grad_norm": 0.5472226738929749, |
| "learning_rate": 2.6873549883990723e-05, |
| "loss": 1.2938, |
| "step": 83600 |
| }, |
| { |
| "epoch": 14.226190476190476, |
| "grad_norm": 0.40354666113853455, |
| "learning_rate": 2.658352668213457e-05, |
| "loss": 1.2997, |
| "step": 83650 |
| }, |
| { |
| "epoch": 14.23469387755102, |
| "grad_norm": 0.399110347032547, |
| "learning_rate": 2.6293503480278425e-05, |
| "loss": 1.2916, |
| "step": 83700 |
| }, |
| { |
| "epoch": 14.243197278911564, |
| "grad_norm": 0.42456942796707153, |
| "learning_rate": 2.6003480278422276e-05, |
| "loss": 1.2987, |
| "step": 83750 |
| }, |
| { |
| "epoch": 14.251700680272108, |
| "grad_norm": 0.4085513949394226, |
| "learning_rate": 2.5713457076566123e-05, |
| "loss": 1.2922, |
| "step": 83800 |
| }, |
| { |
| "epoch": 14.260204081632653, |
| "grad_norm": 0.41955679655075073, |
| "learning_rate": 2.5423433874709977e-05, |
| "loss": 1.2942, |
| "step": 83850 |
| }, |
| { |
| "epoch": 14.268707482993197, |
| "grad_norm": 0.42755693197250366, |
| "learning_rate": 2.5133410672853828e-05, |
| "loss": 1.2927, |
| "step": 83900 |
| }, |
| { |
| "epoch": 14.27721088435374, |
| "grad_norm": 0.7680487036705017, |
| "learning_rate": 2.484338747099768e-05, |
| "loss": 1.2896, |
| "step": 83950 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "grad_norm": 0.41442444920539856, |
| "learning_rate": 2.455336426914153e-05, |
| "loss": 1.2993, |
| "step": 84000 |
| }, |
| { |
| "epoch": 14.285714285714286, |
| "eval_loss": 1.4609616994857788, |
| "eval_runtime": 75.2756, |
| "eval_samples_per_second": 1246.605, |
| "eval_steps_per_second": 4.875, |
| "step": 84000 |
| }, |
| { |
| "epoch": 14.29421768707483, |
| "grad_norm": 0.40402066707611084, |
| "learning_rate": 2.4263341067285384e-05, |
| "loss": 1.2965, |
| "step": 84050 |
| }, |
| { |
| "epoch": 14.302721088435375, |
| "grad_norm": 0.4172896146774292, |
| "learning_rate": 2.3973317865429232e-05, |
| "loss": 1.2912, |
| "step": 84100 |
| }, |
| { |
| "epoch": 14.311224489795919, |
| "grad_norm": 0.4188857078552246, |
| "learning_rate": 2.3683294663573086e-05, |
| "loss": 1.2941, |
| "step": 84150 |
| }, |
| { |
| "epoch": 14.319727891156463, |
| "grad_norm": 0.4177609384059906, |
| "learning_rate": 2.3393271461716937e-05, |
| "loss": 1.2956, |
| "step": 84200 |
| }, |
| { |
| "epoch": 14.328231292517007, |
| "grad_norm": 0.4155607223510742, |
| "learning_rate": 2.3103248259860788e-05, |
| "loss": 1.2909, |
| "step": 84250 |
| }, |
| { |
| "epoch": 14.33673469387755, |
| "grad_norm": 0.39458438754081726, |
| "learning_rate": 2.281322505800464e-05, |
| "loss": 1.2991, |
| "step": 84300 |
| }, |
| { |
| "epoch": 14.345238095238095, |
| "grad_norm": 0.3875535726547241, |
| "learning_rate": 2.2523201856148493e-05, |
| "loss": 1.2914, |
| "step": 84350 |
| }, |
| { |
| "epoch": 14.353741496598639, |
| "grad_norm": 0.39373454451560974, |
| "learning_rate": 2.223317865429234e-05, |
| "loss": 1.2921, |
| "step": 84400 |
| }, |
| { |
| "epoch": 14.362244897959183, |
| "grad_norm": 0.4142104685306549, |
| "learning_rate": 2.1943155452436195e-05, |
| "loss": 1.2924, |
| "step": 84450 |
| }, |
| { |
| "epoch": 14.370748299319727, |
| "grad_norm": 0.4059418737888336, |
| "learning_rate": 2.1653132250580046e-05, |
| "loss": 1.2927, |
| "step": 84500 |
| }, |
| { |
| "epoch": 14.379251700680273, |
| "grad_norm": 0.42010223865509033, |
| "learning_rate": 2.13631090487239e-05, |
| "loss": 1.2946, |
| "step": 84550 |
| }, |
| { |
| "epoch": 14.387755102040817, |
| "grad_norm": 0.4005703330039978, |
| "learning_rate": 2.1073085846867748e-05, |
| "loss": 1.2889, |
| "step": 84600 |
| }, |
| { |
| "epoch": 14.396258503401361, |
| "grad_norm": 0.434023916721344, |
| "learning_rate": 2.0783062645011602e-05, |
| "loss": 1.2925, |
| "step": 84650 |
| }, |
| { |
| "epoch": 14.404761904761905, |
| "grad_norm": 0.41903188824653625, |
| "learning_rate": 2.0493039443155453e-05, |
| "loss": 1.2982, |
| "step": 84700 |
| }, |
| { |
| "epoch": 14.41326530612245, |
| "grad_norm": 0.41402989625930786, |
| "learning_rate": 2.0203016241299304e-05, |
| "loss": 1.2972, |
| "step": 84750 |
| }, |
| { |
| "epoch": 14.421768707482993, |
| "grad_norm": 0.4167090952396393, |
| "learning_rate": 1.9912993039443155e-05, |
| "loss": 1.2975, |
| "step": 84800 |
| }, |
| { |
| "epoch": 14.430272108843537, |
| "grad_norm": 0.40198904275894165, |
| "learning_rate": 1.962296983758701e-05, |
| "loss": 1.2898, |
| "step": 84850 |
| }, |
| { |
| "epoch": 14.438775510204081, |
| "grad_norm": 0.412724107503891, |
| "learning_rate": 1.9332946635730856e-05, |
| "loss": 1.2955, |
| "step": 84900 |
| }, |
| { |
| "epoch": 14.447278911564625, |
| "grad_norm": 0.4073164165019989, |
| "learning_rate": 1.904292343387471e-05, |
| "loss": 1.2907, |
| "step": 84950 |
| }, |
| { |
| "epoch": 14.45578231292517, |
| "grad_norm": 0.4109826982021332, |
| "learning_rate": 1.875290023201856e-05, |
| "loss": 1.2935, |
| "step": 85000 |
| }, |
| { |
| "epoch": 14.45578231292517, |
| "eval_loss": 1.457992672920227, |
| "eval_runtime": 75.28, |
| "eval_samples_per_second": 1246.534, |
| "eval_steps_per_second": 4.875, |
| "step": 85000 |
| }, |
| { |
| "epoch": 14.464285714285714, |
| "grad_norm": 0.40559759736061096, |
| "learning_rate": 1.8462877030162412e-05, |
| "loss": 1.2888, |
| "step": 85050 |
| }, |
| { |
| "epoch": 14.47278911564626, |
| "grad_norm": 0.4154794216156006, |
| "learning_rate": 1.8172853828306263e-05, |
| "loss": 1.289, |
| "step": 85100 |
| }, |
| { |
| "epoch": 14.481292517006803, |
| "grad_norm": 0.4155610501766205, |
| "learning_rate": 1.7882830626450118e-05, |
| "loss": 1.2922, |
| "step": 85150 |
| }, |
| { |
| "epoch": 14.489795918367347, |
| "grad_norm": 0.4222477674484253, |
| "learning_rate": 1.7592807424593965e-05, |
| "loss": 1.2867, |
| "step": 85200 |
| }, |
| { |
| "epoch": 14.498299319727892, |
| "grad_norm": 0.4063816964626312, |
| "learning_rate": 1.730278422273782e-05, |
| "loss": 1.2888, |
| "step": 85250 |
| }, |
| { |
| "epoch": 14.506802721088436, |
| "grad_norm": 0.3986765742301941, |
| "learning_rate": 1.701276102088167e-05, |
| "loss": 1.285, |
| "step": 85300 |
| }, |
| { |
| "epoch": 14.51530612244898, |
| "grad_norm": 0.4038733243942261, |
| "learning_rate": 1.672273781902552e-05, |
| "loss": 1.2913, |
| "step": 85350 |
| }, |
| { |
| "epoch": 14.523809523809524, |
| "grad_norm": 0.40419521927833557, |
| "learning_rate": 1.6432714617169372e-05, |
| "loss": 1.2872, |
| "step": 85400 |
| }, |
| { |
| "epoch": 14.532312925170068, |
| "grad_norm": 0.41449958086013794, |
| "learning_rate": 1.6142691415313226e-05, |
| "loss": 1.2853, |
| "step": 85450 |
| }, |
| { |
| "epoch": 14.540816326530612, |
| "grad_norm": 0.4067400097846985, |
| "learning_rate": 1.5852668213457077e-05, |
| "loss": 1.2919, |
| "step": 85500 |
| }, |
| { |
| "epoch": 14.549319727891156, |
| "grad_norm": 0.6428622007369995, |
| "learning_rate": 1.5562645011600928e-05, |
| "loss": 1.2935, |
| "step": 85550 |
| }, |
| { |
| "epoch": 14.5578231292517, |
| "grad_norm": 0.4078386723995209, |
| "learning_rate": 1.527262180974478e-05, |
| "loss": 1.2892, |
| "step": 85600 |
| }, |
| { |
| "epoch": 14.566326530612244, |
| "grad_norm": 0.4054696261882782, |
| "learning_rate": 1.4982598607888632e-05, |
| "loss": 1.2871, |
| "step": 85650 |
| }, |
| { |
| "epoch": 14.57482993197279, |
| "grad_norm": 0.4058522880077362, |
| "learning_rate": 1.4692575406032483e-05, |
| "loss": 1.286, |
| "step": 85700 |
| }, |
| { |
| "epoch": 14.583333333333334, |
| "grad_norm": 0.40443381667137146, |
| "learning_rate": 1.4402552204176335e-05, |
| "loss": 1.2888, |
| "step": 85750 |
| }, |
| { |
| "epoch": 14.591836734693878, |
| "grad_norm": 0.4228183925151825, |
| "learning_rate": 1.4112529002320186e-05, |
| "loss": 1.2875, |
| "step": 85800 |
| }, |
| { |
| "epoch": 14.600340136054422, |
| "grad_norm": 0.4046574831008911, |
| "learning_rate": 1.3822505800464039e-05, |
| "loss": 1.2883, |
| "step": 85850 |
| }, |
| { |
| "epoch": 14.608843537414966, |
| "grad_norm": 0.4183114767074585, |
| "learning_rate": 1.353248259860789e-05, |
| "loss": 1.2929, |
| "step": 85900 |
| }, |
| { |
| "epoch": 14.61734693877551, |
| "grad_norm": 0.40642350912094116, |
| "learning_rate": 1.324245939675174e-05, |
| "loss": 1.2892, |
| "step": 85950 |
| }, |
| { |
| "epoch": 14.625850340136054, |
| "grad_norm": 0.40780848264694214, |
| "learning_rate": 1.2952436194895593e-05, |
| "loss": 1.2888, |
| "step": 86000 |
| }, |
| { |
| "epoch": 14.625850340136054, |
| "eval_loss": 1.4564687013626099, |
| "eval_runtime": 75.2583, |
| "eval_samples_per_second": 1246.893, |
| "eval_steps_per_second": 4.877, |
| "step": 86000 |
| }, |
| { |
| "epoch": 14.634353741496598, |
| "grad_norm": 0.399654746055603, |
| "learning_rate": 1.2662412993039444e-05, |
| "loss": 1.292, |
| "step": 86050 |
| }, |
| { |
| "epoch": 14.642857142857142, |
| "grad_norm": 0.4076179563999176, |
| "learning_rate": 1.2372389791183295e-05, |
| "loss": 1.2894, |
| "step": 86100 |
| }, |
| { |
| "epoch": 14.651360544217686, |
| "grad_norm": 0.4201255738735199, |
| "learning_rate": 1.2082366589327147e-05, |
| "loss": 1.289, |
| "step": 86150 |
| }, |
| { |
| "epoch": 14.65986394557823, |
| "grad_norm": 0.39747655391693115, |
| "learning_rate": 1.1792343387470998e-05, |
| "loss": 1.2915, |
| "step": 86200 |
| }, |
| { |
| "epoch": 14.668367346938776, |
| "grad_norm": 0.4104771316051483, |
| "learning_rate": 1.1508120649651974e-05, |
| "loss": 1.2864, |
| "step": 86250 |
| }, |
| { |
| "epoch": 14.67687074829932, |
| "grad_norm": 0.41255298256874084, |
| "learning_rate": 1.1218097447795825e-05, |
| "loss": 1.2916, |
| "step": 86300 |
| }, |
| { |
| "epoch": 14.685374149659864, |
| "grad_norm": 0.4036099910736084, |
| "learning_rate": 1.0928074245939676e-05, |
| "loss": 1.2921, |
| "step": 86350 |
| }, |
| { |
| "epoch": 14.693877551020408, |
| "grad_norm": 0.40650486946105957, |
| "learning_rate": 1.0638051044083526e-05, |
| "loss": 1.2896, |
| "step": 86400 |
| }, |
| { |
| "epoch": 14.702380952380953, |
| "grad_norm": 0.47484561800956726, |
| "learning_rate": 1.0348027842227377e-05, |
| "loss": 1.2917, |
| "step": 86450 |
| }, |
| { |
| "epoch": 14.710884353741497, |
| "grad_norm": 0.4110647737979889, |
| "learning_rate": 1.0058004640371228e-05, |
| "loss": 1.2878, |
| "step": 86500 |
| }, |
| { |
| "epoch": 14.71938775510204, |
| "grad_norm": 0.4205218553543091, |
| "learning_rate": 9.767981438515081e-06, |
| "loss": 1.2879, |
| "step": 86550 |
| }, |
| { |
| "epoch": 14.727891156462585, |
| "grad_norm": 0.4059794843196869, |
| "learning_rate": 9.477958236658932e-06, |
| "loss": 1.2871, |
| "step": 86600 |
| }, |
| { |
| "epoch": 14.736394557823129, |
| "grad_norm": 0.41226640343666077, |
| "learning_rate": 9.187935034802784e-06, |
| "loss": 1.2938, |
| "step": 86650 |
| }, |
| { |
| "epoch": 14.744897959183673, |
| "grad_norm": 0.41290587186813354, |
| "learning_rate": 8.897911832946635e-06, |
| "loss": 1.286, |
| "step": 86700 |
| }, |
| { |
| "epoch": 14.753401360544217, |
| "grad_norm": 0.4120457172393799, |
| "learning_rate": 8.607888631090486e-06, |
| "loss": 1.289, |
| "step": 86750 |
| }, |
| { |
| "epoch": 14.761904761904763, |
| "grad_norm": 0.4931107461452484, |
| "learning_rate": 8.317865429234339e-06, |
| "loss": 1.2896, |
| "step": 86800 |
| }, |
| { |
| "epoch": 14.770408163265307, |
| "grad_norm": 0.4076496660709381, |
| "learning_rate": 8.02784222737819e-06, |
| "loss": 1.2877, |
| "step": 86850 |
| }, |
| { |
| "epoch": 14.77891156462585, |
| "grad_norm": 0.41810768842697144, |
| "learning_rate": 7.737819025522042e-06, |
| "loss": 1.2873, |
| "step": 86900 |
| }, |
| { |
| "epoch": 14.787414965986395, |
| "grad_norm": 0.41356149315834045, |
| "learning_rate": 7.447795823665894e-06, |
| "loss": 1.2886, |
| "step": 86950 |
| }, |
| { |
| "epoch": 14.795918367346939, |
| "grad_norm": 0.43218472599983215, |
| "learning_rate": 7.157772621809745e-06, |
| "loss": 1.291, |
| "step": 87000 |
| }, |
| { |
| "epoch": 14.795918367346939, |
| "eval_loss": 1.4570873975753784, |
| "eval_runtime": 75.2933, |
| "eval_samples_per_second": 1246.313, |
| "eval_steps_per_second": 4.874, |
| "step": 87000 |
| }, |
| { |
| "epoch": 14.804421768707483, |
| "grad_norm": 0.4102339446544647, |
| "learning_rate": 6.867749419953597e-06, |
| "loss": 1.2848, |
| "step": 87050 |
| }, |
| { |
| "epoch": 14.812925170068027, |
| "grad_norm": 0.41269227862358093, |
| "learning_rate": 6.577726218097448e-06, |
| "loss": 1.2888, |
| "step": 87100 |
| }, |
| { |
| "epoch": 14.821428571428571, |
| "grad_norm": 0.4198719561100006, |
| "learning_rate": 6.287703016241299e-06, |
| "loss": 1.2904, |
| "step": 87150 |
| }, |
| { |
| "epoch": 14.829931972789115, |
| "grad_norm": 0.40524616837501526, |
| "learning_rate": 5.997679814385151e-06, |
| "loss": 1.2916, |
| "step": 87200 |
| }, |
| { |
| "epoch": 14.83843537414966, |
| "grad_norm": 0.40271249413490295, |
| "learning_rate": 5.707656612529003e-06, |
| "loss": 1.2821, |
| "step": 87250 |
| }, |
| { |
| "epoch": 14.846938775510203, |
| "grad_norm": 0.404710590839386, |
| "learning_rate": 5.4176334106728545e-06, |
| "loss": 1.2848, |
| "step": 87300 |
| }, |
| { |
| "epoch": 14.85544217687075, |
| "grad_norm": 0.40877318382263184, |
| "learning_rate": 5.127610208816705e-06, |
| "loss": 1.2824, |
| "step": 87350 |
| }, |
| { |
| "epoch": 14.863945578231293, |
| "grad_norm": 0.40743860602378845, |
| "learning_rate": 4.837587006960557e-06, |
| "loss": 1.2875, |
| "step": 87400 |
| }, |
| { |
| "epoch": 14.872448979591837, |
| "grad_norm": 0.40123429894447327, |
| "learning_rate": 4.547563805104409e-06, |
| "loss": 1.2879, |
| "step": 87450 |
| }, |
| { |
| "epoch": 14.880952380952381, |
| "grad_norm": 0.4120638072490692, |
| "learning_rate": 4.257540603248261e-06, |
| "loss": 1.2886, |
| "step": 87500 |
| }, |
| { |
| "epoch": 14.889455782312925, |
| "grad_norm": 0.4441770017147064, |
| "learning_rate": 3.9675174013921115e-06, |
| "loss": 1.2903, |
| "step": 87550 |
| }, |
| { |
| "epoch": 14.89795918367347, |
| "grad_norm": 0.39684155583381653, |
| "learning_rate": 3.677494199535963e-06, |
| "loss": 1.2894, |
| "step": 87600 |
| }, |
| { |
| "epoch": 14.906462585034014, |
| "grad_norm": 0.40048104524612427, |
| "learning_rate": 3.387470997679814e-06, |
| "loss": 1.2872, |
| "step": 87650 |
| }, |
| { |
| "epoch": 14.914965986394558, |
| "grad_norm": 0.40761691331863403, |
| "learning_rate": 3.097447795823666e-06, |
| "loss": 1.2856, |
| "step": 87700 |
| }, |
| { |
| "epoch": 14.923469387755102, |
| "grad_norm": 0.41914671659469604, |
| "learning_rate": 2.807424593967517e-06, |
| "loss": 1.2864, |
| "step": 87750 |
| }, |
| { |
| "epoch": 14.931972789115646, |
| "grad_norm": 0.41371914744377136, |
| "learning_rate": 2.517401392111369e-06, |
| "loss": 1.2807, |
| "step": 87800 |
| }, |
| { |
| "epoch": 14.94047619047619, |
| "grad_norm": 0.4007938802242279, |
| "learning_rate": 2.2273781902552203e-06, |
| "loss": 1.2869, |
| "step": 87850 |
| }, |
| { |
| "epoch": 14.948979591836736, |
| "grad_norm": 0.4132489264011383, |
| "learning_rate": 1.937354988399072e-06, |
| "loss": 1.2822, |
| "step": 87900 |
| }, |
| { |
| "epoch": 14.95748299319728, |
| "grad_norm": 0.4170679748058319, |
| "learning_rate": 1.6473317865429233e-06, |
| "loss": 1.2867, |
| "step": 87950 |
| }, |
| { |
| "epoch": 14.965986394557824, |
| "grad_norm": 0.4113543629646301, |
| "learning_rate": 1.3573085846867749e-06, |
| "loss": 1.2852, |
| "step": 88000 |
| }, |
| { |
| "epoch": 14.965986394557824, |
| "eval_loss": 1.4495242834091187, |
| "eval_runtime": 75.6807, |
| "eval_samples_per_second": 1239.933, |
| "eval_steps_per_second": 4.849, |
| "step": 88000 |
| }, |
| { |
| "epoch": 14.974489795918368, |
| "grad_norm": 0.4077957272529602, |
| "learning_rate": 1.0672853828306264e-06, |
| "loss": 1.2852, |
| "step": 88050 |
| }, |
| { |
| "epoch": 14.982993197278912, |
| "grad_norm": 0.403070330619812, |
| "learning_rate": 7.772621809744779e-07, |
| "loss": 1.2854, |
| "step": 88100 |
| }, |
| { |
| "epoch": 14.991496598639456, |
| "grad_norm": 0.41275766491889954, |
| "learning_rate": 4.872389791183295e-07, |
| "loss": 1.288, |
| "step": 88150 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.4174801707267761, |
| "learning_rate": 1.97215777262181e-07, |
| "loss": 1.2938, |
| "step": 88200 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 88200, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 15, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.377209771467368e+19, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|