diff --git "a/checkpoint-3641/trainer_state.json" "b/checkpoint-3641/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3641/trainer_state.json" @@ -0,0 +1,25520 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3641, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027464982147761604, + "grad_norm": 2.8266000747680664, + "learning_rate": 5.0000000000000004e-08, + "loss": 0.9522, + "step": 1 + }, + { + "epoch": 0.0005492996429552321, + "grad_norm": 2.7995493412017822, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.0295, + "step": 2 + }, + { + "epoch": 0.0008239494644328481, + "grad_norm": 2.9297163486480713, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0307, + "step": 3 + }, + { + "epoch": 0.0010985992859104642, + "grad_norm": 2.9254722595214844, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.0171, + "step": 4 + }, + { + "epoch": 0.0013732491073880802, + "grad_norm": 3.2124598026275635, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.0687, + "step": 5 + }, + { + "epoch": 0.0016478989288656962, + "grad_norm": 3.01930832862854, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.0331, + "step": 6 + }, + { + "epoch": 0.0019225487503433123, + "grad_norm": 2.6723780632019043, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.007, + "step": 7 + }, + { + "epoch": 0.0021971985718209283, + "grad_norm": 2.8304638862609863, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.0556, + "step": 8 + }, + { + "epoch": 0.0024718483932985444, + "grad_norm": 2.7696664333343506, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.9703, + "step": 9 + }, + { + "epoch": 0.0027464982147761604, + "grad_norm": 2.7686898708343506, + "learning_rate": 5.000000000000001e-07, + "loss": 1.0205, + "step": 10 + }, + { + "epoch": 0.0030211480362537764, + "grad_norm": 2.828322649002075, + "learning_rate": 5.5e-07, + "loss": 0.973, + "step": 11 + }, + { + "epoch": 0.0032957978577313925, + "grad_norm": 3.002872943878174, + "learning_rate": 6.000000000000001e-07, + "loss": 1.0823, + "step": 12 + }, + { + "epoch": 0.0035704476792090085, + "grad_norm": 2.571089506149292, + "learning_rate": 6.5e-07, + "loss": 0.8824, + "step": 13 + }, + { + "epoch": 0.0038450975006866246, + "grad_norm": 2.8225350379943848, + "learning_rate": 7.000000000000001e-07, + "loss": 1.036, + "step": 14 + }, + { + "epoch": 0.004119747322164241, + "grad_norm": 2.869109630584717, + "learning_rate": 7.5e-07, + "loss": 1.0591, + "step": 15 + }, + { + "epoch": 0.004394397143641857, + "grad_norm": 2.932457208633423, + "learning_rate": 8.000000000000001e-07, + "loss": 1.0381, + "step": 16 + }, + { + "epoch": 0.004669046965119473, + "grad_norm": 2.6697590351104736, + "learning_rate": 8.500000000000001e-07, + "loss": 1.0504, + "step": 17 + }, + { + "epoch": 0.004943696786597089, + "grad_norm": 2.4847512245178223, + "learning_rate": 9.000000000000001e-07, + "loss": 0.9981, + "step": 18 + }, + { + "epoch": 0.005218346608074705, + "grad_norm": 2.575406074523926, + "learning_rate": 9.500000000000001e-07, + "loss": 1.022, + "step": 19 + }, + { + "epoch": 0.005492996429552321, + "grad_norm": 2.4840376377105713, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0076, + "step": 20 + }, + { + "epoch": 0.005767646251029937, + "grad_norm": 2.345395565032959, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.9437, + "step": 21 + }, + { + "epoch": 0.006042296072507553, + "grad_norm": 2.1309304237365723, + "learning_rate": 1.1e-06, + "loss": 0.9278, + "step": 22 + }, + { + "epoch": 0.006316945893985169, + "grad_norm": 2.3368828296661377, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.0177, + "step": 23 + }, + { + "epoch": 0.006591595715462785, + "grad_norm": 2.192014694213867, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.9929, + "step": 24 + }, + { + "epoch": 0.006866245536940401, + "grad_norm": 2.0942306518554688, + "learning_rate": 1.25e-06, + "loss": 1.0349, + "step": 25 + }, + { + "epoch": 0.007140895358418017, + "grad_norm": 1.8654743432998657, + "learning_rate": 1.3e-06, + "loss": 0.8832, + "step": 26 + }, + { + "epoch": 0.007415545179895633, + "grad_norm": 1.9825080633163452, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.9809, + "step": 27 + }, + { + "epoch": 0.007690195001373249, + "grad_norm": 2.044829845428467, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.0065, + "step": 28 + }, + { + "epoch": 0.007964844822850866, + "grad_norm": 1.762143611907959, + "learning_rate": 1.45e-06, + "loss": 0.9793, + "step": 29 + }, + { + "epoch": 0.008239494644328481, + "grad_norm": 1.9147125482559204, + "learning_rate": 1.5e-06, + "loss": 0.9731, + "step": 30 + }, + { + "epoch": 0.008514144465806098, + "grad_norm": 1.7794023752212524, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.9819, + "step": 31 + }, + { + "epoch": 0.008788794287283713, + "grad_norm": 1.725781798362732, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.9416, + "step": 32 + }, + { + "epoch": 0.00906344410876133, + "grad_norm": 1.7977334260940552, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.9956, + "step": 33 + }, + { + "epoch": 0.009338093930238945, + "grad_norm": 1.6866639852523804, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.9668, + "step": 34 + }, + { + "epoch": 0.009612743751716562, + "grad_norm": 1.7249571084976196, + "learning_rate": 1.75e-06, + "loss": 0.9453, + "step": 35 + }, + { + "epoch": 0.009887393573194177, + "grad_norm": 1.6365666389465332, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8942, + "step": 36 + }, + { + "epoch": 0.010162043394671794, + "grad_norm": 1.6188206672668457, + "learning_rate": 1.85e-06, + "loss": 0.9442, + "step": 37 + }, + { + "epoch": 0.01043669321614941, + "grad_norm": 1.7326232194900513, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.9647, + "step": 38 + }, + { + "epoch": 0.010711343037627026, + "grad_norm": 1.7019890546798706, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.9746, + "step": 39 + }, + { + "epoch": 0.010985992859104642, + "grad_norm": 1.617974042892456, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8929, + "step": 40 + }, + { + "epoch": 0.011260642680582258, + "grad_norm": 1.655133843421936, + "learning_rate": 2.05e-06, + "loss": 0.9306, + "step": 41 + }, + { + "epoch": 0.011535292502059874, + "grad_norm": 1.521951675415039, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9122, + "step": 42 + }, + { + "epoch": 0.01180994232353749, + "grad_norm": 1.6089528799057007, + "learning_rate": 2.15e-06, + "loss": 0.9112, + "step": 43 + }, + { + "epoch": 0.012084592145015106, + "grad_norm": 1.5762227773666382, + "learning_rate": 2.2e-06, + "loss": 0.8854, + "step": 44 + }, + { + "epoch": 0.012359241966492723, + "grad_norm": 1.5355838537216187, + "learning_rate": 2.25e-06, + "loss": 0.9154, + "step": 45 + }, + { + "epoch": 0.012633891787970338, + "grad_norm": 1.6074600219726562, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.8411, + "step": 46 + }, + { + "epoch": 0.012908541609447955, + "grad_norm": 1.443608283996582, + "learning_rate": 2.35e-06, + "loss": 0.8077, + "step": 47 + }, + { + "epoch": 0.01318319143092557, + "grad_norm": 1.465929388999939, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.8822, + "step": 48 + }, + { + "epoch": 0.013457841252403187, + "grad_norm": 1.425093173980713, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.8478, + "step": 49 + }, + { + "epoch": 0.013732491073880802, + "grad_norm": 1.4667171239852905, + "learning_rate": 2.5e-06, + "loss": 0.8671, + "step": 50 + }, + { + "epoch": 0.014007140895358419, + "grad_norm": 1.427631139755249, + "learning_rate": 2.55e-06, + "loss": 0.8963, + "step": 51 + }, + { + "epoch": 0.014281790716836034, + "grad_norm": 1.55605149269104, + "learning_rate": 2.6e-06, + "loss": 0.9058, + "step": 52 + }, + { + "epoch": 0.014556440538313651, + "grad_norm": 1.4256898164749146, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.9156, + "step": 53 + }, + { + "epoch": 0.014831090359791266, + "grad_norm": 1.4396406412124634, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.8952, + "step": 54 + }, + { + "epoch": 0.015105740181268883, + "grad_norm": 1.3725215196609497, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.8195, + "step": 55 + }, + { + "epoch": 0.015380390002746498, + "grad_norm": 1.4552042484283447, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9397, + "step": 56 + }, + { + "epoch": 0.015655039824224115, + "grad_norm": 1.3353712558746338, + "learning_rate": 2.85e-06, + "loss": 0.8579, + "step": 57 + }, + { + "epoch": 0.015929689645701732, + "grad_norm": 1.3451509475708008, + "learning_rate": 2.9e-06, + "loss": 0.8802, + "step": 58 + }, + { + "epoch": 0.016204339467179345, + "grad_norm": 1.4988653659820557, + "learning_rate": 2.95e-06, + "loss": 0.8852, + "step": 59 + }, + { + "epoch": 0.016478989288656962, + "grad_norm": 1.3400249481201172, + "learning_rate": 3e-06, + "loss": 0.8805, + "step": 60 + }, + { + "epoch": 0.01675363911013458, + "grad_norm": 1.417392611503601, + "learning_rate": 3.05e-06, + "loss": 0.8912, + "step": 61 + }, + { + "epoch": 0.017028288931612196, + "grad_norm": 1.446010947227478, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.8326, + "step": 62 + }, + { + "epoch": 0.01730293875308981, + "grad_norm": 1.3595073223114014, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.8337, + "step": 63 + }, + { + "epoch": 0.017577588574567427, + "grad_norm": 1.3076400756835938, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8317, + "step": 64 + }, + { + "epoch": 0.017852238396045043, + "grad_norm": 1.348937749862671, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.8506, + "step": 65 + }, + { + "epoch": 0.01812688821752266, + "grad_norm": 1.3066271543502808, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.8172, + "step": 66 + }, + { + "epoch": 0.018401538039000274, + "grad_norm": 1.4313191175460815, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.9217, + "step": 67 + }, + { + "epoch": 0.01867618786047789, + "grad_norm": 1.4292980432510376, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9153, + "step": 68 + }, + { + "epoch": 0.018950837681955508, + "grad_norm": 1.3436965942382812, + "learning_rate": 3.45e-06, + "loss": 0.8757, + "step": 69 + }, + { + "epoch": 0.019225487503433124, + "grad_norm": 1.343865156173706, + "learning_rate": 3.5e-06, + "loss": 0.802, + "step": 70 + }, + { + "epoch": 0.019500137324910738, + "grad_norm": 1.288719654083252, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.7625, + "step": 71 + }, + { + "epoch": 0.019774787146388355, + "grad_norm": 1.195141077041626, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7585, + "step": 72 + }, + { + "epoch": 0.020049436967865972, + "grad_norm": 1.3414618968963623, + "learning_rate": 3.65e-06, + "loss": 0.8246, + "step": 73 + }, + { + "epoch": 0.02032408678934359, + "grad_norm": 1.2586166858673096, + "learning_rate": 3.7e-06, + "loss": 0.7934, + "step": 74 + }, + { + "epoch": 0.020598736610821202, + "grad_norm": 1.267920732498169, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9025, + "step": 75 + }, + { + "epoch": 0.02087338643229882, + "grad_norm": 1.2333980798721313, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8222, + "step": 76 + }, + { + "epoch": 0.021148036253776436, + "grad_norm": 1.2571673393249512, + "learning_rate": 3.85e-06, + "loss": 0.7996, + "step": 77 + }, + { + "epoch": 0.021422686075254053, + "grad_norm": 1.3576871156692505, + "learning_rate": 3.900000000000001e-06, + "loss": 0.8566, + "step": 78 + }, + { + "epoch": 0.021697335896731666, + "grad_norm": 1.3421159982681274, + "learning_rate": 3.95e-06, + "loss": 0.8432, + "step": 79 + }, + { + "epoch": 0.021971985718209283, + "grad_norm": 1.3017475605010986, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8691, + "step": 80 + }, + { + "epoch": 0.0222466355396869, + "grad_norm": 1.3238869905471802, + "learning_rate": 4.05e-06, + "loss": 0.862, + "step": 81 + }, + { + "epoch": 0.022521285361164517, + "grad_norm": 1.2861217260360718, + "learning_rate": 4.1e-06, + "loss": 0.7902, + "step": 82 + }, + { + "epoch": 0.02279593518264213, + "grad_norm": 1.2384072542190552, + "learning_rate": 4.15e-06, + "loss": 0.8295, + "step": 83 + }, + { + "epoch": 0.023070585004119747, + "grad_norm": 1.298516035079956, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.858, + "step": 84 + }, + { + "epoch": 0.023345234825597364, + "grad_norm": 1.3388971090316772, + "learning_rate": 4.25e-06, + "loss": 0.8078, + "step": 85 + }, + { + "epoch": 0.02361988464707498, + "grad_norm": 1.2214970588684082, + "learning_rate": 4.3e-06, + "loss": 0.7808, + "step": 86 + }, + { + "epoch": 0.023894534468552595, + "grad_norm": 1.2403273582458496, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8294, + "step": 87 + }, + { + "epoch": 0.02416918429003021, + "grad_norm": 1.309830665588379, + "learning_rate": 4.4e-06, + "loss": 0.7833, + "step": 88 + }, + { + "epoch": 0.02444383411150783, + "grad_norm": 1.3733514547348022, + "learning_rate": 4.450000000000001e-06, + "loss": 0.8465, + "step": 89 + }, + { + "epoch": 0.024718483932985445, + "grad_norm": 1.215659737586975, + "learning_rate": 4.5e-06, + "loss": 0.7968, + "step": 90 + }, + { + "epoch": 0.02499313375446306, + "grad_norm": 1.2257864475250244, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.8108, + "step": 91 + }, + { + "epoch": 0.025267783575940676, + "grad_norm": 1.3017266988754272, + "learning_rate": 4.600000000000001e-06, + "loss": 0.8059, + "step": 92 + }, + { + "epoch": 0.025542433397418293, + "grad_norm": 1.33876371383667, + "learning_rate": 4.65e-06, + "loss": 0.8291, + "step": 93 + }, + { + "epoch": 0.02581708321889591, + "grad_norm": 1.2098389863967896, + "learning_rate": 4.7e-06, + "loss": 0.8039, + "step": 94 + }, + { + "epoch": 0.026091733040373523, + "grad_norm": 1.2084779739379883, + "learning_rate": 4.75e-06, + "loss": 0.8186, + "step": 95 + }, + { + "epoch": 0.02636638286185114, + "grad_norm": 1.22316575050354, + "learning_rate": 4.800000000000001e-06, + "loss": 0.8169, + "step": 96 + }, + { + "epoch": 0.026641032683328757, + "grad_norm": 1.3461512327194214, + "learning_rate": 4.85e-06, + "loss": 0.7933, + "step": 97 + }, + { + "epoch": 0.026915682504806374, + "grad_norm": 1.181164026260376, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7433, + "step": 98 + }, + { + "epoch": 0.027190332326283987, + "grad_norm": 1.2945033311843872, + "learning_rate": 4.95e-06, + "loss": 0.8443, + "step": 99 + }, + { + "epoch": 0.027464982147761604, + "grad_norm": 1.3123775720596313, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.02773963196923922, + "grad_norm": 1.268356442451477, + "learning_rate": 4.999999973911387e-06, + "loss": 0.83, + "step": 101 + }, + { + "epoch": 0.028014281790716838, + "grad_norm": 1.300063967704773, + "learning_rate": 4.999999895645549e-06, + "loss": 0.8115, + "step": 102 + }, + { + "epoch": 0.02828893161219445, + "grad_norm": 1.1843887567520142, + "learning_rate": 4.999999765202487e-06, + "loss": 0.7821, + "step": 103 + }, + { + "epoch": 0.028563581433672068, + "grad_norm": 1.2291145324707031, + "learning_rate": 4.999999582582204e-06, + "loss": 0.8039, + "step": 104 + }, + { + "epoch": 0.028838231255149685, + "grad_norm": 1.3058695793151855, + "learning_rate": 4.999999347784703e-06, + "loss": 0.836, + "step": 105 + }, + { + "epoch": 0.029112881076627302, + "grad_norm": 1.2697621583938599, + "learning_rate": 4.9999990608099905e-06, + "loss": 0.7943, + "step": 106 + }, + { + "epoch": 0.029387530898104915, + "grad_norm": 1.289311170578003, + "learning_rate": 4.999998721658071e-06, + "loss": 0.7598, + "step": 107 + }, + { + "epoch": 0.029662180719582532, + "grad_norm": 1.2171679735183716, + "learning_rate": 4.999998330328952e-06, + "loss": 0.7897, + "step": 108 + }, + { + "epoch": 0.02993683054106015, + "grad_norm": 1.283914566040039, + "learning_rate": 4.999997886822643e-06, + "loss": 0.777, + "step": 109 + }, + { + "epoch": 0.030211480362537766, + "grad_norm": 1.258127212524414, + "learning_rate": 4.9999973911391505e-06, + "loss": 0.7918, + "step": 110 + }, + { + "epoch": 0.03048613018401538, + "grad_norm": 1.2143088579177856, + "learning_rate": 4.9999968432784876e-06, + "loss": 0.8333, + "step": 111 + }, + { + "epoch": 0.030760780005492996, + "grad_norm": 1.3077479600906372, + "learning_rate": 4.9999962432406645e-06, + "loss": 0.8027, + "step": 112 + }, + { + "epoch": 0.031035429826970613, + "grad_norm": 1.2741223573684692, + "learning_rate": 4.999995591025693e-06, + "loss": 0.8143, + "step": 113 + }, + { + "epoch": 0.03131007964844823, + "grad_norm": 1.2862162590026855, + "learning_rate": 4.999994886633589e-06, + "loss": 0.8256, + "step": 114 + }, + { + "epoch": 0.03158472946992585, + "grad_norm": 1.2062207460403442, + "learning_rate": 4.999994130064365e-06, + "loss": 0.7885, + "step": 115 + }, + { + "epoch": 0.031859379291403464, + "grad_norm": 1.2290135622024536, + "learning_rate": 4.999993321318037e-06, + "loss": 0.7779, + "step": 116 + }, + { + "epoch": 0.032134029112881074, + "grad_norm": 1.255915880203247, + "learning_rate": 4.999992460394624e-06, + "loss": 0.7814, + "step": 117 + }, + { + "epoch": 0.03240867893435869, + "grad_norm": 1.2726551294326782, + "learning_rate": 4.999991547294141e-06, + "loss": 0.8446, + "step": 118 + }, + { + "epoch": 0.03268332875583631, + "grad_norm": 1.3564611673355103, + "learning_rate": 4.999990582016609e-06, + "loss": 0.8226, + "step": 119 + }, + { + "epoch": 0.032957978577313925, + "grad_norm": 1.210801124572754, + "learning_rate": 4.999989564562048e-06, + "loss": 0.8101, + "step": 120 + }, + { + "epoch": 0.03323262839879154, + "grad_norm": 1.214455246925354, + "learning_rate": 4.999988494930478e-06, + "loss": 0.7698, + "step": 121 + }, + { + "epoch": 0.03350727822026916, + "grad_norm": 1.2661283016204834, + "learning_rate": 4.999987373121921e-06, + "loss": 0.8518, + "step": 122 + }, + { + "epoch": 0.033781928041746775, + "grad_norm": 1.3954834938049316, + "learning_rate": 4.999986199136404e-06, + "loss": 0.8169, + "step": 123 + }, + { + "epoch": 0.03405657786322439, + "grad_norm": 1.3207101821899414, + "learning_rate": 4.999984972973948e-06, + "loss": 0.8142, + "step": 124 + }, + { + "epoch": 0.034331227684702, + "grad_norm": 1.2880308628082275, + "learning_rate": 4.999983694634579e-06, + "loss": 0.7746, + "step": 125 + }, + { + "epoch": 0.03460587750617962, + "grad_norm": 1.2948819398880005, + "learning_rate": 4.999982364118325e-06, + "loss": 0.7744, + "step": 126 + }, + { + "epoch": 0.034880527327657236, + "grad_norm": 1.4608489274978638, + "learning_rate": 4.999980981425214e-06, + "loss": 0.8492, + "step": 127 + }, + { + "epoch": 0.03515517714913485, + "grad_norm": 1.334864854812622, + "learning_rate": 4.999979546555272e-06, + "loss": 0.7723, + "step": 128 + }, + { + "epoch": 0.03542982697061247, + "grad_norm": 1.3800140619277954, + "learning_rate": 4.999978059508532e-06, + "loss": 0.8422, + "step": 129 + }, + { + "epoch": 0.03570447679209009, + "grad_norm": 1.1499245166778564, + "learning_rate": 4.999976520285025e-06, + "loss": 0.6963, + "step": 130 + }, + { + "epoch": 0.035979126613567704, + "grad_norm": 1.2593730688095093, + "learning_rate": 4.99997492888478e-06, + "loss": 0.7591, + "step": 131 + }, + { + "epoch": 0.03625377643504532, + "grad_norm": 1.2408984899520874, + "learning_rate": 4.999973285307834e-06, + "loss": 0.8111, + "step": 132 + }, + { + "epoch": 0.03652842625652293, + "grad_norm": 1.270969271659851, + "learning_rate": 4.999971589554219e-06, + "loss": 0.8256, + "step": 133 + }, + { + "epoch": 0.03680307607800055, + "grad_norm": 1.3845263719558716, + "learning_rate": 4.999969841623971e-06, + "loss": 0.8681, + "step": 134 + }, + { + "epoch": 0.037077725899478164, + "grad_norm": 1.245737910270691, + "learning_rate": 4.999968041517126e-06, + "loss": 0.8323, + "step": 135 + }, + { + "epoch": 0.03735237572095578, + "grad_norm": 1.2017422914505005, + "learning_rate": 4.999966189233722e-06, + "loss": 0.7769, + "step": 136 + }, + { + "epoch": 0.0376270255424334, + "grad_norm": 1.2350472211837769, + "learning_rate": 4.9999642847737985e-06, + "loss": 0.7619, + "step": 137 + }, + { + "epoch": 0.037901675363911015, + "grad_norm": 1.2941774129867554, + "learning_rate": 4.999962328137393e-06, + "loss": 0.8315, + "step": 138 + }, + { + "epoch": 0.03817632518538863, + "grad_norm": 1.3227850198745728, + "learning_rate": 4.999960319324549e-06, + "loss": 0.7746, + "step": 139 + }, + { + "epoch": 0.03845097500686625, + "grad_norm": 1.356667160987854, + "learning_rate": 4.999958258335307e-06, + "loss": 0.7955, + "step": 140 + }, + { + "epoch": 0.03872562482834386, + "grad_norm": 1.2601845264434814, + "learning_rate": 4.99995614516971e-06, + "loss": 0.7818, + "step": 141 + }, + { + "epoch": 0.039000274649821476, + "grad_norm": 1.3285731077194214, + "learning_rate": 4.999953979827802e-06, + "loss": 0.8227, + "step": 142 + }, + { + "epoch": 0.03927492447129909, + "grad_norm": 1.2197043895721436, + "learning_rate": 4.999951762309629e-06, + "loss": 0.765, + "step": 143 + }, + { + "epoch": 0.03954957429277671, + "grad_norm": 1.3434507846832275, + "learning_rate": 4.999949492615237e-06, + "loss": 0.8115, + "step": 144 + }, + { + "epoch": 0.03982422411425433, + "grad_norm": 1.26797354221344, + "learning_rate": 4.999947170744673e-06, + "loss": 0.8438, + "step": 145 + }, + { + "epoch": 0.040098873935731943, + "grad_norm": 1.346535563468933, + "learning_rate": 4.999944796697985e-06, + "loss": 0.7277, + "step": 146 + }, + { + "epoch": 0.04037352375720956, + "grad_norm": 1.3066082000732422, + "learning_rate": 4.999942370475224e-06, + "loss": 0.7798, + "step": 147 + }, + { + "epoch": 0.04064817357868718, + "grad_norm": 1.2851301431655884, + "learning_rate": 4.99993989207644e-06, + "loss": 0.7822, + "step": 148 + }, + { + "epoch": 0.04092282340016479, + "grad_norm": 1.1953388452529907, + "learning_rate": 4.999937361501683e-06, + "loss": 0.7758, + "step": 149 + }, + { + "epoch": 0.041197473221642404, + "grad_norm": 1.3141793012619019, + "learning_rate": 4.999934778751009e-06, + "loss": 0.7699, + "step": 150 + }, + { + "epoch": 0.04147212304312002, + "grad_norm": 1.232103943824768, + "learning_rate": 4.999932143824469e-06, + "loss": 0.7695, + "step": 151 + }, + { + "epoch": 0.04174677286459764, + "grad_norm": 1.4077333211898804, + "learning_rate": 4.9999294567221205e-06, + "loss": 0.818, + "step": 152 + }, + { + "epoch": 0.042021422686075255, + "grad_norm": 1.2228448390960693, + "learning_rate": 4.999926717444018e-06, + "loss": 0.7679, + "step": 153 + }, + { + "epoch": 0.04229607250755287, + "grad_norm": 1.3346647024154663, + "learning_rate": 4.999923925990218e-06, + "loss": 0.7888, + "step": 154 + }, + { + "epoch": 0.04257072232903049, + "grad_norm": 1.3211129903793335, + "learning_rate": 4.999921082360781e-06, + "loss": 0.7656, + "step": 155 + }, + { + "epoch": 0.042845372150508106, + "grad_norm": 1.196613073348999, + "learning_rate": 4.9999181865557645e-06, + "loss": 0.7218, + "step": 156 + }, + { + "epoch": 0.043120021971985716, + "grad_norm": 1.2975802421569824, + "learning_rate": 4.999915238575229e-06, + "loss": 0.7534, + "step": 157 + }, + { + "epoch": 0.04339467179346333, + "grad_norm": 1.4084150791168213, + "learning_rate": 4.999912238419238e-06, + "loss": 0.8026, + "step": 158 + }, + { + "epoch": 0.04366932161494095, + "grad_norm": 1.2156968116760254, + "learning_rate": 4.999909186087851e-06, + "loss": 0.806, + "step": 159 + }, + { + "epoch": 0.043943971436418566, + "grad_norm": 1.1177879571914673, + "learning_rate": 4.999906081581135e-06, + "loss": 0.7197, + "step": 160 + }, + { + "epoch": 0.04421862125789618, + "grad_norm": 1.2545775175094604, + "learning_rate": 4.999902924899152e-06, + "loss": 0.7672, + "step": 161 + }, + { + "epoch": 0.0444932710793738, + "grad_norm": 1.3369728326797485, + "learning_rate": 4.999899716041969e-06, + "loss": 0.8199, + "step": 162 + }, + { + "epoch": 0.04476792090085142, + "grad_norm": 1.2428574562072754, + "learning_rate": 4.999896455009654e-06, + "loss": 0.7454, + "step": 163 + }, + { + "epoch": 0.045042570722329034, + "grad_norm": 1.2718513011932373, + "learning_rate": 4.999893141802273e-06, + "loss": 0.7754, + "step": 164 + }, + { + "epoch": 0.045317220543806644, + "grad_norm": 1.276257038116455, + "learning_rate": 4.9998897764198975e-06, + "loss": 0.8133, + "step": 165 + }, + { + "epoch": 0.04559187036528426, + "grad_norm": 1.2534689903259277, + "learning_rate": 4.9998863588625955e-06, + "loss": 0.8314, + "step": 166 + }, + { + "epoch": 0.04586652018676188, + "grad_norm": 1.408996343612671, + "learning_rate": 4.99988288913044e-06, + "loss": 0.8241, + "step": 167 + }, + { + "epoch": 0.046141170008239495, + "grad_norm": 1.2559762001037598, + "learning_rate": 4.999879367223503e-06, + "loss": 0.7434, + "step": 168 + }, + { + "epoch": 0.04641581982971711, + "grad_norm": 1.2983957529067993, + "learning_rate": 4.999875793141856e-06, + "loss": 0.7715, + "step": 169 + }, + { + "epoch": 0.04669046965119473, + "grad_norm": 1.1758942604064941, + "learning_rate": 4.999872166885577e-06, + "loss": 0.7883, + "step": 170 + }, + { + "epoch": 0.046965119472672345, + "grad_norm": 1.2688244581222534, + "learning_rate": 4.99986848845474e-06, + "loss": 0.8054, + "step": 171 + }, + { + "epoch": 0.04723976929414996, + "grad_norm": 1.2954713106155396, + "learning_rate": 4.999864757849421e-06, + "loss": 0.836, + "step": 172 + }, + { + "epoch": 0.04751441911562757, + "grad_norm": 1.1688711643218994, + "learning_rate": 4.9998609750696996e-06, + "loss": 0.7193, + "step": 173 + }, + { + "epoch": 0.04778906893710519, + "grad_norm": 1.184204339981079, + "learning_rate": 4.999857140115652e-06, + "loss": 0.7362, + "step": 174 + }, + { + "epoch": 0.048063718758582806, + "grad_norm": 1.3526127338409424, + "learning_rate": 4.999853252987361e-06, + "loss": 0.8034, + "step": 175 + }, + { + "epoch": 0.04833836858006042, + "grad_norm": 1.2775675058364868, + "learning_rate": 4.999849313684908e-06, + "loss": 0.8448, + "step": 176 + }, + { + "epoch": 0.04861301840153804, + "grad_norm": 1.3756133317947388, + "learning_rate": 4.9998453222083725e-06, + "loss": 0.7639, + "step": 177 + }, + { + "epoch": 0.04888766822301566, + "grad_norm": 1.339599847793579, + "learning_rate": 4.99984127855784e-06, + "loss": 0.8034, + "step": 178 + }, + { + "epoch": 0.049162318044493274, + "grad_norm": 1.324597716331482, + "learning_rate": 4.999837182733394e-06, + "loss": 0.7611, + "step": 179 + }, + { + "epoch": 0.04943696786597089, + "grad_norm": 1.2013407945632935, + "learning_rate": 4.99983303473512e-06, + "loss": 0.8105, + "step": 180 + }, + { + "epoch": 0.0497116176874485, + "grad_norm": 1.2116559743881226, + "learning_rate": 4.999828834563104e-06, + "loss": 0.7883, + "step": 181 + }, + { + "epoch": 0.04998626750892612, + "grad_norm": 1.3068891763687134, + "learning_rate": 4.999824582217436e-06, + "loss": 0.7876, + "step": 182 + }, + { + "epoch": 0.050260917330403734, + "grad_norm": 1.2657365798950195, + "learning_rate": 4.999820277698202e-06, + "loss": 0.7983, + "step": 183 + }, + { + "epoch": 0.05053556715188135, + "grad_norm": 1.2249605655670166, + "learning_rate": 4.9998159210054935e-06, + "loss": 0.7922, + "step": 184 + }, + { + "epoch": 0.05081021697335897, + "grad_norm": 1.1934384107589722, + "learning_rate": 4.999811512139401e-06, + "loss": 0.7818, + "step": 185 + }, + { + "epoch": 0.051084866794836585, + "grad_norm": 1.2109594345092773, + "learning_rate": 4.999807051100016e-06, + "loss": 0.7939, + "step": 186 + }, + { + "epoch": 0.0513595166163142, + "grad_norm": 1.2157264947891235, + "learning_rate": 4.999802537887432e-06, + "loss": 0.7646, + "step": 187 + }, + { + "epoch": 0.05163416643779182, + "grad_norm": 1.2333133220672607, + "learning_rate": 4.999797972501744e-06, + "loss": 0.7535, + "step": 188 + }, + { + "epoch": 0.05190881625926943, + "grad_norm": 1.2226083278656006, + "learning_rate": 4.999793354943046e-06, + "loss": 0.8034, + "step": 189 + }, + { + "epoch": 0.052183466080747046, + "grad_norm": 1.332330346107483, + "learning_rate": 4.999788685211435e-06, + "loss": 0.787, + "step": 190 + }, + { + "epoch": 0.05245811590222466, + "grad_norm": 1.2435044050216675, + "learning_rate": 4.9997839633070074e-06, + "loss": 0.7908, + "step": 191 + }, + { + "epoch": 0.05273276572370228, + "grad_norm": 1.2167327404022217, + "learning_rate": 4.999779189229864e-06, + "loss": 0.7611, + "step": 192 + }, + { + "epoch": 0.053007415545179896, + "grad_norm": 1.3099274635314941, + "learning_rate": 4.9997743629801034e-06, + "loss": 0.8185, + "step": 193 + }, + { + "epoch": 0.05328206536665751, + "grad_norm": 1.1814327239990234, + "learning_rate": 4.999769484557825e-06, + "loss": 0.7875, + "step": 194 + }, + { + "epoch": 0.05355671518813513, + "grad_norm": 1.2629411220550537, + "learning_rate": 4.999764553963132e-06, + "loss": 0.7925, + "step": 195 + }, + { + "epoch": 0.05383136500961275, + "grad_norm": 1.2540007829666138, + "learning_rate": 4.999759571196127e-06, + "loss": 0.754, + "step": 196 + }, + { + "epoch": 0.05410601483109036, + "grad_norm": 1.1910524368286133, + "learning_rate": 4.999754536256915e-06, + "loss": 0.8137, + "step": 197 + }, + { + "epoch": 0.054380664652567974, + "grad_norm": 1.2434831857681274, + "learning_rate": 4.999749449145598e-06, + "loss": 0.7473, + "step": 198 + }, + { + "epoch": 0.05465531447404559, + "grad_norm": 1.28395414352417, + "learning_rate": 4.999744309862286e-06, + "loss": 0.8138, + "step": 199 + }, + { + "epoch": 0.05492996429552321, + "grad_norm": 1.2323349714279175, + "learning_rate": 4.999739118407084e-06, + "loss": 0.7618, + "step": 200 + }, + { + "epoch": 0.055204614117000825, + "grad_norm": 1.1572526693344116, + "learning_rate": 4.999733874780101e-06, + "loss": 0.7321, + "step": 201 + }, + { + "epoch": 0.05547926393847844, + "grad_norm": 1.232300877571106, + "learning_rate": 4.999728578981445e-06, + "loss": 0.8114, + "step": 202 + }, + { + "epoch": 0.05575391375995606, + "grad_norm": 1.285295844078064, + "learning_rate": 4.999723231011229e-06, + "loss": 0.7785, + "step": 203 + }, + { + "epoch": 0.056028563581433675, + "grad_norm": 1.422584891319275, + "learning_rate": 4.9997178308695634e-06, + "loss": 0.8054, + "step": 204 + }, + { + "epoch": 0.056303213402911285, + "grad_norm": 1.2680491209030151, + "learning_rate": 4.999712378556561e-06, + "loss": 0.7788, + "step": 205 + }, + { + "epoch": 0.0565778632243889, + "grad_norm": 1.31239652633667, + "learning_rate": 4.999706874072336e-06, + "loss": 0.8043, + "step": 206 + }, + { + "epoch": 0.05685251304586652, + "grad_norm": 1.2618324756622314, + "learning_rate": 4.999701317417002e-06, + "loss": 0.7962, + "step": 207 + }, + { + "epoch": 0.057127162867344136, + "grad_norm": 1.231039047241211, + "learning_rate": 4.999695708590676e-06, + "loss": 0.7711, + "step": 208 + }, + { + "epoch": 0.05740181268882175, + "grad_norm": 1.241103172302246, + "learning_rate": 4.999690047593474e-06, + "loss": 0.7424, + "step": 209 + }, + { + "epoch": 0.05767646251029937, + "grad_norm": 1.2723441123962402, + "learning_rate": 4.999684334425516e-06, + "loss": 0.7333, + "step": 210 + }, + { + "epoch": 0.05795111233177699, + "grad_norm": 1.2614531517028809, + "learning_rate": 4.999678569086921e-06, + "loss": 0.7858, + "step": 211 + }, + { + "epoch": 0.058225762153254604, + "grad_norm": 1.3542531728744507, + "learning_rate": 4.9996727515778075e-06, + "loss": 0.8102, + "step": 212 + }, + { + "epoch": 0.058500411974732214, + "grad_norm": 1.2611678838729858, + "learning_rate": 4.999666881898298e-06, + "loss": 0.7807, + "step": 213 + }, + { + "epoch": 0.05877506179620983, + "grad_norm": 1.2868565320968628, + "learning_rate": 4.999660960048515e-06, + "loss": 0.7891, + "step": 214 + }, + { + "epoch": 0.05904971161768745, + "grad_norm": 1.2528905868530273, + "learning_rate": 4.999654986028583e-06, + "loss": 0.7529, + "step": 215 + }, + { + "epoch": 0.059324361439165064, + "grad_norm": 1.2623648643493652, + "learning_rate": 4.999648959838624e-06, + "loss": 0.818, + "step": 216 + }, + { + "epoch": 0.05959901126064268, + "grad_norm": 1.2941478490829468, + "learning_rate": 4.999642881478767e-06, + "loss": 0.8356, + "step": 217 + }, + { + "epoch": 0.0598736610821203, + "grad_norm": 1.2421194314956665, + "learning_rate": 4.999636750949137e-06, + "loss": 0.8091, + "step": 218 + }, + { + "epoch": 0.060148310903597915, + "grad_norm": 1.1763725280761719, + "learning_rate": 4.999630568249862e-06, + "loss": 0.6345, + "step": 219 + }, + { + "epoch": 0.06042296072507553, + "grad_norm": 1.2015634775161743, + "learning_rate": 4.999624333381072e-06, + "loss": 0.7977, + "step": 220 + }, + { + "epoch": 0.06069761054655314, + "grad_norm": 1.3325607776641846, + "learning_rate": 4.999618046342897e-06, + "loss": 0.8061, + "step": 221 + }, + { + "epoch": 0.06097226036803076, + "grad_norm": 1.1898705959320068, + "learning_rate": 4.999611707135467e-06, + "loss": 0.7657, + "step": 222 + }, + { + "epoch": 0.061246910189508376, + "grad_norm": 1.1627925634384155, + "learning_rate": 4.999605315758915e-06, + "loss": 0.7218, + "step": 223 + }, + { + "epoch": 0.06152156001098599, + "grad_norm": 1.268940806388855, + "learning_rate": 4.999598872213374e-06, + "loss": 0.8005, + "step": 224 + }, + { + "epoch": 0.06179620983246361, + "grad_norm": 1.2380216121673584, + "learning_rate": 4.999592376498979e-06, + "loss": 0.7491, + "step": 225 + }, + { + "epoch": 0.06207085965394123, + "grad_norm": 1.2676395177841187, + "learning_rate": 4.999585828615867e-06, + "loss": 0.782, + "step": 226 + }, + { + "epoch": 0.062345509475418844, + "grad_norm": 1.2441015243530273, + "learning_rate": 4.999579228564172e-06, + "loss": 0.7656, + "step": 227 + }, + { + "epoch": 0.06262015929689646, + "grad_norm": 1.2689790725708008, + "learning_rate": 4.999572576344033e-06, + "loss": 0.7884, + "step": 228 + }, + { + "epoch": 0.06289480911837407, + "grad_norm": 1.1827219724655151, + "learning_rate": 4.999565871955589e-06, + "loss": 0.7199, + "step": 229 + }, + { + "epoch": 0.0631694589398517, + "grad_norm": 1.2791718244552612, + "learning_rate": 4.999559115398978e-06, + "loss": 0.7696, + "step": 230 + }, + { + "epoch": 0.0634441087613293, + "grad_norm": 1.3330812454223633, + "learning_rate": 4.999552306674345e-06, + "loss": 0.7684, + "step": 231 + }, + { + "epoch": 0.06371875858280693, + "grad_norm": 1.3429521322250366, + "learning_rate": 4.999545445781829e-06, + "loss": 0.7533, + "step": 232 + }, + { + "epoch": 0.06399340840428454, + "grad_norm": 1.1584880352020264, + "learning_rate": 4.9995385327215736e-06, + "loss": 0.7521, + "step": 233 + }, + { + "epoch": 0.06426805822576215, + "grad_norm": 1.304396629333496, + "learning_rate": 4.999531567493724e-06, + "loss": 0.7727, + "step": 234 + }, + { + "epoch": 0.06454270804723977, + "grad_norm": 1.3097283840179443, + "learning_rate": 4.999524550098425e-06, + "loss": 0.808, + "step": 235 + }, + { + "epoch": 0.06481735786871738, + "grad_norm": 1.2994967699050903, + "learning_rate": 4.999517480535822e-06, + "loss": 0.7836, + "step": 236 + }, + { + "epoch": 0.065092007690195, + "grad_norm": 1.262601375579834, + "learning_rate": 4.999510358806064e-06, + "loss": 0.7462, + "step": 237 + }, + { + "epoch": 0.06536665751167262, + "grad_norm": 1.302114486694336, + "learning_rate": 4.9995031849093e-06, + "loss": 0.7432, + "step": 238 + }, + { + "epoch": 0.06564130733315024, + "grad_norm": 1.2673075199127197, + "learning_rate": 4.9994959588456795e-06, + "loss": 0.8095, + "step": 239 + }, + { + "epoch": 0.06591595715462785, + "grad_norm": 1.3398888111114502, + "learning_rate": 4.999488680615351e-06, + "loss": 0.7503, + "step": 240 + }, + { + "epoch": 0.06619060697610546, + "grad_norm": 1.1982496976852417, + "learning_rate": 4.99948135021847e-06, + "loss": 0.746, + "step": 241 + }, + { + "epoch": 0.06646525679758308, + "grad_norm": 1.132543921470642, + "learning_rate": 4.999473967655186e-06, + "loss": 0.6858, + "step": 242 + }, + { + "epoch": 0.0667399066190607, + "grad_norm": 1.2033439874649048, + "learning_rate": 4.9994665329256565e-06, + "loss": 0.8114, + "step": 243 + }, + { + "epoch": 0.06701455644053832, + "grad_norm": 1.2752856016159058, + "learning_rate": 4.999459046030034e-06, + "loss": 0.7618, + "step": 244 + }, + { + "epoch": 0.06728920626201593, + "grad_norm": 1.3224573135375977, + "learning_rate": 4.999451506968476e-06, + "loss": 0.8027, + "step": 245 + }, + { + "epoch": 0.06756385608349355, + "grad_norm": 1.2776234149932861, + "learning_rate": 4.9994439157411395e-06, + "loss": 0.7992, + "step": 246 + }, + { + "epoch": 0.06783850590497116, + "grad_norm": 1.2867803573608398, + "learning_rate": 4.9994362723481824e-06, + "loss": 0.7542, + "step": 247 + }, + { + "epoch": 0.06811315572644878, + "grad_norm": 1.2823057174682617, + "learning_rate": 4.999428576789766e-06, + "loss": 0.7874, + "step": 248 + }, + { + "epoch": 0.0683878055479264, + "grad_norm": 1.2255151271820068, + "learning_rate": 4.999420829066049e-06, + "loss": 0.7905, + "step": 249 + }, + { + "epoch": 0.068662455369404, + "grad_norm": 1.4265819787979126, + "learning_rate": 4.999413029177194e-06, + "loss": 0.8087, + "step": 250 + }, + { + "epoch": 0.06893710519088163, + "grad_norm": 1.279026985168457, + "learning_rate": 4.999405177123364e-06, + "loss": 0.8071, + "step": 251 + }, + { + "epoch": 0.06921175501235924, + "grad_norm": 1.2843581438064575, + "learning_rate": 4.999397272904722e-06, + "loss": 0.7832, + "step": 252 + }, + { + "epoch": 0.06948640483383686, + "grad_norm": 1.306362509727478, + "learning_rate": 4.999389316521433e-06, + "loss": 0.7642, + "step": 253 + }, + { + "epoch": 0.06976105465531447, + "grad_norm": 1.2689117193222046, + "learning_rate": 4.999381307973664e-06, + "loss": 0.749, + "step": 254 + }, + { + "epoch": 0.0700357044767921, + "grad_norm": 1.2946871519088745, + "learning_rate": 4.999373247261582e-06, + "loss": 0.8087, + "step": 255 + }, + { + "epoch": 0.0703103542982697, + "grad_norm": 1.2030479907989502, + "learning_rate": 4.999365134385355e-06, + "loss": 0.7691, + "step": 256 + }, + { + "epoch": 0.07058500411974732, + "grad_norm": 1.2256243228912354, + "learning_rate": 4.999356969345152e-06, + "loss": 0.7724, + "step": 257 + }, + { + "epoch": 0.07085965394122494, + "grad_norm": 1.3268071413040161, + "learning_rate": 4.999348752141143e-06, + "loss": 0.7586, + "step": 258 + }, + { + "epoch": 0.07113430376270255, + "grad_norm": 1.203253149986267, + "learning_rate": 4.999340482773502e-06, + "loss": 0.7261, + "step": 259 + }, + { + "epoch": 0.07140895358418017, + "grad_norm": 1.239134430885315, + "learning_rate": 4.999332161242398e-06, + "loss": 0.8263, + "step": 260 + }, + { + "epoch": 0.07168360340565778, + "grad_norm": 1.2585619688034058, + "learning_rate": 4.999323787548007e-06, + "loss": 0.7722, + "step": 261 + }, + { + "epoch": 0.07195825322713541, + "grad_norm": 1.329383134841919, + "learning_rate": 4.999315361690504e-06, + "loss": 0.7504, + "step": 262 + }, + { + "epoch": 0.07223290304861302, + "grad_norm": 1.2525564432144165, + "learning_rate": 4.999306883670062e-06, + "loss": 0.7434, + "step": 263 + }, + { + "epoch": 0.07250755287009064, + "grad_norm": 1.2268863916397095, + "learning_rate": 4.999298353486861e-06, + "loss": 0.7531, + "step": 264 + }, + { + "epoch": 0.07278220269156825, + "grad_norm": 1.3073698282241821, + "learning_rate": 4.999289771141078e-06, + "loss": 0.7654, + "step": 265 + }, + { + "epoch": 0.07305685251304586, + "grad_norm": 1.2766711711883545, + "learning_rate": 4.999281136632893e-06, + "loss": 0.7417, + "step": 266 + }, + { + "epoch": 0.07333150233452349, + "grad_norm": 1.2614768743515015, + "learning_rate": 4.9992724499624844e-06, + "loss": 0.7842, + "step": 267 + }, + { + "epoch": 0.0736061521560011, + "grad_norm": 1.2347041368484497, + "learning_rate": 4.999263711130035e-06, + "loss": 0.7689, + "step": 268 + }, + { + "epoch": 0.07388080197747872, + "grad_norm": 1.2907088994979858, + "learning_rate": 4.999254920135726e-06, + "loss": 0.7382, + "step": 269 + }, + { + "epoch": 0.07415545179895633, + "grad_norm": 1.2646888494491577, + "learning_rate": 4.999246076979741e-06, + "loss": 0.7612, + "step": 270 + }, + { + "epoch": 0.07443010162043395, + "grad_norm": 1.2067524194717407, + "learning_rate": 4.999237181662267e-06, + "loss": 0.7288, + "step": 271 + }, + { + "epoch": 0.07470475144191156, + "grad_norm": 1.2424508333206177, + "learning_rate": 4.999228234183486e-06, + "loss": 0.7624, + "step": 272 + }, + { + "epoch": 0.07497940126338917, + "grad_norm": 1.430173397064209, + "learning_rate": 4.999219234543587e-06, + "loss": 0.8062, + "step": 273 + }, + { + "epoch": 0.0752540510848668, + "grad_norm": 1.2682491540908813, + "learning_rate": 4.9992101827427575e-06, + "loss": 0.7912, + "step": 274 + }, + { + "epoch": 0.0755287009063444, + "grad_norm": 1.307724952697754, + "learning_rate": 4.9992010787811865e-06, + "loss": 0.7962, + "step": 275 + }, + { + "epoch": 0.07580335072782203, + "grad_norm": 1.3073571920394897, + "learning_rate": 4.999191922659062e-06, + "loss": 0.7414, + "step": 276 + }, + { + "epoch": 0.07607800054929964, + "grad_norm": 1.2337167263031006, + "learning_rate": 4.999182714376578e-06, + "loss": 0.7946, + "step": 277 + }, + { + "epoch": 0.07635265037077726, + "grad_norm": 1.3415669202804565, + "learning_rate": 4.999173453933926e-06, + "loss": 0.7743, + "step": 278 + }, + { + "epoch": 0.07662730019225487, + "grad_norm": 1.2586348056793213, + "learning_rate": 4.999164141331299e-06, + "loss": 0.7775, + "step": 279 + }, + { + "epoch": 0.0769019500137325, + "grad_norm": 1.205263614654541, + "learning_rate": 4.99915477656889e-06, + "loss": 0.7261, + "step": 280 + }, + { + "epoch": 0.07717659983521011, + "grad_norm": 1.2564035654067993, + "learning_rate": 4.999145359646896e-06, + "loss": 0.7494, + "step": 281 + }, + { + "epoch": 0.07745124965668772, + "grad_norm": 1.213661789894104, + "learning_rate": 4.999135890565514e-06, + "loss": 0.738, + "step": 282 + }, + { + "epoch": 0.07772589947816534, + "grad_norm": 1.2392808198928833, + "learning_rate": 4.999126369324939e-06, + "loss": 0.7371, + "step": 283 + }, + { + "epoch": 0.07800054929964295, + "grad_norm": 1.3007763624191284, + "learning_rate": 4.999116795925373e-06, + "loss": 0.7398, + "step": 284 + }, + { + "epoch": 0.07827519912112058, + "grad_norm": 1.4160560369491577, + "learning_rate": 4.999107170367014e-06, + "loss": 0.761, + "step": 285 + }, + { + "epoch": 0.07854984894259819, + "grad_norm": 1.1680339574813843, + "learning_rate": 4.999097492650062e-06, + "loss": 0.7254, + "step": 286 + }, + { + "epoch": 0.07882449876407581, + "grad_norm": 1.3414337635040283, + "learning_rate": 4.999087762774721e-06, + "loss": 0.7921, + "step": 287 + }, + { + "epoch": 0.07909914858555342, + "grad_norm": 1.2688343524932861, + "learning_rate": 4.999077980741194e-06, + "loss": 0.8083, + "step": 288 + }, + { + "epoch": 0.07937379840703103, + "grad_norm": 1.3309379816055298, + "learning_rate": 4.999068146549684e-06, + "loss": 0.7837, + "step": 289 + }, + { + "epoch": 0.07964844822850865, + "grad_norm": 1.2268614768981934, + "learning_rate": 4.999058260200396e-06, + "loss": 0.7245, + "step": 290 + }, + { + "epoch": 0.07992309804998626, + "grad_norm": 1.1954363584518433, + "learning_rate": 4.999048321693537e-06, + "loss": 0.735, + "step": 291 + }, + { + "epoch": 0.08019774787146389, + "grad_norm": 1.3033727407455444, + "learning_rate": 4.999038331029316e-06, + "loss": 0.8043, + "step": 292 + }, + { + "epoch": 0.0804723976929415, + "grad_norm": 1.2318463325500488, + "learning_rate": 4.999028288207938e-06, + "loss": 0.7605, + "step": 293 + }, + { + "epoch": 0.08074704751441912, + "grad_norm": 1.2270946502685547, + "learning_rate": 4.999018193229615e-06, + "loss": 0.8124, + "step": 294 + }, + { + "epoch": 0.08102169733589673, + "grad_norm": 1.2937095165252686, + "learning_rate": 4.999008046094558e-06, + "loss": 0.7777, + "step": 295 + }, + { + "epoch": 0.08129634715737435, + "grad_norm": 1.241363763809204, + "learning_rate": 4.998997846802976e-06, + "loss": 0.7521, + "step": 296 + }, + { + "epoch": 0.08157099697885196, + "grad_norm": 1.235102653503418, + "learning_rate": 4.998987595355086e-06, + "loss": 0.7577, + "step": 297 + }, + { + "epoch": 0.08184564680032957, + "grad_norm": 1.2663071155548096, + "learning_rate": 4.9989772917510995e-06, + "loss": 0.8156, + "step": 298 + }, + { + "epoch": 0.0821202966218072, + "grad_norm": 1.3150992393493652, + "learning_rate": 4.998966935991231e-06, + "loss": 0.7363, + "step": 299 + }, + { + "epoch": 0.08239494644328481, + "grad_norm": 1.179703950881958, + "learning_rate": 4.998956528075698e-06, + "loss": 0.7119, + "step": 300 + }, + { + "epoch": 0.08266959626476243, + "grad_norm": 1.1894605159759521, + "learning_rate": 4.998946068004717e-06, + "loss": 0.7132, + "step": 301 + }, + { + "epoch": 0.08294424608624004, + "grad_norm": 1.1688392162322998, + "learning_rate": 4.998935555778506e-06, + "loss": 0.7019, + "step": 302 + }, + { + "epoch": 0.08321889590771767, + "grad_norm": 1.2806402444839478, + "learning_rate": 4.998924991397286e-06, + "loss": 0.8058, + "step": 303 + }, + { + "epoch": 0.08349354572919528, + "grad_norm": 1.2150392532348633, + "learning_rate": 4.998914374861275e-06, + "loss": 0.7192, + "step": 304 + }, + { + "epoch": 0.08376819555067289, + "grad_norm": 1.2211799621582031, + "learning_rate": 4.998903706170698e-06, + "loss": 0.7622, + "step": 305 + }, + { + "epoch": 0.08404284537215051, + "grad_norm": 1.2335838079452515, + "learning_rate": 4.998892985325775e-06, + "loss": 0.7758, + "step": 306 + }, + { + "epoch": 0.08431749519362812, + "grad_norm": 1.19437575340271, + "learning_rate": 4.998882212326729e-06, + "loss": 0.7146, + "step": 307 + }, + { + "epoch": 0.08459214501510574, + "grad_norm": 1.2008423805236816, + "learning_rate": 4.998871387173788e-06, + "loss": 0.7262, + "step": 308 + }, + { + "epoch": 0.08486679483658335, + "grad_norm": 1.3043582439422607, + "learning_rate": 4.998860509867175e-06, + "loss": 0.7128, + "step": 309 + }, + { + "epoch": 0.08514144465806098, + "grad_norm": 1.2325835227966309, + "learning_rate": 4.998849580407119e-06, + "loss": 0.7074, + "step": 310 + }, + { + "epoch": 0.08541609447953859, + "grad_norm": 1.2925136089324951, + "learning_rate": 4.998838598793847e-06, + "loss": 0.7865, + "step": 311 + }, + { + "epoch": 0.08569074430101621, + "grad_norm": 1.189629316329956, + "learning_rate": 4.998827565027588e-06, + "loss": 0.7446, + "step": 312 + }, + { + "epoch": 0.08596539412249382, + "grad_norm": 1.2991821765899658, + "learning_rate": 4.998816479108574e-06, + "loss": 0.801, + "step": 313 + }, + { + "epoch": 0.08624004394397143, + "grad_norm": 1.2367357015609741, + "learning_rate": 4.998805341037034e-06, + "loss": 0.7624, + "step": 314 + }, + { + "epoch": 0.08651469376544905, + "grad_norm": 1.3648698329925537, + "learning_rate": 4.998794150813202e-06, + "loss": 0.7809, + "step": 315 + }, + { + "epoch": 0.08678934358692666, + "grad_norm": 1.3204762935638428, + "learning_rate": 4.998782908437311e-06, + "loss": 0.8026, + "step": 316 + }, + { + "epoch": 0.08706399340840429, + "grad_norm": 1.263836145401001, + "learning_rate": 4.998771613909596e-06, + "loss": 0.7646, + "step": 317 + }, + { + "epoch": 0.0873386432298819, + "grad_norm": 1.2400919198989868, + "learning_rate": 4.9987602672302925e-06, + "loss": 0.7022, + "step": 318 + }, + { + "epoch": 0.08761329305135952, + "grad_norm": 1.2299655675888062, + "learning_rate": 4.998748868399638e-06, + "loss": 0.7387, + "step": 319 + }, + { + "epoch": 0.08788794287283713, + "grad_norm": 1.1651661396026611, + "learning_rate": 4.998737417417868e-06, + "loss": 0.71, + "step": 320 + }, + { + "epoch": 0.08816259269431474, + "grad_norm": 1.2628899812698364, + "learning_rate": 4.9987259142852255e-06, + "loss": 0.7454, + "step": 321 + }, + { + "epoch": 0.08843724251579237, + "grad_norm": 1.2806248664855957, + "learning_rate": 4.998714359001947e-06, + "loss": 0.7379, + "step": 322 + }, + { + "epoch": 0.08871189233726998, + "grad_norm": 1.3419311046600342, + "learning_rate": 4.998702751568275e-06, + "loss": 0.8354, + "step": 323 + }, + { + "epoch": 0.0889865421587476, + "grad_norm": 1.2791731357574463, + "learning_rate": 4.998691091984451e-06, + "loss": 0.7774, + "step": 324 + }, + { + "epoch": 0.08926119198022521, + "grad_norm": 1.2205255031585693, + "learning_rate": 4.99867938025072e-06, + "loss": 0.7421, + "step": 325 + }, + { + "epoch": 0.08953584180170283, + "grad_norm": 1.3013678789138794, + "learning_rate": 4.998667616367326e-06, + "loss": 0.7416, + "step": 326 + }, + { + "epoch": 0.08981049162318044, + "grad_norm": 1.2318803071975708, + "learning_rate": 4.998655800334514e-06, + "loss": 0.6776, + "step": 327 + }, + { + "epoch": 0.09008514144465807, + "grad_norm": 1.2002317905426025, + "learning_rate": 4.998643932152531e-06, + "loss": 0.7227, + "step": 328 + }, + { + "epoch": 0.09035979126613568, + "grad_norm": 1.3411955833435059, + "learning_rate": 4.998632011821623e-06, + "loss": 0.7965, + "step": 329 + }, + { + "epoch": 0.09063444108761329, + "grad_norm": 1.2441884279251099, + "learning_rate": 4.998620039342041e-06, + "loss": 0.7653, + "step": 330 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.2387334108352661, + "learning_rate": 4.998608014714034e-06, + "loss": 0.7502, + "step": 331 + }, + { + "epoch": 0.09118374073056852, + "grad_norm": 1.2530438899993896, + "learning_rate": 4.9985959379378525e-06, + "loss": 0.7405, + "step": 332 + }, + { + "epoch": 0.09145839055204615, + "grad_norm": 1.2783328294754028, + "learning_rate": 4.99858380901375e-06, + "loss": 0.7502, + "step": 333 + }, + { + "epoch": 0.09173304037352376, + "grad_norm": 1.1981538534164429, + "learning_rate": 4.998571627941978e-06, + "loss": 0.7656, + "step": 334 + }, + { + "epoch": 0.09200769019500138, + "grad_norm": 1.1556609869003296, + "learning_rate": 4.998559394722792e-06, + "loss": 0.6623, + "step": 335 + }, + { + "epoch": 0.09228234001647899, + "grad_norm": 1.2426846027374268, + "learning_rate": 4.998547109356446e-06, + "loss": 0.6807, + "step": 336 + }, + { + "epoch": 0.0925569898379566, + "grad_norm": 1.2712539434432983, + "learning_rate": 4.998534771843197e-06, + "loss": 0.753, + "step": 337 + }, + { + "epoch": 0.09283163965943422, + "grad_norm": 1.254359483718872, + "learning_rate": 4.998522382183304e-06, + "loss": 0.682, + "step": 338 + }, + { + "epoch": 0.09310628948091183, + "grad_norm": 1.3130810260772705, + "learning_rate": 4.9985099403770226e-06, + "loss": 0.7916, + "step": 339 + }, + { + "epoch": 0.09338093930238946, + "grad_norm": 1.2580945491790771, + "learning_rate": 4.998497446424615e-06, + "loss": 0.7711, + "step": 340 + }, + { + "epoch": 0.09365558912386707, + "grad_norm": 1.2058385610580444, + "learning_rate": 4.998484900326341e-06, + "loss": 0.7618, + "step": 341 + }, + { + "epoch": 0.09393023894534469, + "grad_norm": 1.3217735290527344, + "learning_rate": 4.998472302082462e-06, + "loss": 0.7732, + "step": 342 + }, + { + "epoch": 0.0942048887668223, + "grad_norm": 1.2745957374572754, + "learning_rate": 4.998459651693241e-06, + "loss": 0.797, + "step": 343 + }, + { + "epoch": 0.09447953858829992, + "grad_norm": 1.244918704032898, + "learning_rate": 4.9984469491589434e-06, + "loss": 0.7367, + "step": 344 + }, + { + "epoch": 0.09475418840977753, + "grad_norm": 1.363944411277771, + "learning_rate": 4.9984341944798344e-06, + "loss": 0.764, + "step": 345 + }, + { + "epoch": 0.09502883823125514, + "grad_norm": 1.3132069110870361, + "learning_rate": 4.998421387656178e-06, + "loss": 0.7494, + "step": 346 + }, + { + "epoch": 0.09530348805273277, + "grad_norm": 1.2018793821334839, + "learning_rate": 4.998408528688242e-06, + "loss": 0.7372, + "step": 347 + }, + { + "epoch": 0.09557813787421038, + "grad_norm": 1.1845868825912476, + "learning_rate": 4.998395617576297e-06, + "loss": 0.724, + "step": 348 + }, + { + "epoch": 0.095852787695688, + "grad_norm": 1.1893272399902344, + "learning_rate": 4.998382654320609e-06, + "loss": 0.6956, + "step": 349 + }, + { + "epoch": 0.09612743751716561, + "grad_norm": 1.2171897888183594, + "learning_rate": 4.998369638921452e-06, + "loss": 0.741, + "step": 350 + }, + { + "epoch": 0.09640208733864324, + "grad_norm": 1.2046420574188232, + "learning_rate": 4.998356571379096e-06, + "loss": 0.7301, + "step": 351 + }, + { + "epoch": 0.09667673716012085, + "grad_norm": 1.3397064208984375, + "learning_rate": 4.998343451693813e-06, + "loss": 0.7771, + "step": 352 + }, + { + "epoch": 0.09695138698159846, + "grad_norm": 1.36711847782135, + "learning_rate": 4.998330279865878e-06, + "loss": 0.7682, + "step": 353 + }, + { + "epoch": 0.09722603680307608, + "grad_norm": 1.2135101556777954, + "learning_rate": 4.998317055895566e-06, + "loss": 0.7561, + "step": 354 + }, + { + "epoch": 0.09750068662455369, + "grad_norm": 1.3456923961639404, + "learning_rate": 4.998303779783153e-06, + "loss": 0.815, + "step": 355 + }, + { + "epoch": 0.09777533644603131, + "grad_norm": 1.211620569229126, + "learning_rate": 4.998290451528915e-06, + "loss": 0.7805, + "step": 356 + }, + { + "epoch": 0.09804998626750892, + "grad_norm": 1.2616722583770752, + "learning_rate": 4.998277071133131e-06, + "loss": 0.7107, + "step": 357 + }, + { + "epoch": 0.09832463608898655, + "grad_norm": 1.3406049013137817, + "learning_rate": 4.99826363859608e-06, + "loss": 0.7213, + "step": 358 + }, + { + "epoch": 0.09859928591046416, + "grad_norm": 1.3037861585617065, + "learning_rate": 4.998250153918041e-06, + "loss": 0.7385, + "step": 359 + }, + { + "epoch": 0.09887393573194178, + "grad_norm": 1.232344627380371, + "learning_rate": 4.998236617099299e-06, + "loss": 0.6772, + "step": 360 + }, + { + "epoch": 0.09914858555341939, + "grad_norm": 1.237783670425415, + "learning_rate": 4.998223028140133e-06, + "loss": 0.7385, + "step": 361 + }, + { + "epoch": 0.099423235374897, + "grad_norm": 1.4490269422531128, + "learning_rate": 4.998209387040829e-06, + "loss": 0.7878, + "step": 362 + }, + { + "epoch": 0.09969788519637462, + "grad_norm": 1.2233551740646362, + "learning_rate": 4.99819569380167e-06, + "loss": 0.7518, + "step": 363 + }, + { + "epoch": 0.09997253501785223, + "grad_norm": 1.2865358591079712, + "learning_rate": 4.998181948422943e-06, + "loss": 0.7221, + "step": 364 + }, + { + "epoch": 0.10024718483932986, + "grad_norm": 1.2435417175292969, + "learning_rate": 4.998168150904934e-06, + "loss": 0.783, + "step": 365 + }, + { + "epoch": 0.10052183466080747, + "grad_norm": 1.2082093954086304, + "learning_rate": 4.998154301247932e-06, + "loss": 0.7372, + "step": 366 + }, + { + "epoch": 0.10079648448228509, + "grad_norm": 1.2715510129928589, + "learning_rate": 4.998140399452225e-06, + "loss": 0.7808, + "step": 367 + }, + { + "epoch": 0.1010711343037627, + "grad_norm": 1.2850587368011475, + "learning_rate": 4.9981264455181035e-06, + "loss": 0.7617, + "step": 368 + }, + { + "epoch": 0.10134578412524031, + "grad_norm": 1.3042892217636108, + "learning_rate": 4.998112439445858e-06, + "loss": 0.7316, + "step": 369 + }, + { + "epoch": 0.10162043394671794, + "grad_norm": 1.253700613975525, + "learning_rate": 4.998098381235783e-06, + "loss": 0.779, + "step": 370 + }, + { + "epoch": 0.10189508376819555, + "grad_norm": 1.260552167892456, + "learning_rate": 4.9980842708881695e-06, + "loss": 0.7156, + "step": 371 + }, + { + "epoch": 0.10216973358967317, + "grad_norm": 1.237147331237793, + "learning_rate": 4.998070108403313e-06, + "loss": 0.793, + "step": 372 + }, + { + "epoch": 0.10244438341115078, + "grad_norm": 1.2182241678237915, + "learning_rate": 4.99805589378151e-06, + "loss": 0.7418, + "step": 373 + }, + { + "epoch": 0.1027190332326284, + "grad_norm": 1.341265082359314, + "learning_rate": 4.998041627023056e-06, + "loss": 0.7954, + "step": 374 + }, + { + "epoch": 0.10299368305410601, + "grad_norm": 1.4243696928024292, + "learning_rate": 4.998027308128248e-06, + "loss": 0.749, + "step": 375 + }, + { + "epoch": 0.10326833287558364, + "grad_norm": 1.2504782676696777, + "learning_rate": 4.998012937097388e-06, + "loss": 0.7089, + "step": 376 + }, + { + "epoch": 0.10354298269706125, + "grad_norm": 1.2202086448669434, + "learning_rate": 4.997998513930772e-06, + "loss": 0.7286, + "step": 377 + }, + { + "epoch": 0.10381763251853886, + "grad_norm": 1.266692876815796, + "learning_rate": 4.997984038628703e-06, + "loss": 0.8013, + "step": 378 + }, + { + "epoch": 0.10409228234001648, + "grad_norm": 1.23798668384552, + "learning_rate": 4.997969511191483e-06, + "loss": 0.7224, + "step": 379 + }, + { + "epoch": 0.10436693216149409, + "grad_norm": 1.2249729633331299, + "learning_rate": 4.997954931619415e-06, + "loss": 0.7368, + "step": 380 + }, + { + "epoch": 0.10464158198297172, + "grad_norm": 1.3429884910583496, + "learning_rate": 4.997940299912803e-06, + "loss": 0.7729, + "step": 381 + }, + { + "epoch": 0.10491623180444933, + "grad_norm": 1.2739571332931519, + "learning_rate": 4.997925616071954e-06, + "loss": 0.735, + "step": 382 + }, + { + "epoch": 0.10519088162592695, + "grad_norm": 1.3422342538833618, + "learning_rate": 4.997910880097173e-06, + "loss": 0.7778, + "step": 383 + }, + { + "epoch": 0.10546553144740456, + "grad_norm": 1.2532919645309448, + "learning_rate": 4.997896091988767e-06, + "loss": 0.741, + "step": 384 + }, + { + "epoch": 0.10574018126888217, + "grad_norm": 1.2357875108718872, + "learning_rate": 4.9978812517470465e-06, + "loss": 0.6794, + "step": 385 + }, + { + "epoch": 0.10601483109035979, + "grad_norm": 1.2855935096740723, + "learning_rate": 4.99786635937232e-06, + "loss": 0.6896, + "step": 386 + }, + { + "epoch": 0.1062894809118374, + "grad_norm": 1.1680147647857666, + "learning_rate": 4.997851414864898e-06, + "loss": 0.7083, + "step": 387 + }, + { + "epoch": 0.10656413073331503, + "grad_norm": 1.258545160293579, + "learning_rate": 4.9978364182250935e-06, + "loss": 0.7549, + "step": 388 + }, + { + "epoch": 0.10683878055479264, + "grad_norm": 1.3306005001068115, + "learning_rate": 4.997821369453218e-06, + "loss": 0.7528, + "step": 389 + }, + { + "epoch": 0.10711343037627026, + "grad_norm": 1.3328379392623901, + "learning_rate": 4.997806268549587e-06, + "loss": 0.7177, + "step": 390 + }, + { + "epoch": 0.10738808019774787, + "grad_norm": 1.1670678853988647, + "learning_rate": 4.997791115514516e-06, + "loss": 0.7267, + "step": 391 + }, + { + "epoch": 0.1076627300192255, + "grad_norm": 1.167024850845337, + "learning_rate": 4.997775910348319e-06, + "loss": 0.7376, + "step": 392 + }, + { + "epoch": 0.1079373798407031, + "grad_norm": 1.2820689678192139, + "learning_rate": 4.997760653051316e-06, + "loss": 0.7822, + "step": 393 + }, + { + "epoch": 0.10821202966218071, + "grad_norm": 1.206555724143982, + "learning_rate": 4.997745343623823e-06, + "loss": 0.7222, + "step": 394 + }, + { + "epoch": 0.10848667948365834, + "grad_norm": 1.289656162261963, + "learning_rate": 4.997729982066161e-06, + "loss": 0.7496, + "step": 395 + }, + { + "epoch": 0.10876132930513595, + "grad_norm": 1.336107611656189, + "learning_rate": 4.997714568378651e-06, + "loss": 0.7783, + "step": 396 + }, + { + "epoch": 0.10903597912661357, + "grad_norm": 1.2819020748138428, + "learning_rate": 4.997699102561613e-06, + "loss": 0.7156, + "step": 397 + }, + { + "epoch": 0.10931062894809118, + "grad_norm": 1.351546049118042, + "learning_rate": 4.997683584615371e-06, + "loss": 0.7789, + "step": 398 + }, + { + "epoch": 0.1095852787695688, + "grad_norm": 1.2483948469161987, + "learning_rate": 4.99766801454025e-06, + "loss": 0.7477, + "step": 399 + }, + { + "epoch": 0.10985992859104642, + "grad_norm": 1.2545384168624878, + "learning_rate": 4.9976523923365725e-06, + "loss": 0.7578, + "step": 400 + }, + { + "epoch": 0.11013457841252403, + "grad_norm": 1.3306173086166382, + "learning_rate": 4.997636718004666e-06, + "loss": 0.7751, + "step": 401 + }, + { + "epoch": 0.11040922823400165, + "grad_norm": 1.2168644666671753, + "learning_rate": 4.997620991544857e-06, + "loss": 0.7107, + "step": 402 + }, + { + "epoch": 0.11068387805547926, + "grad_norm": 1.2717666625976562, + "learning_rate": 4.997605212957475e-06, + "loss": 0.7258, + "step": 403 + }, + { + "epoch": 0.11095852787695688, + "grad_norm": 1.2988327741622925, + "learning_rate": 4.997589382242847e-06, + "loss": 0.7847, + "step": 404 + }, + { + "epoch": 0.1112331776984345, + "grad_norm": 1.2802379131317139, + "learning_rate": 4.9975734994013055e-06, + "loss": 0.7454, + "step": 405 + }, + { + "epoch": 0.11150782751991212, + "grad_norm": 1.2659862041473389, + "learning_rate": 4.997557564433181e-06, + "loss": 0.6773, + "step": 406 + }, + { + "epoch": 0.11178247734138973, + "grad_norm": 1.3774352073669434, + "learning_rate": 4.997541577338807e-06, + "loss": 0.7503, + "step": 407 + }, + { + "epoch": 0.11205712716286735, + "grad_norm": 1.257185697555542, + "learning_rate": 4.997525538118515e-06, + "loss": 0.7005, + "step": 408 + }, + { + "epoch": 0.11233177698434496, + "grad_norm": 1.280836820602417, + "learning_rate": 4.997509446772643e-06, + "loss": 0.7504, + "step": 409 + }, + { + "epoch": 0.11260642680582257, + "grad_norm": 1.2438743114471436, + "learning_rate": 4.997493303301524e-06, + "loss": 0.7504, + "step": 410 + }, + { + "epoch": 0.1128810766273002, + "grad_norm": 1.233466625213623, + "learning_rate": 4.997477107705496e-06, + "loss": 0.7115, + "step": 411 + }, + { + "epoch": 0.1131557264487778, + "grad_norm": 1.3945411443710327, + "learning_rate": 4.997460859984898e-06, + "loss": 0.7836, + "step": 412 + }, + { + "epoch": 0.11343037627025543, + "grad_norm": 1.248413324356079, + "learning_rate": 4.9974445601400676e-06, + "loss": 0.7311, + "step": 413 + }, + { + "epoch": 0.11370502609173304, + "grad_norm": 1.1568094491958618, + "learning_rate": 4.997428208171345e-06, + "loss": 0.6951, + "step": 414 + }, + { + "epoch": 0.11397967591321066, + "grad_norm": 1.2211650609970093, + "learning_rate": 4.997411804079073e-06, + "loss": 0.7185, + "step": 415 + }, + { + "epoch": 0.11425432573468827, + "grad_norm": 1.271201252937317, + "learning_rate": 4.997395347863592e-06, + "loss": 0.7375, + "step": 416 + }, + { + "epoch": 0.11452897555616588, + "grad_norm": 1.2520843744277954, + "learning_rate": 4.9973788395252475e-06, + "loss": 0.7313, + "step": 417 + }, + { + "epoch": 0.1148036253776435, + "grad_norm": 1.3485846519470215, + "learning_rate": 4.997362279064382e-06, + "loss": 0.7476, + "step": 418 + }, + { + "epoch": 0.11507827519912112, + "grad_norm": 1.349035382270813, + "learning_rate": 4.997345666481342e-06, + "loss": 0.7066, + "step": 419 + }, + { + "epoch": 0.11535292502059874, + "grad_norm": 1.2428969144821167, + "learning_rate": 4.997329001776475e-06, + "loss": 0.7618, + "step": 420 + }, + { + "epoch": 0.11562757484207635, + "grad_norm": 1.2747961282730103, + "learning_rate": 4.997312284950128e-06, + "loss": 0.7372, + "step": 421 + }, + { + "epoch": 0.11590222466355397, + "grad_norm": 1.2884197235107422, + "learning_rate": 4.99729551600265e-06, + "loss": 0.7319, + "step": 422 + }, + { + "epoch": 0.11617687448503158, + "grad_norm": 1.2992759943008423, + "learning_rate": 4.997278694934391e-06, + "loss": 0.7508, + "step": 423 + }, + { + "epoch": 0.11645152430650921, + "grad_norm": 1.3822576999664307, + "learning_rate": 4.997261821745702e-06, + "loss": 0.7185, + "step": 424 + }, + { + "epoch": 0.11672617412798682, + "grad_norm": 1.1869926452636719, + "learning_rate": 4.997244896436936e-06, + "loss": 0.6604, + "step": 425 + }, + { + "epoch": 0.11700082394946443, + "grad_norm": 1.2719181776046753, + "learning_rate": 4.9972279190084444e-06, + "loss": 0.7501, + "step": 426 + }, + { + "epoch": 0.11727547377094205, + "grad_norm": 1.2562122344970703, + "learning_rate": 4.997210889460583e-06, + "loss": 0.7602, + "step": 427 + }, + { + "epoch": 0.11755012359241966, + "grad_norm": 1.242262840270996, + "learning_rate": 4.997193807793707e-06, + "loss": 0.7199, + "step": 428 + }, + { + "epoch": 0.11782477341389729, + "grad_norm": 1.2422415018081665, + "learning_rate": 4.997176674008172e-06, + "loss": 0.736, + "step": 429 + }, + { + "epoch": 0.1180994232353749, + "grad_norm": 1.3194923400878906, + "learning_rate": 4.997159488104337e-06, + "loss": 0.7424, + "step": 430 + }, + { + "epoch": 0.11837407305685252, + "grad_norm": 1.281875491142273, + "learning_rate": 4.997142250082561e-06, + "loss": 0.7748, + "step": 431 + }, + { + "epoch": 0.11864872287833013, + "grad_norm": 1.2036486864089966, + "learning_rate": 4.997124959943201e-06, + "loss": 0.6856, + "step": 432 + }, + { + "epoch": 0.11892337269980774, + "grad_norm": 1.2024517059326172, + "learning_rate": 4.997107617686621e-06, + "loss": 0.7524, + "step": 433 + }, + { + "epoch": 0.11919802252128536, + "grad_norm": 1.2705662250518799, + "learning_rate": 4.997090223313181e-06, + "loss": 0.743, + "step": 434 + }, + { + "epoch": 0.11947267234276297, + "grad_norm": 1.3075573444366455, + "learning_rate": 4.997072776823244e-06, + "loss": 0.7316, + "step": 435 + }, + { + "epoch": 0.1197473221642406, + "grad_norm": 1.2316983938217163, + "learning_rate": 4.9970552782171765e-06, + "loss": 0.7231, + "step": 436 + }, + { + "epoch": 0.1200219719857182, + "grad_norm": 1.1771068572998047, + "learning_rate": 4.9970377274953406e-06, + "loss": 0.7078, + "step": 437 + }, + { + "epoch": 0.12029662180719583, + "grad_norm": 1.2357999086380005, + "learning_rate": 4.997020124658104e-06, + "loss": 0.6675, + "step": 438 + }, + { + "epoch": 0.12057127162867344, + "grad_norm": 1.254892110824585, + "learning_rate": 4.997002469705835e-06, + "loss": 0.7801, + "step": 439 + }, + { + "epoch": 0.12084592145015106, + "grad_norm": 1.248659372329712, + "learning_rate": 4.996984762638901e-06, + "loss": 0.7504, + "step": 440 + }, + { + "epoch": 0.12112057127162867, + "grad_norm": 1.2337586879730225, + "learning_rate": 4.996967003457671e-06, + "loss": 0.761, + "step": 441 + }, + { + "epoch": 0.12139522109310628, + "grad_norm": 1.3466196060180664, + "learning_rate": 4.996949192162517e-06, + "loss": 0.7371, + "step": 442 + }, + { + "epoch": 0.12166987091458391, + "grad_norm": 1.205243706703186, + "learning_rate": 4.99693132875381e-06, + "loss": 0.6969, + "step": 443 + }, + { + "epoch": 0.12194452073606152, + "grad_norm": 1.350434422492981, + "learning_rate": 4.996913413231923e-06, + "loss": 0.7081, + "step": 444 + }, + { + "epoch": 0.12221917055753914, + "grad_norm": 1.2508010864257812, + "learning_rate": 4.996895445597229e-06, + "loss": 0.7562, + "step": 445 + }, + { + "epoch": 0.12249382037901675, + "grad_norm": 1.2065321207046509, + "learning_rate": 4.996877425850104e-06, + "loss": 0.7127, + "step": 446 + }, + { + "epoch": 0.12276847020049438, + "grad_norm": 1.305888295173645, + "learning_rate": 4.996859353990925e-06, + "loss": 0.7246, + "step": 447 + }, + { + "epoch": 0.12304312002197199, + "grad_norm": 1.2624449729919434, + "learning_rate": 4.996841230020067e-06, + "loss": 0.7292, + "step": 448 + }, + { + "epoch": 0.1233177698434496, + "grad_norm": 1.3166404962539673, + "learning_rate": 4.99682305393791e-06, + "loss": 0.7843, + "step": 449 + }, + { + "epoch": 0.12359241966492722, + "grad_norm": 1.206982970237732, + "learning_rate": 4.9968048257448335e-06, + "loss": 0.7085, + "step": 450 + }, + { + "epoch": 0.12386706948640483, + "grad_norm": 1.3199633359909058, + "learning_rate": 4.996786545441216e-06, + "loss": 0.7583, + "step": 451 + }, + { + "epoch": 0.12414171930788245, + "grad_norm": 1.2931712865829468, + "learning_rate": 4.9967682130274404e-06, + "loss": 0.7444, + "step": 452 + }, + { + "epoch": 0.12441636912936006, + "grad_norm": 1.2845779657363892, + "learning_rate": 4.996749828503889e-06, + "loss": 0.7304, + "step": 453 + }, + { + "epoch": 0.12469101895083769, + "grad_norm": 1.386226773262024, + "learning_rate": 4.9967313918709455e-06, + "loss": 0.8193, + "step": 454 + }, + { + "epoch": 0.1249656687723153, + "grad_norm": 1.4400932788848877, + "learning_rate": 4.996712903128995e-06, + "loss": 0.7541, + "step": 455 + }, + { + "epoch": 0.12524031859379292, + "grad_norm": 1.3683884143829346, + "learning_rate": 4.996694362278424e-06, + "loss": 0.7578, + "step": 456 + }, + { + "epoch": 0.12551496841527052, + "grad_norm": 1.290042757987976, + "learning_rate": 4.996675769319617e-06, + "loss": 0.7582, + "step": 457 + }, + { + "epoch": 0.12578961823674814, + "grad_norm": 1.1474919319152832, + "learning_rate": 4.9966571242529645e-06, + "loss": 0.6678, + "step": 458 + }, + { + "epoch": 0.12606426805822576, + "grad_norm": 1.26216459274292, + "learning_rate": 4.996638427078855e-06, + "loss": 0.7659, + "step": 459 + }, + { + "epoch": 0.1263389178797034, + "grad_norm": 1.25944983959198, + "learning_rate": 4.996619677797679e-06, + "loss": 0.7505, + "step": 460 + }, + { + "epoch": 0.12661356770118098, + "grad_norm": 1.1706511974334717, + "learning_rate": 4.996600876409826e-06, + "loss": 0.7167, + "step": 461 + }, + { + "epoch": 0.1268882175226586, + "grad_norm": 1.2865626811981201, + "learning_rate": 4.99658202291569e-06, + "loss": 0.7619, + "step": 462 + }, + { + "epoch": 0.12716286734413623, + "grad_norm": 1.248112678527832, + "learning_rate": 4.996563117315665e-06, + "loss": 0.7287, + "step": 463 + }, + { + "epoch": 0.12743751716561386, + "grad_norm": 1.2872062921524048, + "learning_rate": 4.996544159610145e-06, + "loss": 0.7919, + "step": 464 + }, + { + "epoch": 0.12771216698709145, + "grad_norm": 1.2547814846038818, + "learning_rate": 4.9965251497995246e-06, + "loss": 0.7242, + "step": 465 + }, + { + "epoch": 0.12798681680856908, + "grad_norm": 1.4056199789047241, + "learning_rate": 4.996506087884202e-06, + "loss": 0.7431, + "step": 466 + }, + { + "epoch": 0.1282614666300467, + "grad_norm": 1.3475724458694458, + "learning_rate": 4.996486973864575e-06, + "loss": 0.7297, + "step": 467 + }, + { + "epoch": 0.1285361164515243, + "grad_norm": 1.2836741209030151, + "learning_rate": 4.996467807741041e-06, + "loss": 0.6965, + "step": 468 + }, + { + "epoch": 0.12881076627300192, + "grad_norm": 1.2718721628189087, + "learning_rate": 4.996448589514002e-06, + "loss": 0.7871, + "step": 469 + }, + { + "epoch": 0.12908541609447954, + "grad_norm": 1.3450841903686523, + "learning_rate": 4.996429319183856e-06, + "loss": 0.814, + "step": 470 + }, + { + "epoch": 0.12936006591595717, + "grad_norm": 1.21324622631073, + "learning_rate": 4.9964099967510095e-06, + "loss": 0.7357, + "step": 471 + }, + { + "epoch": 0.12963471573743476, + "grad_norm": 1.21219801902771, + "learning_rate": 4.996390622215863e-06, + "loss": 0.7264, + "step": 472 + }, + { + "epoch": 0.1299093655589124, + "grad_norm": 1.3398288488388062, + "learning_rate": 4.996371195578822e-06, + "loss": 0.7559, + "step": 473 + }, + { + "epoch": 0.13018401538039, + "grad_norm": 1.382553219795227, + "learning_rate": 4.9963517168402905e-06, + "loss": 0.7661, + "step": 474 + }, + { + "epoch": 0.1304586652018676, + "grad_norm": 1.295050859451294, + "learning_rate": 4.9963321860006755e-06, + "loss": 0.7499, + "step": 475 + }, + { + "epoch": 0.13073331502334523, + "grad_norm": 1.1735695600509644, + "learning_rate": 4.996312603060386e-06, + "loss": 0.6957, + "step": 476 + }, + { + "epoch": 0.13100796484482286, + "grad_norm": 1.3107273578643799, + "learning_rate": 4.9962929680198306e-06, + "loss": 0.726, + "step": 477 + }, + { + "epoch": 0.13128261466630048, + "grad_norm": 1.1925411224365234, + "learning_rate": 4.996273280879417e-06, + "loss": 0.6994, + "step": 478 + }, + { + "epoch": 0.13155726448777808, + "grad_norm": 1.2793998718261719, + "learning_rate": 4.996253541639557e-06, + "loss": 0.721, + "step": 479 + }, + { + "epoch": 0.1318319143092557, + "grad_norm": 1.2780818939208984, + "learning_rate": 4.996233750300664e-06, + "loss": 0.7226, + "step": 480 + }, + { + "epoch": 0.13210656413073332, + "grad_norm": 1.2908203601837158, + "learning_rate": 4.996213906863149e-06, + "loss": 0.7258, + "step": 481 + }, + { + "epoch": 0.13238121395221092, + "grad_norm": 1.3219084739685059, + "learning_rate": 4.996194011327428e-06, + "loss": 0.7693, + "step": 482 + }, + { + "epoch": 0.13265586377368854, + "grad_norm": 1.2924821376800537, + "learning_rate": 4.996174063693915e-06, + "loss": 0.7433, + "step": 483 + }, + { + "epoch": 0.13293051359516617, + "grad_norm": 1.2904934883117676, + "learning_rate": 4.996154063963027e-06, + "loss": 0.7338, + "step": 484 + }, + { + "epoch": 0.1332051634166438, + "grad_norm": 1.2564016580581665, + "learning_rate": 4.996134012135181e-06, + "loss": 0.769, + "step": 485 + }, + { + "epoch": 0.1334798132381214, + "grad_norm": 1.3121843338012695, + "learning_rate": 4.9961139082107955e-06, + "loss": 0.716, + "step": 486 + }, + { + "epoch": 0.133754463059599, + "grad_norm": 1.3125780820846558, + "learning_rate": 4.996093752190291e-06, + "loss": 0.7363, + "step": 487 + }, + { + "epoch": 0.13402911288107663, + "grad_norm": 1.2501887083053589, + "learning_rate": 4.996073544074087e-06, + "loss": 0.6583, + "step": 488 + }, + { + "epoch": 0.13430376270255423, + "grad_norm": 1.3097083568572998, + "learning_rate": 4.996053283862605e-06, + "loss": 0.6961, + "step": 489 + }, + { + "epoch": 0.13457841252403185, + "grad_norm": 1.280885934829712, + "learning_rate": 4.996032971556269e-06, + "loss": 0.7375, + "step": 490 + }, + { + "epoch": 0.13485306234550948, + "grad_norm": 1.2557544708251953, + "learning_rate": 4.996012607155502e-06, + "loss": 0.7502, + "step": 491 + }, + { + "epoch": 0.1351277121669871, + "grad_norm": 1.2696101665496826, + "learning_rate": 4.99599219066073e-06, + "loss": 0.7061, + "step": 492 + }, + { + "epoch": 0.1354023619884647, + "grad_norm": 1.236847996711731, + "learning_rate": 4.995971722072379e-06, + "loss": 0.7217, + "step": 493 + }, + { + "epoch": 0.13567701180994232, + "grad_norm": 1.2991341352462769, + "learning_rate": 4.995951201390875e-06, + "loss": 0.7802, + "step": 494 + }, + { + "epoch": 0.13595166163141995, + "grad_norm": 1.2087254524230957, + "learning_rate": 4.995930628616647e-06, + "loss": 0.7431, + "step": 495 + }, + { + "epoch": 0.13622631145289757, + "grad_norm": 1.275146722793579, + "learning_rate": 4.995910003750125e-06, + "loss": 0.7517, + "step": 496 + }, + { + "epoch": 0.13650096127437517, + "grad_norm": 1.3168613910675049, + "learning_rate": 4.995889326791738e-06, + "loss": 0.7359, + "step": 497 + }, + { + "epoch": 0.1367756110958528, + "grad_norm": 1.3115369081497192, + "learning_rate": 4.995868597741919e-06, + "loss": 0.7396, + "step": 498 + }, + { + "epoch": 0.1370502609173304, + "grad_norm": 1.3071073293685913, + "learning_rate": 4.995847816601101e-06, + "loss": 0.7566, + "step": 499 + }, + { + "epoch": 0.137324910738808, + "grad_norm": 1.2403035163879395, + "learning_rate": 4.9958269833697145e-06, + "loss": 0.681, + "step": 500 + }, + { + "epoch": 0.13759956056028563, + "grad_norm": 1.2726815938949585, + "learning_rate": 4.995806098048199e-06, + "loss": 0.697, + "step": 501 + }, + { + "epoch": 0.13787421038176326, + "grad_norm": 1.2817356586456299, + "learning_rate": 4.995785160636987e-06, + "loss": 0.757, + "step": 502 + }, + { + "epoch": 0.13814886020324088, + "grad_norm": 1.3180609941482544, + "learning_rate": 4.995764171136516e-06, + "loss": 0.7961, + "step": 503 + }, + { + "epoch": 0.13842351002471848, + "grad_norm": 1.2108051776885986, + "learning_rate": 4.995743129547225e-06, + "loss": 0.7488, + "step": 504 + }, + { + "epoch": 0.1386981598461961, + "grad_norm": 1.2139147520065308, + "learning_rate": 4.995722035869552e-06, + "loss": 0.6681, + "step": 505 + }, + { + "epoch": 0.13897280966767372, + "grad_norm": 1.4002267122268677, + "learning_rate": 4.995700890103939e-06, + "loss": 0.7541, + "step": 506 + }, + { + "epoch": 0.13924745948915132, + "grad_norm": 1.3403345346450806, + "learning_rate": 4.995679692250827e-06, + "loss": 0.7299, + "step": 507 + }, + { + "epoch": 0.13952210931062894, + "grad_norm": 1.2086071968078613, + "learning_rate": 4.995658442310655e-06, + "loss": 0.6736, + "step": 508 + }, + { + "epoch": 0.13979675913210657, + "grad_norm": 1.2888362407684326, + "learning_rate": 4.995637140283872e-06, + "loss": 0.7122, + "step": 509 + }, + { + "epoch": 0.1400714089535842, + "grad_norm": 1.2734389305114746, + "learning_rate": 4.995615786170919e-06, + "loss": 0.7557, + "step": 510 + }, + { + "epoch": 0.1403460587750618, + "grad_norm": 1.2490509748458862, + "learning_rate": 4.9955943799722415e-06, + "loss": 0.7603, + "step": 511 + }, + { + "epoch": 0.1406207085965394, + "grad_norm": 1.238836407661438, + "learning_rate": 4.9955729216882875e-06, + "loss": 0.7558, + "step": 512 + }, + { + "epoch": 0.14089535841801704, + "grad_norm": 1.1885271072387695, + "learning_rate": 4.995551411319504e-06, + "loss": 0.6402, + "step": 513 + }, + { + "epoch": 0.14117000823949463, + "grad_norm": 1.235407829284668, + "learning_rate": 4.9955298488663425e-06, + "loss": 0.7027, + "step": 514 + }, + { + "epoch": 0.14144465806097226, + "grad_norm": 1.3486651182174683, + "learning_rate": 4.99550823432925e-06, + "loss": 0.682, + "step": 515 + }, + { + "epoch": 0.14171930788244988, + "grad_norm": 1.274798035621643, + "learning_rate": 4.9954865677086785e-06, + "loss": 0.7063, + "step": 516 + }, + { + "epoch": 0.1419939577039275, + "grad_norm": 1.2887500524520874, + "learning_rate": 4.99546484900508e-06, + "loss": 0.7585, + "step": 517 + }, + { + "epoch": 0.1422686075254051, + "grad_norm": 1.2268427610397339, + "learning_rate": 4.995443078218909e-06, + "loss": 0.6944, + "step": 518 + }, + { + "epoch": 0.14254325734688272, + "grad_norm": 1.2531949281692505, + "learning_rate": 4.995421255350619e-06, + "loss": 0.7574, + "step": 519 + }, + { + "epoch": 0.14281790716836035, + "grad_norm": 1.2422707080841064, + "learning_rate": 4.995399380400665e-06, + "loss": 0.7374, + "step": 520 + }, + { + "epoch": 0.14309255698983794, + "grad_norm": 1.2118220329284668, + "learning_rate": 4.9953774533695045e-06, + "loss": 0.7385, + "step": 521 + }, + { + "epoch": 0.14336720681131557, + "grad_norm": 1.315832495689392, + "learning_rate": 4.995355474257595e-06, + "loss": 0.7697, + "step": 522 + }, + { + "epoch": 0.1436418566327932, + "grad_norm": 1.2482990026474, + "learning_rate": 4.995333443065394e-06, + "loss": 0.7439, + "step": 523 + }, + { + "epoch": 0.14391650645427082, + "grad_norm": 1.3098465204238892, + "learning_rate": 4.995311359793363e-06, + "loss": 0.7437, + "step": 524 + }, + { + "epoch": 0.1441911562757484, + "grad_norm": 1.280495047569275, + "learning_rate": 4.995289224441962e-06, + "loss": 0.6971, + "step": 525 + }, + { + "epoch": 0.14446580609722603, + "grad_norm": 1.1530063152313232, + "learning_rate": 4.995267037011653e-06, + "loss": 0.7175, + "step": 526 + }, + { + "epoch": 0.14474045591870366, + "grad_norm": 1.2328840494155884, + "learning_rate": 4.9952447975028985e-06, + "loss": 0.7499, + "step": 527 + }, + { + "epoch": 0.14501510574018128, + "grad_norm": 1.2199496030807495, + "learning_rate": 4.995222505916164e-06, + "loss": 0.705, + "step": 528 + }, + { + "epoch": 0.14528975556165888, + "grad_norm": 1.3375369310379028, + "learning_rate": 4.995200162251914e-06, + "loss": 0.72, + "step": 529 + }, + { + "epoch": 0.1455644053831365, + "grad_norm": 1.2791963815689087, + "learning_rate": 4.995177766510615e-06, + "loss": 0.7456, + "step": 530 + }, + { + "epoch": 0.14583905520461413, + "grad_norm": 1.292958378791809, + "learning_rate": 4.995155318692734e-06, + "loss": 0.7082, + "step": 531 + }, + { + "epoch": 0.14611370502609172, + "grad_norm": 1.3329838514328003, + "learning_rate": 4.99513281879874e-06, + "loss": 0.7396, + "step": 532 + }, + { + "epoch": 0.14638835484756935, + "grad_norm": 1.2229316234588623, + "learning_rate": 4.9951102668291025e-06, + "loss": 0.7479, + "step": 533 + }, + { + "epoch": 0.14666300466904697, + "grad_norm": 1.267865777015686, + "learning_rate": 4.995087662784292e-06, + "loss": 0.7364, + "step": 534 + }, + { + "epoch": 0.1469376544905246, + "grad_norm": 1.2301939725875854, + "learning_rate": 4.99506500666478e-06, + "loss": 0.712, + "step": 535 + }, + { + "epoch": 0.1472123043120022, + "grad_norm": 1.237546443939209, + "learning_rate": 4.9950422984710405e-06, + "loss": 0.6775, + "step": 536 + }, + { + "epoch": 0.14748695413347981, + "grad_norm": 1.2265241146087646, + "learning_rate": 4.995019538203546e-06, + "loss": 0.7632, + "step": 537 + }, + { + "epoch": 0.14776160395495744, + "grad_norm": 1.2479978799819946, + "learning_rate": 4.994996725862773e-06, + "loss": 0.6911, + "step": 538 + }, + { + "epoch": 0.14803625377643503, + "grad_norm": 1.2884811162948608, + "learning_rate": 4.994973861449196e-06, + "loss": 0.731, + "step": 539 + }, + { + "epoch": 0.14831090359791266, + "grad_norm": 1.2379366159439087, + "learning_rate": 4.994950944963294e-06, + "loss": 0.7121, + "step": 540 + }, + { + "epoch": 0.14858555341939028, + "grad_norm": 1.379361867904663, + "learning_rate": 4.994927976405543e-06, + "loss": 0.7367, + "step": 541 + }, + { + "epoch": 0.1488602032408679, + "grad_norm": 1.1671173572540283, + "learning_rate": 4.994904955776425e-06, + "loss": 0.6833, + "step": 542 + }, + { + "epoch": 0.1491348530623455, + "grad_norm": 1.2908154726028442, + "learning_rate": 4.994881883076418e-06, + "loss": 0.7096, + "step": 543 + }, + { + "epoch": 0.14940950288382313, + "grad_norm": 1.2105340957641602, + "learning_rate": 4.994858758306006e-06, + "loss": 0.7096, + "step": 544 + }, + { + "epoch": 0.14968415270530075, + "grad_norm": 1.2236354351043701, + "learning_rate": 4.994835581465669e-06, + "loss": 0.7598, + "step": 545 + }, + { + "epoch": 0.14995880252677835, + "grad_norm": 1.2512356042861938, + "learning_rate": 4.994812352555892e-06, + "loss": 0.7322, + "step": 546 + }, + { + "epoch": 0.15023345234825597, + "grad_norm": 1.1933348178863525, + "learning_rate": 4.9947890715771605e-06, + "loss": 0.734, + "step": 547 + }, + { + "epoch": 0.1505081021697336, + "grad_norm": 1.2729922533035278, + "learning_rate": 4.994765738529961e-06, + "loss": 0.6817, + "step": 548 + }, + { + "epoch": 0.15078275199121122, + "grad_norm": 1.1818009614944458, + "learning_rate": 4.994742353414777e-06, + "loss": 0.7145, + "step": 549 + }, + { + "epoch": 0.1510574018126888, + "grad_norm": 1.2586711645126343, + "learning_rate": 4.9947189162321e-06, + "loss": 0.7232, + "step": 550 + }, + { + "epoch": 0.15133205163416644, + "grad_norm": 1.3745049238204956, + "learning_rate": 4.9946954269824175e-06, + "loss": 0.7637, + "step": 551 + }, + { + "epoch": 0.15160670145564406, + "grad_norm": 1.289932131767273, + "learning_rate": 4.99467188566622e-06, + "loss": 0.7871, + "step": 552 + }, + { + "epoch": 0.15188135127712166, + "grad_norm": 1.2599809169769287, + "learning_rate": 4.994648292284e-06, + "loss": 0.7167, + "step": 553 + }, + { + "epoch": 0.15215600109859928, + "grad_norm": 1.1942172050476074, + "learning_rate": 4.994624646836248e-06, + "loss": 0.6728, + "step": 554 + }, + { + "epoch": 0.1524306509200769, + "grad_norm": 1.3477710485458374, + "learning_rate": 4.9946009493234585e-06, + "loss": 0.7695, + "step": 555 + }, + { + "epoch": 0.15270530074155453, + "grad_norm": 1.217524528503418, + "learning_rate": 4.9945771997461255e-06, + "loss": 0.7088, + "step": 556 + }, + { + "epoch": 0.15297995056303212, + "grad_norm": 1.2759534120559692, + "learning_rate": 4.994553398104747e-06, + "loss": 0.7895, + "step": 557 + }, + { + "epoch": 0.15325460038450975, + "grad_norm": 1.2611597776412964, + "learning_rate": 4.994529544399816e-06, + "loss": 0.7306, + "step": 558 + }, + { + "epoch": 0.15352925020598737, + "grad_norm": 1.1870383024215698, + "learning_rate": 4.994505638631834e-06, + "loss": 0.6653, + "step": 559 + }, + { + "epoch": 0.153803900027465, + "grad_norm": 1.2770532369613647, + "learning_rate": 4.994481680801297e-06, + "loss": 0.6767, + "step": 560 + }, + { + "epoch": 0.1540785498489426, + "grad_norm": 1.207006812095642, + "learning_rate": 4.9944576709087064e-06, + "loss": 0.6957, + "step": 561 + }, + { + "epoch": 0.15435319967042022, + "grad_norm": 1.2653279304504395, + "learning_rate": 4.994433608954563e-06, + "loss": 0.6565, + "step": 562 + }, + { + "epoch": 0.15462784949189784, + "grad_norm": 1.2376434803009033, + "learning_rate": 4.9944094949393705e-06, + "loss": 0.7698, + "step": 563 + }, + { + "epoch": 0.15490249931337544, + "grad_norm": 1.254945993423462, + "learning_rate": 4.994385328863629e-06, + "loss": 0.7119, + "step": 564 + }, + { + "epoch": 0.15517714913485306, + "grad_norm": 1.269452691078186, + "learning_rate": 4.994361110727846e-06, + "loss": 0.708, + "step": 565 + }, + { + "epoch": 0.15545179895633068, + "grad_norm": 1.2820905447006226, + "learning_rate": 4.994336840532526e-06, + "loss": 0.6974, + "step": 566 + }, + { + "epoch": 0.1557264487778083, + "grad_norm": 1.2497018575668335, + "learning_rate": 4.994312518278174e-06, + "loss": 0.733, + "step": 567 + }, + { + "epoch": 0.1560010985992859, + "grad_norm": 1.3049654960632324, + "learning_rate": 4.994288143965299e-06, + "loss": 0.7282, + "step": 568 + }, + { + "epoch": 0.15627574842076353, + "grad_norm": 1.231859803199768, + "learning_rate": 4.99426371759441e-06, + "loss": 0.7163, + "step": 569 + }, + { + "epoch": 0.15655039824224115, + "grad_norm": 1.1883779764175415, + "learning_rate": 4.994239239166017e-06, + "loss": 0.6974, + "step": 570 + }, + { + "epoch": 0.15682504806371875, + "grad_norm": 1.2433087825775146, + "learning_rate": 4.994214708680629e-06, + "loss": 0.7477, + "step": 571 + }, + { + "epoch": 0.15709969788519637, + "grad_norm": 1.347593069076538, + "learning_rate": 4.99419012613876e-06, + "loss": 0.7534, + "step": 572 + }, + { + "epoch": 0.157374347706674, + "grad_norm": 1.30435311794281, + "learning_rate": 4.994165491540922e-06, + "loss": 0.759, + "step": 573 + }, + { + "epoch": 0.15764899752815162, + "grad_norm": 1.3386049270629883, + "learning_rate": 4.994140804887629e-06, + "loss": 0.7203, + "step": 574 + }, + { + "epoch": 0.15792364734962921, + "grad_norm": 1.2261995077133179, + "learning_rate": 4.994116066179397e-06, + "loss": 0.7515, + "step": 575 + }, + { + "epoch": 0.15819829717110684, + "grad_norm": 1.2890686988830566, + "learning_rate": 4.994091275416741e-06, + "loss": 0.7609, + "step": 576 + }, + { + "epoch": 0.15847294699258446, + "grad_norm": 1.2662187814712524, + "learning_rate": 4.994066432600181e-06, + "loss": 0.7188, + "step": 577 + }, + { + "epoch": 0.15874759681406206, + "grad_norm": 1.2325074672698975, + "learning_rate": 4.994041537730233e-06, + "loss": 0.7076, + "step": 578 + }, + { + "epoch": 0.15902224663553968, + "grad_norm": 1.2631176710128784, + "learning_rate": 4.994016590807418e-06, + "loss": 0.7629, + "step": 579 + }, + { + "epoch": 0.1592968964570173, + "grad_norm": 1.3318605422973633, + "learning_rate": 4.993991591832256e-06, + "loss": 0.7214, + "step": 580 + }, + { + "epoch": 0.15957154627849493, + "grad_norm": 1.239741325378418, + "learning_rate": 4.9939665408052684e-06, + "loss": 0.7047, + "step": 581 + }, + { + "epoch": 0.15984619609997253, + "grad_norm": 1.30906343460083, + "learning_rate": 4.993941437726979e-06, + "loss": 0.7339, + "step": 582 + }, + { + "epoch": 0.16012084592145015, + "grad_norm": 1.174399495124817, + "learning_rate": 4.9939162825979105e-06, + "loss": 0.7059, + "step": 583 + }, + { + "epoch": 0.16039549574292777, + "grad_norm": 1.2543668746948242, + "learning_rate": 4.993891075418589e-06, + "loss": 0.7318, + "step": 584 + }, + { + "epoch": 0.16067014556440537, + "grad_norm": 1.398615837097168, + "learning_rate": 4.99386581618954e-06, + "loss": 0.7654, + "step": 585 + }, + { + "epoch": 0.160944795385883, + "grad_norm": 1.2420587539672852, + "learning_rate": 4.993840504911291e-06, + "loss": 0.7322, + "step": 586 + }, + { + "epoch": 0.16121944520736062, + "grad_norm": 1.254502773284912, + "learning_rate": 4.993815141584371e-06, + "loss": 0.7733, + "step": 587 + }, + { + "epoch": 0.16149409502883824, + "grad_norm": 1.350947618484497, + "learning_rate": 4.9937897262093075e-06, + "loss": 0.6693, + "step": 588 + }, + { + "epoch": 0.16176874485031584, + "grad_norm": 1.2573328018188477, + "learning_rate": 4.993764258786632e-06, + "loss": 0.6766, + "step": 589 + }, + { + "epoch": 0.16204339467179346, + "grad_norm": 1.2433239221572876, + "learning_rate": 4.993738739316877e-06, + "loss": 0.748, + "step": 590 + }, + { + "epoch": 0.16231804449327109, + "grad_norm": 1.2355573177337646, + "learning_rate": 4.993713167800573e-06, + "loss": 0.7284, + "step": 591 + }, + { + "epoch": 0.1625926943147487, + "grad_norm": 1.275648832321167, + "learning_rate": 4.993687544238256e-06, + "loss": 0.7516, + "step": 592 + }, + { + "epoch": 0.1628673441362263, + "grad_norm": 1.2664850950241089, + "learning_rate": 4.993661868630458e-06, + "loss": 0.7204, + "step": 593 + }, + { + "epoch": 0.16314199395770393, + "grad_norm": 1.2328099012374878, + "learning_rate": 4.993636140977718e-06, + "loss": 0.6428, + "step": 594 + }, + { + "epoch": 0.16341664377918155, + "grad_norm": 1.2442200183868408, + "learning_rate": 4.99361036128057e-06, + "loss": 0.7283, + "step": 595 + }, + { + "epoch": 0.16369129360065915, + "grad_norm": 1.207743763923645, + "learning_rate": 4.993584529539555e-06, + "loss": 0.7254, + "step": 596 + }, + { + "epoch": 0.16396594342213677, + "grad_norm": 1.199493169784546, + "learning_rate": 4.993558645755209e-06, + "loss": 0.7307, + "step": 597 + }, + { + "epoch": 0.1642405932436144, + "grad_norm": 1.2047767639160156, + "learning_rate": 4.993532709928075e-06, + "loss": 0.7587, + "step": 598 + }, + { + "epoch": 0.16451524306509202, + "grad_norm": 1.2931045293807983, + "learning_rate": 4.993506722058693e-06, + "loss": 0.713, + "step": 599 + }, + { + "epoch": 0.16478989288656962, + "grad_norm": 1.2952368259429932, + "learning_rate": 4.993480682147605e-06, + "loss": 0.7386, + "step": 600 + }, + { + "epoch": 0.16506454270804724, + "grad_norm": 1.265316367149353, + "learning_rate": 4.993454590195355e-06, + "loss": 0.7568, + "step": 601 + }, + { + "epoch": 0.16533919252952486, + "grad_norm": 1.381645917892456, + "learning_rate": 4.993428446202489e-06, + "loss": 0.7202, + "step": 602 + }, + { + "epoch": 0.16561384235100246, + "grad_norm": 1.2995312213897705, + "learning_rate": 4.993402250169549e-06, + "loss": 0.6837, + "step": 603 + }, + { + "epoch": 0.16588849217248008, + "grad_norm": 1.3542150259017944, + "learning_rate": 4.993376002097085e-06, + "loss": 0.713, + "step": 604 + }, + { + "epoch": 0.1661631419939577, + "grad_norm": 1.214881420135498, + "learning_rate": 4.993349701985644e-06, + "loss": 0.6934, + "step": 605 + }, + { + "epoch": 0.16643779181543533, + "grad_norm": 1.250314712524414, + "learning_rate": 4.993323349835774e-06, + "loss": 0.692, + "step": 606 + }, + { + "epoch": 0.16671244163691293, + "grad_norm": 1.251692771911621, + "learning_rate": 4.993296945648026e-06, + "loss": 0.6597, + "step": 607 + }, + { + "epoch": 0.16698709145839055, + "grad_norm": 1.3501358032226562, + "learning_rate": 4.993270489422951e-06, + "loss": 0.7424, + "step": 608 + }, + { + "epoch": 0.16726174127986818, + "grad_norm": 1.2728570699691772, + "learning_rate": 4.993243981161101e-06, + "loss": 0.7102, + "step": 609 + }, + { + "epoch": 0.16753639110134577, + "grad_norm": 1.310855507850647, + "learning_rate": 4.993217420863029e-06, + "loss": 0.7029, + "step": 610 + }, + { + "epoch": 0.1678110409228234, + "grad_norm": 1.2316882610321045, + "learning_rate": 4.993190808529289e-06, + "loss": 0.7186, + "step": 611 + }, + { + "epoch": 0.16808569074430102, + "grad_norm": 1.292986273765564, + "learning_rate": 4.993164144160438e-06, + "loss": 0.6578, + "step": 612 + }, + { + "epoch": 0.16836034056577864, + "grad_norm": 1.22337007522583, + "learning_rate": 4.99313742775703e-06, + "loss": 0.7329, + "step": 613 + }, + { + "epoch": 0.16863499038725624, + "grad_norm": 1.1940160989761353, + "learning_rate": 4.993110659319625e-06, + "loss": 0.7184, + "step": 614 + }, + { + "epoch": 0.16890964020873386, + "grad_norm": 1.3873040676116943, + "learning_rate": 4.99308383884878e-06, + "loss": 0.782, + "step": 615 + }, + { + "epoch": 0.1691842900302115, + "grad_norm": 1.3385339975357056, + "learning_rate": 4.993056966345056e-06, + "loss": 0.7311, + "step": 616 + }, + { + "epoch": 0.16945893985168908, + "grad_norm": 1.2608270645141602, + "learning_rate": 4.993030041809013e-06, + "loss": 0.7017, + "step": 617 + }, + { + "epoch": 0.1697335896731667, + "grad_norm": 1.2555872201919556, + "learning_rate": 4.993003065241214e-06, + "loss": 0.7288, + "step": 618 + }, + { + "epoch": 0.17000823949464433, + "grad_norm": 1.2335351705551147, + "learning_rate": 4.992976036642221e-06, + "loss": 0.7107, + "step": 619 + }, + { + "epoch": 0.17028288931612195, + "grad_norm": 1.2063064575195312, + "learning_rate": 4.992948956012597e-06, + "loss": 0.6935, + "step": 620 + }, + { + "epoch": 0.17055753913759955, + "grad_norm": 1.2798599004745483, + "learning_rate": 4.9929218233529105e-06, + "loss": 0.7367, + "step": 621 + }, + { + "epoch": 0.17083218895907717, + "grad_norm": 1.2167130708694458, + "learning_rate": 4.992894638663725e-06, + "loss": 0.7434, + "step": 622 + }, + { + "epoch": 0.1711068387805548, + "grad_norm": 1.2005631923675537, + "learning_rate": 4.992867401945609e-06, + "loss": 0.6992, + "step": 623 + }, + { + "epoch": 0.17138148860203242, + "grad_norm": 1.2654507160186768, + "learning_rate": 4.992840113199131e-06, + "loss": 0.7031, + "step": 624 + }, + { + "epoch": 0.17165613842351002, + "grad_norm": 1.2544667720794678, + "learning_rate": 4.992812772424859e-06, + "loss": 0.7092, + "step": 625 + }, + { + "epoch": 0.17193078824498764, + "grad_norm": 1.3520675897598267, + "learning_rate": 4.992785379623366e-06, + "loss": 0.6745, + "step": 626 + }, + { + "epoch": 0.17220543806646527, + "grad_norm": 1.2129243612289429, + "learning_rate": 4.9927579347952224e-06, + "loss": 0.7, + "step": 627 + }, + { + "epoch": 0.17248008788794286, + "grad_norm": 1.2417519092559814, + "learning_rate": 4.992730437941001e-06, + "loss": 0.7235, + "step": 628 + }, + { + "epoch": 0.1727547377094205, + "grad_norm": 1.293628215789795, + "learning_rate": 4.992702889061276e-06, + "loss": 0.7211, + "step": 629 + }, + { + "epoch": 0.1730293875308981, + "grad_norm": 1.2544784545898438, + "learning_rate": 4.992675288156622e-06, + "loss": 0.7291, + "step": 630 + }, + { + "epoch": 0.17330403735237573, + "grad_norm": 1.19427490234375, + "learning_rate": 4.992647635227615e-06, + "loss": 0.7432, + "step": 631 + }, + { + "epoch": 0.17357868717385333, + "grad_norm": 1.2346041202545166, + "learning_rate": 4.992619930274833e-06, + "loss": 0.7183, + "step": 632 + }, + { + "epoch": 0.17385333699533095, + "grad_norm": 1.2933248281478882, + "learning_rate": 4.992592173298853e-06, + "loss": 0.7642, + "step": 633 + }, + { + "epoch": 0.17412798681680858, + "grad_norm": 1.2515265941619873, + "learning_rate": 4.9925643643002556e-06, + "loss": 0.7239, + "step": 634 + }, + { + "epoch": 0.17440263663828617, + "grad_norm": 1.338828206062317, + "learning_rate": 4.992536503279621e-06, + "loss": 0.6888, + "step": 635 + }, + { + "epoch": 0.1746772864597638, + "grad_norm": 1.2661267518997192, + "learning_rate": 4.992508590237529e-06, + "loss": 0.7211, + "step": 636 + }, + { + "epoch": 0.17495193628124142, + "grad_norm": 1.3165549039840698, + "learning_rate": 4.9924806251745646e-06, + "loss": 0.7346, + "step": 637 + }, + { + "epoch": 0.17522658610271905, + "grad_norm": 1.2732335329055786, + "learning_rate": 4.992452608091309e-06, + "loss": 0.6883, + "step": 638 + }, + { + "epoch": 0.17550123592419664, + "grad_norm": 1.22809898853302, + "learning_rate": 4.99242453898835e-06, + "loss": 0.7023, + "step": 639 + }, + { + "epoch": 0.17577588574567427, + "grad_norm": 1.2879995107650757, + "learning_rate": 4.99239641786627e-06, + "loss": 0.7071, + "step": 640 + }, + { + "epoch": 0.1760505355671519, + "grad_norm": 1.2207119464874268, + "learning_rate": 4.9923682447256585e-06, + "loss": 0.6909, + "step": 641 + }, + { + "epoch": 0.17632518538862949, + "grad_norm": 1.3115817308425903, + "learning_rate": 4.992340019567102e-06, + "loss": 0.6987, + "step": 642 + }, + { + "epoch": 0.1765998352101071, + "grad_norm": 1.2715681791305542, + "learning_rate": 4.99231174239119e-06, + "loss": 0.7249, + "step": 643 + }, + { + "epoch": 0.17687448503158473, + "grad_norm": 1.267511010169983, + "learning_rate": 4.9922834131985135e-06, + "loss": 0.6963, + "step": 644 + }, + { + "epoch": 0.17714913485306236, + "grad_norm": 1.1994532346725464, + "learning_rate": 4.992255031989662e-06, + "loss": 0.713, + "step": 645 + }, + { + "epoch": 0.17742378467453995, + "grad_norm": 1.2370326519012451, + "learning_rate": 4.99222659876523e-06, + "loss": 0.7585, + "step": 646 + }, + { + "epoch": 0.17769843449601758, + "grad_norm": 1.3039616346359253, + "learning_rate": 4.99219811352581e-06, + "loss": 0.7171, + "step": 647 + }, + { + "epoch": 0.1779730843174952, + "grad_norm": 1.2462775707244873, + "learning_rate": 4.992169576271996e-06, + "loss": 0.6709, + "step": 648 + }, + { + "epoch": 0.1782477341389728, + "grad_norm": 1.2331303358078003, + "learning_rate": 4.992140987004383e-06, + "loss": 0.6989, + "step": 649 + }, + { + "epoch": 0.17852238396045042, + "grad_norm": 1.3080765008926392, + "learning_rate": 4.99211234572357e-06, + "loss": 0.7765, + "step": 650 + }, + { + "epoch": 0.17879703378192804, + "grad_norm": 1.2622755765914917, + "learning_rate": 4.992083652430152e-06, + "loss": 0.6604, + "step": 651 + }, + { + "epoch": 0.17907168360340567, + "grad_norm": 1.179392695426941, + "learning_rate": 4.992054907124731e-06, + "loss": 0.6887, + "step": 652 + }, + { + "epoch": 0.17934633342488326, + "grad_norm": 1.345860481262207, + "learning_rate": 4.992026109807904e-06, + "loss": 0.7166, + "step": 653 + }, + { + "epoch": 0.1796209832463609, + "grad_norm": 1.2422218322753906, + "learning_rate": 4.991997260480272e-06, + "loss": 0.7049, + "step": 654 + }, + { + "epoch": 0.1798956330678385, + "grad_norm": 1.2933900356292725, + "learning_rate": 4.99196835914244e-06, + "loss": 0.726, + "step": 655 + }, + { + "epoch": 0.18017028288931614, + "grad_norm": 1.320479154586792, + "learning_rate": 4.99193940579501e-06, + "loss": 0.7071, + "step": 656 + }, + { + "epoch": 0.18044493271079373, + "grad_norm": 1.267040729522705, + "learning_rate": 4.991910400438585e-06, + "loss": 0.7229, + "step": 657 + }, + { + "epoch": 0.18071958253227136, + "grad_norm": 1.2974326610565186, + "learning_rate": 4.991881343073771e-06, + "loss": 0.7336, + "step": 658 + }, + { + "epoch": 0.18099423235374898, + "grad_norm": 1.3429235219955444, + "learning_rate": 4.991852233701175e-06, + "loss": 0.7407, + "step": 659 + }, + { + "epoch": 0.18126888217522658, + "grad_norm": 1.2144014835357666, + "learning_rate": 4.991823072321404e-06, + "loss": 0.6714, + "step": 660 + }, + { + "epoch": 0.1815435319967042, + "grad_norm": 1.3525917530059814, + "learning_rate": 4.9917938589350665e-06, + "loss": 0.6862, + "step": 661 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.2529915571212769, + "learning_rate": 4.991764593542773e-06, + "loss": 0.7012, + "step": 662 + }, + { + "epoch": 0.18209283163965945, + "grad_norm": 1.367202639579773, + "learning_rate": 4.991735276145134e-06, + "loss": 0.7413, + "step": 663 + }, + { + "epoch": 0.18236748146113704, + "grad_norm": 1.2778208255767822, + "learning_rate": 4.991705906742761e-06, + "loss": 0.7467, + "step": 664 + }, + { + "epoch": 0.18264213128261467, + "grad_norm": 1.2809789180755615, + "learning_rate": 4.991676485336266e-06, + "loss": 0.699, + "step": 665 + }, + { + "epoch": 0.1829167811040923, + "grad_norm": 1.1964315176010132, + "learning_rate": 4.991647011926265e-06, + "loss": 0.6817, + "step": 666 + }, + { + "epoch": 0.1831914309255699, + "grad_norm": 1.2680212259292603, + "learning_rate": 4.991617486513373e-06, + "loss": 0.7237, + "step": 667 + }, + { + "epoch": 0.1834660807470475, + "grad_norm": 1.3416337966918945, + "learning_rate": 4.991587909098205e-06, + "loss": 0.7221, + "step": 668 + }, + { + "epoch": 0.18374073056852513, + "grad_norm": 1.2713003158569336, + "learning_rate": 4.9915582796813785e-06, + "loss": 0.6974, + "step": 669 + }, + { + "epoch": 0.18401538039000276, + "grad_norm": 1.2941009998321533, + "learning_rate": 4.991528598263513e-06, + "loss": 0.7431, + "step": 670 + }, + { + "epoch": 0.18429003021148035, + "grad_norm": 1.2845340967178345, + "learning_rate": 4.991498864845227e-06, + "loss": 0.7437, + "step": 671 + }, + { + "epoch": 0.18456468003295798, + "grad_norm": 1.2850611209869385, + "learning_rate": 4.991469079427141e-06, + "loss": 0.7442, + "step": 672 + }, + { + "epoch": 0.1848393298544356, + "grad_norm": 1.191165804862976, + "learning_rate": 4.9914392420098764e-06, + "loss": 0.6584, + "step": 673 + }, + { + "epoch": 0.1851139796759132, + "grad_norm": 1.2499492168426514, + "learning_rate": 4.9914093525940575e-06, + "loss": 0.6841, + "step": 674 + }, + { + "epoch": 0.18538862949739082, + "grad_norm": 1.3074150085449219, + "learning_rate": 4.991379411180306e-06, + "loss": 0.7462, + "step": 675 + }, + { + "epoch": 0.18566327931886845, + "grad_norm": 1.3811063766479492, + "learning_rate": 4.991349417769249e-06, + "loss": 0.7447, + "step": 676 + }, + { + "epoch": 0.18593792914034607, + "grad_norm": 1.2539589405059814, + "learning_rate": 4.991319372361511e-06, + "loss": 0.6965, + "step": 677 + }, + { + "epoch": 0.18621257896182367, + "grad_norm": 1.2812538146972656, + "learning_rate": 4.991289274957719e-06, + "loss": 0.7079, + "step": 678 + }, + { + "epoch": 0.1864872287833013, + "grad_norm": 1.295317530632019, + "learning_rate": 4.991259125558502e-06, + "loss": 0.746, + "step": 679 + }, + { + "epoch": 0.1867618786047789, + "grad_norm": 1.2975353002548218, + "learning_rate": 4.991228924164489e-06, + "loss": 0.7469, + "step": 680 + }, + { + "epoch": 0.1870365284262565, + "grad_norm": 1.269023060798645, + "learning_rate": 4.99119867077631e-06, + "loss": 0.7484, + "step": 681 + }, + { + "epoch": 0.18731117824773413, + "grad_norm": 1.2688056230545044, + "learning_rate": 4.991168365394596e-06, + "loss": 0.6826, + "step": 682 + }, + { + "epoch": 0.18758582806921176, + "grad_norm": 1.2906309366226196, + "learning_rate": 4.991138008019981e-06, + "loss": 0.7041, + "step": 683 + }, + { + "epoch": 0.18786047789068938, + "grad_norm": 1.2663801908493042, + "learning_rate": 4.991107598653097e-06, + "loss": 0.7008, + "step": 684 + }, + { + "epoch": 0.18813512771216698, + "grad_norm": 1.2516754865646362, + "learning_rate": 4.991077137294581e-06, + "loss": 0.6695, + "step": 685 + }, + { + "epoch": 0.1884097775336446, + "grad_norm": 1.3060780763626099, + "learning_rate": 4.9910466239450656e-06, + "loss": 0.6806, + "step": 686 + }, + { + "epoch": 0.18868442735512223, + "grad_norm": 1.3439714908599854, + "learning_rate": 4.9910160586051895e-06, + "loss": 0.7338, + "step": 687 + }, + { + "epoch": 0.18895907717659985, + "grad_norm": 1.2038359642028809, + "learning_rate": 4.99098544127559e-06, + "loss": 0.68, + "step": 688 + }, + { + "epoch": 0.18923372699807745, + "grad_norm": 1.232743263244629, + "learning_rate": 4.990954771956907e-06, + "loss": 0.7235, + "step": 689 + }, + { + "epoch": 0.18950837681955507, + "grad_norm": 1.2286728620529175, + "learning_rate": 4.990924050649779e-06, + "loss": 0.7181, + "step": 690 + }, + { + "epoch": 0.1897830266410327, + "grad_norm": 1.332396388053894, + "learning_rate": 4.990893277354849e-06, + "loss": 0.705, + "step": 691 + }, + { + "epoch": 0.1900576764625103, + "grad_norm": 1.2982734441757202, + "learning_rate": 4.990862452072758e-06, + "loss": 0.7123, + "step": 692 + }, + { + "epoch": 0.1903323262839879, + "grad_norm": 1.2231944799423218, + "learning_rate": 4.99083157480415e-06, + "loss": 0.6848, + "step": 693 + }, + { + "epoch": 0.19060697610546554, + "grad_norm": 1.259516954421997, + "learning_rate": 4.99080064554967e-06, + "loss": 0.6847, + "step": 694 + }, + { + "epoch": 0.19088162592694316, + "grad_norm": 1.2471414804458618, + "learning_rate": 4.990769664309962e-06, + "loss": 0.7224, + "step": 695 + }, + { + "epoch": 0.19115627574842076, + "grad_norm": 1.2843775749206543, + "learning_rate": 4.9907386310856735e-06, + "loss": 0.7058, + "step": 696 + }, + { + "epoch": 0.19143092556989838, + "grad_norm": 1.3134331703186035, + "learning_rate": 4.990707545877451e-06, + "loss": 0.7312, + "step": 697 + }, + { + "epoch": 0.191705575391376, + "grad_norm": 1.2935395240783691, + "learning_rate": 4.990676408685945e-06, + "loss": 0.7516, + "step": 698 + }, + { + "epoch": 0.1919802252128536, + "grad_norm": 1.1687885522842407, + "learning_rate": 4.990645219511805e-06, + "loss": 0.6854, + "step": 699 + }, + { + "epoch": 0.19225487503433122, + "grad_norm": 1.283696174621582, + "learning_rate": 4.990613978355682e-06, + "loss": 0.7169, + "step": 700 + }, + { + "epoch": 0.19252952485580885, + "grad_norm": 1.2453712224960327, + "learning_rate": 4.990582685218227e-06, + "loss": 0.7013, + "step": 701 + }, + { + "epoch": 0.19280417467728647, + "grad_norm": 1.2628878355026245, + "learning_rate": 4.990551340100094e-06, + "loss": 0.7116, + "step": 702 + }, + { + "epoch": 0.19307882449876407, + "grad_norm": 1.2654649019241333, + "learning_rate": 4.990519943001937e-06, + "loss": 0.675, + "step": 703 + }, + { + "epoch": 0.1933534743202417, + "grad_norm": 1.2012871503829956, + "learning_rate": 4.9904884939244115e-06, + "loss": 0.7176, + "step": 704 + }, + { + "epoch": 0.19362812414171932, + "grad_norm": 1.2704998254776, + "learning_rate": 4.990456992868173e-06, + "loss": 0.7375, + "step": 705 + }, + { + "epoch": 0.1939027739631969, + "grad_norm": 1.2547845840454102, + "learning_rate": 4.9904254398338795e-06, + "loss": 0.7196, + "step": 706 + }, + { + "epoch": 0.19417742378467454, + "grad_norm": 1.2998814582824707, + "learning_rate": 4.99039383482219e-06, + "loss": 0.6972, + "step": 707 + }, + { + "epoch": 0.19445207360615216, + "grad_norm": 1.2798316478729248, + "learning_rate": 4.990362177833764e-06, + "loss": 0.682, + "step": 708 + }, + { + "epoch": 0.19472672342762978, + "grad_norm": 1.3519818782806396, + "learning_rate": 4.990330468869262e-06, + "loss": 0.7272, + "step": 709 + }, + { + "epoch": 0.19500137324910738, + "grad_norm": 1.3364511728286743, + "learning_rate": 4.990298707929345e-06, + "loss": 0.7381, + "step": 710 + }, + { + "epoch": 0.195276023070585, + "grad_norm": 1.254469394683838, + "learning_rate": 4.990266895014677e-06, + "loss": 0.7022, + "step": 711 + }, + { + "epoch": 0.19555067289206263, + "grad_norm": 1.2280793190002441, + "learning_rate": 4.990235030125922e-06, + "loss": 0.6638, + "step": 712 + }, + { + "epoch": 0.19582532271354022, + "grad_norm": 1.2580749988555908, + "learning_rate": 4.990203113263744e-06, + "loss": 0.695, + "step": 713 + }, + { + "epoch": 0.19609997253501785, + "grad_norm": 1.3083257675170898, + "learning_rate": 4.99017114442881e-06, + "loss": 0.7298, + "step": 714 + }, + { + "epoch": 0.19637462235649547, + "grad_norm": 1.3358627557754517, + "learning_rate": 4.990139123621787e-06, + "loss": 0.7473, + "step": 715 + }, + { + "epoch": 0.1966492721779731, + "grad_norm": 1.3429090976715088, + "learning_rate": 4.990107050843344e-06, + "loss": 0.7118, + "step": 716 + }, + { + "epoch": 0.1969239219994507, + "grad_norm": 1.3068575859069824, + "learning_rate": 4.990074926094149e-06, + "loss": 0.7302, + "step": 717 + }, + { + "epoch": 0.19719857182092831, + "grad_norm": 1.2909040451049805, + "learning_rate": 4.990042749374873e-06, + "loss": 0.7244, + "step": 718 + }, + { + "epoch": 0.19747322164240594, + "grad_norm": 1.236091136932373, + "learning_rate": 4.990010520686188e-06, + "loss": 0.711, + "step": 719 + }, + { + "epoch": 0.19774787146388356, + "grad_norm": 1.3478262424468994, + "learning_rate": 4.9899782400287664e-06, + "loss": 0.7124, + "step": 720 + }, + { + "epoch": 0.19802252128536116, + "grad_norm": 1.3007488250732422, + "learning_rate": 4.989945907403282e-06, + "loss": 0.7184, + "step": 721 + }, + { + "epoch": 0.19829717110683878, + "grad_norm": 1.2704670429229736, + "learning_rate": 4.989913522810409e-06, + "loss": 0.7613, + "step": 722 + }, + { + "epoch": 0.1985718209283164, + "grad_norm": 1.366603970527649, + "learning_rate": 4.989881086250824e-06, + "loss": 0.7246, + "step": 723 + }, + { + "epoch": 0.198846470749794, + "grad_norm": 1.310261845588684, + "learning_rate": 4.989848597725203e-06, + "loss": 0.7108, + "step": 724 + }, + { + "epoch": 0.19912112057127163, + "grad_norm": 1.249550700187683, + "learning_rate": 4.989816057234226e-06, + "loss": 0.7292, + "step": 725 + }, + { + "epoch": 0.19939577039274925, + "grad_norm": 1.2709535360336304, + "learning_rate": 4.989783464778571e-06, + "loss": 0.7109, + "step": 726 + }, + { + "epoch": 0.19967042021422687, + "grad_norm": 1.2604894638061523, + "learning_rate": 4.9897508203589175e-06, + "loss": 0.718, + "step": 727 + }, + { + "epoch": 0.19994507003570447, + "grad_norm": 1.1772223711013794, + "learning_rate": 4.989718123975948e-06, + "loss": 0.6844, + "step": 728 + }, + { + "epoch": 0.2002197198571821, + "grad_norm": 1.2224922180175781, + "learning_rate": 4.989685375630344e-06, + "loss": 0.6865, + "step": 729 + }, + { + "epoch": 0.20049436967865972, + "grad_norm": 1.2218387126922607, + "learning_rate": 4.98965257532279e-06, + "loss": 0.6987, + "step": 730 + }, + { + "epoch": 0.2007690195001373, + "grad_norm": 1.2464933395385742, + "learning_rate": 4.98961972305397e-06, + "loss": 0.729, + "step": 731 + }, + { + "epoch": 0.20104366932161494, + "grad_norm": 1.326940894126892, + "learning_rate": 4.989586818824569e-06, + "loss": 0.726, + "step": 732 + }, + { + "epoch": 0.20131831914309256, + "grad_norm": 1.2082685232162476, + "learning_rate": 4.989553862635275e-06, + "loss": 0.7408, + "step": 733 + }, + { + "epoch": 0.20159296896457018, + "grad_norm": 1.328771948814392, + "learning_rate": 4.9895208544867745e-06, + "loss": 0.7338, + "step": 734 + }, + { + "epoch": 0.20186761878604778, + "grad_norm": 1.2211029529571533, + "learning_rate": 4.9894877943797586e-06, + "loss": 0.6608, + "step": 735 + }, + { + "epoch": 0.2021422686075254, + "grad_norm": 1.2755651473999023, + "learning_rate": 4.989454682314914e-06, + "loss": 0.7274, + "step": 736 + }, + { + "epoch": 0.20241691842900303, + "grad_norm": 1.3286546468734741, + "learning_rate": 4.989421518292934e-06, + "loss": 0.7311, + "step": 737 + }, + { + "epoch": 0.20269156825048062, + "grad_norm": 1.3386216163635254, + "learning_rate": 4.989388302314511e-06, + "loss": 0.7213, + "step": 738 + }, + { + "epoch": 0.20296621807195825, + "grad_norm": 1.2848949432373047, + "learning_rate": 4.9893550343803375e-06, + "loss": 0.7324, + "step": 739 + }, + { + "epoch": 0.20324086789343587, + "grad_norm": 1.2094541788101196, + "learning_rate": 4.989321714491108e-06, + "loss": 0.7341, + "step": 740 + }, + { + "epoch": 0.2035155177149135, + "grad_norm": 1.2718027830123901, + "learning_rate": 4.989288342647518e-06, + "loss": 0.6724, + "step": 741 + }, + { + "epoch": 0.2037901675363911, + "grad_norm": 1.2094570398330688, + "learning_rate": 4.989254918850263e-06, + "loss": 0.6804, + "step": 742 + }, + { + "epoch": 0.20406481735786872, + "grad_norm": 1.300984263420105, + "learning_rate": 4.989221443100042e-06, + "loss": 0.7181, + "step": 743 + }, + { + "epoch": 0.20433946717934634, + "grad_norm": 1.3508949279785156, + "learning_rate": 4.989187915397553e-06, + "loss": 0.7358, + "step": 744 + }, + { + "epoch": 0.20461411700082394, + "grad_norm": 1.3125295639038086, + "learning_rate": 4.9891543357434955e-06, + "loss": 0.7095, + "step": 745 + }, + { + "epoch": 0.20488876682230156, + "grad_norm": 1.2503554821014404, + "learning_rate": 4.989120704138571e-06, + "loss": 0.7596, + "step": 746 + }, + { + "epoch": 0.20516341664377918, + "grad_norm": 1.2847005128860474, + "learning_rate": 4.989087020583482e-06, + "loss": 0.7067, + "step": 747 + }, + { + "epoch": 0.2054380664652568, + "grad_norm": 1.193857192993164, + "learning_rate": 4.98905328507893e-06, + "loss": 0.7069, + "step": 748 + }, + { + "epoch": 0.2057127162867344, + "grad_norm": 1.2320988178253174, + "learning_rate": 4.989019497625619e-06, + "loss": 0.7048, + "step": 749 + }, + { + "epoch": 0.20598736610821203, + "grad_norm": 1.366865873336792, + "learning_rate": 4.988985658224256e-06, + "loss": 0.7177, + "step": 750 + }, + { + "epoch": 0.20626201592968965, + "grad_norm": 1.1981791257858276, + "learning_rate": 4.988951766875545e-06, + "loss": 0.725, + "step": 751 + }, + { + "epoch": 0.20653666575116728, + "grad_norm": 1.1687260866165161, + "learning_rate": 4.988917823580195e-06, + "loss": 0.6844, + "step": 752 + }, + { + "epoch": 0.20681131557264487, + "grad_norm": 1.247359037399292, + "learning_rate": 4.988883828338914e-06, + "loss": 0.699, + "step": 753 + }, + { + "epoch": 0.2070859653941225, + "grad_norm": 1.289939284324646, + "learning_rate": 4.988849781152411e-06, + "loss": 0.7066, + "step": 754 + }, + { + "epoch": 0.20736061521560012, + "grad_norm": 1.2228636741638184, + "learning_rate": 4.988815682021398e-06, + "loss": 0.6906, + "step": 755 + }, + { + "epoch": 0.20763526503707772, + "grad_norm": 1.1455881595611572, + "learning_rate": 4.988781530946585e-06, + "loss": 0.6864, + "step": 756 + }, + { + "epoch": 0.20790991485855534, + "grad_norm": 1.2938854694366455, + "learning_rate": 4.988747327928686e-06, + "loss": 0.7247, + "step": 757 + }, + { + "epoch": 0.20818456468003296, + "grad_norm": 1.2046964168548584, + "learning_rate": 4.988713072968413e-06, + "loss": 0.6919, + "step": 758 + }, + { + "epoch": 0.2084592145015106, + "grad_norm": 1.2994613647460938, + "learning_rate": 4.988678766066484e-06, + "loss": 0.6927, + "step": 759 + }, + { + "epoch": 0.20873386432298818, + "grad_norm": 1.3328251838684082, + "learning_rate": 4.9886444072236114e-06, + "loss": 0.7073, + "step": 760 + }, + { + "epoch": 0.2090085141444658, + "grad_norm": 1.3116586208343506, + "learning_rate": 4.988609996440516e-06, + "loss": 0.7388, + "step": 761 + }, + { + "epoch": 0.20928316396594343, + "grad_norm": 1.2347891330718994, + "learning_rate": 4.988575533717913e-06, + "loss": 0.7474, + "step": 762 + }, + { + "epoch": 0.20955781378742103, + "grad_norm": 1.2507805824279785, + "learning_rate": 4.988541019056523e-06, + "loss": 0.7147, + "step": 763 + }, + { + "epoch": 0.20983246360889865, + "grad_norm": 1.2203876972198486, + "learning_rate": 4.9885064524570665e-06, + "loss": 0.7151, + "step": 764 + }, + { + "epoch": 0.21010711343037627, + "grad_norm": 1.2818399667739868, + "learning_rate": 4.988471833920264e-06, + "loss": 0.7479, + "step": 765 + }, + { + "epoch": 0.2103817632518539, + "grad_norm": 1.3417636156082153, + "learning_rate": 4.988437163446839e-06, + "loss": 0.7649, + "step": 766 + }, + { + "epoch": 0.2106564130733315, + "grad_norm": 1.3322017192840576, + "learning_rate": 4.988402441037515e-06, + "loss": 0.6908, + "step": 767 + }, + { + "epoch": 0.21093106289480912, + "grad_norm": 1.2650338411331177, + "learning_rate": 4.988367666693016e-06, + "loss": 0.7385, + "step": 768 + }, + { + "epoch": 0.21120571271628674, + "grad_norm": 1.3333842754364014, + "learning_rate": 4.9883328404140685e-06, + "loss": 0.7218, + "step": 769 + }, + { + "epoch": 0.21148036253776434, + "grad_norm": 1.3029369115829468, + "learning_rate": 4.988297962201399e-06, + "loss": 0.7418, + "step": 770 + }, + { + "epoch": 0.21175501235924196, + "grad_norm": 1.2389003038406372, + "learning_rate": 4.9882630320557355e-06, + "loss": 0.7084, + "step": 771 + }, + { + "epoch": 0.21202966218071959, + "grad_norm": 1.2809033393859863, + "learning_rate": 4.988228049977806e-06, + "loss": 0.7459, + "step": 772 + }, + { + "epoch": 0.2123043120021972, + "grad_norm": 1.2405447959899902, + "learning_rate": 4.988193015968343e-06, + "loss": 0.7153, + "step": 773 + }, + { + "epoch": 0.2125789618236748, + "grad_norm": 1.3248083591461182, + "learning_rate": 4.988157930028076e-06, + "loss": 0.7298, + "step": 774 + }, + { + "epoch": 0.21285361164515243, + "grad_norm": 1.2558859586715698, + "learning_rate": 4.988122792157737e-06, + "loss": 0.7282, + "step": 775 + }, + { + "epoch": 0.21312826146663005, + "grad_norm": 1.347428798675537, + "learning_rate": 4.98808760235806e-06, + "loss": 0.7091, + "step": 776 + }, + { + "epoch": 0.21340291128810765, + "grad_norm": 1.2880363464355469, + "learning_rate": 4.988052360629779e-06, + "loss": 0.762, + "step": 777 + }, + { + "epoch": 0.21367756110958527, + "grad_norm": 1.2941904067993164, + "learning_rate": 4.988017066973631e-06, + "loss": 0.7179, + "step": 778 + }, + { + "epoch": 0.2139522109310629, + "grad_norm": 1.2835867404937744, + "learning_rate": 4.9879817213903505e-06, + "loss": 0.7551, + "step": 779 + }, + { + "epoch": 0.21422686075254052, + "grad_norm": 1.21609628200531, + "learning_rate": 4.987946323880677e-06, + "loss": 0.6477, + "step": 780 + }, + { + "epoch": 0.21450151057401812, + "grad_norm": 1.3201535940170288, + "learning_rate": 4.987910874445348e-06, + "loss": 0.7428, + "step": 781 + }, + { + "epoch": 0.21477616039549574, + "grad_norm": 1.265965461730957, + "learning_rate": 4.987875373085104e-06, + "loss": 0.7413, + "step": 782 + }, + { + "epoch": 0.21505081021697336, + "grad_norm": 1.2111151218414307, + "learning_rate": 4.987839819800686e-06, + "loss": 0.7088, + "step": 783 + }, + { + "epoch": 0.215325460038451, + "grad_norm": 1.2650107145309448, + "learning_rate": 4.987804214592836e-06, + "loss": 0.6785, + "step": 784 + }, + { + "epoch": 0.21560010985992858, + "grad_norm": 1.306730031967163, + "learning_rate": 4.987768557462296e-06, + "loss": 0.6847, + "step": 785 + }, + { + "epoch": 0.2158747596814062, + "grad_norm": 1.232254147529602, + "learning_rate": 4.987732848409812e-06, + "loss": 0.6538, + "step": 786 + }, + { + "epoch": 0.21614940950288383, + "grad_norm": 1.2269521951675415, + "learning_rate": 4.987697087436128e-06, + "loss": 0.7264, + "step": 787 + }, + { + "epoch": 0.21642405932436143, + "grad_norm": 1.2772256135940552, + "learning_rate": 4.987661274541991e-06, + "loss": 0.6902, + "step": 788 + }, + { + "epoch": 0.21669870914583905, + "grad_norm": 1.1920197010040283, + "learning_rate": 4.987625409728148e-06, + "loss": 0.666, + "step": 789 + }, + { + "epoch": 0.21697335896731668, + "grad_norm": 1.3303563594818115, + "learning_rate": 4.9875894929953475e-06, + "loss": 0.6692, + "step": 790 + }, + { + "epoch": 0.2172480087887943, + "grad_norm": 1.321170449256897, + "learning_rate": 4.98755352434434e-06, + "loss": 0.7358, + "step": 791 + }, + { + "epoch": 0.2175226586102719, + "grad_norm": 1.3153355121612549, + "learning_rate": 4.987517503775875e-06, + "loss": 0.7203, + "step": 792 + }, + { + "epoch": 0.21779730843174952, + "grad_norm": 1.2321504354476929, + "learning_rate": 4.987481431290705e-06, + "loss": 0.7214, + "step": 793 + }, + { + "epoch": 0.21807195825322714, + "grad_norm": 1.250683069229126, + "learning_rate": 4.9874453068895825e-06, + "loss": 0.7198, + "step": 794 + }, + { + "epoch": 0.21834660807470474, + "grad_norm": 1.375928521156311, + "learning_rate": 4.987409130573262e-06, + "loss": 0.6909, + "step": 795 + }, + { + "epoch": 0.21862125789618236, + "grad_norm": 1.2081973552703857, + "learning_rate": 4.987372902342498e-06, + "loss": 0.6383, + "step": 796 + }, + { + "epoch": 0.21889590771766, + "grad_norm": 1.2133538722991943, + "learning_rate": 4.987336622198047e-06, + "loss": 0.7037, + "step": 797 + }, + { + "epoch": 0.2191705575391376, + "grad_norm": 1.2567147016525269, + "learning_rate": 4.9873002901406656e-06, + "loss": 0.6898, + "step": 798 + }, + { + "epoch": 0.2194452073606152, + "grad_norm": 1.1989855766296387, + "learning_rate": 4.987263906171112e-06, + "loss": 0.7131, + "step": 799 + }, + { + "epoch": 0.21971985718209283, + "grad_norm": 1.3390337228775024, + "learning_rate": 4.987227470290148e-06, + "loss": 0.7279, + "step": 800 + }, + { + "epoch": 0.21999450700357046, + "grad_norm": 1.392145037651062, + "learning_rate": 4.9871909824985305e-06, + "loss": 0.7322, + "step": 801 + }, + { + "epoch": 0.22026915682504805, + "grad_norm": 1.2279229164123535, + "learning_rate": 4.987154442797023e-06, + "loss": 0.7166, + "step": 802 + }, + { + "epoch": 0.22054380664652568, + "grad_norm": 1.3357951641082764, + "learning_rate": 4.987117851186388e-06, + "loss": 0.7326, + "step": 803 + }, + { + "epoch": 0.2208184564680033, + "grad_norm": 1.1892001628875732, + "learning_rate": 4.9870812076673885e-06, + "loss": 0.7269, + "step": 804 + }, + { + "epoch": 0.22109310628948092, + "grad_norm": 1.2788124084472656, + "learning_rate": 4.98704451224079e-06, + "loss": 0.684, + "step": 805 + }, + { + "epoch": 0.22136775611095852, + "grad_norm": 1.388063907623291, + "learning_rate": 4.9870077649073574e-06, + "loss": 0.7097, + "step": 806 + }, + { + "epoch": 0.22164240593243614, + "grad_norm": 1.2396403551101685, + "learning_rate": 4.9869709656678595e-06, + "loss": 0.6684, + "step": 807 + }, + { + "epoch": 0.22191705575391377, + "grad_norm": 1.2335666418075562, + "learning_rate": 4.986934114523063e-06, + "loss": 0.6829, + "step": 808 + }, + { + "epoch": 0.22219170557539136, + "grad_norm": 1.2683956623077393, + "learning_rate": 4.986897211473736e-06, + "loss": 0.7416, + "step": 809 + }, + { + "epoch": 0.222466355396869, + "grad_norm": 1.2569732666015625, + "learning_rate": 4.986860256520651e-06, + "loss": 0.701, + "step": 810 + }, + { + "epoch": 0.2227410052183466, + "grad_norm": 1.2777084112167358, + "learning_rate": 4.986823249664578e-06, + "loss": 0.6526, + "step": 811 + }, + { + "epoch": 0.22301565503982423, + "grad_norm": 1.2222849130630493, + "learning_rate": 4.986786190906289e-06, + "loss": 0.7152, + "step": 812 + }, + { + "epoch": 0.22329030486130183, + "grad_norm": 1.276238203048706, + "learning_rate": 4.986749080246559e-06, + "loss": 0.7285, + "step": 813 + }, + { + "epoch": 0.22356495468277945, + "grad_norm": 1.2851428985595703, + "learning_rate": 4.98671191768616e-06, + "loss": 0.7494, + "step": 814 + }, + { + "epoch": 0.22383960450425708, + "grad_norm": 1.3218364715576172, + "learning_rate": 4.986674703225871e-06, + "loss": 0.7731, + "step": 815 + }, + { + "epoch": 0.2241142543257347, + "grad_norm": 1.2535046339035034, + "learning_rate": 4.986637436866465e-06, + "loss": 0.7078, + "step": 816 + }, + { + "epoch": 0.2243889041472123, + "grad_norm": 1.3137294054031372, + "learning_rate": 4.986600118608723e-06, + "loss": 0.7516, + "step": 817 + }, + { + "epoch": 0.22466355396868992, + "grad_norm": 1.3096369504928589, + "learning_rate": 4.986562748453422e-06, + "loss": 0.7093, + "step": 818 + }, + { + "epoch": 0.22493820379016755, + "grad_norm": 1.2040520906448364, + "learning_rate": 4.986525326401342e-06, + "loss": 0.7411, + "step": 819 + }, + { + "epoch": 0.22521285361164514, + "grad_norm": 1.1747503280639648, + "learning_rate": 4.986487852453265e-06, + "loss": 0.6583, + "step": 820 + }, + { + "epoch": 0.22548750343312277, + "grad_norm": 1.3336994647979736, + "learning_rate": 4.986450326609973e-06, + "loss": 0.729, + "step": 821 + }, + { + "epoch": 0.2257621532546004, + "grad_norm": 1.317300796508789, + "learning_rate": 4.986412748872248e-06, + "loss": 0.7113, + "step": 822 + }, + { + "epoch": 0.226036803076078, + "grad_norm": 1.2223623991012573, + "learning_rate": 4.986375119240875e-06, + "loss": 0.6772, + "step": 823 + }, + { + "epoch": 0.2263114528975556, + "grad_norm": 1.2742339372634888, + "learning_rate": 4.986337437716639e-06, + "loss": 0.6879, + "step": 824 + }, + { + "epoch": 0.22658610271903323, + "grad_norm": 1.316394567489624, + "learning_rate": 4.986299704300328e-06, + "loss": 0.7371, + "step": 825 + }, + { + "epoch": 0.22686075254051086, + "grad_norm": 1.3018203973770142, + "learning_rate": 4.986261918992729e-06, + "loss": 0.7347, + "step": 826 + }, + { + "epoch": 0.22713540236198845, + "grad_norm": 1.2798411846160889, + "learning_rate": 4.986224081794628e-06, + "loss": 0.7219, + "step": 827 + }, + { + "epoch": 0.22741005218346608, + "grad_norm": 1.3078782558441162, + "learning_rate": 4.986186192706818e-06, + "loss": 0.7343, + "step": 828 + }, + { + "epoch": 0.2276847020049437, + "grad_norm": 1.2674369812011719, + "learning_rate": 4.9861482517300876e-06, + "loss": 0.7659, + "step": 829 + }, + { + "epoch": 0.22795935182642132, + "grad_norm": 1.1725728511810303, + "learning_rate": 4.986110258865231e-06, + "loss": 0.6542, + "step": 830 + }, + { + "epoch": 0.22823400164789892, + "grad_norm": 1.3072152137756348, + "learning_rate": 4.986072214113038e-06, + "loss": 0.6612, + "step": 831 + }, + { + "epoch": 0.22850865146937654, + "grad_norm": 1.233365774154663, + "learning_rate": 4.986034117474306e-06, + "loss": 0.637, + "step": 832 + }, + { + "epoch": 0.22878330129085417, + "grad_norm": 1.167496681213379, + "learning_rate": 4.985995968949827e-06, + "loss": 0.6678, + "step": 833 + }, + { + "epoch": 0.22905795111233176, + "grad_norm": 1.248011827468872, + "learning_rate": 4.985957768540399e-06, + "loss": 0.7271, + "step": 834 + }, + { + "epoch": 0.2293326009338094, + "grad_norm": 1.3080165386199951, + "learning_rate": 4.985919516246819e-06, + "loss": 0.6877, + "step": 835 + }, + { + "epoch": 0.229607250755287, + "grad_norm": 1.1870863437652588, + "learning_rate": 4.985881212069885e-06, + "loss": 0.6475, + "step": 836 + }, + { + "epoch": 0.22988190057676464, + "grad_norm": 1.2831885814666748, + "learning_rate": 4.9858428560103975e-06, + "loss": 0.7369, + "step": 837 + }, + { + "epoch": 0.23015655039824223, + "grad_norm": 1.2627655267715454, + "learning_rate": 4.985804448069156e-06, + "loss": 0.7096, + "step": 838 + }, + { + "epoch": 0.23043120021971986, + "grad_norm": 1.2132052183151245, + "learning_rate": 4.985765988246962e-06, + "loss": 0.687, + "step": 839 + }, + { + "epoch": 0.23070585004119748, + "grad_norm": 1.1917312145233154, + "learning_rate": 4.985727476544618e-06, + "loss": 0.6363, + "step": 840 + }, + { + "epoch": 0.23098049986267508, + "grad_norm": 1.2970017194747925, + "learning_rate": 4.985688912962929e-06, + "loss": 0.6895, + "step": 841 + }, + { + "epoch": 0.2312551496841527, + "grad_norm": 1.3590093851089478, + "learning_rate": 4.9856502975027e-06, + "loss": 0.7141, + "step": 842 + }, + { + "epoch": 0.23152979950563032, + "grad_norm": 1.3227688074111938, + "learning_rate": 4.985611630164735e-06, + "loss": 0.7077, + "step": 843 + }, + { + "epoch": 0.23180444932710795, + "grad_norm": 1.2081618309020996, + "learning_rate": 4.985572910949842e-06, + "loss": 0.6834, + "step": 844 + }, + { + "epoch": 0.23207909914858554, + "grad_norm": 1.3101043701171875, + "learning_rate": 4.98553413985883e-06, + "loss": 0.7259, + "step": 845 + }, + { + "epoch": 0.23235374897006317, + "grad_norm": 1.2899553775787354, + "learning_rate": 4.985495316892506e-06, + "loss": 0.7332, + "step": 846 + }, + { + "epoch": 0.2326283987915408, + "grad_norm": 1.2949529886245728, + "learning_rate": 4.985456442051683e-06, + "loss": 0.7143, + "step": 847 + }, + { + "epoch": 0.23290304861301842, + "grad_norm": 1.2609729766845703, + "learning_rate": 4.985417515337171e-06, + "loss": 0.6376, + "step": 848 + }, + { + "epoch": 0.233177698434496, + "grad_norm": 1.289713740348816, + "learning_rate": 4.9853785367497815e-06, + "loss": 0.7393, + "step": 849 + }, + { + "epoch": 0.23345234825597364, + "grad_norm": 1.270831823348999, + "learning_rate": 4.98533950629033e-06, + "loss": 0.7188, + "step": 850 + }, + { + "epoch": 0.23372699807745126, + "grad_norm": 1.20114266872406, + "learning_rate": 4.985300423959629e-06, + "loss": 0.6641, + "step": 851 + }, + { + "epoch": 0.23400164789892886, + "grad_norm": 1.262244462966919, + "learning_rate": 4.985261289758496e-06, + "loss": 0.7547, + "step": 852 + }, + { + "epoch": 0.23427629772040648, + "grad_norm": 1.3522323369979858, + "learning_rate": 4.985222103687747e-06, + "loss": 0.7043, + "step": 853 + }, + { + "epoch": 0.2345509475418841, + "grad_norm": 1.2436219453811646, + "learning_rate": 4.9851828657482e-06, + "loss": 0.7187, + "step": 854 + }, + { + "epoch": 0.23482559736336173, + "grad_norm": 1.238990068435669, + "learning_rate": 4.985143575940673e-06, + "loss": 0.7022, + "step": 855 + }, + { + "epoch": 0.23510024718483932, + "grad_norm": 1.3613369464874268, + "learning_rate": 4.985104234265988e-06, + "loss": 0.7073, + "step": 856 + }, + { + "epoch": 0.23537489700631695, + "grad_norm": 1.2868516445159912, + "learning_rate": 4.985064840724964e-06, + "loss": 0.7072, + "step": 857 + }, + { + "epoch": 0.23564954682779457, + "grad_norm": 1.2406924962997437, + "learning_rate": 4.985025395318425e-06, + "loss": 0.6992, + "step": 858 + }, + { + "epoch": 0.23592419664927217, + "grad_norm": 1.2192474603652954, + "learning_rate": 4.9849858980471924e-06, + "loss": 0.7289, + "step": 859 + }, + { + "epoch": 0.2361988464707498, + "grad_norm": 1.305741786956787, + "learning_rate": 4.984946348912092e-06, + "loss": 0.7077, + "step": 860 + }, + { + "epoch": 0.23647349629222741, + "grad_norm": 1.2120999097824097, + "learning_rate": 4.984906747913949e-06, + "loss": 0.7431, + "step": 861 + }, + { + "epoch": 0.23674814611370504, + "grad_norm": 1.2095791101455688, + "learning_rate": 4.984867095053589e-06, + "loss": 0.6862, + "step": 862 + }, + { + "epoch": 0.23702279593518263, + "grad_norm": 1.3139053583145142, + "learning_rate": 4.984827390331841e-06, + "loss": 0.7206, + "step": 863 + }, + { + "epoch": 0.23729744575666026, + "grad_norm": 1.3170955181121826, + "learning_rate": 4.9847876337495314e-06, + "loss": 0.7477, + "step": 864 + }, + { + "epoch": 0.23757209557813788, + "grad_norm": 1.3773725032806396, + "learning_rate": 4.984747825307493e-06, + "loss": 0.7091, + "step": 865 + }, + { + "epoch": 0.23784674539961548, + "grad_norm": 1.2858976125717163, + "learning_rate": 4.984707965006554e-06, + "loss": 0.7518, + "step": 866 + }, + { + "epoch": 0.2381213952210931, + "grad_norm": 1.2803081274032593, + "learning_rate": 4.984668052847549e-06, + "loss": 0.7097, + "step": 867 + }, + { + "epoch": 0.23839604504257073, + "grad_norm": 1.33771812915802, + "learning_rate": 4.984628088831308e-06, + "loss": 0.6987, + "step": 868 + }, + { + "epoch": 0.23867069486404835, + "grad_norm": 1.2586207389831543, + "learning_rate": 4.984588072958667e-06, + "loss": 0.7501, + "step": 869 + }, + { + "epoch": 0.23894534468552595, + "grad_norm": 1.232498049736023, + "learning_rate": 4.9845480052304606e-06, + "loss": 0.7011, + "step": 870 + }, + { + "epoch": 0.23921999450700357, + "grad_norm": 1.242592453956604, + "learning_rate": 4.9845078856475245e-06, + "loss": 0.7005, + "step": 871 + }, + { + "epoch": 0.2394946443284812, + "grad_norm": 1.2812868356704712, + "learning_rate": 4.984467714210698e-06, + "loss": 0.7201, + "step": 872 + }, + { + "epoch": 0.2397692941499588, + "grad_norm": 1.3022912740707397, + "learning_rate": 4.984427490920817e-06, + "loss": 0.6906, + "step": 873 + }, + { + "epoch": 0.2400439439714364, + "grad_norm": 1.2038233280181885, + "learning_rate": 4.984387215778722e-06, + "loss": 0.7075, + "step": 874 + }, + { + "epoch": 0.24031859379291404, + "grad_norm": 1.3275353908538818, + "learning_rate": 4.984346888785255e-06, + "loss": 0.7079, + "step": 875 + }, + { + "epoch": 0.24059324361439166, + "grad_norm": 1.2155817747116089, + "learning_rate": 4.984306509941256e-06, + "loss": 0.6907, + "step": 876 + }, + { + "epoch": 0.24086789343586926, + "grad_norm": 1.1783818006515503, + "learning_rate": 4.9842660792475676e-06, + "loss": 0.7024, + "step": 877 + }, + { + "epoch": 0.24114254325734688, + "grad_norm": 1.2901397943496704, + "learning_rate": 4.984225596705034e-06, + "loss": 0.741, + "step": 878 + }, + { + "epoch": 0.2414171930788245, + "grad_norm": 1.28879976272583, + "learning_rate": 4.984185062314501e-06, + "loss": 0.6828, + "step": 879 + }, + { + "epoch": 0.24169184290030213, + "grad_norm": 1.2490298748016357, + "learning_rate": 4.9841444760768134e-06, + "loss": 0.69, + "step": 880 + }, + { + "epoch": 0.24196649272177972, + "grad_norm": 1.2445104122161865, + "learning_rate": 4.984103837992819e-06, + "loss": 0.7362, + "step": 881 + }, + { + "epoch": 0.24224114254325735, + "grad_norm": 1.168450117111206, + "learning_rate": 4.984063148063365e-06, + "loss": 0.6633, + "step": 882 + }, + { + "epoch": 0.24251579236473497, + "grad_norm": 1.2901265621185303, + "learning_rate": 4.984022406289302e-06, + "loss": 0.7093, + "step": 883 + }, + { + "epoch": 0.24279044218621257, + "grad_norm": 1.191749095916748, + "learning_rate": 4.983981612671479e-06, + "loss": 0.6811, + "step": 884 + }, + { + "epoch": 0.2430650920076902, + "grad_norm": 1.3347686529159546, + "learning_rate": 4.983940767210749e-06, + "loss": 0.7168, + "step": 885 + }, + { + "epoch": 0.24333974182916782, + "grad_norm": 1.2455517053604126, + "learning_rate": 4.983899869907963e-06, + "loss": 0.7129, + "step": 886 + }, + { + "epoch": 0.24361439165064544, + "grad_norm": 1.2327148914337158, + "learning_rate": 4.9838589207639745e-06, + "loss": 0.6734, + "step": 887 + }, + { + "epoch": 0.24388904147212304, + "grad_norm": 1.2371383905410767, + "learning_rate": 4.9838179197796395e-06, + "loss": 0.7314, + "step": 888 + }, + { + "epoch": 0.24416369129360066, + "grad_norm": 1.302803874015808, + "learning_rate": 4.9837768669558125e-06, + "loss": 0.6943, + "step": 889 + }, + { + "epoch": 0.24443834111507828, + "grad_norm": 1.2600665092468262, + "learning_rate": 4.983735762293351e-06, + "loss": 0.7161, + "step": 890 + }, + { + "epoch": 0.24471299093655588, + "grad_norm": 1.2127025127410889, + "learning_rate": 4.9836946057931126e-06, + "loss": 0.7088, + "step": 891 + }, + { + "epoch": 0.2449876407580335, + "grad_norm": 1.3058143854141235, + "learning_rate": 4.983653397455957e-06, + "loss": 0.726, + "step": 892 + }, + { + "epoch": 0.24526229057951113, + "grad_norm": 1.3128089904785156, + "learning_rate": 4.9836121372827425e-06, + "loss": 0.6874, + "step": 893 + }, + { + "epoch": 0.24553694040098875, + "grad_norm": 1.0972052812576294, + "learning_rate": 4.983570825274332e-06, + "loss": 0.6029, + "step": 894 + }, + { + "epoch": 0.24581159022246635, + "grad_norm": 1.246225118637085, + "learning_rate": 4.983529461431587e-06, + "loss": 0.7042, + "step": 895 + }, + { + "epoch": 0.24608624004394397, + "grad_norm": 1.3225852251052856, + "learning_rate": 4.98348804575537e-06, + "loss": 0.7081, + "step": 896 + }, + { + "epoch": 0.2463608898654216, + "grad_norm": 1.2378884553909302, + "learning_rate": 4.9834465782465465e-06, + "loss": 0.7262, + "step": 897 + }, + { + "epoch": 0.2466355396868992, + "grad_norm": 1.2656704187393188, + "learning_rate": 4.983405058905982e-06, + "loss": 0.7194, + "step": 898 + }, + { + "epoch": 0.24691018950837682, + "grad_norm": 1.2721108198165894, + "learning_rate": 4.983363487734543e-06, + "loss": 0.7393, + "step": 899 + }, + { + "epoch": 0.24718483932985444, + "grad_norm": 1.2820522785186768, + "learning_rate": 4.983321864733098e-06, + "loss": 0.7105, + "step": 900 + }, + { + "epoch": 0.24745948915133206, + "grad_norm": 1.3282326459884644, + "learning_rate": 4.983280189902512e-06, + "loss": 0.7302, + "step": 901 + }, + { + "epoch": 0.24773413897280966, + "grad_norm": 1.2549283504486084, + "learning_rate": 4.983238463243658e-06, + "loss": 0.7268, + "step": 902 + }, + { + "epoch": 0.24800878879428728, + "grad_norm": 1.1678422689437866, + "learning_rate": 4.983196684757408e-06, + "loss": 0.6641, + "step": 903 + }, + { + "epoch": 0.2482834386157649, + "grad_norm": 1.2806346416473389, + "learning_rate": 4.98315485444463e-06, + "loss": 0.7453, + "step": 904 + }, + { + "epoch": 0.2485580884372425, + "grad_norm": 1.317617654800415, + "learning_rate": 4.9831129723062e-06, + "loss": 0.6778, + "step": 905 + }, + { + "epoch": 0.24883273825872013, + "grad_norm": 1.2287358045578003, + "learning_rate": 4.9830710383429916e-06, + "loss": 0.6598, + "step": 906 + }, + { + "epoch": 0.24910738808019775, + "grad_norm": 1.2154239416122437, + "learning_rate": 4.983029052555879e-06, + "loss": 0.6822, + "step": 907 + }, + { + "epoch": 0.24938203790167537, + "grad_norm": 1.29787278175354, + "learning_rate": 4.98298701494574e-06, + "loss": 0.7394, + "step": 908 + }, + { + "epoch": 0.24965668772315297, + "grad_norm": 1.2593048810958862, + "learning_rate": 4.982944925513451e-06, + "loss": 0.7389, + "step": 909 + }, + { + "epoch": 0.2499313375446306, + "grad_norm": 1.3168563842773438, + "learning_rate": 4.982902784259891e-06, + "loss": 0.6911, + "step": 910 + }, + { + "epoch": 0.2502059873661082, + "grad_norm": 1.2939618825912476, + "learning_rate": 4.982860591185938e-06, + "loss": 0.7091, + "step": 911 + }, + { + "epoch": 0.25048063718758584, + "grad_norm": 1.2713887691497803, + "learning_rate": 4.982818346292475e-06, + "loss": 0.6841, + "step": 912 + }, + { + "epoch": 0.25075528700906347, + "grad_norm": 1.2705943584442139, + "learning_rate": 4.982776049580382e-06, + "loss": 0.7621, + "step": 913 + }, + { + "epoch": 0.25102993683054103, + "grad_norm": 1.2955294847488403, + "learning_rate": 4.9827337010505415e-06, + "loss": 0.7103, + "step": 914 + }, + { + "epoch": 0.25130458665201866, + "grad_norm": 1.2614054679870605, + "learning_rate": 4.9826913007038394e-06, + "loss": 0.6987, + "step": 915 + }, + { + "epoch": 0.2515792364734963, + "grad_norm": 1.2423617839813232, + "learning_rate": 4.982648848541158e-06, + "loss": 0.7199, + "step": 916 + }, + { + "epoch": 0.2518538862949739, + "grad_norm": 1.1379387378692627, + "learning_rate": 4.982606344563386e-06, + "loss": 0.6413, + "step": 917 + }, + { + "epoch": 0.25212853611645153, + "grad_norm": 1.299209475517273, + "learning_rate": 4.982563788771407e-06, + "loss": 0.7187, + "step": 918 + }, + { + "epoch": 0.25240318593792915, + "grad_norm": 1.2490566968917847, + "learning_rate": 4.982521181166113e-06, + "loss": 0.7156, + "step": 919 + }, + { + "epoch": 0.2526778357594068, + "grad_norm": 1.331782341003418, + "learning_rate": 4.982478521748391e-06, + "loss": 0.6792, + "step": 920 + }, + { + "epoch": 0.25295248558088435, + "grad_norm": 1.1768856048583984, + "learning_rate": 4.982435810519132e-06, + "loss": 0.6802, + "step": 921 + }, + { + "epoch": 0.25322713540236197, + "grad_norm": 1.2367897033691406, + "learning_rate": 4.982393047479227e-06, + "loss": 0.7414, + "step": 922 + }, + { + "epoch": 0.2535017852238396, + "grad_norm": 1.1635708808898926, + "learning_rate": 4.982350232629569e-06, + "loss": 0.6728, + "step": 923 + }, + { + "epoch": 0.2537764350453172, + "grad_norm": 1.371668815612793, + "learning_rate": 4.982307365971052e-06, + "loss": 0.7289, + "step": 924 + }, + { + "epoch": 0.25405108486679484, + "grad_norm": 1.198479175567627, + "learning_rate": 4.982264447504569e-06, + "loss": 0.7165, + "step": 925 + }, + { + "epoch": 0.25432573468827246, + "grad_norm": 1.2450952529907227, + "learning_rate": 4.9822214772310174e-06, + "loss": 0.7118, + "step": 926 + }, + { + "epoch": 0.2546003845097501, + "grad_norm": 1.2730010747909546, + "learning_rate": 4.9821784551512934e-06, + "loss": 0.7606, + "step": 927 + }, + { + "epoch": 0.2548750343312277, + "grad_norm": 1.284477949142456, + "learning_rate": 4.982135381266296e-06, + "loss": 0.7243, + "step": 928 + }, + { + "epoch": 0.2551496841527053, + "grad_norm": 1.3078871965408325, + "learning_rate": 4.9820922555769214e-06, + "loss": 0.655, + "step": 929 + }, + { + "epoch": 0.2554243339741829, + "grad_norm": 1.2310397624969482, + "learning_rate": 4.982049078084072e-06, + "loss": 0.7304, + "step": 930 + }, + { + "epoch": 0.25569898379566053, + "grad_norm": 1.2723703384399414, + "learning_rate": 4.982005848788648e-06, + "loss": 0.6995, + "step": 931 + }, + { + "epoch": 0.25597363361713815, + "grad_norm": 1.3504489660263062, + "learning_rate": 4.981962567691553e-06, + "loss": 0.6962, + "step": 932 + }, + { + "epoch": 0.2562482834386158, + "grad_norm": 1.2877484560012817, + "learning_rate": 4.981919234793688e-06, + "loss": 0.7515, + "step": 933 + }, + { + "epoch": 0.2565229332600934, + "grad_norm": 1.257575273513794, + "learning_rate": 4.9818758500959595e-06, + "loss": 0.6855, + "step": 934 + }, + { + "epoch": 0.256797583081571, + "grad_norm": 1.296901822090149, + "learning_rate": 4.981832413599272e-06, + "loss": 0.6665, + "step": 935 + }, + { + "epoch": 0.2570722329030486, + "grad_norm": 1.3910408020019531, + "learning_rate": 4.981788925304532e-06, + "loss": 0.6991, + "step": 936 + }, + { + "epoch": 0.2573468827245262, + "grad_norm": 1.3227461576461792, + "learning_rate": 4.9817453852126475e-06, + "loss": 0.7161, + "step": 937 + }, + { + "epoch": 0.25762153254600384, + "grad_norm": 1.2379125356674194, + "learning_rate": 4.981701793324527e-06, + "loss": 0.683, + "step": 938 + }, + { + "epoch": 0.25789618236748146, + "grad_norm": 1.3465633392333984, + "learning_rate": 4.98165814964108e-06, + "loss": 0.7032, + "step": 939 + }, + { + "epoch": 0.2581708321889591, + "grad_norm": 1.2320935726165771, + "learning_rate": 4.981614454163219e-06, + "loss": 0.6965, + "step": 940 + }, + { + "epoch": 0.2584454820104367, + "grad_norm": 1.216944694519043, + "learning_rate": 4.981570706891854e-06, + "loss": 0.7209, + "step": 941 + }, + { + "epoch": 0.25872013183191434, + "grad_norm": 1.2130893468856812, + "learning_rate": 4.9815269078278984e-06, + "loss": 0.7224, + "step": 942 + }, + { + "epoch": 0.2589947816533919, + "grad_norm": 1.310502052307129, + "learning_rate": 4.981483056972267e-06, + "loss": 0.7165, + "step": 943 + }, + { + "epoch": 0.2592694314748695, + "grad_norm": 1.2879263162612915, + "learning_rate": 4.981439154325875e-06, + "loss": 0.7, + "step": 944 + }, + { + "epoch": 0.25954408129634715, + "grad_norm": 1.3549739122390747, + "learning_rate": 4.981395199889639e-06, + "loss": 0.7506, + "step": 945 + }, + { + "epoch": 0.2598187311178248, + "grad_norm": 1.228348731994629, + "learning_rate": 4.981351193664474e-06, + "loss": 0.6708, + "step": 946 + }, + { + "epoch": 0.2600933809393024, + "grad_norm": 1.3207430839538574, + "learning_rate": 4.9813071356513025e-06, + "loss": 0.6875, + "step": 947 + }, + { + "epoch": 0.26036803076078, + "grad_norm": 1.2807351350784302, + "learning_rate": 4.9812630258510395e-06, + "loss": 0.6646, + "step": 948 + }, + { + "epoch": 0.26064268058225765, + "grad_norm": 1.289628505706787, + "learning_rate": 4.981218864264609e-06, + "loss": 0.7433, + "step": 949 + }, + { + "epoch": 0.2609173304037352, + "grad_norm": 1.215345025062561, + "learning_rate": 4.981174650892931e-06, + "loss": 0.636, + "step": 950 + }, + { + "epoch": 0.26119198022521284, + "grad_norm": 1.2548105716705322, + "learning_rate": 4.98113038573693e-06, + "loss": 0.7412, + "step": 951 + }, + { + "epoch": 0.26146663004669046, + "grad_norm": 1.2947429418563843, + "learning_rate": 4.9810860687975284e-06, + "loss": 0.6958, + "step": 952 + }, + { + "epoch": 0.2617412798681681, + "grad_norm": 1.3284801244735718, + "learning_rate": 4.9810417000756505e-06, + "loss": 0.6782, + "step": 953 + }, + { + "epoch": 0.2620159296896457, + "grad_norm": 1.2492486238479614, + "learning_rate": 4.9809972795722235e-06, + "loss": 0.6617, + "step": 954 + }, + { + "epoch": 0.26229057951112333, + "grad_norm": 1.1903424263000488, + "learning_rate": 4.980952807288174e-06, + "loss": 0.6995, + "step": 955 + }, + { + "epoch": 0.26256522933260096, + "grad_norm": 1.3041877746582031, + "learning_rate": 4.980908283224431e-06, + "loss": 0.6808, + "step": 956 + }, + { + "epoch": 0.2628398791540785, + "grad_norm": 1.2137293815612793, + "learning_rate": 4.9808637073819235e-06, + "loss": 0.6346, + "step": 957 + }, + { + "epoch": 0.26311452897555615, + "grad_norm": 1.2349345684051514, + "learning_rate": 4.980819079761581e-06, + "loss": 0.6761, + "step": 958 + }, + { + "epoch": 0.2633891787970338, + "grad_norm": 1.2655107975006104, + "learning_rate": 4.980774400364335e-06, + "loss": 0.6866, + "step": 959 + }, + { + "epoch": 0.2636638286185114, + "grad_norm": 1.2779783010482788, + "learning_rate": 4.980729669191119e-06, + "loss": 0.7521, + "step": 960 + }, + { + "epoch": 0.263938478439989, + "grad_norm": 1.2955983877182007, + "learning_rate": 4.9806848862428656e-06, + "loss": 0.7219, + "step": 961 + }, + { + "epoch": 0.26421312826146665, + "grad_norm": 1.2511858940124512, + "learning_rate": 4.98064005152051e-06, + "loss": 0.7063, + "step": 962 + }, + { + "epoch": 0.26448777808294427, + "grad_norm": 1.1988381147384644, + "learning_rate": 4.980595165024987e-06, + "loss": 0.6733, + "step": 963 + }, + { + "epoch": 0.26476242790442184, + "grad_norm": 1.256410837173462, + "learning_rate": 4.9805502267572355e-06, + "loss": 0.7326, + "step": 964 + }, + { + "epoch": 0.26503707772589946, + "grad_norm": 1.280403733253479, + "learning_rate": 4.9805052367181916e-06, + "loss": 0.7364, + "step": 965 + }, + { + "epoch": 0.2653117275473771, + "grad_norm": 1.3214569091796875, + "learning_rate": 4.980460194908795e-06, + "loss": 0.7017, + "step": 966 + }, + { + "epoch": 0.2655863773688547, + "grad_norm": 1.2558528184890747, + "learning_rate": 4.980415101329985e-06, + "loss": 0.7401, + "step": 967 + }, + { + "epoch": 0.26586102719033233, + "grad_norm": 1.2303555011749268, + "learning_rate": 4.980369955982703e-06, + "loss": 0.6813, + "step": 968 + }, + { + "epoch": 0.26613567701180996, + "grad_norm": 1.3038359880447388, + "learning_rate": 4.980324758867893e-06, + "loss": 0.6951, + "step": 969 + }, + { + "epoch": 0.2664103268332876, + "grad_norm": 1.3217288255691528, + "learning_rate": 4.980279509986496e-06, + "loss": 0.713, + "step": 970 + }, + { + "epoch": 0.26668497665476515, + "grad_norm": 1.3922308683395386, + "learning_rate": 4.980234209339457e-06, + "loss": 0.7335, + "step": 971 + }, + { + "epoch": 0.2669596264762428, + "grad_norm": 1.280900478363037, + "learning_rate": 4.980188856927722e-06, + "loss": 0.7377, + "step": 972 + }, + { + "epoch": 0.2672342762977204, + "grad_norm": 1.3025240898132324, + "learning_rate": 4.980143452752237e-06, + "loss": 0.6963, + "step": 973 + }, + { + "epoch": 0.267508926119198, + "grad_norm": 1.2562007904052734, + "learning_rate": 4.9800979968139494e-06, + "loss": 0.7684, + "step": 974 + }, + { + "epoch": 0.26778357594067564, + "grad_norm": 1.3080564737319946, + "learning_rate": 4.980052489113809e-06, + "loss": 0.7259, + "step": 975 + }, + { + "epoch": 0.26805822576215327, + "grad_norm": 1.353481650352478, + "learning_rate": 4.980006929652766e-06, + "loss": 0.6843, + "step": 976 + }, + { + "epoch": 0.2683328755836309, + "grad_norm": 1.2157846689224243, + "learning_rate": 4.9799613184317685e-06, + "loss": 0.703, + "step": 977 + }, + { + "epoch": 0.26860752540510846, + "grad_norm": 1.263662338256836, + "learning_rate": 4.97991565545177e-06, + "loss": 0.7415, + "step": 978 + }, + { + "epoch": 0.2688821752265861, + "grad_norm": 1.1598557233810425, + "learning_rate": 4.979869940713725e-06, + "loss": 0.6408, + "step": 979 + }, + { + "epoch": 0.2691568250480637, + "grad_norm": 1.2220137119293213, + "learning_rate": 4.979824174218585e-06, + "loss": 0.7217, + "step": 980 + }, + { + "epoch": 0.26943147486954133, + "grad_norm": 1.2184396982192993, + "learning_rate": 4.979778355967307e-06, + "loss": 0.6792, + "step": 981 + }, + { + "epoch": 0.26970612469101896, + "grad_norm": 1.300327181816101, + "learning_rate": 4.979732485960848e-06, + "loss": 0.767, + "step": 982 + }, + { + "epoch": 0.2699807745124966, + "grad_norm": 1.256528615951538, + "learning_rate": 4.979686564200162e-06, + "loss": 0.6979, + "step": 983 + }, + { + "epoch": 0.2702554243339742, + "grad_norm": 1.2479788064956665, + "learning_rate": 4.979640590686211e-06, + "loss": 0.6637, + "step": 984 + }, + { + "epoch": 0.27053007415545177, + "grad_norm": 1.273691177368164, + "learning_rate": 4.979594565419952e-06, + "loss": 0.7585, + "step": 985 + }, + { + "epoch": 0.2708047239769294, + "grad_norm": 1.2591562271118164, + "learning_rate": 4.9795484884023475e-06, + "loss": 0.6545, + "step": 986 + }, + { + "epoch": 0.271079373798407, + "grad_norm": 1.2205631732940674, + "learning_rate": 4.979502359634358e-06, + "loss": 0.6813, + "step": 987 + }, + { + "epoch": 0.27135402361988464, + "grad_norm": 1.2036645412445068, + "learning_rate": 4.979456179116946e-06, + "loss": 0.665, + "step": 988 + }, + { + "epoch": 0.27162867344136227, + "grad_norm": 1.264237403869629, + "learning_rate": 4.979409946851077e-06, + "loss": 0.7091, + "step": 989 + }, + { + "epoch": 0.2719033232628399, + "grad_norm": 1.260743260383606, + "learning_rate": 4.979363662837714e-06, + "loss": 0.6956, + "step": 990 + }, + { + "epoch": 0.2721779730843175, + "grad_norm": 1.2634515762329102, + "learning_rate": 4.979317327077824e-06, + "loss": 0.7095, + "step": 991 + }, + { + "epoch": 0.27245262290579514, + "grad_norm": 1.2023704051971436, + "learning_rate": 4.979270939572373e-06, + "loss": 0.6886, + "step": 992 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.2218390703201294, + "learning_rate": 4.979224500322331e-06, + "loss": 0.7028, + "step": 993 + }, + { + "epoch": 0.27300192254875033, + "grad_norm": 1.3931236267089844, + "learning_rate": 4.979178009328666e-06, + "loss": 0.766, + "step": 994 + }, + { + "epoch": 0.27327657237022795, + "grad_norm": 1.302596092224121, + "learning_rate": 4.979131466592348e-06, + "loss": 0.6812, + "step": 995 + }, + { + "epoch": 0.2735512221917056, + "grad_norm": 1.2319303750991821, + "learning_rate": 4.979084872114349e-06, + "loss": 0.6597, + "step": 996 + }, + { + "epoch": 0.2738258720131832, + "grad_norm": 1.3199958801269531, + "learning_rate": 4.979038225895642e-06, + "loss": 0.69, + "step": 997 + }, + { + "epoch": 0.2741005218346608, + "grad_norm": 1.3419215679168701, + "learning_rate": 4.978991527937199e-06, + "loss": 0.7658, + "step": 998 + }, + { + "epoch": 0.27437517165613845, + "grad_norm": 1.4456923007965088, + "learning_rate": 4.978944778239996e-06, + "loss": 0.7726, + "step": 999 + }, + { + "epoch": 0.274649821477616, + "grad_norm": 1.2877647876739502, + "learning_rate": 4.978897976805009e-06, + "loss": 0.7071, + "step": 1000 + }, + { + "epoch": 0.27492447129909364, + "grad_norm": 1.2310909032821655, + "learning_rate": 4.978851123633214e-06, + "loss": 0.6771, + "step": 1001 + }, + { + "epoch": 0.27519912112057127, + "grad_norm": 1.2899233102798462, + "learning_rate": 4.978804218725587e-06, + "loss": 0.7116, + "step": 1002 + }, + { + "epoch": 0.2754737709420489, + "grad_norm": 1.2363801002502441, + "learning_rate": 4.978757262083111e-06, + "loss": 0.6872, + "step": 1003 + }, + { + "epoch": 0.2757484207635265, + "grad_norm": 1.2041945457458496, + "learning_rate": 4.978710253706762e-06, + "loss": 0.6748, + "step": 1004 + }, + { + "epoch": 0.27602307058500414, + "grad_norm": 1.2160760164260864, + "learning_rate": 4.978663193597524e-06, + "loss": 0.7295, + "step": 1005 + }, + { + "epoch": 0.27629772040648176, + "grad_norm": 1.209977388381958, + "learning_rate": 4.978616081756379e-06, + "loss": 0.697, + "step": 1006 + }, + { + "epoch": 0.27657237022795933, + "grad_norm": 1.2074801921844482, + "learning_rate": 4.978568918184309e-06, + "loss": 0.69, + "step": 1007 + }, + { + "epoch": 0.27684702004943695, + "grad_norm": 1.1789027452468872, + "learning_rate": 4.9785217028822975e-06, + "loss": 0.7366, + "step": 1008 + }, + { + "epoch": 0.2771216698709146, + "grad_norm": 1.330591082572937, + "learning_rate": 4.978474435851332e-06, + "loss": 0.7097, + "step": 1009 + }, + { + "epoch": 0.2773963196923922, + "grad_norm": 1.2768994569778442, + "learning_rate": 4.978427117092398e-06, + "loss": 0.674, + "step": 1010 + }, + { + "epoch": 0.2776709695138698, + "grad_norm": 1.283840298652649, + "learning_rate": 4.9783797466064845e-06, + "loss": 0.6935, + "step": 1011 + }, + { + "epoch": 0.27794561933534745, + "grad_norm": 1.2808358669281006, + "learning_rate": 4.978332324394578e-06, + "loss": 0.7324, + "step": 1012 + }, + { + "epoch": 0.2782202691568251, + "grad_norm": 1.2986747026443481, + "learning_rate": 4.978284850457669e-06, + "loss": 0.7513, + "step": 1013 + }, + { + "epoch": 0.27849491897830264, + "grad_norm": 1.2573647499084473, + "learning_rate": 4.978237324796748e-06, + "loss": 0.6984, + "step": 1014 + }, + { + "epoch": 0.27876956879978027, + "grad_norm": 1.1954389810562134, + "learning_rate": 4.978189747412809e-06, + "loss": 0.665, + "step": 1015 + }, + { + "epoch": 0.2790442186212579, + "grad_norm": 1.3068561553955078, + "learning_rate": 4.978142118306842e-06, + "loss": 0.7229, + "step": 1016 + }, + { + "epoch": 0.2793188684427355, + "grad_norm": 1.2403963804244995, + "learning_rate": 4.978094437479843e-06, + "loss": 0.6891, + "step": 1017 + }, + { + "epoch": 0.27959351826421314, + "grad_norm": 1.2806992530822754, + "learning_rate": 4.978046704932807e-06, + "loss": 0.7098, + "step": 1018 + }, + { + "epoch": 0.27986816808569076, + "grad_norm": 1.3972183465957642, + "learning_rate": 4.97799892066673e-06, + "loss": 0.7562, + "step": 1019 + }, + { + "epoch": 0.2801428179071684, + "grad_norm": 1.2648934125900269, + "learning_rate": 4.977951084682609e-06, + "loss": 0.6485, + "step": 1020 + }, + { + "epoch": 0.28041746772864595, + "grad_norm": 1.2848612070083618, + "learning_rate": 4.977903196981442e-06, + "loss": 0.6848, + "step": 1021 + }, + { + "epoch": 0.2806921175501236, + "grad_norm": 1.3485931158065796, + "learning_rate": 4.977855257564229e-06, + "loss": 0.7523, + "step": 1022 + }, + { + "epoch": 0.2809667673716012, + "grad_norm": 1.243517279624939, + "learning_rate": 4.977807266431971e-06, + "loss": 0.6719, + "step": 1023 + }, + { + "epoch": 0.2812414171930788, + "grad_norm": 1.1943622827529907, + "learning_rate": 4.977759223585669e-06, + "loss": 0.6089, + "step": 1024 + }, + { + "epoch": 0.28151606701455645, + "grad_norm": 1.4312350749969482, + "learning_rate": 4.977711129026326e-06, + "loss": 0.745, + "step": 1025 + }, + { + "epoch": 0.28179071683603407, + "grad_norm": 1.306100606918335, + "learning_rate": 4.977662982754945e-06, + "loss": 0.7033, + "step": 1026 + }, + { + "epoch": 0.2820653666575117, + "grad_norm": 1.3339329957962036, + "learning_rate": 4.977614784772532e-06, + "loss": 0.752, + "step": 1027 + }, + { + "epoch": 0.28234001647898926, + "grad_norm": 1.2642780542373657, + "learning_rate": 4.977566535080092e-06, + "loss": 0.7184, + "step": 1028 + }, + { + "epoch": 0.2826146663004669, + "grad_norm": 1.2235605716705322, + "learning_rate": 4.977518233678633e-06, + "loss": 0.6813, + "step": 1029 + }, + { + "epoch": 0.2828893161219445, + "grad_norm": 1.273101568222046, + "learning_rate": 4.977469880569162e-06, + "loss": 0.743, + "step": 1030 + }, + { + "epoch": 0.28316396594342214, + "grad_norm": 1.2600704431533813, + "learning_rate": 4.977421475752689e-06, + "loss": 0.6577, + "step": 1031 + }, + { + "epoch": 0.28343861576489976, + "grad_norm": 1.2189220190048218, + "learning_rate": 4.977373019230224e-06, + "loss": 0.746, + "step": 1032 + }, + { + "epoch": 0.2837132655863774, + "grad_norm": 1.2899290323257446, + "learning_rate": 4.977324511002778e-06, + "loss": 0.6942, + "step": 1033 + }, + { + "epoch": 0.283987915407855, + "grad_norm": 1.2556238174438477, + "learning_rate": 4.977275951071363e-06, + "loss": 0.7276, + "step": 1034 + }, + { + "epoch": 0.2842625652293326, + "grad_norm": 1.3390672206878662, + "learning_rate": 4.977227339436994e-06, + "loss": 0.713, + "step": 1035 + }, + { + "epoch": 0.2845372150508102, + "grad_norm": 1.2657535076141357, + "learning_rate": 4.977178676100685e-06, + "loss": 0.6692, + "step": 1036 + }, + { + "epoch": 0.2848118648722878, + "grad_norm": 1.237056851387024, + "learning_rate": 4.97712996106345e-06, + "loss": 0.6828, + "step": 1037 + }, + { + "epoch": 0.28508651469376545, + "grad_norm": 1.2238341569900513, + "learning_rate": 4.977081194326309e-06, + "loss": 0.7199, + "step": 1038 + }, + { + "epoch": 0.28536116451524307, + "grad_norm": 1.228433609008789, + "learning_rate": 4.977032375890277e-06, + "loss": 0.7003, + "step": 1039 + }, + { + "epoch": 0.2856358143367207, + "grad_norm": 1.18865966796875, + "learning_rate": 4.976983505756373e-06, + "loss": 0.6217, + "step": 1040 + }, + { + "epoch": 0.2859104641581983, + "grad_norm": 1.3259810209274292, + "learning_rate": 4.976934583925619e-06, + "loss": 0.7121, + "step": 1041 + }, + { + "epoch": 0.2861851139796759, + "grad_norm": 1.318829894065857, + "learning_rate": 4.976885610399034e-06, + "loss": 0.7144, + "step": 1042 + }, + { + "epoch": 0.2864597638011535, + "grad_norm": 1.275184988975525, + "learning_rate": 4.976836585177641e-06, + "loss": 0.6964, + "step": 1043 + }, + { + "epoch": 0.28673441362263113, + "grad_norm": 1.202424168586731, + "learning_rate": 4.976787508262464e-06, + "loss": 0.6712, + "step": 1044 + }, + { + "epoch": 0.28700906344410876, + "grad_norm": 1.3279105424880981, + "learning_rate": 4.976738379654526e-06, + "loss": 0.6871, + "step": 1045 + }, + { + "epoch": 0.2872837132655864, + "grad_norm": 1.2250405550003052, + "learning_rate": 4.976689199354851e-06, + "loss": 0.6956, + "step": 1046 + }, + { + "epoch": 0.287558363087064, + "grad_norm": 1.309221625328064, + "learning_rate": 4.9766399673644695e-06, + "loss": 0.6978, + "step": 1047 + }, + { + "epoch": 0.28783301290854163, + "grad_norm": 1.2365233898162842, + "learning_rate": 4.976590683684406e-06, + "loss": 0.7173, + "step": 1048 + }, + { + "epoch": 0.2881076627300192, + "grad_norm": 1.3102757930755615, + "learning_rate": 4.976541348315689e-06, + "loss": 0.7041, + "step": 1049 + }, + { + "epoch": 0.2883823125514968, + "grad_norm": 1.3233015537261963, + "learning_rate": 4.97649196125935e-06, + "loss": 0.7181, + "step": 1050 + }, + { + "epoch": 0.28865696237297445, + "grad_norm": 1.330296277999878, + "learning_rate": 4.976442522516418e-06, + "loss": 0.7008, + "step": 1051 + }, + { + "epoch": 0.28893161219445207, + "grad_norm": 1.2273402214050293, + "learning_rate": 4.976393032087926e-06, + "loss": 0.6788, + "step": 1052 + }, + { + "epoch": 0.2892062620159297, + "grad_norm": 1.2416349649429321, + "learning_rate": 4.976343489974907e-06, + "loss": 0.7435, + "step": 1053 + }, + { + "epoch": 0.2894809118374073, + "grad_norm": 1.3420822620391846, + "learning_rate": 4.976293896178394e-06, + "loss": 0.6674, + "step": 1054 + }, + { + "epoch": 0.28975556165888494, + "grad_norm": 1.303139090538025, + "learning_rate": 4.976244250699422e-06, + "loss": 0.7221, + "step": 1055 + }, + { + "epoch": 0.29003021148036257, + "grad_norm": 1.4449259042739868, + "learning_rate": 4.976194553539028e-06, + "loss": 0.7367, + "step": 1056 + }, + { + "epoch": 0.29030486130184013, + "grad_norm": 1.348315715789795, + "learning_rate": 4.976144804698249e-06, + "loss": 0.7152, + "step": 1057 + }, + { + "epoch": 0.29057951112331776, + "grad_norm": 1.2402122020721436, + "learning_rate": 4.976095004178123e-06, + "loss": 0.683, + "step": 1058 + }, + { + "epoch": 0.2908541609447954, + "grad_norm": 1.238673448562622, + "learning_rate": 4.97604515197969e-06, + "loss": 0.7004, + "step": 1059 + }, + { + "epoch": 0.291128810766273, + "grad_norm": 1.2019429206848145, + "learning_rate": 4.975995248103989e-06, + "loss": 0.7256, + "step": 1060 + }, + { + "epoch": 0.29140346058775063, + "grad_norm": 1.2706888914108276, + "learning_rate": 4.9759452925520635e-06, + "loss": 0.6918, + "step": 1061 + }, + { + "epoch": 0.29167811040922825, + "grad_norm": 1.2903348207473755, + "learning_rate": 4.9758952853249555e-06, + "loss": 0.6867, + "step": 1062 + }, + { + "epoch": 0.2919527602307059, + "grad_norm": 1.2567076683044434, + "learning_rate": 4.975845226423708e-06, + "loss": 0.7507, + "step": 1063 + }, + { + "epoch": 0.29222741005218345, + "grad_norm": 1.1991972923278809, + "learning_rate": 4.975795115849366e-06, + "loss": 0.6758, + "step": 1064 + }, + { + "epoch": 0.29250205987366107, + "grad_norm": 1.2050590515136719, + "learning_rate": 4.975744953602975e-06, + "loss": 0.6877, + "step": 1065 + }, + { + "epoch": 0.2927767096951387, + "grad_norm": 1.3248696327209473, + "learning_rate": 4.9756947396855835e-06, + "loss": 0.7008, + "step": 1066 + }, + { + "epoch": 0.2930513595166163, + "grad_norm": 1.2641798257827759, + "learning_rate": 4.975644474098238e-06, + "loss": 0.7392, + "step": 1067 + }, + { + "epoch": 0.29332600933809394, + "grad_norm": 1.2709039449691772, + "learning_rate": 4.975594156841988e-06, + "loss": 0.7108, + "step": 1068 + }, + { + "epoch": 0.29360065915957156, + "grad_norm": 1.303604006767273, + "learning_rate": 4.975543787917884e-06, + "loss": 0.6798, + "step": 1069 + }, + { + "epoch": 0.2938753089810492, + "grad_norm": 1.2379215955734253, + "learning_rate": 4.975493367326975e-06, + "loss": 0.6581, + "step": 1070 + }, + { + "epoch": 0.29414995880252676, + "grad_norm": 1.3104496002197266, + "learning_rate": 4.975442895070318e-06, + "loss": 0.7613, + "step": 1071 + }, + { + "epoch": 0.2944246086240044, + "grad_norm": 1.3025963306427002, + "learning_rate": 4.975392371148963e-06, + "loss": 0.6843, + "step": 1072 + }, + { + "epoch": 0.294699258445482, + "grad_norm": 1.1400501728057861, + "learning_rate": 4.9753417955639636e-06, + "loss": 0.638, + "step": 1073 + }, + { + "epoch": 0.29497390826695963, + "grad_norm": 1.267250657081604, + "learning_rate": 4.9752911683163784e-06, + "loss": 0.71, + "step": 1074 + }, + { + "epoch": 0.29524855808843725, + "grad_norm": 1.2476543188095093, + "learning_rate": 4.975240489407263e-06, + "loss": 0.6544, + "step": 1075 + }, + { + "epoch": 0.2955232079099149, + "grad_norm": 1.2070684432983398, + "learning_rate": 4.975189758837673e-06, + "loss": 0.7075, + "step": 1076 + }, + { + "epoch": 0.2957978577313925, + "grad_norm": 1.2883967161178589, + "learning_rate": 4.97513897660867e-06, + "loss": 0.6678, + "step": 1077 + }, + { + "epoch": 0.29607250755287007, + "grad_norm": 1.2714040279388428, + "learning_rate": 4.9750881427213125e-06, + "loss": 0.7354, + "step": 1078 + }, + { + "epoch": 0.2963471573743477, + "grad_norm": 1.2661608457565308, + "learning_rate": 4.975037257176661e-06, + "loss": 0.7507, + "step": 1079 + }, + { + "epoch": 0.2966218071958253, + "grad_norm": 1.2865004539489746, + "learning_rate": 4.974986319975778e-06, + "loss": 0.7156, + "step": 1080 + }, + { + "epoch": 0.29689645701730294, + "grad_norm": 1.1958593130111694, + "learning_rate": 4.974935331119729e-06, + "loss": 0.7079, + "step": 1081 + }, + { + "epoch": 0.29717110683878056, + "grad_norm": 1.2677067518234253, + "learning_rate": 4.974884290609574e-06, + "loss": 0.7571, + "step": 1082 + }, + { + "epoch": 0.2974457566602582, + "grad_norm": 1.2182669639587402, + "learning_rate": 4.97483319844638e-06, + "loss": 0.6754, + "step": 1083 + }, + { + "epoch": 0.2977204064817358, + "grad_norm": 1.2605763673782349, + "learning_rate": 4.9747820546312154e-06, + "loss": 0.7324, + "step": 1084 + }, + { + "epoch": 0.2979950563032134, + "grad_norm": 1.2188094854354858, + "learning_rate": 4.9747308591651445e-06, + "loss": 0.6627, + "step": 1085 + }, + { + "epoch": 0.298269706124691, + "grad_norm": 1.2251505851745605, + "learning_rate": 4.974679612049237e-06, + "loss": 0.678, + "step": 1086 + }, + { + "epoch": 0.2985443559461686, + "grad_norm": 1.2656257152557373, + "learning_rate": 4.974628313284563e-06, + "loss": 0.6955, + "step": 1087 + }, + { + "epoch": 0.29881900576764625, + "grad_norm": 1.198972225189209, + "learning_rate": 4.974576962872193e-06, + "loss": 0.708, + "step": 1088 + }, + { + "epoch": 0.2990936555891239, + "grad_norm": 1.2038213014602661, + "learning_rate": 4.974525560813198e-06, + "loss": 0.6602, + "step": 1089 + }, + { + "epoch": 0.2993683054106015, + "grad_norm": 1.2352268695831299, + "learning_rate": 4.974474107108651e-06, + "loss": 0.6603, + "step": 1090 + }, + { + "epoch": 0.2996429552320791, + "grad_norm": 1.2886539697647095, + "learning_rate": 4.974422601759627e-06, + "loss": 0.6834, + "step": 1091 + }, + { + "epoch": 0.2999176050535567, + "grad_norm": 1.2558372020721436, + "learning_rate": 4.9743710447672e-06, + "loss": 0.7282, + "step": 1092 + }, + { + "epoch": 0.3001922548750343, + "grad_norm": 1.317233920097351, + "learning_rate": 4.974319436132445e-06, + "loss": 0.7351, + "step": 1093 + }, + { + "epoch": 0.30046690469651194, + "grad_norm": 1.2305881977081299, + "learning_rate": 4.9742677758564414e-06, + "loss": 0.7317, + "step": 1094 + }, + { + "epoch": 0.30074155451798956, + "grad_norm": 1.1871901750564575, + "learning_rate": 4.974216063940266e-06, + "loss": 0.6999, + "step": 1095 + }, + { + "epoch": 0.3010162043394672, + "grad_norm": 1.2979683876037598, + "learning_rate": 4.974164300384998e-06, + "loss": 0.6879, + "step": 1096 + }, + { + "epoch": 0.3012908541609448, + "grad_norm": 1.2294809818267822, + "learning_rate": 4.974112485191718e-06, + "loss": 0.6869, + "step": 1097 + }, + { + "epoch": 0.30156550398242243, + "grad_norm": 1.2687429189682007, + "learning_rate": 4.974060618361508e-06, + "loss": 0.7009, + "step": 1098 + }, + { + "epoch": 0.3018401538039, + "grad_norm": 1.3056883811950684, + "learning_rate": 4.97400869989545e-06, + "loss": 0.6855, + "step": 1099 + }, + { + "epoch": 0.3021148036253776, + "grad_norm": 1.2734125852584839, + "learning_rate": 4.973956729794626e-06, + "loss": 0.7482, + "step": 1100 + }, + { + "epoch": 0.30238945344685525, + "grad_norm": 1.203019380569458, + "learning_rate": 4.973904708060125e-06, + "loss": 0.6466, + "step": 1101 + }, + { + "epoch": 0.3026641032683329, + "grad_norm": 1.2725151777267456, + "learning_rate": 4.973852634693029e-06, + "loss": 0.6645, + "step": 1102 + }, + { + "epoch": 0.3029387530898105, + "grad_norm": 1.2640999555587769, + "learning_rate": 4.9738005096944245e-06, + "loss": 0.6931, + "step": 1103 + }, + { + "epoch": 0.3032134029112881, + "grad_norm": 1.2882065773010254, + "learning_rate": 4.973748333065402e-06, + "loss": 0.6751, + "step": 1104 + }, + { + "epoch": 0.30348805273276575, + "grad_norm": 1.3069666624069214, + "learning_rate": 4.9736961048070485e-06, + "loss": 0.6455, + "step": 1105 + }, + { + "epoch": 0.3037627025542433, + "grad_norm": 1.292135238647461, + "learning_rate": 4.973643824920455e-06, + "loss": 0.7039, + "step": 1106 + }, + { + "epoch": 0.30403735237572094, + "grad_norm": 1.2356903553009033, + "learning_rate": 4.973591493406713e-06, + "loss": 0.6884, + "step": 1107 + }, + { + "epoch": 0.30431200219719856, + "grad_norm": 1.3681414127349854, + "learning_rate": 4.973539110266913e-06, + "loss": 0.7198, + "step": 1108 + }, + { + "epoch": 0.3045866520186762, + "grad_norm": 1.2274909019470215, + "learning_rate": 4.973486675502151e-06, + "loss": 0.6849, + "step": 1109 + }, + { + "epoch": 0.3048613018401538, + "grad_norm": 1.3158444166183472, + "learning_rate": 4.973434189113518e-06, + "loss": 0.7189, + "step": 1110 + }, + { + "epoch": 0.30513595166163143, + "grad_norm": 1.217949390411377, + "learning_rate": 4.973381651102112e-06, + "loss": 0.6403, + "step": 1111 + }, + { + "epoch": 0.30541060148310906, + "grad_norm": 1.2558484077453613, + "learning_rate": 4.973329061469028e-06, + "loss": 0.7417, + "step": 1112 + }, + { + "epoch": 0.3056852513045866, + "grad_norm": 1.3415907621383667, + "learning_rate": 4.973276420215365e-06, + "loss": 0.7211, + "step": 1113 + }, + { + "epoch": 0.30595990112606425, + "grad_norm": 1.2331265211105347, + "learning_rate": 4.973223727342221e-06, + "loss": 0.6571, + "step": 1114 + }, + { + "epoch": 0.3062345509475419, + "grad_norm": 1.2444936037063599, + "learning_rate": 4.973170982850695e-06, + "loss": 0.6679, + "step": 1115 + }, + { + "epoch": 0.3065092007690195, + "grad_norm": 1.254255771636963, + "learning_rate": 4.973118186741889e-06, + "loss": 0.7129, + "step": 1116 + }, + { + "epoch": 0.3067838505904971, + "grad_norm": 1.207050085067749, + "learning_rate": 4.973065339016904e-06, + "loss": 0.6513, + "step": 1117 + }, + { + "epoch": 0.30705850041197474, + "grad_norm": 1.2654480934143066, + "learning_rate": 4.973012439676843e-06, + "loss": 0.7106, + "step": 1118 + }, + { + "epoch": 0.30733315023345237, + "grad_norm": 1.2694567441940308, + "learning_rate": 4.972959488722811e-06, + "loss": 0.7017, + "step": 1119 + }, + { + "epoch": 0.30760780005493, + "grad_norm": 1.3504406213760376, + "learning_rate": 4.972906486155913e-06, + "loss": 0.7262, + "step": 1120 + }, + { + "epoch": 0.30788244987640756, + "grad_norm": 1.2533509731292725, + "learning_rate": 4.9728534319772545e-06, + "loss": 0.7113, + "step": 1121 + }, + { + "epoch": 0.3081570996978852, + "grad_norm": 1.3151748180389404, + "learning_rate": 4.972800326187943e-06, + "loss": 0.7028, + "step": 1122 + }, + { + "epoch": 0.3084317495193628, + "grad_norm": 1.2147001028060913, + "learning_rate": 4.972747168789087e-06, + "loss": 0.6917, + "step": 1123 + }, + { + "epoch": 0.30870639934084043, + "grad_norm": 1.3302382230758667, + "learning_rate": 4.972693959781796e-06, + "loss": 0.7404, + "step": 1124 + }, + { + "epoch": 0.30898104916231806, + "grad_norm": 1.2579518556594849, + "learning_rate": 4.97264069916718e-06, + "loss": 0.7039, + "step": 1125 + }, + { + "epoch": 0.3092556989837957, + "grad_norm": 1.253291130065918, + "learning_rate": 4.972587386946351e-06, + "loss": 0.6891, + "step": 1126 + }, + { + "epoch": 0.3095303488052733, + "grad_norm": 1.2578589916229248, + "learning_rate": 4.972534023120422e-06, + "loss": 0.6974, + "step": 1127 + }, + { + "epoch": 0.30980499862675087, + "grad_norm": 1.233010172843933, + "learning_rate": 4.972480607690506e-06, + "loss": 0.661, + "step": 1128 + }, + { + "epoch": 0.3100796484482285, + "grad_norm": 1.3245187997817993, + "learning_rate": 4.972427140657718e-06, + "loss": 0.657, + "step": 1129 + }, + { + "epoch": 0.3103542982697061, + "grad_norm": 1.2921481132507324, + "learning_rate": 4.972373622023176e-06, + "loss": 0.7082, + "step": 1130 + }, + { + "epoch": 0.31062894809118374, + "grad_norm": 1.1943910121917725, + "learning_rate": 4.972320051787993e-06, + "loss": 0.7013, + "step": 1131 + }, + { + "epoch": 0.31090359791266137, + "grad_norm": 1.3244467973709106, + "learning_rate": 4.972266429953291e-06, + "loss": 0.7382, + "step": 1132 + }, + { + "epoch": 0.311178247734139, + "grad_norm": 1.2655655145645142, + "learning_rate": 4.9722127565201855e-06, + "loss": 0.6762, + "step": 1133 + }, + { + "epoch": 0.3114528975556166, + "grad_norm": 1.2360156774520874, + "learning_rate": 4.972159031489799e-06, + "loss": 0.6746, + "step": 1134 + }, + { + "epoch": 0.3117275473770942, + "grad_norm": 1.1925410032272339, + "learning_rate": 4.972105254863253e-06, + "loss": 0.7044, + "step": 1135 + }, + { + "epoch": 0.3120021971985718, + "grad_norm": 1.2351912260055542, + "learning_rate": 4.972051426641669e-06, + "loss": 0.6685, + "step": 1136 + }, + { + "epoch": 0.31227684702004943, + "grad_norm": 1.2430596351623535, + "learning_rate": 4.971997546826171e-06, + "loss": 0.6596, + "step": 1137 + }, + { + "epoch": 0.31255149684152705, + "grad_norm": 1.291938304901123, + "learning_rate": 4.971943615417882e-06, + "loss": 0.6937, + "step": 1138 + }, + { + "epoch": 0.3128261466630047, + "grad_norm": 1.2360295057296753, + "learning_rate": 4.97188963241793e-06, + "loss": 0.7164, + "step": 1139 + }, + { + "epoch": 0.3131007964844823, + "grad_norm": 1.1582595109939575, + "learning_rate": 4.971835597827439e-06, + "loss": 0.6294, + "step": 1140 + }, + { + "epoch": 0.3133754463059599, + "grad_norm": 1.455346941947937, + "learning_rate": 4.97178151164754e-06, + "loss": 0.7163, + "step": 1141 + }, + { + "epoch": 0.3136500961274375, + "grad_norm": 1.2973326444625854, + "learning_rate": 4.9717273738793585e-06, + "loss": 0.7228, + "step": 1142 + }, + { + "epoch": 0.3139247459489151, + "grad_norm": 1.3986222743988037, + "learning_rate": 4.9716731845240266e-06, + "loss": 0.7265, + "step": 1143 + }, + { + "epoch": 0.31419939577039274, + "grad_norm": 1.252943992614746, + "learning_rate": 4.9716189435826746e-06, + "loss": 0.6736, + "step": 1144 + }, + { + "epoch": 0.31447404559187037, + "grad_norm": 1.3144580125808716, + "learning_rate": 4.971564651056434e-06, + "loss": 0.7207, + "step": 1145 + }, + { + "epoch": 0.314748695413348, + "grad_norm": 1.2155941724777222, + "learning_rate": 4.971510306946439e-06, + "loss": 0.6905, + "step": 1146 + }, + { + "epoch": 0.3150233452348256, + "grad_norm": 1.23052978515625, + "learning_rate": 4.971455911253823e-06, + "loss": 0.6926, + "step": 1147 + }, + { + "epoch": 0.31529799505630324, + "grad_norm": 1.2047772407531738, + "learning_rate": 4.971401463979722e-06, + "loss": 0.6592, + "step": 1148 + }, + { + "epoch": 0.3155726448777808, + "grad_norm": 1.2332119941711426, + "learning_rate": 4.971346965125271e-06, + "loss": 0.6993, + "step": 1149 + }, + { + "epoch": 0.31584729469925843, + "grad_norm": 1.261730432510376, + "learning_rate": 4.97129241469161e-06, + "loss": 0.7038, + "step": 1150 + }, + { + "epoch": 0.31612194452073605, + "grad_norm": 1.262345314025879, + "learning_rate": 4.971237812679875e-06, + "loss": 0.6667, + "step": 1151 + }, + { + "epoch": 0.3163965943422137, + "grad_norm": 1.3032517433166504, + "learning_rate": 4.9711831590912055e-06, + "loss": 0.7184, + "step": 1152 + }, + { + "epoch": 0.3166712441636913, + "grad_norm": 1.2083176374435425, + "learning_rate": 4.971128453926745e-06, + "loss": 0.7528, + "step": 1153 + }, + { + "epoch": 0.3169458939851689, + "grad_norm": 1.2534137964248657, + "learning_rate": 4.971073697187632e-06, + "loss": 0.6463, + "step": 1154 + }, + { + "epoch": 0.31722054380664655, + "grad_norm": 1.2843812704086304, + "learning_rate": 4.971018888875011e-06, + "loss": 0.6691, + "step": 1155 + }, + { + "epoch": 0.3174951936281241, + "grad_norm": 1.3202883005142212, + "learning_rate": 4.9709640289900255e-06, + "loss": 0.7128, + "step": 1156 + }, + { + "epoch": 0.31776984344960174, + "grad_norm": 1.2320432662963867, + "learning_rate": 4.9709091175338206e-06, + "loss": 0.718, + "step": 1157 + }, + { + "epoch": 0.31804449327107936, + "grad_norm": 1.2808107137680054, + "learning_rate": 4.970854154507543e-06, + "loss": 0.7138, + "step": 1158 + }, + { + "epoch": 0.318319143092557, + "grad_norm": 1.3208327293395996, + "learning_rate": 4.970799139912339e-06, + "loss": 0.6742, + "step": 1159 + }, + { + "epoch": 0.3185937929140346, + "grad_norm": 1.2974694967269897, + "learning_rate": 4.970744073749357e-06, + "loss": 0.6705, + "step": 1160 + }, + { + "epoch": 0.31886844273551224, + "grad_norm": 1.3387993574142456, + "learning_rate": 4.970688956019746e-06, + "loss": 0.6825, + "step": 1161 + }, + { + "epoch": 0.31914309255698986, + "grad_norm": 1.199853539466858, + "learning_rate": 4.970633786724657e-06, + "loss": 0.6999, + "step": 1162 + }, + { + "epoch": 0.31941774237846743, + "grad_norm": 1.393248200416565, + "learning_rate": 4.97057856586524e-06, + "loss": 0.7245, + "step": 1163 + }, + { + "epoch": 0.31969239219994505, + "grad_norm": 1.2718462944030762, + "learning_rate": 4.97052329344265e-06, + "loss": 0.7173, + "step": 1164 + }, + { + "epoch": 0.3199670420214227, + "grad_norm": 1.2557599544525146, + "learning_rate": 4.9704679694580376e-06, + "loss": 0.6496, + "step": 1165 + }, + { + "epoch": 0.3202416918429003, + "grad_norm": 1.2766826152801514, + "learning_rate": 4.97041259391256e-06, + "loss": 0.7072, + "step": 1166 + }, + { + "epoch": 0.3205163416643779, + "grad_norm": 1.35191810131073, + "learning_rate": 4.970357166807371e-06, + "loss": 0.6955, + "step": 1167 + }, + { + "epoch": 0.32079099148585555, + "grad_norm": 1.2542033195495605, + "learning_rate": 4.9703016881436295e-06, + "loss": 0.7049, + "step": 1168 + }, + { + "epoch": 0.32106564130733317, + "grad_norm": 1.231925368309021, + "learning_rate": 4.970246157922492e-06, + "loss": 0.656, + "step": 1169 + }, + { + "epoch": 0.32134029112881074, + "grad_norm": 1.4131462574005127, + "learning_rate": 4.970190576145117e-06, + "loss": 0.7063, + "step": 1170 + }, + { + "epoch": 0.32161494095028836, + "grad_norm": 1.2872861623764038, + "learning_rate": 4.970134942812666e-06, + "loss": 0.6975, + "step": 1171 + }, + { + "epoch": 0.321889590771766, + "grad_norm": 1.2075905799865723, + "learning_rate": 4.970079257926299e-06, + "loss": 0.6456, + "step": 1172 + }, + { + "epoch": 0.3221642405932436, + "grad_norm": 1.3115437030792236, + "learning_rate": 4.970023521487179e-06, + "loss": 0.7276, + "step": 1173 + }, + { + "epoch": 0.32243889041472124, + "grad_norm": 1.3721809387207031, + "learning_rate": 4.9699677334964685e-06, + "loss": 0.7326, + "step": 1174 + }, + { + "epoch": 0.32271354023619886, + "grad_norm": 1.2202194929122925, + "learning_rate": 4.969911893955332e-06, + "loss": 0.7244, + "step": 1175 + }, + { + "epoch": 0.3229881900576765, + "grad_norm": 1.2224313020706177, + "learning_rate": 4.969856002864936e-06, + "loss": 0.678, + "step": 1176 + }, + { + "epoch": 0.32326283987915405, + "grad_norm": 1.2608031034469604, + "learning_rate": 4.969800060226445e-06, + "loss": 0.7135, + "step": 1177 + }, + { + "epoch": 0.3235374897006317, + "grad_norm": 1.2331879138946533, + "learning_rate": 4.969744066041028e-06, + "loss": 0.7068, + "step": 1178 + }, + { + "epoch": 0.3238121395221093, + "grad_norm": 1.2968316078186035, + "learning_rate": 4.969688020309853e-06, + "loss": 0.6951, + "step": 1179 + }, + { + "epoch": 0.3240867893435869, + "grad_norm": 1.353359580039978, + "learning_rate": 4.96963192303409e-06, + "loss": 0.701, + "step": 1180 + }, + { + "epoch": 0.32436143916506455, + "grad_norm": 1.315069317817688, + "learning_rate": 4.969575774214911e-06, + "loss": 0.6874, + "step": 1181 + }, + { + "epoch": 0.32463608898654217, + "grad_norm": 1.2289810180664062, + "learning_rate": 4.969519573853485e-06, + "loss": 0.6891, + "step": 1182 + }, + { + "epoch": 0.3249107388080198, + "grad_norm": 1.2072538137435913, + "learning_rate": 4.969463321950988e-06, + "loss": 0.6506, + "step": 1183 + }, + { + "epoch": 0.3251853886294974, + "grad_norm": 1.3120249509811401, + "learning_rate": 4.969407018508592e-06, + "loss": 0.6843, + "step": 1184 + }, + { + "epoch": 0.325460038450975, + "grad_norm": 1.2871127128601074, + "learning_rate": 4.969350663527473e-06, + "loss": 0.7125, + "step": 1185 + }, + { + "epoch": 0.3257346882724526, + "grad_norm": 1.2735702991485596, + "learning_rate": 4.969294257008807e-06, + "loss": 0.6902, + "step": 1186 + }, + { + "epoch": 0.32600933809393023, + "grad_norm": 1.332248330116272, + "learning_rate": 4.969237798953772e-06, + "loss": 0.7011, + "step": 1187 + }, + { + "epoch": 0.32628398791540786, + "grad_norm": 1.2655326128005981, + "learning_rate": 4.969181289363544e-06, + "loss": 0.7069, + "step": 1188 + }, + { + "epoch": 0.3265586377368855, + "grad_norm": 1.2368921041488647, + "learning_rate": 4.969124728239305e-06, + "loss": 0.701, + "step": 1189 + }, + { + "epoch": 0.3268332875583631, + "grad_norm": 1.2421051263809204, + "learning_rate": 4.969068115582234e-06, + "loss": 0.6664, + "step": 1190 + }, + { + "epoch": 0.32710793737984073, + "grad_norm": 1.3675907850265503, + "learning_rate": 4.969011451393513e-06, + "loss": 0.7257, + "step": 1191 + }, + { + "epoch": 0.3273825872013183, + "grad_norm": 1.2941806316375732, + "learning_rate": 4.9689547356743255e-06, + "loss": 0.6506, + "step": 1192 + }, + { + "epoch": 0.3276572370227959, + "grad_norm": 1.1745949983596802, + "learning_rate": 4.968897968425853e-06, + "loss": 0.6131, + "step": 1193 + }, + { + "epoch": 0.32793188684427355, + "grad_norm": 1.2895632982254028, + "learning_rate": 4.968841149649282e-06, + "loss": 0.6978, + "step": 1194 + }, + { + "epoch": 0.32820653666575117, + "grad_norm": 1.3631726503372192, + "learning_rate": 4.968784279345799e-06, + "loss": 0.7078, + "step": 1195 + }, + { + "epoch": 0.3284811864872288, + "grad_norm": 1.2365717887878418, + "learning_rate": 4.968727357516588e-06, + "loss": 0.6695, + "step": 1196 + }, + { + "epoch": 0.3287558363087064, + "grad_norm": 1.2193000316619873, + "learning_rate": 4.96867038416284e-06, + "loss": 0.6901, + "step": 1197 + }, + { + "epoch": 0.32903048613018404, + "grad_norm": 1.2034114599227905, + "learning_rate": 4.968613359285743e-06, + "loss": 0.674, + "step": 1198 + }, + { + "epoch": 0.3293051359516616, + "grad_norm": 1.284611463546753, + "learning_rate": 4.9685562828864875e-06, + "loss": 0.6907, + "step": 1199 + }, + { + "epoch": 0.32957978577313923, + "grad_norm": 1.2640138864517212, + "learning_rate": 4.968499154966264e-06, + "loss": 0.6368, + "step": 1200 + }, + { + "epoch": 0.32985443559461686, + "grad_norm": 1.2003651857376099, + "learning_rate": 4.968441975526266e-06, + "loss": 0.6896, + "step": 1201 + }, + { + "epoch": 0.3301290854160945, + "grad_norm": 1.3906826972961426, + "learning_rate": 4.968384744567685e-06, + "loss": 0.7204, + "step": 1202 + }, + { + "epoch": 0.3304037352375721, + "grad_norm": 1.1921378374099731, + "learning_rate": 4.968327462091717e-06, + "loss": 0.6868, + "step": 1203 + }, + { + "epoch": 0.33067838505904973, + "grad_norm": 1.2754156589508057, + "learning_rate": 4.968270128099557e-06, + "loss": 0.6953, + "step": 1204 + }, + { + "epoch": 0.33095303488052735, + "grad_norm": 1.2439545392990112, + "learning_rate": 4.968212742592402e-06, + "loss": 0.6997, + "step": 1205 + }, + { + "epoch": 0.3312276847020049, + "grad_norm": 1.305830478668213, + "learning_rate": 4.968155305571449e-06, + "loss": 0.6608, + "step": 1206 + }, + { + "epoch": 0.33150233452348254, + "grad_norm": 1.3759721517562866, + "learning_rate": 4.968097817037897e-06, + "loss": 0.7009, + "step": 1207 + }, + { + "epoch": 0.33177698434496017, + "grad_norm": 1.2901371717453003, + "learning_rate": 4.968040276992947e-06, + "loss": 0.7307, + "step": 1208 + }, + { + "epoch": 0.3320516341664378, + "grad_norm": 1.2595411539077759, + "learning_rate": 4.967982685437798e-06, + "loss": 0.6846, + "step": 1209 + }, + { + "epoch": 0.3323262839879154, + "grad_norm": 1.224743366241455, + "learning_rate": 4.967925042373653e-06, + "loss": 0.7468, + "step": 1210 + }, + { + "epoch": 0.33260093380939304, + "grad_norm": 1.2710872888565063, + "learning_rate": 4.967867347801715e-06, + "loss": 0.7104, + "step": 1211 + }, + { + "epoch": 0.33287558363087066, + "grad_norm": 1.3188378810882568, + "learning_rate": 4.967809601723188e-06, + "loss": 0.6723, + "step": 1212 + }, + { + "epoch": 0.33315023345234823, + "grad_norm": 1.27811598777771, + "learning_rate": 4.967751804139278e-06, + "loss": 0.7008, + "step": 1213 + }, + { + "epoch": 0.33342488327382586, + "grad_norm": 1.2898287773132324, + "learning_rate": 4.96769395505119e-06, + "loss": 0.6157, + "step": 1214 + }, + { + "epoch": 0.3336995330953035, + "grad_norm": 1.3266711235046387, + "learning_rate": 4.967636054460131e-06, + "loss": 0.6689, + "step": 1215 + }, + { + "epoch": 0.3339741829167811, + "grad_norm": 1.3278069496154785, + "learning_rate": 4.967578102367312e-06, + "loss": 0.7399, + "step": 1216 + }, + { + "epoch": 0.3342488327382587, + "grad_norm": 1.2430450916290283, + "learning_rate": 4.96752009877394e-06, + "loss": 0.7301, + "step": 1217 + }, + { + "epoch": 0.33452348255973635, + "grad_norm": 1.3095192909240723, + "learning_rate": 4.967462043681227e-06, + "loss": 0.7313, + "step": 1218 + }, + { + "epoch": 0.334798132381214, + "grad_norm": 1.3126986026763916, + "learning_rate": 4.967403937090384e-06, + "loss": 0.7302, + "step": 1219 + }, + { + "epoch": 0.33507278220269154, + "grad_norm": 1.3070597648620605, + "learning_rate": 4.967345779002623e-06, + "loss": 0.7307, + "step": 1220 + }, + { + "epoch": 0.33534743202416917, + "grad_norm": 1.1980966329574585, + "learning_rate": 4.96728756941916e-06, + "loss": 0.6907, + "step": 1221 + }, + { + "epoch": 0.3356220818456468, + "grad_norm": 1.4078737497329712, + "learning_rate": 4.967229308341208e-06, + "loss": 0.6867, + "step": 1222 + }, + { + "epoch": 0.3358967316671244, + "grad_norm": 1.3017908334732056, + "learning_rate": 4.967170995769983e-06, + "loss": 0.7185, + "step": 1223 + }, + { + "epoch": 0.33617138148860204, + "grad_norm": 1.2036820650100708, + "learning_rate": 4.967112631706703e-06, + "loss": 0.6492, + "step": 1224 + }, + { + "epoch": 0.33644603131007966, + "grad_norm": 1.2785874605178833, + "learning_rate": 4.967054216152586e-06, + "loss": 0.6891, + "step": 1225 + }, + { + "epoch": 0.3367206811315573, + "grad_norm": 1.2394524812698364, + "learning_rate": 4.966995749108851e-06, + "loss": 0.6539, + "step": 1226 + }, + { + "epoch": 0.33699533095303486, + "grad_norm": 1.2425485849380493, + "learning_rate": 4.966937230576718e-06, + "loss": 0.7411, + "step": 1227 + }, + { + "epoch": 0.3372699807745125, + "grad_norm": 1.2377713918685913, + "learning_rate": 4.966878660557409e-06, + "loss": 0.7034, + "step": 1228 + }, + { + "epoch": 0.3375446305959901, + "grad_norm": 1.3110496997833252, + "learning_rate": 4.966820039052144e-06, + "loss": 0.6961, + "step": 1229 + }, + { + "epoch": 0.3378192804174677, + "grad_norm": 1.3804707527160645, + "learning_rate": 4.96676136606215e-06, + "loss": 0.7226, + "step": 1230 + }, + { + "epoch": 0.33809393023894535, + "grad_norm": 1.264753818511963, + "learning_rate": 4.96670264158865e-06, + "loss": 0.6991, + "step": 1231 + }, + { + "epoch": 0.338368580060423, + "grad_norm": 1.3613078594207764, + "learning_rate": 4.9666438656328685e-06, + "loss": 0.7196, + "step": 1232 + }, + { + "epoch": 0.3386432298819006, + "grad_norm": 1.2202939987182617, + "learning_rate": 4.966585038196033e-06, + "loss": 0.7177, + "step": 1233 + }, + { + "epoch": 0.33891787970337817, + "grad_norm": 1.2335845232009888, + "learning_rate": 4.966526159279373e-06, + "loss": 0.689, + "step": 1234 + }, + { + "epoch": 0.3391925295248558, + "grad_norm": 1.2219244241714478, + "learning_rate": 4.966467228884115e-06, + "loss": 0.6793, + "step": 1235 + }, + { + "epoch": 0.3394671793463334, + "grad_norm": 1.2178367376327515, + "learning_rate": 4.9664082470114895e-06, + "loss": 0.6895, + "step": 1236 + }, + { + "epoch": 0.33974182916781104, + "grad_norm": 1.2188019752502441, + "learning_rate": 4.966349213662729e-06, + "loss": 0.668, + "step": 1237 + }, + { + "epoch": 0.34001647898928866, + "grad_norm": 1.2965822219848633, + "learning_rate": 4.966290128839063e-06, + "loss": 0.7244, + "step": 1238 + }, + { + "epoch": 0.3402911288107663, + "grad_norm": 1.2367604970932007, + "learning_rate": 4.966230992541727e-06, + "loss": 0.6787, + "step": 1239 + }, + { + "epoch": 0.3405657786322439, + "grad_norm": 1.282376766204834, + "learning_rate": 4.966171804771954e-06, + "loss": 0.6901, + "step": 1240 + }, + { + "epoch": 0.3408404284537215, + "grad_norm": 1.2598642110824585, + "learning_rate": 4.966112565530979e-06, + "loss": 0.7222, + "step": 1241 + }, + { + "epoch": 0.3411150782751991, + "grad_norm": 1.2169580459594727, + "learning_rate": 4.96605327482004e-06, + "loss": 0.694, + "step": 1242 + }, + { + "epoch": 0.3413897280966767, + "grad_norm": 1.357069492340088, + "learning_rate": 4.965993932640373e-06, + "loss": 0.7239, + "step": 1243 + }, + { + "epoch": 0.34166437791815435, + "grad_norm": 1.289471983909607, + "learning_rate": 4.965934538993217e-06, + "loss": 0.6752, + "step": 1244 + }, + { + "epoch": 0.341939027739632, + "grad_norm": 1.2353941202163696, + "learning_rate": 4.965875093879812e-06, + "loss": 0.6254, + "step": 1245 + }, + { + "epoch": 0.3422136775611096, + "grad_norm": 1.3221089839935303, + "learning_rate": 4.965815597301397e-06, + "loss": 0.662, + "step": 1246 + }, + { + "epoch": 0.3424883273825872, + "grad_norm": 1.278363585472107, + "learning_rate": 4.965756049259216e-06, + "loss": 0.6854, + "step": 1247 + }, + { + "epoch": 0.34276297720406484, + "grad_norm": 1.2043577432632446, + "learning_rate": 4.9656964497545106e-06, + "loss": 0.6949, + "step": 1248 + }, + { + "epoch": 0.3430376270255424, + "grad_norm": 1.2732398509979248, + "learning_rate": 4.9656367987885235e-06, + "loss": 0.7362, + "step": 1249 + }, + { + "epoch": 0.34331227684702004, + "grad_norm": 1.2270352840423584, + "learning_rate": 4.9655770963625024e-06, + "loss": 0.7042, + "step": 1250 + }, + { + "epoch": 0.34358692666849766, + "grad_norm": 1.3288655281066895, + "learning_rate": 4.965517342477692e-06, + "loss": 0.7248, + "step": 1251 + }, + { + "epoch": 0.3438615764899753, + "grad_norm": 1.2524750232696533, + "learning_rate": 4.965457537135339e-06, + "loss": 0.6948, + "step": 1252 + }, + { + "epoch": 0.3441362263114529, + "grad_norm": 1.3115184307098389, + "learning_rate": 4.965397680336692e-06, + "loss": 0.7488, + "step": 1253 + }, + { + "epoch": 0.34441087613293053, + "grad_norm": 1.2549692392349243, + "learning_rate": 4.965337772083e-06, + "loss": 0.6495, + "step": 1254 + }, + { + "epoch": 0.34468552595440816, + "grad_norm": 1.3103783130645752, + "learning_rate": 4.965277812375514e-06, + "loss": 0.7423, + "step": 1255 + }, + { + "epoch": 0.3449601757758857, + "grad_norm": 1.279936671257019, + "learning_rate": 4.965217801215485e-06, + "loss": 0.7122, + "step": 1256 + }, + { + "epoch": 0.34523482559736335, + "grad_norm": 1.2402790784835815, + "learning_rate": 4.965157738604166e-06, + "loss": 0.7124, + "step": 1257 + }, + { + "epoch": 0.345509475418841, + "grad_norm": 1.2878808975219727, + "learning_rate": 4.965097624542808e-06, + "loss": 0.7156, + "step": 1258 + }, + { + "epoch": 0.3457841252403186, + "grad_norm": 1.2625129222869873, + "learning_rate": 4.9650374590326696e-06, + "loss": 0.7367, + "step": 1259 + }, + { + "epoch": 0.3460587750617962, + "grad_norm": 1.2823412418365479, + "learning_rate": 4.964977242075004e-06, + "loss": 0.7071, + "step": 1260 + }, + { + "epoch": 0.34633342488327384, + "grad_norm": 1.2602367401123047, + "learning_rate": 4.964916973671069e-06, + "loss": 0.6816, + "step": 1261 + }, + { + "epoch": 0.34660807470475147, + "grad_norm": 1.2291955947875977, + "learning_rate": 4.9648566538221224e-06, + "loss": 0.6645, + "step": 1262 + }, + { + "epoch": 0.34688272452622904, + "grad_norm": 1.2150517702102661, + "learning_rate": 4.964796282529421e-06, + "loss": 0.6728, + "step": 1263 + }, + { + "epoch": 0.34715737434770666, + "grad_norm": 1.2840805053710938, + "learning_rate": 4.964735859794228e-06, + "loss": 0.6915, + "step": 1264 + }, + { + "epoch": 0.3474320241691843, + "grad_norm": 1.2058695554733276, + "learning_rate": 4.964675385617803e-06, + "loss": 0.6439, + "step": 1265 + }, + { + "epoch": 0.3477066739906619, + "grad_norm": 1.2549614906311035, + "learning_rate": 4.964614860001408e-06, + "loss": 0.6989, + "step": 1266 + }, + { + "epoch": 0.34798132381213953, + "grad_norm": 1.26815664768219, + "learning_rate": 4.964554282946306e-06, + "loss": 0.6976, + "step": 1267 + }, + { + "epoch": 0.34825597363361716, + "grad_norm": 1.252768635749817, + "learning_rate": 4.964493654453761e-06, + "loss": 0.7092, + "step": 1268 + }, + { + "epoch": 0.3485306234550948, + "grad_norm": 1.4191515445709229, + "learning_rate": 4.964432974525041e-06, + "loss": 0.7298, + "step": 1269 + }, + { + "epoch": 0.34880527327657235, + "grad_norm": 1.2309355735778809, + "learning_rate": 4.964372243161409e-06, + "loss": 0.6945, + "step": 1270 + }, + { + "epoch": 0.34907992309804997, + "grad_norm": 1.1912832260131836, + "learning_rate": 4.964311460364135e-06, + "loss": 0.616, + "step": 1271 + }, + { + "epoch": 0.3493545729195276, + "grad_norm": 1.2099345922470093, + "learning_rate": 4.9642506261344854e-06, + "loss": 0.6334, + "step": 1272 + }, + { + "epoch": 0.3496292227410052, + "grad_norm": 1.2811123132705688, + "learning_rate": 4.964189740473732e-06, + "loss": 0.6929, + "step": 1273 + }, + { + "epoch": 0.34990387256248284, + "grad_norm": 1.242837905883789, + "learning_rate": 4.964128803383144e-06, + "loss": 0.6743, + "step": 1274 + }, + { + "epoch": 0.35017852238396047, + "grad_norm": 1.2108162641525269, + "learning_rate": 4.964067814863995e-06, + "loss": 0.6836, + "step": 1275 + }, + { + "epoch": 0.3504531722054381, + "grad_norm": 1.3200699090957642, + "learning_rate": 4.964006774917556e-06, + "loss": 0.7384, + "step": 1276 + }, + { + "epoch": 0.35072782202691566, + "grad_norm": 1.1506552696228027, + "learning_rate": 4.963945683545102e-06, + "loss": 0.5975, + "step": 1277 + }, + { + "epoch": 0.3510024718483933, + "grad_norm": 1.398451805114746, + "learning_rate": 4.9638845407479075e-06, + "loss": 0.7236, + "step": 1278 + }, + { + "epoch": 0.3512771216698709, + "grad_norm": 1.2461745738983154, + "learning_rate": 4.963823346527249e-06, + "loss": 0.6582, + "step": 1279 + }, + { + "epoch": 0.35155177149134853, + "grad_norm": 1.3456004858016968, + "learning_rate": 4.963762100884403e-06, + "loss": 0.6908, + "step": 1280 + }, + { + "epoch": 0.35182642131282615, + "grad_norm": 1.3496099710464478, + "learning_rate": 4.963700803820648e-06, + "loss": 0.6758, + "step": 1281 + }, + { + "epoch": 0.3521010711343038, + "grad_norm": 1.227717638015747, + "learning_rate": 4.963639455337264e-06, + "loss": 0.711, + "step": 1282 + }, + { + "epoch": 0.3523757209557814, + "grad_norm": 1.151244878768921, + "learning_rate": 4.9635780554355315e-06, + "loss": 0.602, + "step": 1283 + }, + { + "epoch": 0.35265037077725897, + "grad_norm": 1.273998737335205, + "learning_rate": 4.96351660411673e-06, + "loss": 0.6521, + "step": 1284 + }, + { + "epoch": 0.3529250205987366, + "grad_norm": 1.3186947107315063, + "learning_rate": 4.963455101382145e-06, + "loss": 0.7021, + "step": 1285 + }, + { + "epoch": 0.3531996704202142, + "grad_norm": 1.2140824794769287, + "learning_rate": 4.963393547233058e-06, + "loss": 0.714, + "step": 1286 + }, + { + "epoch": 0.35347432024169184, + "grad_norm": 1.371940016746521, + "learning_rate": 4.9633319416707545e-06, + "loss": 0.6696, + "step": 1287 + }, + { + "epoch": 0.35374897006316947, + "grad_norm": 1.2342212200164795, + "learning_rate": 4.96327028469652e-06, + "loss": 0.6528, + "step": 1288 + }, + { + "epoch": 0.3540236198846471, + "grad_norm": 1.2283183336257935, + "learning_rate": 4.963208576311641e-06, + "loss": 0.6503, + "step": 1289 + }, + { + "epoch": 0.3542982697061247, + "grad_norm": 1.3268879652023315, + "learning_rate": 4.963146816517406e-06, + "loss": 0.6611, + "step": 1290 + }, + { + "epoch": 0.3545729195276023, + "grad_norm": 1.2814422845840454, + "learning_rate": 4.963085005315104e-06, + "loss": 0.6813, + "step": 1291 + }, + { + "epoch": 0.3548475693490799, + "grad_norm": 1.2326849699020386, + "learning_rate": 4.963023142706025e-06, + "loss": 0.6932, + "step": 1292 + }, + { + "epoch": 0.35512221917055753, + "grad_norm": 1.2080974578857422, + "learning_rate": 4.962961228691459e-06, + "loss": 0.6838, + "step": 1293 + }, + { + "epoch": 0.35539686899203515, + "grad_norm": 1.2160965204238892, + "learning_rate": 4.9628992632727005e-06, + "loss": 0.6982, + "step": 1294 + }, + { + "epoch": 0.3556715188135128, + "grad_norm": 1.2777553796768188, + "learning_rate": 4.962837246451041e-06, + "loss": 0.7081, + "step": 1295 + }, + { + "epoch": 0.3559461686349904, + "grad_norm": 1.2889776229858398, + "learning_rate": 4.9627751782277745e-06, + "loss": 0.6847, + "step": 1296 + }, + { + "epoch": 0.356220818456468, + "grad_norm": 1.239198088645935, + "learning_rate": 4.962713058604198e-06, + "loss": 0.6661, + "step": 1297 + }, + { + "epoch": 0.3564954682779456, + "grad_norm": 1.2919679880142212, + "learning_rate": 4.962650887581607e-06, + "loss": 0.6609, + "step": 1298 + }, + { + "epoch": 0.3567701180994232, + "grad_norm": 1.2189463376998901, + "learning_rate": 4.962588665161298e-06, + "loss": 0.6941, + "step": 1299 + }, + { + "epoch": 0.35704476792090084, + "grad_norm": 1.3782885074615479, + "learning_rate": 4.962526391344573e-06, + "loss": 0.6984, + "step": 1300 + }, + { + "epoch": 0.35731941774237846, + "grad_norm": 1.2717798948287964, + "learning_rate": 4.962464066132729e-06, + "loss": 0.6501, + "step": 1301 + }, + { + "epoch": 0.3575940675638561, + "grad_norm": 1.1711856126785278, + "learning_rate": 4.962401689527067e-06, + "loss": 0.6813, + "step": 1302 + }, + { + "epoch": 0.3578687173853337, + "grad_norm": 1.289645791053772, + "learning_rate": 4.962339261528889e-06, + "loss": 0.6887, + "step": 1303 + }, + { + "epoch": 0.35814336720681134, + "grad_norm": 1.2963961362838745, + "learning_rate": 4.9622767821394985e-06, + "loss": 0.7049, + "step": 1304 + }, + { + "epoch": 0.3584180170282889, + "grad_norm": 1.2422353029251099, + "learning_rate": 4.962214251360199e-06, + "loss": 0.6887, + "step": 1305 + }, + { + "epoch": 0.35869266684976653, + "grad_norm": 1.2630949020385742, + "learning_rate": 4.962151669192296e-06, + "loss": 0.7121, + "step": 1306 + }, + { + "epoch": 0.35896731667124415, + "grad_norm": 1.2452367544174194, + "learning_rate": 4.9620890356370954e-06, + "loss": 0.7224, + "step": 1307 + }, + { + "epoch": 0.3592419664927218, + "grad_norm": 1.3621306419372559, + "learning_rate": 4.962026350695905e-06, + "loss": 0.6885, + "step": 1308 + }, + { + "epoch": 0.3595166163141994, + "grad_norm": 1.227502703666687, + "learning_rate": 4.961963614370031e-06, + "loss": 0.6725, + "step": 1309 + }, + { + "epoch": 0.359791266135677, + "grad_norm": 1.280401349067688, + "learning_rate": 4.961900826660785e-06, + "loss": 0.7455, + "step": 1310 + }, + { + "epoch": 0.36006591595715465, + "grad_norm": 1.258021593093872, + "learning_rate": 4.961837987569477e-06, + "loss": 0.6445, + "step": 1311 + }, + { + "epoch": 0.36034056577863227, + "grad_norm": 1.2514430284500122, + "learning_rate": 4.9617750970974175e-06, + "loss": 0.6651, + "step": 1312 + }, + { + "epoch": 0.36061521560010984, + "grad_norm": 1.2700772285461426, + "learning_rate": 4.96171215524592e-06, + "loss": 0.6868, + "step": 1313 + }, + { + "epoch": 0.36088986542158746, + "grad_norm": 1.2522872686386108, + "learning_rate": 4.961649162016299e-06, + "loss": 0.7191, + "step": 1314 + }, + { + "epoch": 0.3611645152430651, + "grad_norm": 1.3493527173995972, + "learning_rate": 4.961586117409867e-06, + "loss": 0.7067, + "step": 1315 + }, + { + "epoch": 0.3614391650645427, + "grad_norm": 1.23672354221344, + "learning_rate": 4.961523021427941e-06, + "loss": 0.7095, + "step": 1316 + }, + { + "epoch": 0.36171381488602034, + "grad_norm": 1.225111961364746, + "learning_rate": 4.961459874071838e-06, + "loss": 0.6996, + "step": 1317 + }, + { + "epoch": 0.36198846470749796, + "grad_norm": 1.2586084604263306, + "learning_rate": 4.961396675342877e-06, + "loss": 0.7171, + "step": 1318 + }, + { + "epoch": 0.3622631145289756, + "grad_norm": 1.2409802675247192, + "learning_rate": 4.961333425242374e-06, + "loss": 0.7165, + "step": 1319 + }, + { + "epoch": 0.36253776435045315, + "grad_norm": 1.2159485816955566, + "learning_rate": 4.961270123771652e-06, + "loss": 0.6769, + "step": 1320 + }, + { + "epoch": 0.3628124141719308, + "grad_norm": 1.2586230039596558, + "learning_rate": 4.96120677093203e-06, + "loss": 0.7218, + "step": 1321 + }, + { + "epoch": 0.3630870639934084, + "grad_norm": 1.2448314428329468, + "learning_rate": 4.961143366724832e-06, + "loss": 0.7018, + "step": 1322 + }, + { + "epoch": 0.363361713814886, + "grad_norm": 1.2226682901382446, + "learning_rate": 4.96107991115138e-06, + "loss": 0.6622, + "step": 1323 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.3261022567749023, + "learning_rate": 4.961016404212999e-06, + "loss": 0.7218, + "step": 1324 + }, + { + "epoch": 0.36391101345784127, + "grad_norm": 1.3771624565124512, + "learning_rate": 4.960952845911016e-06, + "loss": 0.7272, + "step": 1325 + }, + { + "epoch": 0.3641856632793189, + "grad_norm": 1.2573333978652954, + "learning_rate": 4.960889236246754e-06, + "loss": 0.6912, + "step": 1326 + }, + { + "epoch": 0.36446031310079646, + "grad_norm": 1.2141249179840088, + "learning_rate": 4.9608255752215424e-06, + "loss": 0.6597, + "step": 1327 + }, + { + "epoch": 0.3647349629222741, + "grad_norm": 1.2378182411193848, + "learning_rate": 4.960761862836712e-06, + "loss": 0.6892, + "step": 1328 + }, + { + "epoch": 0.3650096127437517, + "grad_norm": 1.3210159540176392, + "learning_rate": 4.9606980990935884e-06, + "loss": 0.6912, + "step": 1329 + }, + { + "epoch": 0.36528426256522933, + "grad_norm": 1.3261783123016357, + "learning_rate": 4.960634283993505e-06, + "loss": 0.7196, + "step": 1330 + }, + { + "epoch": 0.36555891238670696, + "grad_norm": 1.2320525646209717, + "learning_rate": 4.960570417537793e-06, + "loss": 0.6736, + "step": 1331 + }, + { + "epoch": 0.3658335622081846, + "grad_norm": 1.3036469221115112, + "learning_rate": 4.960506499727786e-06, + "loss": 0.7164, + "step": 1332 + }, + { + "epoch": 0.3661082120296622, + "grad_norm": 1.3294017314910889, + "learning_rate": 4.9604425305648175e-06, + "loss": 0.7083, + "step": 1333 + }, + { + "epoch": 0.3663828618511398, + "grad_norm": 1.278617024421692, + "learning_rate": 4.960378510050223e-06, + "loss": 0.6846, + "step": 1334 + }, + { + "epoch": 0.3666575116726174, + "grad_norm": 1.3490254878997803, + "learning_rate": 4.960314438185338e-06, + "loss": 0.7206, + "step": 1335 + }, + { + "epoch": 0.366932161494095, + "grad_norm": 1.226145625114441, + "learning_rate": 4.9602503149715e-06, + "loss": 0.6805, + "step": 1336 + }, + { + "epoch": 0.36720681131557265, + "grad_norm": 1.2435972690582275, + "learning_rate": 4.9601861404100475e-06, + "loss": 0.6852, + "step": 1337 + }, + { + "epoch": 0.36748146113705027, + "grad_norm": 1.3466812372207642, + "learning_rate": 4.960121914502319e-06, + "loss": 0.7076, + "step": 1338 + }, + { + "epoch": 0.3677561109585279, + "grad_norm": 1.226743221282959, + "learning_rate": 4.960057637249657e-06, + "loss": 0.6917, + "step": 1339 + }, + { + "epoch": 0.3680307607800055, + "grad_norm": 1.3025742769241333, + "learning_rate": 4.9599933086534e-06, + "loss": 0.6573, + "step": 1340 + }, + { + "epoch": 0.3683054106014831, + "grad_norm": 1.2842214107513428, + "learning_rate": 4.959928928714893e-06, + "loss": 0.6716, + "step": 1341 + }, + { + "epoch": 0.3685800604229607, + "grad_norm": 1.2486650943756104, + "learning_rate": 4.959864497435479e-06, + "loss": 0.6401, + "step": 1342 + }, + { + "epoch": 0.36885471024443833, + "grad_norm": 1.3169175386428833, + "learning_rate": 4.959800014816504e-06, + "loss": 0.698, + "step": 1343 + }, + { + "epoch": 0.36912936006591596, + "grad_norm": 1.2873677015304565, + "learning_rate": 4.959735480859311e-06, + "loss": 0.7319, + "step": 1344 + }, + { + "epoch": 0.3694040098873936, + "grad_norm": 1.2233039140701294, + "learning_rate": 4.959670895565248e-06, + "loss": 0.6739, + "step": 1345 + }, + { + "epoch": 0.3696786597088712, + "grad_norm": 1.235597848892212, + "learning_rate": 4.959606258935664e-06, + "loss": 0.6761, + "step": 1346 + }, + { + "epoch": 0.36995330953034883, + "grad_norm": 1.31159245967865, + "learning_rate": 4.959541570971908e-06, + "loss": 0.678, + "step": 1347 + }, + { + "epoch": 0.3702279593518264, + "grad_norm": 1.2119773626327515, + "learning_rate": 4.959476831675329e-06, + "loss": 0.6706, + "step": 1348 + }, + { + "epoch": 0.370502609173304, + "grad_norm": 1.2745022773742676, + "learning_rate": 4.959412041047278e-06, + "loss": 0.6856, + "step": 1349 + }, + { + "epoch": 0.37077725899478164, + "grad_norm": 1.3383160829544067, + "learning_rate": 4.959347199089109e-06, + "loss": 0.6867, + "step": 1350 + }, + { + "epoch": 0.37105190881625927, + "grad_norm": 1.3316866159439087, + "learning_rate": 4.959282305802173e-06, + "loss": 0.7185, + "step": 1351 + }, + { + "epoch": 0.3713265586377369, + "grad_norm": 1.2539231777191162, + "learning_rate": 4.959217361187827e-06, + "loss": 0.6767, + "step": 1352 + }, + { + "epoch": 0.3716012084592145, + "grad_norm": 1.1549036502838135, + "learning_rate": 4.959152365247424e-06, + "loss": 0.6244, + "step": 1353 + }, + { + "epoch": 0.37187585828069214, + "grad_norm": 1.29035222530365, + "learning_rate": 4.959087317982322e-06, + "loss": 0.6495, + "step": 1354 + }, + { + "epoch": 0.3721505081021697, + "grad_norm": 1.1867456436157227, + "learning_rate": 4.959022219393878e-06, + "loss": 0.6737, + "step": 1355 + }, + { + "epoch": 0.37242515792364733, + "grad_norm": 1.3463538885116577, + "learning_rate": 4.95895706948345e-06, + "loss": 0.6482, + "step": 1356 + }, + { + "epoch": 0.37269980774512496, + "grad_norm": 1.2377370595932007, + "learning_rate": 4.958891868252399e-06, + "loss": 0.6936, + "step": 1357 + }, + { + "epoch": 0.3729744575666026, + "grad_norm": 1.3409926891326904, + "learning_rate": 4.958826615702086e-06, + "loss": 0.6951, + "step": 1358 + }, + { + "epoch": 0.3732491073880802, + "grad_norm": 1.1791020631790161, + "learning_rate": 4.9587613118338715e-06, + "loss": 0.7031, + "step": 1359 + }, + { + "epoch": 0.3735237572095578, + "grad_norm": 1.3067299127578735, + "learning_rate": 4.958695956649119e-06, + "loss": 0.7088, + "step": 1360 + }, + { + "epoch": 0.37379840703103545, + "grad_norm": 1.219298243522644, + "learning_rate": 4.958630550149193e-06, + "loss": 0.6689, + "step": 1361 + }, + { + "epoch": 0.374073056852513, + "grad_norm": 1.2516835927963257, + "learning_rate": 4.958565092335459e-06, + "loss": 0.6953, + "step": 1362 + }, + { + "epoch": 0.37434770667399064, + "grad_norm": 1.2293386459350586, + "learning_rate": 4.958499583209281e-06, + "loss": 0.6789, + "step": 1363 + }, + { + "epoch": 0.37462235649546827, + "grad_norm": 1.267264485359192, + "learning_rate": 4.958434022772029e-06, + "loss": 0.7278, + "step": 1364 + }, + { + "epoch": 0.3748970063169459, + "grad_norm": 1.196534276008606, + "learning_rate": 4.958368411025069e-06, + "loss": 0.6458, + "step": 1365 + }, + { + "epoch": 0.3751716561384235, + "grad_norm": 1.264838695526123, + "learning_rate": 4.958302747969772e-06, + "loss": 0.6733, + "step": 1366 + }, + { + "epoch": 0.37544630595990114, + "grad_norm": 1.2327977418899536, + "learning_rate": 4.9582370336075075e-06, + "loss": 0.6649, + "step": 1367 + }, + { + "epoch": 0.37572095578137876, + "grad_norm": 1.225211501121521, + "learning_rate": 4.958171267939648e-06, + "loss": 0.6944, + "step": 1368 + }, + { + "epoch": 0.37599560560285633, + "grad_norm": 1.2980737686157227, + "learning_rate": 4.958105450967564e-06, + "loss": 0.7171, + "step": 1369 + }, + { + "epoch": 0.37627025542433395, + "grad_norm": 1.247089147567749, + "learning_rate": 4.958039582692632e-06, + "loss": 0.6508, + "step": 1370 + }, + { + "epoch": 0.3765449052458116, + "grad_norm": 1.2542396783828735, + "learning_rate": 4.9579736631162244e-06, + "loss": 0.7212, + "step": 1371 + }, + { + "epoch": 0.3768195550672892, + "grad_norm": 1.281359314918518, + "learning_rate": 4.957907692239719e-06, + "loss": 0.6622, + "step": 1372 + }, + { + "epoch": 0.3770942048887668, + "grad_norm": 1.224281907081604, + "learning_rate": 4.95784167006449e-06, + "loss": 0.6856, + "step": 1373 + }, + { + "epoch": 0.37736885471024445, + "grad_norm": 1.2364610433578491, + "learning_rate": 4.957775596591918e-06, + "loss": 0.6969, + "step": 1374 + }, + { + "epoch": 0.3776435045317221, + "grad_norm": 1.218517541885376, + "learning_rate": 4.9577094718233805e-06, + "loss": 0.6768, + "step": 1375 + }, + { + "epoch": 0.3779181543531997, + "grad_norm": 1.2569109201431274, + "learning_rate": 4.957643295760258e-06, + "loss": 0.7078, + "step": 1376 + }, + { + "epoch": 0.37819280417467727, + "grad_norm": 1.2530310153961182, + "learning_rate": 4.957577068403931e-06, + "loss": 0.6633, + "step": 1377 + }, + { + "epoch": 0.3784674539961549, + "grad_norm": 1.2946768999099731, + "learning_rate": 4.957510789755784e-06, + "loss": 0.7359, + "step": 1378 + }, + { + "epoch": 0.3787421038176325, + "grad_norm": 1.1900367736816406, + "learning_rate": 4.957444459817198e-06, + "loss": 0.6242, + "step": 1379 + }, + { + "epoch": 0.37901675363911014, + "grad_norm": 1.3988484144210815, + "learning_rate": 4.9573780785895574e-06, + "loss": 0.7346, + "step": 1380 + }, + { + "epoch": 0.37929140346058776, + "grad_norm": 1.2844029664993286, + "learning_rate": 4.9573116460742484e-06, + "loss": 0.6692, + "step": 1381 + }, + { + "epoch": 0.3795660532820654, + "grad_norm": 1.2828158140182495, + "learning_rate": 4.957245162272658e-06, + "loss": 0.7089, + "step": 1382 + }, + { + "epoch": 0.379840703103543, + "grad_norm": 1.2637178897857666, + "learning_rate": 4.957178627186173e-06, + "loss": 0.7341, + "step": 1383 + }, + { + "epoch": 0.3801153529250206, + "grad_norm": 1.2337836027145386, + "learning_rate": 4.957112040816182e-06, + "loss": 0.7286, + "step": 1384 + }, + { + "epoch": 0.3803900027464982, + "grad_norm": 1.1862221956253052, + "learning_rate": 4.957045403164074e-06, + "loss": 0.6037, + "step": 1385 + }, + { + "epoch": 0.3806646525679758, + "grad_norm": 1.2431236505508423, + "learning_rate": 4.956978714231242e-06, + "loss": 0.7276, + "step": 1386 + }, + { + "epoch": 0.38093930238945345, + "grad_norm": 1.198455810546875, + "learning_rate": 4.956911974019077e-06, + "loss": 0.6797, + "step": 1387 + }, + { + "epoch": 0.3812139522109311, + "grad_norm": 1.1986327171325684, + "learning_rate": 4.956845182528971e-06, + "loss": 0.6883, + "step": 1388 + }, + { + "epoch": 0.3814886020324087, + "grad_norm": 1.2984189987182617, + "learning_rate": 4.956778339762318e-06, + "loss": 0.6423, + "step": 1389 + }, + { + "epoch": 0.3817632518538863, + "grad_norm": 1.2505302429199219, + "learning_rate": 4.9567114457205136e-06, + "loss": 0.6769, + "step": 1390 + }, + { + "epoch": 0.3820379016753639, + "grad_norm": 1.2214730978012085, + "learning_rate": 4.956644500404954e-06, + "loss": 0.6539, + "step": 1391 + }, + { + "epoch": 0.3823125514968415, + "grad_norm": 1.2277097702026367, + "learning_rate": 4.9565775038170365e-06, + "loss": 0.7132, + "step": 1392 + }, + { + "epoch": 0.38258720131831914, + "grad_norm": 1.3273342847824097, + "learning_rate": 4.956510455958159e-06, + "loss": 0.6986, + "step": 1393 + }, + { + "epoch": 0.38286185113979676, + "grad_norm": 1.2960453033447266, + "learning_rate": 4.956443356829721e-06, + "loss": 0.6975, + "step": 1394 + }, + { + "epoch": 0.3831365009612744, + "grad_norm": 1.2949374914169312, + "learning_rate": 4.9563762064331235e-06, + "loss": 0.6761, + "step": 1395 + }, + { + "epoch": 0.383411150782752, + "grad_norm": 1.1880576610565186, + "learning_rate": 4.956309004769767e-06, + "loss": 0.6359, + "step": 1396 + }, + { + "epoch": 0.38368580060422963, + "grad_norm": 1.2352294921875, + "learning_rate": 4.956241751841054e-06, + "loss": 0.7006, + "step": 1397 + }, + { + "epoch": 0.3839604504257072, + "grad_norm": 1.2861589193344116, + "learning_rate": 4.9561744476483896e-06, + "loss": 0.6742, + "step": 1398 + }, + { + "epoch": 0.3842351002471848, + "grad_norm": 1.2724494934082031, + "learning_rate": 4.956107092193177e-06, + "loss": 0.7057, + "step": 1399 + }, + { + "epoch": 0.38450975006866245, + "grad_norm": 1.2755625247955322, + "learning_rate": 4.956039685476822e-06, + "loss": 0.6836, + "step": 1400 + }, + { + "epoch": 0.38478439989014007, + "grad_norm": 1.2564730644226074, + "learning_rate": 4.955972227500733e-06, + "loss": 0.6776, + "step": 1401 + }, + { + "epoch": 0.3850590497116177, + "grad_norm": 1.2954457998275757, + "learning_rate": 4.955904718266316e-06, + "loss": 0.7261, + "step": 1402 + }, + { + "epoch": 0.3853336995330953, + "grad_norm": 1.238318681716919, + "learning_rate": 4.955837157774982e-06, + "loss": 0.6948, + "step": 1403 + }, + { + "epoch": 0.38560834935457294, + "grad_norm": 1.2006291151046753, + "learning_rate": 4.955769546028139e-06, + "loss": 0.6916, + "step": 1404 + }, + { + "epoch": 0.3858829991760505, + "grad_norm": 1.3086395263671875, + "learning_rate": 4.955701883027198e-06, + "loss": 0.6985, + "step": 1405 + }, + { + "epoch": 0.38615764899752814, + "grad_norm": 1.2865846157073975, + "learning_rate": 4.955634168773573e-06, + "loss": 0.7064, + "step": 1406 + }, + { + "epoch": 0.38643229881900576, + "grad_norm": 1.282709002494812, + "learning_rate": 4.955566403268677e-06, + "loss": 0.7001, + "step": 1407 + }, + { + "epoch": 0.3867069486404834, + "grad_norm": 1.2238717079162598, + "learning_rate": 4.955498586513924e-06, + "loss": 0.6744, + "step": 1408 + }, + { + "epoch": 0.386981598461961, + "grad_norm": 1.2575292587280273, + "learning_rate": 4.955430718510728e-06, + "loss": 0.618, + "step": 1409 + }, + { + "epoch": 0.38725624828343863, + "grad_norm": 1.2415677309036255, + "learning_rate": 4.955362799260507e-06, + "loss": 0.6779, + "step": 1410 + }, + { + "epoch": 0.38753089810491625, + "grad_norm": 1.25653076171875, + "learning_rate": 4.9552948287646786e-06, + "loss": 0.6685, + "step": 1411 + }, + { + "epoch": 0.3878055479263938, + "grad_norm": 1.3358088731765747, + "learning_rate": 4.9552268070246605e-06, + "loss": 0.7413, + "step": 1412 + }, + { + "epoch": 0.38808019774787145, + "grad_norm": 1.1822996139526367, + "learning_rate": 4.955158734041873e-06, + "loss": 0.6568, + "step": 1413 + }, + { + "epoch": 0.38835484756934907, + "grad_norm": 1.1928046941757202, + "learning_rate": 4.955090609817736e-06, + "loss": 0.6701, + "step": 1414 + }, + { + "epoch": 0.3886294973908267, + "grad_norm": 1.3731223344802856, + "learning_rate": 4.9550224343536725e-06, + "loss": 0.7039, + "step": 1415 + }, + { + "epoch": 0.3889041472123043, + "grad_norm": 1.3621954917907715, + "learning_rate": 4.954954207651106e-06, + "loss": 0.7401, + "step": 1416 + }, + { + "epoch": 0.38917879703378194, + "grad_norm": 1.2356224060058594, + "learning_rate": 4.9548859297114576e-06, + "loss": 0.6463, + "step": 1417 + }, + { + "epoch": 0.38945344685525957, + "grad_norm": 1.3029649257659912, + "learning_rate": 4.954817600536155e-06, + "loss": 0.6641, + "step": 1418 + }, + { + "epoch": 0.38972809667673713, + "grad_norm": 1.2096586227416992, + "learning_rate": 4.9547492201266225e-06, + "loss": 0.6352, + "step": 1419 + }, + { + "epoch": 0.39000274649821476, + "grad_norm": 1.2483359575271606, + "learning_rate": 4.954680788484289e-06, + "loss": 0.6979, + "step": 1420 + }, + { + "epoch": 0.3902773963196924, + "grad_norm": 1.2355910539627075, + "learning_rate": 4.954612305610581e-06, + "loss": 0.6557, + "step": 1421 + }, + { + "epoch": 0.39055204614117, + "grad_norm": 1.1781282424926758, + "learning_rate": 4.95454377150693e-06, + "loss": 0.6601, + "step": 1422 + }, + { + "epoch": 0.39082669596264763, + "grad_norm": 1.2588895559310913, + "learning_rate": 4.954475186174764e-06, + "loss": 0.6828, + "step": 1423 + }, + { + "epoch": 0.39110134578412525, + "grad_norm": 1.2814724445343018, + "learning_rate": 4.9544065496155154e-06, + "loss": 0.7244, + "step": 1424 + }, + { + "epoch": 0.3913759956056029, + "grad_norm": 1.3551459312438965, + "learning_rate": 4.954337861830617e-06, + "loss": 0.6958, + "step": 1425 + }, + { + "epoch": 0.39165064542708045, + "grad_norm": 1.2251776456832886, + "learning_rate": 4.954269122821501e-06, + "loss": 0.6737, + "step": 1426 + }, + { + "epoch": 0.39192529524855807, + "grad_norm": 1.2512660026550293, + "learning_rate": 4.954200332589605e-06, + "loss": 0.679, + "step": 1427 + }, + { + "epoch": 0.3921999450700357, + "grad_norm": 1.2677335739135742, + "learning_rate": 4.954131491136362e-06, + "loss": 0.6784, + "step": 1428 + }, + { + "epoch": 0.3924745948915133, + "grad_norm": 1.2881762981414795, + "learning_rate": 4.95406259846321e-06, + "loss": 0.6658, + "step": 1429 + }, + { + "epoch": 0.39274924471299094, + "grad_norm": 1.3021732568740845, + "learning_rate": 4.953993654571586e-06, + "loss": 0.7172, + "step": 1430 + }, + { + "epoch": 0.39302389453446857, + "grad_norm": 1.3039478063583374, + "learning_rate": 4.953924659462929e-06, + "loss": 0.7095, + "step": 1431 + }, + { + "epoch": 0.3932985443559462, + "grad_norm": 1.2799725532531738, + "learning_rate": 4.95385561313868e-06, + "loss": 0.6655, + "step": 1432 + }, + { + "epoch": 0.39357319417742376, + "grad_norm": 1.2378416061401367, + "learning_rate": 4.95378651560028e-06, + "loss": 0.6856, + "step": 1433 + }, + { + "epoch": 0.3938478439989014, + "grad_norm": 1.33161199092865, + "learning_rate": 4.953717366849169e-06, + "loss": 0.7139, + "step": 1434 + }, + { + "epoch": 0.394122493820379, + "grad_norm": 1.3270524740219116, + "learning_rate": 4.9536481668867934e-06, + "loss": 0.7311, + "step": 1435 + }, + { + "epoch": 0.39439714364185663, + "grad_norm": 1.2287719249725342, + "learning_rate": 4.953578915714595e-06, + "loss": 0.6546, + "step": 1436 + }, + { + "epoch": 0.39467179346333425, + "grad_norm": 1.2382794618606567, + "learning_rate": 4.95350961333402e-06, + "loss": 0.6419, + "step": 1437 + }, + { + "epoch": 0.3949464432848119, + "grad_norm": 1.2277491092681885, + "learning_rate": 4.953440259746515e-06, + "loss": 0.6955, + "step": 1438 + }, + { + "epoch": 0.3952210931062895, + "grad_norm": 1.3598512411117554, + "learning_rate": 4.953370854953527e-06, + "loss": 0.7083, + "step": 1439 + }, + { + "epoch": 0.3954957429277671, + "grad_norm": 1.3187317848205566, + "learning_rate": 4.953301398956506e-06, + "loss": 0.6955, + "step": 1440 + }, + { + "epoch": 0.3957703927492447, + "grad_norm": 1.2039005756378174, + "learning_rate": 4.9532318917568985e-06, + "loss": 0.6946, + "step": 1441 + }, + { + "epoch": 0.3960450425707223, + "grad_norm": 1.3444775342941284, + "learning_rate": 4.953162333356159e-06, + "loss": 0.6978, + "step": 1442 + }, + { + "epoch": 0.39631969239219994, + "grad_norm": 1.1751902103424072, + "learning_rate": 4.9530927237557355e-06, + "loss": 0.6824, + "step": 1443 + }, + { + "epoch": 0.39659434221367756, + "grad_norm": 1.2033604383468628, + "learning_rate": 4.953023062957084e-06, + "loss": 0.6181, + "step": 1444 + }, + { + "epoch": 0.3968689920351552, + "grad_norm": 1.396830677986145, + "learning_rate": 4.952953350961657e-06, + "loss": 0.7072, + "step": 1445 + }, + { + "epoch": 0.3971436418566328, + "grad_norm": 1.2349138259887695, + "learning_rate": 4.95288358777091e-06, + "loss": 0.6541, + "step": 1446 + }, + { + "epoch": 0.39741829167811044, + "grad_norm": 1.2541958093643188, + "learning_rate": 4.952813773386297e-06, + "loss": 0.7171, + "step": 1447 + }, + { + "epoch": 0.397692941499588, + "grad_norm": 1.3107824325561523, + "learning_rate": 4.952743907809277e-06, + "loss": 0.6622, + "step": 1448 + }, + { + "epoch": 0.39796759132106563, + "grad_norm": 1.2827956676483154, + "learning_rate": 4.9526739910413075e-06, + "loss": 0.7077, + "step": 1449 + }, + { + "epoch": 0.39824224114254325, + "grad_norm": 1.2185384035110474, + "learning_rate": 4.952604023083848e-06, + "loss": 0.6489, + "step": 1450 + }, + { + "epoch": 0.3985168909640209, + "grad_norm": 1.2637358903884888, + "learning_rate": 4.952534003938359e-06, + "loss": 0.7012, + "step": 1451 + }, + { + "epoch": 0.3987915407854985, + "grad_norm": 1.3294488191604614, + "learning_rate": 4.952463933606303e-06, + "loss": 0.7041, + "step": 1452 + }, + { + "epoch": 0.3990661906069761, + "grad_norm": 1.2262356281280518, + "learning_rate": 4.9523938120891394e-06, + "loss": 0.6886, + "step": 1453 + }, + { + "epoch": 0.39934084042845375, + "grad_norm": 1.1808733940124512, + "learning_rate": 4.952323639388333e-06, + "loss": 0.6813, + "step": 1454 + }, + { + "epoch": 0.3996154902499313, + "grad_norm": 1.2580538988113403, + "learning_rate": 4.95225341550535e-06, + "loss": 0.6469, + "step": 1455 + }, + { + "epoch": 0.39989014007140894, + "grad_norm": 1.2514662742614746, + "learning_rate": 4.952183140441654e-06, + "loss": 0.7106, + "step": 1456 + }, + { + "epoch": 0.40016478989288656, + "grad_norm": 1.180565357208252, + "learning_rate": 4.952112814198713e-06, + "loss": 0.7183, + "step": 1457 + }, + { + "epoch": 0.4004394397143642, + "grad_norm": 1.2220641374588013, + "learning_rate": 4.952042436777994e-06, + "loss": 0.6464, + "step": 1458 + }, + { + "epoch": 0.4007140895358418, + "grad_norm": 1.2186099290847778, + "learning_rate": 4.951972008180966e-06, + "loss": 0.6453, + "step": 1459 + }, + { + "epoch": 0.40098873935731943, + "grad_norm": 1.2596220970153809, + "learning_rate": 4.951901528409099e-06, + "loss": 0.6661, + "step": 1460 + }, + { + "epoch": 0.40126338917879706, + "grad_norm": 1.2750242948532104, + "learning_rate": 4.951830997463864e-06, + "loss": 0.6928, + "step": 1461 + }, + { + "epoch": 0.4015380390002746, + "grad_norm": 1.1902400255203247, + "learning_rate": 4.951760415346733e-06, + "loss": 0.714, + "step": 1462 + }, + { + "epoch": 0.40181268882175225, + "grad_norm": 1.2800443172454834, + "learning_rate": 4.951689782059179e-06, + "loss": 0.7064, + "step": 1463 + }, + { + "epoch": 0.4020873386432299, + "grad_norm": 1.245355486869812, + "learning_rate": 4.951619097602677e-06, + "loss": 0.6314, + "step": 1464 + }, + { + "epoch": 0.4023619884647075, + "grad_norm": 1.2271722555160522, + "learning_rate": 4.9515483619787e-06, + "loss": 0.6244, + "step": 1465 + }, + { + "epoch": 0.4026366382861851, + "grad_norm": 1.202816128730774, + "learning_rate": 4.951477575188727e-06, + "loss": 0.6497, + "step": 1466 + }, + { + "epoch": 0.40291128810766275, + "grad_norm": 1.2148836851119995, + "learning_rate": 4.951406737234234e-06, + "loss": 0.6645, + "step": 1467 + }, + { + "epoch": 0.40318593792914037, + "grad_norm": 1.2242296934127808, + "learning_rate": 4.9513358481167e-06, + "loss": 0.6579, + "step": 1468 + }, + { + "epoch": 0.40346058775061794, + "grad_norm": 1.2546976804733276, + "learning_rate": 4.951264907837603e-06, + "loss": 0.6996, + "step": 1469 + }, + { + "epoch": 0.40373523757209556, + "grad_norm": 1.3110781908035278, + "learning_rate": 4.951193916398426e-06, + "loss": 0.7047, + "step": 1470 + }, + { + "epoch": 0.4040098873935732, + "grad_norm": 1.2429604530334473, + "learning_rate": 4.95112287380065e-06, + "loss": 0.6584, + "step": 1471 + }, + { + "epoch": 0.4042845372150508, + "grad_norm": 1.2501721382141113, + "learning_rate": 4.951051780045756e-06, + "loss": 0.6759, + "step": 1472 + }, + { + "epoch": 0.40455918703652843, + "grad_norm": 1.2294771671295166, + "learning_rate": 4.950980635135229e-06, + "loss": 0.6996, + "step": 1473 + }, + { + "epoch": 0.40483383685800606, + "grad_norm": 1.2240831851959229, + "learning_rate": 4.950909439070554e-06, + "loss": 0.7003, + "step": 1474 + }, + { + "epoch": 0.4051084866794837, + "grad_norm": 1.2977678775787354, + "learning_rate": 4.9508381918532165e-06, + "loss": 0.6618, + "step": 1475 + }, + { + "epoch": 0.40538313650096125, + "grad_norm": 1.240294098854065, + "learning_rate": 4.950766893484704e-06, + "loss": 0.716, + "step": 1476 + }, + { + "epoch": 0.4056577863224389, + "grad_norm": 1.2064539194107056, + "learning_rate": 4.950695543966505e-06, + "loss": 0.6808, + "step": 1477 + }, + { + "epoch": 0.4059324361439165, + "grad_norm": 1.2945671081542969, + "learning_rate": 4.950624143300107e-06, + "loss": 0.6867, + "step": 1478 + }, + { + "epoch": 0.4062070859653941, + "grad_norm": 1.2178826332092285, + "learning_rate": 4.950552691487002e-06, + "loss": 0.731, + "step": 1479 + }, + { + "epoch": 0.40648173578687175, + "grad_norm": 1.1807661056518555, + "learning_rate": 4.950481188528679e-06, + "loss": 0.6547, + "step": 1480 + }, + { + "epoch": 0.40675638560834937, + "grad_norm": 1.3330632448196411, + "learning_rate": 4.950409634426633e-06, + "loss": 0.7495, + "step": 1481 + }, + { + "epoch": 0.407031035429827, + "grad_norm": 1.1545077562332153, + "learning_rate": 4.950338029182356e-06, + "loss": 0.6416, + "step": 1482 + }, + { + "epoch": 0.40730568525130456, + "grad_norm": 1.3518697023391724, + "learning_rate": 4.950266372797341e-06, + "loss": 0.6732, + "step": 1483 + }, + { + "epoch": 0.4075803350727822, + "grad_norm": 1.2305175065994263, + "learning_rate": 4.950194665273087e-06, + "loss": 0.6888, + "step": 1484 + }, + { + "epoch": 0.4078549848942598, + "grad_norm": 1.2019784450531006, + "learning_rate": 4.950122906611088e-06, + "loss": 0.6648, + "step": 1485 + }, + { + "epoch": 0.40812963471573743, + "grad_norm": 1.2188624143600464, + "learning_rate": 4.950051096812842e-06, + "loss": 0.6987, + "step": 1486 + }, + { + "epoch": 0.40840428453721506, + "grad_norm": 1.2916913032531738, + "learning_rate": 4.949979235879848e-06, + "loss": 0.6608, + "step": 1487 + }, + { + "epoch": 0.4086789343586927, + "grad_norm": 1.274309754371643, + "learning_rate": 4.949907323813607e-06, + "loss": 0.6941, + "step": 1488 + }, + { + "epoch": 0.4089535841801703, + "grad_norm": 1.215025544166565, + "learning_rate": 4.949835360615618e-06, + "loss": 0.6753, + "step": 1489 + }, + { + "epoch": 0.4092282340016479, + "grad_norm": 1.2952286005020142, + "learning_rate": 4.949763346287384e-06, + "loss": 0.6953, + "step": 1490 + }, + { + "epoch": 0.4095028838231255, + "grad_norm": 1.2170606851577759, + "learning_rate": 4.949691280830407e-06, + "loss": 0.6831, + "step": 1491 + }, + { + "epoch": 0.4097775336446031, + "grad_norm": 1.2274984121322632, + "learning_rate": 4.949619164246192e-06, + "loss": 0.6822, + "step": 1492 + }, + { + "epoch": 0.41005218346608074, + "grad_norm": 1.2664318084716797, + "learning_rate": 4.949546996536245e-06, + "loss": 0.7102, + "step": 1493 + }, + { + "epoch": 0.41032683328755837, + "grad_norm": 1.2629177570343018, + "learning_rate": 4.94947477770207e-06, + "loss": 0.6754, + "step": 1494 + }, + { + "epoch": 0.410601483109036, + "grad_norm": 1.2938421964645386, + "learning_rate": 4.949402507745177e-06, + "loss": 0.7011, + "step": 1495 + }, + { + "epoch": 0.4108761329305136, + "grad_norm": 1.295804500579834, + "learning_rate": 4.949330186667071e-06, + "loss": 0.6962, + "step": 1496 + }, + { + "epoch": 0.4111507827519912, + "grad_norm": 1.238394021987915, + "learning_rate": 4.949257814469264e-06, + "loss": 0.6559, + "step": 1497 + }, + { + "epoch": 0.4114254325734688, + "grad_norm": 1.1879796981811523, + "learning_rate": 4.949185391153265e-06, + "loss": 0.6233, + "step": 1498 + }, + { + "epoch": 0.41170008239494643, + "grad_norm": 1.2769726514816284, + "learning_rate": 4.949112916720587e-06, + "loss": 0.6903, + "step": 1499 + }, + { + "epoch": 0.41197473221642406, + "grad_norm": 1.2421104907989502, + "learning_rate": 4.9490403911727406e-06, + "loss": 0.6989, + "step": 1500 + }, + { + "epoch": 0.4122493820379017, + "grad_norm": 1.2049193382263184, + "learning_rate": 4.948967814511242e-06, + "loss": 0.653, + "step": 1501 + }, + { + "epoch": 0.4125240318593793, + "grad_norm": 1.2369251251220703, + "learning_rate": 4.948895186737603e-06, + "loss": 0.6789, + "step": 1502 + }, + { + "epoch": 0.4127986816808569, + "grad_norm": 1.225521445274353, + "learning_rate": 4.948822507853343e-06, + "loss": 0.6491, + "step": 1503 + }, + { + "epoch": 0.41307333150233455, + "grad_norm": 1.3163360357284546, + "learning_rate": 4.948749777859975e-06, + "loss": 0.6893, + "step": 1504 + }, + { + "epoch": 0.4133479813238121, + "grad_norm": 1.227973461151123, + "learning_rate": 4.94867699675902e-06, + "loss": 0.6712, + "step": 1505 + }, + { + "epoch": 0.41362263114528974, + "grad_norm": 1.2600226402282715, + "learning_rate": 4.948604164551994e-06, + "loss": 0.6926, + "step": 1506 + }, + { + "epoch": 0.41389728096676737, + "grad_norm": 1.281821370124817, + "learning_rate": 4.9485312812404206e-06, + "loss": 0.7105, + "step": 1507 + }, + { + "epoch": 0.414171930788245, + "grad_norm": 1.189564824104309, + "learning_rate": 4.9484583468258195e-06, + "loss": 0.6896, + "step": 1508 + }, + { + "epoch": 0.4144465806097226, + "grad_norm": 1.2889312505722046, + "learning_rate": 4.948385361309711e-06, + "loss": 0.6927, + "step": 1509 + }, + { + "epoch": 0.41472123043120024, + "grad_norm": 1.2584372758865356, + "learning_rate": 4.948312324693622e-06, + "loss": 0.6824, + "step": 1510 + }, + { + "epoch": 0.41499588025267786, + "grad_norm": 1.1768090724945068, + "learning_rate": 4.9482392369790735e-06, + "loss": 0.6651, + "step": 1511 + }, + { + "epoch": 0.41527053007415543, + "grad_norm": 1.242891550064087, + "learning_rate": 4.948166098167592e-06, + "loss": 0.7001, + "step": 1512 + }, + { + "epoch": 0.41554517989563305, + "grad_norm": 1.215121865272522, + "learning_rate": 4.948092908260705e-06, + "loss": 0.6937, + "step": 1513 + }, + { + "epoch": 0.4158198297171107, + "grad_norm": 1.1457231044769287, + "learning_rate": 4.948019667259939e-06, + "loss": 0.6102, + "step": 1514 + }, + { + "epoch": 0.4160944795385883, + "grad_norm": 1.2336541414260864, + "learning_rate": 4.947946375166823e-06, + "loss": 0.6733, + "step": 1515 + }, + { + "epoch": 0.4163691293600659, + "grad_norm": 1.2664320468902588, + "learning_rate": 4.947873031982886e-06, + "loss": 0.6883, + "step": 1516 + }, + { + "epoch": 0.41664377918154355, + "grad_norm": 1.373031735420227, + "learning_rate": 4.947799637709659e-06, + "loss": 0.7411, + "step": 1517 + }, + { + "epoch": 0.4169184290030212, + "grad_norm": 1.341872215270996, + "learning_rate": 4.947726192348675e-06, + "loss": 0.6968, + "step": 1518 + }, + { + "epoch": 0.41719307882449874, + "grad_norm": 1.3309589624404907, + "learning_rate": 4.947652695901466e-06, + "loss": 0.7612, + "step": 1519 + }, + { + "epoch": 0.41746772864597637, + "grad_norm": 1.2619577646255493, + "learning_rate": 4.947579148369565e-06, + "loss": 0.7229, + "step": 1520 + }, + { + "epoch": 0.417742378467454, + "grad_norm": 1.2879170179367065, + "learning_rate": 4.947505549754509e-06, + "loss": 0.7089, + "step": 1521 + }, + { + "epoch": 0.4180170282889316, + "grad_norm": 1.2597875595092773, + "learning_rate": 4.9474319000578315e-06, + "loss": 0.6372, + "step": 1522 + }, + { + "epoch": 0.41829167811040924, + "grad_norm": 1.2853755950927734, + "learning_rate": 4.947358199281072e-06, + "loss": 0.7007, + "step": 1523 + }, + { + "epoch": 0.41856632793188686, + "grad_norm": 1.2591743469238281, + "learning_rate": 4.947284447425767e-06, + "loss": 0.6806, + "step": 1524 + }, + { + "epoch": 0.4188409777533645, + "grad_norm": 1.2808767557144165, + "learning_rate": 4.947210644493458e-06, + "loss": 0.6978, + "step": 1525 + }, + { + "epoch": 0.41911562757484205, + "grad_norm": 1.2350125312805176, + "learning_rate": 4.947136790485682e-06, + "loss": 0.6754, + "step": 1526 + }, + { + "epoch": 0.4193902773963197, + "grad_norm": 1.256532907485962, + "learning_rate": 4.947062885403984e-06, + "loss": 0.7032, + "step": 1527 + }, + { + "epoch": 0.4196649272177973, + "grad_norm": 1.1937589645385742, + "learning_rate": 4.946988929249904e-06, + "loss": 0.6476, + "step": 1528 + }, + { + "epoch": 0.4199395770392749, + "grad_norm": 1.2803969383239746, + "learning_rate": 4.946914922024987e-06, + "loss": 0.6778, + "step": 1529 + }, + { + "epoch": 0.42021422686075255, + "grad_norm": 1.2882503271102905, + "learning_rate": 4.946840863730776e-06, + "loss": 0.7246, + "step": 1530 + }, + { + "epoch": 0.4204888766822302, + "grad_norm": 1.2682784795761108, + "learning_rate": 4.946766754368818e-06, + "loss": 0.6977, + "step": 1531 + }, + { + "epoch": 0.4207635265037078, + "grad_norm": 1.2257541418075562, + "learning_rate": 4.946692593940658e-06, + "loss": 0.6792, + "step": 1532 + }, + { + "epoch": 0.42103817632518536, + "grad_norm": 1.265275239944458, + "learning_rate": 4.946618382447847e-06, + "loss": 0.625, + "step": 1533 + }, + { + "epoch": 0.421312826146663, + "grad_norm": 1.2144159078598022, + "learning_rate": 4.94654411989193e-06, + "loss": 0.6502, + "step": 1534 + }, + { + "epoch": 0.4215874759681406, + "grad_norm": 1.3124994039535522, + "learning_rate": 4.946469806274461e-06, + "loss": 0.6993, + "step": 1535 + }, + { + "epoch": 0.42186212578961824, + "grad_norm": 1.2542037963867188, + "learning_rate": 4.946395441596988e-06, + "loss": 0.7113, + "step": 1536 + }, + { + "epoch": 0.42213677561109586, + "grad_norm": 1.2797579765319824, + "learning_rate": 4.946321025861063e-06, + "loss": 0.6729, + "step": 1537 + }, + { + "epoch": 0.4224114254325735, + "grad_norm": 1.2854171991348267, + "learning_rate": 4.9462465590682406e-06, + "loss": 0.6617, + "step": 1538 + }, + { + "epoch": 0.4226860752540511, + "grad_norm": 1.1967439651489258, + "learning_rate": 4.946172041220074e-06, + "loss": 0.681, + "step": 1539 + }, + { + "epoch": 0.4229607250755287, + "grad_norm": 1.2454828023910522, + "learning_rate": 4.946097472318119e-06, + "loss": 0.6842, + "step": 1540 + }, + { + "epoch": 0.4232353748970063, + "grad_norm": 1.2540063858032227, + "learning_rate": 4.946022852363932e-06, + "loss": 0.6992, + "step": 1541 + }, + { + "epoch": 0.4235100247184839, + "grad_norm": 1.2198692560195923, + "learning_rate": 4.9459481813590705e-06, + "loss": 0.7219, + "step": 1542 + }, + { + "epoch": 0.42378467453996155, + "grad_norm": 1.3245341777801514, + "learning_rate": 4.945873459305092e-06, + "loss": 0.693, + "step": 1543 + }, + { + "epoch": 0.42405932436143917, + "grad_norm": 1.2529648542404175, + "learning_rate": 4.945798686203557e-06, + "loss": 0.6636, + "step": 1544 + }, + { + "epoch": 0.4243339741829168, + "grad_norm": 1.2862932682037354, + "learning_rate": 4.945723862056026e-06, + "loss": 0.7076, + "step": 1545 + }, + { + "epoch": 0.4246086240043944, + "grad_norm": 1.2733591794967651, + "learning_rate": 4.945648986864059e-06, + "loss": 0.6429, + "step": 1546 + }, + { + "epoch": 0.424883273825872, + "grad_norm": 1.3012772798538208, + "learning_rate": 4.945574060629221e-06, + "loss": 0.7079, + "step": 1547 + }, + { + "epoch": 0.4251579236473496, + "grad_norm": 1.189072847366333, + "learning_rate": 4.945499083353075e-06, + "loss": 0.7058, + "step": 1548 + }, + { + "epoch": 0.42543257346882724, + "grad_norm": 1.1245629787445068, + "learning_rate": 4.945424055037185e-06, + "loss": 0.631, + "step": 1549 + }, + { + "epoch": 0.42570722329030486, + "grad_norm": 1.1956684589385986, + "learning_rate": 4.945348975683119e-06, + "loss": 0.6372, + "step": 1550 + }, + { + "epoch": 0.4259818731117825, + "grad_norm": 1.2560845613479614, + "learning_rate": 4.945273845292441e-06, + "loss": 0.6975, + "step": 1551 + }, + { + "epoch": 0.4262565229332601, + "grad_norm": 1.1784229278564453, + "learning_rate": 4.945198663866722e-06, + "loss": 0.6563, + "step": 1552 + }, + { + "epoch": 0.42653117275473773, + "grad_norm": 1.2194324731826782, + "learning_rate": 4.945123431407529e-06, + "loss": 0.6307, + "step": 1553 + }, + { + "epoch": 0.4268058225762153, + "grad_norm": 1.2263269424438477, + "learning_rate": 4.945048147916432e-06, + "loss": 0.6908, + "step": 1554 + }, + { + "epoch": 0.4270804723976929, + "grad_norm": 1.2652277946472168, + "learning_rate": 4.944972813395005e-06, + "loss": 0.6633, + "step": 1555 + }, + { + "epoch": 0.42735512221917055, + "grad_norm": 1.2781243324279785, + "learning_rate": 4.944897427844817e-06, + "loss": 0.6995, + "step": 1556 + }, + { + "epoch": 0.42762977204064817, + "grad_norm": 1.3732542991638184, + "learning_rate": 4.944821991267443e-06, + "loss": 0.673, + "step": 1557 + }, + { + "epoch": 0.4279044218621258, + "grad_norm": 1.2795166969299316, + "learning_rate": 4.944746503664458e-06, + "loss": 0.6716, + "step": 1558 + }, + { + "epoch": 0.4281790716836034, + "grad_norm": 1.2684571743011475, + "learning_rate": 4.944670965037436e-06, + "loss": 0.6885, + "step": 1559 + }, + { + "epoch": 0.42845372150508104, + "grad_norm": 1.3117479085922241, + "learning_rate": 4.944595375387954e-06, + "loss": 0.6761, + "step": 1560 + }, + { + "epoch": 0.4287283713265586, + "grad_norm": 1.250610113143921, + "learning_rate": 4.94451973471759e-06, + "loss": 0.6422, + "step": 1561 + }, + { + "epoch": 0.42900302114803623, + "grad_norm": 1.217047095298767, + "learning_rate": 4.944444043027923e-06, + "loss": 0.6754, + "step": 1562 + }, + { + "epoch": 0.42927767096951386, + "grad_norm": 1.236140251159668, + "learning_rate": 4.944368300320532e-06, + "loss": 0.6366, + "step": 1563 + }, + { + "epoch": 0.4295523207909915, + "grad_norm": 1.2492588758468628, + "learning_rate": 4.944292506596998e-06, + "loss": 0.6395, + "step": 1564 + }, + { + "epoch": 0.4298269706124691, + "grad_norm": 1.238175392150879, + "learning_rate": 4.944216661858903e-06, + "loss": 0.6901, + "step": 1565 + }, + { + "epoch": 0.43010162043394673, + "grad_norm": 1.2165322303771973, + "learning_rate": 4.94414076610783e-06, + "loss": 0.6528, + "step": 1566 + }, + { + "epoch": 0.43037627025542435, + "grad_norm": 1.2010551691055298, + "learning_rate": 4.9440648193453635e-06, + "loss": 0.6609, + "step": 1567 + }, + { + "epoch": 0.430650920076902, + "grad_norm": 1.2190098762512207, + "learning_rate": 4.943988821573088e-06, + "loss": 0.6663, + "step": 1568 + }, + { + "epoch": 0.43092556989837955, + "grad_norm": 1.2176289558410645, + "learning_rate": 4.943912772792589e-06, + "loss": 0.646, + "step": 1569 + }, + { + "epoch": 0.43120021971985717, + "grad_norm": 1.2417620420455933, + "learning_rate": 4.943836673005454e-06, + "loss": 0.7139, + "step": 1570 + }, + { + "epoch": 0.4314748695413348, + "grad_norm": 1.3032342195510864, + "learning_rate": 4.9437605222132725e-06, + "loss": 0.6883, + "step": 1571 + }, + { + "epoch": 0.4317495193628124, + "grad_norm": 1.2894343137741089, + "learning_rate": 4.9436843204176335e-06, + "loss": 0.7086, + "step": 1572 + }, + { + "epoch": 0.43202416918429004, + "grad_norm": 1.2305532693862915, + "learning_rate": 4.943608067620126e-06, + "loss": 0.7189, + "step": 1573 + }, + { + "epoch": 0.43229881900576766, + "grad_norm": 1.3441786766052246, + "learning_rate": 4.943531763822343e-06, + "loss": 0.6813, + "step": 1574 + }, + { + "epoch": 0.4325734688272453, + "grad_norm": 1.1841871738433838, + "learning_rate": 4.943455409025876e-06, + "loss": 0.6921, + "step": 1575 + }, + { + "epoch": 0.43284811864872286, + "grad_norm": 1.2839624881744385, + "learning_rate": 4.943379003232319e-06, + "loss": 0.6861, + "step": 1576 + }, + { + "epoch": 0.4331227684702005, + "grad_norm": 1.210640549659729, + "learning_rate": 4.943302546443266e-06, + "loss": 0.6776, + "step": 1577 + }, + { + "epoch": 0.4333974182916781, + "grad_norm": 1.2392061948776245, + "learning_rate": 4.943226038660315e-06, + "loss": 0.6899, + "step": 1578 + }, + { + "epoch": 0.43367206811315573, + "grad_norm": 1.2809741497039795, + "learning_rate": 4.94314947988506e-06, + "loss": 0.6742, + "step": 1579 + }, + { + "epoch": 0.43394671793463335, + "grad_norm": 1.2671804428100586, + "learning_rate": 4.9430728701191e-06, + "loss": 0.6922, + "step": 1580 + }, + { + "epoch": 0.434221367756111, + "grad_norm": 1.2340973615646362, + "learning_rate": 4.9429962093640345e-06, + "loss": 0.699, + "step": 1581 + }, + { + "epoch": 0.4344960175775886, + "grad_norm": 1.2183806896209717, + "learning_rate": 4.942919497621463e-06, + "loss": 0.6442, + "step": 1582 + }, + { + "epoch": 0.43477066739906617, + "grad_norm": 1.284279704093933, + "learning_rate": 4.942842734892985e-06, + "loss": 0.6905, + "step": 1583 + }, + { + "epoch": 0.4350453172205438, + "grad_norm": 1.2613437175750732, + "learning_rate": 4.942765921180205e-06, + "loss": 0.6467, + "step": 1584 + }, + { + "epoch": 0.4353199670420214, + "grad_norm": 1.2872633934020996, + "learning_rate": 4.942689056484725e-06, + "loss": 0.6863, + "step": 1585 + }, + { + "epoch": 0.43559461686349904, + "grad_norm": 1.2252823114395142, + "learning_rate": 4.94261214080815e-06, + "loss": 0.6554, + "step": 1586 + }, + { + "epoch": 0.43586926668497666, + "grad_norm": 1.2820498943328857, + "learning_rate": 4.942535174152084e-06, + "loss": 0.6927, + "step": 1587 + }, + { + "epoch": 0.4361439165064543, + "grad_norm": 1.248608112335205, + "learning_rate": 4.942458156518135e-06, + "loss": 0.6509, + "step": 1588 + }, + { + "epoch": 0.4364185663279319, + "grad_norm": 1.2266736030578613, + "learning_rate": 4.942381087907909e-06, + "loss": 0.6804, + "step": 1589 + }, + { + "epoch": 0.4366932161494095, + "grad_norm": 1.3936342000961304, + "learning_rate": 4.942303968323015e-06, + "loss": 0.7935, + "step": 1590 + }, + { + "epoch": 0.4369678659708871, + "grad_norm": 1.2903056144714355, + "learning_rate": 4.942226797765063e-06, + "loss": 0.6822, + "step": 1591 + }, + { + "epoch": 0.4372425157923647, + "grad_norm": 1.2534384727478027, + "learning_rate": 4.942149576235663e-06, + "loss": 0.7072, + "step": 1592 + }, + { + "epoch": 0.43751716561384235, + "grad_norm": 1.222512125968933, + "learning_rate": 4.942072303736426e-06, + "loss": 0.6387, + "step": 1593 + }, + { + "epoch": 0.43779181543532, + "grad_norm": 1.2618838548660278, + "learning_rate": 4.941994980268967e-06, + "loss": 0.6783, + "step": 1594 + }, + { + "epoch": 0.4380664652567976, + "grad_norm": 1.3110655546188354, + "learning_rate": 4.941917605834897e-06, + "loss": 0.6418, + "step": 1595 + }, + { + "epoch": 0.4383411150782752, + "grad_norm": 1.3148311376571655, + "learning_rate": 4.941840180435834e-06, + "loss": 0.6871, + "step": 1596 + }, + { + "epoch": 0.4386157648997528, + "grad_norm": 1.2832858562469482, + "learning_rate": 4.94176270407339e-06, + "loss": 0.6956, + "step": 1597 + }, + { + "epoch": 0.4388904147212304, + "grad_norm": 1.2520045042037964, + "learning_rate": 4.941685176749187e-06, + "loss": 0.7213, + "step": 1598 + }, + { + "epoch": 0.43916506454270804, + "grad_norm": 1.2185713052749634, + "learning_rate": 4.941607598464838e-06, + "loss": 0.6975, + "step": 1599 + }, + { + "epoch": 0.43943971436418566, + "grad_norm": 1.2325654029846191, + "learning_rate": 4.941529969221966e-06, + "loss": 0.6695, + "step": 1600 + }, + { + "epoch": 0.4397143641856633, + "grad_norm": 1.324038028717041, + "learning_rate": 4.941452289022189e-06, + "loss": 0.6484, + "step": 1601 + }, + { + "epoch": 0.4399890140071409, + "grad_norm": 1.2912266254425049, + "learning_rate": 4.941374557867129e-06, + "loss": 0.6779, + "step": 1602 + }, + { + "epoch": 0.44026366382861853, + "grad_norm": 1.2356009483337402, + "learning_rate": 4.941296775758407e-06, + "loss": 0.6252, + "step": 1603 + }, + { + "epoch": 0.4405383136500961, + "grad_norm": 1.2637747526168823, + "learning_rate": 4.941218942697649e-06, + "loss": 0.6594, + "step": 1604 + }, + { + "epoch": 0.4408129634715737, + "grad_norm": 1.3426119089126587, + "learning_rate": 4.941141058686478e-06, + "loss": 0.7015, + "step": 1605 + }, + { + "epoch": 0.44108761329305135, + "grad_norm": 1.2859525680541992, + "learning_rate": 4.94106312372652e-06, + "loss": 0.7156, + "step": 1606 + }, + { + "epoch": 0.441362263114529, + "grad_norm": 1.33936607837677, + "learning_rate": 4.9409851378194e-06, + "loss": 0.7066, + "step": 1607 + }, + { + "epoch": 0.4416369129360066, + "grad_norm": 1.2464051246643066, + "learning_rate": 4.940907100966746e-06, + "loss": 0.6521, + "step": 1608 + }, + { + "epoch": 0.4419115627574842, + "grad_norm": 1.200866460800171, + "learning_rate": 4.9408290131701896e-06, + "loss": 0.6334, + "step": 1609 + }, + { + "epoch": 0.44218621257896185, + "grad_norm": 1.3436448574066162, + "learning_rate": 4.940750874431357e-06, + "loss": 0.7087, + "step": 1610 + }, + { + "epoch": 0.4424608624004394, + "grad_norm": 1.2887132167816162, + "learning_rate": 4.94067268475188e-06, + "loss": 0.6515, + "step": 1611 + }, + { + "epoch": 0.44273551222191704, + "grad_norm": 1.3014705181121826, + "learning_rate": 4.940594444133392e-06, + "loss": 0.6881, + "step": 1612 + }, + { + "epoch": 0.44301016204339466, + "grad_norm": 1.251540184020996, + "learning_rate": 4.9405161525775245e-06, + "loss": 0.6812, + "step": 1613 + }, + { + "epoch": 0.4432848118648723, + "grad_norm": 1.2536545991897583, + "learning_rate": 4.940437810085912e-06, + "loss": 0.7187, + "step": 1614 + }, + { + "epoch": 0.4435594616863499, + "grad_norm": 1.2599084377288818, + "learning_rate": 4.94035941666019e-06, + "loss": 0.7126, + "step": 1615 + }, + { + "epoch": 0.44383411150782753, + "grad_norm": 1.2196757793426514, + "learning_rate": 4.940280972301993e-06, + "loss": 0.6724, + "step": 1616 + }, + { + "epoch": 0.44410876132930516, + "grad_norm": 1.388366460800171, + "learning_rate": 4.94020247701296e-06, + "loss": 0.71, + "step": 1617 + }, + { + "epoch": 0.4443834111507827, + "grad_norm": 1.1817978620529175, + "learning_rate": 4.940123930794728e-06, + "loss": 0.6902, + "step": 1618 + }, + { + "epoch": 0.44465806097226035, + "grad_norm": 1.2521659135818481, + "learning_rate": 4.9400453336489375e-06, + "loss": 0.6448, + "step": 1619 + }, + { + "epoch": 0.444932710793738, + "grad_norm": 1.3823257684707642, + "learning_rate": 4.939966685577228e-06, + "loss": 0.7099, + "step": 1620 + }, + { + "epoch": 0.4452073606152156, + "grad_norm": 1.2419886589050293, + "learning_rate": 4.939887986581241e-06, + "loss": 0.676, + "step": 1621 + }, + { + "epoch": 0.4454820104366932, + "grad_norm": 1.2642154693603516, + "learning_rate": 4.93980923666262e-06, + "loss": 0.7353, + "step": 1622 + }, + { + "epoch": 0.44575666025817084, + "grad_norm": 1.2579114437103271, + "learning_rate": 4.939730435823008e-06, + "loss": 0.6747, + "step": 1623 + }, + { + "epoch": 0.44603131007964847, + "grad_norm": 1.2223361730575562, + "learning_rate": 4.939651584064048e-06, + "loss": 0.7117, + "step": 1624 + }, + { + "epoch": 0.44630595990112604, + "grad_norm": 1.2102928161621094, + "learning_rate": 4.939572681387388e-06, + "loss": 0.6657, + "step": 1625 + }, + { + "epoch": 0.44658060972260366, + "grad_norm": 1.280430793762207, + "learning_rate": 4.939493727794675e-06, + "loss": 0.7185, + "step": 1626 + }, + { + "epoch": 0.4468552595440813, + "grad_norm": 1.3859812021255493, + "learning_rate": 4.939414723287555e-06, + "loss": 0.6125, + "step": 1627 + }, + { + "epoch": 0.4471299093655589, + "grad_norm": 1.198620080947876, + "learning_rate": 4.939335667867677e-06, + "loss": 0.657, + "step": 1628 + }, + { + "epoch": 0.44740455918703653, + "grad_norm": 1.2455341815948486, + "learning_rate": 4.9392565615366925e-06, + "loss": 0.682, + "step": 1629 + }, + { + "epoch": 0.44767920900851416, + "grad_norm": 1.301669955253601, + "learning_rate": 4.939177404296251e-06, + "loss": 0.6826, + "step": 1630 + }, + { + "epoch": 0.4479538588299918, + "grad_norm": 1.2868176698684692, + "learning_rate": 4.939098196148006e-06, + "loss": 0.7101, + "step": 1631 + }, + { + "epoch": 0.4482285086514694, + "grad_norm": 1.2437944412231445, + "learning_rate": 4.939018937093609e-06, + "loss": 0.6525, + "step": 1632 + }, + { + "epoch": 0.448503158472947, + "grad_norm": 1.2421884536743164, + "learning_rate": 4.9389396271347165e-06, + "loss": 0.6875, + "step": 1633 + }, + { + "epoch": 0.4487778082944246, + "grad_norm": 1.2536998987197876, + "learning_rate": 4.938860266272981e-06, + "loss": 0.6778, + "step": 1634 + }, + { + "epoch": 0.4490524581159022, + "grad_norm": 1.2578446865081787, + "learning_rate": 4.9387808545100615e-06, + "loss": 0.7038, + "step": 1635 + }, + { + "epoch": 0.44932710793737984, + "grad_norm": 1.3131754398345947, + "learning_rate": 4.938701391847613e-06, + "loss": 0.6849, + "step": 1636 + }, + { + "epoch": 0.44960175775885747, + "grad_norm": 1.3048213720321655, + "learning_rate": 4.938621878287295e-06, + "loss": 0.7317, + "step": 1637 + }, + { + "epoch": 0.4498764075803351, + "grad_norm": 1.215268611907959, + "learning_rate": 4.938542313830767e-06, + "loss": 0.6724, + "step": 1638 + }, + { + "epoch": 0.4501510574018127, + "grad_norm": 1.2254724502563477, + "learning_rate": 4.938462698479691e-06, + "loss": 0.6912, + "step": 1639 + }, + { + "epoch": 0.4504257072232903, + "grad_norm": 1.2056632041931152, + "learning_rate": 4.938383032235726e-06, + "loss": 0.7459, + "step": 1640 + }, + { + "epoch": 0.4507003570447679, + "grad_norm": 1.2603645324707031, + "learning_rate": 4.938303315100537e-06, + "loss": 0.7188, + "step": 1641 + }, + { + "epoch": 0.45097500686624553, + "grad_norm": 1.315182089805603, + "learning_rate": 4.938223547075786e-06, + "loss": 0.6603, + "step": 1642 + }, + { + "epoch": 0.45124965668772316, + "grad_norm": 1.2346665859222412, + "learning_rate": 4.938143728163139e-06, + "loss": 0.7085, + "step": 1643 + }, + { + "epoch": 0.4515243065092008, + "grad_norm": 1.2689255475997925, + "learning_rate": 4.938063858364262e-06, + "loss": 0.6996, + "step": 1644 + }, + { + "epoch": 0.4517989563306784, + "grad_norm": 1.2059011459350586, + "learning_rate": 4.93798393768082e-06, + "loss": 0.6545, + "step": 1645 + }, + { + "epoch": 0.452073606152156, + "grad_norm": 1.2702889442443848, + "learning_rate": 4.937903966114483e-06, + "loss": 0.5985, + "step": 1646 + }, + { + "epoch": 0.4523482559736336, + "grad_norm": 1.314550757408142, + "learning_rate": 4.93782394366692e-06, + "loss": 0.6951, + "step": 1647 + }, + { + "epoch": 0.4526229057951112, + "grad_norm": 1.2849299907684326, + "learning_rate": 4.9377438703398e-06, + "loss": 0.6813, + "step": 1648 + }, + { + "epoch": 0.45289755561658884, + "grad_norm": 1.2576013803482056, + "learning_rate": 4.937663746134796e-06, + "loss": 0.7189, + "step": 1649 + }, + { + "epoch": 0.45317220543806647, + "grad_norm": 1.2452319860458374, + "learning_rate": 4.937583571053579e-06, + "loss": 0.6805, + "step": 1650 + }, + { + "epoch": 0.4534468552595441, + "grad_norm": 1.2222715616226196, + "learning_rate": 4.937503345097822e-06, + "loss": 0.6808, + "step": 1651 + }, + { + "epoch": 0.4537215050810217, + "grad_norm": 1.2812227010726929, + "learning_rate": 4.9374230682691995e-06, + "loss": 0.6788, + "step": 1652 + }, + { + "epoch": 0.45399615490249934, + "grad_norm": 1.2094817161560059, + "learning_rate": 4.937342740569387e-06, + "loss": 0.6658, + "step": 1653 + }, + { + "epoch": 0.4542708047239769, + "grad_norm": 1.255818486213684, + "learning_rate": 4.9372623620000625e-06, + "loss": 0.6879, + "step": 1654 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.2455192804336548, + "learning_rate": 4.9371819325629026e-06, + "loss": 0.6549, + "step": 1655 + }, + { + "epoch": 0.45482010436693215, + "grad_norm": 1.3092188835144043, + "learning_rate": 4.937101452259584e-06, + "loss": 0.6806, + "step": 1656 + }, + { + "epoch": 0.4550947541884098, + "grad_norm": 1.262878179550171, + "learning_rate": 4.93702092109179e-06, + "loss": 0.6765, + "step": 1657 + }, + { + "epoch": 0.4553694040098874, + "grad_norm": 1.1884450912475586, + "learning_rate": 4.936940339061198e-06, + "loss": 0.6719, + "step": 1658 + }, + { + "epoch": 0.455644053831365, + "grad_norm": 1.2083607912063599, + "learning_rate": 4.936859706169492e-06, + "loss": 0.6196, + "step": 1659 + }, + { + "epoch": 0.45591870365284265, + "grad_norm": 1.3240455389022827, + "learning_rate": 4.936779022418354e-06, + "loss": 0.7303, + "step": 1660 + }, + { + "epoch": 0.4561933534743202, + "grad_norm": 1.311226487159729, + "learning_rate": 4.936698287809469e-06, + "loss": 0.7102, + "step": 1661 + }, + { + "epoch": 0.45646800329579784, + "grad_norm": 1.2333426475524902, + "learning_rate": 4.936617502344519e-06, + "loss": 0.6505, + "step": 1662 + }, + { + "epoch": 0.45674265311727547, + "grad_norm": 1.275201678276062, + "learning_rate": 4.936536666025194e-06, + "loss": 0.6881, + "step": 1663 + }, + { + "epoch": 0.4570173029387531, + "grad_norm": 1.2837852239608765, + "learning_rate": 4.9364557788531786e-06, + "loss": 0.6851, + "step": 1664 + }, + { + "epoch": 0.4572919527602307, + "grad_norm": 1.2120788097381592, + "learning_rate": 4.936374840830162e-06, + "loss": 0.6796, + "step": 1665 + }, + { + "epoch": 0.45756660258170834, + "grad_norm": 1.2897040843963623, + "learning_rate": 4.936293851957833e-06, + "loss": 0.7262, + "step": 1666 + }, + { + "epoch": 0.45784125240318596, + "grad_norm": 1.3181095123291016, + "learning_rate": 4.936212812237882e-06, + "loss": 0.6739, + "step": 1667 + }, + { + "epoch": 0.45811590222466353, + "grad_norm": 1.2430158853530884, + "learning_rate": 4.936131721672001e-06, + "loss": 0.6939, + "step": 1668 + }, + { + "epoch": 0.45839055204614115, + "grad_norm": 1.300011157989502, + "learning_rate": 4.936050580261881e-06, + "loss": 0.68, + "step": 1669 + }, + { + "epoch": 0.4586652018676188, + "grad_norm": 1.3406085968017578, + "learning_rate": 4.935969388009217e-06, + "loss": 0.6999, + "step": 1670 + }, + { + "epoch": 0.4589398516890964, + "grad_norm": 1.266564130783081, + "learning_rate": 4.9358881449157026e-06, + "loss": 0.6961, + "step": 1671 + }, + { + "epoch": 0.459214501510574, + "grad_norm": 1.3074579238891602, + "learning_rate": 4.935806850983034e-06, + "loss": 0.6908, + "step": 1672 + }, + { + "epoch": 0.45948915133205165, + "grad_norm": 1.1925123929977417, + "learning_rate": 4.935725506212908e-06, + "loss": 0.6523, + "step": 1673 + }, + { + "epoch": 0.4597638011535293, + "grad_norm": 1.2701035737991333, + "learning_rate": 4.935644110607021e-06, + "loss": 0.7152, + "step": 1674 + }, + { + "epoch": 0.46003845097500684, + "grad_norm": 1.2592908143997192, + "learning_rate": 4.935562664167073e-06, + "loss": 0.7062, + "step": 1675 + }, + { + "epoch": 0.46031310079648446, + "grad_norm": 1.3024771213531494, + "learning_rate": 4.935481166894763e-06, + "loss": 0.6313, + "step": 1676 + }, + { + "epoch": 0.4605877506179621, + "grad_norm": 1.2409968376159668, + "learning_rate": 4.935399618791793e-06, + "loss": 0.6421, + "step": 1677 + }, + { + "epoch": 0.4608624004394397, + "grad_norm": 1.271250605583191, + "learning_rate": 4.935318019859865e-06, + "loss": 0.6862, + "step": 1678 + }, + { + "epoch": 0.46113705026091734, + "grad_norm": 1.343415379524231, + "learning_rate": 4.935236370100682e-06, + "loss": 0.6748, + "step": 1679 + }, + { + "epoch": 0.46141170008239496, + "grad_norm": 1.2570387125015259, + "learning_rate": 4.935154669515946e-06, + "loss": 0.7023, + "step": 1680 + }, + { + "epoch": 0.4616863499038726, + "grad_norm": 1.1970798969268799, + "learning_rate": 4.9350729181073654e-06, + "loss": 0.6321, + "step": 1681 + }, + { + "epoch": 0.46196099972535015, + "grad_norm": 1.3742674589157104, + "learning_rate": 4.934991115876644e-06, + "loss": 0.7303, + "step": 1682 + }, + { + "epoch": 0.4622356495468278, + "grad_norm": 1.252060890197754, + "learning_rate": 4.93490926282549e-06, + "loss": 0.6505, + "step": 1683 + }, + { + "epoch": 0.4625102993683054, + "grad_norm": 1.2036149501800537, + "learning_rate": 4.934827358955612e-06, + "loss": 0.6776, + "step": 1684 + }, + { + "epoch": 0.462784949189783, + "grad_norm": 1.2437474727630615, + "learning_rate": 4.934745404268718e-06, + "loss": 0.6771, + "step": 1685 + }, + { + "epoch": 0.46305959901126065, + "grad_norm": 1.2944401502609253, + "learning_rate": 4.934663398766521e-06, + "loss": 0.683, + "step": 1686 + }, + { + "epoch": 0.46333424883273827, + "grad_norm": 1.2389048337936401, + "learning_rate": 4.93458134245073e-06, + "loss": 0.7083, + "step": 1687 + }, + { + "epoch": 0.4636088986542159, + "grad_norm": 1.2423871755599976, + "learning_rate": 4.934499235323059e-06, + "loss": 0.7159, + "step": 1688 + }, + { + "epoch": 0.46388354847569346, + "grad_norm": 1.2657099962234497, + "learning_rate": 4.934417077385222e-06, + "loss": 0.6885, + "step": 1689 + }, + { + "epoch": 0.4641581982971711, + "grad_norm": 1.2099254131317139, + "learning_rate": 4.934334868638932e-06, + "loss": 0.6291, + "step": 1690 + }, + { + "epoch": 0.4644328481186487, + "grad_norm": 1.2546802759170532, + "learning_rate": 4.934252609085907e-06, + "loss": 0.6626, + "step": 1691 + }, + { + "epoch": 0.46470749794012634, + "grad_norm": 1.2452806234359741, + "learning_rate": 4.934170298727861e-06, + "loss": 0.6782, + "step": 1692 + }, + { + "epoch": 0.46498214776160396, + "grad_norm": 1.2043837308883667, + "learning_rate": 4.934087937566514e-06, + "loss": 0.6287, + "step": 1693 + }, + { + "epoch": 0.4652567975830816, + "grad_norm": 1.210280418395996, + "learning_rate": 4.934005525603585e-06, + "loss": 0.6793, + "step": 1694 + }, + { + "epoch": 0.4655314474045592, + "grad_norm": 1.2614749670028687, + "learning_rate": 4.933923062840793e-06, + "loss": 0.6966, + "step": 1695 + }, + { + "epoch": 0.46580609722603683, + "grad_norm": 1.22611665725708, + "learning_rate": 4.93384054927986e-06, + "loss": 0.6965, + "step": 1696 + }, + { + "epoch": 0.4660807470475144, + "grad_norm": 1.200987696647644, + "learning_rate": 4.933757984922508e-06, + "loss": 0.674, + "step": 1697 + }, + { + "epoch": 0.466355396868992, + "grad_norm": 1.2729634046554565, + "learning_rate": 4.933675369770458e-06, + "loss": 0.7072, + "step": 1698 + }, + { + "epoch": 0.46663004669046965, + "grad_norm": 1.2457878589630127, + "learning_rate": 4.933592703825439e-06, + "loss": 0.6618, + "step": 1699 + }, + { + "epoch": 0.46690469651194727, + "grad_norm": 1.2492914199829102, + "learning_rate": 4.933509987089172e-06, + "loss": 0.7044, + "step": 1700 + }, + { + "epoch": 0.4671793463334249, + "grad_norm": 1.2340683937072754, + "learning_rate": 4.9334272195633834e-06, + "loss": 0.6295, + "step": 1701 + }, + { + "epoch": 0.4674539961549025, + "grad_norm": 1.2708743810653687, + "learning_rate": 4.933344401249803e-06, + "loss": 0.7164, + "step": 1702 + }, + { + "epoch": 0.46772864597638014, + "grad_norm": 1.3075398206710815, + "learning_rate": 4.933261532150159e-06, + "loss": 0.6895, + "step": 1703 + }, + { + "epoch": 0.4680032957978577, + "grad_norm": 1.229004144668579, + "learning_rate": 4.9331786122661805e-06, + "loss": 0.6697, + "step": 1704 + }, + { + "epoch": 0.46827794561933533, + "grad_norm": 1.1561369895935059, + "learning_rate": 4.933095641599597e-06, + "loss": 0.6284, + "step": 1705 + }, + { + "epoch": 0.46855259544081296, + "grad_norm": 1.221834659576416, + "learning_rate": 4.933012620152141e-06, + "loss": 0.6961, + "step": 1706 + }, + { + "epoch": 0.4688272452622906, + "grad_norm": 1.2034026384353638, + "learning_rate": 4.932929547925546e-06, + "loss": 0.6624, + "step": 1707 + }, + { + "epoch": 0.4691018950837682, + "grad_norm": 1.2451382875442505, + "learning_rate": 4.932846424921545e-06, + "loss": 0.6973, + "step": 1708 + }, + { + "epoch": 0.46937654490524583, + "grad_norm": 1.290609359741211, + "learning_rate": 4.932763251141872e-06, + "loss": 0.6724, + "step": 1709 + }, + { + "epoch": 0.46965119472672345, + "grad_norm": 1.22378671169281, + "learning_rate": 4.932680026588265e-06, + "loss": 0.6814, + "step": 1710 + }, + { + "epoch": 0.469925844548201, + "grad_norm": 1.2198066711425781, + "learning_rate": 4.932596751262458e-06, + "loss": 0.6547, + "step": 1711 + }, + { + "epoch": 0.47020049436967865, + "grad_norm": 1.341112732887268, + "learning_rate": 4.932513425166193e-06, + "loss": 0.7271, + "step": 1712 + }, + { + "epoch": 0.47047514419115627, + "grad_norm": 1.3232097625732422, + "learning_rate": 4.932430048301206e-06, + "loss": 0.7236, + "step": 1713 + }, + { + "epoch": 0.4707497940126339, + "grad_norm": 1.1853257417678833, + "learning_rate": 4.9323466206692385e-06, + "loss": 0.6821, + "step": 1714 + }, + { + "epoch": 0.4710244438341115, + "grad_norm": 1.2709271907806396, + "learning_rate": 4.93226314227203e-06, + "loss": 0.6806, + "step": 1715 + }, + { + "epoch": 0.47129909365558914, + "grad_norm": 1.2195218801498413, + "learning_rate": 4.932179613111325e-06, + "loss": 0.628, + "step": 1716 + }, + { + "epoch": 0.47157374347706676, + "grad_norm": 1.2403204441070557, + "learning_rate": 4.932096033188866e-06, + "loss": 0.7124, + "step": 1717 + }, + { + "epoch": 0.47184839329854433, + "grad_norm": 1.2112172842025757, + "learning_rate": 4.932012402506397e-06, + "loss": 0.6757, + "step": 1718 + }, + { + "epoch": 0.47212304312002196, + "grad_norm": 1.194536566734314, + "learning_rate": 4.931928721065664e-06, + "loss": 0.6512, + "step": 1719 + }, + { + "epoch": 0.4723976929414996, + "grad_norm": 1.254512906074524, + "learning_rate": 4.931844988868413e-06, + "loss": 0.6657, + "step": 1720 + }, + { + "epoch": 0.4726723427629772, + "grad_norm": 1.236161231994629, + "learning_rate": 4.931761205916392e-06, + "loss": 0.6614, + "step": 1721 + }, + { + "epoch": 0.47294699258445483, + "grad_norm": 1.2602201700210571, + "learning_rate": 4.931677372211349e-06, + "loss": 0.6863, + "step": 1722 + }, + { + "epoch": 0.47322164240593245, + "grad_norm": 1.1589335203170776, + "learning_rate": 4.931593487755035e-06, + "loss": 0.6607, + "step": 1723 + }, + { + "epoch": 0.4734962922274101, + "grad_norm": 1.3498005867004395, + "learning_rate": 4.9315095525491984e-06, + "loss": 0.6896, + "step": 1724 + }, + { + "epoch": 0.47377094204888764, + "grad_norm": 1.2705018520355225, + "learning_rate": 4.9314255665955935e-06, + "loss": 0.6503, + "step": 1725 + }, + { + "epoch": 0.47404559187036527, + "grad_norm": 1.2038240432739258, + "learning_rate": 4.931341529895972e-06, + "loss": 0.6286, + "step": 1726 + }, + { + "epoch": 0.4743202416918429, + "grad_norm": 1.3116551637649536, + "learning_rate": 4.931257442452088e-06, + "loss": 0.7249, + "step": 1727 + }, + { + "epoch": 0.4745948915133205, + "grad_norm": 1.2891385555267334, + "learning_rate": 4.9311733042656965e-06, + "loss": 0.6881, + "step": 1728 + }, + { + "epoch": 0.47486954133479814, + "grad_norm": 1.2637989521026611, + "learning_rate": 4.931089115338553e-06, + "loss": 0.6606, + "step": 1729 + }, + { + "epoch": 0.47514419115627576, + "grad_norm": 1.2251986265182495, + "learning_rate": 4.931004875672415e-06, + "loss": 0.6956, + "step": 1730 + }, + { + "epoch": 0.4754188409777534, + "grad_norm": 1.1930687427520752, + "learning_rate": 4.9309205852690415e-06, + "loss": 0.6879, + "step": 1731 + }, + { + "epoch": 0.47569349079923096, + "grad_norm": 1.2108919620513916, + "learning_rate": 4.93083624413019e-06, + "loss": 0.6788, + "step": 1732 + }, + { + "epoch": 0.4759681406207086, + "grad_norm": 1.2611151933670044, + "learning_rate": 4.930751852257622e-06, + "loss": 0.6966, + "step": 1733 + }, + { + "epoch": 0.4762427904421862, + "grad_norm": 1.2470356225967407, + "learning_rate": 4.930667409653098e-06, + "loss": 0.6882, + "step": 1734 + }, + { + "epoch": 0.4765174402636638, + "grad_norm": 1.2198002338409424, + "learning_rate": 4.930582916318381e-06, + "loss": 0.6453, + "step": 1735 + }, + { + "epoch": 0.47679209008514145, + "grad_norm": 1.1614460945129395, + "learning_rate": 4.930498372255236e-06, + "loss": 0.6905, + "step": 1736 + }, + { + "epoch": 0.4770667399066191, + "grad_norm": 1.1935980319976807, + "learning_rate": 4.930413777465425e-06, + "loss": 0.6923, + "step": 1737 + }, + { + "epoch": 0.4773413897280967, + "grad_norm": 1.2549549341201782, + "learning_rate": 4.930329131950714e-06, + "loss": 0.7034, + "step": 1738 + }, + { + "epoch": 0.47761603954957427, + "grad_norm": 1.32106351852417, + "learning_rate": 4.93024443571287e-06, + "loss": 0.6919, + "step": 1739 + }, + { + "epoch": 0.4778906893710519, + "grad_norm": 1.3458634614944458, + "learning_rate": 4.93015968875366e-06, + "loss": 0.7012, + "step": 1740 + }, + { + "epoch": 0.4781653391925295, + "grad_norm": 1.2276681661605835, + "learning_rate": 4.930074891074855e-06, + "loss": 0.6431, + "step": 1741 + }, + { + "epoch": 0.47843998901400714, + "grad_norm": 1.286747694015503, + "learning_rate": 4.929990042678223e-06, + "loss": 0.7, + "step": 1742 + }, + { + "epoch": 0.47871463883548476, + "grad_norm": 1.2554528713226318, + "learning_rate": 4.929905143565536e-06, + "loss": 0.6947, + "step": 1743 + }, + { + "epoch": 0.4789892886569624, + "grad_norm": 1.2579967975616455, + "learning_rate": 4.929820193738563e-06, + "loss": 0.6586, + "step": 1744 + }, + { + "epoch": 0.47926393847844, + "grad_norm": 1.3838120698928833, + "learning_rate": 4.92973519319908e-06, + "loss": 0.667, + "step": 1745 + }, + { + "epoch": 0.4795385882999176, + "grad_norm": 1.188249945640564, + "learning_rate": 4.929650141948861e-06, + "loss": 0.6247, + "step": 1746 + }, + { + "epoch": 0.4798132381213952, + "grad_norm": 1.2289414405822754, + "learning_rate": 4.929565039989679e-06, + "loss": 0.7058, + "step": 1747 + }, + { + "epoch": 0.4800878879428728, + "grad_norm": 1.3463776111602783, + "learning_rate": 4.929479887323312e-06, + "loss": 0.6848, + "step": 1748 + }, + { + "epoch": 0.48036253776435045, + "grad_norm": 1.2244699001312256, + "learning_rate": 4.9293946839515365e-06, + "loss": 0.6848, + "step": 1749 + }, + { + "epoch": 0.4806371875858281, + "grad_norm": 1.2998331785202026, + "learning_rate": 4.929309429876131e-06, + "loss": 0.6557, + "step": 1750 + }, + { + "epoch": 0.4809118374073057, + "grad_norm": 1.2760121822357178, + "learning_rate": 4.9292241250988745e-06, + "loss": 0.6684, + "step": 1751 + }, + { + "epoch": 0.4811864872287833, + "grad_norm": 1.3189417123794556, + "learning_rate": 4.929138769621548e-06, + "loss": 0.6895, + "step": 1752 + }, + { + "epoch": 0.4814611370502609, + "grad_norm": 1.345036268234253, + "learning_rate": 4.929053363445932e-06, + "loss": 0.7217, + "step": 1753 + }, + { + "epoch": 0.4817357868717385, + "grad_norm": 1.3052606582641602, + "learning_rate": 4.928967906573808e-06, + "loss": 0.7286, + "step": 1754 + }, + { + "epoch": 0.48201043669321614, + "grad_norm": 1.2289317846298218, + "learning_rate": 4.928882399006963e-06, + "loss": 0.6823, + "step": 1755 + }, + { + "epoch": 0.48228508651469376, + "grad_norm": 1.3643330335617065, + "learning_rate": 4.928796840747179e-06, + "loss": 0.6924, + "step": 1756 + }, + { + "epoch": 0.4825597363361714, + "grad_norm": 1.3866688013076782, + "learning_rate": 4.928711231796243e-06, + "loss": 0.6558, + "step": 1757 + }, + { + "epoch": 0.482834386157649, + "grad_norm": 1.2827922105789185, + "learning_rate": 4.928625572155941e-06, + "loss": 0.6564, + "step": 1758 + }, + { + "epoch": 0.48310903597912663, + "grad_norm": 1.1938042640686035, + "learning_rate": 4.92853986182806e-06, + "loss": 0.6643, + "step": 1759 + }, + { + "epoch": 0.48338368580060426, + "grad_norm": 1.2642765045166016, + "learning_rate": 4.92845410081439e-06, + "loss": 0.6682, + "step": 1760 + }, + { + "epoch": 0.4836583356220818, + "grad_norm": 1.222314715385437, + "learning_rate": 4.928368289116722e-06, + "loss": 0.6837, + "step": 1761 + }, + { + "epoch": 0.48393298544355945, + "grad_norm": 1.2198814153671265, + "learning_rate": 4.928282426736844e-06, + "loss": 0.6474, + "step": 1762 + }, + { + "epoch": 0.4842076352650371, + "grad_norm": 1.2768702507019043, + "learning_rate": 4.92819651367655e-06, + "loss": 0.6998, + "step": 1763 + }, + { + "epoch": 0.4844822850865147, + "grad_norm": 1.292870283126831, + "learning_rate": 4.928110549937633e-06, + "loss": 0.679, + "step": 1764 + }, + { + "epoch": 0.4847569349079923, + "grad_norm": 1.2230418920516968, + "learning_rate": 4.928024535521887e-06, + "loss": 0.6855, + "step": 1765 + }, + { + "epoch": 0.48503158472946994, + "grad_norm": 1.2187390327453613, + "learning_rate": 4.927938470431107e-06, + "loss": 0.6708, + "step": 1766 + }, + { + "epoch": 0.48530623455094757, + "grad_norm": 1.3087133169174194, + "learning_rate": 4.927852354667089e-06, + "loss": 0.7035, + "step": 1767 + }, + { + "epoch": 0.48558088437242514, + "grad_norm": 1.2728099822998047, + "learning_rate": 4.92776618823163e-06, + "loss": 0.6762, + "step": 1768 + }, + { + "epoch": 0.48585553419390276, + "grad_norm": 1.2004823684692383, + "learning_rate": 4.92767997112653e-06, + "loss": 0.6671, + "step": 1769 + }, + { + "epoch": 0.4861301840153804, + "grad_norm": 1.1882115602493286, + "learning_rate": 4.927593703353587e-06, + "loss": 0.6766, + "step": 1770 + }, + { + "epoch": 0.486404833836858, + "grad_norm": 1.2627111673355103, + "learning_rate": 4.927507384914603e-06, + "loss": 0.6864, + "step": 1771 + }, + { + "epoch": 0.48667948365833563, + "grad_norm": 1.2419143915176392, + "learning_rate": 4.927421015811376e-06, + "loss": 0.6869, + "step": 1772 + }, + { + "epoch": 0.48695413347981326, + "grad_norm": 1.300385594367981, + "learning_rate": 4.9273345960457135e-06, + "loss": 0.6941, + "step": 1773 + }, + { + "epoch": 0.4872287833012909, + "grad_norm": 1.2302087545394897, + "learning_rate": 4.927248125619416e-06, + "loss": 0.6872, + "step": 1774 + }, + { + "epoch": 0.48750343312276845, + "grad_norm": 1.212653636932373, + "learning_rate": 4.927161604534288e-06, + "loss": 0.6804, + "step": 1775 + }, + { + "epoch": 0.48777808294424607, + "grad_norm": 1.1563371419906616, + "learning_rate": 4.927075032792137e-06, + "loss": 0.6717, + "step": 1776 + }, + { + "epoch": 0.4880527327657237, + "grad_norm": 1.2417601346969604, + "learning_rate": 4.926988410394769e-06, + "loss": 0.6656, + "step": 1777 + }, + { + "epoch": 0.4883273825872013, + "grad_norm": 1.2074555158615112, + "learning_rate": 4.926901737343992e-06, + "loss": 0.6968, + "step": 1778 + }, + { + "epoch": 0.48860203240867894, + "grad_norm": 1.190871238708496, + "learning_rate": 4.926815013641615e-06, + "loss": 0.6485, + "step": 1779 + }, + { + "epoch": 0.48887668223015657, + "grad_norm": 1.3167977333068848, + "learning_rate": 4.9267282392894475e-06, + "loss": 0.7463, + "step": 1780 + }, + { + "epoch": 0.4891513320516342, + "grad_norm": 1.3299305438995361, + "learning_rate": 4.926641414289301e-06, + "loss": 0.6919, + "step": 1781 + }, + { + "epoch": 0.48942598187311176, + "grad_norm": 1.2470818758010864, + "learning_rate": 4.9265545386429865e-06, + "loss": 0.6682, + "step": 1782 + }, + { + "epoch": 0.4897006316945894, + "grad_norm": 1.2249537706375122, + "learning_rate": 4.92646761235232e-06, + "loss": 0.7133, + "step": 1783 + }, + { + "epoch": 0.489975281516067, + "grad_norm": 1.2733349800109863, + "learning_rate": 4.926380635419113e-06, + "loss": 0.6523, + "step": 1784 + }, + { + "epoch": 0.49024993133754463, + "grad_norm": 1.266682744026184, + "learning_rate": 4.926293607845182e-06, + "loss": 0.6657, + "step": 1785 + }, + { + "epoch": 0.49052458115902225, + "grad_norm": 1.226355791091919, + "learning_rate": 4.926206529632342e-06, + "loss": 0.6889, + "step": 1786 + }, + { + "epoch": 0.4907992309804999, + "grad_norm": 1.2639997005462646, + "learning_rate": 4.926119400782413e-06, + "loss": 0.6731, + "step": 1787 + }, + { + "epoch": 0.4910738808019775, + "grad_norm": 1.2373970746994019, + "learning_rate": 4.926032221297211e-06, + "loss": 0.6627, + "step": 1788 + }, + { + "epoch": 0.49134853062345507, + "grad_norm": 1.274160623550415, + "learning_rate": 4.925944991178557e-06, + "loss": 0.6321, + "step": 1789 + }, + { + "epoch": 0.4916231804449327, + "grad_norm": 1.2425028085708618, + "learning_rate": 4.92585771042827e-06, + "loss": 0.6584, + "step": 1790 + }, + { + "epoch": 0.4918978302664103, + "grad_norm": 1.2627785205841064, + "learning_rate": 4.925770379048174e-06, + "loss": 0.7117, + "step": 1791 + }, + { + "epoch": 0.49217248008788794, + "grad_norm": 1.2026883363723755, + "learning_rate": 4.925682997040089e-06, + "loss": 0.6812, + "step": 1792 + }, + { + "epoch": 0.49244712990936557, + "grad_norm": 1.3435001373291016, + "learning_rate": 4.925595564405841e-06, + "loss": 0.7388, + "step": 1793 + }, + { + "epoch": 0.4927217797308432, + "grad_norm": 1.2220122814178467, + "learning_rate": 4.925508081147253e-06, + "loss": 0.6979, + "step": 1794 + }, + { + "epoch": 0.4929964295523208, + "grad_norm": 1.271528959274292, + "learning_rate": 4.9254205472661526e-06, + "loss": 0.7117, + "step": 1795 + }, + { + "epoch": 0.4932710793737984, + "grad_norm": 1.23237943649292, + "learning_rate": 4.925332962764365e-06, + "loss": 0.6542, + "step": 1796 + }, + { + "epoch": 0.493545729195276, + "grad_norm": 1.269889235496521, + "learning_rate": 4.925245327643719e-06, + "loss": 0.6375, + "step": 1797 + }, + { + "epoch": 0.49382037901675363, + "grad_norm": 1.2590141296386719, + "learning_rate": 4.925157641906045e-06, + "loss": 0.6558, + "step": 1798 + }, + { + "epoch": 0.49409502883823125, + "grad_norm": 1.303390383720398, + "learning_rate": 4.92506990555317e-06, + "loss": 0.6893, + "step": 1799 + }, + { + "epoch": 0.4943696786597089, + "grad_norm": 1.1788426637649536, + "learning_rate": 4.924982118586928e-06, + "loss": 0.669, + "step": 1800 + }, + { + "epoch": 0.4946443284811865, + "grad_norm": 1.28995680809021, + "learning_rate": 4.92489428100915e-06, + "loss": 0.6939, + "step": 1801 + }, + { + "epoch": 0.4949189783026641, + "grad_norm": 1.2653744220733643, + "learning_rate": 4.924806392821668e-06, + "loss": 0.7234, + "step": 1802 + }, + { + "epoch": 0.4951936281241417, + "grad_norm": 1.1807552576065063, + "learning_rate": 4.924718454026318e-06, + "loss": 0.6205, + "step": 1803 + }, + { + "epoch": 0.4954682779456193, + "grad_norm": 1.2810486555099487, + "learning_rate": 4.924630464624936e-06, + "loss": 0.7144, + "step": 1804 + }, + { + "epoch": 0.49574292776709694, + "grad_norm": 1.2400248050689697, + "learning_rate": 4.924542424619356e-06, + "loss": 0.6624, + "step": 1805 + }, + { + "epoch": 0.49601757758857457, + "grad_norm": 1.2769737243652344, + "learning_rate": 4.924454334011418e-06, + "loss": 0.7065, + "step": 1806 + }, + { + "epoch": 0.4962922274100522, + "grad_norm": 1.32846200466156, + "learning_rate": 4.9243661928029584e-06, + "loss": 0.6956, + "step": 1807 + }, + { + "epoch": 0.4965668772315298, + "grad_norm": 1.2121623754501343, + "learning_rate": 4.924278000995818e-06, + "loss": 0.6221, + "step": 1808 + }, + { + "epoch": 0.49684152705300744, + "grad_norm": 1.258018970489502, + "learning_rate": 4.924189758591838e-06, + "loss": 0.6397, + "step": 1809 + }, + { + "epoch": 0.497116176874485, + "grad_norm": 1.22756826877594, + "learning_rate": 4.9241014655928575e-06, + "loss": 0.682, + "step": 1810 + }, + { + "epoch": 0.49739082669596263, + "grad_norm": 1.285510540008545, + "learning_rate": 4.924013122000722e-06, + "loss": 0.6664, + "step": 1811 + }, + { + "epoch": 0.49766547651744025, + "grad_norm": 1.2718364000320435, + "learning_rate": 4.923924727817274e-06, + "loss": 0.6486, + "step": 1812 + }, + { + "epoch": 0.4979401263389179, + "grad_norm": 1.2525216341018677, + "learning_rate": 4.923836283044359e-06, + "loss": 0.6988, + "step": 1813 + }, + { + "epoch": 0.4982147761603955, + "grad_norm": 1.3173061609268188, + "learning_rate": 4.923747787683823e-06, + "loss": 0.7212, + "step": 1814 + }, + { + "epoch": 0.4984894259818731, + "grad_norm": 1.2970421314239502, + "learning_rate": 4.923659241737511e-06, + "loss": 0.6856, + "step": 1815 + }, + { + "epoch": 0.49876407580335075, + "grad_norm": 1.2462059259414673, + "learning_rate": 4.923570645207275e-06, + "loss": 0.7372, + "step": 1816 + }, + { + "epoch": 0.4990387256248283, + "grad_norm": 1.206465721130371, + "learning_rate": 4.9234819980949605e-06, + "loss": 0.662, + "step": 1817 + }, + { + "epoch": 0.49931337544630594, + "grad_norm": 1.2343392372131348, + "learning_rate": 4.923393300402419e-06, + "loss": 0.6752, + "step": 1818 + }, + { + "epoch": 0.49958802526778356, + "grad_norm": 1.22999906539917, + "learning_rate": 4.923304552131501e-06, + "loss": 0.6742, + "step": 1819 + }, + { + "epoch": 0.4998626750892612, + "grad_norm": 1.332919716835022, + "learning_rate": 4.9232157532840606e-06, + "loss": 0.672, + "step": 1820 + }, + { + "epoch": 0.5001373249107388, + "grad_norm": 1.2197908163070679, + "learning_rate": 4.9231269038619486e-06, + "loss": 0.6363, + "step": 1821 + }, + { + "epoch": 0.5004119747322164, + "grad_norm": 1.323853850364685, + "learning_rate": 4.923038003867021e-06, + "loss": 0.6748, + "step": 1822 + }, + { + "epoch": 0.5006866245536941, + "grad_norm": 1.242728352546692, + "learning_rate": 4.922949053301133e-06, + "loss": 0.6707, + "step": 1823 + }, + { + "epoch": 0.5009612743751717, + "grad_norm": 1.3285831212997437, + "learning_rate": 4.922860052166141e-06, + "loss": 0.6556, + "step": 1824 + }, + { + "epoch": 0.5012359241966493, + "grad_norm": 1.261741280555725, + "learning_rate": 4.922771000463902e-06, + "loss": 0.6531, + "step": 1825 + }, + { + "epoch": 0.5015105740181269, + "grad_norm": 1.2713242769241333, + "learning_rate": 4.922681898196274e-06, + "loss": 0.6422, + "step": 1826 + }, + { + "epoch": 0.5017852238396046, + "grad_norm": 1.2380292415618896, + "learning_rate": 4.922592745365119e-06, + "loss": 0.703, + "step": 1827 + }, + { + "epoch": 0.5020598736610821, + "grad_norm": 1.3056650161743164, + "learning_rate": 4.922503541972296e-06, + "loss": 0.729, + "step": 1828 + }, + { + "epoch": 0.5023345234825597, + "grad_norm": 1.2135775089263916, + "learning_rate": 4.922414288019668e-06, + "loss": 0.6661, + "step": 1829 + }, + { + "epoch": 0.5026091733040373, + "grad_norm": 1.3339289426803589, + "learning_rate": 4.9223249835090956e-06, + "loss": 0.6948, + "step": 1830 + }, + { + "epoch": 0.5028838231255149, + "grad_norm": 1.281559705734253, + "learning_rate": 4.922235628442444e-06, + "loss": 0.7195, + "step": 1831 + }, + { + "epoch": 0.5031584729469926, + "grad_norm": 1.315497875213623, + "learning_rate": 4.92214622282158e-06, + "loss": 0.7074, + "step": 1832 + }, + { + "epoch": 0.5034331227684702, + "grad_norm": 1.1987298727035522, + "learning_rate": 4.9220567666483655e-06, + "loss": 0.6624, + "step": 1833 + }, + { + "epoch": 0.5037077725899478, + "grad_norm": 1.3246207237243652, + "learning_rate": 4.921967259924671e-06, + "loss": 0.7276, + "step": 1834 + }, + { + "epoch": 0.5039824224114254, + "grad_norm": 1.2300755977630615, + "learning_rate": 4.921877702652362e-06, + "loss": 0.6805, + "step": 1835 + }, + { + "epoch": 0.5042570722329031, + "grad_norm": 1.2585492134094238, + "learning_rate": 4.9217880948333104e-06, + "loss": 0.6978, + "step": 1836 + }, + { + "epoch": 0.5045317220543807, + "grad_norm": 1.2830356359481812, + "learning_rate": 4.921698436469385e-06, + "loss": 0.6848, + "step": 1837 + }, + { + "epoch": 0.5048063718758583, + "grad_norm": 1.2256406545639038, + "learning_rate": 4.921608727562455e-06, + "loss": 0.6654, + "step": 1838 + }, + { + "epoch": 0.5050810216973359, + "grad_norm": 1.2478407621383667, + "learning_rate": 4.921518968114396e-06, + "loss": 0.7352, + "step": 1839 + }, + { + "epoch": 0.5053556715188136, + "grad_norm": 1.3068411350250244, + "learning_rate": 4.9214291581270805e-06, + "loss": 0.6689, + "step": 1840 + }, + { + "epoch": 0.5056303213402912, + "grad_norm": 1.3199888467788696, + "learning_rate": 4.921339297602381e-06, + "loss": 0.6472, + "step": 1841 + }, + { + "epoch": 0.5059049711617687, + "grad_norm": 1.3088469505310059, + "learning_rate": 4.921249386542176e-06, + "loss": 0.6844, + "step": 1842 + }, + { + "epoch": 0.5061796209832463, + "grad_norm": 1.2106199264526367, + "learning_rate": 4.9211594249483395e-06, + "loss": 0.6319, + "step": 1843 + }, + { + "epoch": 0.5064542708047239, + "grad_norm": 1.1742829084396362, + "learning_rate": 4.921069412822751e-06, + "loss": 0.6722, + "step": 1844 + }, + { + "epoch": 0.5067289206262016, + "grad_norm": 1.2606443166732788, + "learning_rate": 4.920979350167286e-06, + "loss": 0.6693, + "step": 1845 + }, + { + "epoch": 0.5070035704476792, + "grad_norm": 1.222058653831482, + "learning_rate": 4.920889236983829e-06, + "loss": 0.6793, + "step": 1846 + }, + { + "epoch": 0.5072782202691568, + "grad_norm": 1.2871266603469849, + "learning_rate": 4.9207990732742564e-06, + "loss": 0.6864, + "step": 1847 + }, + { + "epoch": 0.5075528700906344, + "grad_norm": 1.154127836227417, + "learning_rate": 4.920708859040452e-06, + "loss": 0.633, + "step": 1848 + }, + { + "epoch": 0.5078275199121121, + "grad_norm": 1.232245683670044, + "learning_rate": 4.920618594284298e-06, + "loss": 0.7175, + "step": 1849 + }, + { + "epoch": 0.5081021697335897, + "grad_norm": 1.2520184516906738, + "learning_rate": 4.92052827900768e-06, + "loss": 0.6649, + "step": 1850 + }, + { + "epoch": 0.5083768195550673, + "grad_norm": 1.2655351161956787, + "learning_rate": 4.920437913212481e-06, + "loss": 0.6917, + "step": 1851 + }, + { + "epoch": 0.5086514693765449, + "grad_norm": 1.27291738986969, + "learning_rate": 4.920347496900587e-06, + "loss": 0.708, + "step": 1852 + }, + { + "epoch": 0.5089261191980226, + "grad_norm": 1.2721384763717651, + "learning_rate": 4.920257030073886e-06, + "loss": 0.7242, + "step": 1853 + }, + { + "epoch": 0.5092007690195002, + "grad_norm": 1.3439891338348389, + "learning_rate": 4.920166512734266e-06, + "loss": 0.6824, + "step": 1854 + }, + { + "epoch": 0.5094754188409778, + "grad_norm": 1.2531839609146118, + "learning_rate": 4.920075944883616e-06, + "loss": 0.7076, + "step": 1855 + }, + { + "epoch": 0.5097500686624554, + "grad_norm": 1.2528809309005737, + "learning_rate": 4.919985326523826e-06, + "loss": 0.6915, + "step": 1856 + }, + { + "epoch": 0.5100247184839329, + "grad_norm": 1.214250922203064, + "learning_rate": 4.919894657656787e-06, + "loss": 0.6923, + "step": 1857 + }, + { + "epoch": 0.5102993683054106, + "grad_norm": 1.2683045864105225, + "learning_rate": 4.919803938284393e-06, + "loss": 0.663, + "step": 1858 + }, + { + "epoch": 0.5105740181268882, + "grad_norm": 1.250664234161377, + "learning_rate": 4.919713168408536e-06, + "loss": 0.709, + "step": 1859 + }, + { + "epoch": 0.5108486679483658, + "grad_norm": 1.2961539030075073, + "learning_rate": 4.91962234803111e-06, + "loss": 0.7253, + "step": 1860 + }, + { + "epoch": 0.5111233177698434, + "grad_norm": 1.2377573251724243, + "learning_rate": 4.9195314771540116e-06, + "loss": 0.6815, + "step": 1861 + }, + { + "epoch": 0.5113979675913211, + "grad_norm": 1.2084695100784302, + "learning_rate": 4.919440555779137e-06, + "loss": 0.675, + "step": 1862 + }, + { + "epoch": 0.5116726174127987, + "grad_norm": 1.2381694316864014, + "learning_rate": 4.919349583908384e-06, + "loss": 0.6682, + "step": 1863 + }, + { + "epoch": 0.5119472672342763, + "grad_norm": 1.3362971544265747, + "learning_rate": 4.919258561543651e-06, + "loss": 0.6702, + "step": 1864 + }, + { + "epoch": 0.5122219170557539, + "grad_norm": 1.2515486478805542, + "learning_rate": 4.919167488686837e-06, + "loss": 0.7385, + "step": 1865 + }, + { + "epoch": 0.5124965668772316, + "grad_norm": 1.3072539567947388, + "learning_rate": 4.919076365339844e-06, + "loss": 0.6779, + "step": 1866 + }, + { + "epoch": 0.5127712166987092, + "grad_norm": 1.2080353498458862, + "learning_rate": 4.9189851915045724e-06, + "loss": 0.6344, + "step": 1867 + }, + { + "epoch": 0.5130458665201868, + "grad_norm": 1.2061829566955566, + "learning_rate": 4.918893967182927e-06, + "loss": 0.6811, + "step": 1868 + }, + { + "epoch": 0.5133205163416644, + "grad_norm": 1.2865073680877686, + "learning_rate": 4.918802692376811e-06, + "loss": 0.6983, + "step": 1869 + }, + { + "epoch": 0.513595166163142, + "grad_norm": 1.3096965551376343, + "learning_rate": 4.918711367088129e-06, + "loss": 0.6865, + "step": 1870 + }, + { + "epoch": 0.5138698159846196, + "grad_norm": 1.2871531248092651, + "learning_rate": 4.918619991318787e-06, + "loss": 0.6234, + "step": 1871 + }, + { + "epoch": 0.5141444658060972, + "grad_norm": 1.3479686975479126, + "learning_rate": 4.918528565070692e-06, + "loss": 0.7229, + "step": 1872 + }, + { + "epoch": 0.5144191156275748, + "grad_norm": 1.2918241024017334, + "learning_rate": 4.918437088345753e-06, + "loss": 0.6652, + "step": 1873 + }, + { + "epoch": 0.5146937654490524, + "grad_norm": 1.2369645833969116, + "learning_rate": 4.918345561145878e-06, + "loss": 0.6293, + "step": 1874 + }, + { + "epoch": 0.51496841527053, + "grad_norm": 1.3120394945144653, + "learning_rate": 4.918253983472978e-06, + "loss": 0.6506, + "step": 1875 + }, + { + "epoch": 0.5152430650920077, + "grad_norm": 1.2707550525665283, + "learning_rate": 4.918162355328964e-06, + "loss": 0.6817, + "step": 1876 + }, + { + "epoch": 0.5155177149134853, + "grad_norm": 1.2679158449172974, + "learning_rate": 4.918070676715749e-06, + "loss": 0.6777, + "step": 1877 + }, + { + "epoch": 0.5157923647349629, + "grad_norm": 1.1695531606674194, + "learning_rate": 4.917978947635246e-06, + "loss": 0.6651, + "step": 1878 + }, + { + "epoch": 0.5160670145564406, + "grad_norm": 1.3017899990081787, + "learning_rate": 4.917887168089369e-06, + "loss": 0.7368, + "step": 1879 + }, + { + "epoch": 0.5163416643779182, + "grad_norm": 1.2308164834976196, + "learning_rate": 4.917795338080034e-06, + "loss": 0.6791, + "step": 1880 + }, + { + "epoch": 0.5166163141993958, + "grad_norm": 1.2852586507797241, + "learning_rate": 4.917703457609158e-06, + "loss": 0.6651, + "step": 1881 + }, + { + "epoch": 0.5168909640208734, + "grad_norm": 1.2615993022918701, + "learning_rate": 4.917611526678658e-06, + "loss": 0.7058, + "step": 1882 + }, + { + "epoch": 0.517165613842351, + "grad_norm": 1.1923750638961792, + "learning_rate": 4.917519545290453e-06, + "loss": 0.6895, + "step": 1883 + }, + { + "epoch": 0.5174402636638287, + "grad_norm": 1.2352246046066284, + "learning_rate": 4.917427513446461e-06, + "loss": 0.6689, + "step": 1884 + }, + { + "epoch": 0.5177149134853062, + "grad_norm": 1.2336764335632324, + "learning_rate": 4.917335431148606e-06, + "loss": 0.6711, + "step": 1885 + }, + { + "epoch": 0.5179895633067838, + "grad_norm": 1.2499829530715942, + "learning_rate": 4.917243298398808e-06, + "loss": 0.6904, + "step": 1886 + }, + { + "epoch": 0.5182642131282614, + "grad_norm": 1.2408851385116577, + "learning_rate": 4.917151115198989e-06, + "loss": 0.7062, + "step": 1887 + }, + { + "epoch": 0.518538862949739, + "grad_norm": 1.3008146286010742, + "learning_rate": 4.9170588815510755e-06, + "loss": 0.6998, + "step": 1888 + }, + { + "epoch": 0.5188135127712167, + "grad_norm": 1.1868892908096313, + "learning_rate": 4.91696659745699e-06, + "loss": 0.6284, + "step": 1889 + }, + { + "epoch": 0.5190881625926943, + "grad_norm": 1.2608468532562256, + "learning_rate": 4.91687426291866e-06, + "loss": 0.6602, + "step": 1890 + }, + { + "epoch": 0.5193628124141719, + "grad_norm": 1.2133026123046875, + "learning_rate": 4.916781877938011e-06, + "loss": 0.6739, + "step": 1891 + }, + { + "epoch": 0.5196374622356495, + "grad_norm": 1.1733887195587158, + "learning_rate": 4.916689442516974e-06, + "loss": 0.6778, + "step": 1892 + }, + { + "epoch": 0.5199121120571272, + "grad_norm": 1.268813133239746, + "learning_rate": 4.916596956657476e-06, + "loss": 0.705, + "step": 1893 + }, + { + "epoch": 0.5201867618786048, + "grad_norm": 1.285495638847351, + "learning_rate": 4.916504420361447e-06, + "loss": 0.7019, + "step": 1894 + }, + { + "epoch": 0.5204614117000824, + "grad_norm": 1.158308982849121, + "learning_rate": 4.916411833630821e-06, + "loss": 0.6661, + "step": 1895 + }, + { + "epoch": 0.52073606152156, + "grad_norm": 1.3213388919830322, + "learning_rate": 4.916319196467527e-06, + "loss": 0.6522, + "step": 1896 + }, + { + "epoch": 0.5210107113430377, + "grad_norm": 1.3089981079101562, + "learning_rate": 4.916226508873501e-06, + "loss": 0.6211, + "step": 1897 + }, + { + "epoch": 0.5212853611645153, + "grad_norm": 1.246030330657959, + "learning_rate": 4.916133770850675e-06, + "loss": 0.7096, + "step": 1898 + }, + { + "epoch": 0.5215600109859928, + "grad_norm": 1.23479163646698, + "learning_rate": 4.916040982400986e-06, + "loss": 0.6995, + "step": 1899 + }, + { + "epoch": 0.5218346608074704, + "grad_norm": 1.231031060218811, + "learning_rate": 4.915948143526371e-06, + "loss": 0.6848, + "step": 1900 + }, + { + "epoch": 0.522109310628948, + "grad_norm": 1.3100155591964722, + "learning_rate": 4.915855254228767e-06, + "loss": 0.6873, + "step": 1901 + }, + { + "epoch": 0.5223839604504257, + "grad_norm": 1.1904327869415283, + "learning_rate": 4.915762314510113e-06, + "loss": 0.6742, + "step": 1902 + }, + { + "epoch": 0.5226586102719033, + "grad_norm": 1.2119362354278564, + "learning_rate": 4.915669324372348e-06, + "loss": 0.6321, + "step": 1903 + }, + { + "epoch": 0.5229332600933809, + "grad_norm": 1.18685781955719, + "learning_rate": 4.915576283817415e-06, + "loss": 0.6458, + "step": 1904 + }, + { + "epoch": 0.5232079099148585, + "grad_norm": 1.2201762199401855, + "learning_rate": 4.915483192847252e-06, + "loss": 0.6887, + "step": 1905 + }, + { + "epoch": 0.5234825597363362, + "grad_norm": 1.202985167503357, + "learning_rate": 4.915390051463806e-06, + "loss": 0.6946, + "step": 1906 + }, + { + "epoch": 0.5237572095578138, + "grad_norm": 1.1923401355743408, + "learning_rate": 4.915296859669017e-06, + "loss": 0.6777, + "step": 1907 + }, + { + "epoch": 0.5240318593792914, + "grad_norm": 1.348125696182251, + "learning_rate": 4.915203617464833e-06, + "loss": 0.6685, + "step": 1908 + }, + { + "epoch": 0.524306509200769, + "grad_norm": 1.2768113613128662, + "learning_rate": 4.915110324853199e-06, + "loss": 0.7102, + "step": 1909 + }, + { + "epoch": 0.5245811590222467, + "grad_norm": 1.2948483228683472, + "learning_rate": 4.915016981836062e-06, + "loss": 0.7041, + "step": 1910 + }, + { + "epoch": 0.5248558088437243, + "grad_norm": 1.1948193311691284, + "learning_rate": 4.91492358841537e-06, + "loss": 0.7029, + "step": 1911 + }, + { + "epoch": 0.5251304586652019, + "grad_norm": 1.2794129848480225, + "learning_rate": 4.914830144593074e-06, + "loss": 0.709, + "step": 1912 + }, + { + "epoch": 0.5254051084866794, + "grad_norm": 1.235395073890686, + "learning_rate": 4.914736650371121e-06, + "loss": 0.6902, + "step": 1913 + }, + { + "epoch": 0.525679758308157, + "grad_norm": 1.2710777521133423, + "learning_rate": 4.914643105751464e-06, + "loss": 0.6473, + "step": 1914 + }, + { + "epoch": 0.5259544081296347, + "grad_norm": 1.2397987842559814, + "learning_rate": 4.914549510736057e-06, + "loss": 0.6617, + "step": 1915 + }, + { + "epoch": 0.5262290579511123, + "grad_norm": 1.1843680143356323, + "learning_rate": 4.91445586532685e-06, + "loss": 0.6282, + "step": 1916 + }, + { + "epoch": 0.5265037077725899, + "grad_norm": 1.2912824153900146, + "learning_rate": 4.9143621695258e-06, + "loss": 0.7096, + "step": 1917 + }, + { + "epoch": 0.5267783575940675, + "grad_norm": 1.2232455015182495, + "learning_rate": 4.914268423334863e-06, + "loss": 0.6606, + "step": 1918 + }, + { + "epoch": 0.5270530074155452, + "grad_norm": 1.2481834888458252, + "learning_rate": 4.9141746267559925e-06, + "loss": 0.6753, + "step": 1919 + }, + { + "epoch": 0.5273276572370228, + "grad_norm": 1.2774497270584106, + "learning_rate": 4.914080779791149e-06, + "loss": 0.6418, + "step": 1920 + }, + { + "epoch": 0.5276023070585004, + "grad_norm": 1.1941187381744385, + "learning_rate": 4.91398688244229e-06, + "loss": 0.7007, + "step": 1921 + }, + { + "epoch": 0.527876956879978, + "grad_norm": 1.1485710144042969, + "learning_rate": 4.913892934711375e-06, + "loss": 0.6469, + "step": 1922 + }, + { + "epoch": 0.5281516067014557, + "grad_norm": 1.2516621351242065, + "learning_rate": 4.913798936600366e-06, + "loss": 0.688, + "step": 1923 + }, + { + "epoch": 0.5284262565229333, + "grad_norm": 1.1951640844345093, + "learning_rate": 4.913704888111222e-06, + "loss": 0.6073, + "step": 1924 + }, + { + "epoch": 0.5287009063444109, + "grad_norm": 1.2852009534835815, + "learning_rate": 4.91361078924591e-06, + "loss": 0.6824, + "step": 1925 + }, + { + "epoch": 0.5289755561658885, + "grad_norm": 1.3097145557403564, + "learning_rate": 4.9135166400063916e-06, + "loss": 0.6967, + "step": 1926 + }, + { + "epoch": 0.5292502059873662, + "grad_norm": 1.2610855102539062, + "learning_rate": 4.9134224403946305e-06, + "loss": 0.6903, + "step": 1927 + }, + { + "epoch": 0.5295248558088437, + "grad_norm": 1.2302881479263306, + "learning_rate": 4.913328190412596e-06, + "loss": 0.6852, + "step": 1928 + }, + { + "epoch": 0.5297995056303213, + "grad_norm": 1.2909409999847412, + "learning_rate": 4.913233890062252e-06, + "loss": 0.6725, + "step": 1929 + }, + { + "epoch": 0.5300741554517989, + "grad_norm": 1.3593473434448242, + "learning_rate": 4.913139539345568e-06, + "loss": 0.7105, + "step": 1930 + }, + { + "epoch": 0.5303488052732765, + "grad_norm": 1.2503076791763306, + "learning_rate": 4.913045138264514e-06, + "loss": 0.7035, + "step": 1931 + }, + { + "epoch": 0.5306234550947542, + "grad_norm": 1.2476969957351685, + "learning_rate": 4.912950686821059e-06, + "loss": 0.6548, + "step": 1932 + }, + { + "epoch": 0.5308981049162318, + "grad_norm": 1.286202073097229, + "learning_rate": 4.912856185017175e-06, + "loss": 0.6597, + "step": 1933 + }, + { + "epoch": 0.5311727547377094, + "grad_norm": 1.1525657176971436, + "learning_rate": 4.912761632854834e-06, + "loss": 0.6369, + "step": 1934 + }, + { + "epoch": 0.531447404559187, + "grad_norm": 1.2875515222549438, + "learning_rate": 4.912667030336009e-06, + "loss": 0.7206, + "step": 1935 + }, + { + "epoch": 0.5317220543806647, + "grad_norm": 1.2953742742538452, + "learning_rate": 4.912572377462675e-06, + "loss": 0.6909, + "step": 1936 + }, + { + "epoch": 0.5319967042021423, + "grad_norm": 1.2154854536056519, + "learning_rate": 4.912477674236808e-06, + "loss": 0.68, + "step": 1937 + }, + { + "epoch": 0.5322713540236199, + "grad_norm": 1.2394602298736572, + "learning_rate": 4.912382920660382e-06, + "loss": 0.6897, + "step": 1938 + }, + { + "epoch": 0.5325460038450975, + "grad_norm": 1.2848504781723022, + "learning_rate": 4.912288116735379e-06, + "loss": 0.6614, + "step": 1939 + }, + { + "epoch": 0.5328206536665752, + "grad_norm": 1.233673334121704, + "learning_rate": 4.912193262463774e-06, + "loss": 0.6582, + "step": 1940 + }, + { + "epoch": 0.5330953034880528, + "grad_norm": 1.2133405208587646, + "learning_rate": 4.912098357847547e-06, + "loss": 0.6687, + "step": 1941 + }, + { + "epoch": 0.5333699533095303, + "grad_norm": 1.2398384809494019, + "learning_rate": 4.912003402888681e-06, + "loss": 0.6746, + "step": 1942 + }, + { + "epoch": 0.5336446031310079, + "grad_norm": 1.229853630065918, + "learning_rate": 4.911908397589155e-06, + "loss": 0.6393, + "step": 1943 + }, + { + "epoch": 0.5339192529524855, + "grad_norm": 1.2647844552993774, + "learning_rate": 4.911813341950954e-06, + "loss": 0.7262, + "step": 1944 + }, + { + "epoch": 0.5341939027739632, + "grad_norm": 1.3007748126983643, + "learning_rate": 4.911718235976061e-06, + "loss": 0.6694, + "step": 1945 + }, + { + "epoch": 0.5344685525954408, + "grad_norm": 1.2805767059326172, + "learning_rate": 4.911623079666461e-06, + "loss": 0.6885, + "step": 1946 + }, + { + "epoch": 0.5347432024169184, + "grad_norm": 1.2015613317489624, + "learning_rate": 4.91152787302414e-06, + "loss": 0.6663, + "step": 1947 + }, + { + "epoch": 0.535017852238396, + "grad_norm": 1.180745005607605, + "learning_rate": 4.911432616051085e-06, + "loss": 0.6807, + "step": 1948 + }, + { + "epoch": 0.5352925020598737, + "grad_norm": 1.241559386253357, + "learning_rate": 4.911337308749285e-06, + "loss": 0.6577, + "step": 1949 + }, + { + "epoch": 0.5355671518813513, + "grad_norm": 1.2465488910675049, + "learning_rate": 4.9112419511207275e-06, + "loss": 0.6595, + "step": 1950 + }, + { + "epoch": 0.5358418017028289, + "grad_norm": 1.09845769405365, + "learning_rate": 4.911146543167404e-06, + "loss": 0.5913, + "step": 1951 + }, + { + "epoch": 0.5361164515243065, + "grad_norm": 1.2450032234191895, + "learning_rate": 4.911051084891304e-06, + "loss": 0.6214, + "step": 1952 + }, + { + "epoch": 0.5363911013457842, + "grad_norm": 1.1663990020751953, + "learning_rate": 4.910955576294423e-06, + "loss": 0.6515, + "step": 1953 + }, + { + "epoch": 0.5366657511672618, + "grad_norm": 1.384398102760315, + "learning_rate": 4.910860017378752e-06, + "loss": 0.6942, + "step": 1954 + }, + { + "epoch": 0.5369404009887394, + "grad_norm": 1.333154320716858, + "learning_rate": 4.910764408146284e-06, + "loss": 0.7244, + "step": 1955 + }, + { + "epoch": 0.5372150508102169, + "grad_norm": 1.2546206712722778, + "learning_rate": 4.910668748599018e-06, + "loss": 0.7044, + "step": 1956 + }, + { + "epoch": 0.5374897006316945, + "grad_norm": 1.2439848184585571, + "learning_rate": 4.910573038738948e-06, + "loss": 0.717, + "step": 1957 + }, + { + "epoch": 0.5377643504531722, + "grad_norm": 1.2594661712646484, + "learning_rate": 4.9104772785680734e-06, + "loss": 0.6606, + "step": 1958 + }, + { + "epoch": 0.5380390002746498, + "grad_norm": 1.2831999063491821, + "learning_rate": 4.910381468088391e-06, + "loss": 0.65, + "step": 1959 + }, + { + "epoch": 0.5383136500961274, + "grad_norm": 1.221526861190796, + "learning_rate": 4.910285607301901e-06, + "loss": 0.6542, + "step": 1960 + }, + { + "epoch": 0.538588299917605, + "grad_norm": 1.2474944591522217, + "learning_rate": 4.9101896962106045e-06, + "loss": 0.6345, + "step": 1961 + }, + { + "epoch": 0.5388629497390827, + "grad_norm": 1.236576795578003, + "learning_rate": 4.9100937348165035e-06, + "loss": 0.6321, + "step": 1962 + }, + { + "epoch": 0.5391375995605603, + "grad_norm": 1.2558410167694092, + "learning_rate": 4.909997723121599e-06, + "loss": 0.6566, + "step": 1963 + }, + { + "epoch": 0.5394122493820379, + "grad_norm": 1.1535310745239258, + "learning_rate": 4.909901661127898e-06, + "loss": 0.635, + "step": 1964 + }, + { + "epoch": 0.5396868992035155, + "grad_norm": 1.2175015211105347, + "learning_rate": 4.9098055488374015e-06, + "loss": 0.6532, + "step": 1965 + }, + { + "epoch": 0.5399615490249932, + "grad_norm": 1.192400336265564, + "learning_rate": 4.909709386252119e-06, + "loss": 0.5783, + "step": 1966 + }, + { + "epoch": 0.5402361988464708, + "grad_norm": 1.319879412651062, + "learning_rate": 4.909613173374056e-06, + "loss": 0.6971, + "step": 1967 + }, + { + "epoch": 0.5405108486679484, + "grad_norm": 1.2732102870941162, + "learning_rate": 4.909516910205221e-06, + "loss": 0.684, + "step": 1968 + }, + { + "epoch": 0.540785498489426, + "grad_norm": 1.2137372493743896, + "learning_rate": 4.9094205967476215e-06, + "loss": 0.692, + "step": 1969 + }, + { + "epoch": 0.5410601483109035, + "grad_norm": 1.302038311958313, + "learning_rate": 4.909324233003269e-06, + "loss": 0.6882, + "step": 1970 + }, + { + "epoch": 0.5413347981323812, + "grad_norm": 1.2042526006698608, + "learning_rate": 4.9092278189741746e-06, + "loss": 0.6245, + "step": 1971 + }, + { + "epoch": 0.5416094479538588, + "grad_norm": 1.3160183429718018, + "learning_rate": 4.909131354662351e-06, + "loss": 0.6413, + "step": 1972 + }, + { + "epoch": 0.5418840977753364, + "grad_norm": 1.2651569843292236, + "learning_rate": 4.909034840069811e-06, + "loss": 0.674, + "step": 1973 + }, + { + "epoch": 0.542158747596814, + "grad_norm": 1.288333535194397, + "learning_rate": 4.908938275198568e-06, + "loss": 0.7019, + "step": 1974 + }, + { + "epoch": 0.5424333974182917, + "grad_norm": 1.237622857093811, + "learning_rate": 4.908841660050639e-06, + "loss": 0.7581, + "step": 1975 + }, + { + "epoch": 0.5427080472397693, + "grad_norm": 1.2389169931411743, + "learning_rate": 4.90874499462804e-06, + "loss": 0.6636, + "step": 1976 + }, + { + "epoch": 0.5429826970612469, + "grad_norm": 1.1323224306106567, + "learning_rate": 4.908648278932787e-06, + "loss": 0.5867, + "step": 1977 + }, + { + "epoch": 0.5432573468827245, + "grad_norm": 1.373181700706482, + "learning_rate": 4.908551512966902e-06, + "loss": 0.6744, + "step": 1978 + }, + { + "epoch": 0.5435319967042022, + "grad_norm": 1.2507795095443726, + "learning_rate": 4.9084546967324e-06, + "loss": 0.6634, + "step": 1979 + }, + { + "epoch": 0.5438066465256798, + "grad_norm": 1.2166351079940796, + "learning_rate": 4.908357830231305e-06, + "loss": 0.6614, + "step": 1980 + }, + { + "epoch": 0.5440812963471574, + "grad_norm": 1.233439564704895, + "learning_rate": 4.908260913465638e-06, + "loss": 0.6815, + "step": 1981 + }, + { + "epoch": 0.544355946168635, + "grad_norm": 1.2122130393981934, + "learning_rate": 4.908163946437421e-06, + "loss": 0.6724, + "step": 1982 + }, + { + "epoch": 0.5446305959901127, + "grad_norm": 1.2877411842346191, + "learning_rate": 4.908066929148679e-06, + "loss": 0.7118, + "step": 1983 + }, + { + "epoch": 0.5449052458115903, + "grad_norm": 1.2203996181488037, + "learning_rate": 4.907969861601435e-06, + "loss": 0.6992, + "step": 1984 + }, + { + "epoch": 0.5451798956330678, + "grad_norm": 1.2708067893981934, + "learning_rate": 4.907872743797717e-06, + "loss": 0.6812, + "step": 1985 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.2340161800384521, + "learning_rate": 4.907775575739549e-06, + "loss": 0.6717, + "step": 1986 + }, + { + "epoch": 0.545729195276023, + "grad_norm": 1.273066759109497, + "learning_rate": 4.9076783574289635e-06, + "loss": 0.6984, + "step": 1987 + }, + { + "epoch": 0.5460038450975007, + "grad_norm": 1.4476913213729858, + "learning_rate": 4.907581088867985e-06, + "loss": 0.6832, + "step": 1988 + }, + { + "epoch": 0.5462784949189783, + "grad_norm": 1.228721022605896, + "learning_rate": 4.907483770058646e-06, + "loss": 0.677, + "step": 1989 + }, + { + "epoch": 0.5465531447404559, + "grad_norm": 1.2262204885482788, + "learning_rate": 4.907386401002977e-06, + "loss": 0.6694, + "step": 1990 + }, + { + "epoch": 0.5468277945619335, + "grad_norm": 1.2483885288238525, + "learning_rate": 4.907288981703011e-06, + "loss": 0.6746, + "step": 1991 + }, + { + "epoch": 0.5471024443834112, + "grad_norm": 1.247436761856079, + "learning_rate": 4.90719151216078e-06, + "loss": 0.6393, + "step": 1992 + }, + { + "epoch": 0.5473770942048888, + "grad_norm": 1.2660582065582275, + "learning_rate": 4.907093992378319e-06, + "loss": 0.6914, + "step": 1993 + }, + { + "epoch": 0.5476517440263664, + "grad_norm": 1.191460371017456, + "learning_rate": 4.906996422357663e-06, + "loss": 0.6543, + "step": 1994 + }, + { + "epoch": 0.547926393847844, + "grad_norm": 1.250543236732483, + "learning_rate": 4.906898802100849e-06, + "loss": 0.6899, + "step": 1995 + }, + { + "epoch": 0.5482010436693217, + "grad_norm": 1.2283161878585815, + "learning_rate": 4.906801131609913e-06, + "loss": 0.6688, + "step": 1996 + }, + { + "epoch": 0.5484756934907993, + "grad_norm": 1.2117525339126587, + "learning_rate": 4.906703410886895e-06, + "loss": 0.6532, + "step": 1997 + }, + { + "epoch": 0.5487503433122769, + "grad_norm": 1.2722893953323364, + "learning_rate": 4.906605639933834e-06, + "loss": 0.7009, + "step": 1998 + }, + { + "epoch": 0.5490249931337544, + "grad_norm": 1.209172248840332, + "learning_rate": 4.906507818752769e-06, + "loss": 0.6455, + "step": 1999 + }, + { + "epoch": 0.549299642955232, + "grad_norm": 1.2733027935028076, + "learning_rate": 4.906409947345745e-06, + "loss": 0.6739, + "step": 2000 + }, + { + "epoch": 0.5495742927767097, + "grad_norm": 1.2675704956054688, + "learning_rate": 4.9063120257148025e-06, + "loss": 0.6645, + "step": 2001 + }, + { + "epoch": 0.5498489425981873, + "grad_norm": 1.3162777423858643, + "learning_rate": 4.906214053861984e-06, + "loss": 0.6663, + "step": 2002 + }, + { + "epoch": 0.5501235924196649, + "grad_norm": 1.2362629175186157, + "learning_rate": 4.906116031789338e-06, + "loss": 0.6608, + "step": 2003 + }, + { + "epoch": 0.5503982422411425, + "grad_norm": 1.2309050559997559, + "learning_rate": 4.906017959498906e-06, + "loss": 0.5913, + "step": 2004 + }, + { + "epoch": 0.5506728920626202, + "grad_norm": 1.2447757720947266, + "learning_rate": 4.905919836992737e-06, + "loss": 0.6598, + "step": 2005 + }, + { + "epoch": 0.5509475418840978, + "grad_norm": 1.1878386735916138, + "learning_rate": 4.9058216642728805e-06, + "loss": 0.6516, + "step": 2006 + }, + { + "epoch": 0.5512221917055754, + "grad_norm": 1.2062187194824219, + "learning_rate": 4.905723441341382e-06, + "loss": 0.665, + "step": 2007 + }, + { + "epoch": 0.551496841527053, + "grad_norm": 1.1686571836471558, + "learning_rate": 4.905625168200294e-06, + "loss": 0.6634, + "step": 2008 + }, + { + "epoch": 0.5517714913485307, + "grad_norm": 1.3019970655441284, + "learning_rate": 4.9055268448516665e-06, + "loss": 0.6678, + "step": 2009 + }, + { + "epoch": 0.5520461411700083, + "grad_norm": 1.3820641040802002, + "learning_rate": 4.905428471297553e-06, + "loss": 0.6944, + "step": 2010 + }, + { + "epoch": 0.5523207909914859, + "grad_norm": 1.252486228942871, + "learning_rate": 4.905330047540004e-06, + "loss": 0.683, + "step": 2011 + }, + { + "epoch": 0.5525954408129635, + "grad_norm": 1.1959519386291504, + "learning_rate": 4.905231573581077e-06, + "loss": 0.6677, + "step": 2012 + }, + { + "epoch": 0.552870090634441, + "grad_norm": 1.258548378944397, + "learning_rate": 4.905133049422823e-06, + "loss": 0.7168, + "step": 2013 + }, + { + "epoch": 0.5531447404559187, + "grad_norm": 1.2629436254501343, + "learning_rate": 4.905034475067303e-06, + "loss": 0.6523, + "step": 2014 + }, + { + "epoch": 0.5534193902773963, + "grad_norm": 1.211830496788025, + "learning_rate": 4.904935850516571e-06, + "loss": 0.7069, + "step": 2015 + }, + { + "epoch": 0.5536940400988739, + "grad_norm": 1.2212965488433838, + "learning_rate": 4.904837175772686e-06, + "loss": 0.67, + "step": 2016 + }, + { + "epoch": 0.5539686899203515, + "grad_norm": 1.308441400527954, + "learning_rate": 4.904738450837709e-06, + "loss": 0.6649, + "step": 2017 + }, + { + "epoch": 0.5542433397418292, + "grad_norm": 1.1791369915008545, + "learning_rate": 4.904639675713699e-06, + "loss": 0.6237, + "step": 2018 + }, + { + "epoch": 0.5545179895633068, + "grad_norm": 1.1727653741836548, + "learning_rate": 4.904540850402718e-06, + "loss": 0.6367, + "step": 2019 + }, + { + "epoch": 0.5547926393847844, + "grad_norm": 1.229462742805481, + "learning_rate": 4.904441974906828e-06, + "loss": 0.6741, + "step": 2020 + }, + { + "epoch": 0.555067289206262, + "grad_norm": 1.2460815906524658, + "learning_rate": 4.904343049228094e-06, + "loss": 0.6882, + "step": 2021 + }, + { + "epoch": 0.5553419390277397, + "grad_norm": 1.174329400062561, + "learning_rate": 4.904244073368579e-06, + "loss": 0.597, + "step": 2022 + }, + { + "epoch": 0.5556165888492173, + "grad_norm": 1.2760900259017944, + "learning_rate": 4.9041450473303494e-06, + "loss": 0.6339, + "step": 2023 + }, + { + "epoch": 0.5558912386706949, + "grad_norm": 1.2218141555786133, + "learning_rate": 4.904045971115472e-06, + "loss": 0.6339, + "step": 2024 + }, + { + "epoch": 0.5561658884921725, + "grad_norm": 1.1406677961349487, + "learning_rate": 4.9039468447260155e-06, + "loss": 0.6264, + "step": 2025 + }, + { + "epoch": 0.5564405383136501, + "grad_norm": 1.2081279754638672, + "learning_rate": 4.903847668164047e-06, + "loss": 0.6358, + "step": 2026 + }, + { + "epoch": 0.5567151881351277, + "grad_norm": 1.2224445343017578, + "learning_rate": 4.903748441431638e-06, + "loss": 0.6865, + "step": 2027 + }, + { + "epoch": 0.5569898379566053, + "grad_norm": 1.2457295656204224, + "learning_rate": 4.903649164530858e-06, + "loss": 0.7071, + "step": 2028 + }, + { + "epoch": 0.5572644877780829, + "grad_norm": 1.191192388534546, + "learning_rate": 4.90354983746378e-06, + "loss": 0.6629, + "step": 2029 + }, + { + "epoch": 0.5575391375995605, + "grad_norm": 1.302304744720459, + "learning_rate": 4.9034504602324775e-06, + "loss": 0.6922, + "step": 2030 + }, + { + "epoch": 0.5578137874210382, + "grad_norm": 1.3755236864089966, + "learning_rate": 4.903351032839023e-06, + "loss": 0.7289, + "step": 2031 + }, + { + "epoch": 0.5580884372425158, + "grad_norm": 1.159969449043274, + "learning_rate": 4.903251555285493e-06, + "loss": 0.6495, + "step": 2032 + }, + { + "epoch": 0.5583630870639934, + "grad_norm": 1.2960714101791382, + "learning_rate": 4.903152027573963e-06, + "loss": 0.7078, + "step": 2033 + }, + { + "epoch": 0.558637736885471, + "grad_norm": 1.3595154285430908, + "learning_rate": 4.9030524497065105e-06, + "loss": 0.6288, + "step": 2034 + }, + { + "epoch": 0.5589123867069486, + "grad_norm": 1.2309448719024658, + "learning_rate": 4.902952821685214e-06, + "loss": 0.6664, + "step": 2035 + }, + { + "epoch": 0.5591870365284263, + "grad_norm": 1.2616173028945923, + "learning_rate": 4.902853143512152e-06, + "loss": 0.6483, + "step": 2036 + }, + { + "epoch": 0.5594616863499039, + "grad_norm": 1.274018406867981, + "learning_rate": 4.902753415189406e-06, + "loss": 0.7224, + "step": 2037 + }, + { + "epoch": 0.5597363361713815, + "grad_norm": 1.2363628149032593, + "learning_rate": 4.902653636719056e-06, + "loss": 0.6763, + "step": 2038 + }, + { + "epoch": 0.5600109859928591, + "grad_norm": 1.2123674154281616, + "learning_rate": 4.902553808103187e-06, + "loss": 0.6717, + "step": 2039 + }, + { + "epoch": 0.5602856358143368, + "grad_norm": 1.1387126445770264, + "learning_rate": 4.9024539293438786e-06, + "loss": 0.6061, + "step": 2040 + }, + { + "epoch": 0.5605602856358143, + "grad_norm": 1.1762044429779053, + "learning_rate": 4.902354000443219e-06, + "loss": 0.6459, + "step": 2041 + }, + { + "epoch": 0.5608349354572919, + "grad_norm": 1.205870509147644, + "learning_rate": 4.902254021403292e-06, + "loss": 0.6704, + "step": 2042 + }, + { + "epoch": 0.5611095852787695, + "grad_norm": 1.302602767944336, + "learning_rate": 4.902153992226184e-06, + "loss": 0.7054, + "step": 2043 + }, + { + "epoch": 0.5613842351002472, + "grad_norm": 1.2341861724853516, + "learning_rate": 4.902053912913985e-06, + "loss": 0.6745, + "step": 2044 + }, + { + "epoch": 0.5616588849217248, + "grad_norm": 1.20309579372406, + "learning_rate": 4.901953783468781e-06, + "loss": 0.6896, + "step": 2045 + }, + { + "epoch": 0.5619335347432024, + "grad_norm": 1.2418700456619263, + "learning_rate": 4.9018536038926626e-06, + "loss": 0.6644, + "step": 2046 + }, + { + "epoch": 0.56220818456468, + "grad_norm": 1.2392477989196777, + "learning_rate": 4.901753374187721e-06, + "loss": 0.6454, + "step": 2047 + }, + { + "epoch": 0.5624828343861576, + "grad_norm": 1.3440816402435303, + "learning_rate": 4.901653094356048e-06, + "loss": 0.7215, + "step": 2048 + }, + { + "epoch": 0.5627574842076353, + "grad_norm": 1.227992057800293, + "learning_rate": 4.901552764399738e-06, + "loss": 0.6512, + "step": 2049 + }, + { + "epoch": 0.5630321340291129, + "grad_norm": 1.3039536476135254, + "learning_rate": 4.901452384320882e-06, + "loss": 0.6737, + "step": 2050 + }, + { + "epoch": 0.5633067838505905, + "grad_norm": 1.2975107431411743, + "learning_rate": 4.901351954121576e-06, + "loss": 0.7014, + "step": 2051 + }, + { + "epoch": 0.5635814336720681, + "grad_norm": 1.2200485467910767, + "learning_rate": 4.901251473803918e-06, + "loss": 0.6365, + "step": 2052 + }, + { + "epoch": 0.5638560834935458, + "grad_norm": 1.2094225883483887, + "learning_rate": 4.901150943370002e-06, + "loss": 0.7001, + "step": 2053 + }, + { + "epoch": 0.5641307333150234, + "grad_norm": 1.215348720550537, + "learning_rate": 4.901050362821929e-06, + "loss": 0.6784, + "step": 2054 + }, + { + "epoch": 0.564405383136501, + "grad_norm": 1.1970093250274658, + "learning_rate": 4.900949732161797e-06, + "loss": 0.6845, + "step": 2055 + }, + { + "epoch": 0.5646800329579785, + "grad_norm": 1.1918702125549316, + "learning_rate": 4.900849051391706e-06, + "loss": 0.6899, + "step": 2056 + }, + { + "epoch": 0.5649546827794562, + "grad_norm": 1.2310571670532227, + "learning_rate": 4.900748320513759e-06, + "loss": 0.6881, + "step": 2057 + }, + { + "epoch": 0.5652293326009338, + "grad_norm": 1.3540955781936646, + "learning_rate": 4.9006475395300545e-06, + "loss": 0.6697, + "step": 2058 + }, + { + "epoch": 0.5655039824224114, + "grad_norm": 1.2424699068069458, + "learning_rate": 4.9005467084427e-06, + "loss": 0.6529, + "step": 2059 + }, + { + "epoch": 0.565778632243889, + "grad_norm": 1.2318272590637207, + "learning_rate": 4.900445827253798e-06, + "loss": 0.6824, + "step": 2060 + }, + { + "epoch": 0.5660532820653666, + "grad_norm": 1.2743127346038818, + "learning_rate": 4.900344895965454e-06, + "loss": 0.6856, + "step": 2061 + }, + { + "epoch": 0.5663279318868443, + "grad_norm": 1.2484874725341797, + "learning_rate": 4.900243914579775e-06, + "loss": 0.628, + "step": 2062 + }, + { + "epoch": 0.5666025817083219, + "grad_norm": 1.221451997756958, + "learning_rate": 4.900142883098869e-06, + "loss": 0.6506, + "step": 2063 + }, + { + "epoch": 0.5668772315297995, + "grad_norm": 1.2296631336212158, + "learning_rate": 4.900041801524843e-06, + "loss": 0.6737, + "step": 2064 + }, + { + "epoch": 0.5671518813512771, + "grad_norm": 1.2403110265731812, + "learning_rate": 4.899940669859807e-06, + "loss": 0.6972, + "step": 2065 + }, + { + "epoch": 0.5674265311727548, + "grad_norm": 1.2568553686141968, + "learning_rate": 4.899839488105873e-06, + "loss": 0.6529, + "step": 2066 + }, + { + "epoch": 0.5677011809942324, + "grad_norm": 1.210694432258606, + "learning_rate": 4.899738256265153e-06, + "loss": 0.6699, + "step": 2067 + }, + { + "epoch": 0.56797583081571, + "grad_norm": 1.2312426567077637, + "learning_rate": 4.899636974339758e-06, + "loss": 0.7246, + "step": 2068 + }, + { + "epoch": 0.5682504806371876, + "grad_norm": 1.177139401435852, + "learning_rate": 4.899535642331803e-06, + "loss": 0.7038, + "step": 2069 + }, + { + "epoch": 0.5685251304586652, + "grad_norm": 1.2626112699508667, + "learning_rate": 4.899434260243403e-06, + "loss": 0.6552, + "step": 2070 + }, + { + "epoch": 0.5687997802801428, + "grad_norm": 1.2337579727172852, + "learning_rate": 4.8993328280766725e-06, + "loss": 0.6381, + "step": 2071 + }, + { + "epoch": 0.5690744301016204, + "grad_norm": 1.2556040287017822, + "learning_rate": 4.8992313458337304e-06, + "loss": 0.7253, + "step": 2072 + }, + { + "epoch": 0.569349079923098, + "grad_norm": 1.3157798051834106, + "learning_rate": 4.899129813516693e-06, + "loss": 0.7052, + "step": 2073 + }, + { + "epoch": 0.5696237297445756, + "grad_norm": 1.245750069618225, + "learning_rate": 4.899028231127681e-06, + "loss": 0.6974, + "step": 2074 + }, + { + "epoch": 0.5698983795660533, + "grad_norm": 1.3790632486343384, + "learning_rate": 4.898926598668814e-06, + "loss": 0.6413, + "step": 2075 + }, + { + "epoch": 0.5701730293875309, + "grad_norm": 1.2547316551208496, + "learning_rate": 4.8988249161422115e-06, + "loss": 0.6694, + "step": 2076 + }, + { + "epoch": 0.5704476792090085, + "grad_norm": 1.3639119863510132, + "learning_rate": 4.898723183549999e-06, + "loss": 0.6711, + "step": 2077 + }, + { + "epoch": 0.5707223290304861, + "grad_norm": 1.2166539430618286, + "learning_rate": 4.898621400894297e-06, + "loss": 0.7204, + "step": 2078 + }, + { + "epoch": 0.5709969788519638, + "grad_norm": 1.2410130500793457, + "learning_rate": 4.89851956817723e-06, + "loss": 0.6484, + "step": 2079 + }, + { + "epoch": 0.5712716286734414, + "grad_norm": 1.2720575332641602, + "learning_rate": 4.8984176854009245e-06, + "loss": 0.6952, + "step": 2080 + }, + { + "epoch": 0.571546278494919, + "grad_norm": 1.186461091041565, + "learning_rate": 4.8983157525675076e-06, + "loss": 0.6354, + "step": 2081 + }, + { + "epoch": 0.5718209283163966, + "grad_norm": 1.1898760795593262, + "learning_rate": 4.898213769679104e-06, + "loss": 0.6231, + "step": 2082 + }, + { + "epoch": 0.5720955781378743, + "grad_norm": 1.179399847984314, + "learning_rate": 4.898111736737844e-06, + "loss": 0.6363, + "step": 2083 + }, + { + "epoch": 0.5723702279593518, + "grad_norm": 1.221977949142456, + "learning_rate": 4.898009653745856e-06, + "loss": 0.6281, + "step": 2084 + }, + { + "epoch": 0.5726448777808294, + "grad_norm": 1.3620784282684326, + "learning_rate": 4.897907520705273e-06, + "loss": 0.7137, + "step": 2085 + }, + { + "epoch": 0.572919527602307, + "grad_norm": 1.222396731376648, + "learning_rate": 4.897805337618224e-06, + "loss": 0.6584, + "step": 2086 + }, + { + "epoch": 0.5731941774237846, + "grad_norm": 1.2955621480941772, + "learning_rate": 4.897703104486843e-06, + "loss": 0.6623, + "step": 2087 + }, + { + "epoch": 0.5734688272452623, + "grad_norm": 1.2958590984344482, + "learning_rate": 4.897600821313263e-06, + "loss": 0.6864, + "step": 2088 + }, + { + "epoch": 0.5737434770667399, + "grad_norm": 1.1901214122772217, + "learning_rate": 4.897498488099619e-06, + "loss": 0.6969, + "step": 2089 + }, + { + "epoch": 0.5740181268882175, + "grad_norm": 1.2572015523910522, + "learning_rate": 4.897396104848047e-06, + "loss": 0.6668, + "step": 2090 + }, + { + "epoch": 0.5742927767096951, + "grad_norm": 1.2501463890075684, + "learning_rate": 4.897293671560684e-06, + "loss": 0.6756, + "step": 2091 + }, + { + "epoch": 0.5745674265311728, + "grad_norm": 1.2511216402053833, + "learning_rate": 4.897191188239667e-06, + "loss": 0.6591, + "step": 2092 + }, + { + "epoch": 0.5748420763526504, + "grad_norm": 1.234343409538269, + "learning_rate": 4.897088654887136e-06, + "loss": 0.6962, + "step": 2093 + }, + { + "epoch": 0.575116726174128, + "grad_norm": 1.1664408445358276, + "learning_rate": 4.8969860715052304e-06, + "loss": 0.6089, + "step": 2094 + }, + { + "epoch": 0.5753913759956056, + "grad_norm": 1.2181437015533447, + "learning_rate": 4.896883438096091e-06, + "loss": 0.6913, + "step": 2095 + }, + { + "epoch": 0.5756660258170833, + "grad_norm": 1.2467070817947388, + "learning_rate": 4.89678075466186e-06, + "loss": 0.7014, + "step": 2096 + }, + { + "epoch": 0.5759406756385609, + "grad_norm": 1.187426209449768, + "learning_rate": 4.89667802120468e-06, + "loss": 0.6639, + "step": 2097 + }, + { + "epoch": 0.5762153254600384, + "grad_norm": 1.3170231580734253, + "learning_rate": 4.896575237726697e-06, + "loss": 0.6921, + "step": 2098 + }, + { + "epoch": 0.576489975281516, + "grad_norm": 1.201523780822754, + "learning_rate": 4.896472404230054e-06, + "loss": 0.6452, + "step": 2099 + }, + { + "epoch": 0.5767646251029936, + "grad_norm": 1.184014916419983, + "learning_rate": 4.896369520716899e-06, + "loss": 0.6698, + "step": 2100 + }, + { + "epoch": 0.5770392749244713, + "grad_norm": 1.2314972877502441, + "learning_rate": 4.896266587189378e-06, + "loss": 0.7162, + "step": 2101 + }, + { + "epoch": 0.5773139247459489, + "grad_norm": 1.3073794841766357, + "learning_rate": 4.8961636036496396e-06, + "loss": 0.6634, + "step": 2102 + }, + { + "epoch": 0.5775885745674265, + "grad_norm": 1.2476046085357666, + "learning_rate": 4.896060570099833e-06, + "loss": 0.6593, + "step": 2103 + }, + { + "epoch": 0.5778632243889041, + "grad_norm": 1.3161057233810425, + "learning_rate": 4.895957486542109e-06, + "loss": 0.6935, + "step": 2104 + }, + { + "epoch": 0.5781378742103818, + "grad_norm": 1.163431167602539, + "learning_rate": 4.895854352978619e-06, + "loss": 0.6397, + "step": 2105 + }, + { + "epoch": 0.5784125240318594, + "grad_norm": 1.219389796257019, + "learning_rate": 4.895751169411516e-06, + "loss": 0.6532, + "step": 2106 + }, + { + "epoch": 0.578687173853337, + "grad_norm": 1.3288512229919434, + "learning_rate": 4.895647935842952e-06, + "loss": 0.6802, + "step": 2107 + }, + { + "epoch": 0.5789618236748146, + "grad_norm": 1.2355316877365112, + "learning_rate": 4.895544652275083e-06, + "loss": 0.7057, + "step": 2108 + }, + { + "epoch": 0.5792364734962923, + "grad_norm": 1.225499153137207, + "learning_rate": 4.895441318710063e-06, + "loss": 0.6473, + "step": 2109 + }, + { + "epoch": 0.5795111233177699, + "grad_norm": 1.2690250873565674, + "learning_rate": 4.8953379351500515e-06, + "loss": 0.6667, + "step": 2110 + }, + { + "epoch": 0.5797857731392475, + "grad_norm": 1.1994632482528687, + "learning_rate": 4.895234501597204e-06, + "loss": 0.6754, + "step": 2111 + }, + { + "epoch": 0.5800604229607251, + "grad_norm": 1.2496929168701172, + "learning_rate": 4.8951310180536795e-06, + "loss": 0.6651, + "step": 2112 + }, + { + "epoch": 0.5803350727822026, + "grad_norm": 1.2562726736068726, + "learning_rate": 4.895027484521638e-06, + "loss": 0.7103, + "step": 2113 + }, + { + "epoch": 0.5806097226036803, + "grad_norm": 1.2573413848876953, + "learning_rate": 4.8949239010032404e-06, + "loss": 0.6365, + "step": 2114 + }, + { + "epoch": 0.5808843724251579, + "grad_norm": 1.1448456048965454, + "learning_rate": 4.894820267500649e-06, + "loss": 0.6269, + "step": 2115 + }, + { + "epoch": 0.5811590222466355, + "grad_norm": 1.1917364597320557, + "learning_rate": 4.894716584016026e-06, + "loss": 0.6641, + "step": 2116 + }, + { + "epoch": 0.5814336720681131, + "grad_norm": 1.2027231454849243, + "learning_rate": 4.894612850551536e-06, + "loss": 0.6758, + "step": 2117 + }, + { + "epoch": 0.5817083218895908, + "grad_norm": 1.2884190082550049, + "learning_rate": 4.894509067109343e-06, + "loss": 0.6658, + "step": 2118 + }, + { + "epoch": 0.5819829717110684, + "grad_norm": 1.224510908126831, + "learning_rate": 4.894405233691613e-06, + "loss": 0.6598, + "step": 2119 + }, + { + "epoch": 0.582257621532546, + "grad_norm": 1.301540493965149, + "learning_rate": 4.894301350300516e-06, + "loss": 0.6885, + "step": 2120 + }, + { + "epoch": 0.5825322713540236, + "grad_norm": 1.2952022552490234, + "learning_rate": 4.8941974169382165e-06, + "loss": 0.6887, + "step": 2121 + }, + { + "epoch": 0.5828069211755013, + "grad_norm": 1.3403820991516113, + "learning_rate": 4.894093433606886e-06, + "loss": 0.6465, + "step": 2122 + }, + { + "epoch": 0.5830815709969789, + "grad_norm": 1.2007299661636353, + "learning_rate": 4.893989400308693e-06, + "loss": 0.6565, + "step": 2123 + }, + { + "epoch": 0.5833562208184565, + "grad_norm": 1.2581720352172852, + "learning_rate": 4.89388531704581e-06, + "loss": 0.6474, + "step": 2124 + }, + { + "epoch": 0.5836308706399341, + "grad_norm": 1.258731722831726, + "learning_rate": 4.8937811838204095e-06, + "loss": 0.6761, + "step": 2125 + }, + { + "epoch": 0.5839055204614118, + "grad_norm": 1.2340025901794434, + "learning_rate": 4.893677000634664e-06, + "loss": 0.6627, + "step": 2126 + }, + { + "epoch": 0.5841801702828893, + "grad_norm": 1.3587782382965088, + "learning_rate": 4.893572767490747e-06, + "loss": 0.6991, + "step": 2127 + }, + { + "epoch": 0.5844548201043669, + "grad_norm": 1.2615233659744263, + "learning_rate": 4.8934684843908365e-06, + "loss": 0.6505, + "step": 2128 + }, + { + "epoch": 0.5847294699258445, + "grad_norm": 1.3082506656646729, + "learning_rate": 4.893364151337107e-06, + "loss": 0.6848, + "step": 2129 + }, + { + "epoch": 0.5850041197473221, + "grad_norm": 1.2055296897888184, + "learning_rate": 4.893259768331736e-06, + "loss": 0.7113, + "step": 2130 + }, + { + "epoch": 0.5852787695687998, + "grad_norm": 1.2637176513671875, + "learning_rate": 4.893155335376904e-06, + "loss": 0.6823, + "step": 2131 + }, + { + "epoch": 0.5855534193902774, + "grad_norm": 1.3186132907867432, + "learning_rate": 4.893050852474789e-06, + "loss": 0.6689, + "step": 2132 + }, + { + "epoch": 0.585828069211755, + "grad_norm": 1.2590444087982178, + "learning_rate": 4.892946319627572e-06, + "loss": 0.6588, + "step": 2133 + }, + { + "epoch": 0.5861027190332326, + "grad_norm": 1.2568665742874146, + "learning_rate": 4.892841736837433e-06, + "loss": 0.6624, + "step": 2134 + }, + { + "epoch": 0.5863773688547103, + "grad_norm": 1.252236247062683, + "learning_rate": 4.892737104106559e-06, + "loss": 0.6799, + "step": 2135 + }, + { + "epoch": 0.5866520186761879, + "grad_norm": 1.3164221048355103, + "learning_rate": 4.89263242143713e-06, + "loss": 0.6915, + "step": 2136 + }, + { + "epoch": 0.5869266684976655, + "grad_norm": 1.2545945644378662, + "learning_rate": 4.892527688831331e-06, + "loss": 0.6694, + "step": 2137 + }, + { + "epoch": 0.5872013183191431, + "grad_norm": 1.2305253744125366, + "learning_rate": 4.892422906291351e-06, + "loss": 0.649, + "step": 2138 + }, + { + "epoch": 0.5874759681406208, + "grad_norm": 1.2493430376052856, + "learning_rate": 4.8923180738193735e-06, + "loss": 0.6593, + "step": 2139 + }, + { + "epoch": 0.5877506179620984, + "grad_norm": 1.1807249784469604, + "learning_rate": 4.892213191417588e-06, + "loss": 0.67, + "step": 2140 + }, + { + "epoch": 0.5880252677835759, + "grad_norm": 1.2274889945983887, + "learning_rate": 4.892108259088183e-06, + "loss": 0.665, + "step": 2141 + }, + { + "epoch": 0.5882999176050535, + "grad_norm": 1.2688651084899902, + "learning_rate": 4.892003276833348e-06, + "loss": 0.6828, + "step": 2142 + }, + { + "epoch": 0.5885745674265311, + "grad_norm": 1.222699522972107, + "learning_rate": 4.891898244655275e-06, + "loss": 0.6251, + "step": 2143 + }, + { + "epoch": 0.5888492172480088, + "grad_norm": 1.2583143711090088, + "learning_rate": 4.8917931625561575e-06, + "loss": 0.6587, + "step": 2144 + }, + { + "epoch": 0.5891238670694864, + "grad_norm": 1.178244709968567, + "learning_rate": 4.891688030538185e-06, + "loss": 0.6584, + "step": 2145 + }, + { + "epoch": 0.589398516890964, + "grad_norm": 1.2540377378463745, + "learning_rate": 4.891582848603555e-06, + "loss": 0.6487, + "step": 2146 + }, + { + "epoch": 0.5896731667124416, + "grad_norm": 1.2454533576965332, + "learning_rate": 4.891477616754461e-06, + "loss": 0.6713, + "step": 2147 + }, + { + "epoch": 0.5899478165339193, + "grad_norm": 1.230099081993103, + "learning_rate": 4.8913723349931005e-06, + "loss": 0.6333, + "step": 2148 + }, + { + "epoch": 0.5902224663553969, + "grad_norm": 1.211197853088379, + "learning_rate": 4.89126700332167e-06, + "loss": 0.6562, + "step": 2149 + }, + { + "epoch": 0.5904971161768745, + "grad_norm": 1.2441049814224243, + "learning_rate": 4.891161621742367e-06, + "loss": 0.6466, + "step": 2150 + }, + { + "epoch": 0.5907717659983521, + "grad_norm": 1.1923770904541016, + "learning_rate": 4.891056190257393e-06, + "loss": 0.6205, + "step": 2151 + }, + { + "epoch": 0.5910464158198298, + "grad_norm": 1.1704694032669067, + "learning_rate": 4.890950708868947e-06, + "loss": 0.6505, + "step": 2152 + }, + { + "epoch": 0.5913210656413074, + "grad_norm": 1.2466100454330444, + "learning_rate": 4.8908451775792305e-06, + "loss": 0.6795, + "step": 2153 + }, + { + "epoch": 0.591595715462785, + "grad_norm": 1.3013418912887573, + "learning_rate": 4.890739596390446e-06, + "loss": 0.7041, + "step": 2154 + }, + { + "epoch": 0.5918703652842625, + "grad_norm": 1.2820237874984741, + "learning_rate": 4.8906339653047985e-06, + "loss": 0.6854, + "step": 2155 + }, + { + "epoch": 0.5921450151057401, + "grad_norm": 1.206052303314209, + "learning_rate": 4.890528284324491e-06, + "loss": 0.5912, + "step": 2156 + }, + { + "epoch": 0.5924196649272178, + "grad_norm": 1.234548807144165, + "learning_rate": 4.89042255345173e-06, + "loss": 0.6767, + "step": 2157 + }, + { + "epoch": 0.5926943147486954, + "grad_norm": 1.1918962001800537, + "learning_rate": 4.8903167726887215e-06, + "loss": 0.6867, + "step": 2158 + }, + { + "epoch": 0.592968964570173, + "grad_norm": 1.1987545490264893, + "learning_rate": 4.890210942037674e-06, + "loss": 0.6098, + "step": 2159 + }, + { + "epoch": 0.5932436143916506, + "grad_norm": 1.2581249475479126, + "learning_rate": 4.890105061500795e-06, + "loss": 0.6699, + "step": 2160 + }, + { + "epoch": 0.5935182642131283, + "grad_norm": 1.26827871799469, + "learning_rate": 4.889999131080296e-06, + "loss": 0.6886, + "step": 2161 + }, + { + "epoch": 0.5937929140346059, + "grad_norm": 1.29579758644104, + "learning_rate": 4.889893150778387e-06, + "loss": 0.6833, + "step": 2162 + }, + { + "epoch": 0.5940675638560835, + "grad_norm": 1.2987114191055298, + "learning_rate": 4.88978712059728e-06, + "loss": 0.6585, + "step": 2163 + }, + { + "epoch": 0.5943422136775611, + "grad_norm": 1.3741060495376587, + "learning_rate": 4.889681040539188e-06, + "loss": 0.7398, + "step": 2164 + }, + { + "epoch": 0.5946168634990388, + "grad_norm": 1.133868932723999, + "learning_rate": 4.889574910606325e-06, + "loss": 0.6396, + "step": 2165 + }, + { + "epoch": 0.5948915133205164, + "grad_norm": 1.3257559537887573, + "learning_rate": 4.889468730800906e-06, + "loss": 0.692, + "step": 2166 + }, + { + "epoch": 0.595166163141994, + "grad_norm": 1.225637674331665, + "learning_rate": 4.889362501125147e-06, + "loss": 0.6729, + "step": 2167 + }, + { + "epoch": 0.5954408129634716, + "grad_norm": 1.2157444953918457, + "learning_rate": 4.889256221581266e-06, + "loss": 0.6639, + "step": 2168 + }, + { + "epoch": 0.5957154627849491, + "grad_norm": 1.218176245689392, + "learning_rate": 4.889149892171479e-06, + "loss": 0.6618, + "step": 2169 + }, + { + "epoch": 0.5959901126064268, + "grad_norm": 1.401928424835205, + "learning_rate": 4.889043512898006e-06, + "loss": 0.7237, + "step": 2170 + }, + { + "epoch": 0.5962647624279044, + "grad_norm": 1.1976895332336426, + "learning_rate": 4.888937083763069e-06, + "loss": 0.6576, + "step": 2171 + }, + { + "epoch": 0.596539412249382, + "grad_norm": 1.2470829486846924, + "learning_rate": 4.8888306047688875e-06, + "loss": 0.6886, + "step": 2172 + }, + { + "epoch": 0.5968140620708596, + "grad_norm": 1.2247908115386963, + "learning_rate": 4.8887240759176845e-06, + "loss": 0.6558, + "step": 2173 + }, + { + "epoch": 0.5970887118923373, + "grad_norm": 1.21309494972229, + "learning_rate": 4.888617497211683e-06, + "loss": 0.638, + "step": 2174 + }, + { + "epoch": 0.5973633617138149, + "grad_norm": 1.26221764087677, + "learning_rate": 4.888510868653107e-06, + "loss": 0.6882, + "step": 2175 + }, + { + "epoch": 0.5976380115352925, + "grad_norm": 1.2386534214019775, + "learning_rate": 4.888404190244183e-06, + "loss": 0.6324, + "step": 2176 + }, + { + "epoch": 0.5979126613567701, + "grad_norm": 1.294993281364441, + "learning_rate": 4.888297461987137e-06, + "loss": 0.653, + "step": 2177 + }, + { + "epoch": 0.5981873111782477, + "grad_norm": 1.271480679512024, + "learning_rate": 4.888190683884197e-06, + "loss": 0.6994, + "step": 2178 + }, + { + "epoch": 0.5984619609997254, + "grad_norm": 1.2364814281463623, + "learning_rate": 4.8880838559375895e-06, + "loss": 0.6892, + "step": 2179 + }, + { + "epoch": 0.598736610821203, + "grad_norm": 1.318196415901184, + "learning_rate": 4.887976978149547e-06, + "loss": 0.684, + "step": 2180 + }, + { + "epoch": 0.5990112606426806, + "grad_norm": 1.2638230323791504, + "learning_rate": 4.887870050522298e-06, + "loss": 0.6599, + "step": 2181 + }, + { + "epoch": 0.5992859104641582, + "grad_norm": 1.2311891317367554, + "learning_rate": 4.887763073058075e-06, + "loss": 0.6604, + "step": 2182 + }, + { + "epoch": 0.5995605602856359, + "grad_norm": 1.2555134296417236, + "learning_rate": 4.887656045759111e-06, + "loss": 0.7005, + "step": 2183 + }, + { + "epoch": 0.5998352101071134, + "grad_norm": 1.1683993339538574, + "learning_rate": 4.887548968627639e-06, + "loss": 0.6674, + "step": 2184 + }, + { + "epoch": 0.600109859928591, + "grad_norm": 1.217057466506958, + "learning_rate": 4.887441841665895e-06, + "loss": 0.6327, + "step": 2185 + }, + { + "epoch": 0.6003845097500686, + "grad_norm": 1.2163523435592651, + "learning_rate": 4.887334664876113e-06, + "loss": 0.6875, + "step": 2186 + }, + { + "epoch": 0.6006591595715463, + "grad_norm": 1.2383071184158325, + "learning_rate": 4.8872274382605314e-06, + "loss": 0.673, + "step": 2187 + }, + { + "epoch": 0.6009338093930239, + "grad_norm": 1.2378246784210205, + "learning_rate": 4.887120161821387e-06, + "loss": 0.6813, + "step": 2188 + }, + { + "epoch": 0.6012084592145015, + "grad_norm": 1.2653270959854126, + "learning_rate": 4.887012835560919e-06, + "loss": 0.7187, + "step": 2189 + }, + { + "epoch": 0.6014831090359791, + "grad_norm": 1.1576482057571411, + "learning_rate": 4.886905459481369e-06, + "loss": 0.6407, + "step": 2190 + }, + { + "epoch": 0.6017577588574567, + "grad_norm": 1.2121202945709229, + "learning_rate": 4.8867980335849765e-06, + "loss": 0.6859, + "step": 2191 + }, + { + "epoch": 0.6020324086789344, + "grad_norm": 1.1396483182907104, + "learning_rate": 4.8866905578739835e-06, + "loss": 0.6419, + "step": 2192 + }, + { + "epoch": 0.602307058500412, + "grad_norm": 1.295674443244934, + "learning_rate": 4.886583032350634e-06, + "loss": 0.6891, + "step": 2193 + }, + { + "epoch": 0.6025817083218896, + "grad_norm": 1.1752090454101562, + "learning_rate": 4.886475457017171e-06, + "loss": 0.6369, + "step": 2194 + }, + { + "epoch": 0.6028563581433672, + "grad_norm": 1.178135871887207, + "learning_rate": 4.88636783187584e-06, + "loss": 0.6258, + "step": 2195 + }, + { + "epoch": 0.6031310079648449, + "grad_norm": 1.2599819898605347, + "learning_rate": 4.8862601569288885e-06, + "loss": 0.698, + "step": 2196 + }, + { + "epoch": 0.6034056577863225, + "grad_norm": 1.1549804210662842, + "learning_rate": 4.886152432178562e-06, + "loss": 0.6406, + "step": 2197 + }, + { + "epoch": 0.6036803076078, + "grad_norm": 1.1930546760559082, + "learning_rate": 4.88604465762711e-06, + "loss": 0.6285, + "step": 2198 + }, + { + "epoch": 0.6039549574292776, + "grad_norm": 1.1717133522033691, + "learning_rate": 4.885936833276782e-06, + "loss": 0.6289, + "step": 2199 + }, + { + "epoch": 0.6042296072507553, + "grad_norm": 1.2476890087127686, + "learning_rate": 4.885828959129828e-06, + "loss": 0.6834, + "step": 2200 + }, + { + "epoch": 0.6045042570722329, + "grad_norm": 1.2336312532424927, + "learning_rate": 4.8857210351884985e-06, + "loss": 0.7013, + "step": 2201 + }, + { + "epoch": 0.6047789068937105, + "grad_norm": 1.2779195308685303, + "learning_rate": 4.8856130614550466e-06, + "loss": 0.6874, + "step": 2202 + }, + { + "epoch": 0.6050535567151881, + "grad_norm": 1.3194706439971924, + "learning_rate": 4.8855050379317265e-06, + "loss": 0.7071, + "step": 2203 + }, + { + "epoch": 0.6053282065366657, + "grad_norm": 1.2551568746566772, + "learning_rate": 4.885396964620792e-06, + "loss": 0.6947, + "step": 2204 + }, + { + "epoch": 0.6056028563581434, + "grad_norm": 1.2662042379379272, + "learning_rate": 4.885288841524499e-06, + "loss": 0.6658, + "step": 2205 + }, + { + "epoch": 0.605877506179621, + "grad_norm": 1.2178679704666138, + "learning_rate": 4.8851806686451045e-06, + "loss": 0.6478, + "step": 2206 + }, + { + "epoch": 0.6061521560010986, + "grad_norm": 1.2345565557479858, + "learning_rate": 4.8850724459848655e-06, + "loss": 0.6722, + "step": 2207 + }, + { + "epoch": 0.6064268058225762, + "grad_norm": 1.253940224647522, + "learning_rate": 4.88496417354604e-06, + "loss": 0.7174, + "step": 2208 + }, + { + "epoch": 0.6067014556440539, + "grad_norm": 1.2704260349273682, + "learning_rate": 4.88485585133089e-06, + "loss": 0.6953, + "step": 2209 + }, + { + "epoch": 0.6069761054655315, + "grad_norm": 1.1604411602020264, + "learning_rate": 4.884747479341674e-06, + "loss": 0.5993, + "step": 2210 + }, + { + "epoch": 0.6072507552870091, + "grad_norm": 1.2875036001205444, + "learning_rate": 4.884639057580655e-06, + "loss": 0.6991, + "step": 2211 + }, + { + "epoch": 0.6075254051084866, + "grad_norm": 1.2029407024383545, + "learning_rate": 4.884530586050095e-06, + "loss": 0.6822, + "step": 2212 + }, + { + "epoch": 0.6078000549299643, + "grad_norm": 1.1860824823379517, + "learning_rate": 4.884422064752259e-06, + "loss": 0.6878, + "step": 2213 + }, + { + "epoch": 0.6080747047514419, + "grad_norm": 1.181959629058838, + "learning_rate": 4.8843134936894125e-06, + "loss": 0.6693, + "step": 2214 + }, + { + "epoch": 0.6083493545729195, + "grad_norm": 1.1846363544464111, + "learning_rate": 4.884204872863819e-06, + "loss": 0.6373, + "step": 2215 + }, + { + "epoch": 0.6086240043943971, + "grad_norm": 1.2646026611328125, + "learning_rate": 4.884096202277747e-06, + "loss": 0.669, + "step": 2216 + }, + { + "epoch": 0.6088986542158747, + "grad_norm": 1.1456899642944336, + "learning_rate": 4.883987481933465e-06, + "loss": 0.6258, + "step": 2217 + }, + { + "epoch": 0.6091733040373524, + "grad_norm": 1.2833468914031982, + "learning_rate": 4.883878711833242e-06, + "loss": 0.6579, + "step": 2218 + }, + { + "epoch": 0.60944795385883, + "grad_norm": 1.269018292427063, + "learning_rate": 4.883769891979348e-06, + "loss": 0.6995, + "step": 2219 + }, + { + "epoch": 0.6097226036803076, + "grad_norm": 1.3242828845977783, + "learning_rate": 4.883661022374052e-06, + "loss": 0.7012, + "step": 2220 + }, + { + "epoch": 0.6099972535017852, + "grad_norm": 1.3044755458831787, + "learning_rate": 4.88355210301963e-06, + "loss": 0.7047, + "step": 2221 + }, + { + "epoch": 0.6102719033232629, + "grad_norm": 1.2469160556793213, + "learning_rate": 4.883443133918352e-06, + "loss": 0.6552, + "step": 2222 + }, + { + "epoch": 0.6105465531447405, + "grad_norm": 1.238265872001648, + "learning_rate": 4.883334115072494e-06, + "loss": 0.6841, + "step": 2223 + }, + { + "epoch": 0.6108212029662181, + "grad_norm": 1.215640902519226, + "learning_rate": 4.8832250464843314e-06, + "loss": 0.6676, + "step": 2224 + }, + { + "epoch": 0.6110958527876957, + "grad_norm": 1.2187350988388062, + "learning_rate": 4.883115928156139e-06, + "loss": 0.6508, + "step": 2225 + }, + { + "epoch": 0.6113705026091732, + "grad_norm": 1.236460566520691, + "learning_rate": 4.883006760090197e-06, + "loss": 0.6888, + "step": 2226 + }, + { + "epoch": 0.6116451524306509, + "grad_norm": 1.2136934995651245, + "learning_rate": 4.882897542288781e-06, + "loss": 0.6642, + "step": 2227 + }, + { + "epoch": 0.6119198022521285, + "grad_norm": 1.147387981414795, + "learning_rate": 4.882788274754171e-06, + "loss": 0.6271, + "step": 2228 + }, + { + "epoch": 0.6121944520736061, + "grad_norm": 1.180159330368042, + "learning_rate": 4.882678957488649e-06, + "loss": 0.6639, + "step": 2229 + }, + { + "epoch": 0.6124691018950837, + "grad_norm": 1.2671730518341064, + "learning_rate": 4.882569590494495e-06, + "loss": 0.6898, + "step": 2230 + }, + { + "epoch": 0.6127437517165614, + "grad_norm": 1.2480524778366089, + "learning_rate": 4.882460173773993e-06, + "loss": 0.73, + "step": 2231 + }, + { + "epoch": 0.613018401538039, + "grad_norm": 1.1568536758422852, + "learning_rate": 4.882350707329426e-06, + "loss": 0.6977, + "step": 2232 + }, + { + "epoch": 0.6132930513595166, + "grad_norm": 1.2081753015518188, + "learning_rate": 4.882241191163077e-06, + "loss": 0.6711, + "step": 2233 + }, + { + "epoch": 0.6135677011809942, + "grad_norm": 1.2658942937850952, + "learning_rate": 4.882131625277235e-06, + "loss": 0.6548, + "step": 2234 + }, + { + "epoch": 0.6138423510024719, + "grad_norm": 1.221195936203003, + "learning_rate": 4.882022009674183e-06, + "loss": 0.6465, + "step": 2235 + }, + { + "epoch": 0.6141170008239495, + "grad_norm": 1.3227347135543823, + "learning_rate": 4.881912344356213e-06, + "loss": 0.6378, + "step": 2236 + }, + { + "epoch": 0.6143916506454271, + "grad_norm": 1.2574020624160767, + "learning_rate": 4.88180262932561e-06, + "loss": 0.6686, + "step": 2237 + }, + { + "epoch": 0.6146663004669047, + "grad_norm": 1.207385778427124, + "learning_rate": 4.881692864584666e-06, + "loss": 0.6394, + "step": 2238 + }, + { + "epoch": 0.6149409502883824, + "grad_norm": 1.2668700218200684, + "learning_rate": 4.881583050135671e-06, + "loss": 0.6712, + "step": 2239 + }, + { + "epoch": 0.61521560010986, + "grad_norm": 1.189225673675537, + "learning_rate": 4.881473185980917e-06, + "loss": 0.6955, + "step": 2240 + }, + { + "epoch": 0.6154902499313375, + "grad_norm": 1.2239036560058594, + "learning_rate": 4.881363272122698e-06, + "loss": 0.6731, + "step": 2241 + }, + { + "epoch": 0.6157648997528151, + "grad_norm": 1.2910078763961792, + "learning_rate": 4.881253308563306e-06, + "loss": 0.6606, + "step": 2242 + }, + { + "epoch": 0.6160395495742927, + "grad_norm": 1.1568422317504883, + "learning_rate": 4.881143295305038e-06, + "loss": 0.6781, + "step": 2243 + }, + { + "epoch": 0.6163141993957704, + "grad_norm": 1.224657654762268, + "learning_rate": 4.8810332323501895e-06, + "loss": 0.6886, + "step": 2244 + }, + { + "epoch": 0.616588849217248, + "grad_norm": 1.1700243949890137, + "learning_rate": 4.880923119701057e-06, + "loss": 0.6846, + "step": 2245 + }, + { + "epoch": 0.6168634990387256, + "grad_norm": 1.2669962644577026, + "learning_rate": 4.880812957359939e-06, + "loss": 0.6994, + "step": 2246 + }, + { + "epoch": 0.6171381488602032, + "grad_norm": 1.244422197341919, + "learning_rate": 4.880702745329135e-06, + "loss": 0.7076, + "step": 2247 + }, + { + "epoch": 0.6174127986816809, + "grad_norm": 1.1967029571533203, + "learning_rate": 4.880592483610944e-06, + "loss": 0.6815, + "step": 2248 + }, + { + "epoch": 0.6176874485031585, + "grad_norm": 1.2065685987472534, + "learning_rate": 4.88048217220767e-06, + "loss": 0.6732, + "step": 2249 + }, + { + "epoch": 0.6179620983246361, + "grad_norm": 1.223342776298523, + "learning_rate": 4.880371811121612e-06, + "loss": 0.6416, + "step": 2250 + }, + { + "epoch": 0.6182367481461137, + "grad_norm": 1.2267885208129883, + "learning_rate": 4.880261400355076e-06, + "loss": 0.6066, + "step": 2251 + }, + { + "epoch": 0.6185113979675914, + "grad_norm": 1.270570993423462, + "learning_rate": 4.880150939910364e-06, + "loss": 0.6297, + "step": 2252 + }, + { + "epoch": 0.618786047789069, + "grad_norm": 1.268916368484497, + "learning_rate": 4.880040429789783e-06, + "loss": 0.6773, + "step": 2253 + }, + { + "epoch": 0.6190606976105466, + "grad_norm": 1.3055903911590576, + "learning_rate": 4.879929869995639e-06, + "loss": 0.7131, + "step": 2254 + }, + { + "epoch": 0.6193353474320241, + "grad_norm": 1.242005705833435, + "learning_rate": 4.8798192605302395e-06, + "loss": 0.682, + "step": 2255 + }, + { + "epoch": 0.6196099972535017, + "grad_norm": 1.3264429569244385, + "learning_rate": 4.879708601395894e-06, + "loss": 0.6655, + "step": 2256 + }, + { + "epoch": 0.6198846470749794, + "grad_norm": 1.2042170763015747, + "learning_rate": 4.8795978925949104e-06, + "loss": 0.6637, + "step": 2257 + }, + { + "epoch": 0.620159296896457, + "grad_norm": 1.213010311126709, + "learning_rate": 4.8794871341296e-06, + "loss": 0.687, + "step": 2258 + }, + { + "epoch": 0.6204339467179346, + "grad_norm": 1.2698134183883667, + "learning_rate": 4.879376326002274e-06, + "loss": 0.6795, + "step": 2259 + }, + { + "epoch": 0.6207085965394122, + "grad_norm": 1.1896153688430786, + "learning_rate": 4.879265468215246e-06, + "loss": 0.6338, + "step": 2260 + }, + { + "epoch": 0.6209832463608899, + "grad_norm": 1.323988914489746, + "learning_rate": 4.87915456077083e-06, + "loss": 0.6868, + "step": 2261 + }, + { + "epoch": 0.6212578961823675, + "grad_norm": 1.3363620042800903, + "learning_rate": 4.879043603671339e-06, + "loss": 0.7364, + "step": 2262 + }, + { + "epoch": 0.6215325460038451, + "grad_norm": 1.3053498268127441, + "learning_rate": 4.87893259691909e-06, + "loss": 0.6382, + "step": 2263 + }, + { + "epoch": 0.6218071958253227, + "grad_norm": 1.1417639255523682, + "learning_rate": 4.8788215405164e-06, + "loss": 0.6326, + "step": 2264 + }, + { + "epoch": 0.6220818456468004, + "grad_norm": 1.220378041267395, + "learning_rate": 4.878710434465585e-06, + "loss": 0.6522, + "step": 2265 + }, + { + "epoch": 0.622356495468278, + "grad_norm": 1.1986886262893677, + "learning_rate": 4.8785992787689665e-06, + "loss": 0.6409, + "step": 2266 + }, + { + "epoch": 0.6226311452897556, + "grad_norm": 1.3277795314788818, + "learning_rate": 4.878488073428862e-06, + "loss": 0.6529, + "step": 2267 + }, + { + "epoch": 0.6229057951112332, + "grad_norm": 1.3271198272705078, + "learning_rate": 4.878376818447594e-06, + "loss": 0.6952, + "step": 2268 + }, + { + "epoch": 0.6231804449327107, + "grad_norm": 1.3229132890701294, + "learning_rate": 4.878265513827485e-06, + "loss": 0.6611, + "step": 2269 + }, + { + "epoch": 0.6234550947541884, + "grad_norm": 1.188491702079773, + "learning_rate": 4.878154159570857e-06, + "loss": 0.6277, + "step": 2270 + }, + { + "epoch": 0.623729744575666, + "grad_norm": 1.2120506763458252, + "learning_rate": 4.8780427556800335e-06, + "loss": 0.6565, + "step": 2271 + }, + { + "epoch": 0.6240043943971436, + "grad_norm": 1.334118366241455, + "learning_rate": 4.87793130215734e-06, + "loss": 0.6533, + "step": 2272 + }, + { + "epoch": 0.6242790442186212, + "grad_norm": 1.23712158203125, + "learning_rate": 4.877819799005105e-06, + "loss": 0.6296, + "step": 2273 + }, + { + "epoch": 0.6245536940400989, + "grad_norm": 1.2372177839279175, + "learning_rate": 4.877708246225652e-06, + "loss": 0.6869, + "step": 2274 + }, + { + "epoch": 0.6248283438615765, + "grad_norm": 1.2098251581192017, + "learning_rate": 4.8775966438213106e-06, + "loss": 0.6746, + "step": 2275 + }, + { + "epoch": 0.6251029936830541, + "grad_norm": 1.2175664901733398, + "learning_rate": 4.87748499179441e-06, + "loss": 0.6953, + "step": 2276 + }, + { + "epoch": 0.6253776435045317, + "grad_norm": 1.1957833766937256, + "learning_rate": 4.877373290147282e-06, + "loss": 0.675, + "step": 2277 + }, + { + "epoch": 0.6256522933260094, + "grad_norm": 1.2842142581939697, + "learning_rate": 4.877261538882256e-06, + "loss": 0.6017, + "step": 2278 + }, + { + "epoch": 0.625926943147487, + "grad_norm": 1.2085833549499512, + "learning_rate": 4.877149738001665e-06, + "loss": 0.6566, + "step": 2279 + }, + { + "epoch": 0.6262015929689646, + "grad_norm": 1.1657382249832153, + "learning_rate": 4.877037887507842e-06, + "loss": 0.6436, + "step": 2280 + }, + { + "epoch": 0.6264762427904422, + "grad_norm": 1.302761435508728, + "learning_rate": 4.876925987403123e-06, + "loss": 0.6937, + "step": 2281 + }, + { + "epoch": 0.6267508926119199, + "grad_norm": 1.1853264570236206, + "learning_rate": 4.876814037689841e-06, + "loss": 0.6403, + "step": 2282 + }, + { + "epoch": 0.6270255424333974, + "grad_norm": 1.2788697481155396, + "learning_rate": 4.876702038370334e-06, + "loss": 0.6767, + "step": 2283 + }, + { + "epoch": 0.627300192254875, + "grad_norm": 1.2680929899215698, + "learning_rate": 4.8765899894469395e-06, + "loss": 0.6822, + "step": 2284 + }, + { + "epoch": 0.6275748420763526, + "grad_norm": 1.2173099517822266, + "learning_rate": 4.876477890921995e-06, + "loss": 0.6632, + "step": 2285 + }, + { + "epoch": 0.6278494918978302, + "grad_norm": 1.258073329925537, + "learning_rate": 4.876365742797842e-06, + "loss": 0.6865, + "step": 2286 + }, + { + "epoch": 0.6281241417193079, + "grad_norm": 1.243541955947876, + "learning_rate": 4.87625354507682e-06, + "loss": 0.6556, + "step": 2287 + }, + { + "epoch": 0.6283987915407855, + "grad_norm": 1.3018732070922852, + "learning_rate": 4.876141297761271e-06, + "loss": 0.6449, + "step": 2288 + }, + { + "epoch": 0.6286734413622631, + "grad_norm": 1.2847545146942139, + "learning_rate": 4.876029000853536e-06, + "loss": 0.6777, + "step": 2289 + }, + { + "epoch": 0.6289480911837407, + "grad_norm": 1.2862802743911743, + "learning_rate": 4.875916654355962e-06, + "loss": 0.6593, + "step": 2290 + }, + { + "epoch": 0.6292227410052184, + "grad_norm": 1.2520287036895752, + "learning_rate": 4.875804258270891e-06, + "loss": 0.6341, + "step": 2291 + }, + { + "epoch": 0.629497390826696, + "grad_norm": 1.2531334161758423, + "learning_rate": 4.875691812600669e-06, + "loss": 0.6855, + "step": 2292 + }, + { + "epoch": 0.6297720406481736, + "grad_norm": 1.291643500328064, + "learning_rate": 4.875579317347645e-06, + "loss": 0.6328, + "step": 2293 + }, + { + "epoch": 0.6300466904696512, + "grad_norm": 1.2634364366531372, + "learning_rate": 4.8754667725141646e-06, + "loss": 0.6458, + "step": 2294 + }, + { + "epoch": 0.6303213402911289, + "grad_norm": 1.2950358390808105, + "learning_rate": 4.875354178102578e-06, + "loss": 0.6685, + "step": 2295 + }, + { + "epoch": 0.6305959901126065, + "grad_norm": 1.232521653175354, + "learning_rate": 4.875241534115233e-06, + "loss": 0.653, + "step": 2296 + }, + { + "epoch": 0.630870639934084, + "grad_norm": 1.2499456405639648, + "learning_rate": 4.875128840554485e-06, + "loss": 0.6362, + "step": 2297 + }, + { + "epoch": 0.6311452897555616, + "grad_norm": 1.221954345703125, + "learning_rate": 4.8750160974226815e-06, + "loss": 0.6818, + "step": 2298 + }, + { + "epoch": 0.6314199395770392, + "grad_norm": 1.2163417339324951, + "learning_rate": 4.874903304722178e-06, + "loss": 0.6168, + "step": 2299 + }, + { + "epoch": 0.6316945893985169, + "grad_norm": 1.2078871726989746, + "learning_rate": 4.874790462455328e-06, + "loss": 0.6029, + "step": 2300 + }, + { + "epoch": 0.6319692392199945, + "grad_norm": 1.2508267164230347, + "learning_rate": 4.874677570624487e-06, + "loss": 0.6634, + "step": 2301 + }, + { + "epoch": 0.6322438890414721, + "grad_norm": 1.2729121446609497, + "learning_rate": 4.87456462923201e-06, + "loss": 0.6917, + "step": 2302 + }, + { + "epoch": 0.6325185388629497, + "grad_norm": 1.2348761558532715, + "learning_rate": 4.874451638280255e-06, + "loss": 0.6833, + "step": 2303 + }, + { + "epoch": 0.6327931886844274, + "grad_norm": 1.341267704963684, + "learning_rate": 4.8743385977715795e-06, + "loss": 0.6552, + "step": 2304 + }, + { + "epoch": 0.633067838505905, + "grad_norm": 1.1851223707199097, + "learning_rate": 4.874225507708344e-06, + "loss": 0.6408, + "step": 2305 + }, + { + "epoch": 0.6333424883273826, + "grad_norm": 1.191239833831787, + "learning_rate": 4.874112368092909e-06, + "loss": 0.6734, + "step": 2306 + }, + { + "epoch": 0.6336171381488602, + "grad_norm": 1.3290492296218872, + "learning_rate": 4.8739991789276345e-06, + "loss": 0.673, + "step": 2307 + }, + { + "epoch": 0.6338917879703378, + "grad_norm": 1.2048017978668213, + "learning_rate": 4.8738859402148815e-06, + "loss": 0.6636, + "step": 2308 + }, + { + "epoch": 0.6341664377918155, + "grad_norm": 1.3111512660980225, + "learning_rate": 4.8737726519570175e-06, + "loss": 0.6589, + "step": 2309 + }, + { + "epoch": 0.6344410876132931, + "grad_norm": 1.2591675519943237, + "learning_rate": 4.873659314156404e-06, + "loss": 0.6707, + "step": 2310 + }, + { + "epoch": 0.6347157374347707, + "grad_norm": 1.2288177013397217, + "learning_rate": 4.873545926815407e-06, + "loss": 0.6661, + "step": 2311 + }, + { + "epoch": 0.6349903872562482, + "grad_norm": 1.3324315547943115, + "learning_rate": 4.8734324899363935e-06, + "loss": 0.6702, + "step": 2312 + }, + { + "epoch": 0.6352650370777259, + "grad_norm": 1.2441694736480713, + "learning_rate": 4.873319003521731e-06, + "loss": 0.6701, + "step": 2313 + }, + { + "epoch": 0.6355396868992035, + "grad_norm": 1.302112340927124, + "learning_rate": 4.873205467573786e-06, + "loss": 0.6368, + "step": 2314 + }, + { + "epoch": 0.6358143367206811, + "grad_norm": 1.189063310623169, + "learning_rate": 4.873091882094931e-06, + "loss": 0.6412, + "step": 2315 + }, + { + "epoch": 0.6360889865421587, + "grad_norm": 1.1966521739959717, + "learning_rate": 4.872978247087535e-06, + "loss": 0.677, + "step": 2316 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.1490018367767334, + "learning_rate": 4.872864562553971e-06, + "loss": 0.6489, + "step": 2317 + }, + { + "epoch": 0.636638286185114, + "grad_norm": 1.2110377550125122, + "learning_rate": 4.8727508284966104e-06, + "loss": 0.649, + "step": 2318 + }, + { + "epoch": 0.6369129360065916, + "grad_norm": 1.192082166671753, + "learning_rate": 4.872637044917827e-06, + "loss": 0.6457, + "step": 2319 + }, + { + "epoch": 0.6371875858280692, + "grad_norm": 1.347945213317871, + "learning_rate": 4.8725232118199965e-06, + "loss": 0.6797, + "step": 2320 + }, + { + "epoch": 0.6374622356495468, + "grad_norm": 1.2579545974731445, + "learning_rate": 4.872409329205493e-06, + "loss": 0.6504, + "step": 2321 + }, + { + "epoch": 0.6377368854710245, + "grad_norm": 1.2010716199874878, + "learning_rate": 4.872295397076695e-06, + "loss": 0.7014, + "step": 2322 + }, + { + "epoch": 0.6380115352925021, + "grad_norm": 1.3290280103683472, + "learning_rate": 4.87218141543598e-06, + "loss": 0.665, + "step": 2323 + }, + { + "epoch": 0.6382861851139797, + "grad_norm": 1.2618565559387207, + "learning_rate": 4.872067384285727e-06, + "loss": 0.7366, + "step": 2324 + }, + { + "epoch": 0.6385608349354573, + "grad_norm": 1.279033899307251, + "learning_rate": 4.871953303628315e-06, + "loss": 0.6635, + "step": 2325 + }, + { + "epoch": 0.6388354847569349, + "grad_norm": 1.2714426517486572, + "learning_rate": 4.871839173466127e-06, + "loss": 0.6814, + "step": 2326 + }, + { + "epoch": 0.6391101345784125, + "grad_norm": 1.1830644607543945, + "learning_rate": 4.871724993801541e-06, + "loss": 0.6457, + "step": 2327 + }, + { + "epoch": 0.6393847843998901, + "grad_norm": 1.2252377271652222, + "learning_rate": 4.871610764636945e-06, + "loss": 0.6939, + "step": 2328 + }, + { + "epoch": 0.6396594342213677, + "grad_norm": 1.1912280321121216, + "learning_rate": 4.8714964859747195e-06, + "loss": 0.6416, + "step": 2329 + }, + { + "epoch": 0.6399340840428454, + "grad_norm": 1.2039128541946411, + "learning_rate": 4.8713821578172505e-06, + "loss": 0.6365, + "step": 2330 + }, + { + "epoch": 0.640208733864323, + "grad_norm": 1.1382946968078613, + "learning_rate": 4.871267780166925e-06, + "loss": 0.6527, + "step": 2331 + }, + { + "epoch": 0.6404833836858006, + "grad_norm": 1.18531334400177, + "learning_rate": 4.87115335302613e-06, + "loss": 0.6143, + "step": 2332 + }, + { + "epoch": 0.6407580335072782, + "grad_norm": 1.2161833047866821, + "learning_rate": 4.871038876397253e-06, + "loss": 0.6035, + "step": 2333 + }, + { + "epoch": 0.6410326833287558, + "grad_norm": 1.2760906219482422, + "learning_rate": 4.870924350282683e-06, + "loss": 0.6273, + "step": 2334 + }, + { + "epoch": 0.6413073331502335, + "grad_norm": 1.2057136297225952, + "learning_rate": 4.87080977468481e-06, + "loss": 0.6791, + "step": 2335 + }, + { + "epoch": 0.6415819829717111, + "grad_norm": 1.2507777214050293, + "learning_rate": 4.870695149606028e-06, + "loss": 0.6837, + "step": 2336 + }, + { + "epoch": 0.6418566327931887, + "grad_norm": 1.2726746797561646, + "learning_rate": 4.8705804750487264e-06, + "loss": 0.6447, + "step": 2337 + }, + { + "epoch": 0.6421312826146663, + "grad_norm": 1.1960442066192627, + "learning_rate": 4.8704657510153e-06, + "loss": 0.6396, + "step": 2338 + }, + { + "epoch": 0.642405932436144, + "grad_norm": 1.2517859935760498, + "learning_rate": 4.870350977508142e-06, + "loss": 0.6995, + "step": 2339 + }, + { + "epoch": 0.6426805822576215, + "grad_norm": 1.217170000076294, + "learning_rate": 4.870236154529649e-06, + "loss": 0.6712, + "step": 2340 + }, + { + "epoch": 0.6429552320790991, + "grad_norm": 1.2512516975402832, + "learning_rate": 4.8701212820822165e-06, + "loss": 0.7355, + "step": 2341 + }, + { + "epoch": 0.6432298819005767, + "grad_norm": 1.2561153173446655, + "learning_rate": 4.870006360168244e-06, + "loss": 0.6466, + "step": 2342 + }, + { + "epoch": 0.6435045317220544, + "grad_norm": 1.208702802658081, + "learning_rate": 4.869891388790127e-06, + "loss": 0.6714, + "step": 2343 + }, + { + "epoch": 0.643779181543532, + "grad_norm": 1.3221428394317627, + "learning_rate": 4.869776367950267e-06, + "loss": 0.7032, + "step": 2344 + }, + { + "epoch": 0.6440538313650096, + "grad_norm": 1.2560445070266724, + "learning_rate": 4.869661297651064e-06, + "loss": 0.6913, + "step": 2345 + }, + { + "epoch": 0.6443284811864872, + "grad_norm": 1.2235829830169678, + "learning_rate": 4.869546177894921e-06, + "loss": 0.6694, + "step": 2346 + }, + { + "epoch": 0.6446031310079648, + "grad_norm": 1.319300889968872, + "learning_rate": 4.869431008684238e-06, + "loss": 0.6526, + "step": 2347 + }, + { + "epoch": 0.6448777808294425, + "grad_norm": 1.2550545930862427, + "learning_rate": 4.869315790021421e-06, + "loss": 0.7084, + "step": 2348 + }, + { + "epoch": 0.6451524306509201, + "grad_norm": 1.1825112104415894, + "learning_rate": 4.869200521908873e-06, + "loss": 0.6154, + "step": 2349 + }, + { + "epoch": 0.6454270804723977, + "grad_norm": 1.3326940536499023, + "learning_rate": 4.869085204349001e-06, + "loss": 0.6944, + "step": 2350 + }, + { + "epoch": 0.6457017302938753, + "grad_norm": 1.3068625926971436, + "learning_rate": 4.868969837344211e-06, + "loss": 0.6688, + "step": 2351 + }, + { + "epoch": 0.645976380115353, + "grad_norm": 1.3186291456222534, + "learning_rate": 4.868854420896912e-06, + "loss": 0.6709, + "step": 2352 + }, + { + "epoch": 0.6462510299368306, + "grad_norm": 1.3176915645599365, + "learning_rate": 4.868738955009512e-06, + "loss": 0.7051, + "step": 2353 + }, + { + "epoch": 0.6465256797583081, + "grad_norm": 1.1829774379730225, + "learning_rate": 4.86862343968442e-06, + "loss": 0.6286, + "step": 2354 + }, + { + "epoch": 0.6468003295797857, + "grad_norm": 1.2580732107162476, + "learning_rate": 4.868507874924049e-06, + "loss": 0.6781, + "step": 2355 + }, + { + "epoch": 0.6470749794012634, + "grad_norm": 1.2304930686950684, + "learning_rate": 4.868392260730808e-06, + "loss": 0.6351, + "step": 2356 + }, + { + "epoch": 0.647349629222741, + "grad_norm": 1.1758105754852295, + "learning_rate": 4.868276597107113e-06, + "loss": 0.6467, + "step": 2357 + }, + { + "epoch": 0.6476242790442186, + "grad_norm": 1.2294155359268188, + "learning_rate": 4.8681608840553776e-06, + "loss": 0.6381, + "step": 2358 + }, + { + "epoch": 0.6478989288656962, + "grad_norm": 1.1753289699554443, + "learning_rate": 4.868045121578015e-06, + "loss": 0.6729, + "step": 2359 + }, + { + "epoch": 0.6481735786871738, + "grad_norm": 1.1876384019851685, + "learning_rate": 4.867929309677442e-06, + "loss": 0.654, + "step": 2360 + }, + { + "epoch": 0.6484482285086515, + "grad_norm": 1.2648404836654663, + "learning_rate": 4.867813448356076e-06, + "loss": 0.6436, + "step": 2361 + }, + { + "epoch": 0.6487228783301291, + "grad_norm": 1.2250540256500244, + "learning_rate": 4.867697537616335e-06, + "loss": 0.6302, + "step": 2362 + }, + { + "epoch": 0.6489975281516067, + "grad_norm": 1.206444501876831, + "learning_rate": 4.867581577460639e-06, + "loss": 0.6571, + "step": 2363 + }, + { + "epoch": 0.6492721779730843, + "grad_norm": 1.2882975339889526, + "learning_rate": 4.867465567891406e-06, + "loss": 0.6606, + "step": 2364 + }, + { + "epoch": 0.649546827794562, + "grad_norm": 1.309769630432129, + "learning_rate": 4.86734950891106e-06, + "loss": 0.6849, + "step": 2365 + }, + { + "epoch": 0.6498214776160396, + "grad_norm": 1.2941290140151978, + "learning_rate": 4.867233400522021e-06, + "loss": 0.6899, + "step": 2366 + }, + { + "epoch": 0.6500961274375172, + "grad_norm": 1.2072819471359253, + "learning_rate": 4.867117242726714e-06, + "loss": 0.6538, + "step": 2367 + }, + { + "epoch": 0.6503707772589948, + "grad_norm": 1.239656925201416, + "learning_rate": 4.867001035527562e-06, + "loss": 0.6821, + "step": 2368 + }, + { + "epoch": 0.6506454270804723, + "grad_norm": 1.218690276145935, + "learning_rate": 4.86688477892699e-06, + "loss": 0.6437, + "step": 2369 + }, + { + "epoch": 0.65092007690195, + "grad_norm": 1.2470159530639648, + "learning_rate": 4.866768472927427e-06, + "loss": 0.6401, + "step": 2370 + }, + { + "epoch": 0.6511947267234276, + "grad_norm": 1.2821930646896362, + "learning_rate": 4.866652117531297e-06, + "loss": 0.624, + "step": 2371 + }, + { + "epoch": 0.6514693765449052, + "grad_norm": 1.2682119607925415, + "learning_rate": 4.866535712741031e-06, + "loss": 0.674, + "step": 2372 + }, + { + "epoch": 0.6517440263663828, + "grad_norm": 1.194772720336914, + "learning_rate": 4.866419258559058e-06, + "loss": 0.667, + "step": 2373 + }, + { + "epoch": 0.6520186761878605, + "grad_norm": 1.3997645378112793, + "learning_rate": 4.866302754987806e-06, + "loss": 0.7026, + "step": 2374 + }, + { + "epoch": 0.6522933260093381, + "grad_norm": 1.280574083328247, + "learning_rate": 4.86618620202971e-06, + "loss": 0.6954, + "step": 2375 + }, + { + "epoch": 0.6525679758308157, + "grad_norm": 1.2607375383377075, + "learning_rate": 4.8660695996872015e-06, + "loss": 0.7106, + "step": 2376 + }, + { + "epoch": 0.6528426256522933, + "grad_norm": 1.299013376235962, + "learning_rate": 4.865952947962713e-06, + "loss": 0.6585, + "step": 2377 + }, + { + "epoch": 0.653117275473771, + "grad_norm": 1.1974748373031616, + "learning_rate": 4.865836246858681e-06, + "loss": 0.6231, + "step": 2378 + }, + { + "epoch": 0.6533919252952486, + "grad_norm": 1.3611178398132324, + "learning_rate": 4.865719496377538e-06, + "loss": 0.7022, + "step": 2379 + }, + { + "epoch": 0.6536665751167262, + "grad_norm": 1.2500081062316895, + "learning_rate": 4.865602696521724e-06, + "loss": 0.6753, + "step": 2380 + }, + { + "epoch": 0.6539412249382038, + "grad_norm": 1.3568782806396484, + "learning_rate": 4.865485847293676e-06, + "loss": 0.6806, + "step": 2381 + }, + { + "epoch": 0.6542158747596815, + "grad_norm": 1.2675652503967285, + "learning_rate": 4.865368948695831e-06, + "loss": 0.6643, + "step": 2382 + }, + { + "epoch": 0.654490524581159, + "grad_norm": 1.2089654207229614, + "learning_rate": 4.8652520007306305e-06, + "loss": 0.6853, + "step": 2383 + }, + { + "epoch": 0.6547651744026366, + "grad_norm": 1.186428427696228, + "learning_rate": 4.865135003400514e-06, + "loss": 0.6749, + "step": 2384 + }, + { + "epoch": 0.6550398242241142, + "grad_norm": 1.186400294303894, + "learning_rate": 4.865017956707925e-06, + "loss": 0.6379, + "step": 2385 + }, + { + "epoch": 0.6553144740455918, + "grad_norm": 1.1832771301269531, + "learning_rate": 4.864900860655305e-06, + "loss": 0.6598, + "step": 2386 + }, + { + "epoch": 0.6555891238670695, + "grad_norm": 1.2750558853149414, + "learning_rate": 4.864783715245098e-06, + "loss": 0.6289, + "step": 2387 + }, + { + "epoch": 0.6558637736885471, + "grad_norm": 1.2696483135223389, + "learning_rate": 4.864666520479751e-06, + "loss": 0.6234, + "step": 2388 + }, + { + "epoch": 0.6561384235100247, + "grad_norm": 1.235398530960083, + "learning_rate": 4.864549276361706e-06, + "loss": 0.6577, + "step": 2389 + }, + { + "epoch": 0.6564130733315023, + "grad_norm": 1.258173942565918, + "learning_rate": 4.864431982893414e-06, + "loss": 0.5999, + "step": 2390 + }, + { + "epoch": 0.65668772315298, + "grad_norm": 1.2648627758026123, + "learning_rate": 4.86431464007732e-06, + "loss": 0.6968, + "step": 2391 + }, + { + "epoch": 0.6569623729744576, + "grad_norm": 1.253382921218872, + "learning_rate": 4.864197247915875e-06, + "loss": 0.6455, + "step": 2392 + }, + { + "epoch": 0.6572370227959352, + "grad_norm": 1.2074750661849976, + "learning_rate": 4.864079806411528e-06, + "loss": 0.6399, + "step": 2393 + }, + { + "epoch": 0.6575116726174128, + "grad_norm": 1.2792136669158936, + "learning_rate": 4.86396231556673e-06, + "loss": 0.7321, + "step": 2394 + }, + { + "epoch": 0.6577863224388905, + "grad_norm": 1.1795686483383179, + "learning_rate": 4.863844775383935e-06, + "loss": 0.6288, + "step": 2395 + }, + { + "epoch": 0.6580609722603681, + "grad_norm": 1.2060192823410034, + "learning_rate": 4.863727185865594e-06, + "loss": 0.5987, + "step": 2396 + }, + { + "epoch": 0.6583356220818456, + "grad_norm": 1.2190200090408325, + "learning_rate": 4.863609547014162e-06, + "loss": 0.6456, + "step": 2397 + }, + { + "epoch": 0.6586102719033232, + "grad_norm": 1.1759698390960693, + "learning_rate": 4.863491858832095e-06, + "loss": 0.6959, + "step": 2398 + }, + { + "epoch": 0.6588849217248008, + "grad_norm": 1.2415157556533813, + "learning_rate": 4.8633741213218475e-06, + "loss": 0.6615, + "step": 2399 + }, + { + "epoch": 0.6591595715462785, + "grad_norm": 1.2711741924285889, + "learning_rate": 4.8632563344858775e-06, + "loss": 0.6713, + "step": 2400 + }, + { + "epoch": 0.6594342213677561, + "grad_norm": 1.1652781963348389, + "learning_rate": 4.8631384983266445e-06, + "loss": 0.6497, + "step": 2401 + }, + { + "epoch": 0.6597088711892337, + "grad_norm": 1.34406578540802, + "learning_rate": 4.8630206128466066e-06, + "loss": 0.6649, + "step": 2402 + }, + { + "epoch": 0.6599835210107113, + "grad_norm": 1.288146734237671, + "learning_rate": 4.862902678048224e-06, + "loss": 0.7039, + "step": 2403 + }, + { + "epoch": 0.660258170832189, + "grad_norm": 1.2775423526763916, + "learning_rate": 4.862784693933959e-06, + "loss": 0.7245, + "step": 2404 + }, + { + "epoch": 0.6605328206536666, + "grad_norm": 1.2249255180358887, + "learning_rate": 4.862666660506274e-06, + "loss": 0.6803, + "step": 2405 + }, + { + "epoch": 0.6608074704751442, + "grad_norm": 1.3164795637130737, + "learning_rate": 4.862548577767632e-06, + "loss": 0.7037, + "step": 2406 + }, + { + "epoch": 0.6610821202966218, + "grad_norm": 1.2183669805526733, + "learning_rate": 4.862430445720498e-06, + "loss": 0.6714, + "step": 2407 + }, + { + "epoch": 0.6613567701180995, + "grad_norm": 1.2691792249679565, + "learning_rate": 4.862312264367336e-06, + "loss": 0.6936, + "step": 2408 + }, + { + "epoch": 0.6616314199395771, + "grad_norm": 1.2474333047866821, + "learning_rate": 4.862194033710614e-06, + "loss": 0.6582, + "step": 2409 + }, + { + "epoch": 0.6619060697610547, + "grad_norm": 1.2718043327331543, + "learning_rate": 4.8620757537528e-06, + "loss": 0.6124, + "step": 2410 + }, + { + "epoch": 0.6621807195825322, + "grad_norm": 1.1839513778686523, + "learning_rate": 4.86195742449636e-06, + "loss": 0.6156, + "step": 2411 + }, + { + "epoch": 0.6624553694040098, + "grad_norm": 1.3037816286087036, + "learning_rate": 4.861839045943767e-06, + "loss": 0.6445, + "step": 2412 + }, + { + "epoch": 0.6627300192254875, + "grad_norm": 1.2805886268615723, + "learning_rate": 4.8617206180974905e-06, + "loss": 0.6687, + "step": 2413 + }, + { + "epoch": 0.6630046690469651, + "grad_norm": 1.1477116346359253, + "learning_rate": 4.861602140960001e-06, + "loss": 0.6591, + "step": 2414 + }, + { + "epoch": 0.6632793188684427, + "grad_norm": 1.2587929964065552, + "learning_rate": 4.861483614533772e-06, + "loss": 0.6596, + "step": 2415 + }, + { + "epoch": 0.6635539686899203, + "grad_norm": 1.2850815057754517, + "learning_rate": 4.861365038821278e-06, + "loss": 0.6907, + "step": 2416 + }, + { + "epoch": 0.663828618511398, + "grad_norm": 1.1940858364105225, + "learning_rate": 4.861246413824993e-06, + "loss": 0.5751, + "step": 2417 + }, + { + "epoch": 0.6641032683328756, + "grad_norm": 1.2289096117019653, + "learning_rate": 4.861127739547392e-06, + "loss": 0.7033, + "step": 2418 + }, + { + "epoch": 0.6643779181543532, + "grad_norm": 1.1941348314285278, + "learning_rate": 4.861009015990954e-06, + "loss": 0.6747, + "step": 2419 + }, + { + "epoch": 0.6646525679758308, + "grad_norm": 1.1660795211791992, + "learning_rate": 4.860890243158155e-06, + "loss": 0.6592, + "step": 2420 + }, + { + "epoch": 0.6649272177973085, + "grad_norm": 1.2096538543701172, + "learning_rate": 4.860771421051475e-06, + "loss": 0.6737, + "step": 2421 + }, + { + "epoch": 0.6652018676187861, + "grad_norm": 1.1565107107162476, + "learning_rate": 4.8606525496733925e-06, + "loss": 0.6807, + "step": 2422 + }, + { + "epoch": 0.6654765174402637, + "grad_norm": 1.2097201347351074, + "learning_rate": 4.8605336290263895e-06, + "loss": 0.6918, + "step": 2423 + }, + { + "epoch": 0.6657511672617413, + "grad_norm": 1.2859036922454834, + "learning_rate": 4.8604146591129485e-06, + "loss": 0.6386, + "step": 2424 + }, + { + "epoch": 0.6660258170832188, + "grad_norm": 1.2120131254196167, + "learning_rate": 4.860295639935552e-06, + "loss": 0.5995, + "step": 2425 + }, + { + "epoch": 0.6663004669046965, + "grad_norm": 1.1865559816360474, + "learning_rate": 4.8601765714966835e-06, + "loss": 0.6386, + "step": 2426 + }, + { + "epoch": 0.6665751167261741, + "grad_norm": 1.140697717666626, + "learning_rate": 4.8600574537988274e-06, + "loss": 0.6111, + "step": 2427 + }, + { + "epoch": 0.6668497665476517, + "grad_norm": 1.172806978225708, + "learning_rate": 4.859938286844472e-06, + "loss": 0.6299, + "step": 2428 + }, + { + "epoch": 0.6671244163691293, + "grad_norm": 1.278246283531189, + "learning_rate": 4.8598190706361035e-06, + "loss": 0.6308, + "step": 2429 + }, + { + "epoch": 0.667399066190607, + "grad_norm": 1.2665176391601562, + "learning_rate": 4.85969980517621e-06, + "loss": 0.623, + "step": 2430 + }, + { + "epoch": 0.6676737160120846, + "grad_norm": 1.2178493738174438, + "learning_rate": 4.85958049046728e-06, + "loss": 0.6639, + "step": 2431 + }, + { + "epoch": 0.6679483658335622, + "grad_norm": 1.4040766954421997, + "learning_rate": 4.859461126511804e-06, + "loss": 0.6738, + "step": 2432 + }, + { + "epoch": 0.6682230156550398, + "grad_norm": 1.2241830825805664, + "learning_rate": 4.859341713312274e-06, + "loss": 0.6348, + "step": 2433 + }, + { + "epoch": 0.6684976654765175, + "grad_norm": 1.253078579902649, + "learning_rate": 4.859222250871182e-06, + "loss": 0.6715, + "step": 2434 + }, + { + "epoch": 0.6687723152979951, + "grad_norm": 1.191145420074463, + "learning_rate": 4.859102739191021e-06, + "loss": 0.5935, + "step": 2435 + }, + { + "epoch": 0.6690469651194727, + "grad_norm": 1.2114653587341309, + "learning_rate": 4.858983178274284e-06, + "loss": 0.6686, + "step": 2436 + }, + { + "epoch": 0.6693216149409503, + "grad_norm": 1.252699613571167, + "learning_rate": 4.858863568123469e-06, + "loss": 0.7016, + "step": 2437 + }, + { + "epoch": 0.669596264762428, + "grad_norm": 1.1462637186050415, + "learning_rate": 4.85874390874107e-06, + "loss": 0.5495, + "step": 2438 + }, + { + "epoch": 0.6698709145839056, + "grad_norm": 1.168648600578308, + "learning_rate": 4.858624200129586e-06, + "loss": 0.658, + "step": 2439 + }, + { + "epoch": 0.6701455644053831, + "grad_norm": 1.2667863368988037, + "learning_rate": 4.858504442291515e-06, + "loss": 0.6481, + "step": 2440 + }, + { + "epoch": 0.6704202142268607, + "grad_norm": 1.228646993637085, + "learning_rate": 4.8583846352293556e-06, + "loss": 0.6761, + "step": 2441 + }, + { + "epoch": 0.6706948640483383, + "grad_norm": 1.1974778175354004, + "learning_rate": 4.8582647789456094e-06, + "loss": 0.628, + "step": 2442 + }, + { + "epoch": 0.670969513869816, + "grad_norm": 1.2580175399780273, + "learning_rate": 4.858144873442778e-06, + "loss": 0.6493, + "step": 2443 + }, + { + "epoch": 0.6712441636912936, + "grad_norm": 1.2110973596572876, + "learning_rate": 4.858024918723363e-06, + "loss": 0.6505, + "step": 2444 + }, + { + "epoch": 0.6715188135127712, + "grad_norm": 1.2425161600112915, + "learning_rate": 4.857904914789868e-06, + "loss": 0.6608, + "step": 2445 + }, + { + "epoch": 0.6717934633342488, + "grad_norm": 1.201805830001831, + "learning_rate": 4.857784861644798e-06, + "loss": 0.6554, + "step": 2446 + }, + { + "epoch": 0.6720681131557265, + "grad_norm": 1.277544379234314, + "learning_rate": 4.857664759290659e-06, + "loss": 0.693, + "step": 2447 + }, + { + "epoch": 0.6723427629772041, + "grad_norm": 1.1919485330581665, + "learning_rate": 4.857544607729957e-06, + "loss": 0.6694, + "step": 2448 + }, + { + "epoch": 0.6726174127986817, + "grad_norm": 1.117793083190918, + "learning_rate": 4.8574244069652e-06, + "loss": 0.62, + "step": 2449 + }, + { + "epoch": 0.6728920626201593, + "grad_norm": 1.2342332601547241, + "learning_rate": 4.857304156998897e-06, + "loss": 0.6805, + "step": 2450 + }, + { + "epoch": 0.673166712441637, + "grad_norm": 1.210235357284546, + "learning_rate": 4.8571838578335575e-06, + "loss": 0.6354, + "step": 2451 + }, + { + "epoch": 0.6734413622631146, + "grad_norm": 1.207582712173462, + "learning_rate": 4.857063509471691e-06, + "loss": 0.6253, + "step": 2452 + }, + { + "epoch": 0.6737160120845922, + "grad_norm": 1.209510326385498, + "learning_rate": 4.8569431119158106e-06, + "loss": 0.6571, + "step": 2453 + }, + { + "epoch": 0.6739906619060697, + "grad_norm": 1.248435139656067, + "learning_rate": 4.856822665168429e-06, + "loss": 0.6963, + "step": 2454 + }, + { + "epoch": 0.6742653117275473, + "grad_norm": 1.2797932624816895, + "learning_rate": 4.85670216923206e-06, + "loss": 0.6771, + "step": 2455 + }, + { + "epoch": 0.674539961549025, + "grad_norm": 1.1722526550292969, + "learning_rate": 4.856581624109219e-06, + "loss": 0.6626, + "step": 2456 + }, + { + "epoch": 0.6748146113705026, + "grad_norm": 1.287246584892273, + "learning_rate": 4.8564610298024206e-06, + "loss": 0.6904, + "step": 2457 + }, + { + "epoch": 0.6750892611919802, + "grad_norm": 1.1941591501235962, + "learning_rate": 4.8563403863141825e-06, + "loss": 0.6517, + "step": 2458 + }, + { + "epoch": 0.6753639110134578, + "grad_norm": 1.2583779096603394, + "learning_rate": 4.856219693647022e-06, + "loss": 0.6924, + "step": 2459 + }, + { + "epoch": 0.6756385608349355, + "grad_norm": 1.22472083568573, + "learning_rate": 4.856098951803459e-06, + "loss": 0.665, + "step": 2460 + }, + { + "epoch": 0.6759132106564131, + "grad_norm": 1.2232582569122314, + "learning_rate": 4.855978160786013e-06, + "loss": 0.6938, + "step": 2461 + }, + { + "epoch": 0.6761878604778907, + "grad_norm": 1.1918185949325562, + "learning_rate": 4.8558573205972045e-06, + "loss": 0.6759, + "step": 2462 + }, + { + "epoch": 0.6764625102993683, + "grad_norm": 1.2619127035140991, + "learning_rate": 4.855736431239557e-06, + "loss": 0.7208, + "step": 2463 + }, + { + "epoch": 0.676737160120846, + "grad_norm": 1.2984373569488525, + "learning_rate": 4.855615492715592e-06, + "loss": 0.6982, + "step": 2464 + }, + { + "epoch": 0.6770118099423236, + "grad_norm": 1.1893705129623413, + "learning_rate": 4.8554945050278346e-06, + "loss": 0.6601, + "step": 2465 + }, + { + "epoch": 0.6772864597638012, + "grad_norm": 1.182787537574768, + "learning_rate": 4.855373468178809e-06, + "loss": 0.64, + "step": 2466 + }, + { + "epoch": 0.6775611095852788, + "grad_norm": 1.2234127521514893, + "learning_rate": 4.855252382171043e-06, + "loss": 0.6672, + "step": 2467 + }, + { + "epoch": 0.6778357594067563, + "grad_norm": 1.1435353755950928, + "learning_rate": 4.8551312470070616e-06, + "loss": 0.6428, + "step": 2468 + }, + { + "epoch": 0.678110409228234, + "grad_norm": 1.2576814889907837, + "learning_rate": 4.855010062689395e-06, + "loss": 0.6894, + "step": 2469 + }, + { + "epoch": 0.6783850590497116, + "grad_norm": 1.2385004758834839, + "learning_rate": 4.854888829220571e-06, + "loss": 0.6215, + "step": 2470 + }, + { + "epoch": 0.6786597088711892, + "grad_norm": 1.2640084028244019, + "learning_rate": 4.85476754660312e-06, + "loss": 0.704, + "step": 2471 + }, + { + "epoch": 0.6789343586926668, + "grad_norm": 1.2403196096420288, + "learning_rate": 4.854646214839574e-06, + "loss": 0.6649, + "step": 2472 + }, + { + "epoch": 0.6792090085141445, + "grad_norm": 1.2175374031066895, + "learning_rate": 4.854524833932464e-06, + "loss": 0.699, + "step": 2473 + }, + { + "epoch": 0.6794836583356221, + "grad_norm": 1.2631479501724243, + "learning_rate": 4.8544034038843255e-06, + "loss": 0.6658, + "step": 2474 + }, + { + "epoch": 0.6797583081570997, + "grad_norm": 1.237913727760315, + "learning_rate": 4.854281924697692e-06, + "loss": 0.6958, + "step": 2475 + }, + { + "epoch": 0.6800329579785773, + "grad_norm": 1.2160992622375488, + "learning_rate": 4.854160396375098e-06, + "loss": 0.7018, + "step": 2476 + }, + { + "epoch": 0.680307607800055, + "grad_norm": 1.2331290245056152, + "learning_rate": 4.85403881891908e-06, + "loss": 0.6645, + "step": 2477 + }, + { + "epoch": 0.6805822576215326, + "grad_norm": 1.1668201684951782, + "learning_rate": 4.853917192332176e-06, + "loss": 0.6567, + "step": 2478 + }, + { + "epoch": 0.6808569074430102, + "grad_norm": 1.2155003547668457, + "learning_rate": 4.853795516616925e-06, + "loss": 0.663, + "step": 2479 + }, + { + "epoch": 0.6811315572644878, + "grad_norm": 1.2252116203308105, + "learning_rate": 4.8536737917758645e-06, + "loss": 0.6562, + "step": 2480 + }, + { + "epoch": 0.6814062070859654, + "grad_norm": 1.1795504093170166, + "learning_rate": 4.853552017811537e-06, + "loss": 0.6146, + "step": 2481 + }, + { + "epoch": 0.681680856907443, + "grad_norm": 1.1971715688705444, + "learning_rate": 4.853430194726484e-06, + "loss": 0.6187, + "step": 2482 + }, + { + "epoch": 0.6819555067289206, + "grad_norm": 1.2165309190750122, + "learning_rate": 4.853308322523246e-06, + "loss": 0.6503, + "step": 2483 + }, + { + "epoch": 0.6822301565503982, + "grad_norm": 1.2784132957458496, + "learning_rate": 4.853186401204369e-06, + "loss": 0.6954, + "step": 2484 + }, + { + "epoch": 0.6825048063718758, + "grad_norm": 1.2214373350143433, + "learning_rate": 4.853064430772396e-06, + "loss": 0.6624, + "step": 2485 + }, + { + "epoch": 0.6827794561933535, + "grad_norm": 1.1879428625106812, + "learning_rate": 4.852942411229873e-06, + "loss": 0.6463, + "step": 2486 + }, + { + "epoch": 0.6830541060148311, + "grad_norm": 1.2132809162139893, + "learning_rate": 4.8528203425793475e-06, + "loss": 0.6872, + "step": 2487 + }, + { + "epoch": 0.6833287558363087, + "grad_norm": 1.2330808639526367, + "learning_rate": 4.8526982248233665e-06, + "loss": 0.6389, + "step": 2488 + }, + { + "epoch": 0.6836034056577863, + "grad_norm": 1.2233047485351562, + "learning_rate": 4.852576057964478e-06, + "loss": 0.7463, + "step": 2489 + }, + { + "epoch": 0.683878055479264, + "grad_norm": 1.2381340265274048, + "learning_rate": 4.852453842005233e-06, + "loss": 0.6594, + "step": 2490 + }, + { + "epoch": 0.6841527053007416, + "grad_norm": 1.1593436002731323, + "learning_rate": 4.852331576948182e-06, + "loss": 0.6375, + "step": 2491 + }, + { + "epoch": 0.6844273551222192, + "grad_norm": 1.260753870010376, + "learning_rate": 4.852209262795876e-06, + "loss": 0.656, + "step": 2492 + }, + { + "epoch": 0.6847020049436968, + "grad_norm": 1.2004562616348267, + "learning_rate": 4.852086899550868e-06, + "loss": 0.6455, + "step": 2493 + }, + { + "epoch": 0.6849766547651744, + "grad_norm": 1.2131366729736328, + "learning_rate": 4.851964487215712e-06, + "loss": 0.6728, + "step": 2494 + }, + { + "epoch": 0.6852513045866521, + "grad_norm": 1.224953532218933, + "learning_rate": 4.8518420257929645e-06, + "loss": 0.641, + "step": 2495 + }, + { + "epoch": 0.6855259544081297, + "grad_norm": 1.2689448595046997, + "learning_rate": 4.8517195152851784e-06, + "loss": 0.7006, + "step": 2496 + }, + { + "epoch": 0.6858006042296072, + "grad_norm": 1.2415691614151, + "learning_rate": 4.851596955694913e-06, + "loss": 0.6715, + "step": 2497 + }, + { + "epoch": 0.6860752540510848, + "grad_norm": 1.1459202766418457, + "learning_rate": 4.851474347024725e-06, + "loss": 0.6471, + "step": 2498 + }, + { + "epoch": 0.6863499038725625, + "grad_norm": 1.156075358390808, + "learning_rate": 4.851351689277173e-06, + "loss": 0.6357, + "step": 2499 + }, + { + "epoch": 0.6866245536940401, + "grad_norm": 1.2385637760162354, + "learning_rate": 4.851228982454818e-06, + "loss": 0.6288, + "step": 2500 + }, + { + "epoch": 0.6868992035155177, + "grad_norm": 1.293927550315857, + "learning_rate": 4.851106226560221e-06, + "loss": 0.6473, + "step": 2501 + }, + { + "epoch": 0.6871738533369953, + "grad_norm": 1.28309965133667, + "learning_rate": 4.850983421595943e-06, + "loss": 0.6957, + "step": 2502 + }, + { + "epoch": 0.687448503158473, + "grad_norm": 1.28934907913208, + "learning_rate": 4.8508605675645485e-06, + "loss": 0.6696, + "step": 2503 + }, + { + "epoch": 0.6877231529799506, + "grad_norm": 1.2545020580291748, + "learning_rate": 4.850737664468601e-06, + "loss": 0.6265, + "step": 2504 + }, + { + "epoch": 0.6879978028014282, + "grad_norm": 1.2298269271850586, + "learning_rate": 4.850614712310664e-06, + "loss": 0.6535, + "step": 2505 + }, + { + "epoch": 0.6882724526229058, + "grad_norm": 1.2643858194351196, + "learning_rate": 4.850491711093306e-06, + "loss": 0.6817, + "step": 2506 + }, + { + "epoch": 0.6885471024443834, + "grad_norm": 1.2357481718063354, + "learning_rate": 4.850368660819092e-06, + "loss": 0.6474, + "step": 2507 + }, + { + "epoch": 0.6888217522658611, + "grad_norm": 1.3000752925872803, + "learning_rate": 4.850245561490593e-06, + "loss": 0.6771, + "step": 2508 + }, + { + "epoch": 0.6890964020873387, + "grad_norm": 1.2279030084609985, + "learning_rate": 4.850122413110375e-06, + "loss": 0.6652, + "step": 2509 + }, + { + "epoch": 0.6893710519088163, + "grad_norm": 1.2090377807617188, + "learning_rate": 4.8499992156810115e-06, + "loss": 0.6552, + "step": 2510 + }, + { + "epoch": 0.6896457017302938, + "grad_norm": 1.2738678455352783, + "learning_rate": 4.84987596920507e-06, + "loss": 0.6876, + "step": 2511 + }, + { + "epoch": 0.6899203515517714, + "grad_norm": 1.1904641389846802, + "learning_rate": 4.849752673685126e-06, + "loss": 0.6562, + "step": 2512 + }, + { + "epoch": 0.6901950013732491, + "grad_norm": 1.2919272184371948, + "learning_rate": 4.849629329123751e-06, + "loss": 0.691, + "step": 2513 + }, + { + "epoch": 0.6904696511947267, + "grad_norm": 1.2429072856903076, + "learning_rate": 4.8495059355235205e-06, + "loss": 0.6946, + "step": 2514 + }, + { + "epoch": 0.6907443010162043, + "grad_norm": 1.1734365224838257, + "learning_rate": 4.849382492887008e-06, + "loss": 0.6487, + "step": 2515 + }, + { + "epoch": 0.691018950837682, + "grad_norm": 1.2525792121887207, + "learning_rate": 4.8492590012167914e-06, + "loss": 0.6311, + "step": 2516 + }, + { + "epoch": 0.6912936006591596, + "grad_norm": 1.2563265562057495, + "learning_rate": 4.8491354605154485e-06, + "loss": 0.6758, + "step": 2517 + }, + { + "epoch": 0.6915682504806372, + "grad_norm": 1.17202627658844, + "learning_rate": 4.849011870785556e-06, + "loss": 0.6441, + "step": 2518 + }, + { + "epoch": 0.6918429003021148, + "grad_norm": 1.3151847124099731, + "learning_rate": 4.848888232029694e-06, + "loss": 0.6623, + "step": 2519 + }, + { + "epoch": 0.6921175501235924, + "grad_norm": 1.2007482051849365, + "learning_rate": 4.848764544250444e-06, + "loss": 0.6486, + "step": 2520 + }, + { + "epoch": 0.6923921999450701, + "grad_norm": 1.163457989692688, + "learning_rate": 4.848640807450387e-06, + "loss": 0.6458, + "step": 2521 + }, + { + "epoch": 0.6926668497665477, + "grad_norm": 1.251396656036377, + "learning_rate": 4.8485170216321035e-06, + "loss": 0.651, + "step": 2522 + }, + { + "epoch": 0.6929414995880253, + "grad_norm": 1.3481165170669556, + "learning_rate": 4.84839318679818e-06, + "loss": 0.6727, + "step": 2523 + }, + { + "epoch": 0.6932161494095029, + "grad_norm": 1.161405086517334, + "learning_rate": 4.848269302951199e-06, + "loss": 0.6471, + "step": 2524 + }, + { + "epoch": 0.6934907992309804, + "grad_norm": 1.2060315608978271, + "learning_rate": 4.848145370093747e-06, + "loss": 0.6516, + "step": 2525 + }, + { + "epoch": 0.6937654490524581, + "grad_norm": 1.2362033128738403, + "learning_rate": 4.84802138822841e-06, + "loss": 0.7058, + "step": 2526 + }, + { + "epoch": 0.6940400988739357, + "grad_norm": 1.2079367637634277, + "learning_rate": 4.847897357357777e-06, + "loss": 0.6564, + "step": 2527 + }, + { + "epoch": 0.6943147486954133, + "grad_norm": 1.217621088027954, + "learning_rate": 4.847773277484435e-06, + "loss": 0.6547, + "step": 2528 + }, + { + "epoch": 0.6945893985168909, + "grad_norm": 1.1994013786315918, + "learning_rate": 4.847649148610974e-06, + "loss": 0.6759, + "step": 2529 + }, + { + "epoch": 0.6948640483383686, + "grad_norm": 1.2285231351852417, + "learning_rate": 4.847524970739985e-06, + "loss": 0.6352, + "step": 2530 + }, + { + "epoch": 0.6951386981598462, + "grad_norm": 1.2501732110977173, + "learning_rate": 4.84740074387406e-06, + "loss": 0.6641, + "step": 2531 + }, + { + "epoch": 0.6954133479813238, + "grad_norm": 1.3032687902450562, + "learning_rate": 4.847276468015791e-06, + "loss": 0.6786, + "step": 2532 + }, + { + "epoch": 0.6956879978028014, + "grad_norm": 1.2566877603530884, + "learning_rate": 4.847152143167772e-06, + "loss": 0.6567, + "step": 2533 + }, + { + "epoch": 0.6959626476242791, + "grad_norm": 1.307905912399292, + "learning_rate": 4.847027769332599e-06, + "loss": 0.6819, + "step": 2534 + }, + { + "epoch": 0.6962372974457567, + "grad_norm": 1.1892006397247314, + "learning_rate": 4.846903346512866e-06, + "loss": 0.6973, + "step": 2535 + }, + { + "epoch": 0.6965119472672343, + "grad_norm": 1.191818118095398, + "learning_rate": 4.846778874711171e-06, + "loss": 0.6373, + "step": 2536 + }, + { + "epoch": 0.6967865970887119, + "grad_norm": 1.2529497146606445, + "learning_rate": 4.84665435393011e-06, + "loss": 0.6464, + "step": 2537 + }, + { + "epoch": 0.6970612469101896, + "grad_norm": 1.2488638162612915, + "learning_rate": 4.8465297841722845e-06, + "loss": 0.6824, + "step": 2538 + }, + { + "epoch": 0.6973358967316671, + "grad_norm": 1.2017219066619873, + "learning_rate": 4.846405165440292e-06, + "loss": 0.6126, + "step": 2539 + }, + { + "epoch": 0.6976105465531447, + "grad_norm": 1.2843046188354492, + "learning_rate": 4.846280497736735e-06, + "loss": 0.709, + "step": 2540 + }, + { + "epoch": 0.6978851963746223, + "grad_norm": 1.2667895555496216, + "learning_rate": 4.846155781064215e-06, + "loss": 0.6628, + "step": 2541 + }, + { + "epoch": 0.6981598461960999, + "grad_norm": 1.1944607496261597, + "learning_rate": 4.846031015425335e-06, + "loss": 0.6636, + "step": 2542 + }, + { + "epoch": 0.6984344960175776, + "grad_norm": 1.2559814453125, + "learning_rate": 4.845906200822699e-06, + "loss": 0.6962, + "step": 2543 + }, + { + "epoch": 0.6987091458390552, + "grad_norm": 1.14364755153656, + "learning_rate": 4.8457813372589115e-06, + "loss": 0.6919, + "step": 2544 + }, + { + "epoch": 0.6989837956605328, + "grad_norm": 1.3051791191101074, + "learning_rate": 4.8456564247365785e-06, + "loss": 0.6366, + "step": 2545 + }, + { + "epoch": 0.6992584454820104, + "grad_norm": 1.2098251581192017, + "learning_rate": 4.845531463258307e-06, + "loss": 0.6734, + "step": 2546 + }, + { + "epoch": 0.6995330953034881, + "grad_norm": 1.2383352518081665, + "learning_rate": 4.845406452826706e-06, + "loss": 0.6612, + "step": 2547 + }, + { + "epoch": 0.6998077451249657, + "grad_norm": 1.1820002794265747, + "learning_rate": 4.8452813934443846e-06, + "loss": 0.6345, + "step": 2548 + }, + { + "epoch": 0.7000823949464433, + "grad_norm": 1.2627513408660889, + "learning_rate": 4.8451562851139525e-06, + "loss": 0.6424, + "step": 2549 + }, + { + "epoch": 0.7003570447679209, + "grad_norm": 1.2354274988174438, + "learning_rate": 4.845031127838019e-06, + "loss": 0.6361, + "step": 2550 + }, + { + "epoch": 0.7006316945893986, + "grad_norm": 1.2363474369049072, + "learning_rate": 4.844905921619199e-06, + "loss": 0.7299, + "step": 2551 + }, + { + "epoch": 0.7009063444108762, + "grad_norm": 1.2248488664627075, + "learning_rate": 4.844780666460105e-06, + "loss": 0.6487, + "step": 2552 + }, + { + "epoch": 0.7011809942323537, + "grad_norm": 1.3126020431518555, + "learning_rate": 4.84465536236335e-06, + "loss": 0.6925, + "step": 2553 + }, + { + "epoch": 0.7014556440538313, + "grad_norm": 1.2258257865905762, + "learning_rate": 4.84453000933155e-06, + "loss": 0.6853, + "step": 2554 + }, + { + "epoch": 0.7017302938753089, + "grad_norm": 1.242316484451294, + "learning_rate": 4.84440460736732e-06, + "loss": 0.5938, + "step": 2555 + }, + { + "epoch": 0.7020049436967866, + "grad_norm": 1.2006852626800537, + "learning_rate": 4.84427915647328e-06, + "loss": 0.6849, + "step": 2556 + }, + { + "epoch": 0.7022795935182642, + "grad_norm": 1.2141653299331665, + "learning_rate": 4.8441536566520465e-06, + "loss": 0.6103, + "step": 2557 + }, + { + "epoch": 0.7025542433397418, + "grad_norm": 1.320063591003418, + "learning_rate": 4.844028107906238e-06, + "loss": 0.684, + "step": 2558 + }, + { + "epoch": 0.7028288931612194, + "grad_norm": 1.1797584295272827, + "learning_rate": 4.8439025102384765e-06, + "loss": 0.6259, + "step": 2559 + }, + { + "epoch": 0.7031035429826971, + "grad_norm": 1.256907343864441, + "learning_rate": 4.8437768636513825e-06, + "loss": 0.6878, + "step": 2560 + }, + { + "epoch": 0.7033781928041747, + "grad_norm": 1.183455228805542, + "learning_rate": 4.843651168147579e-06, + "loss": 0.6015, + "step": 2561 + }, + { + "epoch": 0.7036528426256523, + "grad_norm": 1.2778548002243042, + "learning_rate": 4.843525423729688e-06, + "loss": 0.6589, + "step": 2562 + }, + { + "epoch": 0.7039274924471299, + "grad_norm": 1.2166637182235718, + "learning_rate": 4.843399630400336e-06, + "loss": 0.6762, + "step": 2563 + }, + { + "epoch": 0.7042021422686076, + "grad_norm": 1.278424620628357, + "learning_rate": 4.843273788162146e-06, + "loss": 0.6658, + "step": 2564 + }, + { + "epoch": 0.7044767920900852, + "grad_norm": 1.2461270093917847, + "learning_rate": 4.843147897017746e-06, + "loss": 0.7333, + "step": 2565 + }, + { + "epoch": 0.7047514419115628, + "grad_norm": 1.2532498836517334, + "learning_rate": 4.8430219569697634e-06, + "loss": 0.6449, + "step": 2566 + }, + { + "epoch": 0.7050260917330404, + "grad_norm": 1.2417924404144287, + "learning_rate": 4.842895968020826e-06, + "loss": 0.6962, + "step": 2567 + }, + { + "epoch": 0.7053007415545179, + "grad_norm": 1.2479039430618286, + "learning_rate": 4.842769930173563e-06, + "loss": 0.6354, + "step": 2568 + }, + { + "epoch": 0.7055753913759956, + "grad_norm": 1.2602438926696777, + "learning_rate": 4.842643843430607e-06, + "loss": 0.6601, + "step": 2569 + }, + { + "epoch": 0.7058500411974732, + "grad_norm": 1.2029204368591309, + "learning_rate": 4.842517707794587e-06, + "loss": 0.6556, + "step": 2570 + }, + { + "epoch": 0.7061246910189508, + "grad_norm": 1.2296373844146729, + "learning_rate": 4.842391523268137e-06, + "loss": 0.6386, + "step": 2571 + }, + { + "epoch": 0.7063993408404284, + "grad_norm": 1.2428159713745117, + "learning_rate": 4.8422652898538905e-06, + "loss": 0.699, + "step": 2572 + }, + { + "epoch": 0.7066739906619061, + "grad_norm": 1.237544298171997, + "learning_rate": 4.842139007554482e-06, + "loss": 0.6565, + "step": 2573 + }, + { + "epoch": 0.7069486404833837, + "grad_norm": 1.1524006128311157, + "learning_rate": 4.8420126763725465e-06, + "loss": 0.6328, + "step": 2574 + }, + { + "epoch": 0.7072232903048613, + "grad_norm": 1.2504823207855225, + "learning_rate": 4.841886296310722e-06, + "loss": 0.6707, + "step": 2575 + }, + { + "epoch": 0.7074979401263389, + "grad_norm": 1.2813316583633423, + "learning_rate": 4.8417598673716435e-06, + "loss": 0.6831, + "step": 2576 + }, + { + "epoch": 0.7077725899478166, + "grad_norm": 1.2346826791763306, + "learning_rate": 4.8416333895579536e-06, + "loss": 0.644, + "step": 2577 + }, + { + "epoch": 0.7080472397692942, + "grad_norm": 1.2129617929458618, + "learning_rate": 4.841506862872288e-06, + "loss": 0.6292, + "step": 2578 + }, + { + "epoch": 0.7083218895907718, + "grad_norm": 1.1971428394317627, + "learning_rate": 4.841380287317291e-06, + "loss": 0.5952, + "step": 2579 + }, + { + "epoch": 0.7085965394122494, + "grad_norm": 1.233088731765747, + "learning_rate": 4.841253662895602e-06, + "loss": 0.6767, + "step": 2580 + }, + { + "epoch": 0.708871189233727, + "grad_norm": 1.1671273708343506, + "learning_rate": 4.841126989609864e-06, + "loss": 0.6513, + "step": 2581 + }, + { + "epoch": 0.7091458390552046, + "grad_norm": 1.237938642501831, + "learning_rate": 4.8410002674627215e-06, + "loss": 0.6678, + "step": 2582 + }, + { + "epoch": 0.7094204888766822, + "grad_norm": 1.2909393310546875, + "learning_rate": 4.840873496456819e-06, + "loss": 0.7051, + "step": 2583 + }, + { + "epoch": 0.7096951386981598, + "grad_norm": 1.2044832706451416, + "learning_rate": 4.840746676594802e-06, + "loss": 0.6657, + "step": 2584 + }, + { + "epoch": 0.7099697885196374, + "grad_norm": 1.2281615734100342, + "learning_rate": 4.840619807879319e-06, + "loss": 0.6467, + "step": 2585 + }, + { + "epoch": 0.7102444383411151, + "grad_norm": 1.2388803958892822, + "learning_rate": 4.840492890313016e-06, + "loss": 0.6889, + "step": 2586 + }, + { + "epoch": 0.7105190881625927, + "grad_norm": 1.204423427581787, + "learning_rate": 4.8403659238985415e-06, + "loss": 0.6576, + "step": 2587 + }, + { + "epoch": 0.7107937379840703, + "grad_norm": 1.1941070556640625, + "learning_rate": 4.840238908638547e-06, + "loss": 0.6683, + "step": 2588 + }, + { + "epoch": 0.7110683878055479, + "grad_norm": 1.1943819522857666, + "learning_rate": 4.840111844535682e-06, + "loss": 0.6424, + "step": 2589 + }, + { + "epoch": 0.7113430376270256, + "grad_norm": 1.1951406002044678, + "learning_rate": 4.8399847315926e-06, + "loss": 0.6165, + "step": 2590 + }, + { + "epoch": 0.7116176874485032, + "grad_norm": 1.1617677211761475, + "learning_rate": 4.839857569811953e-06, + "loss": 0.6631, + "step": 2591 + }, + { + "epoch": 0.7118923372699808, + "grad_norm": 1.2594715356826782, + "learning_rate": 4.839730359196395e-06, + "loss": 0.6508, + "step": 2592 + }, + { + "epoch": 0.7121669870914584, + "grad_norm": 1.1695960760116577, + "learning_rate": 4.839603099748581e-06, + "loss": 0.6418, + "step": 2593 + }, + { + "epoch": 0.712441636912936, + "grad_norm": 1.1300392150878906, + "learning_rate": 4.839475791471168e-06, + "loss": 0.6385, + "step": 2594 + }, + { + "epoch": 0.7127162867344137, + "grad_norm": 1.230018973350525, + "learning_rate": 4.8393484343668116e-06, + "loss": 0.6791, + "step": 2595 + }, + { + "epoch": 0.7129909365558912, + "grad_norm": 1.2432208061218262, + "learning_rate": 4.83922102843817e-06, + "loss": 0.6371, + "step": 2596 + }, + { + "epoch": 0.7132655863773688, + "grad_norm": 1.232212781906128, + "learning_rate": 4.839093573687904e-06, + "loss": 0.6505, + "step": 2597 + }, + { + "epoch": 0.7135402361988464, + "grad_norm": 1.231616735458374, + "learning_rate": 4.8389660701186716e-06, + "loss": 0.6639, + "step": 2598 + }, + { + "epoch": 0.7138148860203241, + "grad_norm": 1.2573962211608887, + "learning_rate": 4.838838517733134e-06, + "loss": 0.6411, + "step": 2599 + }, + { + "epoch": 0.7140895358418017, + "grad_norm": 1.2284060716629028, + "learning_rate": 4.838710916533955e-06, + "loss": 0.6763, + "step": 2600 + }, + { + "epoch": 0.7143641856632793, + "grad_norm": 1.1650574207305908, + "learning_rate": 4.8385832665237965e-06, + "loss": 0.6099, + "step": 2601 + }, + { + "epoch": 0.7146388354847569, + "grad_norm": 1.164918303489685, + "learning_rate": 4.838455567705323e-06, + "loss": 0.6149, + "step": 2602 + }, + { + "epoch": 0.7149134853062346, + "grad_norm": 1.2489311695098877, + "learning_rate": 4.8383278200812e-06, + "loss": 0.6643, + "step": 2603 + }, + { + "epoch": 0.7151881351277122, + "grad_norm": 1.1790887117385864, + "learning_rate": 4.838200023654093e-06, + "loss": 0.6272, + "step": 2604 + }, + { + "epoch": 0.7154627849491898, + "grad_norm": 1.3365055322647095, + "learning_rate": 4.838072178426669e-06, + "loss": 0.7046, + "step": 2605 + }, + { + "epoch": 0.7157374347706674, + "grad_norm": 1.2104628086090088, + "learning_rate": 4.837944284401598e-06, + "loss": 0.6559, + "step": 2606 + }, + { + "epoch": 0.716012084592145, + "grad_norm": 1.247556447982788, + "learning_rate": 4.837816341581548e-06, + "loss": 0.6372, + "step": 2607 + }, + { + "epoch": 0.7162867344136227, + "grad_norm": 1.205532193183899, + "learning_rate": 4.837688349969188e-06, + "loss": 0.6746, + "step": 2608 + }, + { + "epoch": 0.7165613842351003, + "grad_norm": 1.2280710935592651, + "learning_rate": 4.837560309567193e-06, + "loss": 0.6426, + "step": 2609 + }, + { + "epoch": 0.7168360340565778, + "grad_norm": 1.2986479997634888, + "learning_rate": 4.837432220378231e-06, + "loss": 0.6536, + "step": 2610 + }, + { + "epoch": 0.7171106838780554, + "grad_norm": 1.2752411365509033, + "learning_rate": 4.837304082404978e-06, + "loss": 0.6664, + "step": 2611 + }, + { + "epoch": 0.7173853336995331, + "grad_norm": 1.3562172651290894, + "learning_rate": 4.837175895650108e-06, + "loss": 0.6465, + "step": 2612 + }, + { + "epoch": 0.7176599835210107, + "grad_norm": 1.280902624130249, + "learning_rate": 4.837047660116295e-06, + "loss": 0.6016, + "step": 2613 + }, + { + "epoch": 0.7179346333424883, + "grad_norm": 1.2267950773239136, + "learning_rate": 4.836919375806218e-06, + "loss": 0.657, + "step": 2614 + }, + { + "epoch": 0.7182092831639659, + "grad_norm": 1.2640737295150757, + "learning_rate": 4.836791042722552e-06, + "loss": 0.6789, + "step": 2615 + }, + { + "epoch": 0.7184839329854436, + "grad_norm": 1.2577710151672363, + "learning_rate": 4.836662660867976e-06, + "loss": 0.6919, + "step": 2616 + }, + { + "epoch": 0.7187585828069212, + "grad_norm": 1.2735185623168945, + "learning_rate": 4.83653423024517e-06, + "loss": 0.6612, + "step": 2617 + }, + { + "epoch": 0.7190332326283988, + "grad_norm": 1.3190715312957764, + "learning_rate": 4.836405750856814e-06, + "loss": 0.7099, + "step": 2618 + }, + { + "epoch": 0.7193078824498764, + "grad_norm": 1.2188142538070679, + "learning_rate": 4.836277222705591e-06, + "loss": 0.6752, + "step": 2619 + }, + { + "epoch": 0.719582532271354, + "grad_norm": 1.2301660776138306, + "learning_rate": 4.836148645794181e-06, + "loss": 0.6469, + "step": 2620 + }, + { + "epoch": 0.7198571820928317, + "grad_norm": 1.2151820659637451, + "learning_rate": 4.836020020125269e-06, + "loss": 0.627, + "step": 2621 + }, + { + "epoch": 0.7201318319143093, + "grad_norm": 1.2913693189620972, + "learning_rate": 4.835891345701539e-06, + "loss": 0.7133, + "step": 2622 + }, + { + "epoch": 0.7204064817357869, + "grad_norm": 1.2656034231185913, + "learning_rate": 4.835762622525676e-06, + "loss": 0.6677, + "step": 2623 + }, + { + "epoch": 0.7206811315572645, + "grad_norm": 1.2697417736053467, + "learning_rate": 4.835633850600369e-06, + "loss": 0.6606, + "step": 2624 + }, + { + "epoch": 0.720955781378742, + "grad_norm": 1.2403780221939087, + "learning_rate": 4.835505029928302e-06, + "loss": 0.7062, + "step": 2625 + }, + { + "epoch": 0.7212304312002197, + "grad_norm": 1.2479169368743896, + "learning_rate": 4.835376160512167e-06, + "loss": 0.6291, + "step": 2626 + }, + { + "epoch": 0.7215050810216973, + "grad_norm": 1.215395450592041, + "learning_rate": 4.835247242354652e-06, + "loss": 0.6195, + "step": 2627 + }, + { + "epoch": 0.7217797308431749, + "grad_norm": 1.173272967338562, + "learning_rate": 4.835118275458447e-06, + "loss": 0.6486, + "step": 2628 + }, + { + "epoch": 0.7220543806646526, + "grad_norm": 1.268359661102295, + "learning_rate": 4.834989259826245e-06, + "loss": 0.6435, + "step": 2629 + }, + { + "epoch": 0.7223290304861302, + "grad_norm": 1.2664827108383179, + "learning_rate": 4.8348601954607374e-06, + "loss": 0.6693, + "step": 2630 + }, + { + "epoch": 0.7226036803076078, + "grad_norm": 1.1863359212875366, + "learning_rate": 4.83473108236462e-06, + "loss": 0.6585, + "step": 2631 + }, + { + "epoch": 0.7228783301290854, + "grad_norm": 1.1893763542175293, + "learning_rate": 4.8346019205405845e-06, + "loss": 0.6206, + "step": 2632 + }, + { + "epoch": 0.723152979950563, + "grad_norm": 1.239371657371521, + "learning_rate": 4.834472709991329e-06, + "loss": 0.6547, + "step": 2633 + }, + { + "epoch": 0.7234276297720407, + "grad_norm": 1.2348417043685913, + "learning_rate": 4.83434345071955e-06, + "loss": 0.6753, + "step": 2634 + }, + { + "epoch": 0.7237022795935183, + "grad_norm": 1.2620543241500854, + "learning_rate": 4.834214142727944e-06, + "loss": 0.6716, + "step": 2635 + }, + { + "epoch": 0.7239769294149959, + "grad_norm": 1.2233422994613647, + "learning_rate": 4.834084786019211e-06, + "loss": 0.6366, + "step": 2636 + }, + { + "epoch": 0.7242515792364735, + "grad_norm": 1.1760400533676147, + "learning_rate": 4.83395538059605e-06, + "loss": 0.7133, + "step": 2637 + }, + { + "epoch": 0.7245262290579512, + "grad_norm": 1.2795121669769287, + "learning_rate": 4.833825926461161e-06, + "loss": 0.7001, + "step": 2638 + }, + { + "epoch": 0.7248008788794287, + "grad_norm": 1.210072636604309, + "learning_rate": 4.8336964236172476e-06, + "loss": 0.6909, + "step": 2639 + }, + { + "epoch": 0.7250755287009063, + "grad_norm": 1.3078213930130005, + "learning_rate": 4.833566872067013e-06, + "loss": 0.6908, + "step": 2640 + }, + { + "epoch": 0.7253501785223839, + "grad_norm": 1.1938626766204834, + "learning_rate": 4.833437271813159e-06, + "loss": 0.5971, + "step": 2641 + }, + { + "epoch": 0.7256248283438615, + "grad_norm": 1.233149528503418, + "learning_rate": 4.8333076228583915e-06, + "loss": 0.6718, + "step": 2642 + }, + { + "epoch": 0.7258994781653392, + "grad_norm": 1.1853522062301636, + "learning_rate": 4.8331779252054165e-06, + "loss": 0.6472, + "step": 2643 + }, + { + "epoch": 0.7261741279868168, + "grad_norm": 1.247067928314209, + "learning_rate": 4.833048178856941e-06, + "loss": 0.6861, + "step": 2644 + }, + { + "epoch": 0.7264487778082944, + "grad_norm": 1.1603810787200928, + "learning_rate": 4.832918383815673e-06, + "loss": 0.6285, + "step": 2645 + }, + { + "epoch": 0.726723427629772, + "grad_norm": 1.259160041809082, + "learning_rate": 4.83278854008432e-06, + "loss": 0.6676, + "step": 2646 + }, + { + "epoch": 0.7269980774512497, + "grad_norm": 1.2981258630752563, + "learning_rate": 4.832658647665593e-06, + "loss": 0.6764, + "step": 2647 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.2524065971374512, + "learning_rate": 4.832528706562204e-06, + "loss": 0.6648, + "step": 2648 + }, + { + "epoch": 0.7275473770942049, + "grad_norm": 1.3255397081375122, + "learning_rate": 4.8323987167768636e-06, + "loss": 0.6976, + "step": 2649 + }, + { + "epoch": 0.7278220269156825, + "grad_norm": 1.3068585395812988, + "learning_rate": 4.832268678312285e-06, + "loss": 0.6777, + "step": 2650 + }, + { + "epoch": 0.7280966767371602, + "grad_norm": 1.260494589805603, + "learning_rate": 4.8321385911711834e-06, + "loss": 0.6846, + "step": 2651 + }, + { + "epoch": 0.7283713265586378, + "grad_norm": 1.2625644207000732, + "learning_rate": 4.8320084553562726e-06, + "loss": 0.7069, + "step": 2652 + }, + { + "epoch": 0.7286459763801153, + "grad_norm": 1.2137185335159302, + "learning_rate": 4.831878270870268e-06, + "loss": 0.6681, + "step": 2653 + }, + { + "epoch": 0.7289206262015929, + "grad_norm": 1.2252287864685059, + "learning_rate": 4.831748037715889e-06, + "loss": 0.7101, + "step": 2654 + }, + { + "epoch": 0.7291952760230705, + "grad_norm": 1.1803810596466064, + "learning_rate": 4.8316177558958514e-06, + "loss": 0.6104, + "step": 2655 + }, + { + "epoch": 0.7294699258445482, + "grad_norm": 1.206071138381958, + "learning_rate": 4.831487425412875e-06, + "loss": 0.6549, + "step": 2656 + }, + { + "epoch": 0.7297445756660258, + "grad_norm": 1.175026774406433, + "learning_rate": 4.83135704626968e-06, + "loss": 0.6463, + "step": 2657 + }, + { + "epoch": 0.7300192254875034, + "grad_norm": 1.2154968976974487, + "learning_rate": 4.831226618468989e-06, + "loss": 0.661, + "step": 2658 + }, + { + "epoch": 0.730293875308981, + "grad_norm": 1.197281002998352, + "learning_rate": 4.831096142013522e-06, + "loss": 0.6088, + "step": 2659 + }, + { + "epoch": 0.7305685251304587, + "grad_norm": 1.2784627676010132, + "learning_rate": 4.830965616906002e-06, + "loss": 0.6652, + "step": 2660 + }, + { + "epoch": 0.7308431749519363, + "grad_norm": 1.2173511981964111, + "learning_rate": 4.8308350431491555e-06, + "loss": 0.6452, + "step": 2661 + }, + { + "epoch": 0.7311178247734139, + "grad_norm": 1.3023452758789062, + "learning_rate": 4.8307044207457055e-06, + "loss": 0.6507, + "step": 2662 + }, + { + "epoch": 0.7313924745948915, + "grad_norm": 1.202516794204712, + "learning_rate": 4.83057374969838e-06, + "loss": 0.6582, + "step": 2663 + }, + { + "epoch": 0.7316671244163692, + "grad_norm": 1.1709731817245483, + "learning_rate": 4.8304430300099035e-06, + "loss": 0.6574, + "step": 2664 + }, + { + "epoch": 0.7319417742378468, + "grad_norm": 1.2910704612731934, + "learning_rate": 4.830312261683006e-06, + "loss": 0.6754, + "step": 2665 + }, + { + "epoch": 0.7322164240593244, + "grad_norm": 1.2408219575881958, + "learning_rate": 4.8301814447204184e-06, + "loss": 0.6991, + "step": 2666 + }, + { + "epoch": 0.7324910738808019, + "grad_norm": 1.2163797616958618, + "learning_rate": 4.830050579124869e-06, + "loss": 0.6538, + "step": 2667 + }, + { + "epoch": 0.7327657237022795, + "grad_norm": 1.1695199012756348, + "learning_rate": 4.8299196648990885e-06, + "loss": 0.6939, + "step": 2668 + }, + { + "epoch": 0.7330403735237572, + "grad_norm": 1.2975423336029053, + "learning_rate": 4.82978870204581e-06, + "loss": 0.6809, + "step": 2669 + }, + { + "epoch": 0.7333150233452348, + "grad_norm": 1.2584255933761597, + "learning_rate": 4.829657690567768e-06, + "loss": 0.6792, + "step": 2670 + }, + { + "epoch": 0.7335896731667124, + "grad_norm": 1.2603685855865479, + "learning_rate": 4.829526630467695e-06, + "loss": 0.6647, + "step": 2671 + }, + { + "epoch": 0.73386432298819, + "grad_norm": 1.3302973508834839, + "learning_rate": 4.8293955217483265e-06, + "loss": 0.7062, + "step": 2672 + }, + { + "epoch": 0.7341389728096677, + "grad_norm": 1.207607388496399, + "learning_rate": 4.8292643644123996e-06, + "loss": 0.6786, + "step": 2673 + }, + { + "epoch": 0.7344136226311453, + "grad_norm": 1.2517790794372559, + "learning_rate": 4.829133158462652e-06, + "loss": 0.6558, + "step": 2674 + }, + { + "epoch": 0.7346882724526229, + "grad_norm": 1.2418631315231323, + "learning_rate": 4.829001903901821e-06, + "loss": 0.6492, + "step": 2675 + }, + { + "epoch": 0.7349629222741005, + "grad_norm": 1.3598357439041138, + "learning_rate": 4.828870600732647e-06, + "loss": 0.6757, + "step": 2676 + }, + { + "epoch": 0.7352375720955782, + "grad_norm": 1.1709097623825073, + "learning_rate": 4.82873924895787e-06, + "loss": 0.6462, + "step": 2677 + }, + { + "epoch": 0.7355122219170558, + "grad_norm": 1.2608535289764404, + "learning_rate": 4.8286078485802315e-06, + "loss": 0.7087, + "step": 2678 + }, + { + "epoch": 0.7357868717385334, + "grad_norm": 1.2177619934082031, + "learning_rate": 4.828476399602473e-06, + "loss": 0.6479, + "step": 2679 + }, + { + "epoch": 0.736061521560011, + "grad_norm": 1.2774367332458496, + "learning_rate": 4.82834490202734e-06, + "loss": 0.6863, + "step": 2680 + }, + { + "epoch": 0.7363361713814885, + "grad_norm": 1.3956927061080933, + "learning_rate": 4.828213355857575e-06, + "loss": 0.6931, + "step": 2681 + }, + { + "epoch": 0.7366108212029662, + "grad_norm": 1.1557369232177734, + "learning_rate": 4.828081761095924e-06, + "loss": 0.5787, + "step": 2682 + }, + { + "epoch": 0.7368854710244438, + "grad_norm": 1.213106393814087, + "learning_rate": 4.827950117745134e-06, + "loss": 0.6648, + "step": 2683 + }, + { + "epoch": 0.7371601208459214, + "grad_norm": 1.2639710903167725, + "learning_rate": 4.827818425807952e-06, + "loss": 0.6672, + "step": 2684 + }, + { + "epoch": 0.737434770667399, + "grad_norm": 1.2266902923583984, + "learning_rate": 4.8276866852871275e-06, + "loss": 0.6814, + "step": 2685 + }, + { + "epoch": 0.7377094204888767, + "grad_norm": 1.2527778148651123, + "learning_rate": 4.827554896185409e-06, + "loss": 0.6781, + "step": 2686 + }, + { + "epoch": 0.7379840703103543, + "grad_norm": 1.2198469638824463, + "learning_rate": 4.827423058505547e-06, + "loss": 0.6665, + "step": 2687 + }, + { + "epoch": 0.7382587201318319, + "grad_norm": 1.277623176574707, + "learning_rate": 4.827291172250293e-06, + "loss": 0.6408, + "step": 2688 + }, + { + "epoch": 0.7385333699533095, + "grad_norm": 1.2744598388671875, + "learning_rate": 4.827159237422402e-06, + "loss": 0.6786, + "step": 2689 + }, + { + "epoch": 0.7388080197747872, + "grad_norm": 1.1946789026260376, + "learning_rate": 4.827027254024625e-06, + "loss": 0.7044, + "step": 2690 + }, + { + "epoch": 0.7390826695962648, + "grad_norm": 1.1860970258712769, + "learning_rate": 4.826895222059716e-06, + "loss": 0.6607, + "step": 2691 + }, + { + "epoch": 0.7393573194177424, + "grad_norm": 1.220557689666748, + "learning_rate": 4.8267631415304325e-06, + "loss": 0.6478, + "step": 2692 + }, + { + "epoch": 0.73963196923922, + "grad_norm": 1.1601228713989258, + "learning_rate": 4.82663101243953e-06, + "loss": 0.5719, + "step": 2693 + }, + { + "epoch": 0.7399066190606977, + "grad_norm": 1.2767890691757202, + "learning_rate": 4.826498834789768e-06, + "loss": 0.718, + "step": 2694 + }, + { + "epoch": 0.7401812688821753, + "grad_norm": 1.2557477951049805, + "learning_rate": 4.826366608583903e-06, + "loss": 0.6657, + "step": 2695 + }, + { + "epoch": 0.7404559187036528, + "grad_norm": 1.1222519874572754, + "learning_rate": 4.826234333824695e-06, + "loss": 0.5956, + "step": 2696 + }, + { + "epoch": 0.7407305685251304, + "grad_norm": 1.2512675523757935, + "learning_rate": 4.826102010514906e-06, + "loss": 0.669, + "step": 2697 + }, + { + "epoch": 0.741005218346608, + "grad_norm": 1.225512981414795, + "learning_rate": 4.825969638657297e-06, + "loss": 0.6228, + "step": 2698 + }, + { + "epoch": 0.7412798681680857, + "grad_norm": 1.2160779237747192, + "learning_rate": 4.8258372182546305e-06, + "loss": 0.6314, + "step": 2699 + }, + { + "epoch": 0.7415545179895633, + "grad_norm": 1.2221267223358154, + "learning_rate": 4.82570474930967e-06, + "loss": 0.6804, + "step": 2700 + }, + { + "epoch": 0.7418291678110409, + "grad_norm": 1.3084213733673096, + "learning_rate": 4.825572231825181e-06, + "loss": 0.7015, + "step": 2701 + }, + { + "epoch": 0.7421038176325185, + "grad_norm": 1.2539498805999756, + "learning_rate": 4.825439665803928e-06, + "loss": 0.6575, + "step": 2702 + }, + { + "epoch": 0.7423784674539962, + "grad_norm": 1.272747278213501, + "learning_rate": 4.825307051248679e-06, + "loss": 0.6662, + "step": 2703 + }, + { + "epoch": 0.7426531172754738, + "grad_norm": 1.19369375705719, + "learning_rate": 4.825174388162202e-06, + "loss": 0.652, + "step": 2704 + }, + { + "epoch": 0.7429277670969514, + "grad_norm": 1.2036484479904175, + "learning_rate": 4.825041676547265e-06, + "loss": 0.6494, + "step": 2705 + }, + { + "epoch": 0.743202416918429, + "grad_norm": 1.2332478761672974, + "learning_rate": 4.8249089164066376e-06, + "loss": 0.654, + "step": 2706 + }, + { + "epoch": 0.7434770667399067, + "grad_norm": 1.2555882930755615, + "learning_rate": 4.824776107743092e-06, + "loss": 0.6504, + "step": 2707 + }, + { + "epoch": 0.7437517165613843, + "grad_norm": 1.1814985275268555, + "learning_rate": 4.824643250559398e-06, + "loss": 0.6545, + "step": 2708 + }, + { + "epoch": 0.7440263663828619, + "grad_norm": 1.2091392278671265, + "learning_rate": 4.824510344858329e-06, + "loss": 0.6666, + "step": 2709 + }, + { + "epoch": 0.7443010162043394, + "grad_norm": 1.1880073547363281, + "learning_rate": 4.824377390642661e-06, + "loss": 0.5977, + "step": 2710 + }, + { + "epoch": 0.744575666025817, + "grad_norm": 1.268398404121399, + "learning_rate": 4.824244387915167e-06, + "loss": 0.654, + "step": 2711 + }, + { + "epoch": 0.7448503158472947, + "grad_norm": 1.3034236431121826, + "learning_rate": 4.824111336678622e-06, + "loss": 0.6888, + "step": 2712 + }, + { + "epoch": 0.7451249656687723, + "grad_norm": 1.2113943099975586, + "learning_rate": 4.8239782369358055e-06, + "loss": 0.6547, + "step": 2713 + }, + { + "epoch": 0.7453996154902499, + "grad_norm": 1.185414433479309, + "learning_rate": 4.823845088689493e-06, + "loss": 0.6662, + "step": 2714 + }, + { + "epoch": 0.7456742653117275, + "grad_norm": 1.2429478168487549, + "learning_rate": 4.823711891942464e-06, + "loss": 0.6594, + "step": 2715 + }, + { + "epoch": 0.7459489151332052, + "grad_norm": 1.2242335081100464, + "learning_rate": 4.8235786466975e-06, + "loss": 0.6395, + "step": 2716 + }, + { + "epoch": 0.7462235649546828, + "grad_norm": 1.2199525833129883, + "learning_rate": 4.82344535295738e-06, + "loss": 0.685, + "step": 2717 + }, + { + "epoch": 0.7464982147761604, + "grad_norm": 1.1755070686340332, + "learning_rate": 4.823312010724887e-06, + "loss": 0.6355, + "step": 2718 + }, + { + "epoch": 0.746772864597638, + "grad_norm": 1.2248190641403198, + "learning_rate": 4.823178620002803e-06, + "loss": 0.6708, + "step": 2719 + }, + { + "epoch": 0.7470475144191157, + "grad_norm": 1.2438082695007324, + "learning_rate": 4.823045180793914e-06, + "loss": 0.6218, + "step": 2720 + }, + { + "epoch": 0.7473221642405933, + "grad_norm": 1.2407666444778442, + "learning_rate": 4.822911693101002e-06, + "loss": 0.6954, + "step": 2721 + }, + { + "epoch": 0.7475968140620709, + "grad_norm": 1.1886253356933594, + "learning_rate": 4.822778156926856e-06, + "loss": 0.6222, + "step": 2722 + }, + { + "epoch": 0.7478714638835485, + "grad_norm": 1.2444713115692139, + "learning_rate": 4.822644572274261e-06, + "loss": 0.6503, + "step": 2723 + }, + { + "epoch": 0.748146113705026, + "grad_norm": 1.2187961339950562, + "learning_rate": 4.822510939146006e-06, + "loss": 0.6427, + "step": 2724 + }, + { + "epoch": 0.7484207635265037, + "grad_norm": 1.243812084197998, + "learning_rate": 4.82237725754488e-06, + "loss": 0.6217, + "step": 2725 + }, + { + "epoch": 0.7486954133479813, + "grad_norm": 1.2508264780044556, + "learning_rate": 4.822243527473672e-06, + "loss": 0.6699, + "step": 2726 + }, + { + "epoch": 0.7489700631694589, + "grad_norm": 1.3379255533218384, + "learning_rate": 4.8221097489351745e-06, + "loss": 0.6733, + "step": 2727 + }, + { + "epoch": 0.7492447129909365, + "grad_norm": 1.2983356714248657, + "learning_rate": 4.821975921932179e-06, + "loss": 0.6816, + "step": 2728 + }, + { + "epoch": 0.7495193628124142, + "grad_norm": 1.290490746498108, + "learning_rate": 4.821842046467478e-06, + "loss": 0.6845, + "step": 2729 + }, + { + "epoch": 0.7497940126338918, + "grad_norm": 1.1989301443099976, + "learning_rate": 4.821708122543867e-06, + "loss": 0.6628, + "step": 2730 + }, + { + "epoch": 0.7500686624553694, + "grad_norm": 1.2093989849090576, + "learning_rate": 4.821574150164139e-06, + "loss": 0.624, + "step": 2731 + }, + { + "epoch": 0.750343312276847, + "grad_norm": 1.3308185338974, + "learning_rate": 4.821440129331093e-06, + "loss": 0.662, + "step": 2732 + }, + { + "epoch": 0.7506179620983247, + "grad_norm": 1.284172534942627, + "learning_rate": 4.821306060047523e-06, + "loss": 0.7077, + "step": 2733 + }, + { + "epoch": 0.7508926119198023, + "grad_norm": 1.2883281707763672, + "learning_rate": 4.821171942316229e-06, + "loss": 0.6817, + "step": 2734 + }, + { + "epoch": 0.7511672617412799, + "grad_norm": 1.272843837738037, + "learning_rate": 4.821037776140011e-06, + "loss": 0.6654, + "step": 2735 + }, + { + "epoch": 0.7514419115627575, + "grad_norm": 1.2477424144744873, + "learning_rate": 4.820903561521667e-06, + "loss": 0.6785, + "step": 2736 + }, + { + "epoch": 0.7517165613842351, + "grad_norm": 1.234392762184143, + "learning_rate": 4.8207692984639994e-06, + "loss": 0.6905, + "step": 2737 + }, + { + "epoch": 0.7519912112057127, + "grad_norm": 1.1749627590179443, + "learning_rate": 4.82063498696981e-06, + "loss": 0.6334, + "step": 2738 + }, + { + "epoch": 0.7522658610271903, + "grad_norm": 1.25011146068573, + "learning_rate": 4.8205006270419016e-06, + "loss": 0.6583, + "step": 2739 + }, + { + "epoch": 0.7525405108486679, + "grad_norm": 1.2895532846450806, + "learning_rate": 4.82036621868308e-06, + "loss": 0.6826, + "step": 2740 + }, + { + "epoch": 0.7528151606701455, + "grad_norm": 1.1686209440231323, + "learning_rate": 4.820231761896149e-06, + "loss": 0.6326, + "step": 2741 + }, + { + "epoch": 0.7530898104916232, + "grad_norm": 1.339766025543213, + "learning_rate": 4.820097256683916e-06, + "loss": 0.7087, + "step": 2742 + }, + { + "epoch": 0.7533644603131008, + "grad_norm": 1.2019959688186646, + "learning_rate": 4.819962703049186e-06, + "loss": 0.6985, + "step": 2743 + }, + { + "epoch": 0.7536391101345784, + "grad_norm": 1.1824702024459839, + "learning_rate": 4.819828100994769e-06, + "loss": 0.6331, + "step": 2744 + }, + { + "epoch": 0.753913759956056, + "grad_norm": 1.2544387578964233, + "learning_rate": 4.819693450523474e-06, + "loss": 0.6643, + "step": 2745 + }, + { + "epoch": 0.7541884097775337, + "grad_norm": 1.1968138217926025, + "learning_rate": 4.819558751638112e-06, + "loss": 0.6389, + "step": 2746 + }, + { + "epoch": 0.7544630595990113, + "grad_norm": 1.205706238746643, + "learning_rate": 4.819424004341493e-06, + "loss": 0.6783, + "step": 2747 + }, + { + "epoch": 0.7547377094204889, + "grad_norm": 1.210109829902649, + "learning_rate": 4.8192892086364305e-06, + "loss": 0.673, + "step": 2748 + }, + { + "epoch": 0.7550123592419665, + "grad_norm": 1.23871910572052, + "learning_rate": 4.819154364525737e-06, + "loss": 0.671, + "step": 2749 + }, + { + "epoch": 0.7552870090634441, + "grad_norm": 1.238933801651001, + "learning_rate": 4.8190194720122255e-06, + "loss": 0.6166, + "step": 2750 + }, + { + "epoch": 0.7555616588849218, + "grad_norm": 1.1296334266662598, + "learning_rate": 4.8188845310987145e-06, + "loss": 0.6026, + "step": 2751 + }, + { + "epoch": 0.7558363087063994, + "grad_norm": 1.2344245910644531, + "learning_rate": 4.818749541788018e-06, + "loss": 0.6152, + "step": 2752 + }, + { + "epoch": 0.7561109585278769, + "grad_norm": 1.2464640140533447, + "learning_rate": 4.818614504082953e-06, + "loss": 0.6694, + "step": 2753 + }, + { + "epoch": 0.7563856083493545, + "grad_norm": 1.2384718656539917, + "learning_rate": 4.81847941798634e-06, + "loss": 0.6584, + "step": 2754 + }, + { + "epoch": 0.7566602581708322, + "grad_norm": 1.2772361040115356, + "learning_rate": 4.818344283500998e-06, + "loss": 0.6593, + "step": 2755 + }, + { + "epoch": 0.7569349079923098, + "grad_norm": 1.2625064849853516, + "learning_rate": 4.818209100629745e-06, + "loss": 0.639, + "step": 2756 + }, + { + "epoch": 0.7572095578137874, + "grad_norm": 1.259667992591858, + "learning_rate": 4.818073869375405e-06, + "loss": 0.6963, + "step": 2757 + }, + { + "epoch": 0.757484207635265, + "grad_norm": 1.2347222566604614, + "learning_rate": 4.817938589740798e-06, + "loss": 0.6827, + "step": 2758 + }, + { + "epoch": 0.7577588574567427, + "grad_norm": 1.1931219100952148, + "learning_rate": 4.81780326172875e-06, + "loss": 0.7046, + "step": 2759 + }, + { + "epoch": 0.7580335072782203, + "grad_norm": 1.2553517818450928, + "learning_rate": 4.8176678853420835e-06, + "loss": 0.654, + "step": 2760 + }, + { + "epoch": 0.7583081570996979, + "grad_norm": 1.2340283393859863, + "learning_rate": 4.817532460583625e-06, + "loss": 0.6421, + "step": 2761 + }, + { + "epoch": 0.7585828069211755, + "grad_norm": 1.1767640113830566, + "learning_rate": 4.817396987456201e-06, + "loss": 0.6287, + "step": 2762 + }, + { + "epoch": 0.7588574567426531, + "grad_norm": 1.2702959775924683, + "learning_rate": 4.817261465962638e-06, + "loss": 0.6631, + "step": 2763 + }, + { + "epoch": 0.7591321065641308, + "grad_norm": 1.2662761211395264, + "learning_rate": 4.817125896105765e-06, + "loss": 0.6873, + "step": 2764 + }, + { + "epoch": 0.7594067563856084, + "grad_norm": 1.2545751333236694, + "learning_rate": 4.8169902778884105e-06, + "loss": 0.6407, + "step": 2765 + }, + { + "epoch": 0.759681406207086, + "grad_norm": 1.3118904829025269, + "learning_rate": 4.8168546113134075e-06, + "loss": 0.6768, + "step": 2766 + }, + { + "epoch": 0.7599560560285635, + "grad_norm": 1.2252416610717773, + "learning_rate": 4.816718896383584e-06, + "loss": 0.6342, + "step": 2767 + }, + { + "epoch": 0.7602307058500412, + "grad_norm": 1.2121387720108032, + "learning_rate": 4.8165831331017755e-06, + "loss": 0.6226, + "step": 2768 + }, + { + "epoch": 0.7605053556715188, + "grad_norm": 1.218197226524353, + "learning_rate": 4.816447321470815e-06, + "loss": 0.6399, + "step": 2769 + }, + { + "epoch": 0.7607800054929964, + "grad_norm": 1.1735790967941284, + "learning_rate": 4.8163114614935355e-06, + "loss": 0.615, + "step": 2770 + }, + { + "epoch": 0.761054655314474, + "grad_norm": 1.2598767280578613, + "learning_rate": 4.816175553172773e-06, + "loss": 0.6624, + "step": 2771 + }, + { + "epoch": 0.7613293051359517, + "grad_norm": 1.2730270624160767, + "learning_rate": 4.816039596511365e-06, + "loss": 0.6317, + "step": 2772 + }, + { + "epoch": 0.7616039549574293, + "grad_norm": 1.2177730798721313, + "learning_rate": 4.815903591512149e-06, + "loss": 0.6828, + "step": 2773 + }, + { + "epoch": 0.7618786047789069, + "grad_norm": 1.2979192733764648, + "learning_rate": 4.815767538177962e-06, + "loss": 0.7147, + "step": 2774 + }, + { + "epoch": 0.7621532546003845, + "grad_norm": 1.2301865816116333, + "learning_rate": 4.8156314365116445e-06, + "loss": 0.6682, + "step": 2775 + }, + { + "epoch": 0.7624279044218621, + "grad_norm": 1.2697664499282837, + "learning_rate": 4.815495286516038e-06, + "loss": 0.6681, + "step": 2776 + }, + { + "epoch": 0.7627025542433398, + "grad_norm": 1.1956398487091064, + "learning_rate": 4.815359088193982e-06, + "loss": 0.6746, + "step": 2777 + }, + { + "epoch": 0.7629772040648174, + "grad_norm": 1.1749181747436523, + "learning_rate": 4.815222841548321e-06, + "loss": 0.6494, + "step": 2778 + }, + { + "epoch": 0.763251853886295, + "grad_norm": 1.2282536029815674, + "learning_rate": 4.815086546581897e-06, + "loss": 0.6737, + "step": 2779 + }, + { + "epoch": 0.7635265037077726, + "grad_norm": 1.2292851209640503, + "learning_rate": 4.814950203297557e-06, + "loss": 0.6719, + "step": 2780 + }, + { + "epoch": 0.7638011535292502, + "grad_norm": 1.1841903924942017, + "learning_rate": 4.814813811698144e-06, + "loss": 0.6495, + "step": 2781 + }, + { + "epoch": 0.7640758033507278, + "grad_norm": 1.200908899307251, + "learning_rate": 4.814677371786506e-06, + "loss": 0.6528, + "step": 2782 + }, + { + "epoch": 0.7643504531722054, + "grad_norm": 1.2510873079299927, + "learning_rate": 4.814540883565491e-06, + "loss": 0.6581, + "step": 2783 + }, + { + "epoch": 0.764625102993683, + "grad_norm": 1.2287383079528809, + "learning_rate": 4.814404347037945e-06, + "loss": 0.6383, + "step": 2784 + }, + { + "epoch": 0.7648997528151606, + "grad_norm": 1.2709141969680786, + "learning_rate": 4.814267762206721e-06, + "loss": 0.6913, + "step": 2785 + }, + { + "epoch": 0.7651744026366383, + "grad_norm": 1.2692923545837402, + "learning_rate": 4.8141311290746684e-06, + "loss": 0.6657, + "step": 2786 + }, + { + "epoch": 0.7654490524581159, + "grad_norm": 1.2264435291290283, + "learning_rate": 4.813994447644638e-06, + "loss": 0.6782, + "step": 2787 + }, + { + "epoch": 0.7657237022795935, + "grad_norm": 1.2755216360092163, + "learning_rate": 4.813857717919483e-06, + "loss": 0.7097, + "step": 2788 + }, + { + "epoch": 0.7659983521010711, + "grad_norm": 1.206514835357666, + "learning_rate": 4.813720939902057e-06, + "loss": 0.5703, + "step": 2789 + }, + { + "epoch": 0.7662730019225488, + "grad_norm": 1.1818243265151978, + "learning_rate": 4.813584113595215e-06, + "loss": 0.6836, + "step": 2790 + }, + { + "epoch": 0.7665476517440264, + "grad_norm": 1.1880180835723877, + "learning_rate": 4.813447239001814e-06, + "loss": 0.6452, + "step": 2791 + }, + { + "epoch": 0.766822301565504, + "grad_norm": 1.2962851524353027, + "learning_rate": 4.813310316124708e-06, + "loss": 0.6786, + "step": 2792 + }, + { + "epoch": 0.7670969513869816, + "grad_norm": 1.2248321771621704, + "learning_rate": 4.813173344966756e-06, + "loss": 0.6417, + "step": 2793 + }, + { + "epoch": 0.7673716012084593, + "grad_norm": 1.3020126819610596, + "learning_rate": 4.813036325530817e-06, + "loss": 0.6724, + "step": 2794 + }, + { + "epoch": 0.7676462510299368, + "grad_norm": 1.2169283628463745, + "learning_rate": 4.8128992578197494e-06, + "loss": 0.6805, + "step": 2795 + }, + { + "epoch": 0.7679209008514144, + "grad_norm": 1.2225966453552246, + "learning_rate": 4.812762141836416e-06, + "loss": 0.6835, + "step": 2796 + }, + { + "epoch": 0.768195550672892, + "grad_norm": 1.2254691123962402, + "learning_rate": 4.812624977583677e-06, + "loss": 0.6699, + "step": 2797 + }, + { + "epoch": 0.7684702004943696, + "grad_norm": 1.2414202690124512, + "learning_rate": 4.812487765064395e-06, + "loss": 0.7095, + "step": 2798 + }, + { + "epoch": 0.7687448503158473, + "grad_norm": 1.2411459684371948, + "learning_rate": 4.812350504281436e-06, + "loss": 0.6464, + "step": 2799 + }, + { + "epoch": 0.7690195001373249, + "grad_norm": 1.2674440145492554, + "learning_rate": 4.812213195237661e-06, + "loss": 0.6968, + "step": 2800 + }, + { + "epoch": 0.7692941499588025, + "grad_norm": 1.1888679265975952, + "learning_rate": 4.81207583793594e-06, + "loss": 0.6522, + "step": 2801 + }, + { + "epoch": 0.7695687997802801, + "grad_norm": 1.1983121633529663, + "learning_rate": 4.811938432379136e-06, + "loss": 0.6341, + "step": 2802 + }, + { + "epoch": 0.7698434496017578, + "grad_norm": 1.2214245796203613, + "learning_rate": 4.8118009785701194e-06, + "loss": 0.6929, + "step": 2803 + }, + { + "epoch": 0.7701180994232354, + "grad_norm": 1.2303977012634277, + "learning_rate": 4.811663476511757e-06, + "loss": 0.6446, + "step": 2804 + }, + { + "epoch": 0.770392749244713, + "grad_norm": 1.295453429222107, + "learning_rate": 4.811525926206921e-06, + "loss": 0.6803, + "step": 2805 + }, + { + "epoch": 0.7706673990661906, + "grad_norm": 1.2943812608718872, + "learning_rate": 4.811388327658479e-06, + "loss": 0.6624, + "step": 2806 + }, + { + "epoch": 0.7709420488876683, + "grad_norm": 1.1981496810913086, + "learning_rate": 4.811250680869305e-06, + "loss": 0.6728, + "step": 2807 + }, + { + "epoch": 0.7712166987091459, + "grad_norm": 1.2089614868164062, + "learning_rate": 4.811112985842272e-06, + "loss": 0.6387, + "step": 2808 + }, + { + "epoch": 0.7714913485306234, + "grad_norm": 1.1972203254699707, + "learning_rate": 4.810975242580252e-06, + "loss": 0.6492, + "step": 2809 + }, + { + "epoch": 0.771765998352101, + "grad_norm": 1.1975420713424683, + "learning_rate": 4.810837451086123e-06, + "loss": 0.6365, + "step": 2810 + }, + { + "epoch": 0.7720406481735786, + "grad_norm": 1.222152590751648, + "learning_rate": 4.810699611362758e-06, + "loss": 0.6827, + "step": 2811 + }, + { + "epoch": 0.7723152979950563, + "grad_norm": 1.1980445384979248, + "learning_rate": 4.810561723413034e-06, + "loss": 0.6102, + "step": 2812 + }, + { + "epoch": 0.7725899478165339, + "grad_norm": 1.304560661315918, + "learning_rate": 4.81042378723983e-06, + "loss": 0.6804, + "step": 2813 + }, + { + "epoch": 0.7728645976380115, + "grad_norm": 1.2297863960266113, + "learning_rate": 4.8102858028460244e-06, + "loss": 0.6594, + "step": 2814 + }, + { + "epoch": 0.7731392474594891, + "grad_norm": 1.1754190921783447, + "learning_rate": 4.810147770234497e-06, + "loss": 0.6103, + "step": 2815 + }, + { + "epoch": 0.7734138972809668, + "grad_norm": 1.373738169670105, + "learning_rate": 4.81000968940813e-06, + "loss": 0.6619, + "step": 2816 + }, + { + "epoch": 0.7736885471024444, + "grad_norm": 1.292263150215149, + "learning_rate": 4.809871560369802e-06, + "loss": 0.7204, + "step": 2817 + }, + { + "epoch": 0.773963196923922, + "grad_norm": 1.1726168394088745, + "learning_rate": 4.809733383122398e-06, + "loss": 0.6462, + "step": 2818 + }, + { + "epoch": 0.7742378467453996, + "grad_norm": 1.2446914911270142, + "learning_rate": 4.809595157668803e-06, + "loss": 0.6533, + "step": 2819 + }, + { + "epoch": 0.7745124965668773, + "grad_norm": 1.1531327962875366, + "learning_rate": 4.809456884011899e-06, + "loss": 0.6754, + "step": 2820 + }, + { + "epoch": 0.7747871463883549, + "grad_norm": 1.2699013948440552, + "learning_rate": 4.809318562154575e-06, + "loss": 0.6789, + "step": 2821 + }, + { + "epoch": 0.7750617962098325, + "grad_norm": 1.2350231409072876, + "learning_rate": 4.809180192099716e-06, + "loss": 0.651, + "step": 2822 + }, + { + "epoch": 0.7753364460313101, + "grad_norm": 1.2555299997329712, + "learning_rate": 4.80904177385021e-06, + "loss": 0.7198, + "step": 2823 + }, + { + "epoch": 0.7756110958527876, + "grad_norm": 1.195288062095642, + "learning_rate": 4.808903307408947e-06, + "loss": 0.6901, + "step": 2824 + }, + { + "epoch": 0.7758857456742653, + "grad_norm": 1.2028143405914307, + "learning_rate": 4.808764792778815e-06, + "loss": 0.614, + "step": 2825 + }, + { + "epoch": 0.7761603954957429, + "grad_norm": 1.234889030456543, + "learning_rate": 4.8086262299627065e-06, + "loss": 0.6442, + "step": 2826 + }, + { + "epoch": 0.7764350453172205, + "grad_norm": 1.185577154159546, + "learning_rate": 4.808487618963513e-06, + "loss": 0.6523, + "step": 2827 + }, + { + "epoch": 0.7767096951386981, + "grad_norm": 1.2797319889068604, + "learning_rate": 4.8083489597841285e-06, + "loss": 0.627, + "step": 2828 + }, + { + "epoch": 0.7769843449601758, + "grad_norm": 1.2867389917373657, + "learning_rate": 4.8082102524274445e-06, + "loss": 0.6134, + "step": 2829 + }, + { + "epoch": 0.7772589947816534, + "grad_norm": 1.1616151332855225, + "learning_rate": 4.808071496896358e-06, + "loss": 0.6448, + "step": 2830 + }, + { + "epoch": 0.777533644603131, + "grad_norm": 1.2378900051116943, + "learning_rate": 4.807932693193764e-06, + "loss": 0.6878, + "step": 2831 + }, + { + "epoch": 0.7778082944246086, + "grad_norm": 1.2423443794250488, + "learning_rate": 4.80779384132256e-06, + "loss": 0.6585, + "step": 2832 + }, + { + "epoch": 0.7780829442460863, + "grad_norm": 1.1612862348556519, + "learning_rate": 4.807654941285643e-06, + "loss": 0.6537, + "step": 2833 + }, + { + "epoch": 0.7783575940675639, + "grad_norm": 1.1832112073898315, + "learning_rate": 4.807515993085914e-06, + "loss": 0.6282, + "step": 2834 + }, + { + "epoch": 0.7786322438890415, + "grad_norm": 1.2265095710754395, + "learning_rate": 4.8073769967262705e-06, + "loss": 0.6743, + "step": 2835 + }, + { + "epoch": 0.7789068937105191, + "grad_norm": 1.2545920610427856, + "learning_rate": 4.807237952209614e-06, + "loss": 0.6559, + "step": 2836 + }, + { + "epoch": 0.7791815435319968, + "grad_norm": 1.2488280534744263, + "learning_rate": 4.807098859538849e-06, + "loss": 0.6505, + "step": 2837 + }, + { + "epoch": 0.7794561933534743, + "grad_norm": 1.25026535987854, + "learning_rate": 4.806959718716875e-06, + "loss": 0.6759, + "step": 2838 + }, + { + "epoch": 0.7797308431749519, + "grad_norm": 1.2748719453811646, + "learning_rate": 4.806820529746598e-06, + "loss": 0.6464, + "step": 2839 + }, + { + "epoch": 0.7800054929964295, + "grad_norm": 1.1928046941757202, + "learning_rate": 4.806681292630923e-06, + "loss": 0.6902, + "step": 2840 + }, + { + "epoch": 0.7802801428179071, + "grad_norm": 1.251920223236084, + "learning_rate": 4.8065420073727556e-06, + "loss": 0.694, + "step": 2841 + }, + { + "epoch": 0.7805547926393848, + "grad_norm": 1.279515266418457, + "learning_rate": 4.806402673975003e-06, + "loss": 0.6417, + "step": 2842 + }, + { + "epoch": 0.7808294424608624, + "grad_norm": 1.2917672395706177, + "learning_rate": 4.806263292440572e-06, + "loss": 0.6696, + "step": 2843 + }, + { + "epoch": 0.78110409228234, + "grad_norm": 1.1663826704025269, + "learning_rate": 4.806123862772373e-06, + "loss": 0.6439, + "step": 2844 + }, + { + "epoch": 0.7813787421038176, + "grad_norm": 1.2600188255310059, + "learning_rate": 4.805984384973316e-06, + "loss": 0.6632, + "step": 2845 + }, + { + "epoch": 0.7816533919252953, + "grad_norm": 1.22040855884552, + "learning_rate": 4.8058448590463115e-06, + "loss": 0.6153, + "step": 2846 + }, + { + "epoch": 0.7819280417467729, + "grad_norm": 1.1702262163162231, + "learning_rate": 4.805705284994271e-06, + "loss": 0.6677, + "step": 2847 + }, + { + "epoch": 0.7822026915682505, + "grad_norm": 1.1855041980743408, + "learning_rate": 4.805565662820109e-06, + "loss": 0.6827, + "step": 2848 + }, + { + "epoch": 0.7824773413897281, + "grad_norm": 1.2299331426620483, + "learning_rate": 4.8054259925267385e-06, + "loss": 0.6458, + "step": 2849 + }, + { + "epoch": 0.7827519912112058, + "grad_norm": 1.3661082983016968, + "learning_rate": 4.805286274117075e-06, + "loss": 0.7041, + "step": 2850 + }, + { + "epoch": 0.7830266410326834, + "grad_norm": 1.2398149967193604, + "learning_rate": 4.805146507594034e-06, + "loss": 0.6531, + "step": 2851 + }, + { + "epoch": 0.7833012908541609, + "grad_norm": 1.253365159034729, + "learning_rate": 4.8050066929605335e-06, + "loss": 0.6537, + "step": 2852 + }, + { + "epoch": 0.7835759406756385, + "grad_norm": 1.1717530488967896, + "learning_rate": 4.8048668302194905e-06, + "loss": 0.6481, + "step": 2853 + }, + { + "epoch": 0.7838505904971161, + "grad_norm": 1.25908625125885, + "learning_rate": 4.804726919373824e-06, + "loss": 0.6862, + "step": 2854 + }, + { + "epoch": 0.7841252403185938, + "grad_norm": 1.1744885444641113, + "learning_rate": 4.804586960426456e-06, + "loss": 0.6294, + "step": 2855 + }, + { + "epoch": 0.7843998901400714, + "grad_norm": 1.1746904850006104, + "learning_rate": 4.804446953380305e-06, + "loss": 0.6721, + "step": 2856 + }, + { + "epoch": 0.784674539961549, + "grad_norm": 1.2020422220230103, + "learning_rate": 4.804306898238295e-06, + "loss": 0.6765, + "step": 2857 + }, + { + "epoch": 0.7849491897830266, + "grad_norm": 1.2881739139556885, + "learning_rate": 4.804166795003348e-06, + "loss": 0.6881, + "step": 2858 + }, + { + "epoch": 0.7852238396045043, + "grad_norm": 1.2976202964782715, + "learning_rate": 4.804026643678388e-06, + "loss": 0.6683, + "step": 2859 + }, + { + "epoch": 0.7854984894259819, + "grad_norm": 1.1818921566009521, + "learning_rate": 4.803886444266341e-06, + "loss": 0.6223, + "step": 2860 + }, + { + "epoch": 0.7857731392474595, + "grad_norm": 1.210225224494934, + "learning_rate": 4.803746196770132e-06, + "loss": 0.6523, + "step": 2861 + }, + { + "epoch": 0.7860477890689371, + "grad_norm": 1.2401338815689087, + "learning_rate": 4.803605901192689e-06, + "loss": 0.6224, + "step": 2862 + }, + { + "epoch": 0.7863224388904148, + "grad_norm": 1.2890150547027588, + "learning_rate": 4.80346555753694e-06, + "loss": 0.6781, + "step": 2863 + }, + { + "epoch": 0.7865970887118924, + "grad_norm": 1.2258787155151367, + "learning_rate": 4.803325165805814e-06, + "loss": 0.6928, + "step": 2864 + }, + { + "epoch": 0.78687173853337, + "grad_norm": 1.1575013399124146, + "learning_rate": 4.80318472600224e-06, + "loss": 0.6744, + "step": 2865 + }, + { + "epoch": 0.7871463883548475, + "grad_norm": 1.2045594453811646, + "learning_rate": 4.803044238129151e-06, + "loss": 0.6397, + "step": 2866 + }, + { + "epoch": 0.7874210381763251, + "grad_norm": 1.274499773979187, + "learning_rate": 4.802903702189477e-06, + "loss": 0.6895, + "step": 2867 + }, + { + "epoch": 0.7876956879978028, + "grad_norm": 1.2109447717666626, + "learning_rate": 4.8027631181861535e-06, + "loss": 0.6416, + "step": 2868 + }, + { + "epoch": 0.7879703378192804, + "grad_norm": 1.2465527057647705, + "learning_rate": 4.8026224861221125e-06, + "loss": 0.6927, + "step": 2869 + }, + { + "epoch": 0.788244987640758, + "grad_norm": 1.275590181350708, + "learning_rate": 4.80248180600029e-06, + "loss": 0.685, + "step": 2870 + }, + { + "epoch": 0.7885196374622356, + "grad_norm": 1.1992900371551514, + "learning_rate": 4.802341077823622e-06, + "loss": 0.6408, + "step": 2871 + }, + { + "epoch": 0.7887942872837133, + "grad_norm": 1.2258410453796387, + "learning_rate": 4.802200301595047e-06, + "loss": 0.6531, + "step": 2872 + }, + { + "epoch": 0.7890689371051909, + "grad_norm": 1.252181053161621, + "learning_rate": 4.8020594773175006e-06, + "loss": 0.648, + "step": 2873 + }, + { + "epoch": 0.7893435869266685, + "grad_norm": 1.1840732097625732, + "learning_rate": 4.8019186049939236e-06, + "loss": 0.6476, + "step": 2874 + }, + { + "epoch": 0.7896182367481461, + "grad_norm": 1.2296451330184937, + "learning_rate": 4.801777684627255e-06, + "loss": 0.6705, + "step": 2875 + }, + { + "epoch": 0.7898928865696238, + "grad_norm": 1.2290650606155396, + "learning_rate": 4.801636716220438e-06, + "loss": 0.6485, + "step": 2876 + }, + { + "epoch": 0.7901675363911014, + "grad_norm": 1.195563793182373, + "learning_rate": 4.801495699776413e-06, + "loss": 0.6258, + "step": 2877 + }, + { + "epoch": 0.790442186212579, + "grad_norm": 1.2786891460418701, + "learning_rate": 4.801354635298123e-06, + "loss": 0.6583, + "step": 2878 + }, + { + "epoch": 0.7907168360340566, + "grad_norm": 1.1786712408065796, + "learning_rate": 4.801213522788513e-06, + "loss": 0.64, + "step": 2879 + }, + { + "epoch": 0.7909914858555342, + "grad_norm": 1.2036850452423096, + "learning_rate": 4.801072362250527e-06, + "loss": 0.6424, + "step": 2880 + }, + { + "epoch": 0.7912661356770118, + "grad_norm": 1.305334448814392, + "learning_rate": 4.800931153687113e-06, + "loss": 0.643, + "step": 2881 + }, + { + "epoch": 0.7915407854984894, + "grad_norm": 1.2073020935058594, + "learning_rate": 4.800789897101217e-06, + "loss": 0.6883, + "step": 2882 + }, + { + "epoch": 0.791815435319967, + "grad_norm": 1.1817529201507568, + "learning_rate": 4.8006485924957865e-06, + "loss": 0.6925, + "step": 2883 + }, + { + "epoch": 0.7920900851414446, + "grad_norm": 1.249086618423462, + "learning_rate": 4.800507239873772e-06, + "loss": 0.6108, + "step": 2884 + }, + { + "epoch": 0.7923647349629223, + "grad_norm": 1.2129474878311157, + "learning_rate": 4.800365839238122e-06, + "loss": 0.658, + "step": 2885 + }, + { + "epoch": 0.7926393847843999, + "grad_norm": 1.2325923442840576, + "learning_rate": 4.80022439059179e-06, + "loss": 0.6414, + "step": 2886 + }, + { + "epoch": 0.7929140346058775, + "grad_norm": 1.2201826572418213, + "learning_rate": 4.800082893937726e-06, + "loss": 0.6705, + "step": 2887 + }, + { + "epoch": 0.7931886844273551, + "grad_norm": 1.112959861755371, + "learning_rate": 4.799941349278885e-06, + "loss": 0.5683, + "step": 2888 + }, + { + "epoch": 0.7934633342488328, + "grad_norm": 1.1585725545883179, + "learning_rate": 4.7997997566182194e-06, + "loss": 0.6166, + "step": 2889 + }, + { + "epoch": 0.7937379840703104, + "grad_norm": 1.2108248472213745, + "learning_rate": 4.799658115958686e-06, + "loss": 0.644, + "step": 2890 + }, + { + "epoch": 0.794012633891788, + "grad_norm": 1.2623339891433716, + "learning_rate": 4.7995164273032394e-06, + "loss": 0.6586, + "step": 2891 + }, + { + "epoch": 0.7942872837132656, + "grad_norm": 1.231102466583252, + "learning_rate": 4.799374690654837e-06, + "loss": 0.6669, + "step": 2892 + }, + { + "epoch": 0.7945619335347432, + "grad_norm": 1.1890374422073364, + "learning_rate": 4.799232906016439e-06, + "loss": 0.7152, + "step": 2893 + }, + { + "epoch": 0.7948365833562209, + "grad_norm": 1.2065973281860352, + "learning_rate": 4.799091073391003e-06, + "loss": 0.6829, + "step": 2894 + }, + { + "epoch": 0.7951112331776984, + "grad_norm": 1.2562252283096313, + "learning_rate": 4.7989491927814875e-06, + "loss": 0.6511, + "step": 2895 + }, + { + "epoch": 0.795385882999176, + "grad_norm": 1.1068367958068848, + "learning_rate": 4.798807264190856e-06, + "loss": 0.5569, + "step": 2896 + }, + { + "epoch": 0.7956605328206536, + "grad_norm": 1.3200187683105469, + "learning_rate": 4.798665287622071e-06, + "loss": 0.7055, + "step": 2897 + }, + { + "epoch": 0.7959351826421313, + "grad_norm": 1.2085601091384888, + "learning_rate": 4.798523263078094e-06, + "loss": 0.6617, + "step": 2898 + }, + { + "epoch": 0.7962098324636089, + "grad_norm": 1.254721760749817, + "learning_rate": 4.79838119056189e-06, + "loss": 0.682, + "step": 2899 + }, + { + "epoch": 0.7964844822850865, + "grad_norm": 1.2364803552627563, + "learning_rate": 4.798239070076423e-06, + "loss": 0.652, + "step": 2900 + }, + { + "epoch": 0.7967591321065641, + "grad_norm": 1.1770317554473877, + "learning_rate": 4.798096901624663e-06, + "loss": 0.6692, + "step": 2901 + }, + { + "epoch": 0.7970337819280418, + "grad_norm": 1.2226812839508057, + "learning_rate": 4.797954685209572e-06, + "loss": 0.6533, + "step": 2902 + }, + { + "epoch": 0.7973084317495194, + "grad_norm": 1.2441266775131226, + "learning_rate": 4.797812420834121e-06, + "loss": 0.7071, + "step": 2903 + }, + { + "epoch": 0.797583081570997, + "grad_norm": 1.2740623950958252, + "learning_rate": 4.79767010850128e-06, + "loss": 0.6855, + "step": 2904 + }, + { + "epoch": 0.7978577313924746, + "grad_norm": 1.2008252143859863, + "learning_rate": 4.797527748214017e-06, + "loss": 0.6888, + "step": 2905 + }, + { + "epoch": 0.7981323812139522, + "grad_norm": 1.2795240879058838, + "learning_rate": 4.7973853399753045e-06, + "loss": 0.6239, + "step": 2906 + }, + { + "epoch": 0.7984070310354299, + "grad_norm": 1.195493221282959, + "learning_rate": 4.797242883788114e-06, + "loss": 0.6313, + "step": 2907 + }, + { + "epoch": 0.7986816808569075, + "grad_norm": 1.173943042755127, + "learning_rate": 4.7971003796554205e-06, + "loss": 0.65, + "step": 2908 + }, + { + "epoch": 0.798956330678385, + "grad_norm": 1.2160239219665527, + "learning_rate": 4.796957827580195e-06, + "loss": 0.6536, + "step": 2909 + }, + { + "epoch": 0.7992309804998626, + "grad_norm": 1.1562957763671875, + "learning_rate": 4.7968152275654155e-06, + "loss": 0.6648, + "step": 2910 + }, + { + "epoch": 0.7995056303213403, + "grad_norm": 1.272045612335205, + "learning_rate": 4.7966725796140564e-06, + "loss": 0.7043, + "step": 2911 + }, + { + "epoch": 0.7997802801428179, + "grad_norm": 1.1539562940597534, + "learning_rate": 4.796529883729096e-06, + "loss": 0.5979, + "step": 2912 + }, + { + "epoch": 0.8000549299642955, + "grad_norm": 1.210585117340088, + "learning_rate": 4.796387139913513e-06, + "loss": 0.6451, + "step": 2913 + }, + { + "epoch": 0.8003295797857731, + "grad_norm": 1.2691060304641724, + "learning_rate": 4.796244348170284e-06, + "loss": 0.6558, + "step": 2914 + }, + { + "epoch": 0.8006042296072508, + "grad_norm": 1.2310236692428589, + "learning_rate": 4.796101508502393e-06, + "loss": 0.6601, + "step": 2915 + }, + { + "epoch": 0.8008788794287284, + "grad_norm": 1.293983817100525, + "learning_rate": 4.7959586209128175e-06, + "loss": 0.6809, + "step": 2916 + }, + { + "epoch": 0.801153529250206, + "grad_norm": 1.306242823600769, + "learning_rate": 4.795815685404542e-06, + "loss": 0.6826, + "step": 2917 + }, + { + "epoch": 0.8014281790716836, + "grad_norm": 1.2630850076675415, + "learning_rate": 4.795672701980548e-06, + "loss": 0.6667, + "step": 2918 + }, + { + "epoch": 0.8017028288931612, + "grad_norm": 1.3092399835586548, + "learning_rate": 4.795529670643822e-06, + "loss": 0.6481, + "step": 2919 + }, + { + "epoch": 0.8019774787146389, + "grad_norm": 1.2528066635131836, + "learning_rate": 4.7953865913973466e-06, + "loss": 0.6613, + "step": 2920 + }, + { + "epoch": 0.8022521285361165, + "grad_norm": 1.294963002204895, + "learning_rate": 4.79524346424411e-06, + "loss": 0.6686, + "step": 2921 + }, + { + "epoch": 0.8025267783575941, + "grad_norm": 1.1944183111190796, + "learning_rate": 4.795100289187099e-06, + "loss": 0.6268, + "step": 2922 + }, + { + "epoch": 0.8028014281790716, + "grad_norm": 1.2505841255187988, + "learning_rate": 4.794957066229301e-06, + "loss": 0.6789, + "step": 2923 + }, + { + "epoch": 0.8030760780005493, + "grad_norm": 1.2038072347640991, + "learning_rate": 4.794813795373705e-06, + "loss": 0.6116, + "step": 2924 + }, + { + "epoch": 0.8033507278220269, + "grad_norm": 1.1791633367538452, + "learning_rate": 4.794670476623302e-06, + "loss": 0.6203, + "step": 2925 + }, + { + "epoch": 0.8036253776435045, + "grad_norm": 1.1382683515548706, + "learning_rate": 4.794527109981084e-06, + "loss": 0.6086, + "step": 2926 + }, + { + "epoch": 0.8039000274649821, + "grad_norm": 1.2600253820419312, + "learning_rate": 4.794383695450041e-06, + "loss": 0.6825, + "step": 2927 + }, + { + "epoch": 0.8041746772864597, + "grad_norm": 1.2778816223144531, + "learning_rate": 4.7942402330331685e-06, + "loss": 0.6904, + "step": 2928 + }, + { + "epoch": 0.8044493271079374, + "grad_norm": 1.2481133937835693, + "learning_rate": 4.794096722733459e-06, + "loss": 0.6712, + "step": 2929 + }, + { + "epoch": 0.804723976929415, + "grad_norm": 1.2001410722732544, + "learning_rate": 4.793953164553908e-06, + "loss": 0.6479, + "step": 2930 + }, + { + "epoch": 0.8049986267508926, + "grad_norm": 1.2365530729293823, + "learning_rate": 4.793809558497512e-06, + "loss": 0.6588, + "step": 2931 + }, + { + "epoch": 0.8052732765723702, + "grad_norm": 1.1774418354034424, + "learning_rate": 4.7936659045672685e-06, + "loss": 0.6428, + "step": 2932 + }, + { + "epoch": 0.8055479263938479, + "grad_norm": 1.2506473064422607, + "learning_rate": 4.793522202766176e-06, + "loss": 0.6675, + "step": 2933 + }, + { + "epoch": 0.8058225762153255, + "grad_norm": 1.2604624032974243, + "learning_rate": 4.793378453097232e-06, + "loss": 0.6735, + "step": 2934 + }, + { + "epoch": 0.8060972260368031, + "grad_norm": 1.2018909454345703, + "learning_rate": 4.793234655563437e-06, + "loss": 0.6525, + "step": 2935 + }, + { + "epoch": 0.8063718758582807, + "grad_norm": 1.2106949090957642, + "learning_rate": 4.793090810167794e-06, + "loss": 0.6696, + "step": 2936 + }, + { + "epoch": 0.8066465256797583, + "grad_norm": 1.213627576828003, + "learning_rate": 4.792946916913304e-06, + "loss": 0.6462, + "step": 2937 + }, + { + "epoch": 0.8069211755012359, + "grad_norm": 1.2729214429855347, + "learning_rate": 4.792802975802969e-06, + "loss": 0.6826, + "step": 2938 + }, + { + "epoch": 0.8071958253227135, + "grad_norm": 1.1689305305480957, + "learning_rate": 4.792658986839796e-06, + "loss": 0.6361, + "step": 2939 + }, + { + "epoch": 0.8074704751441911, + "grad_norm": 1.0978611707687378, + "learning_rate": 4.792514950026788e-06, + "loss": 0.6416, + "step": 2940 + }, + { + "epoch": 0.8077451249656687, + "grad_norm": 1.1403359174728394, + "learning_rate": 4.792370865366951e-06, + "loss": 0.6529, + "step": 2941 + }, + { + "epoch": 0.8080197747871464, + "grad_norm": 1.1011066436767578, + "learning_rate": 4.792226732863295e-06, + "loss": 0.5935, + "step": 2942 + }, + { + "epoch": 0.808294424608624, + "grad_norm": 1.2709904909133911, + "learning_rate": 4.792082552518824e-06, + "loss": 0.6265, + "step": 2943 + }, + { + "epoch": 0.8085690744301016, + "grad_norm": 1.2694720029830933, + "learning_rate": 4.79193832433655e-06, + "loss": 0.6899, + "step": 2944 + }, + { + "epoch": 0.8088437242515792, + "grad_norm": 1.164707899093628, + "learning_rate": 4.791794048319482e-06, + "loss": 0.6699, + "step": 2945 + }, + { + "epoch": 0.8091183740730569, + "grad_norm": 1.1457130908966064, + "learning_rate": 4.791649724470633e-06, + "loss": 0.6042, + "step": 2946 + }, + { + "epoch": 0.8093930238945345, + "grad_norm": 1.218591332435608, + "learning_rate": 4.791505352793012e-06, + "loss": 0.6911, + "step": 2947 + }, + { + "epoch": 0.8096676737160121, + "grad_norm": 1.191819429397583, + "learning_rate": 4.791360933289635e-06, + "loss": 0.6474, + "step": 2948 + }, + { + "epoch": 0.8099423235374897, + "grad_norm": 1.2261943817138672, + "learning_rate": 4.791216465963515e-06, + "loss": 0.6765, + "step": 2949 + }, + { + "epoch": 0.8102169733589674, + "grad_norm": 1.2736910581588745, + "learning_rate": 4.791071950817667e-06, + "loss": 0.6725, + "step": 2950 + }, + { + "epoch": 0.810491623180445, + "grad_norm": 1.1958893537521362, + "learning_rate": 4.790927387855108e-06, + "loss": 0.6901, + "step": 2951 + }, + { + "epoch": 0.8107662730019225, + "grad_norm": 1.24952232837677, + "learning_rate": 4.790782777078854e-06, + "loss": 0.669, + "step": 2952 + }, + { + "epoch": 0.8110409228234001, + "grad_norm": 1.2396634817123413, + "learning_rate": 4.7906381184919245e-06, + "loss": 0.6622, + "step": 2953 + }, + { + "epoch": 0.8113155726448777, + "grad_norm": 1.1973953247070312, + "learning_rate": 4.790493412097338e-06, + "loss": 0.6411, + "step": 2954 + }, + { + "epoch": 0.8115902224663554, + "grad_norm": 1.2059978246688843, + "learning_rate": 4.790348657898114e-06, + "loss": 0.6319, + "step": 2955 + }, + { + "epoch": 0.811864872287833, + "grad_norm": 1.2537676095962524, + "learning_rate": 4.790203855897274e-06, + "loss": 0.6466, + "step": 2956 + }, + { + "epoch": 0.8121395221093106, + "grad_norm": 1.1937447786331177, + "learning_rate": 4.790059006097841e-06, + "loss": 0.639, + "step": 2957 + }, + { + "epoch": 0.8124141719307882, + "grad_norm": 1.2075364589691162, + "learning_rate": 4.7899141085028365e-06, + "loss": 0.6584, + "step": 2958 + }, + { + "epoch": 0.8126888217522659, + "grad_norm": 1.2417018413543701, + "learning_rate": 4.789769163115287e-06, + "loss": 0.6569, + "step": 2959 + }, + { + "epoch": 0.8129634715737435, + "grad_norm": 1.25492525100708, + "learning_rate": 4.789624169938215e-06, + "loss": 0.6512, + "step": 2960 + }, + { + "epoch": 0.8132381213952211, + "grad_norm": 1.2013273239135742, + "learning_rate": 4.789479128974648e-06, + "loss": 0.6667, + "step": 2961 + }, + { + "epoch": 0.8135127712166987, + "grad_norm": 1.2715181112289429, + "learning_rate": 4.789334040227614e-06, + "loss": 0.6865, + "step": 2962 + }, + { + "epoch": 0.8137874210381764, + "grad_norm": 1.215315580368042, + "learning_rate": 4.7891889037001396e-06, + "loss": 0.6186, + "step": 2963 + }, + { + "epoch": 0.814062070859654, + "grad_norm": 1.1616284847259521, + "learning_rate": 4.7890437193952545e-06, + "loss": 0.6161, + "step": 2964 + }, + { + "epoch": 0.8143367206811316, + "grad_norm": 1.3379619121551514, + "learning_rate": 4.788898487315988e-06, + "loss": 0.6573, + "step": 2965 + }, + { + "epoch": 0.8146113705026091, + "grad_norm": 1.2307674884796143, + "learning_rate": 4.7887532074653734e-06, + "loss": 0.6499, + "step": 2966 + }, + { + "epoch": 0.8148860203240867, + "grad_norm": 1.1849983930587769, + "learning_rate": 4.7886078798464405e-06, + "loss": 0.6074, + "step": 2967 + }, + { + "epoch": 0.8151606701455644, + "grad_norm": 1.3327951431274414, + "learning_rate": 4.7884625044622245e-06, + "loss": 0.6563, + "step": 2968 + }, + { + "epoch": 0.815435319967042, + "grad_norm": 1.1522712707519531, + "learning_rate": 4.788317081315758e-06, + "loss": 0.652, + "step": 2969 + }, + { + "epoch": 0.8157099697885196, + "grad_norm": 1.3234031200408936, + "learning_rate": 4.788171610410076e-06, + "loss": 0.722, + "step": 2970 + }, + { + "epoch": 0.8159846196099972, + "grad_norm": 1.2934340238571167, + "learning_rate": 4.788026091748216e-06, + "loss": 0.6693, + "step": 2971 + }, + { + "epoch": 0.8162592694314749, + "grad_norm": 1.2951685190200806, + "learning_rate": 4.787880525333213e-06, + "loss": 0.6686, + "step": 2972 + }, + { + "epoch": 0.8165339192529525, + "grad_norm": 1.2362418174743652, + "learning_rate": 4.787734911168107e-06, + "loss": 0.6622, + "step": 2973 + }, + { + "epoch": 0.8168085690744301, + "grad_norm": 1.2404433488845825, + "learning_rate": 4.787589249255937e-06, + "loss": 0.6428, + "step": 2974 + }, + { + "epoch": 0.8170832188959077, + "grad_norm": 1.260521411895752, + "learning_rate": 4.787443539599742e-06, + "loss": 0.6578, + "step": 2975 + }, + { + "epoch": 0.8173578687173854, + "grad_norm": 1.2462433576583862, + "learning_rate": 4.787297782202563e-06, + "loss": 0.6886, + "step": 2976 + }, + { + "epoch": 0.817632518538863, + "grad_norm": 1.2232097387313843, + "learning_rate": 4.7871519770674436e-06, + "loss": 0.6839, + "step": 2977 + }, + { + "epoch": 0.8179071683603406, + "grad_norm": 1.1459496021270752, + "learning_rate": 4.7870061241974254e-06, + "loss": 0.6392, + "step": 2978 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.2506115436553955, + "learning_rate": 4.786860223595554e-06, + "loss": 0.6461, + "step": 2979 + }, + { + "epoch": 0.8184564680032957, + "grad_norm": 1.2247257232666016, + "learning_rate": 4.786714275264873e-06, + "loss": 0.6802, + "step": 2980 + }, + { + "epoch": 0.8187311178247734, + "grad_norm": 1.3182578086853027, + "learning_rate": 4.786568279208429e-06, + "loss": 0.672, + "step": 2981 + }, + { + "epoch": 0.819005767646251, + "grad_norm": 1.2709492444992065, + "learning_rate": 4.786422235429269e-06, + "loss": 0.6521, + "step": 2982 + }, + { + "epoch": 0.8192804174677286, + "grad_norm": 1.2167927026748657, + "learning_rate": 4.786276143930441e-06, + "loss": 0.6371, + "step": 2983 + }, + { + "epoch": 0.8195550672892062, + "grad_norm": 1.1225700378417969, + "learning_rate": 4.786130004714994e-06, + "loss": 0.6441, + "step": 2984 + }, + { + "epoch": 0.8198297171106839, + "grad_norm": 1.1780742406845093, + "learning_rate": 4.7859838177859784e-06, + "loss": 0.6304, + "step": 2985 + }, + { + "epoch": 0.8201043669321615, + "grad_norm": 1.16338050365448, + "learning_rate": 4.785837583146446e-06, + "loss": 0.6591, + "step": 2986 + }, + { + "epoch": 0.8203790167536391, + "grad_norm": 1.2317198514938354, + "learning_rate": 4.785691300799447e-06, + "loss": 0.6522, + "step": 2987 + }, + { + "epoch": 0.8206536665751167, + "grad_norm": 1.1529053449630737, + "learning_rate": 4.785544970748035e-06, + "loss": 0.6373, + "step": 2988 + }, + { + "epoch": 0.8209283163965944, + "grad_norm": 1.2490605115890503, + "learning_rate": 4.785398592995265e-06, + "loss": 0.6363, + "step": 2989 + }, + { + "epoch": 0.821202966218072, + "grad_norm": 1.1725293397903442, + "learning_rate": 4.785252167544192e-06, + "loss": 0.633, + "step": 2990 + }, + { + "epoch": 0.8214776160395496, + "grad_norm": 1.2671160697937012, + "learning_rate": 4.785105694397871e-06, + "loss": 0.678, + "step": 2991 + }, + { + "epoch": 0.8217522658610272, + "grad_norm": 1.1656769514083862, + "learning_rate": 4.784959173559359e-06, + "loss": 0.6055, + "step": 2992 + }, + { + "epoch": 0.8220269156825049, + "grad_norm": 1.2929587364196777, + "learning_rate": 4.784812605031716e-06, + "loss": 0.6343, + "step": 2993 + }, + { + "epoch": 0.8223015655039824, + "grad_norm": 1.2247740030288696, + "learning_rate": 4.784665988817998e-06, + "loss": 0.6386, + "step": 2994 + }, + { + "epoch": 0.82257621532546, + "grad_norm": 1.210375428199768, + "learning_rate": 4.784519324921268e-06, + "loss": 0.699, + "step": 2995 + }, + { + "epoch": 0.8228508651469376, + "grad_norm": 1.24129319190979, + "learning_rate": 4.784372613344585e-06, + "loss": 0.6031, + "step": 2996 + }, + { + "epoch": 0.8231255149684152, + "grad_norm": 1.2138656377792358, + "learning_rate": 4.78422585409101e-06, + "loss": 0.6085, + "step": 2997 + }, + { + "epoch": 0.8234001647898929, + "grad_norm": 1.1817389726638794, + "learning_rate": 4.78407904716361e-06, + "loss": 0.6477, + "step": 2998 + }, + { + "epoch": 0.8236748146113705, + "grad_norm": 1.1807503700256348, + "learning_rate": 4.783932192565444e-06, + "loss": 0.6123, + "step": 2999 + }, + { + "epoch": 0.8239494644328481, + "grad_norm": 1.3016074895858765, + "learning_rate": 4.783785290299582e-06, + "loss": 0.671, + "step": 3000 + }, + { + "epoch": 0.8242241142543257, + "grad_norm": 1.1358970403671265, + "learning_rate": 4.783638340369087e-06, + "loss": 0.6188, + "step": 3001 + }, + { + "epoch": 0.8244987640758034, + "grad_norm": 1.2217535972595215, + "learning_rate": 4.783491342777026e-06, + "loss": 0.6571, + "step": 3002 + }, + { + "epoch": 0.824773413897281, + "grad_norm": 1.1692636013031006, + "learning_rate": 4.783344297526468e-06, + "loss": 0.6578, + "step": 3003 + }, + { + "epoch": 0.8250480637187586, + "grad_norm": 1.196078896522522, + "learning_rate": 4.78319720462048e-06, + "loss": 0.6463, + "step": 3004 + }, + { + "epoch": 0.8253227135402362, + "grad_norm": 1.1500111818313599, + "learning_rate": 4.783050064062135e-06, + "loss": 0.6578, + "step": 3005 + }, + { + "epoch": 0.8255973633617139, + "grad_norm": 1.1641343832015991, + "learning_rate": 4.782902875854501e-06, + "loss": 0.641, + "step": 3006 + }, + { + "epoch": 0.8258720131831915, + "grad_norm": 1.2000601291656494, + "learning_rate": 4.782755640000652e-06, + "loss": 0.6479, + "step": 3007 + }, + { + "epoch": 0.8261466630046691, + "grad_norm": 1.2385162115097046, + "learning_rate": 4.782608356503661e-06, + "loss": 0.6684, + "step": 3008 + }, + { + "epoch": 0.8264213128261466, + "grad_norm": 1.2422523498535156, + "learning_rate": 4.7824610253666e-06, + "loss": 0.6564, + "step": 3009 + }, + { + "epoch": 0.8266959626476242, + "grad_norm": 1.1336132287979126, + "learning_rate": 4.782313646592547e-06, + "loss": 0.5988, + "step": 3010 + }, + { + "epoch": 0.8269706124691019, + "grad_norm": 1.276553988456726, + "learning_rate": 4.782166220184574e-06, + "loss": 0.6523, + "step": 3011 + }, + { + "epoch": 0.8272452622905795, + "grad_norm": 1.209840178489685, + "learning_rate": 4.782018746145761e-06, + "loss": 0.626, + "step": 3012 + }, + { + "epoch": 0.8275199121120571, + "grad_norm": 1.2360517978668213, + "learning_rate": 4.781871224479184e-06, + "loss": 0.6203, + "step": 3013 + }, + { + "epoch": 0.8277945619335347, + "grad_norm": 1.2478326559066772, + "learning_rate": 4.781723655187923e-06, + "loss": 0.6753, + "step": 3014 + }, + { + "epoch": 0.8280692117550124, + "grad_norm": 1.132057785987854, + "learning_rate": 4.7815760382750585e-06, + "loss": 0.597, + "step": 3015 + }, + { + "epoch": 0.82834386157649, + "grad_norm": 1.2624074220657349, + "learning_rate": 4.78142837374367e-06, + "loss": 0.6473, + "step": 3016 + }, + { + "epoch": 0.8286185113979676, + "grad_norm": 1.1737803220748901, + "learning_rate": 4.78128066159684e-06, + "loss": 0.6549, + "step": 3017 + }, + { + "epoch": 0.8288931612194452, + "grad_norm": 1.216335654258728, + "learning_rate": 4.781132901837652e-06, + "loss": 0.6801, + "step": 3018 + }, + { + "epoch": 0.8291678110409229, + "grad_norm": 1.2033624649047852, + "learning_rate": 4.7809850944691885e-06, + "loss": 0.7, + "step": 3019 + }, + { + "epoch": 0.8294424608624005, + "grad_norm": 1.2870113849639893, + "learning_rate": 4.780837239494536e-06, + "loss": 0.61, + "step": 3020 + }, + { + "epoch": 0.8297171106838781, + "grad_norm": 1.209315538406372, + "learning_rate": 4.780689336916779e-06, + "loss": 0.6448, + "step": 3021 + }, + { + "epoch": 0.8299917605053557, + "grad_norm": 1.2818105220794678, + "learning_rate": 4.7805413867390034e-06, + "loss": 0.6557, + "step": 3022 + }, + { + "epoch": 0.8302664103268332, + "grad_norm": 1.2017264366149902, + "learning_rate": 4.7803933889643e-06, + "loss": 0.6833, + "step": 3023 + }, + { + "epoch": 0.8305410601483109, + "grad_norm": 1.300740122795105, + "learning_rate": 4.780245343595757e-06, + "loss": 0.6526, + "step": 3024 + }, + { + "epoch": 0.8308157099697885, + "grad_norm": 1.2193852663040161, + "learning_rate": 4.7800972506364616e-06, + "loss": 0.6602, + "step": 3025 + }, + { + "epoch": 0.8310903597912661, + "grad_norm": 1.2361305952072144, + "learning_rate": 4.779949110089507e-06, + "loss": 0.6759, + "step": 3026 + }, + { + "epoch": 0.8313650096127437, + "grad_norm": 1.2417620420455933, + "learning_rate": 4.779800921957985e-06, + "loss": 0.6824, + "step": 3027 + }, + { + "epoch": 0.8316396594342214, + "grad_norm": 1.2373156547546387, + "learning_rate": 4.779652686244986e-06, + "loss": 0.6735, + "step": 3028 + }, + { + "epoch": 0.831914309255699, + "grad_norm": 1.2515944242477417, + "learning_rate": 4.779504402953608e-06, + "loss": 0.6404, + "step": 3029 + }, + { + "epoch": 0.8321889590771766, + "grad_norm": 1.1986885070800781, + "learning_rate": 4.779356072086942e-06, + "loss": 0.6285, + "step": 3030 + }, + { + "epoch": 0.8324636088986542, + "grad_norm": 1.2141094207763672, + "learning_rate": 4.779207693648086e-06, + "loss": 0.6737, + "step": 3031 + }, + { + "epoch": 0.8327382587201319, + "grad_norm": 1.1837410926818848, + "learning_rate": 4.779059267640135e-06, + "loss": 0.6489, + "step": 3032 + }, + { + "epoch": 0.8330129085416095, + "grad_norm": 1.1988180875778198, + "learning_rate": 4.77891079406619e-06, + "loss": 0.6418, + "step": 3033 + }, + { + "epoch": 0.8332875583630871, + "grad_norm": 1.2997223138809204, + "learning_rate": 4.7787622729293445e-06, + "loss": 0.6973, + "step": 3034 + }, + { + "epoch": 0.8335622081845647, + "grad_norm": 1.2344731092453003, + "learning_rate": 4.778613704232704e-06, + "loss": 0.6652, + "step": 3035 + }, + { + "epoch": 0.8338368580060423, + "grad_norm": 1.2014533281326294, + "learning_rate": 4.778465087979365e-06, + "loss": 0.6527, + "step": 3036 + }, + { + "epoch": 0.8341115078275199, + "grad_norm": 1.2307097911834717, + "learning_rate": 4.778316424172432e-06, + "loss": 0.643, + "step": 3037 + }, + { + "epoch": 0.8343861576489975, + "grad_norm": 1.1751227378845215, + "learning_rate": 4.778167712815006e-06, + "loss": 0.6415, + "step": 3038 + }, + { + "epoch": 0.8346608074704751, + "grad_norm": 1.258471965789795, + "learning_rate": 4.778018953910191e-06, + "loss": 0.6782, + "step": 3039 + }, + { + "epoch": 0.8349354572919527, + "grad_norm": 1.2169530391693115, + "learning_rate": 4.777870147461093e-06, + "loss": 0.6319, + "step": 3040 + }, + { + "epoch": 0.8352101071134304, + "grad_norm": 1.3101508617401123, + "learning_rate": 4.777721293470817e-06, + "loss": 0.6463, + "step": 3041 + }, + { + "epoch": 0.835484756934908, + "grad_norm": 1.2785394191741943, + "learning_rate": 4.777572391942469e-06, + "loss": 0.6985, + "step": 3042 + }, + { + "epoch": 0.8357594067563856, + "grad_norm": 1.2545239925384521, + "learning_rate": 4.777423442879158e-06, + "loss": 0.6841, + "step": 3043 + }, + { + "epoch": 0.8360340565778632, + "grad_norm": 1.2531274557113647, + "learning_rate": 4.777274446283991e-06, + "loss": 0.6636, + "step": 3044 + }, + { + "epoch": 0.8363087063993409, + "grad_norm": 1.2790828943252563, + "learning_rate": 4.777125402160079e-06, + "loss": 0.6732, + "step": 3045 + }, + { + "epoch": 0.8365833562208185, + "grad_norm": 1.2322310209274292, + "learning_rate": 4.776976310510532e-06, + "loss": 0.6424, + "step": 3046 + }, + { + "epoch": 0.8368580060422961, + "grad_norm": 1.1801968812942505, + "learning_rate": 4.776827171338463e-06, + "loss": 0.6507, + "step": 3047 + }, + { + "epoch": 0.8371326558637737, + "grad_norm": 1.2335439920425415, + "learning_rate": 4.7766779846469825e-06, + "loss": 0.6596, + "step": 3048 + }, + { + "epoch": 0.8374073056852513, + "grad_norm": 1.1805238723754883, + "learning_rate": 4.7765287504392055e-06, + "loss": 0.6819, + "step": 3049 + }, + { + "epoch": 0.837681955506729, + "grad_norm": 1.205893635749817, + "learning_rate": 4.776379468718248e-06, + "loss": 0.6443, + "step": 3050 + }, + { + "epoch": 0.8379566053282065, + "grad_norm": 1.2364075183868408, + "learning_rate": 4.776230139487222e-06, + "loss": 0.6799, + "step": 3051 + }, + { + "epoch": 0.8382312551496841, + "grad_norm": 1.1957356929779053, + "learning_rate": 4.7760807627492474e-06, + "loss": 0.6467, + "step": 3052 + }, + { + "epoch": 0.8385059049711617, + "grad_norm": 1.221339464187622, + "learning_rate": 4.77593133850744e-06, + "loss": 0.6368, + "step": 3053 + }, + { + "epoch": 0.8387805547926394, + "grad_norm": 1.2319796085357666, + "learning_rate": 4.77578186676492e-06, + "loss": 0.6413, + "step": 3054 + }, + { + "epoch": 0.839055204614117, + "grad_norm": 1.1506072282791138, + "learning_rate": 4.775632347524806e-06, + "loss": 0.6385, + "step": 3055 + }, + { + "epoch": 0.8393298544355946, + "grad_norm": 1.2249106168746948, + "learning_rate": 4.7754827807902174e-06, + "loss": 0.6702, + "step": 3056 + }, + { + "epoch": 0.8396045042570722, + "grad_norm": 1.22999107837677, + "learning_rate": 4.775333166564279e-06, + "loss": 0.6205, + "step": 3057 + }, + { + "epoch": 0.8398791540785498, + "grad_norm": 1.2364047765731812, + "learning_rate": 4.77518350485011e-06, + "loss": 0.6848, + "step": 3058 + }, + { + "epoch": 0.8401538039000275, + "grad_norm": 1.234154462814331, + "learning_rate": 4.775033795650836e-06, + "loss": 0.5955, + "step": 3059 + }, + { + "epoch": 0.8404284537215051, + "grad_norm": 1.2164496183395386, + "learning_rate": 4.7748840389695805e-06, + "loss": 0.6743, + "step": 3060 + }, + { + "epoch": 0.8407031035429827, + "grad_norm": 1.1873829364776611, + "learning_rate": 4.77473423480947e-06, + "loss": 0.6588, + "step": 3061 + }, + { + "epoch": 0.8409777533644603, + "grad_norm": 1.260932445526123, + "learning_rate": 4.774584383173631e-06, + "loss": 0.6469, + "step": 3062 + }, + { + "epoch": 0.841252403185938, + "grad_norm": 1.1999144554138184, + "learning_rate": 4.77443448406519e-06, + "loss": 0.6369, + "step": 3063 + }, + { + "epoch": 0.8415270530074156, + "grad_norm": 1.2230007648468018, + "learning_rate": 4.774284537487277e-06, + "loss": 0.6721, + "step": 3064 + }, + { + "epoch": 0.8418017028288931, + "grad_norm": 1.2173882722854614, + "learning_rate": 4.7741345434430205e-06, + "loss": 0.657, + "step": 3065 + }, + { + "epoch": 0.8420763526503707, + "grad_norm": 1.1599143743515015, + "learning_rate": 4.77398450193555e-06, + "loss": 0.6301, + "step": 3066 + }, + { + "epoch": 0.8423510024718484, + "grad_norm": 1.1458444595336914, + "learning_rate": 4.773834412968e-06, + "loss": 0.6171, + "step": 3067 + }, + { + "epoch": 0.842625652293326, + "grad_norm": 1.286423683166504, + "learning_rate": 4.7736842765435e-06, + "loss": 0.6501, + "step": 3068 + }, + { + "epoch": 0.8429003021148036, + "grad_norm": 1.2103040218353271, + "learning_rate": 4.773534092665185e-06, + "loss": 0.6799, + "step": 3069 + }, + { + "epoch": 0.8431749519362812, + "grad_norm": 1.2638500928878784, + "learning_rate": 4.77338386133619e-06, + "loss": 0.6687, + "step": 3070 + }, + { + "epoch": 0.8434496017577588, + "grad_norm": 1.1798574924468994, + "learning_rate": 4.773233582559649e-06, + "loss": 0.6332, + "step": 3071 + }, + { + "epoch": 0.8437242515792365, + "grad_norm": 1.2918095588684082, + "learning_rate": 4.773083256338698e-06, + "loss": 0.6659, + "step": 3072 + }, + { + "epoch": 0.8439989014007141, + "grad_norm": 1.2190998792648315, + "learning_rate": 4.772932882676476e-06, + "loss": 0.66, + "step": 3073 + }, + { + "epoch": 0.8442735512221917, + "grad_norm": 1.2315949201583862, + "learning_rate": 4.772782461576122e-06, + "loss": 0.696, + "step": 3074 + }, + { + "epoch": 0.8445482010436693, + "grad_norm": 1.2350980043411255, + "learning_rate": 4.772631993040774e-06, + "loss": 0.6426, + "step": 3075 + }, + { + "epoch": 0.844822850865147, + "grad_norm": 1.2267611026763916, + "learning_rate": 4.7724814770735725e-06, + "loss": 0.6653, + "step": 3076 + }, + { + "epoch": 0.8450975006866246, + "grad_norm": 1.2120057344436646, + "learning_rate": 4.772330913677659e-06, + "loss": 0.6256, + "step": 3077 + }, + { + "epoch": 0.8453721505081022, + "grad_norm": 1.2102619409561157, + "learning_rate": 4.772180302856176e-06, + "loss": 0.6702, + "step": 3078 + }, + { + "epoch": 0.8456468003295798, + "grad_norm": 1.24788236618042, + "learning_rate": 4.772029644612267e-06, + "loss": 0.6484, + "step": 3079 + }, + { + "epoch": 0.8459214501510574, + "grad_norm": 1.1945960521697998, + "learning_rate": 4.771878938949077e-06, + "loss": 0.6119, + "step": 3080 + }, + { + "epoch": 0.846196099972535, + "grad_norm": 1.1670129299163818, + "learning_rate": 4.77172818586975e-06, + "loss": 0.6367, + "step": 3081 + }, + { + "epoch": 0.8464707497940126, + "grad_norm": 1.2458608150482178, + "learning_rate": 4.7715773853774334e-06, + "loss": 0.6166, + "step": 3082 + }, + { + "epoch": 0.8467453996154902, + "grad_norm": 1.3623865842819214, + "learning_rate": 4.771426537475274e-06, + "loss": 0.6649, + "step": 3083 + }, + { + "epoch": 0.8470200494369678, + "grad_norm": 1.2277112007141113, + "learning_rate": 4.77127564216642e-06, + "loss": 0.6515, + "step": 3084 + }, + { + "epoch": 0.8472946992584455, + "grad_norm": 1.2113984823226929, + "learning_rate": 4.771124699454021e-06, + "loss": 0.6715, + "step": 3085 + }, + { + "epoch": 0.8475693490799231, + "grad_norm": 1.1833856105804443, + "learning_rate": 4.770973709341228e-06, + "loss": 0.6624, + "step": 3086 + }, + { + "epoch": 0.8478439989014007, + "grad_norm": 1.2026044130325317, + "learning_rate": 4.770822671831192e-06, + "loss": 0.6546, + "step": 3087 + }, + { + "epoch": 0.8481186487228783, + "grad_norm": 1.2354923486709595, + "learning_rate": 4.770671586927064e-06, + "loss": 0.699, + "step": 3088 + }, + { + "epoch": 0.848393298544356, + "grad_norm": 1.1900427341461182, + "learning_rate": 4.7705204546319986e-06, + "loss": 0.6628, + "step": 3089 + }, + { + "epoch": 0.8486679483658336, + "grad_norm": 1.1967521905899048, + "learning_rate": 4.770369274949149e-06, + "loss": 0.6669, + "step": 3090 + }, + { + "epoch": 0.8489425981873112, + "grad_norm": 1.2319220304489136, + "learning_rate": 4.770218047881672e-06, + "loss": 0.6683, + "step": 3091 + }, + { + "epoch": 0.8492172480087888, + "grad_norm": 1.2096678018569946, + "learning_rate": 4.770066773432723e-06, + "loss": 0.6511, + "step": 3092 + }, + { + "epoch": 0.8494918978302665, + "grad_norm": 1.2270427942276, + "learning_rate": 4.769915451605458e-06, + "loss": 0.7064, + "step": 3093 + }, + { + "epoch": 0.849766547651744, + "grad_norm": 1.1589103937149048, + "learning_rate": 4.769764082403038e-06, + "loss": 0.624, + "step": 3094 + }, + { + "epoch": 0.8500411974732216, + "grad_norm": 1.2301867008209229, + "learning_rate": 4.769612665828619e-06, + "loss": 0.6298, + "step": 3095 + }, + { + "epoch": 0.8503158472946992, + "grad_norm": 1.2944434881210327, + "learning_rate": 4.769461201885365e-06, + "loss": 0.7094, + "step": 3096 + }, + { + "epoch": 0.8505904971161768, + "grad_norm": 1.136640191078186, + "learning_rate": 4.769309690576433e-06, + "loss": 0.6647, + "step": 3097 + }, + { + "epoch": 0.8508651469376545, + "grad_norm": 1.2125695943832397, + "learning_rate": 4.769158131904988e-06, + "loss": 0.6946, + "step": 3098 + }, + { + "epoch": 0.8511397967591321, + "grad_norm": 1.2241644859313965, + "learning_rate": 4.769006525874192e-06, + "loss": 0.6739, + "step": 3099 + }, + { + "epoch": 0.8514144465806097, + "grad_norm": 1.1924527883529663, + "learning_rate": 4.76885487248721e-06, + "loss": 0.6662, + "step": 3100 + }, + { + "epoch": 0.8516890964020873, + "grad_norm": 1.1923859119415283, + "learning_rate": 4.768703171747206e-06, + "loss": 0.6535, + "step": 3101 + }, + { + "epoch": 0.851963746223565, + "grad_norm": 1.19837486743927, + "learning_rate": 4.768551423657347e-06, + "loss": 0.6567, + "step": 3102 + }, + { + "epoch": 0.8522383960450426, + "grad_norm": 1.2351516485214233, + "learning_rate": 4.7683996282208e-06, + "loss": 0.6753, + "step": 3103 + }, + { + "epoch": 0.8525130458665202, + "grad_norm": 1.2283838987350464, + "learning_rate": 4.768247785440734e-06, + "loss": 0.7001, + "step": 3104 + }, + { + "epoch": 0.8527876956879978, + "grad_norm": 1.2545238733291626, + "learning_rate": 4.768095895320315e-06, + "loss": 0.6719, + "step": 3105 + }, + { + "epoch": 0.8530623455094755, + "grad_norm": 1.224421739578247, + "learning_rate": 4.767943957862716e-06, + "loss": 0.6748, + "step": 3106 + }, + { + "epoch": 0.8533369953309531, + "grad_norm": 1.2379471063613892, + "learning_rate": 4.767791973071108e-06, + "loss": 0.6524, + "step": 3107 + }, + { + "epoch": 0.8536116451524306, + "grad_norm": 1.1785601377487183, + "learning_rate": 4.767639940948662e-06, + "loss": 0.6773, + "step": 3108 + }, + { + "epoch": 0.8538862949739082, + "grad_norm": 1.2280325889587402, + "learning_rate": 4.767487861498551e-06, + "loss": 0.6717, + "step": 3109 + }, + { + "epoch": 0.8541609447953858, + "grad_norm": 1.264664888381958, + "learning_rate": 4.76733573472395e-06, + "loss": 0.7085, + "step": 3110 + }, + { + "epoch": 0.8544355946168635, + "grad_norm": 1.213975429534912, + "learning_rate": 4.767183560628033e-06, + "loss": 0.6712, + "step": 3111 + }, + { + "epoch": 0.8547102444383411, + "grad_norm": 1.2707874774932861, + "learning_rate": 4.767031339213976e-06, + "loss": 0.6876, + "step": 3112 + }, + { + "epoch": 0.8549848942598187, + "grad_norm": 1.2735756635665894, + "learning_rate": 4.766879070484957e-06, + "loss": 0.6586, + "step": 3113 + }, + { + "epoch": 0.8552595440812963, + "grad_norm": 1.2132917642593384, + "learning_rate": 4.7667267544441535e-06, + "loss": 0.6331, + "step": 3114 + }, + { + "epoch": 0.855534193902774, + "grad_norm": 1.2152236700057983, + "learning_rate": 4.766574391094743e-06, + "loss": 0.6341, + "step": 3115 + }, + { + "epoch": 0.8558088437242516, + "grad_norm": 1.355560064315796, + "learning_rate": 4.766421980439909e-06, + "loss": 0.6829, + "step": 3116 + }, + { + "epoch": 0.8560834935457292, + "grad_norm": 1.242410659790039, + "learning_rate": 4.766269522482829e-06, + "loss": 0.6899, + "step": 3117 + }, + { + "epoch": 0.8563581433672068, + "grad_norm": 1.176930546760559, + "learning_rate": 4.7661170172266865e-06, + "loss": 0.6195, + "step": 3118 + }, + { + "epoch": 0.8566327931886845, + "grad_norm": 1.2420094013214111, + "learning_rate": 4.765964464674664e-06, + "loss": 0.6791, + "step": 3119 + }, + { + "epoch": 0.8569074430101621, + "grad_norm": 1.190176010131836, + "learning_rate": 4.765811864829945e-06, + "loss": 0.6455, + "step": 3120 + }, + { + "epoch": 0.8571820928316397, + "grad_norm": 1.199761152267456, + "learning_rate": 4.765659217695715e-06, + "loss": 0.6375, + "step": 3121 + }, + { + "epoch": 0.8574567426531172, + "grad_norm": 1.221487283706665, + "learning_rate": 4.765506523275161e-06, + "loss": 0.6787, + "step": 3122 + }, + { + "epoch": 0.8577313924745948, + "grad_norm": 1.1775150299072266, + "learning_rate": 4.765353781571468e-06, + "loss": 0.6609, + "step": 3123 + }, + { + "epoch": 0.8580060422960725, + "grad_norm": 1.2188796997070312, + "learning_rate": 4.765200992587824e-06, + "loss": 0.6432, + "step": 3124 + }, + { + "epoch": 0.8582806921175501, + "grad_norm": 1.2047728300094604, + "learning_rate": 4.76504815632742e-06, + "loss": 0.6267, + "step": 3125 + }, + { + "epoch": 0.8585553419390277, + "grad_norm": 1.2734259366989136, + "learning_rate": 4.7648952727934426e-06, + "loss": 0.673, + "step": 3126 + }, + { + "epoch": 0.8588299917605053, + "grad_norm": 1.2859611511230469, + "learning_rate": 4.764742341989084e-06, + "loss": 0.6521, + "step": 3127 + }, + { + "epoch": 0.859104641581983, + "grad_norm": 1.2725260257720947, + "learning_rate": 4.764589363917537e-06, + "loss": 0.67, + "step": 3128 + }, + { + "epoch": 0.8593792914034606, + "grad_norm": 1.236201286315918, + "learning_rate": 4.764436338581994e-06, + "loss": 0.6538, + "step": 3129 + }, + { + "epoch": 0.8596539412249382, + "grad_norm": 1.1870168447494507, + "learning_rate": 4.764283265985647e-06, + "loss": 0.5949, + "step": 3130 + }, + { + "epoch": 0.8599285910464158, + "grad_norm": 1.1697994470596313, + "learning_rate": 4.764130146131694e-06, + "loss": 0.6492, + "step": 3131 + }, + { + "epoch": 0.8602032408678935, + "grad_norm": 1.2168995141983032, + "learning_rate": 4.763976979023327e-06, + "loss": 0.5919, + "step": 3132 + }, + { + "epoch": 0.8604778906893711, + "grad_norm": 1.1913084983825684, + "learning_rate": 4.7638237646637454e-06, + "loss": 0.6576, + "step": 3133 + }, + { + "epoch": 0.8607525405108487, + "grad_norm": 1.2516275644302368, + "learning_rate": 4.763670503056147e-06, + "loss": 0.6768, + "step": 3134 + }, + { + "epoch": 0.8610271903323263, + "grad_norm": 1.3012807369232178, + "learning_rate": 4.763517194203728e-06, + "loss": 0.6638, + "step": 3135 + }, + { + "epoch": 0.861301840153804, + "grad_norm": 1.2819026708602905, + "learning_rate": 4.763363838109691e-06, + "loss": 0.7117, + "step": 3136 + }, + { + "epoch": 0.8615764899752815, + "grad_norm": 1.2719910144805908, + "learning_rate": 4.763210434777236e-06, + "loss": 0.6827, + "step": 3137 + }, + { + "epoch": 0.8618511397967591, + "grad_norm": 1.2508279085159302, + "learning_rate": 4.763056984209563e-06, + "loss": 0.6807, + "step": 3138 + }, + { + "epoch": 0.8621257896182367, + "grad_norm": 1.222008466720581, + "learning_rate": 4.762903486409876e-06, + "loss": 0.693, + "step": 3139 + }, + { + "epoch": 0.8624004394397143, + "grad_norm": 1.2479721307754517, + "learning_rate": 4.762749941381378e-06, + "loss": 0.6635, + "step": 3140 + }, + { + "epoch": 0.862675089261192, + "grad_norm": 1.237922191619873, + "learning_rate": 4.7625963491272745e-06, + "loss": 0.658, + "step": 3141 + }, + { + "epoch": 0.8629497390826696, + "grad_norm": 1.277939796447754, + "learning_rate": 4.76244270965077e-06, + "loss": 0.6731, + "step": 3142 + }, + { + "epoch": 0.8632243889041472, + "grad_norm": 1.2095460891723633, + "learning_rate": 4.762289022955072e-06, + "loss": 0.671, + "step": 3143 + }, + { + "epoch": 0.8634990387256248, + "grad_norm": 1.38899564743042, + "learning_rate": 4.762135289043388e-06, + "loss": 0.607, + "step": 3144 + }, + { + "epoch": 0.8637736885471025, + "grad_norm": 1.235978126525879, + "learning_rate": 4.761981507918926e-06, + "loss": 0.6844, + "step": 3145 + }, + { + "epoch": 0.8640483383685801, + "grad_norm": 1.1680009365081787, + "learning_rate": 4.761827679584896e-06, + "loss": 0.6232, + "step": 3146 + }, + { + "epoch": 0.8643229881900577, + "grad_norm": 1.1795028448104858, + "learning_rate": 4.761673804044509e-06, + "loss": 0.6434, + "step": 3147 + }, + { + "epoch": 0.8645976380115353, + "grad_norm": 1.1687803268432617, + "learning_rate": 4.761519881300976e-06, + "loss": 0.6431, + "step": 3148 + }, + { + "epoch": 0.864872287833013, + "grad_norm": 1.2611666917800903, + "learning_rate": 4.7613659113575084e-06, + "loss": 0.6324, + "step": 3149 + }, + { + "epoch": 0.8651469376544906, + "grad_norm": 1.223936915397644, + "learning_rate": 4.761211894217321e-06, + "loss": 0.6432, + "step": 3150 + }, + { + "epoch": 0.8654215874759681, + "grad_norm": 1.2724579572677612, + "learning_rate": 4.7610578298836284e-06, + "loss": 0.6724, + "step": 3151 + }, + { + "epoch": 0.8656962372974457, + "grad_norm": 1.2799842357635498, + "learning_rate": 4.760903718359645e-06, + "loss": 0.6329, + "step": 3152 + }, + { + "epoch": 0.8659708871189233, + "grad_norm": 1.2452640533447266, + "learning_rate": 4.760749559648588e-06, + "loss": 0.6828, + "step": 3153 + }, + { + "epoch": 0.866245536940401, + "grad_norm": 1.1988794803619385, + "learning_rate": 4.760595353753676e-06, + "loss": 0.6491, + "step": 3154 + }, + { + "epoch": 0.8665201867618786, + "grad_norm": 1.2864352464675903, + "learning_rate": 4.760441100678125e-06, + "loss": 0.6877, + "step": 3155 + }, + { + "epoch": 0.8667948365833562, + "grad_norm": 1.1693450212478638, + "learning_rate": 4.760286800425156e-06, + "loss": 0.6211, + "step": 3156 + }, + { + "epoch": 0.8670694864048338, + "grad_norm": 1.2334718704223633, + "learning_rate": 4.7601324529979886e-06, + "loss": 0.6853, + "step": 3157 + }, + { + "epoch": 0.8673441362263115, + "grad_norm": 1.096116542816162, + "learning_rate": 4.7599780583998444e-06, + "loss": 0.5781, + "step": 3158 + }, + { + "epoch": 0.8676187860477891, + "grad_norm": 1.1968705654144287, + "learning_rate": 4.759823616633946e-06, + "loss": 0.6456, + "step": 3159 + }, + { + "epoch": 0.8678934358692667, + "grad_norm": 1.1905962228775024, + "learning_rate": 4.7596691277035175e-06, + "loss": 0.6358, + "step": 3160 + }, + { + "epoch": 0.8681680856907443, + "grad_norm": 1.2642359733581543, + "learning_rate": 4.759514591611781e-06, + "loss": 0.6533, + "step": 3161 + }, + { + "epoch": 0.868442735512222, + "grad_norm": 1.3271033763885498, + "learning_rate": 4.759360008361964e-06, + "loss": 0.6854, + "step": 3162 + }, + { + "epoch": 0.8687173853336996, + "grad_norm": 1.2457754611968994, + "learning_rate": 4.759205377957292e-06, + "loss": 0.637, + "step": 3163 + }, + { + "epoch": 0.8689920351551772, + "grad_norm": 1.2117791175842285, + "learning_rate": 4.7590507004009915e-06, + "loss": 0.6646, + "step": 3164 + }, + { + "epoch": 0.8692666849766547, + "grad_norm": 1.3357610702514648, + "learning_rate": 4.758895975696291e-06, + "loss": 0.6896, + "step": 3165 + }, + { + "epoch": 0.8695413347981323, + "grad_norm": 1.1622824668884277, + "learning_rate": 4.758741203846422e-06, + "loss": 0.6126, + "step": 3166 + }, + { + "epoch": 0.86981598461961, + "grad_norm": 1.2407593727111816, + "learning_rate": 4.7585863848546116e-06, + "loss": 0.6519, + "step": 3167 + }, + { + "epoch": 0.8700906344410876, + "grad_norm": 1.237552523612976, + "learning_rate": 4.758431518724092e-06, + "loss": 0.6597, + "step": 3168 + }, + { + "epoch": 0.8703652842625652, + "grad_norm": 1.2143101692199707, + "learning_rate": 4.758276605458096e-06, + "loss": 0.5922, + "step": 3169 + }, + { + "epoch": 0.8706399340840428, + "grad_norm": 1.2773417234420776, + "learning_rate": 4.758121645059858e-06, + "loss": 0.6547, + "step": 3170 + }, + { + "epoch": 0.8709145839055205, + "grad_norm": 1.2154061794281006, + "learning_rate": 4.757966637532609e-06, + "loss": 0.6589, + "step": 3171 + }, + { + "epoch": 0.8711892337269981, + "grad_norm": 1.2985972166061401, + "learning_rate": 4.757811582879586e-06, + "loss": 0.6945, + "step": 3172 + }, + { + "epoch": 0.8714638835484757, + "grad_norm": 1.2129970788955688, + "learning_rate": 4.757656481104026e-06, + "loss": 0.6533, + "step": 3173 + }, + { + "epoch": 0.8717385333699533, + "grad_norm": 1.2589232921600342, + "learning_rate": 4.757501332209164e-06, + "loss": 0.648, + "step": 3174 + }, + { + "epoch": 0.872013183191431, + "grad_norm": 1.1818265914916992, + "learning_rate": 4.757346136198239e-06, + "loss": 0.6385, + "step": 3175 + }, + { + "epoch": 0.8722878330129086, + "grad_norm": 1.2979928255081177, + "learning_rate": 4.7571908930744905e-06, + "loss": 0.7083, + "step": 3176 + }, + { + "epoch": 0.8725624828343862, + "grad_norm": 1.3302868604660034, + "learning_rate": 4.757035602841159e-06, + "loss": 0.6337, + "step": 3177 + }, + { + "epoch": 0.8728371326558638, + "grad_norm": 1.2916905879974365, + "learning_rate": 4.756880265501484e-06, + "loss": 0.6564, + "step": 3178 + }, + { + "epoch": 0.8731117824773413, + "grad_norm": 1.250840425491333, + "learning_rate": 4.756724881058708e-06, + "loss": 0.6248, + "step": 3179 + }, + { + "epoch": 0.873386432298819, + "grad_norm": 1.1738570928573608, + "learning_rate": 4.756569449516075e-06, + "loss": 0.6148, + "step": 3180 + }, + { + "epoch": 0.8736610821202966, + "grad_norm": 1.1644301414489746, + "learning_rate": 4.7564139708768286e-06, + "loss": 0.6296, + "step": 3181 + }, + { + "epoch": 0.8739357319417742, + "grad_norm": 1.254445195198059, + "learning_rate": 4.756258445144214e-06, + "loss": 0.6524, + "step": 3182 + }, + { + "epoch": 0.8742103817632518, + "grad_norm": 1.2380977869033813, + "learning_rate": 4.756102872321476e-06, + "loss": 0.6455, + "step": 3183 + }, + { + "epoch": 0.8744850315847295, + "grad_norm": 1.1835347414016724, + "learning_rate": 4.755947252411862e-06, + "loss": 0.5901, + "step": 3184 + }, + { + "epoch": 0.8747596814062071, + "grad_norm": 1.2016915082931519, + "learning_rate": 4.7557915854186195e-06, + "loss": 0.6641, + "step": 3185 + }, + { + "epoch": 0.8750343312276847, + "grad_norm": 1.2703536748886108, + "learning_rate": 4.755635871344999e-06, + "loss": 0.6443, + "step": 3186 + }, + { + "epoch": 0.8753089810491623, + "grad_norm": 1.2113157510757446, + "learning_rate": 4.75548011019425e-06, + "loss": 0.6491, + "step": 3187 + }, + { + "epoch": 0.87558363087064, + "grad_norm": 1.1959140300750732, + "learning_rate": 4.755324301969622e-06, + "loss": 0.6811, + "step": 3188 + }, + { + "epoch": 0.8758582806921176, + "grad_norm": 1.229858160018921, + "learning_rate": 4.755168446674368e-06, + "loss": 0.644, + "step": 3189 + }, + { + "epoch": 0.8761329305135952, + "grad_norm": 1.2091691493988037, + "learning_rate": 4.75501254431174e-06, + "loss": 0.6203, + "step": 3190 + }, + { + "epoch": 0.8764075803350728, + "grad_norm": 1.1582812070846558, + "learning_rate": 4.7548565948849935e-06, + "loss": 0.6353, + "step": 3191 + }, + { + "epoch": 0.8766822301565504, + "grad_norm": 1.215316653251648, + "learning_rate": 4.7547005983973815e-06, + "loss": 0.6177, + "step": 3192 + }, + { + "epoch": 0.876956879978028, + "grad_norm": 1.2596538066864014, + "learning_rate": 4.7545445548521605e-06, + "loss": 0.6699, + "step": 3193 + }, + { + "epoch": 0.8772315297995056, + "grad_norm": 1.2889765501022339, + "learning_rate": 4.754388464252587e-06, + "loss": 0.6958, + "step": 3194 + }, + { + "epoch": 0.8775061796209832, + "grad_norm": 1.2172209024429321, + "learning_rate": 4.754232326601919e-06, + "loss": 0.6547, + "step": 3195 + }, + { + "epoch": 0.8777808294424608, + "grad_norm": 1.1737725734710693, + "learning_rate": 4.754076141903415e-06, + "loss": 0.6209, + "step": 3196 + }, + { + "epoch": 0.8780554792639385, + "grad_norm": 1.2126917839050293, + "learning_rate": 4.7539199101603365e-06, + "loss": 0.6888, + "step": 3197 + }, + { + "epoch": 0.8783301290854161, + "grad_norm": 1.2201076745986938, + "learning_rate": 4.75376363137594e-06, + "loss": 0.6071, + "step": 3198 + }, + { + "epoch": 0.8786047789068937, + "grad_norm": 1.2926558256149292, + "learning_rate": 4.753607305553492e-06, + "loss": 0.683, + "step": 3199 + }, + { + "epoch": 0.8788794287283713, + "grad_norm": 1.214252233505249, + "learning_rate": 4.7534509326962515e-06, + "loss": 0.6786, + "step": 3200 + }, + { + "epoch": 0.879154078549849, + "grad_norm": 1.1660795211791992, + "learning_rate": 4.7532945128074844e-06, + "loss": 0.6558, + "step": 3201 + }, + { + "epoch": 0.8794287283713266, + "grad_norm": 1.1842966079711914, + "learning_rate": 4.7531380458904545e-06, + "loss": 0.6126, + "step": 3202 + }, + { + "epoch": 0.8797033781928042, + "grad_norm": 1.1848260164260864, + "learning_rate": 4.752981531948427e-06, + "loss": 0.6021, + "step": 3203 + }, + { + "epoch": 0.8799780280142818, + "grad_norm": 1.18794846534729, + "learning_rate": 4.75282497098467e-06, + "loss": 0.6441, + "step": 3204 + }, + { + "epoch": 0.8802526778357594, + "grad_norm": 1.2090963125228882, + "learning_rate": 4.752668363002449e-06, + "loss": 0.6996, + "step": 3205 + }, + { + "epoch": 0.8805273276572371, + "grad_norm": 1.3504619598388672, + "learning_rate": 4.7525117080050334e-06, + "loss": 0.7125, + "step": 3206 + }, + { + "epoch": 0.8808019774787147, + "grad_norm": 1.246999740600586, + "learning_rate": 4.752355005995692e-06, + "loss": 0.6758, + "step": 3207 + }, + { + "epoch": 0.8810766273001922, + "grad_norm": 1.2448410987854004, + "learning_rate": 4.752198256977698e-06, + "loss": 0.637, + "step": 3208 + }, + { + "epoch": 0.8813512771216698, + "grad_norm": 1.1892648935317993, + "learning_rate": 4.7520414609543194e-06, + "loss": 0.6358, + "step": 3209 + }, + { + "epoch": 0.8816259269431475, + "grad_norm": 1.2422618865966797, + "learning_rate": 4.7518846179288315e-06, + "loss": 0.6672, + "step": 3210 + }, + { + "epoch": 0.8819005767646251, + "grad_norm": 1.1543844938278198, + "learning_rate": 4.751727727904507e-06, + "loss": 0.6318, + "step": 3211 + }, + { + "epoch": 0.8821752265861027, + "grad_norm": 1.1774117946624756, + "learning_rate": 4.751570790884619e-06, + "loss": 0.6554, + "step": 3212 + }, + { + "epoch": 0.8824498764075803, + "grad_norm": 1.3037426471710205, + "learning_rate": 4.751413806872443e-06, + "loss": 0.6248, + "step": 3213 + }, + { + "epoch": 0.882724526229058, + "grad_norm": 1.2632336616516113, + "learning_rate": 4.751256775871258e-06, + "loss": 0.6675, + "step": 3214 + }, + { + "epoch": 0.8829991760505356, + "grad_norm": 1.2411131858825684, + "learning_rate": 4.751099697884338e-06, + "loss": 0.6327, + "step": 3215 + }, + { + "epoch": 0.8832738258720132, + "grad_norm": 1.257380485534668, + "learning_rate": 4.750942572914964e-06, + "loss": 0.6504, + "step": 3216 + }, + { + "epoch": 0.8835484756934908, + "grad_norm": 1.2300056219100952, + "learning_rate": 4.7507854009664145e-06, + "loss": 0.6621, + "step": 3217 + }, + { + "epoch": 0.8838231255149684, + "grad_norm": 1.2716186046600342, + "learning_rate": 4.750628182041969e-06, + "loss": 0.7173, + "step": 3218 + }, + { + "epoch": 0.8840977753364461, + "grad_norm": 1.2843126058578491, + "learning_rate": 4.750470916144909e-06, + "loss": 0.6382, + "step": 3219 + }, + { + "epoch": 0.8843724251579237, + "grad_norm": 1.2249022722244263, + "learning_rate": 4.750313603278518e-06, + "loss": 0.6607, + "step": 3220 + }, + { + "epoch": 0.8846470749794013, + "grad_norm": 1.241899847984314, + "learning_rate": 4.750156243446078e-06, + "loss": 0.6653, + "step": 3221 + }, + { + "epoch": 0.8849217248008788, + "grad_norm": 1.2353935241699219, + "learning_rate": 4.7499988366508735e-06, + "loss": 0.6545, + "step": 3222 + }, + { + "epoch": 0.8851963746223565, + "grad_norm": 1.1942094564437866, + "learning_rate": 4.749841382896191e-06, + "loss": 0.6616, + "step": 3223 + }, + { + "epoch": 0.8854710244438341, + "grad_norm": 1.1891744136810303, + "learning_rate": 4.749683882185314e-06, + "loss": 0.6302, + "step": 3224 + }, + { + "epoch": 0.8857456742653117, + "grad_norm": 1.1934564113616943, + "learning_rate": 4.749526334521533e-06, + "loss": 0.6599, + "step": 3225 + }, + { + "epoch": 0.8860203240867893, + "grad_norm": 1.2374625205993652, + "learning_rate": 4.749368739908133e-06, + "loss": 0.6323, + "step": 3226 + }, + { + "epoch": 0.886294973908267, + "grad_norm": 1.22036874294281, + "learning_rate": 4.749211098348405e-06, + "loss": 0.6769, + "step": 3227 + }, + { + "epoch": 0.8865696237297446, + "grad_norm": 1.2477878332138062, + "learning_rate": 4.749053409845639e-06, + "loss": 0.6539, + "step": 3228 + }, + { + "epoch": 0.8868442735512222, + "grad_norm": 1.183706521987915, + "learning_rate": 4.748895674403126e-06, + "loss": 0.6534, + "step": 3229 + }, + { + "epoch": 0.8871189233726998, + "grad_norm": 1.230559229850769, + "learning_rate": 4.748737892024156e-06, + "loss": 0.6821, + "step": 3230 + }, + { + "epoch": 0.8873935731941774, + "grad_norm": 1.219538688659668, + "learning_rate": 4.748580062712026e-06, + "loss": 0.657, + "step": 3231 + }, + { + "epoch": 0.8876682230156551, + "grad_norm": 1.2560067176818848, + "learning_rate": 4.748422186470028e-06, + "loss": 0.6861, + "step": 3232 + }, + { + "epoch": 0.8879428728371327, + "grad_norm": 1.2346668243408203, + "learning_rate": 4.748264263301456e-06, + "loss": 0.6159, + "step": 3233 + }, + { + "epoch": 0.8882175226586103, + "grad_norm": 1.2258926630020142, + "learning_rate": 4.748106293209608e-06, + "loss": 0.6535, + "step": 3234 + }, + { + "epoch": 0.8884921724800879, + "grad_norm": 1.1529701948165894, + "learning_rate": 4.747948276197779e-06, + "loss": 0.6176, + "step": 3235 + }, + { + "epoch": 0.8887668223015655, + "grad_norm": 1.1679143905639648, + "learning_rate": 4.7477902122692686e-06, + "loss": 0.6413, + "step": 3236 + }, + { + "epoch": 0.8890414721230431, + "grad_norm": 1.320907711982727, + "learning_rate": 4.747632101427374e-06, + "loss": 0.5942, + "step": 3237 + }, + { + "epoch": 0.8893161219445207, + "grad_norm": 1.279429316520691, + "learning_rate": 4.747473943675397e-06, + "loss": 0.6774, + "step": 3238 + }, + { + "epoch": 0.8895907717659983, + "grad_norm": 1.2503316402435303, + "learning_rate": 4.747315739016638e-06, + "loss": 0.6597, + "step": 3239 + }, + { + "epoch": 0.889865421587476, + "grad_norm": 1.2005053758621216, + "learning_rate": 4.747157487454398e-06, + "loss": 0.6468, + "step": 3240 + }, + { + "epoch": 0.8901400714089536, + "grad_norm": 1.2209571599960327, + "learning_rate": 4.746999188991981e-06, + "loss": 0.6768, + "step": 3241 + }, + { + "epoch": 0.8904147212304312, + "grad_norm": 1.195417046546936, + "learning_rate": 4.74684084363269e-06, + "loss": 0.6641, + "step": 3242 + }, + { + "epoch": 0.8906893710519088, + "grad_norm": 1.1354600191116333, + "learning_rate": 4.74668245137983e-06, + "loss": 0.6366, + "step": 3243 + }, + { + "epoch": 0.8909640208733864, + "grad_norm": 1.20213782787323, + "learning_rate": 4.746524012236706e-06, + "loss": 0.6617, + "step": 3244 + }, + { + "epoch": 0.8912386706948641, + "grad_norm": 1.2225933074951172, + "learning_rate": 4.746365526206626e-06, + "loss": 0.6093, + "step": 3245 + }, + { + "epoch": 0.8915133205163417, + "grad_norm": 1.1737213134765625, + "learning_rate": 4.746206993292898e-06, + "loss": 0.6344, + "step": 3246 + }, + { + "epoch": 0.8917879703378193, + "grad_norm": 1.1798285245895386, + "learning_rate": 4.74604841349883e-06, + "loss": 0.682, + "step": 3247 + }, + { + "epoch": 0.8920626201592969, + "grad_norm": 1.1596754789352417, + "learning_rate": 4.7458897868277305e-06, + "loss": 0.6006, + "step": 3248 + }, + { + "epoch": 0.8923372699807746, + "grad_norm": 1.2220512628555298, + "learning_rate": 4.745731113282912e-06, + "loss": 0.6177, + "step": 3249 + }, + { + "epoch": 0.8926119198022521, + "grad_norm": 1.1804587841033936, + "learning_rate": 4.745572392867686e-06, + "loss": 0.6535, + "step": 3250 + }, + { + "epoch": 0.8928865696237297, + "grad_norm": 1.2050646543502808, + "learning_rate": 4.745413625585364e-06, + "loss": 0.6773, + "step": 3251 + }, + { + "epoch": 0.8931612194452073, + "grad_norm": 1.2668638229370117, + "learning_rate": 4.745254811439262e-06, + "loss": 0.641, + "step": 3252 + }, + { + "epoch": 0.893435869266685, + "grad_norm": 1.2234748601913452, + "learning_rate": 4.745095950432691e-06, + "loss": 0.625, + "step": 3253 + }, + { + "epoch": 0.8937105190881626, + "grad_norm": 1.2862836122512817, + "learning_rate": 4.74493704256897e-06, + "loss": 0.6542, + "step": 3254 + }, + { + "epoch": 0.8939851689096402, + "grad_norm": 1.209565281867981, + "learning_rate": 4.744778087851413e-06, + "loss": 0.6782, + "step": 3255 + }, + { + "epoch": 0.8942598187311178, + "grad_norm": 1.337021827697754, + "learning_rate": 4.744619086283339e-06, + "loss": 0.6993, + "step": 3256 + }, + { + "epoch": 0.8945344685525954, + "grad_norm": 1.138991355895996, + "learning_rate": 4.744460037868066e-06, + "loss": 0.6021, + "step": 3257 + }, + { + "epoch": 0.8948091183740731, + "grad_norm": 1.1923425197601318, + "learning_rate": 4.744300942608914e-06, + "loss": 0.6354, + "step": 3258 + }, + { + "epoch": 0.8950837681955507, + "grad_norm": 1.1907410621643066, + "learning_rate": 4.744141800509202e-06, + "loss": 0.6513, + "step": 3259 + }, + { + "epoch": 0.8953584180170283, + "grad_norm": 1.1823608875274658, + "learning_rate": 4.743982611572253e-06, + "loss": 0.6535, + "step": 3260 + }, + { + "epoch": 0.8956330678385059, + "grad_norm": 1.229851245880127, + "learning_rate": 4.74382337580139e-06, + "loss": 0.6002, + "step": 3261 + }, + { + "epoch": 0.8959077176599836, + "grad_norm": 1.2578003406524658, + "learning_rate": 4.743664093199934e-06, + "loss": 0.6486, + "step": 3262 + }, + { + "epoch": 0.8961823674814612, + "grad_norm": 1.2095725536346436, + "learning_rate": 4.743504763771211e-06, + "loss": 0.6565, + "step": 3263 + }, + { + "epoch": 0.8964570173029388, + "grad_norm": 1.2320910692214966, + "learning_rate": 4.7433453875185455e-06, + "loss": 0.6386, + "step": 3264 + }, + { + "epoch": 0.8967316671244163, + "grad_norm": 1.2348376512527466, + "learning_rate": 4.743185964445265e-06, + "loss": 0.628, + "step": 3265 + }, + { + "epoch": 0.897006316945894, + "grad_norm": 1.1802558898925781, + "learning_rate": 4.7430264945546955e-06, + "loss": 0.6221, + "step": 3266 + }, + { + "epoch": 0.8972809667673716, + "grad_norm": 1.194118857383728, + "learning_rate": 4.742866977850167e-06, + "loss": 0.6095, + "step": 3267 + }, + { + "epoch": 0.8975556165888492, + "grad_norm": 1.1412383317947388, + "learning_rate": 4.742707414335006e-06, + "loss": 0.6056, + "step": 3268 + }, + { + "epoch": 0.8978302664103268, + "grad_norm": 1.1478824615478516, + "learning_rate": 4.7425478040125455e-06, + "loss": 0.6241, + "step": 3269 + }, + { + "epoch": 0.8981049162318044, + "grad_norm": 1.2739990949630737, + "learning_rate": 4.7423881468861156e-06, + "loss": 0.6634, + "step": 3270 + }, + { + "epoch": 0.8983795660532821, + "grad_norm": 1.2000274658203125, + "learning_rate": 4.7422284429590485e-06, + "loss": 0.6021, + "step": 3271 + }, + { + "epoch": 0.8986542158747597, + "grad_norm": 1.274438738822937, + "learning_rate": 4.742068692234677e-06, + "loss": 0.6498, + "step": 3272 + }, + { + "epoch": 0.8989288656962373, + "grad_norm": 1.2904435396194458, + "learning_rate": 4.741908894716335e-06, + "loss": 0.6083, + "step": 3273 + }, + { + "epoch": 0.8992035155177149, + "grad_norm": 1.1960009336471558, + "learning_rate": 4.741749050407359e-06, + "loss": 0.6489, + "step": 3274 + }, + { + "epoch": 0.8994781653391926, + "grad_norm": 1.2506803274154663, + "learning_rate": 4.7415891593110845e-06, + "loss": 0.65, + "step": 3275 + }, + { + "epoch": 0.8997528151606702, + "grad_norm": 1.221425175666809, + "learning_rate": 4.741429221430848e-06, + "loss": 0.6707, + "step": 3276 + }, + { + "epoch": 0.9000274649821478, + "grad_norm": 1.2434529066085815, + "learning_rate": 4.741269236769987e-06, + "loss": 0.6553, + "step": 3277 + }, + { + "epoch": 0.9003021148036254, + "grad_norm": 1.2028110027313232, + "learning_rate": 4.741109205331843e-06, + "loss": 0.6479, + "step": 3278 + }, + { + "epoch": 0.9005767646251029, + "grad_norm": 1.1201896667480469, + "learning_rate": 4.740949127119753e-06, + "loss": 0.6563, + "step": 3279 + }, + { + "epoch": 0.9008514144465806, + "grad_norm": 1.233085036277771, + "learning_rate": 4.74078900213706e-06, + "loss": 0.678, + "step": 3280 + }, + { + "epoch": 0.9011260642680582, + "grad_norm": 1.2305516004562378, + "learning_rate": 4.7406288303871065e-06, + "loss": 0.6285, + "step": 3281 + }, + { + "epoch": 0.9014007140895358, + "grad_norm": 1.2351481914520264, + "learning_rate": 4.740468611873232e-06, + "loss": 0.6546, + "step": 3282 + }, + { + "epoch": 0.9016753639110134, + "grad_norm": 1.2602733373641968, + "learning_rate": 4.740308346598784e-06, + "loss": 0.6868, + "step": 3283 + }, + { + "epoch": 0.9019500137324911, + "grad_norm": 1.1805901527404785, + "learning_rate": 4.7401480345671065e-06, + "loss": 0.6548, + "step": 3284 + }, + { + "epoch": 0.9022246635539687, + "grad_norm": 1.1949414014816284, + "learning_rate": 4.739987675781544e-06, + "loss": 0.6426, + "step": 3285 + }, + { + "epoch": 0.9024993133754463, + "grad_norm": 1.2290570735931396, + "learning_rate": 4.739827270245445e-06, + "loss": 0.6034, + "step": 3286 + }, + { + "epoch": 0.9027739631969239, + "grad_norm": 1.2857248783111572, + "learning_rate": 4.7396668179621565e-06, + "loss": 0.6853, + "step": 3287 + }, + { + "epoch": 0.9030486130184016, + "grad_norm": 1.269652009010315, + "learning_rate": 4.7395063189350265e-06, + "loss": 0.6826, + "step": 3288 + }, + { + "epoch": 0.9033232628398792, + "grad_norm": 1.21947181224823, + "learning_rate": 4.739345773167406e-06, + "loss": 0.6958, + "step": 3289 + }, + { + "epoch": 0.9035979126613568, + "grad_norm": 1.1684547662734985, + "learning_rate": 4.739185180662646e-06, + "loss": 0.6068, + "step": 3290 + }, + { + "epoch": 0.9038725624828344, + "grad_norm": 1.1824017763137817, + "learning_rate": 4.7390245414240965e-06, + "loss": 0.584, + "step": 3291 + }, + { + "epoch": 0.904147212304312, + "grad_norm": 1.1952911615371704, + "learning_rate": 4.738863855455112e-06, + "loss": 0.6552, + "step": 3292 + }, + { + "epoch": 0.9044218621257896, + "grad_norm": 1.310860276222229, + "learning_rate": 4.738703122759045e-06, + "loss": 0.6967, + "step": 3293 + }, + { + "epoch": 0.9046965119472672, + "grad_norm": 1.2444058656692505, + "learning_rate": 4.738542343339251e-06, + "loss": 0.6468, + "step": 3294 + }, + { + "epoch": 0.9049711617687448, + "grad_norm": 1.1714537143707275, + "learning_rate": 4.738381517199084e-06, + "loss": 0.6144, + "step": 3295 + }, + { + "epoch": 0.9052458115902224, + "grad_norm": 1.2112292051315308, + "learning_rate": 4.738220644341903e-06, + "loss": 0.6405, + "step": 3296 + }, + { + "epoch": 0.9055204614117001, + "grad_norm": 1.264083981513977, + "learning_rate": 4.738059724771064e-06, + "loss": 0.6305, + "step": 3297 + }, + { + "epoch": 0.9057951112331777, + "grad_norm": 1.2958228588104248, + "learning_rate": 4.737898758489925e-06, + "loss": 0.623, + "step": 3298 + }, + { + "epoch": 0.9060697610546553, + "grad_norm": 1.2039716243743896, + "learning_rate": 4.737737745501847e-06, + "loss": 0.6704, + "step": 3299 + }, + { + "epoch": 0.9063444108761329, + "grad_norm": 1.175504207611084, + "learning_rate": 4.737576685810189e-06, + "loss": 0.6582, + "step": 3300 + }, + { + "epoch": 0.9066190606976106, + "grad_norm": 1.2469818592071533, + "learning_rate": 4.7374155794183145e-06, + "loss": 0.601, + "step": 3301 + }, + { + "epoch": 0.9068937105190882, + "grad_norm": 1.2496552467346191, + "learning_rate": 4.737254426329584e-06, + "loss": 0.6276, + "step": 3302 + }, + { + "epoch": 0.9071683603405658, + "grad_norm": 1.1889533996582031, + "learning_rate": 4.737093226547361e-06, + "loss": 0.6529, + "step": 3303 + }, + { + "epoch": 0.9074430101620434, + "grad_norm": 1.2326579093933105, + "learning_rate": 4.736931980075011e-06, + "loss": 0.6843, + "step": 3304 + }, + { + "epoch": 0.907717659983521, + "grad_norm": 1.1805305480957031, + "learning_rate": 4.736770686915899e-06, + "loss": 0.6228, + "step": 3305 + }, + { + "epoch": 0.9079923098049987, + "grad_norm": 1.164962887763977, + "learning_rate": 4.736609347073391e-06, + "loss": 0.5871, + "step": 3306 + }, + { + "epoch": 0.9082669596264762, + "grad_norm": 1.1963015794754028, + "learning_rate": 4.736447960550855e-06, + "loss": 0.6241, + "step": 3307 + }, + { + "epoch": 0.9085416094479538, + "grad_norm": 1.226948618888855, + "learning_rate": 4.736286527351658e-06, + "loss": 0.6427, + "step": 3308 + }, + { + "epoch": 0.9088162592694314, + "grad_norm": 1.2357860803604126, + "learning_rate": 4.73612504747917e-06, + "loss": 0.6467, + "step": 3309 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.1557937860488892, + "learning_rate": 4.7359635209367616e-06, + "loss": 0.5994, + "step": 3310 + }, + { + "epoch": 0.9093655589123867, + "grad_norm": 1.191800832748413, + "learning_rate": 4.735801947727804e-06, + "loss": 0.6935, + "step": 3311 + }, + { + "epoch": 0.9096402087338643, + "grad_norm": 1.2956902980804443, + "learning_rate": 4.735640327855669e-06, + "loss": 0.6335, + "step": 3312 + }, + { + "epoch": 0.9099148585553419, + "grad_norm": 1.2298846244812012, + "learning_rate": 4.7354786613237294e-06, + "loss": 0.6797, + "step": 3313 + }, + { + "epoch": 0.9101895083768196, + "grad_norm": 1.2666711807250977, + "learning_rate": 4.735316948135359e-06, + "loss": 0.6704, + "step": 3314 + }, + { + "epoch": 0.9104641581982972, + "grad_norm": 1.2041577100753784, + "learning_rate": 4.735155188293934e-06, + "loss": 0.6386, + "step": 3315 + }, + { + "epoch": 0.9107388080197748, + "grad_norm": 1.1734966039657593, + "learning_rate": 4.7349933818028305e-06, + "loss": 0.6559, + "step": 3316 + }, + { + "epoch": 0.9110134578412524, + "grad_norm": 1.1735032796859741, + "learning_rate": 4.734831528665425e-06, + "loss": 0.6539, + "step": 3317 + }, + { + "epoch": 0.91128810766273, + "grad_norm": 1.2193413972854614, + "learning_rate": 4.734669628885095e-06, + "loss": 0.634, + "step": 3318 + }, + { + "epoch": 0.9115627574842077, + "grad_norm": 1.2175984382629395, + "learning_rate": 4.734507682465222e-06, + "loss": 0.6985, + "step": 3319 + }, + { + "epoch": 0.9118374073056853, + "grad_norm": 1.1821956634521484, + "learning_rate": 4.734345689409182e-06, + "loss": 0.6415, + "step": 3320 + }, + { + "epoch": 0.9121120571271628, + "grad_norm": 1.2269189357757568, + "learning_rate": 4.734183649720359e-06, + "loss": 0.6421, + "step": 3321 + }, + { + "epoch": 0.9123867069486404, + "grad_norm": 1.3192476034164429, + "learning_rate": 4.7340215634021334e-06, + "loss": 0.6863, + "step": 3322 + }, + { + "epoch": 0.9126613567701181, + "grad_norm": 1.1839925050735474, + "learning_rate": 4.733859430457889e-06, + "loss": 0.6202, + "step": 3323 + }, + { + "epoch": 0.9129360065915957, + "grad_norm": 1.1641578674316406, + "learning_rate": 4.733697250891009e-06, + "loss": 0.5897, + "step": 3324 + }, + { + "epoch": 0.9132106564130733, + "grad_norm": 1.3024643659591675, + "learning_rate": 4.733535024704879e-06, + "loss": 0.6552, + "step": 3325 + }, + { + "epoch": 0.9134853062345509, + "grad_norm": 1.1944773197174072, + "learning_rate": 4.733372751902884e-06, + "loss": 0.6327, + "step": 3326 + }, + { + "epoch": 0.9137599560560286, + "grad_norm": 1.1654354333877563, + "learning_rate": 4.733210432488411e-06, + "loss": 0.5953, + "step": 3327 + }, + { + "epoch": 0.9140346058775062, + "grad_norm": 1.1810640096664429, + "learning_rate": 4.733048066464848e-06, + "loss": 0.6296, + "step": 3328 + }, + { + "epoch": 0.9143092556989838, + "grad_norm": 1.2034618854522705, + "learning_rate": 4.732885653835584e-06, + "loss": 0.6283, + "step": 3329 + }, + { + "epoch": 0.9145839055204614, + "grad_norm": 1.2485102415084839, + "learning_rate": 4.732723194604008e-06, + "loss": 0.6889, + "step": 3330 + }, + { + "epoch": 0.914858555341939, + "grad_norm": 1.1412485837936401, + "learning_rate": 4.7325606887735096e-06, + "loss": 0.6239, + "step": 3331 + }, + { + "epoch": 0.9151332051634167, + "grad_norm": 1.2666511535644531, + "learning_rate": 4.732398136347483e-06, + "loss": 0.6317, + "step": 3332 + }, + { + "epoch": 0.9154078549848943, + "grad_norm": 1.2180017232894897, + "learning_rate": 4.73223553732932e-06, + "loss": 0.6495, + "step": 3333 + }, + { + "epoch": 0.9156825048063719, + "grad_norm": 1.2905066013336182, + "learning_rate": 4.732072891722413e-06, + "loss": 0.6707, + "step": 3334 + }, + { + "epoch": 0.9159571546278495, + "grad_norm": 1.2537615299224854, + "learning_rate": 4.731910199530158e-06, + "loss": 0.6676, + "step": 3335 + }, + { + "epoch": 0.9162318044493271, + "grad_norm": 1.146183967590332, + "learning_rate": 4.7317474607559495e-06, + "loss": 0.594, + "step": 3336 + }, + { + "epoch": 0.9165064542708047, + "grad_norm": 1.235552430152893, + "learning_rate": 4.7315846754031844e-06, + "loss": 0.6791, + "step": 3337 + }, + { + "epoch": 0.9167811040922823, + "grad_norm": 1.285744071006775, + "learning_rate": 4.73142184347526e-06, + "loss": 0.6475, + "step": 3338 + }, + { + "epoch": 0.9170557539137599, + "grad_norm": 1.1992052793502808, + "learning_rate": 4.731258964975575e-06, + "loss": 0.6733, + "step": 3339 + }, + { + "epoch": 0.9173304037352376, + "grad_norm": 1.2048259973526, + "learning_rate": 4.731096039907529e-06, + "loss": 0.6389, + "step": 3340 + }, + { + "epoch": 0.9176050535567152, + "grad_norm": 1.1461082696914673, + "learning_rate": 4.730933068274522e-06, + "loss": 0.6213, + "step": 3341 + }, + { + "epoch": 0.9178797033781928, + "grad_norm": 1.2467976808547974, + "learning_rate": 4.730770050079955e-06, + "loss": 0.6599, + "step": 3342 + }, + { + "epoch": 0.9181543531996704, + "grad_norm": 1.223556399345398, + "learning_rate": 4.73060698532723e-06, + "loss": 0.6951, + "step": 3343 + }, + { + "epoch": 0.918429003021148, + "grad_norm": 1.1764570474624634, + "learning_rate": 4.730443874019753e-06, + "loss": 0.6224, + "step": 3344 + }, + { + "epoch": 0.9187036528426257, + "grad_norm": 1.2771332263946533, + "learning_rate": 4.7302807161609254e-06, + "loss": 0.6386, + "step": 3345 + }, + { + "epoch": 0.9189783026641033, + "grad_norm": 1.1635290384292603, + "learning_rate": 4.7301175117541544e-06, + "loss": 0.655, + "step": 3346 + }, + { + "epoch": 0.9192529524855809, + "grad_norm": 1.2523891925811768, + "learning_rate": 4.729954260802843e-06, + "loss": 0.6288, + "step": 3347 + }, + { + "epoch": 0.9195276023070585, + "grad_norm": 1.2810609340667725, + "learning_rate": 4.729790963310403e-06, + "loss": 0.6596, + "step": 3348 + }, + { + "epoch": 0.9198022521285362, + "grad_norm": 1.2154881954193115, + "learning_rate": 4.729627619280239e-06, + "loss": 0.6302, + "step": 3349 + }, + { + "epoch": 0.9200769019500137, + "grad_norm": 1.2944589853286743, + "learning_rate": 4.729464228715762e-06, + "loss": 0.6945, + "step": 3350 + }, + { + "epoch": 0.9203515517714913, + "grad_norm": 1.2318041324615479, + "learning_rate": 4.729300791620381e-06, + "loss": 0.6757, + "step": 3351 + }, + { + "epoch": 0.9206262015929689, + "grad_norm": 1.2096670866012573, + "learning_rate": 4.729137307997508e-06, + "loss": 0.6494, + "step": 3352 + }, + { + "epoch": 0.9209008514144466, + "grad_norm": 1.1881980895996094, + "learning_rate": 4.728973777850554e-06, + "loss": 0.6381, + "step": 3353 + }, + { + "epoch": 0.9211755012359242, + "grad_norm": 1.1626616716384888, + "learning_rate": 4.728810201182933e-06, + "loss": 0.6101, + "step": 3354 + }, + { + "epoch": 0.9214501510574018, + "grad_norm": 1.2054380178451538, + "learning_rate": 4.728646577998059e-06, + "loss": 0.6884, + "step": 3355 + }, + { + "epoch": 0.9217248008788794, + "grad_norm": 1.231203317642212, + "learning_rate": 4.728482908299346e-06, + "loss": 0.6524, + "step": 3356 + }, + { + "epoch": 0.921999450700357, + "grad_norm": 1.2315783500671387, + "learning_rate": 4.72831919209021e-06, + "loss": 0.6548, + "step": 3357 + }, + { + "epoch": 0.9222741005218347, + "grad_norm": 1.1598831415176392, + "learning_rate": 4.728155429374069e-06, + "loss": 0.6005, + "step": 3358 + }, + { + "epoch": 0.9225487503433123, + "grad_norm": 1.1539400815963745, + "learning_rate": 4.7279916201543395e-06, + "loss": 0.6249, + "step": 3359 + }, + { + "epoch": 0.9228234001647899, + "grad_norm": 1.1905007362365723, + "learning_rate": 4.7278277644344426e-06, + "loss": 0.6048, + "step": 3360 + }, + { + "epoch": 0.9230980499862675, + "grad_norm": 1.2108032703399658, + "learning_rate": 4.727663862217795e-06, + "loss": 0.669, + "step": 3361 + }, + { + "epoch": 0.9233726998077452, + "grad_norm": 1.2166707515716553, + "learning_rate": 4.72749991350782e-06, + "loss": 0.6124, + "step": 3362 + }, + { + "epoch": 0.9236473496292228, + "grad_norm": 1.2127670049667358, + "learning_rate": 4.7273359183079385e-06, + "loss": 0.6654, + "step": 3363 + }, + { + "epoch": 0.9239219994507003, + "grad_norm": 1.1827402114868164, + "learning_rate": 4.727171876621573e-06, + "loss": 0.6621, + "step": 3364 + }, + { + "epoch": 0.9241966492721779, + "grad_norm": 1.260708212852478, + "learning_rate": 4.7270077884521476e-06, + "loss": 0.6342, + "step": 3365 + }, + { + "epoch": 0.9244712990936556, + "grad_norm": 1.309738278388977, + "learning_rate": 4.726843653803086e-06, + "loss": 0.7045, + "step": 3366 + }, + { + "epoch": 0.9247459489151332, + "grad_norm": 1.2072808742523193, + "learning_rate": 4.726679472677815e-06, + "loss": 0.6517, + "step": 3367 + }, + { + "epoch": 0.9250205987366108, + "grad_norm": 1.176601529121399, + "learning_rate": 4.726515245079761e-06, + "loss": 0.617, + "step": 3368 + }, + { + "epoch": 0.9252952485580884, + "grad_norm": 1.1358445882797241, + "learning_rate": 4.726350971012353e-06, + "loss": 0.6204, + "step": 3369 + }, + { + "epoch": 0.925569898379566, + "grad_norm": 1.1549413204193115, + "learning_rate": 4.726186650479015e-06, + "loss": 0.6211, + "step": 3370 + }, + { + "epoch": 0.9258445482010437, + "grad_norm": 1.2606345415115356, + "learning_rate": 4.72602228348318e-06, + "loss": 0.6932, + "step": 3371 + }, + { + "epoch": 0.9261191980225213, + "grad_norm": 1.2289155721664429, + "learning_rate": 4.725857870028279e-06, + "loss": 0.6539, + "step": 3372 + }, + { + "epoch": 0.9263938478439989, + "grad_norm": 1.2346222400665283, + "learning_rate": 4.725693410117742e-06, + "loss": 0.6773, + "step": 3373 + }, + { + "epoch": 0.9266684976654765, + "grad_norm": 1.1449462175369263, + "learning_rate": 4.725528903755002e-06, + "loss": 0.6473, + "step": 3374 + }, + { + "epoch": 0.9269431474869542, + "grad_norm": 1.2370164394378662, + "learning_rate": 4.725364350943492e-06, + "loss": 0.6591, + "step": 3375 + }, + { + "epoch": 0.9272177973084318, + "grad_norm": 1.2285922765731812, + "learning_rate": 4.725199751686647e-06, + "loss": 0.6267, + "step": 3376 + }, + { + "epoch": 0.9274924471299094, + "grad_norm": 1.2621729373931885, + "learning_rate": 4.725035105987901e-06, + "loss": 0.6769, + "step": 3377 + }, + { + "epoch": 0.9277670969513869, + "grad_norm": 1.2038602828979492, + "learning_rate": 4.724870413850691e-06, + "loss": 0.6319, + "step": 3378 + }, + { + "epoch": 0.9280417467728646, + "grad_norm": 1.221264362335205, + "learning_rate": 4.7247056752784555e-06, + "loss": 0.5799, + "step": 3379 + }, + { + "epoch": 0.9283163965943422, + "grad_norm": 1.252919316291809, + "learning_rate": 4.724540890274631e-06, + "loss": 0.6592, + "step": 3380 + }, + { + "epoch": 0.9285910464158198, + "grad_norm": 1.2024648189544678, + "learning_rate": 4.724376058842658e-06, + "loss": 0.5983, + "step": 3381 + }, + { + "epoch": 0.9288656962372974, + "grad_norm": 1.2941291332244873, + "learning_rate": 4.724211180985975e-06, + "loss": 0.678, + "step": 3382 + }, + { + "epoch": 0.929140346058775, + "grad_norm": 1.1544336080551147, + "learning_rate": 4.724046256708026e-06, + "loss": 0.6404, + "step": 3383 + }, + { + "epoch": 0.9294149958802527, + "grad_norm": 1.146309494972229, + "learning_rate": 4.72388128601225e-06, + "loss": 0.6718, + "step": 3384 + }, + { + "epoch": 0.9296896457017303, + "grad_norm": 1.165910005569458, + "learning_rate": 4.723716268902091e-06, + "loss": 0.6062, + "step": 3385 + }, + { + "epoch": 0.9299642955232079, + "grad_norm": 1.1886464357376099, + "learning_rate": 4.723551205380995e-06, + "loss": 0.6277, + "step": 3386 + }, + { + "epoch": 0.9302389453446855, + "grad_norm": 1.2176129817962646, + "learning_rate": 4.723386095452405e-06, + "loss": 0.6461, + "step": 3387 + }, + { + "epoch": 0.9305135951661632, + "grad_norm": 1.2004435062408447, + "learning_rate": 4.723220939119767e-06, + "loss": 0.6338, + "step": 3388 + }, + { + "epoch": 0.9307882449876408, + "grad_norm": 1.2468947172164917, + "learning_rate": 4.723055736386528e-06, + "loss": 0.621, + "step": 3389 + }, + { + "epoch": 0.9310628948091184, + "grad_norm": 1.1752002239227295, + "learning_rate": 4.722890487256138e-06, + "loss": 0.6391, + "step": 3390 + }, + { + "epoch": 0.931337544630596, + "grad_norm": 1.2277220487594604, + "learning_rate": 4.722725191732042e-06, + "loss": 0.6616, + "step": 3391 + }, + { + "epoch": 0.9316121944520737, + "grad_norm": 1.2123576402664185, + "learning_rate": 4.722559849817695e-06, + "loss": 0.6268, + "step": 3392 + }, + { + "epoch": 0.9318868442735512, + "grad_norm": 1.2843072414398193, + "learning_rate": 4.722394461516543e-06, + "loss": 0.6387, + "step": 3393 + }, + { + "epoch": 0.9321614940950288, + "grad_norm": 1.1686700582504272, + "learning_rate": 4.72222902683204e-06, + "loss": 0.6827, + "step": 3394 + }, + { + "epoch": 0.9324361439165064, + "grad_norm": 1.2437337636947632, + "learning_rate": 4.722063545767639e-06, + "loss": 0.6601, + "step": 3395 + }, + { + "epoch": 0.932710793737984, + "grad_norm": 1.1848821640014648, + "learning_rate": 4.721898018326794e-06, + "loss": 0.63, + "step": 3396 + }, + { + "epoch": 0.9329854435594617, + "grad_norm": 1.2922289371490479, + "learning_rate": 4.7217324445129585e-06, + "loss": 0.6496, + "step": 3397 + }, + { + "epoch": 0.9332600933809393, + "grad_norm": 1.2368274927139282, + "learning_rate": 4.721566824329588e-06, + "loss": 0.6351, + "step": 3398 + }, + { + "epoch": 0.9335347432024169, + "grad_norm": 1.2076274156570435, + "learning_rate": 4.72140115778014e-06, + "loss": 0.6057, + "step": 3399 + }, + { + "epoch": 0.9338093930238945, + "grad_norm": 1.1669716835021973, + "learning_rate": 4.721235444868072e-06, + "loss": 0.6539, + "step": 3400 + }, + { + "epoch": 0.9340840428453722, + "grad_norm": 1.1340528726577759, + "learning_rate": 4.721069685596843e-06, + "loss": 0.6184, + "step": 3401 + }, + { + "epoch": 0.9343586926668498, + "grad_norm": 1.1443041563034058, + "learning_rate": 4.720903879969911e-06, + "loss": 0.6414, + "step": 3402 + }, + { + "epoch": 0.9346333424883274, + "grad_norm": 1.239579439163208, + "learning_rate": 4.720738027990739e-06, + "loss": 0.6985, + "step": 3403 + }, + { + "epoch": 0.934907992309805, + "grad_norm": 1.1525670289993286, + "learning_rate": 4.720572129662786e-06, + "loss": 0.6243, + "step": 3404 + }, + { + "epoch": 0.9351826421312827, + "grad_norm": 1.173326015472412, + "learning_rate": 4.720406184989516e-06, + "loss": 0.6186, + "step": 3405 + }, + { + "epoch": 0.9354572919527603, + "grad_norm": 1.1277180910110474, + "learning_rate": 4.720240193974392e-06, + "loss": 0.5899, + "step": 3406 + }, + { + "epoch": 0.9357319417742378, + "grad_norm": 1.2533860206604004, + "learning_rate": 4.7200741566208775e-06, + "loss": 0.6418, + "step": 3407 + }, + { + "epoch": 0.9360065915957154, + "grad_norm": 1.3133188486099243, + "learning_rate": 4.719908072932438e-06, + "loss": 0.6519, + "step": 3408 + }, + { + "epoch": 0.936281241417193, + "grad_norm": 1.2379558086395264, + "learning_rate": 4.719741942912542e-06, + "loss": 0.646, + "step": 3409 + }, + { + "epoch": 0.9365558912386707, + "grad_norm": 1.1802151203155518, + "learning_rate": 4.719575766564655e-06, + "loss": 0.6123, + "step": 3410 + }, + { + "epoch": 0.9368305410601483, + "grad_norm": 1.166155457496643, + "learning_rate": 4.7194095438922455e-06, + "loss": 0.6447, + "step": 3411 + }, + { + "epoch": 0.9371051908816259, + "grad_norm": 1.206767201423645, + "learning_rate": 4.719243274898783e-06, + "loss": 0.6836, + "step": 3412 + }, + { + "epoch": 0.9373798407031035, + "grad_norm": 1.2699425220489502, + "learning_rate": 4.719076959587736e-06, + "loss": 0.6534, + "step": 3413 + }, + { + "epoch": 0.9376544905245812, + "grad_norm": 1.2340337038040161, + "learning_rate": 4.718910597962578e-06, + "loss": 0.6599, + "step": 3414 + }, + { + "epoch": 0.9379291403460588, + "grad_norm": 1.2079087495803833, + "learning_rate": 4.71874419002678e-06, + "loss": 0.6327, + "step": 3415 + }, + { + "epoch": 0.9382037901675364, + "grad_norm": 1.1515933275222778, + "learning_rate": 4.718577735783815e-06, + "loss": 0.6072, + "step": 3416 + }, + { + "epoch": 0.938478439989014, + "grad_norm": 1.2794970273971558, + "learning_rate": 4.718411235237157e-06, + "loss": 0.6512, + "step": 3417 + }, + { + "epoch": 0.9387530898104917, + "grad_norm": 1.2489773035049438, + "learning_rate": 4.718244688390281e-06, + "loss": 0.6707, + "step": 3418 + }, + { + "epoch": 0.9390277396319693, + "grad_norm": 1.113620400428772, + "learning_rate": 4.718078095246664e-06, + "loss": 0.5981, + "step": 3419 + }, + { + "epoch": 0.9393023894534469, + "grad_norm": 1.2410790920257568, + "learning_rate": 4.717911455809782e-06, + "loss": 0.6553, + "step": 3420 + }, + { + "epoch": 0.9395770392749244, + "grad_norm": 1.1784037351608276, + "learning_rate": 4.7177447700831136e-06, + "loss": 0.5842, + "step": 3421 + }, + { + "epoch": 0.939851689096402, + "grad_norm": 1.2588396072387695, + "learning_rate": 4.7175780380701366e-06, + "loss": 0.63, + "step": 3422 + }, + { + "epoch": 0.9401263389178797, + "grad_norm": 1.241640567779541, + "learning_rate": 4.7174112597743315e-06, + "loss": 0.6531, + "step": 3423 + }, + { + "epoch": 0.9404009887393573, + "grad_norm": 1.1901918649673462, + "learning_rate": 4.717244435199179e-06, + "loss": 0.6163, + "step": 3424 + }, + { + "epoch": 0.9406756385608349, + "grad_norm": 1.1859066486358643, + "learning_rate": 4.7170775643481615e-06, + "loss": 0.6621, + "step": 3425 + }, + { + "epoch": 0.9409502883823125, + "grad_norm": 1.2271337509155273, + "learning_rate": 4.716910647224761e-06, + "loss": 0.5725, + "step": 3426 + }, + { + "epoch": 0.9412249382037902, + "grad_norm": 1.205249309539795, + "learning_rate": 4.71674368383246e-06, + "loss": 0.6349, + "step": 3427 + }, + { + "epoch": 0.9414995880252678, + "grad_norm": 1.1861555576324463, + "learning_rate": 4.716576674174747e-06, + "loss": 0.6463, + "step": 3428 + }, + { + "epoch": 0.9417742378467454, + "grad_norm": 1.2437589168548584, + "learning_rate": 4.716409618255104e-06, + "loss": 0.6718, + "step": 3429 + }, + { + "epoch": 0.942048887668223, + "grad_norm": 1.1360713243484497, + "learning_rate": 4.716242516077018e-06, + "loss": 0.6455, + "step": 3430 + }, + { + "epoch": 0.9423235374897007, + "grad_norm": 1.2316755056381226, + "learning_rate": 4.716075367643978e-06, + "loss": 0.6635, + "step": 3431 + }, + { + "epoch": 0.9425981873111783, + "grad_norm": 1.2503108978271484, + "learning_rate": 4.715908172959471e-06, + "loss": 0.6501, + "step": 3432 + }, + { + "epoch": 0.9428728371326559, + "grad_norm": 1.2497085332870483, + "learning_rate": 4.7157409320269884e-06, + "loss": 0.6498, + "step": 3433 + }, + { + "epoch": 0.9431474869541335, + "grad_norm": 1.2081092596054077, + "learning_rate": 4.71557364485002e-06, + "loss": 0.6695, + "step": 3434 + }, + { + "epoch": 0.943422136775611, + "grad_norm": 1.303027868270874, + "learning_rate": 4.715406311432057e-06, + "loss": 0.6482, + "step": 3435 + }, + { + "epoch": 0.9436967865970887, + "grad_norm": 1.2053909301757812, + "learning_rate": 4.715238931776591e-06, + "loss": 0.626, + "step": 3436 + }, + { + "epoch": 0.9439714364185663, + "grad_norm": 1.2444037199020386, + "learning_rate": 4.715071505887116e-06, + "loss": 0.7002, + "step": 3437 + }, + { + "epoch": 0.9442460862400439, + "grad_norm": 1.2068390846252441, + "learning_rate": 4.714904033767127e-06, + "loss": 0.6791, + "step": 3438 + }, + { + "epoch": 0.9445207360615215, + "grad_norm": 1.1975890398025513, + "learning_rate": 4.714736515420118e-06, + "loss": 0.6229, + "step": 3439 + }, + { + "epoch": 0.9447953858829992, + "grad_norm": 1.1779896020889282, + "learning_rate": 4.714568950849586e-06, + "loss": 0.6094, + "step": 3440 + }, + { + "epoch": 0.9450700357044768, + "grad_norm": 1.251038908958435, + "learning_rate": 4.714401340059029e-06, + "loss": 0.6942, + "step": 3441 + }, + { + "epoch": 0.9453446855259544, + "grad_norm": 1.2396414279937744, + "learning_rate": 4.7142336830519435e-06, + "loss": 0.733, + "step": 3442 + }, + { + "epoch": 0.945619335347432, + "grad_norm": 1.302371859550476, + "learning_rate": 4.71406597983183e-06, + "loss": 0.6752, + "step": 3443 + }, + { + "epoch": 0.9458939851689097, + "grad_norm": 1.2069149017333984, + "learning_rate": 4.713898230402188e-06, + "loss": 0.6491, + "step": 3444 + }, + { + "epoch": 0.9461686349903873, + "grad_norm": 1.2156940698623657, + "learning_rate": 4.713730434766519e-06, + "loss": 0.6622, + "step": 3445 + }, + { + "epoch": 0.9464432848118649, + "grad_norm": 1.211964726448059, + "learning_rate": 4.713562592928324e-06, + "loss": 0.6291, + "step": 3446 + }, + { + "epoch": 0.9467179346333425, + "grad_norm": 1.1631731986999512, + "learning_rate": 4.713394704891107e-06, + "loss": 0.6197, + "step": 3447 + }, + { + "epoch": 0.9469925844548202, + "grad_norm": 1.2102649211883545, + "learning_rate": 4.713226770658372e-06, + "loss": 0.6685, + "step": 3448 + }, + { + "epoch": 0.9472672342762977, + "grad_norm": 1.2227662801742554, + "learning_rate": 4.713058790233623e-06, + "loss": 0.6947, + "step": 3449 + }, + { + "epoch": 0.9475418840977753, + "grad_norm": 1.2321763038635254, + "learning_rate": 4.712890763620368e-06, + "loss": 0.6836, + "step": 3450 + }, + { + "epoch": 0.9478165339192529, + "grad_norm": 1.2143627405166626, + "learning_rate": 4.712722690822111e-06, + "loss": 0.6461, + "step": 3451 + }, + { + "epoch": 0.9480911837407305, + "grad_norm": 1.2430148124694824, + "learning_rate": 4.7125545718423625e-06, + "loss": 0.6732, + "step": 3452 + }, + { + "epoch": 0.9483658335622082, + "grad_norm": 1.2977855205535889, + "learning_rate": 4.71238640668463e-06, + "loss": 0.6348, + "step": 3453 + }, + { + "epoch": 0.9486404833836858, + "grad_norm": 1.1805461645126343, + "learning_rate": 4.712218195352421e-06, + "loss": 0.6028, + "step": 3454 + }, + { + "epoch": 0.9489151332051634, + "grad_norm": 1.1982643604278564, + "learning_rate": 4.712049937849251e-06, + "loss": 0.6299, + "step": 3455 + }, + { + "epoch": 0.949189783026641, + "grad_norm": 1.3067811727523804, + "learning_rate": 4.711881634178628e-06, + "loss": 0.659, + "step": 3456 + }, + { + "epoch": 0.9494644328481187, + "grad_norm": 1.204461693763733, + "learning_rate": 4.711713284344066e-06, + "loss": 0.6468, + "step": 3457 + }, + { + "epoch": 0.9497390826695963, + "grad_norm": 1.2797666788101196, + "learning_rate": 4.711544888349079e-06, + "loss": 0.6632, + "step": 3458 + }, + { + "epoch": 0.9500137324910739, + "grad_norm": 1.1899386644363403, + "learning_rate": 4.711376446197181e-06, + "loss": 0.6309, + "step": 3459 + }, + { + "epoch": 0.9502883823125515, + "grad_norm": 1.2262940406799316, + "learning_rate": 4.711207957891887e-06, + "loss": 0.6397, + "step": 3460 + }, + { + "epoch": 0.9505630321340292, + "grad_norm": 1.2557927370071411, + "learning_rate": 4.711039423436714e-06, + "loss": 0.6888, + "step": 3461 + }, + { + "epoch": 0.9508376819555068, + "grad_norm": 1.2803071737289429, + "learning_rate": 4.71087084283518e-06, + "loss": 0.682, + "step": 3462 + }, + { + "epoch": 0.9511123317769844, + "grad_norm": 1.1599842309951782, + "learning_rate": 4.710702216090804e-06, + "loss": 0.6285, + "step": 3463 + }, + { + "epoch": 0.9513869815984619, + "grad_norm": 1.1830108165740967, + "learning_rate": 4.710533543207102e-06, + "loss": 0.6408, + "step": 3464 + }, + { + "epoch": 0.9516616314199395, + "grad_norm": 1.2033802270889282, + "learning_rate": 4.7103648241875984e-06, + "loss": 0.6015, + "step": 3465 + }, + { + "epoch": 0.9519362812414172, + "grad_norm": 1.256394624710083, + "learning_rate": 4.710196059035812e-06, + "loss": 0.7258, + "step": 3466 + }, + { + "epoch": 0.9522109310628948, + "grad_norm": 1.2584211826324463, + "learning_rate": 4.710027247755266e-06, + "loss": 0.6625, + "step": 3467 + }, + { + "epoch": 0.9524855808843724, + "grad_norm": 1.2241586446762085, + "learning_rate": 4.7098583903494846e-06, + "loss": 0.6363, + "step": 3468 + }, + { + "epoch": 0.95276023070585, + "grad_norm": 1.2050907611846924, + "learning_rate": 4.709689486821991e-06, + "loss": 0.6477, + "step": 3469 + }, + { + "epoch": 0.9530348805273277, + "grad_norm": 1.210800051689148, + "learning_rate": 4.709520537176309e-06, + "loss": 0.6683, + "step": 3470 + }, + { + "epoch": 0.9533095303488053, + "grad_norm": 1.2416422367095947, + "learning_rate": 4.7093515414159665e-06, + "loss": 0.5931, + "step": 3471 + }, + { + "epoch": 0.9535841801702829, + "grad_norm": 1.206438660621643, + "learning_rate": 4.7091824995444905e-06, + "loss": 0.6101, + "step": 3472 + }, + { + "epoch": 0.9538588299917605, + "grad_norm": 1.2351220846176147, + "learning_rate": 4.709013411565409e-06, + "loss": 0.6263, + "step": 3473 + }, + { + "epoch": 0.9541334798132382, + "grad_norm": 1.2429038286209106, + "learning_rate": 4.70884427748225e-06, + "loss": 0.682, + "step": 3474 + }, + { + "epoch": 0.9544081296347158, + "grad_norm": 1.1100518703460693, + "learning_rate": 4.708675097298544e-06, + "loss": 0.5968, + "step": 3475 + }, + { + "epoch": 0.9546827794561934, + "grad_norm": 1.2964025735855103, + "learning_rate": 4.708505871017823e-06, + "loss": 0.6927, + "step": 3476 + }, + { + "epoch": 0.954957429277671, + "grad_norm": 1.1657687425613403, + "learning_rate": 4.7083365986436174e-06, + "loss": 0.6336, + "step": 3477 + }, + { + "epoch": 0.9552320790991485, + "grad_norm": 1.2022286653518677, + "learning_rate": 4.708167280179461e-06, + "loss": 0.6549, + "step": 3478 + }, + { + "epoch": 0.9555067289206262, + "grad_norm": 1.1321518421173096, + "learning_rate": 4.707997915628887e-06, + "loss": 0.631, + "step": 3479 + }, + { + "epoch": 0.9557813787421038, + "grad_norm": 1.2224246263504028, + "learning_rate": 4.707828504995431e-06, + "loss": 0.6277, + "step": 3480 + }, + { + "epoch": 0.9560560285635814, + "grad_norm": 1.2777113914489746, + "learning_rate": 4.707659048282628e-06, + "loss": 0.6532, + "step": 3481 + }, + { + "epoch": 0.956330678385059, + "grad_norm": 1.2143696546554565, + "learning_rate": 4.707489545494015e-06, + "loss": 0.6548, + "step": 3482 + }, + { + "epoch": 0.9566053282065367, + "grad_norm": 1.2380015850067139, + "learning_rate": 4.70731999663313e-06, + "loss": 0.6932, + "step": 3483 + }, + { + "epoch": 0.9568799780280143, + "grad_norm": 1.237418532371521, + "learning_rate": 4.70715040170351e-06, + "loss": 0.6695, + "step": 3484 + }, + { + "epoch": 0.9571546278494919, + "grad_norm": 1.1902186870574951, + "learning_rate": 4.706980760708698e-06, + "loss": 0.6349, + "step": 3485 + }, + { + "epoch": 0.9574292776709695, + "grad_norm": 1.2418519258499146, + "learning_rate": 4.706811073652231e-06, + "loss": 0.631, + "step": 3486 + }, + { + "epoch": 0.9577039274924471, + "grad_norm": 1.1842035055160522, + "learning_rate": 4.706641340537652e-06, + "loss": 0.6291, + "step": 3487 + }, + { + "epoch": 0.9579785773139248, + "grad_norm": 1.2201026678085327, + "learning_rate": 4.7064715613685045e-06, + "loss": 0.6594, + "step": 3488 + }, + { + "epoch": 0.9582532271354024, + "grad_norm": 1.1866848468780518, + "learning_rate": 4.7063017361483295e-06, + "loss": 0.5996, + "step": 3489 + }, + { + "epoch": 0.95852787695688, + "grad_norm": 1.1671572923660278, + "learning_rate": 4.706131864880674e-06, + "loss": 0.6437, + "step": 3490 + }, + { + "epoch": 0.9588025267783576, + "grad_norm": 1.2588392496109009, + "learning_rate": 4.705961947569081e-06, + "loss": 0.6549, + "step": 3491 + }, + { + "epoch": 0.9590771765998352, + "grad_norm": 1.1730564832687378, + "learning_rate": 4.705791984217098e-06, + "loss": 0.6531, + "step": 3492 + }, + { + "epoch": 0.9593518264213128, + "grad_norm": 1.2654051780700684, + "learning_rate": 4.7056219748282735e-06, + "loss": 0.6862, + "step": 3493 + }, + { + "epoch": 0.9596264762427904, + "grad_norm": 1.2059543132781982, + "learning_rate": 4.705451919406155e-06, + "loss": 0.6352, + "step": 3494 + }, + { + "epoch": 0.959901126064268, + "grad_norm": 1.2290771007537842, + "learning_rate": 4.7052818179542894e-06, + "loss": 0.6973, + "step": 3495 + }, + { + "epoch": 0.9601757758857457, + "grad_norm": 1.266959547996521, + "learning_rate": 4.705111670476229e-06, + "loss": 0.6386, + "step": 3496 + }, + { + "epoch": 0.9604504257072233, + "grad_norm": 1.19248366355896, + "learning_rate": 4.704941476975525e-06, + "loss": 0.6205, + "step": 3497 + }, + { + "epoch": 0.9607250755287009, + "grad_norm": 1.2123160362243652, + "learning_rate": 4.704771237455729e-06, + "loss": 0.5801, + "step": 3498 + }, + { + "epoch": 0.9609997253501785, + "grad_norm": 1.2344917058944702, + "learning_rate": 4.704600951920395e-06, + "loss": 0.6581, + "step": 3499 + }, + { + "epoch": 0.9612743751716561, + "grad_norm": 1.2397671937942505, + "learning_rate": 4.704430620373075e-06, + "loss": 0.6506, + "step": 3500 + }, + { + "epoch": 0.9615490249931338, + "grad_norm": 1.1939283609390259, + "learning_rate": 4.704260242817325e-06, + "loss": 0.6666, + "step": 3501 + }, + { + "epoch": 0.9618236748146114, + "grad_norm": 1.2546193599700928, + "learning_rate": 4.704089819256703e-06, + "loss": 0.6742, + "step": 3502 + }, + { + "epoch": 0.962098324636089, + "grad_norm": 1.2269984483718872, + "learning_rate": 4.7039193496947624e-06, + "loss": 0.6416, + "step": 3503 + }, + { + "epoch": 0.9623729744575666, + "grad_norm": 1.2310396432876587, + "learning_rate": 4.703748834135062e-06, + "loss": 0.6675, + "step": 3504 + }, + { + "epoch": 0.9626476242790443, + "grad_norm": 1.1254775524139404, + "learning_rate": 4.703578272581163e-06, + "loss": 0.6481, + "step": 3505 + }, + { + "epoch": 0.9629222741005218, + "grad_norm": 1.2074267864227295, + "learning_rate": 4.703407665036622e-06, + "loss": 0.6321, + "step": 3506 + }, + { + "epoch": 0.9631969239219994, + "grad_norm": 1.2337595224380493, + "learning_rate": 4.703237011505002e-06, + "loss": 0.656, + "step": 3507 + }, + { + "epoch": 0.963471573743477, + "grad_norm": 1.2387075424194336, + "learning_rate": 4.703066311989863e-06, + "loss": 0.6804, + "step": 3508 + }, + { + "epoch": 0.9637462235649547, + "grad_norm": 1.224850058555603, + "learning_rate": 4.7028955664947694e-06, + "loss": 0.6359, + "step": 3509 + }, + { + "epoch": 0.9640208733864323, + "grad_norm": 1.246777892112732, + "learning_rate": 4.702724775023284e-06, + "loss": 0.6907, + "step": 3510 + }, + { + "epoch": 0.9642955232079099, + "grad_norm": 1.2576322555541992, + "learning_rate": 4.7025539375789705e-06, + "loss": 0.7105, + "step": 3511 + }, + { + "epoch": 0.9645701730293875, + "grad_norm": 1.1948013305664062, + "learning_rate": 4.702383054165396e-06, + "loss": 0.6541, + "step": 3512 + }, + { + "epoch": 0.9648448228508651, + "grad_norm": 1.1008268594741821, + "learning_rate": 4.702212124786125e-06, + "loss": 0.5972, + "step": 3513 + }, + { + "epoch": 0.9651194726723428, + "grad_norm": 1.169193983078003, + "learning_rate": 4.702041149444727e-06, + "loss": 0.6333, + "step": 3514 + }, + { + "epoch": 0.9653941224938204, + "grad_norm": 1.1267778873443604, + "learning_rate": 4.70187012814477e-06, + "loss": 0.6287, + "step": 3515 + }, + { + "epoch": 0.965668772315298, + "grad_norm": 1.2137783765792847, + "learning_rate": 4.7016990608898215e-06, + "loss": 0.6658, + "step": 3516 + }, + { + "epoch": 0.9659434221367756, + "grad_norm": 1.1707870960235596, + "learning_rate": 4.7015279476834544e-06, + "loss": 0.6738, + "step": 3517 + }, + { + "epoch": 0.9662180719582533, + "grad_norm": 1.1859707832336426, + "learning_rate": 4.701356788529238e-06, + "loss": 0.6296, + "step": 3518 + }, + { + "epoch": 0.9664927217797309, + "grad_norm": 1.199833631515503, + "learning_rate": 4.701185583430747e-06, + "loss": 0.6336, + "step": 3519 + }, + { + "epoch": 0.9667673716012085, + "grad_norm": 1.2593847513198853, + "learning_rate": 4.701014332391551e-06, + "loss": 0.6366, + "step": 3520 + }, + { + "epoch": 0.967042021422686, + "grad_norm": 1.192130208015442, + "learning_rate": 4.700843035415227e-06, + "loss": 0.627, + "step": 3521 + }, + { + "epoch": 0.9673166712441637, + "grad_norm": 1.1925928592681885, + "learning_rate": 4.700671692505348e-06, + "loss": 0.658, + "step": 3522 + }, + { + "epoch": 0.9675913210656413, + "grad_norm": 1.1722140312194824, + "learning_rate": 4.700500303665493e-06, + "loss": 0.6367, + "step": 3523 + }, + { + "epoch": 0.9678659708871189, + "grad_norm": 1.2378544807434082, + "learning_rate": 4.700328868899236e-06, + "loss": 0.6414, + "step": 3524 + }, + { + "epoch": 0.9681406207085965, + "grad_norm": 1.2139688730239868, + "learning_rate": 4.700157388210158e-06, + "loss": 0.6283, + "step": 3525 + }, + { + "epoch": 0.9684152705300741, + "grad_norm": 1.279630422592163, + "learning_rate": 4.699985861601835e-06, + "loss": 0.663, + "step": 3526 + }, + { + "epoch": 0.9686899203515518, + "grad_norm": 1.2480055093765259, + "learning_rate": 4.699814289077849e-06, + "loss": 0.6686, + "step": 3527 + }, + { + "epoch": 0.9689645701730294, + "grad_norm": 1.2140295505523682, + "learning_rate": 4.69964267064178e-06, + "loss": 0.612, + "step": 3528 + }, + { + "epoch": 0.969239219994507, + "grad_norm": 1.2688043117523193, + "learning_rate": 4.699471006297209e-06, + "loss": 0.6494, + "step": 3529 + }, + { + "epoch": 0.9695138698159846, + "grad_norm": 1.2405773401260376, + "learning_rate": 4.69929929604772e-06, + "loss": 0.6341, + "step": 3530 + }, + { + "epoch": 0.9697885196374623, + "grad_norm": 1.2147294282913208, + "learning_rate": 4.6991275398968974e-06, + "loss": 0.6686, + "step": 3531 + }, + { + "epoch": 0.9700631694589399, + "grad_norm": 1.281254529953003, + "learning_rate": 4.698955737848324e-06, + "loss": 0.6881, + "step": 3532 + }, + { + "epoch": 0.9703378192804175, + "grad_norm": 1.2178648710250854, + "learning_rate": 4.698783889905587e-06, + "loss": 0.645, + "step": 3533 + }, + { + "epoch": 0.9706124691018951, + "grad_norm": 1.275679588317871, + "learning_rate": 4.698611996072272e-06, + "loss": 0.6358, + "step": 3534 + }, + { + "epoch": 0.9708871189233726, + "grad_norm": 1.1965566873550415, + "learning_rate": 4.698440056351968e-06, + "loss": 0.6478, + "step": 3535 + }, + { + "epoch": 0.9711617687448503, + "grad_norm": 1.175085425376892, + "learning_rate": 4.698268070748262e-06, + "loss": 0.6551, + "step": 3536 + }, + { + "epoch": 0.9714364185663279, + "grad_norm": 1.1507444381713867, + "learning_rate": 4.698096039264743e-06, + "loss": 0.629, + "step": 3537 + }, + { + "epoch": 0.9717110683878055, + "grad_norm": 1.2637684345245361, + "learning_rate": 4.697923961905003e-06, + "loss": 0.6462, + "step": 3538 + }, + { + "epoch": 0.9719857182092831, + "grad_norm": 1.2055002450942993, + "learning_rate": 4.697751838672634e-06, + "loss": 0.6563, + "step": 3539 + }, + { + "epoch": 0.9722603680307608, + "grad_norm": 1.2520908117294312, + "learning_rate": 4.697579669571227e-06, + "loss": 0.7018, + "step": 3540 + }, + { + "epoch": 0.9725350178522384, + "grad_norm": 1.1962261199951172, + "learning_rate": 4.697407454604374e-06, + "loss": 0.6024, + "step": 3541 + }, + { + "epoch": 0.972809667673716, + "grad_norm": 1.2998093366622925, + "learning_rate": 4.6972351937756736e-06, + "loss": 0.6285, + "step": 3542 + }, + { + "epoch": 0.9730843174951936, + "grad_norm": 1.232661485671997, + "learning_rate": 4.697062887088716e-06, + "loss": 0.6548, + "step": 3543 + }, + { + "epoch": 0.9733589673166713, + "grad_norm": 1.2421823740005493, + "learning_rate": 4.696890534547101e-06, + "loss": 0.6219, + "step": 3544 + }, + { + "epoch": 0.9736336171381489, + "grad_norm": 1.1615954637527466, + "learning_rate": 4.6967181361544234e-06, + "loss": 0.6074, + "step": 3545 + }, + { + "epoch": 0.9739082669596265, + "grad_norm": 1.2622795104980469, + "learning_rate": 4.696545691914283e-06, + "loss": 0.617, + "step": 3546 + }, + { + "epoch": 0.9741829167811041, + "grad_norm": 1.284096360206604, + "learning_rate": 4.696373201830278e-06, + "loss": 0.6495, + "step": 3547 + }, + { + "epoch": 0.9744575666025818, + "grad_norm": 1.2257293462753296, + "learning_rate": 4.696200665906009e-06, + "loss": 0.6789, + "step": 3548 + }, + { + "epoch": 0.9747322164240593, + "grad_norm": 1.197741150856018, + "learning_rate": 4.696028084145076e-06, + "loss": 0.6226, + "step": 3549 + }, + { + "epoch": 0.9750068662455369, + "grad_norm": 1.3051773309707642, + "learning_rate": 4.695855456551082e-06, + "loss": 0.6419, + "step": 3550 + }, + { + "epoch": 0.9752815160670145, + "grad_norm": 1.2142539024353027, + "learning_rate": 4.69568278312763e-06, + "loss": 0.6601, + "step": 3551 + }, + { + "epoch": 0.9755561658884921, + "grad_norm": 1.2502988576889038, + "learning_rate": 4.6955100638783215e-06, + "loss": 0.6951, + "step": 3552 + }, + { + "epoch": 0.9758308157099698, + "grad_norm": 1.303180456161499, + "learning_rate": 4.695337298806764e-06, + "loss": 0.6846, + "step": 3553 + }, + { + "epoch": 0.9761054655314474, + "grad_norm": 1.2316381931304932, + "learning_rate": 4.695164487916563e-06, + "loss": 0.6353, + "step": 3554 + }, + { + "epoch": 0.976380115352925, + "grad_norm": 1.1237258911132812, + "learning_rate": 4.694991631211324e-06, + "loss": 0.5838, + "step": 3555 + }, + { + "epoch": 0.9766547651744026, + "grad_norm": 1.197152018547058, + "learning_rate": 4.6948187286946565e-06, + "loss": 0.6603, + "step": 3556 + }, + { + "epoch": 0.9769294149958803, + "grad_norm": 1.2661492824554443, + "learning_rate": 4.6946457803701665e-06, + "loss": 0.6477, + "step": 3557 + }, + { + "epoch": 0.9772040648173579, + "grad_norm": 1.1872189044952393, + "learning_rate": 4.694472786241465e-06, + "loss": 0.6767, + "step": 3558 + }, + { + "epoch": 0.9774787146388355, + "grad_norm": 1.3384547233581543, + "learning_rate": 4.694299746312163e-06, + "loss": 0.6637, + "step": 3559 + }, + { + "epoch": 0.9777533644603131, + "grad_norm": 1.2112005949020386, + "learning_rate": 4.694126660585872e-06, + "loss": 0.614, + "step": 3560 + }, + { + "epoch": 0.9780280142817908, + "grad_norm": 1.180397391319275, + "learning_rate": 4.693953529066203e-06, + "loss": 0.6181, + "step": 3561 + }, + { + "epoch": 0.9783026641032684, + "grad_norm": 1.2151719331741333, + "learning_rate": 4.693780351756771e-06, + "loss": 0.6382, + "step": 3562 + }, + { + "epoch": 0.9785773139247459, + "grad_norm": 1.2127667665481567, + "learning_rate": 4.6936071286611895e-06, + "loss": 0.6826, + "step": 3563 + }, + { + "epoch": 0.9788519637462235, + "grad_norm": 1.176018238067627, + "learning_rate": 4.693433859783074e-06, + "loss": 0.5739, + "step": 3564 + }, + { + "epoch": 0.9791266135677011, + "grad_norm": 1.1371923685073853, + "learning_rate": 4.69326054512604e-06, + "loss": 0.5998, + "step": 3565 + }, + { + "epoch": 0.9794012633891788, + "grad_norm": 1.1811968088150024, + "learning_rate": 4.693087184693707e-06, + "loss": 0.6653, + "step": 3566 + }, + { + "epoch": 0.9796759132106564, + "grad_norm": 1.2012379169464111, + "learning_rate": 4.692913778489692e-06, + "loss": 0.6533, + "step": 3567 + }, + { + "epoch": 0.979950563032134, + "grad_norm": 1.2033262252807617, + "learning_rate": 4.6927403265176125e-06, + "loss": 0.6099, + "step": 3568 + }, + { + "epoch": 0.9802252128536116, + "grad_norm": 1.1778485774993896, + "learning_rate": 4.692566828781091e-06, + "loss": 0.673, + "step": 3569 + }, + { + "epoch": 0.9804998626750893, + "grad_norm": 1.1378544569015503, + "learning_rate": 4.692393285283747e-06, + "loss": 0.6036, + "step": 3570 + }, + { + "epoch": 0.9807745124965669, + "grad_norm": 1.1852856874465942, + "learning_rate": 4.692219696029203e-06, + "loss": 0.6579, + "step": 3571 + }, + { + "epoch": 0.9810491623180445, + "grad_norm": 1.2009961605072021, + "learning_rate": 4.692046061021081e-06, + "loss": 0.6703, + "step": 3572 + }, + { + "epoch": 0.9813238121395221, + "grad_norm": 1.1928898096084595, + "learning_rate": 4.691872380263008e-06, + "loss": 0.6215, + "step": 3573 + }, + { + "epoch": 0.9815984619609998, + "grad_norm": 1.3350346088409424, + "learning_rate": 4.6916986537586065e-06, + "loss": 0.6504, + "step": 3574 + }, + { + "epoch": 0.9818731117824774, + "grad_norm": 1.1667835712432861, + "learning_rate": 4.691524881511502e-06, + "loss": 0.6393, + "step": 3575 + }, + { + "epoch": 0.982147761603955, + "grad_norm": 1.1880005598068237, + "learning_rate": 4.691351063525322e-06, + "loss": 0.6153, + "step": 3576 + }, + { + "epoch": 0.9824224114254325, + "grad_norm": 1.1832435131072998, + "learning_rate": 4.691177199803694e-06, + "loss": 0.6348, + "step": 3577 + }, + { + "epoch": 0.9826970612469101, + "grad_norm": 1.293607473373413, + "learning_rate": 4.691003290350247e-06, + "loss": 0.6289, + "step": 3578 + }, + { + "epoch": 0.9829717110683878, + "grad_norm": 1.1919909715652466, + "learning_rate": 4.69082933516861e-06, + "loss": 0.6471, + "step": 3579 + }, + { + "epoch": 0.9832463608898654, + "grad_norm": 1.2816194295883179, + "learning_rate": 4.690655334262415e-06, + "loss": 0.6539, + "step": 3580 + }, + { + "epoch": 0.983521010711343, + "grad_norm": 1.2197118997573853, + "learning_rate": 4.690481287635292e-06, + "loss": 0.6559, + "step": 3581 + }, + { + "epoch": 0.9837956605328206, + "grad_norm": 1.1973755359649658, + "learning_rate": 4.690307195290874e-06, + "loss": 0.633, + "step": 3582 + }, + { + "epoch": 0.9840703103542983, + "grad_norm": 1.2615818977355957, + "learning_rate": 4.690133057232795e-06, + "loss": 0.655, + "step": 3583 + }, + { + "epoch": 0.9843449601757759, + "grad_norm": 1.348468542098999, + "learning_rate": 4.689958873464689e-06, + "loss": 0.6892, + "step": 3584 + }, + { + "epoch": 0.9846196099972535, + "grad_norm": 1.1572542190551758, + "learning_rate": 4.6897846439901915e-06, + "loss": 0.6325, + "step": 3585 + }, + { + "epoch": 0.9848942598187311, + "grad_norm": 1.1909319162368774, + "learning_rate": 4.6896103688129385e-06, + "loss": 0.6335, + "step": 3586 + }, + { + "epoch": 0.9851689096402088, + "grad_norm": 1.2510229349136353, + "learning_rate": 4.689436047936567e-06, + "loss": 0.6495, + "step": 3587 + }, + { + "epoch": 0.9854435594616864, + "grad_norm": 1.1689658164978027, + "learning_rate": 4.689261681364717e-06, + "loss": 0.6426, + "step": 3588 + }, + { + "epoch": 0.985718209283164, + "grad_norm": 1.2040544748306274, + "learning_rate": 4.689087269101026e-06, + "loss": 0.6498, + "step": 3589 + }, + { + "epoch": 0.9859928591046416, + "grad_norm": 1.1853926181793213, + "learning_rate": 4.688912811149133e-06, + "loss": 0.6901, + "step": 3590 + }, + { + "epoch": 0.9862675089261193, + "grad_norm": 1.243388295173645, + "learning_rate": 4.688738307512683e-06, + "loss": 0.6006, + "step": 3591 + }, + { + "epoch": 0.9865421587475968, + "grad_norm": 1.1900231838226318, + "learning_rate": 4.688563758195314e-06, + "loss": 0.6517, + "step": 3592 + }, + { + "epoch": 0.9868168085690744, + "grad_norm": 1.1986360549926758, + "learning_rate": 4.688389163200672e-06, + "loss": 0.6775, + "step": 3593 + }, + { + "epoch": 0.987091458390552, + "grad_norm": 1.1277421712875366, + "learning_rate": 4.6882145225323986e-06, + "loss": 0.6093, + "step": 3594 + }, + { + "epoch": 0.9873661082120296, + "grad_norm": 1.1993963718414307, + "learning_rate": 4.68803983619414e-06, + "loss": 0.6491, + "step": 3595 + }, + { + "epoch": 0.9876407580335073, + "grad_norm": 1.3265681266784668, + "learning_rate": 4.6878651041895415e-06, + "loss": 0.6234, + "step": 3596 + }, + { + "epoch": 0.9879154078549849, + "grad_norm": 1.1865756511688232, + "learning_rate": 4.687690326522251e-06, + "loss": 0.6476, + "step": 3597 + }, + { + "epoch": 0.9881900576764625, + "grad_norm": 1.1822162866592407, + "learning_rate": 4.687515503195915e-06, + "loss": 0.6281, + "step": 3598 + }, + { + "epoch": 0.9884647074979401, + "grad_norm": 1.2379586696624756, + "learning_rate": 4.687340634214182e-06, + "loss": 0.6418, + "step": 3599 + }, + { + "epoch": 0.9887393573194178, + "grad_norm": 1.2419129610061646, + "learning_rate": 4.687165719580704e-06, + "loss": 0.6169, + "step": 3600 + }, + { + "epoch": 0.9890140071408954, + "grad_norm": 1.1305233240127563, + "learning_rate": 4.686990759299129e-06, + "loss": 0.5872, + "step": 3601 + }, + { + "epoch": 0.989288656962373, + "grad_norm": 1.1749986410140991, + "learning_rate": 4.686815753373109e-06, + "loss": 0.7042, + "step": 3602 + }, + { + "epoch": 0.9895633067838506, + "grad_norm": 1.197188377380371, + "learning_rate": 4.686640701806299e-06, + "loss": 0.658, + "step": 3603 + }, + { + "epoch": 0.9898379566053283, + "grad_norm": 1.2786458730697632, + "learning_rate": 4.686465604602349e-06, + "loss": 0.6885, + "step": 3604 + }, + { + "epoch": 0.9901126064268059, + "grad_norm": 1.2183692455291748, + "learning_rate": 4.686290461764916e-06, + "loss": 0.657, + "step": 3605 + }, + { + "epoch": 0.9903872562482834, + "grad_norm": 1.219504952430725, + "learning_rate": 4.686115273297653e-06, + "loss": 0.6751, + "step": 3606 + }, + { + "epoch": 0.990661906069761, + "grad_norm": 1.2402971982955933, + "learning_rate": 4.685940039204219e-06, + "loss": 0.6394, + "step": 3607 + }, + { + "epoch": 0.9909365558912386, + "grad_norm": 1.151853084564209, + "learning_rate": 4.68576475948827e-06, + "loss": 0.6269, + "step": 3608 + }, + { + "epoch": 0.9912112057127163, + "grad_norm": 1.208567500114441, + "learning_rate": 4.685589434153464e-06, + "loss": 0.6627, + "step": 3609 + }, + { + "epoch": 0.9914858555341939, + "grad_norm": 1.18115234375, + "learning_rate": 4.685414063203461e-06, + "loss": 0.6702, + "step": 3610 + }, + { + "epoch": 0.9917605053556715, + "grad_norm": 1.1994400024414062, + "learning_rate": 4.68523864664192e-06, + "loss": 0.6677, + "step": 3611 + }, + { + "epoch": 0.9920351551771491, + "grad_norm": 1.1907843351364136, + "learning_rate": 4.685063184472503e-06, + "loss": 0.6947, + "step": 3612 + }, + { + "epoch": 0.9923098049986268, + "grad_norm": 1.2638579607009888, + "learning_rate": 4.684887676698871e-06, + "loss": 0.6717, + "step": 3613 + }, + { + "epoch": 0.9925844548201044, + "grad_norm": 1.1944715976715088, + "learning_rate": 4.684712123324688e-06, + "loss": 0.6359, + "step": 3614 + }, + { + "epoch": 0.992859104641582, + "grad_norm": 1.2566219568252563, + "learning_rate": 4.684536524353619e-06, + "loss": 0.6617, + "step": 3615 + }, + { + "epoch": 0.9931337544630596, + "grad_norm": 1.2228596210479736, + "learning_rate": 4.684360879789327e-06, + "loss": 0.6603, + "step": 3616 + }, + { + "epoch": 0.9934084042845372, + "grad_norm": 1.2062559127807617, + "learning_rate": 4.6841851896354775e-06, + "loss": 0.6478, + "step": 3617 + }, + { + "epoch": 0.9936830541060149, + "grad_norm": 1.1936044692993164, + "learning_rate": 4.684009453895739e-06, + "loss": 0.6107, + "step": 3618 + }, + { + "epoch": 0.9939577039274925, + "grad_norm": 1.238188624382019, + "learning_rate": 4.6838336725737785e-06, + "loss": 0.6691, + "step": 3619 + }, + { + "epoch": 0.99423235374897, + "grad_norm": 1.175394892692566, + "learning_rate": 4.683657845673265e-06, + "loss": 0.6352, + "step": 3620 + }, + { + "epoch": 0.9945070035704476, + "grad_norm": 1.1954920291900635, + "learning_rate": 4.6834819731978684e-06, + "loss": 0.6199, + "step": 3621 + }, + { + "epoch": 0.9947816533919253, + "grad_norm": 1.140977382659912, + "learning_rate": 4.683306055151258e-06, + "loss": 0.6263, + "step": 3622 + }, + { + "epoch": 0.9950563032134029, + "grad_norm": 1.1128288507461548, + "learning_rate": 4.683130091537107e-06, + "loss": 0.5772, + "step": 3623 + }, + { + "epoch": 0.9953309530348805, + "grad_norm": 1.2712135314941406, + "learning_rate": 4.682954082359087e-06, + "loss": 0.6628, + "step": 3624 + }, + { + "epoch": 0.9956056028563581, + "grad_norm": 1.1899785995483398, + "learning_rate": 4.682778027620871e-06, + "loss": 0.603, + "step": 3625 + }, + { + "epoch": 0.9958802526778358, + "grad_norm": 1.211262822151184, + "learning_rate": 4.6826019273261345e-06, + "loss": 0.6006, + "step": 3626 + }, + { + "epoch": 0.9961549024993134, + "grad_norm": 1.1453601121902466, + "learning_rate": 4.682425781478552e-06, + "loss": 0.6269, + "step": 3627 + }, + { + "epoch": 0.996429552320791, + "grad_norm": 1.1709744930267334, + "learning_rate": 4.682249590081801e-06, + "loss": 0.618, + "step": 3628 + }, + { + "epoch": 0.9967042021422686, + "grad_norm": 1.1914818286895752, + "learning_rate": 4.682073353139558e-06, + "loss": 0.6567, + "step": 3629 + }, + { + "epoch": 0.9969788519637462, + "grad_norm": 1.266263723373413, + "learning_rate": 4.681897070655501e-06, + "loss": 0.6813, + "step": 3630 + }, + { + "epoch": 0.9972535017852239, + "grad_norm": 1.2135192155838013, + "learning_rate": 4.681720742633309e-06, + "loss": 0.6519, + "step": 3631 + }, + { + "epoch": 0.9975281516067015, + "grad_norm": 1.193723201751709, + "learning_rate": 4.681544369076663e-06, + "loss": 0.6529, + "step": 3632 + }, + { + "epoch": 0.9978028014281791, + "grad_norm": 1.1459523439407349, + "learning_rate": 4.6813679499892424e-06, + "loss": 0.5939, + "step": 3633 + }, + { + "epoch": 0.9980774512496566, + "grad_norm": 1.1470270156860352, + "learning_rate": 4.681191485374732e-06, + "loss": 0.6559, + "step": 3634 + }, + { + "epoch": 0.9983521010711343, + "grad_norm": 1.1562823057174683, + "learning_rate": 4.681014975236813e-06, + "loss": 0.6454, + "step": 3635 + }, + { + "epoch": 0.9986267508926119, + "grad_norm": 1.2383633852005005, + "learning_rate": 4.680838419579169e-06, + "loss": 0.6749, + "step": 3636 + }, + { + "epoch": 0.9989014007140895, + "grad_norm": 1.1749192476272583, + "learning_rate": 4.680661818405485e-06, + "loss": 0.6301, + "step": 3637 + }, + { + "epoch": 0.9991760505355671, + "grad_norm": 1.2661762237548828, + "learning_rate": 4.680485171719448e-06, + "loss": 0.6522, + "step": 3638 + }, + { + "epoch": 0.9994507003570448, + "grad_norm": 1.255720853805542, + "learning_rate": 4.6803084795247435e-06, + "loss": 0.6921, + "step": 3639 + }, + { + "epoch": 0.9997253501785224, + "grad_norm": 1.2071359157562256, + "learning_rate": 4.680131741825059e-06, + "loss": 0.6487, + "step": 3640 + }, + { + "epoch": 1.0, + "grad_norm": 1.2668778896331787, + "learning_rate": 4.679954958624086e-06, + "loss": 0.6608, + "step": 3641 + } + ], + "logging_steps": 1, + "max_steps": 21846, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 3641, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.732220843146346e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}