diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3033, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009891196834817012, + "grad_norm": 47.47864436307981, + "learning_rate": 1.6447368421052632e-07, + "loss": 11.4052, + "step": 1 + }, + { + "epoch": 0.0019782393669634025, + "grad_norm": 45.91292076188422, + "learning_rate": 3.2894736842105264e-07, + "loss": 11.2815, + "step": 2 + }, + { + "epoch": 0.002967359050445104, + "grad_norm": 47.726852516408236, + "learning_rate": 4.934210526315789e-07, + "loss": 11.3935, + "step": 3 + }, + { + "epoch": 0.003956478733926805, + "grad_norm": 45.10574020271687, + "learning_rate": 6.578947368421053e-07, + "loss": 11.5897, + "step": 4 + }, + { + "epoch": 0.004945598417408506, + "grad_norm": 50.8689323614659, + "learning_rate": 8.223684210526316e-07, + "loss": 11.2181, + "step": 5 + }, + { + "epoch": 0.005934718100890208, + "grad_norm": 44.83636067584137, + "learning_rate": 9.868421052631579e-07, + "loss": 11.543, + "step": 6 + }, + { + "epoch": 0.006923837784371909, + "grad_norm": 44.619692790934934, + "learning_rate": 1.1513157894736842e-06, + "loss": 11.364, + "step": 7 + }, + { + "epoch": 0.00791295746785361, + "grad_norm": 46.17981710645455, + "learning_rate": 1.3157894736842106e-06, + "loss": 11.372, + "step": 8 + }, + { + "epoch": 0.008902077151335312, + "grad_norm": 49.73917366470631, + "learning_rate": 1.480263157894737e-06, + "loss": 11.0869, + "step": 9 + }, + { + "epoch": 0.009891196834817012, + "grad_norm": 52.40342778684296, + "learning_rate": 1.6447368421052632e-06, + "loss": 11.054, + "step": 10 + }, + { + "epoch": 0.010880316518298714, + "grad_norm": 48.69950321668828, + "learning_rate": 1.8092105263157896e-06, + "loss": 11.0688, + "step": 11 + }, + { + "epoch": 0.011869436201780416, + "grad_norm": 63.65227677037272, + "learning_rate": 1.9736842105263157e-06, + "loss": 10.3072, + "step": 12 + }, + { + "epoch": 0.012858555885262116, + "grad_norm": 79.55368321861843, + "learning_rate": 2.138157894736842e-06, + "loss": 9.7245, + "step": 13 + }, + { + "epoch": 0.013847675568743818, + "grad_norm": 82.8103382216083, + "learning_rate": 2.3026315789473684e-06, + "loss": 9.4617, + "step": 14 + }, + { + "epoch": 0.01483679525222552, + "grad_norm": 84.84620253448091, + "learning_rate": 2.4671052631578948e-06, + "loss": 9.3283, + "step": 15 + }, + { + "epoch": 0.01582591493570722, + "grad_norm": 72.7809868244949, + "learning_rate": 2.631578947368421e-06, + "loss": 4.3421, + "step": 16 + }, + { + "epoch": 0.016815034619188922, + "grad_norm": 68.45135162785618, + "learning_rate": 2.7960526315789475e-06, + "loss": 4.768, + "step": 17 + }, + { + "epoch": 0.017804154302670624, + "grad_norm": 65.03039569474328, + "learning_rate": 2.960526315789474e-06, + "loss": 4.418, + "step": 18 + }, + { + "epoch": 0.018793273986152326, + "grad_norm": 45.947045347522504, + "learning_rate": 3.125e-06, + "loss": 3.3595, + "step": 19 + }, + { + "epoch": 0.019782393669634024, + "grad_norm": 43.95974652547691, + "learning_rate": 3.2894736842105265e-06, + "loss": 3.1867, + "step": 20 + }, + { + "epoch": 0.020771513353115726, + "grad_norm": 9.308745952351916, + "learning_rate": 3.4539473684210533e-06, + "loss": 1.9175, + "step": 21 + }, + { + "epoch": 0.021760633036597428, + "grad_norm": 5.755918880001644, + "learning_rate": 3.618421052631579e-06, + "loss": 1.536, + "step": 22 + }, + { + "epoch": 0.02274975272007913, + "grad_norm": 4.930173474585826, + "learning_rate": 3.7828947368421055e-06, + "loss": 1.4667, + "step": 23 + }, + { + "epoch": 0.02373887240356083, + "grad_norm": 4.28733293354817, + "learning_rate": 3.9473684210526315e-06, + "loss": 1.4006, + "step": 24 + }, + { + "epoch": 0.024727992087042534, + "grad_norm": 3.829716127893356, + "learning_rate": 4.111842105263159e-06, + "loss": 1.532, + "step": 25 + }, + { + "epoch": 0.025717111770524232, + "grad_norm": 3.0324967863470813, + "learning_rate": 4.276315789473684e-06, + "loss": 1.4976, + "step": 26 + }, + { + "epoch": 0.026706231454005934, + "grad_norm": 2.5586232440965344, + "learning_rate": 4.4407894736842105e-06, + "loss": 1.3703, + "step": 27 + }, + { + "epoch": 0.027695351137487636, + "grad_norm": 1.9156066610224773, + "learning_rate": 4.605263157894737e-06, + "loss": 1.1345, + "step": 28 + }, + { + "epoch": 0.028684470820969338, + "grad_norm": 1.74939443115538, + "learning_rate": 4.769736842105264e-06, + "loss": 1.2478, + "step": 29 + }, + { + "epoch": 0.02967359050445104, + "grad_norm": 1.5831108352532686, + "learning_rate": 4.9342105263157895e-06, + "loss": 1.1836, + "step": 30 + }, + { + "epoch": 0.03066271018793274, + "grad_norm": 2.2356207014234997, + "learning_rate": 5.098684210526316e-06, + "loss": 1.3763, + "step": 31 + }, + { + "epoch": 0.03165182987141444, + "grad_norm": 1.1026657175125092, + "learning_rate": 5.263157894736842e-06, + "loss": 1.0571, + "step": 32 + }, + { + "epoch": 0.032640949554896145, + "grad_norm": 1.0672135675760805, + "learning_rate": 5.4276315789473686e-06, + "loss": 1.1693, + "step": 33 + }, + { + "epoch": 0.033630069238377844, + "grad_norm": 0.9532916815841181, + "learning_rate": 5.592105263157895e-06, + "loss": 0.9262, + "step": 34 + }, + { + "epoch": 0.03461918892185954, + "grad_norm": 0.8351774057987832, + "learning_rate": 5.756578947368421e-06, + "loss": 0.9843, + "step": 35 + }, + { + "epoch": 0.03560830860534125, + "grad_norm": 0.7152560683189684, + "learning_rate": 5.921052631578948e-06, + "loss": 0.8934, + "step": 36 + }, + { + "epoch": 0.036597428288822946, + "grad_norm": 0.8605762164117818, + "learning_rate": 6.085526315789474e-06, + "loss": 1.0897, + "step": 37 + }, + { + "epoch": 0.03758654797230465, + "grad_norm": 0.8760202419017974, + "learning_rate": 6.25e-06, + "loss": 1.1443, + "step": 38 + }, + { + "epoch": 0.03857566765578635, + "grad_norm": 0.6624437855685319, + "learning_rate": 6.4144736842105275e-06, + "loss": 0.8699, + "step": 39 + }, + { + "epoch": 0.03956478733926805, + "grad_norm": 0.7049615396906419, + "learning_rate": 6.578947368421053e-06, + "loss": 0.9151, + "step": 40 + }, + { + "epoch": 0.040553907022749754, + "grad_norm": 0.7854506183295861, + "learning_rate": 6.743421052631579e-06, + "loss": 0.9542, + "step": 41 + }, + { + "epoch": 0.04154302670623145, + "grad_norm": 0.6223807683273548, + "learning_rate": 6.9078947368421065e-06, + "loss": 0.9371, + "step": 42 + }, + { + "epoch": 0.04253214638971316, + "grad_norm": 0.6248927306368205, + "learning_rate": 7.072368421052632e-06, + "loss": 0.9072, + "step": 43 + }, + { + "epoch": 0.043521266073194856, + "grad_norm": 0.647479183993099, + "learning_rate": 7.236842105263158e-06, + "loss": 0.9054, + "step": 44 + }, + { + "epoch": 0.04451038575667656, + "grad_norm": 0.5889797054704718, + "learning_rate": 7.401315789473684e-06, + "loss": 1.0402, + "step": 45 + }, + { + "epoch": 0.04549950544015826, + "grad_norm": 0.4967497474491158, + "learning_rate": 7.565789473684211e-06, + "loss": 0.9069, + "step": 46 + }, + { + "epoch": 0.04648862512363996, + "grad_norm": 0.5661932222672871, + "learning_rate": 7.730263157894737e-06, + "loss": 0.9583, + "step": 47 + }, + { + "epoch": 0.04747774480712166, + "grad_norm": 6.156382243862993, + "learning_rate": 7.894736842105263e-06, + "loss": 0.977, + "step": 48 + }, + { + "epoch": 0.04846686449060336, + "grad_norm": 0.6197962682604914, + "learning_rate": 8.05921052631579e-06, + "loss": 0.7968, + "step": 49 + }, + { + "epoch": 0.04945598417408507, + "grad_norm": 0.5935105377105369, + "learning_rate": 8.223684210526317e-06, + "loss": 0.8727, + "step": 50 + }, + { + "epoch": 0.050445103857566766, + "grad_norm": 0.5139477978028617, + "learning_rate": 8.388157894736843e-06, + "loss": 0.9035, + "step": 51 + }, + { + "epoch": 0.051434223541048464, + "grad_norm": 0.48831574166447717, + "learning_rate": 8.552631578947368e-06, + "loss": 0.7875, + "step": 52 + }, + { + "epoch": 0.05242334322453017, + "grad_norm": 0.5106264570172955, + "learning_rate": 8.717105263157894e-06, + "loss": 0.9089, + "step": 53 + }, + { + "epoch": 0.05341246290801187, + "grad_norm": 0.48126592930194784, + "learning_rate": 8.881578947368421e-06, + "loss": 0.7222, + "step": 54 + }, + { + "epoch": 0.05440158259149357, + "grad_norm": 0.3816418172871715, + "learning_rate": 9.046052631578948e-06, + "loss": 0.7726, + "step": 55 + }, + { + "epoch": 0.05539070227497527, + "grad_norm": 0.3805932483187457, + "learning_rate": 9.210526315789474e-06, + "loss": 0.7265, + "step": 56 + }, + { + "epoch": 0.05637982195845697, + "grad_norm": 0.35478918543818216, + "learning_rate": 9.375000000000001e-06, + "loss": 0.7009, + "step": 57 + }, + { + "epoch": 0.057368941641938676, + "grad_norm": 0.34092363071107723, + "learning_rate": 9.539473684210528e-06, + "loss": 0.7749, + "step": 58 + }, + { + "epoch": 0.058358061325420374, + "grad_norm": 0.36162518512491465, + "learning_rate": 9.703947368421054e-06, + "loss": 0.8226, + "step": 59 + }, + { + "epoch": 0.05934718100890208, + "grad_norm": 0.40209857091787987, + "learning_rate": 9.868421052631579e-06, + "loss": 0.836, + "step": 60 + }, + { + "epoch": 0.06033630069238378, + "grad_norm": 0.3549385133528555, + "learning_rate": 1.0032894736842106e-05, + "loss": 0.7356, + "step": 61 + }, + { + "epoch": 0.06132542037586548, + "grad_norm": 0.3578780388034424, + "learning_rate": 1.0197368421052632e-05, + "loss": 0.8599, + "step": 62 + }, + { + "epoch": 0.06231454005934718, + "grad_norm": 0.3669356895971538, + "learning_rate": 1.0361842105263159e-05, + "loss": 0.8198, + "step": 63 + }, + { + "epoch": 0.06330365974282888, + "grad_norm": 0.3543644351832402, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.8604, + "step": 64 + }, + { + "epoch": 0.06429277942631058, + "grad_norm": 0.3713938433998872, + "learning_rate": 1.0690789473684212e-05, + "loss": 0.8382, + "step": 65 + }, + { + "epoch": 0.06528189910979229, + "grad_norm": 0.3340439100059536, + "learning_rate": 1.0855263157894737e-05, + "loss": 0.7577, + "step": 66 + }, + { + "epoch": 0.06627101879327399, + "grad_norm": 0.31244239116416433, + "learning_rate": 1.1019736842105263e-05, + "loss": 0.714, + "step": 67 + }, + { + "epoch": 0.06726013847675569, + "grad_norm": 0.36393336511648067, + "learning_rate": 1.118421052631579e-05, + "loss": 0.8557, + "step": 68 + }, + { + "epoch": 0.06824925816023739, + "grad_norm": 0.39874922213238695, + "learning_rate": 1.1348684210526317e-05, + "loss": 0.7143, + "step": 69 + }, + { + "epoch": 0.06923837784371908, + "grad_norm": 0.3354909696602849, + "learning_rate": 1.1513157894736843e-05, + "loss": 0.8087, + "step": 70 + }, + { + "epoch": 0.0702274975272008, + "grad_norm": 0.3195954665583774, + "learning_rate": 1.167763157894737e-05, + "loss": 0.7187, + "step": 71 + }, + { + "epoch": 0.0712166172106825, + "grad_norm": 0.3134655600001495, + "learning_rate": 1.1842105263157895e-05, + "loss": 0.8406, + "step": 72 + }, + { + "epoch": 0.0722057368941642, + "grad_norm": 0.3026614417415018, + "learning_rate": 1.200657894736842e-05, + "loss": 0.8183, + "step": 73 + }, + { + "epoch": 0.07319485657764589, + "grad_norm": 0.3216818743924892, + "learning_rate": 1.2171052631578948e-05, + "loss": 0.849, + "step": 74 + }, + { + "epoch": 0.07418397626112759, + "grad_norm": 0.3667927953864343, + "learning_rate": 1.2335526315789473e-05, + "loss": 0.8876, + "step": 75 + }, + { + "epoch": 0.0751730959446093, + "grad_norm": 0.31846281371038865, + "learning_rate": 1.25e-05, + "loss": 0.7137, + "step": 76 + }, + { + "epoch": 0.076162215628091, + "grad_norm": 0.3649775475138143, + "learning_rate": 1.2664473684210526e-05, + "loss": 0.9686, + "step": 77 + }, + { + "epoch": 0.0771513353115727, + "grad_norm": 0.3552395073907018, + "learning_rate": 1.2828947368421055e-05, + "loss": 0.9255, + "step": 78 + }, + { + "epoch": 0.0781404549950544, + "grad_norm": 0.3362980375908929, + "learning_rate": 1.299342105263158e-05, + "loss": 0.7663, + "step": 79 + }, + { + "epoch": 0.0791295746785361, + "grad_norm": 0.3332486864873228, + "learning_rate": 1.3157894736842106e-05, + "loss": 0.8464, + "step": 80 + }, + { + "epoch": 0.08011869436201781, + "grad_norm": 0.3563264366987691, + "learning_rate": 1.3322368421052633e-05, + "loss": 0.7251, + "step": 81 + }, + { + "epoch": 0.08110781404549951, + "grad_norm": 0.32718462995919856, + "learning_rate": 1.3486842105263159e-05, + "loss": 0.8337, + "step": 82 + }, + { + "epoch": 0.0820969337289812, + "grad_norm": 0.3098690267315848, + "learning_rate": 1.3651315789473684e-05, + "loss": 0.7089, + "step": 83 + }, + { + "epoch": 0.0830860534124629, + "grad_norm": 0.33231060025498604, + "learning_rate": 1.3815789473684213e-05, + "loss": 0.6336, + "step": 84 + }, + { + "epoch": 0.0840751730959446, + "grad_norm": 0.28062844248423013, + "learning_rate": 1.3980263157894739e-05, + "loss": 0.6994, + "step": 85 + }, + { + "epoch": 0.08506429277942631, + "grad_norm": 0.29411686322877856, + "learning_rate": 1.4144736842105264e-05, + "loss": 0.6726, + "step": 86 + }, + { + "epoch": 0.08605341246290801, + "grad_norm": 0.34261251457783337, + "learning_rate": 1.430921052631579e-05, + "loss": 0.8291, + "step": 87 + }, + { + "epoch": 0.08704253214638971, + "grad_norm": 0.36178717911541053, + "learning_rate": 1.4473684210526317e-05, + "loss": 0.8385, + "step": 88 + }, + { + "epoch": 0.08803165182987141, + "grad_norm": 0.3461353549466678, + "learning_rate": 1.4638157894736842e-05, + "loss": 0.7122, + "step": 89 + }, + { + "epoch": 0.08902077151335312, + "grad_norm": 0.35500836343925757, + "learning_rate": 1.4802631578947368e-05, + "loss": 0.7191, + "step": 90 + }, + { + "epoch": 0.09000989119683482, + "grad_norm": 0.3361253921537601, + "learning_rate": 1.4967105263157897e-05, + "loss": 0.8086, + "step": 91 + }, + { + "epoch": 0.09099901088031652, + "grad_norm": 0.33504758656742495, + "learning_rate": 1.5131578947368422e-05, + "loss": 0.7375, + "step": 92 + }, + { + "epoch": 0.09198813056379822, + "grad_norm": 0.3334568733991201, + "learning_rate": 1.5296052631578946e-05, + "loss": 0.7225, + "step": 93 + }, + { + "epoch": 0.09297725024727992, + "grad_norm": 0.3350306861891048, + "learning_rate": 1.5460526315789475e-05, + "loss": 0.7998, + "step": 94 + }, + { + "epoch": 0.09396636993076163, + "grad_norm": 0.38462200040273103, + "learning_rate": 1.5625e-05, + "loss": 0.8569, + "step": 95 + }, + { + "epoch": 0.09495548961424333, + "grad_norm": 0.34279551157272875, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.6453, + "step": 96 + }, + { + "epoch": 0.09594460929772503, + "grad_norm": 0.33073887008527225, + "learning_rate": 1.5953947368421055e-05, + "loss": 0.7839, + "step": 97 + }, + { + "epoch": 0.09693372898120672, + "grad_norm": 0.2931675479755555, + "learning_rate": 1.611842105263158e-05, + "loss": 0.7174, + "step": 98 + }, + { + "epoch": 0.09792284866468842, + "grad_norm": 0.3169582507658582, + "learning_rate": 1.6282894736842106e-05, + "loss": 0.7605, + "step": 99 + }, + { + "epoch": 0.09891196834817013, + "grad_norm": 0.42763487156637653, + "learning_rate": 1.6447368421052635e-05, + "loss": 0.7602, + "step": 100 + }, + { + "epoch": 0.09990108803165183, + "grad_norm": 0.29074916324227895, + "learning_rate": 1.661184210526316e-05, + "loss": 0.6866, + "step": 101 + }, + { + "epoch": 0.10089020771513353, + "grad_norm": 0.352348279384898, + "learning_rate": 1.6776315789473686e-05, + "loss": 0.8076, + "step": 102 + }, + { + "epoch": 0.10187932739861523, + "grad_norm": 0.3273893679465929, + "learning_rate": 1.694078947368421e-05, + "loss": 0.7077, + "step": 103 + }, + { + "epoch": 0.10286844708209693, + "grad_norm": 0.3571307875845898, + "learning_rate": 1.7105263157894737e-05, + "loss": 0.7014, + "step": 104 + }, + { + "epoch": 0.10385756676557864, + "grad_norm": 0.33000827078733097, + "learning_rate": 1.7269736842105262e-05, + "loss": 0.6654, + "step": 105 + }, + { + "epoch": 0.10484668644906034, + "grad_norm": 0.4595448328039601, + "learning_rate": 1.7434210526315788e-05, + "loss": 0.7122, + "step": 106 + }, + { + "epoch": 0.10583580613254204, + "grad_norm": 0.35389814780232176, + "learning_rate": 1.7598684210526316e-05, + "loss": 0.6255, + "step": 107 + }, + { + "epoch": 0.10682492581602374, + "grad_norm": 0.35214588487585297, + "learning_rate": 1.7763157894736842e-05, + "loss": 0.6463, + "step": 108 + }, + { + "epoch": 0.10781404549950543, + "grad_norm": 0.37496257339354294, + "learning_rate": 1.7927631578947367e-05, + "loss": 0.6374, + "step": 109 + }, + { + "epoch": 0.10880316518298715, + "grad_norm": 0.3901229505469495, + "learning_rate": 1.8092105263157896e-05, + "loss": 0.8409, + "step": 110 + }, + { + "epoch": 0.10979228486646884, + "grad_norm": 0.36503555388396497, + "learning_rate": 1.8256578947368422e-05, + "loss": 0.7027, + "step": 111 + }, + { + "epoch": 0.11078140454995054, + "grad_norm": 0.39058659212806174, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.666, + "step": 112 + }, + { + "epoch": 0.11177052423343224, + "grad_norm": 0.3393334155039262, + "learning_rate": 1.8585526315789476e-05, + "loss": 0.6489, + "step": 113 + }, + { + "epoch": 0.11275964391691394, + "grad_norm": 0.3641878610374112, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.5997, + "step": 114 + }, + { + "epoch": 0.11374876360039565, + "grad_norm": 0.35954765537088884, + "learning_rate": 1.8914473684210527e-05, + "loss": 0.6643, + "step": 115 + }, + { + "epoch": 0.11473788328387735, + "grad_norm": 0.332353400970844, + "learning_rate": 1.9078947368421056e-05, + "loss": 0.6558, + "step": 116 + }, + { + "epoch": 0.11572700296735905, + "grad_norm": 0.3658649526637821, + "learning_rate": 1.924342105263158e-05, + "loss": 0.748, + "step": 117 + }, + { + "epoch": 0.11671612265084075, + "grad_norm": 0.4110068913472326, + "learning_rate": 1.9407894736842107e-05, + "loss": 0.7679, + "step": 118 + }, + { + "epoch": 0.11770524233432245, + "grad_norm": 0.312188988598259, + "learning_rate": 1.9572368421052633e-05, + "loss": 0.7806, + "step": 119 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 0.39415290875961995, + "learning_rate": 1.9736842105263158e-05, + "loss": 0.6902, + "step": 120 + }, + { + "epoch": 0.11968348170128586, + "grad_norm": 0.30258592751919755, + "learning_rate": 1.9901315789473684e-05, + "loss": 0.6828, + "step": 121 + }, + { + "epoch": 0.12067260138476756, + "grad_norm": 0.32157745711913, + "learning_rate": 2.0065789473684213e-05, + "loss": 0.7062, + "step": 122 + }, + { + "epoch": 0.12166172106824925, + "grad_norm": 0.38984990793746416, + "learning_rate": 2.0230263157894738e-05, + "loss": 0.7383, + "step": 123 + }, + { + "epoch": 0.12265084075173097, + "grad_norm": 0.4111544139628007, + "learning_rate": 2.0394736842105264e-05, + "loss": 0.6269, + "step": 124 + }, + { + "epoch": 0.12363996043521266, + "grad_norm": 0.39448247544238374, + "learning_rate": 2.055921052631579e-05, + "loss": 0.7416, + "step": 125 + }, + { + "epoch": 0.12462908011869436, + "grad_norm": 0.3444637293280687, + "learning_rate": 2.0723684210526318e-05, + "loss": 0.6733, + "step": 126 + }, + { + "epoch": 0.12561819980217606, + "grad_norm": 0.4001985495233771, + "learning_rate": 2.0888157894736843e-05, + "loss": 0.6515, + "step": 127 + }, + { + "epoch": 0.12660731948565776, + "grad_norm": 0.3794186330915447, + "learning_rate": 2.105263157894737e-05, + "loss": 0.662, + "step": 128 + }, + { + "epoch": 0.12759643916913946, + "grad_norm": 0.38824093039999913, + "learning_rate": 2.1217105263157898e-05, + "loss": 0.7238, + "step": 129 + }, + { + "epoch": 0.12858555885262116, + "grad_norm": 0.4556524964885303, + "learning_rate": 2.1381578947368423e-05, + "loss": 0.6779, + "step": 130 + }, + { + "epoch": 0.12957467853610286, + "grad_norm": 0.3643359091192952, + "learning_rate": 2.154605263157895e-05, + "loss": 0.673, + "step": 131 + }, + { + "epoch": 0.13056379821958458, + "grad_norm": 0.491430142042037, + "learning_rate": 2.1710526315789474e-05, + "loss": 0.6031, + "step": 132 + }, + { + "epoch": 0.13155291790306628, + "grad_norm": 0.3644642242254432, + "learning_rate": 2.1875e-05, + "loss": 0.6054, + "step": 133 + }, + { + "epoch": 0.13254203758654798, + "grad_norm": 0.4391260070830592, + "learning_rate": 2.2039473684210525e-05, + "loss": 0.569, + "step": 134 + }, + { + "epoch": 0.13353115727002968, + "grad_norm": 0.3275145794011892, + "learning_rate": 2.2203947368421054e-05, + "loss": 0.6724, + "step": 135 + }, + { + "epoch": 0.13452027695351138, + "grad_norm": 0.4672558471628718, + "learning_rate": 2.236842105263158e-05, + "loss": 0.6973, + "step": 136 + }, + { + "epoch": 0.13550939663699307, + "grad_norm": 0.37484287127842214, + "learning_rate": 2.2532894736842105e-05, + "loss": 0.6024, + "step": 137 + }, + { + "epoch": 0.13649851632047477, + "grad_norm": 0.3741616001117786, + "learning_rate": 2.2697368421052634e-05, + "loss": 0.6354, + "step": 138 + }, + { + "epoch": 0.13748763600395647, + "grad_norm": 0.40705218782451663, + "learning_rate": 2.286184210526316e-05, + "loss": 0.7532, + "step": 139 + }, + { + "epoch": 0.13847675568743817, + "grad_norm": 0.3677599289459387, + "learning_rate": 2.3026315789473685e-05, + "loss": 0.7191, + "step": 140 + }, + { + "epoch": 0.1394658753709199, + "grad_norm": 0.3999859660311767, + "learning_rate": 2.3190789473684214e-05, + "loss": 0.6836, + "step": 141 + }, + { + "epoch": 0.1404549950544016, + "grad_norm": 0.38230856108223893, + "learning_rate": 2.335526315789474e-05, + "loss": 0.6404, + "step": 142 + }, + { + "epoch": 0.1414441147378833, + "grad_norm": 0.3628560484729775, + "learning_rate": 2.3519736842105265e-05, + "loss": 0.654, + "step": 143 + }, + { + "epoch": 0.142433234421365, + "grad_norm": 0.3448007883826662, + "learning_rate": 2.368421052631579e-05, + "loss": 0.6698, + "step": 144 + }, + { + "epoch": 0.1434223541048467, + "grad_norm": 0.5552741489616734, + "learning_rate": 2.3848684210526316e-05, + "loss": 0.7492, + "step": 145 + }, + { + "epoch": 0.1444114737883284, + "grad_norm": 0.3342617236256358, + "learning_rate": 2.401315789473684e-05, + "loss": 0.5378, + "step": 146 + }, + { + "epoch": 0.14540059347181009, + "grad_norm": 0.3994524276771364, + "learning_rate": 2.4177631578947367e-05, + "loss": 0.5827, + "step": 147 + }, + { + "epoch": 0.14638971315529178, + "grad_norm": 0.40544863606069553, + "learning_rate": 2.4342105263157896e-05, + "loss": 0.6543, + "step": 148 + }, + { + "epoch": 0.14737883283877348, + "grad_norm": 0.36633930457542324, + "learning_rate": 2.450657894736842e-05, + "loss": 0.6954, + "step": 149 + }, + { + "epoch": 0.14836795252225518, + "grad_norm": 0.36860483728084853, + "learning_rate": 2.4671052631578947e-05, + "loss": 0.6117, + "step": 150 + }, + { + "epoch": 0.1493570722057369, + "grad_norm": 0.40299273023116766, + "learning_rate": 2.4835526315789476e-05, + "loss": 0.6677, + "step": 151 + }, + { + "epoch": 0.1503461918892186, + "grad_norm": 0.46292039106861615, + "learning_rate": 2.5e-05, + "loss": 0.7554, + "step": 152 + }, + { + "epoch": 0.1513353115727003, + "grad_norm": 0.3858429666089543, + "learning_rate": 2.5164473684210527e-05, + "loss": 0.6302, + "step": 153 + }, + { + "epoch": 0.152324431256182, + "grad_norm": 0.5074668120955147, + "learning_rate": 2.5328947368421052e-05, + "loss": 0.6442, + "step": 154 + }, + { + "epoch": 0.1533135509396637, + "grad_norm": 0.37588795715965484, + "learning_rate": 2.5493421052631578e-05, + "loss": 0.6647, + "step": 155 + }, + { + "epoch": 0.1543026706231454, + "grad_norm": 0.563121630811762, + "learning_rate": 2.565789473684211e-05, + "loss": 0.6455, + "step": 156 + }, + { + "epoch": 0.1552917903066271, + "grad_norm": 0.48379869834031464, + "learning_rate": 2.5822368421052635e-05, + "loss": 0.723, + "step": 157 + }, + { + "epoch": 0.1562809099901088, + "grad_norm": 0.3916557190918107, + "learning_rate": 2.598684210526316e-05, + "loss": 0.5728, + "step": 158 + }, + { + "epoch": 0.1572700296735905, + "grad_norm": 0.4241188982146622, + "learning_rate": 2.6151315789473686e-05, + "loss": 0.617, + "step": 159 + }, + { + "epoch": 0.1582591493570722, + "grad_norm": 0.45540106668453567, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.6104, + "step": 160 + }, + { + "epoch": 0.15924826904055392, + "grad_norm": 0.4247150549586619, + "learning_rate": 2.6480263157894737e-05, + "loss": 0.6001, + "step": 161 + }, + { + "epoch": 0.16023738872403562, + "grad_norm": 0.4460228840551562, + "learning_rate": 2.6644736842105266e-05, + "loss": 0.5465, + "step": 162 + }, + { + "epoch": 0.16122650840751732, + "grad_norm": 0.38351087890774094, + "learning_rate": 2.6809210526315792e-05, + "loss": 0.6591, + "step": 163 + }, + { + "epoch": 0.16221562809099901, + "grad_norm": 0.46934047916659694, + "learning_rate": 2.6973684210526317e-05, + "loss": 0.7279, + "step": 164 + }, + { + "epoch": 0.1632047477744807, + "grad_norm": 0.4222690324413176, + "learning_rate": 2.7138157894736843e-05, + "loss": 0.6264, + "step": 165 + }, + { + "epoch": 0.1641938674579624, + "grad_norm": 0.4697752681692279, + "learning_rate": 2.730263157894737e-05, + "loss": 0.7027, + "step": 166 + }, + { + "epoch": 0.1651829871414441, + "grad_norm": 0.4702974530466207, + "learning_rate": 2.7467105263157894e-05, + "loss": 0.6994, + "step": 167 + }, + { + "epoch": 0.1661721068249258, + "grad_norm": 0.541483976312347, + "learning_rate": 2.7631578947368426e-05, + "loss": 0.6525, + "step": 168 + }, + { + "epoch": 0.1671612265084075, + "grad_norm": 0.3428171204640418, + "learning_rate": 2.779605263157895e-05, + "loss": 0.6431, + "step": 169 + }, + { + "epoch": 0.1681503461918892, + "grad_norm": 0.472693449324751, + "learning_rate": 2.7960526315789477e-05, + "loss": 0.6693, + "step": 170 + }, + { + "epoch": 0.16913946587537093, + "grad_norm": 0.4554753159492845, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.6924, + "step": 171 + }, + { + "epoch": 0.17012858555885263, + "grad_norm": 0.4616962159670467, + "learning_rate": 2.8289473684210528e-05, + "loss": 0.6082, + "step": 172 + }, + { + "epoch": 0.17111770524233433, + "grad_norm": 0.42013241457970946, + "learning_rate": 2.8453947368421054e-05, + "loss": 0.5976, + "step": 173 + }, + { + "epoch": 0.17210682492581603, + "grad_norm": 0.49647836517813876, + "learning_rate": 2.861842105263158e-05, + "loss": 0.6976, + "step": 174 + }, + { + "epoch": 0.17309594460929772, + "grad_norm": 0.4656047888630097, + "learning_rate": 2.8782894736842108e-05, + "loss": 0.6689, + "step": 175 + }, + { + "epoch": 0.17408506429277942, + "grad_norm": 0.62173274981684, + "learning_rate": 2.8947368421052634e-05, + "loss": 0.5864, + "step": 176 + }, + { + "epoch": 0.17507418397626112, + "grad_norm": 0.44514460495733604, + "learning_rate": 2.911184210526316e-05, + "loss": 0.5959, + "step": 177 + }, + { + "epoch": 0.17606330365974282, + "grad_norm": 0.6575269476526132, + "learning_rate": 2.9276315789473684e-05, + "loss": 0.6128, + "step": 178 + }, + { + "epoch": 0.17705242334322452, + "grad_norm": 0.4280454874654953, + "learning_rate": 2.944078947368421e-05, + "loss": 0.7002, + "step": 179 + }, + { + "epoch": 0.17804154302670624, + "grad_norm": 0.455273030279468, + "learning_rate": 2.9605263157894735e-05, + "loss": 0.682, + "step": 180 + }, + { + "epoch": 0.17903066271018794, + "grad_norm": 0.4558325615984045, + "learning_rate": 2.9769736842105268e-05, + "loss": 0.7028, + "step": 181 + }, + { + "epoch": 0.18001978239366964, + "grad_norm": 0.5877708520421765, + "learning_rate": 2.9934210526315793e-05, + "loss": 0.6836, + "step": 182 + }, + { + "epoch": 0.18100890207715134, + "grad_norm": 0.4596794024140408, + "learning_rate": 3.009868421052632e-05, + "loss": 0.6797, + "step": 183 + }, + { + "epoch": 0.18199802176063304, + "grad_norm": 0.4319051245779275, + "learning_rate": 3.0263157894736844e-05, + "loss": 0.6282, + "step": 184 + }, + { + "epoch": 0.18298714144411474, + "grad_norm": 0.5318817072015158, + "learning_rate": 3.042763157894737e-05, + "loss": 0.6233, + "step": 185 + }, + { + "epoch": 0.18397626112759644, + "grad_norm": 0.4322921072671814, + "learning_rate": 3.059210526315789e-05, + "loss": 0.6573, + "step": 186 + }, + { + "epoch": 0.18496538081107813, + "grad_norm": 0.4885053114909321, + "learning_rate": 3.075657894736843e-05, + "loss": 0.6871, + "step": 187 + }, + { + "epoch": 0.18595450049455983, + "grad_norm": 0.4473265190512735, + "learning_rate": 3.092105263157895e-05, + "loss": 0.6173, + "step": 188 + }, + { + "epoch": 0.18694362017804153, + "grad_norm": 0.5440786287668955, + "learning_rate": 3.108552631578948e-05, + "loss": 0.6141, + "step": 189 + }, + { + "epoch": 0.18793273986152326, + "grad_norm": 0.35769667117178866, + "learning_rate": 3.125e-05, + "loss": 0.694, + "step": 190 + }, + { + "epoch": 0.18892185954500496, + "grad_norm": 0.5059649250810946, + "learning_rate": 3.141447368421053e-05, + "loss": 0.6728, + "step": 191 + }, + { + "epoch": 0.18991097922848665, + "grad_norm": 0.38729560510609135, + "learning_rate": 3.157894736842105e-05, + "loss": 0.5709, + "step": 192 + }, + { + "epoch": 0.19090009891196835, + "grad_norm": 0.5371616873491071, + "learning_rate": 3.174342105263158e-05, + "loss": 0.7155, + "step": 193 + }, + { + "epoch": 0.19188921859545005, + "grad_norm": 0.6053710744464506, + "learning_rate": 3.190789473684211e-05, + "loss": 0.6955, + "step": 194 + }, + { + "epoch": 0.19287833827893175, + "grad_norm": 0.413136441297711, + "learning_rate": 3.207236842105263e-05, + "loss": 0.5837, + "step": 195 + }, + { + "epoch": 0.19386745796241345, + "grad_norm": 0.6398949332164223, + "learning_rate": 3.223684210526316e-05, + "loss": 0.7342, + "step": 196 + }, + { + "epoch": 0.19485657764589515, + "grad_norm": 0.61809961884169, + "learning_rate": 3.240131578947368e-05, + "loss": 0.6147, + "step": 197 + }, + { + "epoch": 0.19584569732937684, + "grad_norm": 0.44467152661669995, + "learning_rate": 3.256578947368421e-05, + "loss": 0.5646, + "step": 198 + }, + { + "epoch": 0.19683481701285854, + "grad_norm": 0.4851641919265251, + "learning_rate": 3.2730263157894734e-05, + "loss": 0.6334, + "step": 199 + }, + { + "epoch": 0.19782393669634027, + "grad_norm": 0.5062493576014657, + "learning_rate": 3.289473684210527e-05, + "loss": 0.5911, + "step": 200 + }, + { + "epoch": 0.19881305637982197, + "grad_norm": 0.5527966709235791, + "learning_rate": 3.305921052631579e-05, + "loss": 0.6273, + "step": 201 + }, + { + "epoch": 0.19980217606330367, + "grad_norm": 0.633334866950127, + "learning_rate": 3.322368421052632e-05, + "loss": 0.7025, + "step": 202 + }, + { + "epoch": 0.20079129574678536, + "grad_norm": 0.48779295740663486, + "learning_rate": 3.338815789473684e-05, + "loss": 0.5861, + "step": 203 + }, + { + "epoch": 0.20178041543026706, + "grad_norm": 0.6070005751744674, + "learning_rate": 3.355263157894737e-05, + "loss": 0.7428, + "step": 204 + }, + { + "epoch": 0.20276953511374876, + "grad_norm": 0.534136315876146, + "learning_rate": 3.371710526315789e-05, + "loss": 0.5728, + "step": 205 + }, + { + "epoch": 0.20375865479723046, + "grad_norm": 0.48792763692000546, + "learning_rate": 3.388157894736842e-05, + "loss": 0.586, + "step": 206 + }, + { + "epoch": 0.20474777448071216, + "grad_norm": 0.52009416122117, + "learning_rate": 3.404605263157895e-05, + "loss": 0.4851, + "step": 207 + }, + { + "epoch": 0.20573689416419386, + "grad_norm": 0.5698103555541044, + "learning_rate": 3.421052631578947e-05, + "loss": 0.6331, + "step": 208 + }, + { + "epoch": 0.20672601384767555, + "grad_norm": 0.509391746916777, + "learning_rate": 3.4375e-05, + "loss": 0.5839, + "step": 209 + }, + { + "epoch": 0.20771513353115728, + "grad_norm": 0.6184628148195033, + "learning_rate": 3.4539473684210524e-05, + "loss": 0.7618, + "step": 210 + }, + { + "epoch": 0.20870425321463898, + "grad_norm": 0.55512739516661, + "learning_rate": 3.470394736842105e-05, + "loss": 0.6801, + "step": 211 + }, + { + "epoch": 0.20969337289812068, + "grad_norm": 0.6380611857299497, + "learning_rate": 3.4868421052631575e-05, + "loss": 0.6445, + "step": 212 + }, + { + "epoch": 0.21068249258160238, + "grad_norm": 0.48106021749776834, + "learning_rate": 3.503289473684211e-05, + "loss": 0.704, + "step": 213 + }, + { + "epoch": 0.21167161226508407, + "grad_norm": 0.5155510274611654, + "learning_rate": 3.519736842105263e-05, + "loss": 0.603, + "step": 214 + }, + { + "epoch": 0.21266073194856577, + "grad_norm": 0.41115306939250096, + "learning_rate": 3.536184210526316e-05, + "loss": 0.6204, + "step": 215 + }, + { + "epoch": 0.21364985163204747, + "grad_norm": 0.42847524914883034, + "learning_rate": 3.5526315789473684e-05, + "loss": 0.6293, + "step": 216 + }, + { + "epoch": 0.21463897131552917, + "grad_norm": 0.40362083266366067, + "learning_rate": 3.569078947368421e-05, + "loss": 0.6025, + "step": 217 + }, + { + "epoch": 0.21562809099901087, + "grad_norm": 0.36583415568507316, + "learning_rate": 3.5855263157894735e-05, + "loss": 0.6429, + "step": 218 + }, + { + "epoch": 0.2166172106824926, + "grad_norm": 0.3792237531903649, + "learning_rate": 3.6019736842105264e-05, + "loss": 0.6304, + "step": 219 + }, + { + "epoch": 0.2176063303659743, + "grad_norm": 0.5016218500071241, + "learning_rate": 3.618421052631579e-05, + "loss": 0.6072, + "step": 220 + }, + { + "epoch": 0.218595450049456, + "grad_norm": 0.5195805824452137, + "learning_rate": 3.6348684210526315e-05, + "loss": 0.6955, + "step": 221 + }, + { + "epoch": 0.2195845697329377, + "grad_norm": 0.40646432999857474, + "learning_rate": 3.6513157894736844e-05, + "loss": 0.5941, + "step": 222 + }, + { + "epoch": 0.2205736894164194, + "grad_norm": 0.48614279865705684, + "learning_rate": 3.6677631578947366e-05, + "loss": 0.7392, + "step": 223 + }, + { + "epoch": 0.2215628090999011, + "grad_norm": 0.4833284553690217, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.7144, + "step": 224 + }, + { + "epoch": 0.22255192878338279, + "grad_norm": 0.4123480628971663, + "learning_rate": 3.7006578947368424e-05, + "loss": 0.5997, + "step": 225 + }, + { + "epoch": 0.22354104846686448, + "grad_norm": 0.58482567880444, + "learning_rate": 3.717105263157895e-05, + "loss": 0.625, + "step": 226 + }, + { + "epoch": 0.22453016815034618, + "grad_norm": 0.4839150358758274, + "learning_rate": 3.7335526315789475e-05, + "loss": 0.5947, + "step": 227 + }, + { + "epoch": 0.22551928783382788, + "grad_norm": 0.5821490050298971, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.63, + "step": 228 + }, + { + "epoch": 0.2265084075173096, + "grad_norm": 0.4025831144151843, + "learning_rate": 3.7664473684210526e-05, + "loss": 0.5721, + "step": 229 + }, + { + "epoch": 0.2274975272007913, + "grad_norm": 0.5564325010312093, + "learning_rate": 3.7828947368421054e-05, + "loss": 0.6745, + "step": 230 + }, + { + "epoch": 0.228486646884273, + "grad_norm": 0.559427271458402, + "learning_rate": 3.7993421052631577e-05, + "loss": 0.6608, + "step": 231 + }, + { + "epoch": 0.2294757665677547, + "grad_norm": 0.4961691077638613, + "learning_rate": 3.815789473684211e-05, + "loss": 0.673, + "step": 232 + }, + { + "epoch": 0.2304648862512364, + "grad_norm": 0.4451618595226397, + "learning_rate": 3.8322368421052634e-05, + "loss": 0.6879, + "step": 233 + }, + { + "epoch": 0.2314540059347181, + "grad_norm": 0.4764748813476515, + "learning_rate": 3.848684210526316e-05, + "loss": 0.6593, + "step": 234 + }, + { + "epoch": 0.2324431256181998, + "grad_norm": 0.4781168643440134, + "learning_rate": 3.8651315789473685e-05, + "loss": 0.6521, + "step": 235 + }, + { + "epoch": 0.2334322453016815, + "grad_norm": 0.4317515889033491, + "learning_rate": 3.8815789473684214e-05, + "loss": 0.7187, + "step": 236 + }, + { + "epoch": 0.2344213649851632, + "grad_norm": 0.5010679062349306, + "learning_rate": 3.8980263157894736e-05, + "loss": 0.6075, + "step": 237 + }, + { + "epoch": 0.2354104846686449, + "grad_norm": 0.48215015664396305, + "learning_rate": 3.9144736842105265e-05, + "loss": 0.6168, + "step": 238 + }, + { + "epoch": 0.23639960435212662, + "grad_norm": 0.4522478488377238, + "learning_rate": 3.9309210526315794e-05, + "loss": 0.6396, + "step": 239 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 0.4577399461188642, + "learning_rate": 3.9473684210526316e-05, + "loss": 0.7265, + "step": 240 + }, + { + "epoch": 0.23837784371909002, + "grad_norm": 2.1998133857858337, + "learning_rate": 3.9638157894736845e-05, + "loss": 0.6542, + "step": 241 + }, + { + "epoch": 0.23936696340257171, + "grad_norm": 0.4803483208680404, + "learning_rate": 3.980263157894737e-05, + "loss": 0.6325, + "step": 242 + }, + { + "epoch": 0.2403560830860534, + "grad_norm": 0.42001637823181315, + "learning_rate": 3.9967105263157896e-05, + "loss": 0.6449, + "step": 243 + }, + { + "epoch": 0.2413452027695351, + "grad_norm": 0.6051649931482518, + "learning_rate": 4.0131578947368425e-05, + "loss": 0.6015, + "step": 244 + }, + { + "epoch": 0.2423343224530168, + "grad_norm": 0.462743464454851, + "learning_rate": 4.0296052631578954e-05, + "loss": 0.6353, + "step": 245 + }, + { + "epoch": 0.2433234421364985, + "grad_norm": 0.5965079725135056, + "learning_rate": 4.0460526315789476e-05, + "loss": 0.6813, + "step": 246 + }, + { + "epoch": 0.2443125618199802, + "grad_norm": 0.515915006211205, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.6897, + "step": 247 + }, + { + "epoch": 0.24530168150346193, + "grad_norm": 0.5532763347341395, + "learning_rate": 4.078947368421053e-05, + "loss": 0.7042, + "step": 248 + }, + { + "epoch": 0.24629080118694363, + "grad_norm": 0.46137687834271673, + "learning_rate": 4.0953947368421056e-05, + "loss": 0.5839, + "step": 249 + }, + { + "epoch": 0.24727992087042533, + "grad_norm": 0.47745663163187557, + "learning_rate": 4.111842105263158e-05, + "loss": 0.6566, + "step": 250 + }, + { + "epoch": 0.24826904055390703, + "grad_norm": 0.44493193669631287, + "learning_rate": 4.128289473684211e-05, + "loss": 0.6223, + "step": 251 + }, + { + "epoch": 0.24925816023738873, + "grad_norm": 0.4169310337329136, + "learning_rate": 4.1447368421052636e-05, + "loss": 0.5799, + "step": 252 + }, + { + "epoch": 0.2502472799208704, + "grad_norm": 0.5324378545388998, + "learning_rate": 4.161184210526316e-05, + "loss": 0.586, + "step": 253 + }, + { + "epoch": 0.2512363996043521, + "grad_norm": 0.4713752675672079, + "learning_rate": 4.177631578947369e-05, + "loss": 0.5844, + "step": 254 + }, + { + "epoch": 0.2522255192878338, + "grad_norm": 0.4246792763221131, + "learning_rate": 4.194078947368421e-05, + "loss": 0.6691, + "step": 255 + }, + { + "epoch": 0.2532146389713155, + "grad_norm": 0.5996820122986719, + "learning_rate": 4.210526315789474e-05, + "loss": 0.6205, + "step": 256 + }, + { + "epoch": 0.2542037586547972, + "grad_norm": 0.47226024365214625, + "learning_rate": 4.226973684210527e-05, + "loss": 0.6654, + "step": 257 + }, + { + "epoch": 0.2551928783382789, + "grad_norm": 0.4718080623174236, + "learning_rate": 4.2434210526315796e-05, + "loss": 0.5252, + "step": 258 + }, + { + "epoch": 0.2561819980217606, + "grad_norm": 0.6350779709078309, + "learning_rate": 4.259868421052632e-05, + "loss": 0.6221, + "step": 259 + }, + { + "epoch": 0.2571711177052423, + "grad_norm": 0.5362489170689355, + "learning_rate": 4.2763157894736847e-05, + "loss": 0.59, + "step": 260 + }, + { + "epoch": 0.258160237388724, + "grad_norm": 0.5037993125750314, + "learning_rate": 4.292763157894737e-05, + "loss": 0.5977, + "step": 261 + }, + { + "epoch": 0.2591493570722057, + "grad_norm": 0.7119819210503119, + "learning_rate": 4.30921052631579e-05, + "loss": 0.5687, + "step": 262 + }, + { + "epoch": 0.26013847675568746, + "grad_norm": 0.48734181172443314, + "learning_rate": 4.3256578947368426e-05, + "loss": 0.5818, + "step": 263 + }, + { + "epoch": 0.26112759643916916, + "grad_norm": 0.5307408330986926, + "learning_rate": 4.342105263157895e-05, + "loss": 0.5485, + "step": 264 + }, + { + "epoch": 0.26211671612265086, + "grad_norm": 0.6102212182790211, + "learning_rate": 4.358552631578948e-05, + "loss": 0.6235, + "step": 265 + }, + { + "epoch": 0.26310583580613256, + "grad_norm": 0.5928998977874212, + "learning_rate": 4.375e-05, + "loss": 0.6044, + "step": 266 + }, + { + "epoch": 0.26409495548961426, + "grad_norm": 0.6688550804132813, + "learning_rate": 4.391447368421053e-05, + "loss": 0.6217, + "step": 267 + }, + { + "epoch": 0.26508407517309596, + "grad_norm": 0.7818753906542993, + "learning_rate": 4.407894736842105e-05, + "loss": 0.5282, + "step": 268 + }, + { + "epoch": 0.26607319485657766, + "grad_norm": 1.0091627489834931, + "learning_rate": 4.424342105263158e-05, + "loss": 0.5692, + "step": 269 + }, + { + "epoch": 0.26706231454005935, + "grad_norm": 0.6011818743170259, + "learning_rate": 4.440789473684211e-05, + "loss": 0.6119, + "step": 270 + }, + { + "epoch": 0.26805143422354105, + "grad_norm": 0.8838865925562805, + "learning_rate": 4.457236842105264e-05, + "loss": 0.6043, + "step": 271 + }, + { + "epoch": 0.26904055390702275, + "grad_norm": 0.6314911229429284, + "learning_rate": 4.473684210526316e-05, + "loss": 0.7673, + "step": 272 + }, + { + "epoch": 0.27002967359050445, + "grad_norm": 0.6804637239015356, + "learning_rate": 4.490131578947369e-05, + "loss": 0.5926, + "step": 273 + }, + { + "epoch": 0.27101879327398615, + "grad_norm": 0.547035473768601, + "learning_rate": 4.506578947368421e-05, + "loss": 0.5431, + "step": 274 + }, + { + "epoch": 0.27200791295746785, + "grad_norm": 0.49151905906223226, + "learning_rate": 4.523026315789474e-05, + "loss": 0.5535, + "step": 275 + }, + { + "epoch": 0.27299703264094954, + "grad_norm": 0.4380519372013023, + "learning_rate": 4.539473684210527e-05, + "loss": 0.7109, + "step": 276 + }, + { + "epoch": 0.27398615232443124, + "grad_norm": 0.7217367910139244, + "learning_rate": 4.555921052631579e-05, + "loss": 0.6292, + "step": 277 + }, + { + "epoch": 0.27497527200791294, + "grad_norm": 0.5585776926241103, + "learning_rate": 4.572368421052632e-05, + "loss": 0.6974, + "step": 278 + }, + { + "epoch": 0.27596439169139464, + "grad_norm": 0.4555730594700509, + "learning_rate": 4.588815789473684e-05, + "loss": 0.6425, + "step": 279 + }, + { + "epoch": 0.27695351137487634, + "grad_norm": 0.6389516893536279, + "learning_rate": 4.605263157894737e-05, + "loss": 0.5537, + "step": 280 + }, + { + "epoch": 0.27794263105835804, + "grad_norm": 0.46095286770730254, + "learning_rate": 4.621710526315789e-05, + "loss": 0.6019, + "step": 281 + }, + { + "epoch": 0.2789317507418398, + "grad_norm": 0.44048312996010536, + "learning_rate": 4.638157894736843e-05, + "loss": 0.612, + "step": 282 + }, + { + "epoch": 0.2799208704253215, + "grad_norm": 1.0389287178149595, + "learning_rate": 4.654605263157895e-05, + "loss": 0.6571, + "step": 283 + }, + { + "epoch": 0.2809099901088032, + "grad_norm": 0.6903364570618536, + "learning_rate": 4.671052631578948e-05, + "loss": 0.6973, + "step": 284 + }, + { + "epoch": 0.2818991097922849, + "grad_norm": 0.4477703451666338, + "learning_rate": 4.6875e-05, + "loss": 0.5894, + "step": 285 + }, + { + "epoch": 0.2828882294757666, + "grad_norm": 0.71253667717858, + "learning_rate": 4.703947368421053e-05, + "loss": 0.6158, + "step": 286 + }, + { + "epoch": 0.2838773491592483, + "grad_norm": 0.6605589318131366, + "learning_rate": 4.720394736842105e-05, + "loss": 0.5547, + "step": 287 + }, + { + "epoch": 0.28486646884273, + "grad_norm": 0.4142085186922097, + "learning_rate": 4.736842105263158e-05, + "loss": 0.6035, + "step": 288 + }, + { + "epoch": 0.2858555885262117, + "grad_norm": 0.5522552002749757, + "learning_rate": 4.753289473684211e-05, + "loss": 0.6205, + "step": 289 + }, + { + "epoch": 0.2868447082096934, + "grad_norm": 0.4553692557081208, + "learning_rate": 4.769736842105263e-05, + "loss": 0.5847, + "step": 290 + }, + { + "epoch": 0.2878338278931751, + "grad_norm": 0.4677603638371184, + "learning_rate": 4.786184210526316e-05, + "loss": 0.6153, + "step": 291 + }, + { + "epoch": 0.2888229475766568, + "grad_norm": 0.716448078885814, + "learning_rate": 4.802631578947368e-05, + "loss": 0.6088, + "step": 292 + }, + { + "epoch": 0.2898120672601385, + "grad_norm": 0.5629123910634363, + "learning_rate": 4.819078947368421e-05, + "loss": 0.7418, + "step": 293 + }, + { + "epoch": 0.29080118694362017, + "grad_norm": 0.5684632906175011, + "learning_rate": 4.8355263157894734e-05, + "loss": 0.6617, + "step": 294 + }, + { + "epoch": 0.29179030662710187, + "grad_norm": 0.7817295121483087, + "learning_rate": 4.851973684210527e-05, + "loss": 0.6587, + "step": 295 + }, + { + "epoch": 0.29277942631058357, + "grad_norm": 0.43426647960692005, + "learning_rate": 4.868421052631579e-05, + "loss": 0.5338, + "step": 296 + }, + { + "epoch": 0.29376854599406527, + "grad_norm": 0.7019981788147629, + "learning_rate": 4.884868421052632e-05, + "loss": 0.6053, + "step": 297 + }, + { + "epoch": 0.29475766567754697, + "grad_norm": 3.0951979705836496, + "learning_rate": 4.901315789473684e-05, + "loss": 0.5918, + "step": 298 + }, + { + "epoch": 0.29574678536102866, + "grad_norm": 1.0233134362024605, + "learning_rate": 4.917763157894737e-05, + "loss": 0.6081, + "step": 299 + }, + { + "epoch": 0.29673590504451036, + "grad_norm": 0.9953914245723955, + "learning_rate": 4.9342105263157894e-05, + "loss": 0.5341, + "step": 300 + }, + { + "epoch": 0.29772502472799206, + "grad_norm": 0.7092328246731249, + "learning_rate": 4.950657894736843e-05, + "loss": 0.7031, + "step": 301 + }, + { + "epoch": 0.2987141444114738, + "grad_norm": 1.1657097575552566, + "learning_rate": 4.967105263157895e-05, + "loss": 0.5386, + "step": 302 + }, + { + "epoch": 0.2997032640949555, + "grad_norm": 0.64809864753375, + "learning_rate": 4.983552631578948e-05, + "loss": 0.5822, + "step": 303 + }, + { + "epoch": 0.3006923837784372, + "grad_norm": 10.779831456434477, + "learning_rate": 5e-05, + "loss": 1.3367, + "step": 304 + }, + { + "epoch": 0.3016815034619189, + "grad_norm": 2.263953506492712, + "learning_rate": 4.9981678270428736e-05, + "loss": 0.652, + "step": 305 + }, + { + "epoch": 0.3026706231454006, + "grad_norm": 1.1026406639523274, + "learning_rate": 4.996335654085746e-05, + "loss": 0.5982, + "step": 306 + }, + { + "epoch": 0.3036597428288823, + "grad_norm": 1.5390268402197822, + "learning_rate": 4.994503481128619e-05, + "loss": 0.5838, + "step": 307 + }, + { + "epoch": 0.304648862512364, + "grad_norm": 2.325507669292494, + "learning_rate": 4.9926713081714915e-05, + "loss": 0.7096, + "step": 308 + }, + { + "epoch": 0.3056379821958457, + "grad_norm": 0.9681764004322482, + "learning_rate": 4.990839135214365e-05, + "loss": 0.5579, + "step": 309 + }, + { + "epoch": 0.3066271018793274, + "grad_norm": 1.1536956780162377, + "learning_rate": 4.9890069622572374e-05, + "loss": 0.6127, + "step": 310 + }, + { + "epoch": 0.3076162215628091, + "grad_norm": 0.934115930451099, + "learning_rate": 4.98717478930011e-05, + "loss": 0.5808, + "step": 311 + }, + { + "epoch": 0.3086053412462908, + "grad_norm": 1.1749017282382166, + "learning_rate": 4.985342616342983e-05, + "loss": 0.6422, + "step": 312 + }, + { + "epoch": 0.3095944609297725, + "grad_norm": 0.8515985999112896, + "learning_rate": 4.983510443385856e-05, + "loss": 0.6175, + "step": 313 + }, + { + "epoch": 0.3105835806132542, + "grad_norm": 6.518960373233818, + "learning_rate": 4.981678270428729e-05, + "loss": 0.8142, + "step": 314 + }, + { + "epoch": 0.3115727002967359, + "grad_norm": 2.1921818273490374, + "learning_rate": 4.979846097471601e-05, + "loss": 0.6226, + "step": 315 + }, + { + "epoch": 0.3125618199802176, + "grad_norm": 1.6420624734489004, + "learning_rate": 4.9780139245144747e-05, + "loss": 0.6004, + "step": 316 + }, + { + "epoch": 0.3135509396636993, + "grad_norm": 0.7686264715968514, + "learning_rate": 4.976181751557347e-05, + "loss": 0.587, + "step": 317 + }, + { + "epoch": 0.314540059347181, + "grad_norm": 1.6301825964138899, + "learning_rate": 4.9743495786002206e-05, + "loss": 0.5783, + "step": 318 + }, + { + "epoch": 0.3155291790306627, + "grad_norm": 1.7471449432767963, + "learning_rate": 4.9725174056430926e-05, + "loss": 0.6721, + "step": 319 + }, + { + "epoch": 0.3165182987141444, + "grad_norm": 1.1192919506428751, + "learning_rate": 4.970685232685966e-05, + "loss": 0.6192, + "step": 320 + }, + { + "epoch": 0.31750741839762614, + "grad_norm": 1.269040128114316, + "learning_rate": 4.9688530597288385e-05, + "loss": 0.6129, + "step": 321 + }, + { + "epoch": 0.31849653808110784, + "grad_norm": 0.828446461431732, + "learning_rate": 4.967020886771712e-05, + "loss": 0.6483, + "step": 322 + }, + { + "epoch": 0.31948565776458954, + "grad_norm": 0.8666239492089782, + "learning_rate": 4.9651887138145845e-05, + "loss": 0.5733, + "step": 323 + }, + { + "epoch": 0.32047477744807124, + "grad_norm": 0.7549123620762485, + "learning_rate": 4.963356540857457e-05, + "loss": 0.6196, + "step": 324 + }, + { + "epoch": 0.32146389713155293, + "grad_norm": 1.8217639567133426, + "learning_rate": 4.96152436790033e-05, + "loss": 0.6356, + "step": 325 + }, + { + "epoch": 0.32245301681503463, + "grad_norm": 2.292729397977442, + "learning_rate": 4.959692194943203e-05, + "loss": 0.6607, + "step": 326 + }, + { + "epoch": 0.32344213649851633, + "grad_norm": 0.83818756217215, + "learning_rate": 4.957860021986076e-05, + "loss": 0.6217, + "step": 327 + }, + { + "epoch": 0.32443125618199803, + "grad_norm": 0.676734652693397, + "learning_rate": 4.9560278490289484e-05, + "loss": 0.592, + "step": 328 + }, + { + "epoch": 0.3254203758654797, + "grad_norm": 0.5902774697246586, + "learning_rate": 4.954195676071822e-05, + "loss": 0.6153, + "step": 329 + }, + { + "epoch": 0.3264094955489614, + "grad_norm": 0.800723034946388, + "learning_rate": 4.9523635031146943e-05, + "loss": 0.664, + "step": 330 + }, + { + "epoch": 0.3273986152324431, + "grad_norm": 0.5767708938288155, + "learning_rate": 4.950531330157567e-05, + "loss": 0.5934, + "step": 331 + }, + { + "epoch": 0.3283877349159248, + "grad_norm": 0.7493746467171817, + "learning_rate": 4.9486991572004396e-05, + "loss": 0.633, + "step": 332 + }, + { + "epoch": 0.3293768545994065, + "grad_norm": 0.8168408769910189, + "learning_rate": 4.946866984243313e-05, + "loss": 0.5523, + "step": 333 + }, + { + "epoch": 0.3303659742828882, + "grad_norm": 0.6357012806203373, + "learning_rate": 4.9450348112861856e-05, + "loss": 0.5584, + "step": 334 + }, + { + "epoch": 0.3313550939663699, + "grad_norm": 0.9570738821630661, + "learning_rate": 4.943202638329059e-05, + "loss": 0.5602, + "step": 335 + }, + { + "epoch": 0.3323442136498516, + "grad_norm": 0.6271482411746069, + "learning_rate": 4.941370465371931e-05, + "loss": 0.6037, + "step": 336 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.8342569793379687, + "learning_rate": 4.939538292414804e-05, + "loss": 0.6013, + "step": 337 + }, + { + "epoch": 0.334322453016815, + "grad_norm": 0.8429515515235894, + "learning_rate": 4.937706119457677e-05, + "loss": 0.5744, + "step": 338 + }, + { + "epoch": 0.3353115727002967, + "grad_norm": 0.41480878493392426, + "learning_rate": 4.93587394650055e-05, + "loss": 0.533, + "step": 339 + }, + { + "epoch": 0.3363006923837784, + "grad_norm": 1.0762372950869659, + "learning_rate": 4.934041773543423e-05, + "loss": 0.6092, + "step": 340 + }, + { + "epoch": 0.33728981206726016, + "grad_norm": 0.5352412089968016, + "learning_rate": 4.9322096005862954e-05, + "loss": 0.5744, + "step": 341 + }, + { + "epoch": 0.33827893175074186, + "grad_norm": 11.795603000530546, + "learning_rate": 4.930377427629169e-05, + "loss": 0.8007, + "step": 342 + }, + { + "epoch": 0.33926805143422356, + "grad_norm": 1.6654251993458582, + "learning_rate": 4.9285452546720414e-05, + "loss": 0.6833, + "step": 343 + }, + { + "epoch": 0.34025717111770526, + "grad_norm": 1.023245256880444, + "learning_rate": 4.926713081714914e-05, + "loss": 0.5568, + "step": 344 + }, + { + "epoch": 0.34124629080118696, + "grad_norm": 0.9874560723539015, + "learning_rate": 4.924880908757787e-05, + "loss": 0.6252, + "step": 345 + }, + { + "epoch": 0.34223541048466866, + "grad_norm": 1.1346945125596772, + "learning_rate": 4.92304873580066e-05, + "loss": 0.667, + "step": 346 + }, + { + "epoch": 0.34322453016815035, + "grad_norm": 0.7734482167051114, + "learning_rate": 4.9212165628435326e-05, + "loss": 0.5809, + "step": 347 + }, + { + "epoch": 0.34421364985163205, + "grad_norm": 0.8763032330208194, + "learning_rate": 4.919384389886405e-05, + "loss": 0.6812, + "step": 348 + }, + { + "epoch": 0.34520276953511375, + "grad_norm": 0.8014512357084477, + "learning_rate": 4.917552216929278e-05, + "loss": 0.5391, + "step": 349 + }, + { + "epoch": 0.34619188921859545, + "grad_norm": 0.6557118095626164, + "learning_rate": 4.915720043972151e-05, + "loss": 0.5202, + "step": 350 + }, + { + "epoch": 0.34718100890207715, + "grad_norm": 0.6296821730209303, + "learning_rate": 4.913887871015024e-05, + "loss": 0.4884, + "step": 351 + }, + { + "epoch": 0.34817012858555885, + "grad_norm": 0.7422194964288009, + "learning_rate": 4.912055698057897e-05, + "loss": 0.5452, + "step": 352 + }, + { + "epoch": 0.34915924826904055, + "grad_norm": 0.5946076118500246, + "learning_rate": 4.91022352510077e-05, + "loss": 0.6166, + "step": 353 + }, + { + "epoch": 0.35014836795252224, + "grad_norm": 0.732570052375268, + "learning_rate": 4.9083913521436425e-05, + "loss": 0.6611, + "step": 354 + }, + { + "epoch": 0.35113748763600394, + "grad_norm": 0.4698106032909168, + "learning_rate": 4.906559179186516e-05, + "loss": 0.5601, + "step": 355 + }, + { + "epoch": 0.35212660731948564, + "grad_norm": 1.2453092526176186, + "learning_rate": 4.9047270062293885e-05, + "loss": 0.6009, + "step": 356 + }, + { + "epoch": 0.35311572700296734, + "grad_norm": 0.4418805854104305, + "learning_rate": 4.902894833272261e-05, + "loss": 0.6021, + "step": 357 + }, + { + "epoch": 0.35410484668644904, + "grad_norm": 0.5326586341210466, + "learning_rate": 4.901062660315134e-05, + "loss": 0.5839, + "step": 358 + }, + { + "epoch": 0.35509396636993074, + "grad_norm": 0.41554111643426483, + "learning_rate": 4.899230487358007e-05, + "loss": 0.5615, + "step": 359 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 0.534276978725234, + "learning_rate": 4.89739831440088e-05, + "loss": 0.6854, + "step": 360 + }, + { + "epoch": 0.3570722057368942, + "grad_norm": 0.41429157493399854, + "learning_rate": 4.8955661414437523e-05, + "loss": 0.5839, + "step": 361 + }, + { + "epoch": 0.3580613254203759, + "grad_norm": 1.8567649357537324, + "learning_rate": 4.893733968486625e-05, + "loss": 0.5924, + "step": 362 + }, + { + "epoch": 0.3590504451038576, + "grad_norm": 0.8369610972802223, + "learning_rate": 4.891901795529498e-05, + "loss": 0.6518, + "step": 363 + }, + { + "epoch": 0.3600395647873393, + "grad_norm": 0.5759620691607789, + "learning_rate": 4.8900696225723716e-05, + "loss": 0.5869, + "step": 364 + }, + { + "epoch": 0.361028684470821, + "grad_norm": 0.8489394500046676, + "learning_rate": 4.8882374496152436e-05, + "loss": 0.621, + "step": 365 + }, + { + "epoch": 0.3620178041543027, + "grad_norm": 0.6560658008069779, + "learning_rate": 4.886405276658117e-05, + "loss": 0.6094, + "step": 366 + }, + { + "epoch": 0.3630069238377844, + "grad_norm": 0.7865989416550275, + "learning_rate": 4.8845731037009895e-05, + "loss": 0.5788, + "step": 367 + }, + { + "epoch": 0.3639960435212661, + "grad_norm": 0.6537131935874835, + "learning_rate": 4.882740930743863e-05, + "loss": 0.5797, + "step": 368 + }, + { + "epoch": 0.3649851632047478, + "grad_norm": 0.6909493769793963, + "learning_rate": 4.8809087577867355e-05, + "loss": 0.5842, + "step": 369 + }, + { + "epoch": 0.3659742828882295, + "grad_norm": 0.4769547461410241, + "learning_rate": 4.879076584829608e-05, + "loss": 0.6062, + "step": 370 + }, + { + "epoch": 0.3669634025717112, + "grad_norm": 0.8200716144700505, + "learning_rate": 4.877244411872481e-05, + "loss": 0.6748, + "step": 371 + }, + { + "epoch": 0.36795252225519287, + "grad_norm": 0.45770566187660844, + "learning_rate": 4.875412238915354e-05, + "loss": 0.527, + "step": 372 + }, + { + "epoch": 0.36894164193867457, + "grad_norm": 0.6555897440490205, + "learning_rate": 4.873580065958227e-05, + "loss": 0.5222, + "step": 373 + }, + { + "epoch": 0.36993076162215627, + "grad_norm": 0.7580354956119526, + "learning_rate": 4.8717478930010994e-05, + "loss": 0.6309, + "step": 374 + }, + { + "epoch": 0.37091988130563797, + "grad_norm": 0.5439082223239077, + "learning_rate": 4.869915720043972e-05, + "loss": 0.5571, + "step": 375 + }, + { + "epoch": 0.37190900098911966, + "grad_norm": 0.8331347227307716, + "learning_rate": 4.8680835470868454e-05, + "loss": 0.5607, + "step": 376 + }, + { + "epoch": 0.37289812067260136, + "grad_norm": 0.5141516865612674, + "learning_rate": 4.866251374129718e-05, + "loss": 0.6567, + "step": 377 + }, + { + "epoch": 0.37388724035608306, + "grad_norm": 0.7141235700823007, + "learning_rate": 4.8644192011725906e-05, + "loss": 0.5669, + "step": 378 + }, + { + "epoch": 0.37487636003956476, + "grad_norm": 0.5560006726258888, + "learning_rate": 4.862587028215464e-05, + "loss": 0.6324, + "step": 379 + }, + { + "epoch": 0.3758654797230465, + "grad_norm": 0.7409978312950718, + "learning_rate": 4.8607548552583366e-05, + "loss": 0.7242, + "step": 380 + }, + { + "epoch": 0.3768545994065282, + "grad_norm": 0.5440930687616704, + "learning_rate": 4.85892268230121e-05, + "loss": 0.6254, + "step": 381 + }, + { + "epoch": 0.3778437190900099, + "grad_norm": 0.6478252439008088, + "learning_rate": 4.857090509344082e-05, + "loss": 0.5264, + "step": 382 + }, + { + "epoch": 0.3788328387734916, + "grad_norm": 0.4709599342774292, + "learning_rate": 4.855258336386955e-05, + "loss": 0.5647, + "step": 383 + }, + { + "epoch": 0.3798219584569733, + "grad_norm": 1.18640771551092, + "learning_rate": 4.853426163429828e-05, + "loss": 0.7512, + "step": 384 + }, + { + "epoch": 0.380811078140455, + "grad_norm": 0.3923235190086449, + "learning_rate": 4.851593990472701e-05, + "loss": 0.6049, + "step": 385 + }, + { + "epoch": 0.3818001978239367, + "grad_norm": 0.4781166108075703, + "learning_rate": 4.849761817515573e-05, + "loss": 0.5453, + "step": 386 + }, + { + "epoch": 0.3827893175074184, + "grad_norm": 0.3725786214381448, + "learning_rate": 4.8479296445584464e-05, + "loss": 0.5387, + "step": 387 + }, + { + "epoch": 0.3837784371909001, + "grad_norm": 0.4987148506610239, + "learning_rate": 4.84609747160132e-05, + "loss": 0.547, + "step": 388 + }, + { + "epoch": 0.3847675568743818, + "grad_norm": 0.35932327262892194, + "learning_rate": 4.8442652986441924e-05, + "loss": 0.5214, + "step": 389 + }, + { + "epoch": 0.3857566765578635, + "grad_norm": 0.690564658045055, + "learning_rate": 4.842433125687065e-05, + "loss": 0.591, + "step": 390 + }, + { + "epoch": 0.3867457962413452, + "grad_norm": 0.3463972013191391, + "learning_rate": 4.840600952729938e-05, + "loss": 0.6385, + "step": 391 + }, + { + "epoch": 0.3877349159248269, + "grad_norm": 0.3543061291829264, + "learning_rate": 4.838768779772811e-05, + "loss": 0.517, + "step": 392 + }, + { + "epoch": 0.3887240356083086, + "grad_norm": 0.3660023274107496, + "learning_rate": 4.8369366068156837e-05, + "loss": 0.5781, + "step": 393 + }, + { + "epoch": 0.3897131552917903, + "grad_norm": 0.35899772143056496, + "learning_rate": 4.835104433858556e-05, + "loss": 0.6156, + "step": 394 + }, + { + "epoch": 0.390702274975272, + "grad_norm": 0.4516133836964091, + "learning_rate": 4.833272260901429e-05, + "loss": 0.555, + "step": 395 + }, + { + "epoch": 0.3916913946587537, + "grad_norm": 4.069658753953614, + "learning_rate": 4.831440087944302e-05, + "loss": 0.6786, + "step": 396 + }, + { + "epoch": 0.3926805143422354, + "grad_norm": 0.5512608577859164, + "learning_rate": 4.829607914987175e-05, + "loss": 0.5819, + "step": 397 + }, + { + "epoch": 0.3936696340257171, + "grad_norm": 0.6001718577059937, + "learning_rate": 4.827775742030048e-05, + "loss": 0.5866, + "step": 398 + }, + { + "epoch": 0.39465875370919884, + "grad_norm": 0.43723220327977047, + "learning_rate": 4.82594356907292e-05, + "loss": 0.6223, + "step": 399 + }, + { + "epoch": 0.39564787339268054, + "grad_norm": 0.4091883159043, + "learning_rate": 4.8241113961157935e-05, + "loss": 0.5937, + "step": 400 + }, + { + "epoch": 0.39663699307616224, + "grad_norm": 0.3627182581990674, + "learning_rate": 4.822279223158667e-05, + "loss": 0.4927, + "step": 401 + }, + { + "epoch": 0.39762611275964393, + "grad_norm": 3.395118475745649, + "learning_rate": 4.8204470502015395e-05, + "loss": 0.6651, + "step": 402 + }, + { + "epoch": 0.39861523244312563, + "grad_norm": 0.503406651457647, + "learning_rate": 4.818614877244412e-05, + "loss": 0.6462, + "step": 403 + }, + { + "epoch": 0.39960435212660733, + "grad_norm": 0.41164305421638864, + "learning_rate": 4.816782704287285e-05, + "loss": 0.5303, + "step": 404 + }, + { + "epoch": 0.40059347181008903, + "grad_norm": 0.44166637125698777, + "learning_rate": 4.814950531330158e-05, + "loss": 0.5474, + "step": 405 + }, + { + "epoch": 0.40158259149357073, + "grad_norm": 0.48008127596420697, + "learning_rate": 4.813118358373031e-05, + "loss": 0.6364, + "step": 406 + }, + { + "epoch": 0.4025717111770524, + "grad_norm": 0.46798165410172304, + "learning_rate": 4.8112861854159033e-05, + "loss": 0.6383, + "step": 407 + }, + { + "epoch": 0.4035608308605341, + "grad_norm": 0.44125266116511586, + "learning_rate": 4.809454012458776e-05, + "loss": 0.5955, + "step": 408 + }, + { + "epoch": 0.4045499505440158, + "grad_norm": 0.4395464902377213, + "learning_rate": 4.807621839501649e-05, + "loss": 0.7118, + "step": 409 + }, + { + "epoch": 0.4055390702274975, + "grad_norm": 0.3816754087372529, + "learning_rate": 4.805789666544522e-05, + "loss": 0.5148, + "step": 410 + }, + { + "epoch": 0.4065281899109792, + "grad_norm": 0.41359029720451895, + "learning_rate": 4.8039574935873946e-05, + "loss": 0.588, + "step": 411 + }, + { + "epoch": 0.4075173095944609, + "grad_norm": 0.4693298663552246, + "learning_rate": 4.802125320630268e-05, + "loss": 0.6457, + "step": 412 + }, + { + "epoch": 0.4085064292779426, + "grad_norm": 0.4067116282641899, + "learning_rate": 4.8002931476731406e-05, + "loss": 0.5495, + "step": 413 + }, + { + "epoch": 0.4094955489614243, + "grad_norm": 0.4272277873309838, + "learning_rate": 4.798460974716014e-05, + "loss": 0.6337, + "step": 414 + }, + { + "epoch": 0.410484668644906, + "grad_norm": 0.4811569091836984, + "learning_rate": 4.7966288017588865e-05, + "loss": 0.6149, + "step": 415 + }, + { + "epoch": 0.4114737883283877, + "grad_norm": 0.38122687491339136, + "learning_rate": 4.794796628801759e-05, + "loss": 0.5846, + "step": 416 + }, + { + "epoch": 0.4124629080118694, + "grad_norm": 0.4862215332358342, + "learning_rate": 4.792964455844632e-05, + "loss": 0.6186, + "step": 417 + }, + { + "epoch": 0.4134520276953511, + "grad_norm": 0.47460277960742403, + "learning_rate": 4.791132282887505e-05, + "loss": 0.5941, + "step": 418 + }, + { + "epoch": 0.41444114737883286, + "grad_norm": 0.40971075817407676, + "learning_rate": 4.789300109930378e-05, + "loss": 0.5409, + "step": 419 + }, + { + "epoch": 0.41543026706231456, + "grad_norm": 0.40503923340944115, + "learning_rate": 4.7874679369732504e-05, + "loss": 0.5656, + "step": 420 + }, + { + "epoch": 0.41641938674579626, + "grad_norm": 0.47166101201631216, + "learning_rate": 4.785635764016123e-05, + "loss": 0.5734, + "step": 421 + }, + { + "epoch": 0.41740850642927796, + "grad_norm": 0.4074965319088034, + "learning_rate": 4.7838035910589964e-05, + "loss": 0.552, + "step": 422 + }, + { + "epoch": 0.41839762611275966, + "grad_norm": 0.39208549972043516, + "learning_rate": 4.781971418101869e-05, + "loss": 0.6738, + "step": 423 + }, + { + "epoch": 0.41938674579624136, + "grad_norm": 0.6011637351858017, + "learning_rate": 4.7801392451447416e-05, + "loss": 0.5603, + "step": 424 + }, + { + "epoch": 0.42037586547972305, + "grad_norm": 0.3726260226014429, + "learning_rate": 4.778307072187615e-05, + "loss": 0.5816, + "step": 425 + }, + { + "epoch": 0.42136498516320475, + "grad_norm": 0.6236845405373085, + "learning_rate": 4.7764748992304876e-05, + "loss": 0.6482, + "step": 426 + }, + { + "epoch": 0.42235410484668645, + "grad_norm": 3.2608951858084922, + "learning_rate": 4.774642726273361e-05, + "loss": 0.6115, + "step": 427 + }, + { + "epoch": 0.42334322453016815, + "grad_norm": 0.5292345408652966, + "learning_rate": 4.772810553316233e-05, + "loss": 0.5683, + "step": 428 + }, + { + "epoch": 0.42433234421364985, + "grad_norm": 0.7800091939799507, + "learning_rate": 4.770978380359106e-05, + "loss": 0.5404, + "step": 429 + }, + { + "epoch": 0.42532146389713155, + "grad_norm": 0.3947675558178534, + "learning_rate": 4.769146207401979e-05, + "loss": 0.5609, + "step": 430 + }, + { + "epoch": 0.42631058358061324, + "grad_norm": 8.27438656210951, + "learning_rate": 4.767314034444852e-05, + "loss": 0.8391, + "step": 431 + }, + { + "epoch": 0.42729970326409494, + "grad_norm": 0.8743079722440797, + "learning_rate": 4.765481861487724e-05, + "loss": 0.5703, + "step": 432 + }, + { + "epoch": 0.42828882294757664, + "grad_norm": 1.903800162095679, + "learning_rate": 4.7636496885305975e-05, + "loss": 0.5864, + "step": 433 + }, + { + "epoch": 0.42927794263105834, + "grad_norm": 0.6629417966412571, + "learning_rate": 4.76181751557347e-05, + "loss": 0.5367, + "step": 434 + }, + { + "epoch": 0.43026706231454004, + "grad_norm": 0.7219662282705351, + "learning_rate": 4.7599853426163434e-05, + "loss": 0.6087, + "step": 435 + }, + { + "epoch": 0.43125618199802174, + "grad_norm": 0.4530515462755643, + "learning_rate": 4.758153169659216e-05, + "loss": 0.5095, + "step": 436 + }, + { + "epoch": 0.43224530168150344, + "grad_norm": 0.7726116565074944, + "learning_rate": 4.756320996702089e-05, + "loss": 0.6424, + "step": 437 + }, + { + "epoch": 0.4332344213649852, + "grad_norm": 0.5383329629673866, + "learning_rate": 4.754488823744962e-05, + "loss": 0.6031, + "step": 438 + }, + { + "epoch": 0.4342235410484669, + "grad_norm": 0.6766939106280287, + "learning_rate": 4.7526566507878347e-05, + "loss": 0.6031, + "step": 439 + }, + { + "epoch": 0.4352126607319486, + "grad_norm": 0.6940726364507884, + "learning_rate": 4.750824477830707e-05, + "loss": 0.5308, + "step": 440 + }, + { + "epoch": 0.4362017804154303, + "grad_norm": 0.5386095591035055, + "learning_rate": 4.74899230487358e-05, + "loss": 0.5361, + "step": 441 + }, + { + "epoch": 0.437190900098912, + "grad_norm": 0.6810339477078869, + "learning_rate": 4.747160131916453e-05, + "loss": 0.5189, + "step": 442 + }, + { + "epoch": 0.4381800197823937, + "grad_norm": 0.4935297438689495, + "learning_rate": 4.745327958959326e-05, + "loss": 0.5648, + "step": 443 + }, + { + "epoch": 0.4391691394658754, + "grad_norm": 0.5426668934382847, + "learning_rate": 4.743495786002199e-05, + "loss": 0.5745, + "step": 444 + }, + { + "epoch": 0.4401582591493571, + "grad_norm": 0.5200388242525367, + "learning_rate": 4.741663613045071e-05, + "loss": 0.5863, + "step": 445 + }, + { + "epoch": 0.4411473788328388, + "grad_norm": 1.3830554463390863, + "learning_rate": 4.7398314400879445e-05, + "loss": 0.5881, + "step": 446 + }, + { + "epoch": 0.4421364985163205, + "grad_norm": 0.6948346194161272, + "learning_rate": 4.737999267130817e-05, + "loss": 0.5239, + "step": 447 + }, + { + "epoch": 0.4431256181998022, + "grad_norm": 0.6601938762365581, + "learning_rate": 4.7361670941736905e-05, + "loss": 0.5487, + "step": 448 + }, + { + "epoch": 0.44411473788328387, + "grad_norm": 0.5617566077718212, + "learning_rate": 4.734334921216563e-05, + "loss": 0.6214, + "step": 449 + }, + { + "epoch": 0.44510385756676557, + "grad_norm": 0.732252290442145, + "learning_rate": 4.732502748259436e-05, + "loss": 0.5628, + "step": 450 + }, + { + "epoch": 0.44609297725024727, + "grad_norm": 0.44306508264201677, + "learning_rate": 4.730670575302309e-05, + "loss": 0.6078, + "step": 451 + }, + { + "epoch": 0.44708209693372897, + "grad_norm": 0.5296732642576735, + "learning_rate": 4.728838402345182e-05, + "loss": 0.5147, + "step": 452 + }, + { + "epoch": 0.44807121661721067, + "grad_norm": 1.652456677052837, + "learning_rate": 4.7270062293880544e-05, + "loss": 0.5598, + "step": 453 + }, + { + "epoch": 0.44906033630069236, + "grad_norm": 0.6144592738271647, + "learning_rate": 4.725174056430927e-05, + "loss": 0.51, + "step": 454 + }, + { + "epoch": 0.45004945598417406, + "grad_norm": 0.4613783043555175, + "learning_rate": 4.7233418834738e-05, + "loss": 0.6202, + "step": 455 + }, + { + "epoch": 0.45103857566765576, + "grad_norm": 0.5213448885231209, + "learning_rate": 4.721509710516673e-05, + "loss": 0.5308, + "step": 456 + }, + { + "epoch": 0.4520276953511375, + "grad_norm": 0.46820252742016644, + "learning_rate": 4.7196775375595456e-05, + "loss": 0.5832, + "step": 457 + }, + { + "epoch": 0.4530168150346192, + "grad_norm": 0.42474287967898067, + "learning_rate": 4.717845364602418e-05, + "loss": 0.5684, + "step": 458 + }, + { + "epoch": 0.4540059347181009, + "grad_norm": 0.39392310575419126, + "learning_rate": 4.7160131916452916e-05, + "loss": 0.624, + "step": 459 + }, + { + "epoch": 0.4549950544015826, + "grad_norm": 0.4175763227925975, + "learning_rate": 4.714181018688165e-05, + "loss": 0.6065, + "step": 460 + }, + { + "epoch": 0.4559841740850643, + "grad_norm": 0.3800322746234243, + "learning_rate": 4.7123488457310375e-05, + "loss": 0.6242, + "step": 461 + }, + { + "epoch": 0.456973293768546, + "grad_norm": 0.37365387127732047, + "learning_rate": 4.71051667277391e-05, + "loss": 0.6967, + "step": 462 + }, + { + "epoch": 0.4579624134520277, + "grad_norm": 0.46031496253337684, + "learning_rate": 4.708684499816783e-05, + "loss": 0.5547, + "step": 463 + }, + { + "epoch": 0.4589515331355094, + "grad_norm": 0.39156501494001045, + "learning_rate": 4.706852326859656e-05, + "loss": 0.5693, + "step": 464 + }, + { + "epoch": 0.4599406528189911, + "grad_norm": 0.37353175638665764, + "learning_rate": 4.705020153902529e-05, + "loss": 0.587, + "step": 465 + }, + { + "epoch": 0.4609297725024728, + "grad_norm": 0.43060709388709534, + "learning_rate": 4.7031879809454014e-05, + "loss": 0.5674, + "step": 466 + }, + { + "epoch": 0.4619188921859545, + "grad_norm": 0.38735425009204794, + "learning_rate": 4.701355807988274e-05, + "loss": 0.6058, + "step": 467 + }, + { + "epoch": 0.4629080118694362, + "grad_norm": 0.46266944891118184, + "learning_rate": 4.6995236350311474e-05, + "loss": 0.5652, + "step": 468 + }, + { + "epoch": 0.4638971315529179, + "grad_norm": 0.36247469180107994, + "learning_rate": 4.69769146207402e-05, + "loss": 0.6043, + "step": 469 + }, + { + "epoch": 0.4648862512363996, + "grad_norm": 0.4030557538436561, + "learning_rate": 4.6958592891168927e-05, + "loss": 0.6128, + "step": 470 + }, + { + "epoch": 0.4658753709198813, + "grad_norm": 0.6033825885688812, + "learning_rate": 4.694027116159766e-05, + "loss": 0.5646, + "step": 471 + }, + { + "epoch": 0.466864490603363, + "grad_norm": 0.37720757452451065, + "learning_rate": 4.6921949432026386e-05, + "loss": 0.5317, + "step": 472 + }, + { + "epoch": 0.4678536102868447, + "grad_norm": 0.5333495762992678, + "learning_rate": 4.690362770245512e-05, + "loss": 0.6658, + "step": 473 + }, + { + "epoch": 0.4688427299703264, + "grad_norm": 0.5133974898384014, + "learning_rate": 4.688530597288384e-05, + "loss": 0.5035, + "step": 474 + }, + { + "epoch": 0.4698318496538081, + "grad_norm": 0.4539945208932187, + "learning_rate": 4.686698424331257e-05, + "loss": 0.5747, + "step": 475 + }, + { + "epoch": 0.4708209693372898, + "grad_norm": 0.41564631661574264, + "learning_rate": 4.68486625137413e-05, + "loss": 0.4843, + "step": 476 + }, + { + "epoch": 0.47181008902077154, + "grad_norm": 0.38138121789633134, + "learning_rate": 4.683034078417003e-05, + "loss": 0.5409, + "step": 477 + }, + { + "epoch": 0.47279920870425324, + "grad_norm": 0.44871468038998025, + "learning_rate": 4.681201905459875e-05, + "loss": 0.5109, + "step": 478 + }, + { + "epoch": 0.47378832838773494, + "grad_norm": 0.3177694456049873, + "learning_rate": 4.6793697325027485e-05, + "loss": 0.602, + "step": 479 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 0.46257643128201287, + "learning_rate": 4.677537559545621e-05, + "loss": 0.6047, + "step": 480 + }, + { + "epoch": 0.47576656775469833, + "grad_norm": 4.67269769452667, + "learning_rate": 4.6757053865884944e-05, + "loss": 0.8503, + "step": 481 + }, + { + "epoch": 0.47675568743818003, + "grad_norm": 0.39836333155328146, + "learning_rate": 4.673873213631367e-05, + "loss": 0.6113, + "step": 482 + }, + { + "epoch": 0.47774480712166173, + "grad_norm": 2.353788660145488, + "learning_rate": 4.67204104067424e-05, + "loss": 0.7275, + "step": 483 + }, + { + "epoch": 0.47873392680514343, + "grad_norm": 0.601097344141485, + "learning_rate": 4.670208867717113e-05, + "loss": 0.6335, + "step": 484 + }, + { + "epoch": 0.4797230464886251, + "grad_norm": 0.36883129234828055, + "learning_rate": 4.668376694759986e-05, + "loss": 0.6315, + "step": 485 + }, + { + "epoch": 0.4807121661721068, + "grad_norm": 1.8909305038276623, + "learning_rate": 4.666544521802858e-05, + "loss": 0.7504, + "step": 486 + }, + { + "epoch": 0.4817012858555885, + "grad_norm": 0.4591587176811992, + "learning_rate": 4.664712348845731e-05, + "loss": 0.5666, + "step": 487 + }, + { + "epoch": 0.4826904055390702, + "grad_norm": 0.4144642652670824, + "learning_rate": 4.662880175888604e-05, + "loss": 0.561, + "step": 488 + }, + { + "epoch": 0.4836795252225519, + "grad_norm": 5.896448976676767, + "learning_rate": 4.661048002931477e-05, + "loss": 0.9015, + "step": 489 + }, + { + "epoch": 0.4846686449060336, + "grad_norm": 0.5720698016034343, + "learning_rate": 4.65921582997435e-05, + "loss": 0.6114, + "step": 490 + }, + { + "epoch": 0.4856577645895153, + "grad_norm": 0.3679119804465383, + "learning_rate": 4.657383657017222e-05, + "loss": 0.5466, + "step": 491 + }, + { + "epoch": 0.486646884272997, + "grad_norm": 0.44989689849502357, + "learning_rate": 4.6555514840600955e-05, + "loss": 0.5291, + "step": 492 + }, + { + "epoch": 0.4876360039564787, + "grad_norm": 0.5024872015252905, + "learning_rate": 4.653719311102968e-05, + "loss": 0.6009, + "step": 493 + }, + { + "epoch": 0.4886251236399604, + "grad_norm": 0.464562771119027, + "learning_rate": 4.6518871381458415e-05, + "loss": 0.5875, + "step": 494 + }, + { + "epoch": 0.4896142433234421, + "grad_norm": 0.48167751471277875, + "learning_rate": 4.650054965188714e-05, + "loss": 0.4983, + "step": 495 + }, + { + "epoch": 0.49060336300692386, + "grad_norm": 1.9507530661852923, + "learning_rate": 4.648222792231587e-05, + "loss": 0.6099, + "step": 496 + }, + { + "epoch": 0.49159248269040556, + "grad_norm": 0.606821397983761, + "learning_rate": 4.64639061927446e-05, + "loss": 0.5899, + "step": 497 + }, + { + "epoch": 0.49258160237388726, + "grad_norm": 0.533867569738737, + "learning_rate": 4.644558446317333e-05, + "loss": 0.6233, + "step": 498 + }, + { + "epoch": 0.49357072205736896, + "grad_norm": 0.529229125116131, + "learning_rate": 4.6427262733602054e-05, + "loss": 0.5453, + "step": 499 + }, + { + "epoch": 0.49455984174085066, + "grad_norm": 0.4916159848073325, + "learning_rate": 4.640894100403078e-05, + "loss": 0.5724, + "step": 500 + }, + { + "epoch": 0.49554896142433236, + "grad_norm": 0.4818563677803172, + "learning_rate": 4.639061927445951e-05, + "loss": 0.5693, + "step": 501 + }, + { + "epoch": 0.49653808110781406, + "grad_norm": 4.7169140468910316, + "learning_rate": 4.637229754488824e-05, + "loss": 0.6424, + "step": 502 + }, + { + "epoch": 0.49752720079129575, + "grad_norm": 0.6812953196471956, + "learning_rate": 4.6353975815316966e-05, + "loss": 0.5695, + "step": 503 + }, + { + "epoch": 0.49851632047477745, + "grad_norm": 0.4430846469861221, + "learning_rate": 4.633565408574569e-05, + "loss": 0.5638, + "step": 504 + }, + { + "epoch": 0.49950544015825915, + "grad_norm": 0.5867096506625531, + "learning_rate": 4.6317332356174426e-05, + "loss": 0.577, + "step": 505 + }, + { + "epoch": 0.5004945598417408, + "grad_norm": 0.9442304457268548, + "learning_rate": 4.629901062660315e-05, + "loss": 0.5029, + "step": 506 + }, + { + "epoch": 0.5014836795252225, + "grad_norm": 0.5146057080865236, + "learning_rate": 4.6280688897031885e-05, + "loss": 0.5866, + "step": 507 + }, + { + "epoch": 0.5024727992087042, + "grad_norm": 0.40718135381499504, + "learning_rate": 4.626236716746061e-05, + "loss": 0.5325, + "step": 508 + }, + { + "epoch": 0.503461918892186, + "grad_norm": 0.5519103208111765, + "learning_rate": 4.624404543788934e-05, + "loss": 0.6358, + "step": 509 + }, + { + "epoch": 0.5044510385756676, + "grad_norm": 0.43699948345174316, + "learning_rate": 4.622572370831807e-05, + "loss": 0.608, + "step": 510 + }, + { + "epoch": 0.5054401582591493, + "grad_norm": 0.521982987910703, + "learning_rate": 4.62074019787468e-05, + "loss": 0.6314, + "step": 511 + }, + { + "epoch": 0.506429277942631, + "grad_norm": 0.3869957567924524, + "learning_rate": 4.6189080249175524e-05, + "loss": 0.4741, + "step": 512 + }, + { + "epoch": 0.5074183976261127, + "grad_norm": 0.4939706787894611, + "learning_rate": 4.617075851960425e-05, + "loss": 0.562, + "step": 513 + }, + { + "epoch": 0.5084075173095944, + "grad_norm": 0.4173292639174069, + "learning_rate": 4.6152436790032984e-05, + "loss": 0.5002, + "step": 514 + }, + { + "epoch": 0.5093966369930761, + "grad_norm": 0.4096494198363875, + "learning_rate": 4.613411506046171e-05, + "loss": 0.4885, + "step": 515 + }, + { + "epoch": 0.5103857566765578, + "grad_norm": 9.858981341609425, + "learning_rate": 4.6115793330890437e-05, + "loss": 0.685, + "step": 516 + }, + { + "epoch": 0.5113748763600395, + "grad_norm": 0.5195002022290981, + "learning_rate": 4.609747160131916e-05, + "loss": 0.5369, + "step": 517 + }, + { + "epoch": 0.5123639960435212, + "grad_norm": 0.3934474506050803, + "learning_rate": 4.6079149871747896e-05, + "loss": 0.5853, + "step": 518 + }, + { + "epoch": 0.5133531157270029, + "grad_norm": 0.4134221147318054, + "learning_rate": 4.606082814217663e-05, + "loss": 0.5617, + "step": 519 + }, + { + "epoch": 0.5143422354104846, + "grad_norm": 0.42829816283403555, + "learning_rate": 4.604250641260535e-05, + "loss": 0.5531, + "step": 520 + }, + { + "epoch": 0.5153313550939663, + "grad_norm": 0.48752740238717335, + "learning_rate": 4.602418468303408e-05, + "loss": 0.606, + "step": 521 + }, + { + "epoch": 0.516320474777448, + "grad_norm": 0.38538760178318904, + "learning_rate": 4.600586295346281e-05, + "loss": 0.5335, + "step": 522 + }, + { + "epoch": 0.5173095944609297, + "grad_norm": 0.3936884719789974, + "learning_rate": 4.598754122389154e-05, + "loss": 0.5447, + "step": 523 + }, + { + "epoch": 0.5182987141444114, + "grad_norm": 0.4059453713559687, + "learning_rate": 4.596921949432026e-05, + "loss": 0.4842, + "step": 524 + }, + { + "epoch": 0.5192878338278932, + "grad_norm": 1.5542942915947051, + "learning_rate": 4.5950897764748995e-05, + "loss": 0.6171, + "step": 525 + }, + { + "epoch": 0.5202769535113749, + "grad_norm": 1.5720532397067744, + "learning_rate": 4.593257603517772e-05, + "loss": 0.5877, + "step": 526 + }, + { + "epoch": 0.5212660731948566, + "grad_norm": 2.9326347862637765, + "learning_rate": 4.5914254305606454e-05, + "loss": 0.6602, + "step": 527 + }, + { + "epoch": 0.5222551928783383, + "grad_norm": 0.40876849501610224, + "learning_rate": 4.589593257603518e-05, + "loss": 0.4963, + "step": 528 + }, + { + "epoch": 0.52324431256182, + "grad_norm": 0.43764445066833524, + "learning_rate": 4.587761084646391e-05, + "loss": 0.5417, + "step": 529 + }, + { + "epoch": 0.5242334322453017, + "grad_norm": 0.40565909079834533, + "learning_rate": 4.5859289116892634e-05, + "loss": 0.576, + "step": 530 + }, + { + "epoch": 0.5252225519287834, + "grad_norm": 0.4217327834248698, + "learning_rate": 4.584096738732137e-05, + "loss": 0.6196, + "step": 531 + }, + { + "epoch": 0.5262116716122651, + "grad_norm": 0.46160735689946497, + "learning_rate": 4.582264565775009e-05, + "loss": 0.5486, + "step": 532 + }, + { + "epoch": 0.5272007912957468, + "grad_norm": 0.4032237658743589, + "learning_rate": 4.580432392817882e-05, + "loss": 0.5626, + "step": 533 + }, + { + "epoch": 0.5281899109792285, + "grad_norm": 0.5635928214859514, + "learning_rate": 4.578600219860755e-05, + "loss": 0.6173, + "step": 534 + }, + { + "epoch": 0.5291790306627102, + "grad_norm": 0.6342796371814479, + "learning_rate": 4.576768046903628e-05, + "loss": 0.6277, + "step": 535 + }, + { + "epoch": 0.5301681503461919, + "grad_norm": 0.4677423139160351, + "learning_rate": 4.574935873946501e-05, + "loss": 0.5694, + "step": 536 + }, + { + "epoch": 0.5311572700296736, + "grad_norm": 0.40238302891813643, + "learning_rate": 4.573103700989373e-05, + "loss": 0.6217, + "step": 537 + }, + { + "epoch": 0.5321463897131553, + "grad_norm": 0.504563589844548, + "learning_rate": 4.5712715280322465e-05, + "loss": 0.6222, + "step": 538 + }, + { + "epoch": 0.533135509396637, + "grad_norm": 0.42246468004279764, + "learning_rate": 4.569439355075119e-05, + "loss": 0.5684, + "step": 539 + }, + { + "epoch": 0.5341246290801187, + "grad_norm": 0.40403558987941973, + "learning_rate": 4.5676071821179925e-05, + "loss": 0.558, + "step": 540 + }, + { + "epoch": 0.5351137487636004, + "grad_norm": 0.44549737299183073, + "learning_rate": 4.5657750091608644e-05, + "loss": 0.5609, + "step": 541 + }, + { + "epoch": 0.5361028684470821, + "grad_norm": 20.81218570435005, + "learning_rate": 4.563942836203738e-05, + "loss": 0.9656, + "step": 542 + }, + { + "epoch": 0.5370919881305638, + "grad_norm": 0.4727615722395577, + "learning_rate": 4.562110663246611e-05, + "loss": 0.4987, + "step": 543 + }, + { + "epoch": 0.5380811078140455, + "grad_norm": 0.3679816340175357, + "learning_rate": 4.560278490289484e-05, + "loss": 0.5986, + "step": 544 + }, + { + "epoch": 0.5390702274975272, + "grad_norm": 0.4198111058431937, + "learning_rate": 4.5584463173323564e-05, + "loss": 0.6406, + "step": 545 + }, + { + "epoch": 0.5400593471810089, + "grad_norm": 0.381705361347438, + "learning_rate": 4.556614144375229e-05, + "loss": 0.5868, + "step": 546 + }, + { + "epoch": 0.5410484668644906, + "grad_norm": 0.36128173218405524, + "learning_rate": 4.554781971418102e-05, + "loss": 0.5606, + "step": 547 + }, + { + "epoch": 0.5420375865479723, + "grad_norm": 0.36624104089112686, + "learning_rate": 4.552949798460975e-05, + "loss": 0.5441, + "step": 548 + }, + { + "epoch": 0.543026706231454, + "grad_norm": 2.643110350633743, + "learning_rate": 4.5511176255038476e-05, + "loss": 0.6998, + "step": 549 + }, + { + "epoch": 0.5440158259149357, + "grad_norm": 0.45802173595768614, + "learning_rate": 4.54928545254672e-05, + "loss": 0.6603, + "step": 550 + }, + { + "epoch": 0.5450049455984174, + "grad_norm": 0.39506056655492705, + "learning_rate": 4.5474532795895936e-05, + "loss": 0.5296, + "step": 551 + }, + { + "epoch": 0.5459940652818991, + "grad_norm": 0.45358696386112984, + "learning_rate": 4.545621106632466e-05, + "loss": 0.5913, + "step": 552 + }, + { + "epoch": 0.5469831849653808, + "grad_norm": 0.8759613406131889, + "learning_rate": 4.5437889336753395e-05, + "loss": 0.5732, + "step": 553 + }, + { + "epoch": 0.5479723046488625, + "grad_norm": 0.5560376705190068, + "learning_rate": 4.5419567607182115e-05, + "loss": 0.5513, + "step": 554 + }, + { + "epoch": 0.5489614243323442, + "grad_norm": 0.46334900525439865, + "learning_rate": 4.540124587761085e-05, + "loss": 0.5847, + "step": 555 + }, + { + "epoch": 0.5499505440158259, + "grad_norm": 0.5044951628643652, + "learning_rate": 4.538292414803958e-05, + "loss": 0.6368, + "step": 556 + }, + { + "epoch": 0.5509396636993076, + "grad_norm": 0.44658793634543775, + "learning_rate": 4.536460241846831e-05, + "loss": 0.5482, + "step": 557 + }, + { + "epoch": 0.5519287833827893, + "grad_norm": 0.4668038982374279, + "learning_rate": 4.5346280688897034e-05, + "loss": 0.6158, + "step": 558 + }, + { + "epoch": 0.552917903066271, + "grad_norm": 0.48096355582962474, + "learning_rate": 4.532795895932576e-05, + "loss": 0.57, + "step": 559 + }, + { + "epoch": 0.5539070227497527, + "grad_norm": 0.4897334267462889, + "learning_rate": 4.5309637229754494e-05, + "loss": 0.552, + "step": 560 + }, + { + "epoch": 0.5548961424332344, + "grad_norm": 0.4185534474944087, + "learning_rate": 4.529131550018322e-05, + "loss": 0.5191, + "step": 561 + }, + { + "epoch": 0.5558852621167161, + "grad_norm": 0.5365259911263918, + "learning_rate": 4.527299377061195e-05, + "loss": 0.5084, + "step": 562 + }, + { + "epoch": 0.5568743818001978, + "grad_norm": 0.4215213687101347, + "learning_rate": 4.525467204104067e-05, + "loss": 0.6025, + "step": 563 + }, + { + "epoch": 0.5578635014836796, + "grad_norm": 0.4740697064883748, + "learning_rate": 4.5236350311469406e-05, + "loss": 0.4966, + "step": 564 + }, + { + "epoch": 0.5588526211671613, + "grad_norm": 0.44324661986533054, + "learning_rate": 4.521802858189813e-05, + "loss": 0.566, + "step": 565 + }, + { + "epoch": 0.559841740850643, + "grad_norm": 2.2285546984735856, + "learning_rate": 4.519970685232686e-05, + "loss": 0.7091, + "step": 566 + }, + { + "epoch": 0.5608308605341247, + "grad_norm": 0.5556074562961142, + "learning_rate": 4.518138512275559e-05, + "loss": 0.594, + "step": 567 + }, + { + "epoch": 0.5618199802176064, + "grad_norm": 0.4500464488699885, + "learning_rate": 4.516306339318432e-05, + "loss": 0.5656, + "step": 568 + }, + { + "epoch": 0.5628090999010881, + "grad_norm": 0.3512247907513221, + "learning_rate": 4.514474166361305e-05, + "loss": 0.5336, + "step": 569 + }, + { + "epoch": 0.5637982195845698, + "grad_norm": 0.4304687263152149, + "learning_rate": 4.512641993404177e-05, + "loss": 0.5865, + "step": 570 + }, + { + "epoch": 0.5647873392680515, + "grad_norm": 0.4521665914106063, + "learning_rate": 4.5108098204470505e-05, + "loss": 0.5873, + "step": 571 + }, + { + "epoch": 0.5657764589515332, + "grad_norm": 0.4159456344130283, + "learning_rate": 4.508977647489923e-05, + "loss": 0.5975, + "step": 572 + }, + { + "epoch": 0.5667655786350149, + "grad_norm": 0.5296905855219289, + "learning_rate": 4.5071454745327964e-05, + "loss": 0.5955, + "step": 573 + }, + { + "epoch": 0.5677546983184966, + "grad_norm": 0.4014808388631957, + "learning_rate": 4.505313301575669e-05, + "loss": 0.5433, + "step": 574 + }, + { + "epoch": 0.5687438180019783, + "grad_norm": 0.4319671517552426, + "learning_rate": 4.503481128618542e-05, + "loss": 0.4904, + "step": 575 + }, + { + "epoch": 0.56973293768546, + "grad_norm": 3.6175278656140404, + "learning_rate": 4.5016489556614144e-05, + "loss": 0.6217, + "step": 576 + }, + { + "epoch": 0.5707220573689417, + "grad_norm": 0.42678930653486047, + "learning_rate": 4.499816782704288e-05, + "loss": 0.5212, + "step": 577 + }, + { + "epoch": 0.5717111770524234, + "grad_norm": 0.4066859848927131, + "learning_rate": 4.49798460974716e-05, + "loss": 0.542, + "step": 578 + }, + { + "epoch": 0.5727002967359051, + "grad_norm": 0.4150317023868254, + "learning_rate": 4.496152436790033e-05, + "loss": 0.5509, + "step": 579 + }, + { + "epoch": 0.5736894164193868, + "grad_norm": 0.39696777501880237, + "learning_rate": 4.494320263832906e-05, + "loss": 0.6286, + "step": 580 + }, + { + "epoch": 0.5746785361028685, + "grad_norm": 0.42488826730568463, + "learning_rate": 4.492488090875779e-05, + "loss": 0.5245, + "step": 581 + }, + { + "epoch": 0.5756676557863502, + "grad_norm": 0.3832259003808749, + "learning_rate": 4.490655917918652e-05, + "loss": 0.513, + "step": 582 + }, + { + "epoch": 0.5766567754698319, + "grad_norm": 0.37765307913240526, + "learning_rate": 4.488823744961524e-05, + "loss": 0.5797, + "step": 583 + }, + { + "epoch": 0.5776458951533135, + "grad_norm": 0.5243524846267854, + "learning_rate": 4.4869915720043975e-05, + "loss": 0.5494, + "step": 584 + }, + { + "epoch": 0.5786350148367952, + "grad_norm": 0.35608241349020836, + "learning_rate": 4.48515939904727e-05, + "loss": 0.5169, + "step": 585 + }, + { + "epoch": 0.579624134520277, + "grad_norm": 0.5072400729075438, + "learning_rate": 4.4833272260901435e-05, + "loss": 0.5375, + "step": 586 + }, + { + "epoch": 0.5806132542037586, + "grad_norm": 0.3887888599949883, + "learning_rate": 4.4814950531330155e-05, + "loss": 0.5928, + "step": 587 + }, + { + "epoch": 0.5816023738872403, + "grad_norm": 0.38637277802524667, + "learning_rate": 4.479662880175889e-05, + "loss": 0.5631, + "step": 588 + }, + { + "epoch": 0.582591493570722, + "grad_norm": 0.4544815503207846, + "learning_rate": 4.4778307072187614e-05, + "loss": 0.5284, + "step": 589 + }, + { + "epoch": 0.5835806132542037, + "grad_norm": 0.44860638139409253, + "learning_rate": 4.475998534261635e-05, + "loss": 0.5568, + "step": 590 + }, + { + "epoch": 0.5845697329376854, + "grad_norm": 0.34783304628715694, + "learning_rate": 4.4741663613045074e-05, + "loss": 0.4736, + "step": 591 + }, + { + "epoch": 0.5855588526211671, + "grad_norm": 0.46635755164532483, + "learning_rate": 4.47233418834738e-05, + "loss": 0.5721, + "step": 592 + }, + { + "epoch": 0.5865479723046488, + "grad_norm": 0.35836770919240934, + "learning_rate": 4.4705020153902533e-05, + "loss": 0.5742, + "step": 593 + }, + { + "epoch": 0.5875370919881305, + "grad_norm": 2.1646391480973386, + "learning_rate": 4.468669842433126e-05, + "loss": 0.7749, + "step": 594 + }, + { + "epoch": 0.5885262116716122, + "grad_norm": 0.6322154073368694, + "learning_rate": 4.4668376694759986e-05, + "loss": 0.5222, + "step": 595 + }, + { + "epoch": 0.5895153313550939, + "grad_norm": 0.3447318173329335, + "learning_rate": 4.465005496518871e-05, + "loss": 0.5024, + "step": 596 + }, + { + "epoch": 0.5905044510385756, + "grad_norm": 0.5950854440009333, + "learning_rate": 4.4631733235617446e-05, + "loss": 0.6533, + "step": 597 + }, + { + "epoch": 0.5914935707220573, + "grad_norm": 0.521884110600529, + "learning_rate": 4.461341150604617e-05, + "loss": 0.5968, + "step": 598 + }, + { + "epoch": 0.592482690405539, + "grad_norm": 0.3392872051376154, + "learning_rate": 4.45950897764749e-05, + "loss": 0.5776, + "step": 599 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 0.5377239708646002, + "learning_rate": 4.4576768046903625e-05, + "loss": 0.6145, + "step": 600 + }, + { + "epoch": 0.5944609297725024, + "grad_norm": 0.4318047502673512, + "learning_rate": 4.455844631733236e-05, + "loss": 0.6059, + "step": 601 + }, + { + "epoch": 0.5954500494559841, + "grad_norm": 2.3631173625344735, + "learning_rate": 4.454012458776109e-05, + "loss": 0.779, + "step": 602 + }, + { + "epoch": 0.5964391691394659, + "grad_norm": 0.5336870304548778, + "learning_rate": 4.452180285818982e-05, + "loss": 0.5058, + "step": 603 + }, + { + "epoch": 0.5974282888229476, + "grad_norm": 0.4034031794946735, + "learning_rate": 4.4503481128618544e-05, + "loss": 0.5717, + "step": 604 + }, + { + "epoch": 0.5984174085064293, + "grad_norm": 0.37891278714404253, + "learning_rate": 4.448515939904727e-05, + "loss": 0.5299, + "step": 605 + }, + { + "epoch": 0.599406528189911, + "grad_norm": 1.6328876825145693, + "learning_rate": 4.4466837669476004e-05, + "loss": 0.7685, + "step": 606 + }, + { + "epoch": 0.6003956478733927, + "grad_norm": 0.4889387354780191, + "learning_rate": 4.444851593990473e-05, + "loss": 0.5767, + "step": 607 + }, + { + "epoch": 0.6013847675568744, + "grad_norm": 0.39614179421575757, + "learning_rate": 4.443019421033346e-05, + "loss": 0.5488, + "step": 608 + }, + { + "epoch": 0.6023738872403561, + "grad_norm": 0.4660736811721914, + "learning_rate": 4.441187248076218e-05, + "loss": 0.5036, + "step": 609 + }, + { + "epoch": 0.6033630069238378, + "grad_norm": 0.4305830100351165, + "learning_rate": 4.4393550751190916e-05, + "loss": 0.5586, + "step": 610 + }, + { + "epoch": 0.6043521266073195, + "grad_norm": 0.42857086912608233, + "learning_rate": 4.437522902161964e-05, + "loss": 0.5373, + "step": 611 + }, + { + "epoch": 0.6053412462908012, + "grad_norm": 0.4259812924670851, + "learning_rate": 4.435690729204837e-05, + "loss": 0.6364, + "step": 612 + }, + { + "epoch": 0.6063303659742829, + "grad_norm": 0.35384845137297105, + "learning_rate": 4.4338585562477096e-05, + "loss": 0.5686, + "step": 613 + }, + { + "epoch": 0.6073194856577646, + "grad_norm": 0.43840991923734457, + "learning_rate": 4.432026383290583e-05, + "loss": 0.5664, + "step": 614 + }, + { + "epoch": 0.6083086053412463, + "grad_norm": 0.4160175071465533, + "learning_rate": 4.430194210333456e-05, + "loss": 0.5562, + "step": 615 + }, + { + "epoch": 0.609297725024728, + "grad_norm": 0.4076622693219068, + "learning_rate": 4.428362037376328e-05, + "loss": 0.5819, + "step": 616 + }, + { + "epoch": 0.6102868447082097, + "grad_norm": 0.43491508340384, + "learning_rate": 4.4265298644192015e-05, + "loss": 0.6215, + "step": 617 + }, + { + "epoch": 0.6112759643916914, + "grad_norm": 0.9665467383337245, + "learning_rate": 4.424697691462074e-05, + "loss": 0.6142, + "step": 618 + }, + { + "epoch": 0.6122650840751731, + "grad_norm": 0.3867966415220155, + "learning_rate": 4.4228655185049474e-05, + "loss": 0.5207, + "step": 619 + }, + { + "epoch": 0.6132542037586548, + "grad_norm": 0.3384128186488838, + "learning_rate": 4.42103334554782e-05, + "loss": 0.5391, + "step": 620 + }, + { + "epoch": 0.6142433234421365, + "grad_norm": 0.37030112577139734, + "learning_rate": 4.419201172590693e-05, + "loss": 0.5357, + "step": 621 + }, + { + "epoch": 0.6152324431256182, + "grad_norm": 0.33532983716976167, + "learning_rate": 4.4173689996335654e-05, + "loss": 0.5596, + "step": 622 + }, + { + "epoch": 0.6162215628090999, + "grad_norm": 2.1912897019885156, + "learning_rate": 4.415536826676439e-05, + "loss": 0.663, + "step": 623 + }, + { + "epoch": 0.6172106824925816, + "grad_norm": 0.4108780719007618, + "learning_rate": 4.413704653719311e-05, + "loss": 0.5381, + "step": 624 + }, + { + "epoch": 0.6181998021760633, + "grad_norm": 0.345525682169045, + "learning_rate": 4.411872480762184e-05, + "loss": 0.6086, + "step": 625 + }, + { + "epoch": 0.619188921859545, + "grad_norm": 0.32145894628431493, + "learning_rate": 4.410040307805057e-05, + "loss": 0.5079, + "step": 626 + }, + { + "epoch": 0.6201780415430267, + "grad_norm": 0.3400074427817454, + "learning_rate": 4.40820813484793e-05, + "loss": 0.556, + "step": 627 + }, + { + "epoch": 0.6211671612265084, + "grad_norm": 0.336784464756408, + "learning_rate": 4.406375961890803e-05, + "loss": 0.5383, + "step": 628 + }, + { + "epoch": 0.6221562809099901, + "grad_norm": 0.36068226884339905, + "learning_rate": 4.404543788933675e-05, + "loss": 0.5838, + "step": 629 + }, + { + "epoch": 0.6231454005934718, + "grad_norm": 0.35409438086231915, + "learning_rate": 4.4027116159765485e-05, + "loss": 0.4777, + "step": 630 + }, + { + "epoch": 0.6241345202769535, + "grad_norm": 0.3726791740087236, + "learning_rate": 4.400879443019421e-05, + "loss": 0.6175, + "step": 631 + }, + { + "epoch": 0.6251236399604352, + "grad_norm": 0.4023806669048874, + "learning_rate": 4.3990472700622945e-05, + "loss": 0.6292, + "step": 632 + }, + { + "epoch": 0.6261127596439169, + "grad_norm": 0.34330582894277495, + "learning_rate": 4.3972150971051665e-05, + "loss": 0.4987, + "step": 633 + }, + { + "epoch": 0.6271018793273986, + "grad_norm": 0.3305730900775558, + "learning_rate": 4.39538292414804e-05, + "loss": 0.5235, + "step": 634 + }, + { + "epoch": 0.6280909990108803, + "grad_norm": 0.36092394055577376, + "learning_rate": 4.3935507511909124e-05, + "loss": 0.5585, + "step": 635 + }, + { + "epoch": 0.629080118694362, + "grad_norm": 0.4093047909780486, + "learning_rate": 4.391718578233786e-05, + "loss": 0.5418, + "step": 636 + }, + { + "epoch": 0.6300692383778437, + "grad_norm": 0.33645740331275986, + "learning_rate": 4.3898864052766584e-05, + "loss": 0.4971, + "step": 637 + }, + { + "epoch": 0.6310583580613254, + "grad_norm": 0.3611452319329633, + "learning_rate": 4.388054232319531e-05, + "loss": 0.5567, + "step": 638 + }, + { + "epoch": 0.6320474777448071, + "grad_norm": 0.4930946827145663, + "learning_rate": 4.3862220593624043e-05, + "loss": 0.5986, + "step": 639 + }, + { + "epoch": 0.6330365974282888, + "grad_norm": 0.3231177239659706, + "learning_rate": 4.384389886405277e-05, + "loss": 0.5975, + "step": 640 + }, + { + "epoch": 0.6340257171117705, + "grad_norm": 0.3827959704241396, + "learning_rate": 4.3825577134481496e-05, + "loss": 0.5177, + "step": 641 + }, + { + "epoch": 0.6350148367952523, + "grad_norm": 0.41977721218388675, + "learning_rate": 4.380725540491022e-05, + "loss": 0.6479, + "step": 642 + }, + { + "epoch": 0.636003956478734, + "grad_norm": 0.3662099933871658, + "learning_rate": 4.3788933675338956e-05, + "loss": 0.6163, + "step": 643 + }, + { + "epoch": 0.6369930761622157, + "grad_norm": 0.3712913474912623, + "learning_rate": 4.377061194576768e-05, + "loss": 0.4442, + "step": 644 + }, + { + "epoch": 0.6379821958456974, + "grad_norm": 0.5071826500684371, + "learning_rate": 4.375229021619641e-05, + "loss": 0.5606, + "step": 645 + }, + { + "epoch": 0.6389713155291791, + "grad_norm": 0.35640208864347855, + "learning_rate": 4.3733968486625135e-05, + "loss": 0.505, + "step": 646 + }, + { + "epoch": 0.6399604352126608, + "grad_norm": 0.4223414729847192, + "learning_rate": 4.371564675705387e-05, + "loss": 0.5597, + "step": 647 + }, + { + "epoch": 0.6409495548961425, + "grad_norm": 0.524278275869879, + "learning_rate": 4.3697325027482595e-05, + "loss": 0.5428, + "step": 648 + }, + { + "epoch": 0.6419386745796242, + "grad_norm": 0.34090519496656335, + "learning_rate": 4.367900329791133e-05, + "loss": 0.5068, + "step": 649 + }, + { + "epoch": 0.6429277942631059, + "grad_norm": 0.4974539026917071, + "learning_rate": 4.3660681568340054e-05, + "loss": 0.5449, + "step": 650 + }, + { + "epoch": 0.6439169139465876, + "grad_norm": 0.4263296924702846, + "learning_rate": 4.364235983876878e-05, + "loss": 0.5863, + "step": 651 + }, + { + "epoch": 0.6449060336300693, + "grad_norm": 0.36550014682885573, + "learning_rate": 4.3624038109197514e-05, + "loss": 0.5139, + "step": 652 + }, + { + "epoch": 0.645895153313551, + "grad_norm": 0.4420805717206251, + "learning_rate": 4.360571637962624e-05, + "loss": 0.4919, + "step": 653 + }, + { + "epoch": 0.6468842729970327, + "grad_norm": 3.5175564338439544, + "learning_rate": 4.358739465005497e-05, + "loss": 0.6138, + "step": 654 + }, + { + "epoch": 0.6478733926805144, + "grad_norm": 0.44353550687361537, + "learning_rate": 4.356907292048369e-05, + "loss": 0.5678, + "step": 655 + }, + { + "epoch": 0.6488625123639961, + "grad_norm": 0.42907631403450497, + "learning_rate": 4.3550751190912426e-05, + "loss": 0.5126, + "step": 656 + }, + { + "epoch": 0.6498516320474778, + "grad_norm": 0.5224325181644863, + "learning_rate": 4.353242946134115e-05, + "loss": 0.567, + "step": 657 + }, + { + "epoch": 0.6508407517309595, + "grad_norm": 0.36393145929326026, + "learning_rate": 4.351410773176988e-05, + "loss": 0.5758, + "step": 658 + }, + { + "epoch": 0.6518298714144412, + "grad_norm": 0.43831662678367533, + "learning_rate": 4.3495786002198606e-05, + "loss": 0.4662, + "step": 659 + }, + { + "epoch": 0.6528189910979229, + "grad_norm": 0.44731721791336854, + "learning_rate": 4.347746427262734e-05, + "loss": 0.5162, + "step": 660 + }, + { + "epoch": 0.6538081107814046, + "grad_norm": 0.3625805902395787, + "learning_rate": 4.3459142543056065e-05, + "loss": 0.5866, + "step": 661 + }, + { + "epoch": 0.6547972304648862, + "grad_norm": 0.374836185068099, + "learning_rate": 4.344082081348479e-05, + "loss": 0.5622, + "step": 662 + }, + { + "epoch": 0.655786350148368, + "grad_norm": 0.3636250559467318, + "learning_rate": 4.3422499083913525e-05, + "loss": 0.4704, + "step": 663 + }, + { + "epoch": 0.6567754698318496, + "grad_norm": 0.368172907844344, + "learning_rate": 4.340417735434225e-05, + "loss": 0.4582, + "step": 664 + }, + { + "epoch": 0.6577645895153313, + "grad_norm": 0.3781176871717141, + "learning_rate": 4.3385855624770985e-05, + "loss": 0.5608, + "step": 665 + }, + { + "epoch": 0.658753709198813, + "grad_norm": 0.3472496247475107, + "learning_rate": 4.336753389519971e-05, + "loss": 0.5267, + "step": 666 + }, + { + "epoch": 0.6597428288822947, + "grad_norm": 0.35710749282494153, + "learning_rate": 4.334921216562844e-05, + "loss": 0.5912, + "step": 667 + }, + { + "epoch": 0.6607319485657764, + "grad_norm": 0.34177842883688003, + "learning_rate": 4.3330890436057164e-05, + "loss": 0.5239, + "step": 668 + }, + { + "epoch": 0.6617210682492581, + "grad_norm": 0.45206446669886513, + "learning_rate": 4.33125687064859e-05, + "loss": 0.6048, + "step": 669 + }, + { + "epoch": 0.6627101879327398, + "grad_norm": 0.43477515611045725, + "learning_rate": 4.3294246976914623e-05, + "loss": 0.5396, + "step": 670 + }, + { + "epoch": 0.6636993076162215, + "grad_norm": 0.4495761387137212, + "learning_rate": 4.327592524734335e-05, + "loss": 0.5174, + "step": 671 + }, + { + "epoch": 0.6646884272997032, + "grad_norm": 1.8122069786779942, + "learning_rate": 4.3257603517772076e-05, + "loss": 0.5415, + "step": 672 + }, + { + "epoch": 0.6656775469831849, + "grad_norm": 0.54700387793728, + "learning_rate": 4.323928178820081e-05, + "loss": 0.5466, + "step": 673 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.3497883391162035, + "learning_rate": 4.322096005862954e-05, + "loss": 0.5633, + "step": 674 + }, + { + "epoch": 0.6676557863501483, + "grad_norm": 0.45126911871908887, + "learning_rate": 4.320263832905826e-05, + "loss": 0.5531, + "step": 675 + }, + { + "epoch": 0.66864490603363, + "grad_norm": 0.3951817778962431, + "learning_rate": 4.3184316599486995e-05, + "loss": 0.5225, + "step": 676 + }, + { + "epoch": 0.6696340257171117, + "grad_norm": 0.5240280173507978, + "learning_rate": 4.316599486991572e-05, + "loss": 0.5278, + "step": 677 + }, + { + "epoch": 0.6706231454005934, + "grad_norm": 0.3375650680362749, + "learning_rate": 4.3147673140344455e-05, + "loss": 0.5464, + "step": 678 + }, + { + "epoch": 0.6716122650840751, + "grad_norm": 0.41081775932232456, + "learning_rate": 4.3129351410773175e-05, + "loss": 0.5289, + "step": 679 + }, + { + "epoch": 0.6726013847675568, + "grad_norm": 0.42912334923955686, + "learning_rate": 4.311102968120191e-05, + "loss": 0.5685, + "step": 680 + }, + { + "epoch": 0.6735905044510386, + "grad_norm": 0.44709522287251285, + "learning_rate": 4.3092707951630634e-05, + "loss": 0.5861, + "step": 681 + }, + { + "epoch": 0.6745796241345203, + "grad_norm": 8.932319607989578, + "learning_rate": 4.307438622205937e-05, + "loss": 0.5878, + "step": 682 + }, + { + "epoch": 0.675568743818002, + "grad_norm": 0.6163038315017687, + "learning_rate": 4.3056064492488094e-05, + "loss": 0.552, + "step": 683 + }, + { + "epoch": 0.6765578635014837, + "grad_norm": 0.4130993321464594, + "learning_rate": 4.303774276291682e-05, + "loss": 0.484, + "step": 684 + }, + { + "epoch": 0.6775469831849654, + "grad_norm": 0.44149481672035984, + "learning_rate": 4.301942103334555e-05, + "loss": 0.487, + "step": 685 + }, + { + "epoch": 0.6785361028684471, + "grad_norm": 0.44270474817714917, + "learning_rate": 4.300109930377428e-05, + "loss": 0.4829, + "step": 686 + }, + { + "epoch": 0.6795252225519288, + "grad_norm": 1.778964998297047, + "learning_rate": 4.2982777574203006e-05, + "loss": 0.6008, + "step": 687 + }, + { + "epoch": 0.6805143422354105, + "grad_norm": 0.5042046907833749, + "learning_rate": 4.296445584463173e-05, + "loss": 0.5508, + "step": 688 + }, + { + "epoch": 0.6815034619188922, + "grad_norm": 1.2364847410894697, + "learning_rate": 4.2946134115060466e-05, + "loss": 0.4864, + "step": 689 + }, + { + "epoch": 0.6824925816023739, + "grad_norm": 5.675707936422887, + "learning_rate": 4.292781238548919e-05, + "loss": 0.5267, + "step": 690 + }, + { + "epoch": 0.6834817012858556, + "grad_norm": 1.0113014216945861, + "learning_rate": 4.290949065591792e-05, + "loss": 0.6003, + "step": 691 + }, + { + "epoch": 0.6844708209693373, + "grad_norm": 0.7307635089929869, + "learning_rate": 4.2891168926346645e-05, + "loss": 0.5435, + "step": 692 + }, + { + "epoch": 0.685459940652819, + "grad_norm": 1.005536324793941, + "learning_rate": 4.287284719677538e-05, + "loss": 0.5953, + "step": 693 + }, + { + "epoch": 0.6864490603363007, + "grad_norm": 0.8795150549550624, + "learning_rate": 4.2854525467204105e-05, + "loss": 0.5776, + "step": 694 + }, + { + "epoch": 0.6874381800197824, + "grad_norm": 0.9217585885833345, + "learning_rate": 4.283620373763284e-05, + "loss": 0.4713, + "step": 695 + }, + { + "epoch": 0.6884272997032641, + "grad_norm": 0.5750818575992418, + "learning_rate": 4.281788200806156e-05, + "loss": 0.5646, + "step": 696 + }, + { + "epoch": 0.6894164193867458, + "grad_norm": 1.1839342901013068, + "learning_rate": 4.279956027849029e-05, + "loss": 0.5912, + "step": 697 + }, + { + "epoch": 0.6904055390702275, + "grad_norm": 0.40639325595326, + "learning_rate": 4.2781238548919024e-05, + "loss": 0.5001, + "step": 698 + }, + { + "epoch": 0.6913946587537092, + "grad_norm": 0.9197610387321684, + "learning_rate": 4.276291681934775e-05, + "loss": 0.5174, + "step": 699 + }, + { + "epoch": 0.6923837784371909, + "grad_norm": 2.2435633788044482, + "learning_rate": 4.274459508977648e-05, + "loss": 0.6464, + "step": 700 + }, + { + "epoch": 0.6933728981206726, + "grad_norm": 2.2636363371000616, + "learning_rate": 4.27262733602052e-05, + "loss": 0.6616, + "step": 701 + }, + { + "epoch": 0.6943620178041543, + "grad_norm": 1.0957925072649075, + "learning_rate": 4.2707951630633937e-05, + "loss": 0.5103, + "step": 702 + }, + { + "epoch": 0.695351137487636, + "grad_norm": 0.44136767500918594, + "learning_rate": 4.268962990106266e-05, + "loss": 0.5773, + "step": 703 + }, + { + "epoch": 0.6963402571711177, + "grad_norm": 1.2119093160665309, + "learning_rate": 4.267130817149139e-05, + "loss": 0.5676, + "step": 704 + }, + { + "epoch": 0.6973293768545994, + "grad_norm": 0.9844741497724914, + "learning_rate": 4.2652986441920116e-05, + "loss": 0.5346, + "step": 705 + }, + { + "epoch": 0.6983184965380811, + "grad_norm": 0.9825434500331776, + "learning_rate": 4.263466471234885e-05, + "loss": 0.579, + "step": 706 + }, + { + "epoch": 0.6993076162215628, + "grad_norm": 1.0080629257558236, + "learning_rate": 4.2616342982777575e-05, + "loss": 0.5322, + "step": 707 + }, + { + "epoch": 0.7002967359050445, + "grad_norm": 0.6306084604291936, + "learning_rate": 4.25980212532063e-05, + "loss": 0.6135, + "step": 708 + }, + { + "epoch": 0.7012858555885262, + "grad_norm": 0.7935850704567368, + "learning_rate": 4.257969952363503e-05, + "loss": 0.4881, + "step": 709 + }, + { + "epoch": 0.7022749752720079, + "grad_norm": 0.42031411509137817, + "learning_rate": 4.256137779406376e-05, + "loss": 0.5627, + "step": 710 + }, + { + "epoch": 0.7032640949554896, + "grad_norm": 0.6431022906724274, + "learning_rate": 4.2543056064492495e-05, + "loss": 0.5521, + "step": 711 + }, + { + "epoch": 0.7042532146389713, + "grad_norm": 0.4051950629961147, + "learning_rate": 4.252473433492122e-05, + "loss": 0.539, + "step": 712 + }, + { + "epoch": 0.705242334322453, + "grad_norm": 0.5504001902905677, + "learning_rate": 4.250641260534995e-05, + "loss": 0.6014, + "step": 713 + }, + { + "epoch": 0.7062314540059347, + "grad_norm": 0.42943927228102546, + "learning_rate": 4.2488090875778674e-05, + "loss": 0.5071, + "step": 714 + }, + { + "epoch": 0.7072205736894164, + "grad_norm": 0.4564346346810455, + "learning_rate": 4.246976914620741e-05, + "loss": 0.5798, + "step": 715 + }, + { + "epoch": 0.7082096933728981, + "grad_norm": 0.41971345322735765, + "learning_rate": 4.2451447416636133e-05, + "loss": 0.4746, + "step": 716 + }, + { + "epoch": 0.7091988130563798, + "grad_norm": 0.4767124000008028, + "learning_rate": 4.243312568706486e-05, + "loss": 0.5221, + "step": 717 + }, + { + "epoch": 0.7101879327398615, + "grad_norm": 0.473920208531501, + "learning_rate": 4.2414803957493586e-05, + "loss": 0.6092, + "step": 718 + }, + { + "epoch": 0.7111770524233432, + "grad_norm": 0.4008182380340919, + "learning_rate": 4.239648222792232e-05, + "loss": 0.5633, + "step": 719 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 0.4572650059707555, + "learning_rate": 4.2378160498351046e-05, + "loss": 0.4485, + "step": 720 + }, + { + "epoch": 0.7131552917903067, + "grad_norm": 0.3982981440820982, + "learning_rate": 4.235983876877977e-05, + "loss": 0.5882, + "step": 721 + }, + { + "epoch": 0.7141444114737884, + "grad_norm": 0.4445315839865244, + "learning_rate": 4.2341517039208506e-05, + "loss": 0.499, + "step": 722 + }, + { + "epoch": 0.7151335311572701, + "grad_norm": 0.45327248020817545, + "learning_rate": 4.232319530963723e-05, + "loss": 0.5828, + "step": 723 + }, + { + "epoch": 0.7161226508407518, + "grad_norm": 0.45072469495117606, + "learning_rate": 4.2304873580065965e-05, + "loss": 0.5684, + "step": 724 + }, + { + "epoch": 0.7171117705242335, + "grad_norm": 0.5030033920719464, + "learning_rate": 4.2286551850494685e-05, + "loss": 0.5462, + "step": 725 + }, + { + "epoch": 0.7181008902077152, + "grad_norm": 0.41612647044276724, + "learning_rate": 4.226823012092342e-05, + "loss": 0.5931, + "step": 726 + }, + { + "epoch": 0.7190900098911969, + "grad_norm": 0.48469080484439975, + "learning_rate": 4.2249908391352144e-05, + "loss": 0.5663, + "step": 727 + }, + { + "epoch": 0.7200791295746786, + "grad_norm": 0.3288997249334829, + "learning_rate": 4.223158666178088e-05, + "loss": 0.5097, + "step": 728 + }, + { + "epoch": 0.7210682492581603, + "grad_norm": 0.4306446848182276, + "learning_rate": 4.2213264932209604e-05, + "loss": 0.5869, + "step": 729 + }, + { + "epoch": 0.722057368941642, + "grad_norm": 4.7698964565308675, + "learning_rate": 4.219494320263833e-05, + "loss": 0.6767, + "step": 730 + }, + { + "epoch": 0.7230464886251237, + "grad_norm": 0.5127544163487205, + "learning_rate": 4.217662147306706e-05, + "loss": 0.6094, + "step": 731 + }, + { + "epoch": 0.7240356083086054, + "grad_norm": 0.41334473621761253, + "learning_rate": 4.215829974349579e-05, + "loss": 0.5082, + "step": 732 + }, + { + "epoch": 0.7250247279920871, + "grad_norm": 0.36240790003835044, + "learning_rate": 4.2139978013924516e-05, + "loss": 0.5948, + "step": 733 + }, + { + "epoch": 0.7260138476755688, + "grad_norm": 0.36783320263349717, + "learning_rate": 4.212165628435324e-05, + "loss": 0.4404, + "step": 734 + }, + { + "epoch": 0.7270029673590505, + "grad_norm": 0.42754353298521264, + "learning_rate": 4.2103334554781976e-05, + "loss": 0.5593, + "step": 735 + }, + { + "epoch": 0.7279920870425322, + "grad_norm": 0.334115716001555, + "learning_rate": 4.20850128252107e-05, + "loss": 0.5471, + "step": 736 + }, + { + "epoch": 0.7289812067260139, + "grad_norm": 0.3964403729349191, + "learning_rate": 4.206669109563943e-05, + "loss": 0.5169, + "step": 737 + }, + { + "epoch": 0.7299703264094956, + "grad_norm": 0.3524084819698355, + "learning_rate": 4.2048369366068155e-05, + "loss": 0.5195, + "step": 738 + }, + { + "epoch": 0.7309594460929772, + "grad_norm": 0.8826038613244607, + "learning_rate": 4.203004763649689e-05, + "loss": 0.4875, + "step": 739 + }, + { + "epoch": 0.731948565776459, + "grad_norm": 0.39827011655841327, + "learning_rate": 4.2011725906925615e-05, + "loss": 0.5543, + "step": 740 + }, + { + "epoch": 0.7329376854599406, + "grad_norm": 0.42818287781464126, + "learning_rate": 4.199340417735435e-05, + "loss": 0.5792, + "step": 741 + }, + { + "epoch": 0.7339268051434223, + "grad_norm": 1.6645913416124833, + "learning_rate": 4.197508244778307e-05, + "loss": 0.5845, + "step": 742 + }, + { + "epoch": 0.734915924826904, + "grad_norm": 0.41168909047511215, + "learning_rate": 4.19567607182118e-05, + "loss": 0.5943, + "step": 743 + }, + { + "epoch": 0.7359050445103857, + "grad_norm": 0.48966326888171574, + "learning_rate": 4.193843898864053e-05, + "loss": 0.5834, + "step": 744 + }, + { + "epoch": 0.7368941641938674, + "grad_norm": 0.7123014415902675, + "learning_rate": 4.192011725906926e-05, + "loss": 0.5631, + "step": 745 + }, + { + "epoch": 0.7378832838773491, + "grad_norm": 0.44472269484910176, + "learning_rate": 4.190179552949799e-05, + "loss": 0.6197, + "step": 746 + }, + { + "epoch": 0.7388724035608308, + "grad_norm": 0.5054408180848112, + "learning_rate": 4.1883473799926713e-05, + "loss": 0.5104, + "step": 747 + }, + { + "epoch": 0.7398615232443125, + "grad_norm": 0.5692571400825903, + "learning_rate": 4.1865152070355447e-05, + "loss": 0.5738, + "step": 748 + }, + { + "epoch": 0.7408506429277942, + "grad_norm": 0.45821840458928037, + "learning_rate": 4.184683034078417e-05, + "loss": 0.5037, + "step": 749 + }, + { + "epoch": 0.7418397626112759, + "grad_norm": 1.64441512213632, + "learning_rate": 4.18285086112129e-05, + "loss": 0.5396, + "step": 750 + }, + { + "epoch": 0.7428288822947576, + "grad_norm": 0.45495252031691974, + "learning_rate": 4.1810186881641626e-05, + "loss": 0.5177, + "step": 751 + }, + { + "epoch": 0.7438180019782393, + "grad_norm": 0.49653489147704893, + "learning_rate": 4.179186515207036e-05, + "loss": 0.6201, + "step": 752 + }, + { + "epoch": 0.744807121661721, + "grad_norm": 0.3524624469672887, + "learning_rate": 4.1773543422499085e-05, + "loss": 0.492, + "step": 753 + }, + { + "epoch": 0.7457962413452027, + "grad_norm": 0.4437060967033037, + "learning_rate": 4.175522169292781e-05, + "loss": 0.5195, + "step": 754 + }, + { + "epoch": 0.7467853610286844, + "grad_norm": 0.5587440781800124, + "learning_rate": 4.173689996335654e-05, + "loss": 0.5915, + "step": 755 + }, + { + "epoch": 0.7477744807121661, + "grad_norm": 0.3503317806200955, + "learning_rate": 4.171857823378527e-05, + "loss": 0.5469, + "step": 756 + }, + { + "epoch": 0.7487636003956478, + "grad_norm": 0.6127018887132037, + "learning_rate": 4.1700256504214005e-05, + "loss": 0.593, + "step": 757 + }, + { + "epoch": 0.7497527200791295, + "grad_norm": 0.37713529443389476, + "learning_rate": 4.168193477464273e-05, + "loss": 0.5149, + "step": 758 + }, + { + "epoch": 0.7507418397626113, + "grad_norm": 0.4418061035198679, + "learning_rate": 4.166361304507146e-05, + "loss": 0.5793, + "step": 759 + }, + { + "epoch": 0.751730959446093, + "grad_norm": 0.45422082453440427, + "learning_rate": 4.1645291315500184e-05, + "loss": 0.5298, + "step": 760 + }, + { + "epoch": 0.7527200791295747, + "grad_norm": 0.3609291523640555, + "learning_rate": 4.162696958592892e-05, + "loss": 0.5442, + "step": 761 + }, + { + "epoch": 0.7537091988130564, + "grad_norm": 0.3952489577476288, + "learning_rate": 4.1608647856357644e-05, + "loss": 0.4714, + "step": 762 + }, + { + "epoch": 0.7546983184965381, + "grad_norm": 0.4370926922917512, + "learning_rate": 4.159032612678637e-05, + "loss": 0.5229, + "step": 763 + }, + { + "epoch": 0.7556874381800198, + "grad_norm": 0.37870613078543963, + "learning_rate": 4.1572004397215096e-05, + "loss": 0.4551, + "step": 764 + }, + { + "epoch": 0.7566765578635015, + "grad_norm": 0.43124183371724567, + "learning_rate": 4.155368266764383e-05, + "loss": 0.5467, + "step": 765 + }, + { + "epoch": 0.7576656775469832, + "grad_norm": 0.3746671658838302, + "learning_rate": 4.1535360938072556e-05, + "loss": 0.5807, + "step": 766 + }, + { + "epoch": 0.7586547972304649, + "grad_norm": 0.46058945803907064, + "learning_rate": 4.151703920850128e-05, + "loss": 0.5734, + "step": 767 + }, + { + "epoch": 0.7596439169139466, + "grad_norm": 0.3766410341280779, + "learning_rate": 4.149871747893001e-05, + "loss": 0.531, + "step": 768 + }, + { + "epoch": 0.7606330365974283, + "grad_norm": 0.4917230854366619, + "learning_rate": 4.148039574935874e-05, + "loss": 0.529, + "step": 769 + }, + { + "epoch": 0.76162215628091, + "grad_norm": 0.3601762759017485, + "learning_rate": 4.1462074019787475e-05, + "loss": 0.508, + "step": 770 + }, + { + "epoch": 0.7626112759643917, + "grad_norm": 1.0927135392910616, + "learning_rate": 4.1443752290216195e-05, + "loss": 0.5042, + "step": 771 + }, + { + "epoch": 0.7636003956478734, + "grad_norm": 0.564538713331542, + "learning_rate": 4.142543056064493e-05, + "loss": 0.5492, + "step": 772 + }, + { + "epoch": 0.7645895153313551, + "grad_norm": 0.378038419799154, + "learning_rate": 4.1407108831073654e-05, + "loss": 0.5775, + "step": 773 + }, + { + "epoch": 0.7655786350148368, + "grad_norm": 0.5132969793247761, + "learning_rate": 4.138878710150239e-05, + "loss": 0.543, + "step": 774 + }, + { + "epoch": 0.7665677546983185, + "grad_norm": 0.4657837803710372, + "learning_rate": 4.1370465371931114e-05, + "loss": 0.599, + "step": 775 + }, + { + "epoch": 0.7675568743818002, + "grad_norm": 0.41364334636794253, + "learning_rate": 4.135214364235984e-05, + "loss": 0.4887, + "step": 776 + }, + { + "epoch": 0.7685459940652819, + "grad_norm": 0.48494156005877015, + "learning_rate": 4.133382191278857e-05, + "loss": 0.5205, + "step": 777 + }, + { + "epoch": 0.7695351137487636, + "grad_norm": 0.5890553667830905, + "learning_rate": 4.13155001832173e-05, + "loss": 0.5641, + "step": 778 + }, + { + "epoch": 0.7705242334322453, + "grad_norm": 0.4152911192832434, + "learning_rate": 4.1297178453646027e-05, + "loss": 0.5257, + "step": 779 + }, + { + "epoch": 0.771513353115727, + "grad_norm": 1.6781124035661552, + "learning_rate": 4.127885672407475e-05, + "loss": 0.602, + "step": 780 + }, + { + "epoch": 0.7725024727992087, + "grad_norm": 0.5975859652214398, + "learning_rate": 4.1260534994503486e-05, + "loss": 0.5373, + "step": 781 + }, + { + "epoch": 0.7734915924826904, + "grad_norm": 0.6170145683827101, + "learning_rate": 4.124221326493221e-05, + "loss": 0.5326, + "step": 782 + }, + { + "epoch": 0.7744807121661721, + "grad_norm": 0.612440557441423, + "learning_rate": 4.122389153536094e-05, + "loss": 0.5741, + "step": 783 + }, + { + "epoch": 0.7754698318496538, + "grad_norm": 0.43061617125576557, + "learning_rate": 4.1205569805789665e-05, + "loss": 0.5607, + "step": 784 + }, + { + "epoch": 0.7764589515331355, + "grad_norm": 0.5036215670510789, + "learning_rate": 4.11872480762184e-05, + "loss": 0.558, + "step": 785 + }, + { + "epoch": 0.7774480712166172, + "grad_norm": 0.44972944661661773, + "learning_rate": 4.1168926346647125e-05, + "loss": 0.5524, + "step": 786 + }, + { + "epoch": 0.7784371909000989, + "grad_norm": 0.5179361442529129, + "learning_rate": 4.115060461707586e-05, + "loss": 0.5389, + "step": 787 + }, + { + "epoch": 0.7794263105835806, + "grad_norm": 0.40648583153660667, + "learning_rate": 4.113228288750458e-05, + "loss": 0.5839, + "step": 788 + }, + { + "epoch": 0.7804154302670623, + "grad_norm": 0.4831293697463915, + "learning_rate": 4.111396115793331e-05, + "loss": 0.571, + "step": 789 + }, + { + "epoch": 0.781404549950544, + "grad_norm": 0.8247754938487293, + "learning_rate": 4.109563942836204e-05, + "loss": 0.5338, + "step": 790 + }, + { + "epoch": 0.7823936696340257, + "grad_norm": 0.4025900842786157, + "learning_rate": 4.107731769879077e-05, + "loss": 0.5634, + "step": 791 + }, + { + "epoch": 0.7833827893175074, + "grad_norm": 0.40749518524574807, + "learning_rate": 4.105899596921949e-05, + "loss": 0.4944, + "step": 792 + }, + { + "epoch": 0.7843719090009891, + "grad_norm": 0.5169892850825292, + "learning_rate": 4.1040674239648223e-05, + "loss": 0.5161, + "step": 793 + }, + { + "epoch": 0.7853610286844708, + "grad_norm": 0.35450257355535614, + "learning_rate": 4.102235251007696e-05, + "loss": 0.5398, + "step": 794 + }, + { + "epoch": 0.7863501483679525, + "grad_norm": 0.4013582591042465, + "learning_rate": 4.100403078050568e-05, + "loss": 0.5003, + "step": 795 + }, + { + "epoch": 0.7873392680514342, + "grad_norm": 0.37673552582477354, + "learning_rate": 4.098570905093441e-05, + "loss": 0.4951, + "step": 796 + }, + { + "epoch": 0.7883283877349159, + "grad_norm": 0.33875066525744113, + "learning_rate": 4.0967387321363136e-05, + "loss": 0.4806, + "step": 797 + }, + { + "epoch": 0.7893175074183977, + "grad_norm": 0.476408908388567, + "learning_rate": 4.094906559179187e-05, + "loss": 0.5316, + "step": 798 + }, + { + "epoch": 0.7903066271018794, + "grad_norm": 0.3860532691526498, + "learning_rate": 4.0930743862220596e-05, + "loss": 0.5714, + "step": 799 + }, + { + "epoch": 0.7912957467853611, + "grad_norm": 0.30426399871981497, + "learning_rate": 4.091242213264932e-05, + "loss": 0.548, + "step": 800 + }, + { + "epoch": 0.7922848664688428, + "grad_norm": 0.43022889880235965, + "learning_rate": 4.089410040307805e-05, + "loss": 0.4889, + "step": 801 + }, + { + "epoch": 0.7932739861523245, + "grad_norm": 0.4014383758314632, + "learning_rate": 4.087577867350678e-05, + "loss": 0.5388, + "step": 802 + }, + { + "epoch": 0.7942631058358062, + "grad_norm": 0.358329442879335, + "learning_rate": 4.085745694393551e-05, + "loss": 0.6311, + "step": 803 + }, + { + "epoch": 0.7952522255192879, + "grad_norm": 0.5003987855371286, + "learning_rate": 4.083913521436424e-05, + "loss": 0.5505, + "step": 804 + }, + { + "epoch": 0.7962413452027696, + "grad_norm": 0.39761394658298255, + "learning_rate": 4.082081348479297e-05, + "loss": 0.5423, + "step": 805 + }, + { + "epoch": 0.7972304648862513, + "grad_norm": 0.39424324806445543, + "learning_rate": 4.0802491755221694e-05, + "loss": 0.5376, + "step": 806 + }, + { + "epoch": 0.798219584569733, + "grad_norm": 0.4111741063989657, + "learning_rate": 4.078417002565043e-05, + "loss": 0.5637, + "step": 807 + }, + { + "epoch": 0.7992087042532147, + "grad_norm": 0.4089042750817777, + "learning_rate": 4.0765848296079154e-05, + "loss": 0.6027, + "step": 808 + }, + { + "epoch": 0.8001978239366964, + "grad_norm": 0.35649158749683113, + "learning_rate": 4.074752656650788e-05, + "loss": 0.5493, + "step": 809 + }, + { + "epoch": 0.8011869436201781, + "grad_norm": 0.34390908785069546, + "learning_rate": 4.0729204836936606e-05, + "loss": 0.4582, + "step": 810 + }, + { + "epoch": 0.8021760633036598, + "grad_norm": 0.46108022004907095, + "learning_rate": 4.071088310736534e-05, + "loss": 0.589, + "step": 811 + }, + { + "epoch": 0.8031651829871415, + "grad_norm": 0.4470976570889435, + "learning_rate": 4.0692561377794066e-05, + "loss": 0.5783, + "step": 812 + }, + { + "epoch": 0.8041543026706232, + "grad_norm": 0.41289681603112355, + "learning_rate": 4.067423964822279e-05, + "loss": 0.5335, + "step": 813 + }, + { + "epoch": 0.8051434223541049, + "grad_norm": 0.3897455999305961, + "learning_rate": 4.065591791865152e-05, + "loss": 0.5395, + "step": 814 + }, + { + "epoch": 0.8061325420375866, + "grad_norm": 0.5398949072700544, + "learning_rate": 4.063759618908025e-05, + "loss": 0.5419, + "step": 815 + }, + { + "epoch": 0.8071216617210683, + "grad_norm": 0.40262994039410505, + "learning_rate": 4.061927445950898e-05, + "loss": 0.5674, + "step": 816 + }, + { + "epoch": 0.80811078140455, + "grad_norm": 8.042258970876679, + "learning_rate": 4.0600952729937705e-05, + "loss": 1.1958, + "step": 817 + }, + { + "epoch": 0.8090999010880316, + "grad_norm": 0.7150914848705251, + "learning_rate": 4.058263100036644e-05, + "loss": 0.4752, + "step": 818 + }, + { + "epoch": 0.8100890207715133, + "grad_norm": 0.37450227733321634, + "learning_rate": 4.0564309270795165e-05, + "loss": 0.519, + "step": 819 + }, + { + "epoch": 0.811078140454995, + "grad_norm": 0.7771184561273069, + "learning_rate": 4.05459875412239e-05, + "loss": 0.5667, + "step": 820 + }, + { + "epoch": 0.8120672601384767, + "grad_norm": 0.39320567398584594, + "learning_rate": 4.0527665811652624e-05, + "loss": 0.5189, + "step": 821 + }, + { + "epoch": 0.8130563798219584, + "grad_norm": 0.6164066236372652, + "learning_rate": 4.050934408208135e-05, + "loss": 0.5956, + "step": 822 + }, + { + "epoch": 0.8140454995054401, + "grad_norm": 0.43784506001787876, + "learning_rate": 4.049102235251008e-05, + "loss": 0.5005, + "step": 823 + }, + { + "epoch": 0.8150346191889218, + "grad_norm": 0.650347294511098, + "learning_rate": 4.047270062293881e-05, + "loss": 0.5911, + "step": 824 + }, + { + "epoch": 0.8160237388724035, + "grad_norm": 0.3924537077442831, + "learning_rate": 4.045437889336754e-05, + "loss": 0.5942, + "step": 825 + }, + { + "epoch": 0.8170128585558852, + "grad_norm": 0.5306835116575841, + "learning_rate": 4.043605716379626e-05, + "loss": 0.591, + "step": 826 + }, + { + "epoch": 0.8180019782393669, + "grad_norm": 0.47068526472918815, + "learning_rate": 4.041773543422499e-05, + "loss": 0.5701, + "step": 827 + }, + { + "epoch": 0.8189910979228486, + "grad_norm": 0.5433358297169092, + "learning_rate": 4.039941370465372e-05, + "loss": 0.5032, + "step": 828 + }, + { + "epoch": 0.8199802176063303, + "grad_norm": 0.4117742532527006, + "learning_rate": 4.038109197508245e-05, + "loss": 0.5051, + "step": 829 + }, + { + "epoch": 0.820969337289812, + "grad_norm": 0.45313367798585935, + "learning_rate": 4.0362770245511175e-05, + "loss": 0.5236, + "step": 830 + }, + { + "epoch": 0.8219584569732937, + "grad_norm": 0.4743390307289405, + "learning_rate": 4.034444851593991e-05, + "loss": 0.5661, + "step": 831 + }, + { + "epoch": 0.8229475766567754, + "grad_norm": 0.43419602719141287, + "learning_rate": 4.0326126786368635e-05, + "loss": 0.5262, + "step": 832 + }, + { + "epoch": 0.8239366963402571, + "grad_norm": 0.44595712321645226, + "learning_rate": 4.030780505679737e-05, + "loss": 0.557, + "step": 833 + }, + { + "epoch": 0.8249258160237388, + "grad_norm": 0.46065824846474296, + "learning_rate": 4.028948332722609e-05, + "loss": 0.5425, + "step": 834 + }, + { + "epoch": 0.8259149357072205, + "grad_norm": 0.47395010516783537, + "learning_rate": 4.027116159765482e-05, + "loss": 0.5432, + "step": 835 + }, + { + "epoch": 0.8269040553907022, + "grad_norm": 0.409348387036287, + "learning_rate": 4.025283986808355e-05, + "loss": 0.5371, + "step": 836 + }, + { + "epoch": 0.827893175074184, + "grad_norm": 0.49737572694928917, + "learning_rate": 4.023451813851228e-05, + "loss": 0.517, + "step": 837 + }, + { + "epoch": 0.8288822947576657, + "grad_norm": 0.3964787257419728, + "learning_rate": 4.0216196408941e-05, + "loss": 0.5921, + "step": 838 + }, + { + "epoch": 0.8298714144411474, + "grad_norm": 0.3783571743456649, + "learning_rate": 4.0197874679369734e-05, + "loss": 0.5817, + "step": 839 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 0.45317557022906846, + "learning_rate": 4.017955294979846e-05, + "loss": 0.5361, + "step": 840 + }, + { + "epoch": 0.8318496538081108, + "grad_norm": 0.3855785042525516, + "learning_rate": 4.016123122022719e-05, + "loss": 0.5561, + "step": 841 + }, + { + "epoch": 0.8328387734915925, + "grad_norm": 0.37735899597002825, + "learning_rate": 4.014290949065592e-05, + "loss": 0.4952, + "step": 842 + }, + { + "epoch": 0.8338278931750742, + "grad_norm": 0.3910395508613529, + "learning_rate": 4.0124587761084646e-05, + "loss": 0.5248, + "step": 843 + }, + { + "epoch": 0.8348170128585559, + "grad_norm": 0.41255181351776066, + "learning_rate": 4.010626603151338e-05, + "loss": 0.6143, + "step": 844 + }, + { + "epoch": 0.8358061325420376, + "grad_norm": 0.3881103870770859, + "learning_rate": 4.0087944301942106e-05, + "loss": 0.5858, + "step": 845 + }, + { + "epoch": 0.8367952522255193, + "grad_norm": 0.4610095412501324, + "learning_rate": 4.006962257237083e-05, + "loss": 0.4924, + "step": 846 + }, + { + "epoch": 0.837784371909001, + "grad_norm": 0.36829299206900973, + "learning_rate": 4.005130084279956e-05, + "loss": 0.5079, + "step": 847 + }, + { + "epoch": 0.8387734915924827, + "grad_norm": 0.40289319760057474, + "learning_rate": 4.003297911322829e-05, + "loss": 0.5482, + "step": 848 + }, + { + "epoch": 0.8397626112759644, + "grad_norm": 0.4206511643989216, + "learning_rate": 4.001465738365702e-05, + "loss": 0.5605, + "step": 849 + }, + { + "epoch": 0.8407517309594461, + "grad_norm": 0.41731945925784314, + "learning_rate": 3.999633565408575e-05, + "loss": 0.6092, + "step": 850 + }, + { + "epoch": 0.8417408506429278, + "grad_norm": 0.30257425739688576, + "learning_rate": 3.997801392451447e-05, + "loss": 0.579, + "step": 851 + }, + { + "epoch": 0.8427299703264095, + "grad_norm": 0.3864935018364275, + "learning_rate": 3.9959692194943204e-05, + "loss": 0.4771, + "step": 852 + }, + { + "epoch": 0.8437190900098912, + "grad_norm": 0.3988101514171298, + "learning_rate": 3.994137046537194e-05, + "loss": 0.5624, + "step": 853 + }, + { + "epoch": 0.8447082096933729, + "grad_norm": 0.39420755605728475, + "learning_rate": 3.9923048735800664e-05, + "loss": 0.5568, + "step": 854 + }, + { + "epoch": 0.8456973293768546, + "grad_norm": 0.4921966493944131, + "learning_rate": 3.990472700622939e-05, + "loss": 0.5038, + "step": 855 + }, + { + "epoch": 0.8466864490603363, + "grad_norm": 0.4365043760382965, + "learning_rate": 3.9886405276658117e-05, + "loss": 0.5222, + "step": 856 + }, + { + "epoch": 0.847675568743818, + "grad_norm": 0.44342345181331283, + "learning_rate": 3.986808354708685e-05, + "loss": 0.5426, + "step": 857 + }, + { + "epoch": 0.8486646884272997, + "grad_norm": 0.36474923885017907, + "learning_rate": 3.9849761817515576e-05, + "loss": 0.4866, + "step": 858 + }, + { + "epoch": 0.8496538081107814, + "grad_norm": 0.4279054783487708, + "learning_rate": 3.98314400879443e-05, + "loss": 0.4454, + "step": 859 + }, + { + "epoch": 0.8506429277942631, + "grad_norm": 0.32658261042232156, + "learning_rate": 3.981311835837303e-05, + "loss": 0.5478, + "step": 860 + }, + { + "epoch": 0.8516320474777448, + "grad_norm": 0.3632391166208969, + "learning_rate": 3.979479662880176e-05, + "loss": 0.5293, + "step": 861 + }, + { + "epoch": 0.8526211671612265, + "grad_norm": 0.361466100899412, + "learning_rate": 3.977647489923049e-05, + "loss": 0.5775, + "step": 862 + }, + { + "epoch": 0.8536102868447082, + "grad_norm": 0.40555093574101875, + "learning_rate": 3.9758153169659215e-05, + "loss": 0.4856, + "step": 863 + }, + { + "epoch": 0.8545994065281899, + "grad_norm": 0.3488107870670207, + "learning_rate": 3.973983144008794e-05, + "loss": 0.514, + "step": 864 + }, + { + "epoch": 0.8555885262116716, + "grad_norm": 0.3843960708975495, + "learning_rate": 3.9721509710516675e-05, + "loss": 0.5101, + "step": 865 + }, + { + "epoch": 0.8565776458951533, + "grad_norm": 0.41358392461083593, + "learning_rate": 3.970318798094541e-05, + "loss": 0.5516, + "step": 866 + }, + { + "epoch": 0.857566765578635, + "grad_norm": 0.32460712314978024, + "learning_rate": 3.9684866251374134e-05, + "loss": 0.5351, + "step": 867 + }, + { + "epoch": 0.8585558852621167, + "grad_norm": 0.4590823844091551, + "learning_rate": 3.966654452180286e-05, + "loss": 0.6195, + "step": 868 + }, + { + "epoch": 0.8595450049455984, + "grad_norm": 0.36553223034798327, + "learning_rate": 3.964822279223159e-05, + "loss": 0.5288, + "step": 869 + }, + { + "epoch": 0.8605341246290801, + "grad_norm": 0.42785798677542214, + "learning_rate": 3.962990106266032e-05, + "loss": 0.5341, + "step": 870 + }, + { + "epoch": 0.8615232443125618, + "grad_norm": 0.34046802166560325, + "learning_rate": 3.961157933308905e-05, + "loss": 0.4989, + "step": 871 + }, + { + "epoch": 0.8625123639960435, + "grad_norm": 0.4203148762636131, + "learning_rate": 3.959325760351777e-05, + "loss": 0.4626, + "step": 872 + }, + { + "epoch": 0.8635014836795252, + "grad_norm": 0.46638243088571246, + "learning_rate": 3.95749358739465e-05, + "loss": 0.5535, + "step": 873 + }, + { + "epoch": 0.8644906033630069, + "grad_norm": 0.3207059127416101, + "learning_rate": 3.955661414437523e-05, + "loss": 0.4701, + "step": 874 + }, + { + "epoch": 0.8654797230464887, + "grad_norm": 0.4361065262778062, + "learning_rate": 3.953829241480396e-05, + "loss": 0.5232, + "step": 875 + }, + { + "epoch": 0.8664688427299704, + "grad_norm": 0.4179015275913999, + "learning_rate": 3.9519970685232686e-05, + "loss": 0.4782, + "step": 876 + }, + { + "epoch": 0.8674579624134521, + "grad_norm": 0.39023161716199395, + "learning_rate": 3.950164895566142e-05, + "loss": 0.5666, + "step": 877 + }, + { + "epoch": 0.8684470820969338, + "grad_norm": 2.1308947045622375, + "learning_rate": 3.9483327226090145e-05, + "loss": 0.4915, + "step": 878 + }, + { + "epoch": 0.8694362017804155, + "grad_norm": 0.6355227582113737, + "learning_rate": 3.946500549651888e-05, + "loss": 0.5845, + "step": 879 + }, + { + "epoch": 0.8704253214638972, + "grad_norm": 0.4231786188850144, + "learning_rate": 3.94466837669476e-05, + "loss": 0.5186, + "step": 880 + }, + { + "epoch": 0.8714144411473789, + "grad_norm": 0.5963762480510161, + "learning_rate": 3.942836203737633e-05, + "loss": 0.5478, + "step": 881 + }, + { + "epoch": 0.8724035608308606, + "grad_norm": 0.44967258871723204, + "learning_rate": 3.941004030780506e-05, + "loss": 0.6044, + "step": 882 + }, + { + "epoch": 0.8733926805143423, + "grad_norm": 0.49666107894686895, + "learning_rate": 3.939171857823379e-05, + "loss": 0.5572, + "step": 883 + }, + { + "epoch": 0.874381800197824, + "grad_norm": 0.4511223362706553, + "learning_rate": 3.937339684866251e-05, + "loss": 0.5167, + "step": 884 + }, + { + "epoch": 0.8753709198813057, + "grad_norm": 0.39070885224778895, + "learning_rate": 3.9355075119091244e-05, + "loss": 0.5711, + "step": 885 + }, + { + "epoch": 0.8763600395647874, + "grad_norm": 0.4817666139801221, + "learning_rate": 3.933675338951997e-05, + "loss": 0.5614, + "step": 886 + }, + { + "epoch": 0.8773491592482691, + "grad_norm": 0.37439555094432153, + "learning_rate": 3.93184316599487e-05, + "loss": 0.473, + "step": 887 + }, + { + "epoch": 0.8783382789317508, + "grad_norm": 0.512933754535988, + "learning_rate": 3.930010993037743e-05, + "loss": 0.5664, + "step": 888 + }, + { + "epoch": 0.8793273986152325, + "grad_norm": 0.3861195811560788, + "learning_rate": 3.9281788200806156e-05, + "loss": 0.4614, + "step": 889 + }, + { + "epoch": 0.8803165182987142, + "grad_norm": 0.5026948257818956, + "learning_rate": 3.926346647123489e-05, + "loss": 0.5455, + "step": 890 + }, + { + "epoch": 0.8813056379821959, + "grad_norm": 0.3797918839860446, + "learning_rate": 3.9245144741663616e-05, + "loss": 0.5669, + "step": 891 + }, + { + "epoch": 0.8822947576656776, + "grad_norm": 0.3580599914264445, + "learning_rate": 3.922682301209234e-05, + "loss": 0.4969, + "step": 892 + }, + { + "epoch": 0.8832838773491593, + "grad_norm": 0.4680809277442312, + "learning_rate": 3.920850128252107e-05, + "loss": 0.53, + "step": 893 + }, + { + "epoch": 0.884272997032641, + "grad_norm": 0.3182331740958757, + "learning_rate": 3.91901795529498e-05, + "loss": 0.4916, + "step": 894 + }, + { + "epoch": 0.8852621167161226, + "grad_norm": 0.38629102830836864, + "learning_rate": 3.917185782337853e-05, + "loss": 0.4831, + "step": 895 + }, + { + "epoch": 0.8862512363996043, + "grad_norm": 0.38003929877472475, + "learning_rate": 3.915353609380726e-05, + "loss": 0.4544, + "step": 896 + }, + { + "epoch": 0.887240356083086, + "grad_norm": 0.35819047353737965, + "learning_rate": 3.913521436423598e-05, + "loss": 0.6131, + "step": 897 + }, + { + "epoch": 0.8882294757665677, + "grad_norm": 0.391106759277783, + "learning_rate": 3.9116892634664714e-05, + "loss": 0.5327, + "step": 898 + }, + { + "epoch": 0.8892185954500494, + "grad_norm": 0.44913570347970116, + "learning_rate": 3.909857090509344e-05, + "loss": 0.5853, + "step": 899 + }, + { + "epoch": 0.8902077151335311, + "grad_norm": 0.33654774158156275, + "learning_rate": 3.9080249175522174e-05, + "loss": 0.5172, + "step": 900 + }, + { + "epoch": 0.8911968348170128, + "grad_norm": 0.4143598750420622, + "learning_rate": 3.90619274459509e-05, + "loss": 0.4848, + "step": 901 + }, + { + "epoch": 0.8921859545004945, + "grad_norm": 0.524991445546096, + "learning_rate": 3.904360571637963e-05, + "loss": 0.4199, + "step": 902 + }, + { + "epoch": 0.8931750741839762, + "grad_norm": 0.3927175649035216, + "learning_rate": 3.902528398680836e-05, + "loss": 0.5881, + "step": 903 + }, + { + "epoch": 0.8941641938674579, + "grad_norm": 0.4079491608662268, + "learning_rate": 3.9006962257237086e-05, + "loss": 0.5405, + "step": 904 + }, + { + "epoch": 0.8951533135509396, + "grad_norm": 0.424142781885458, + "learning_rate": 3.898864052766581e-05, + "loss": 0.5487, + "step": 905 + }, + { + "epoch": 0.8961424332344213, + "grad_norm": 0.41683695083750005, + "learning_rate": 3.897031879809454e-05, + "loss": 0.4787, + "step": 906 + }, + { + "epoch": 0.897131552917903, + "grad_norm": 0.3547610738651701, + "learning_rate": 3.895199706852327e-05, + "loss": 0.4979, + "step": 907 + }, + { + "epoch": 0.8981206726013847, + "grad_norm": 2.3469674956937228, + "learning_rate": 3.8933675338952e-05, + "loss": 0.5794, + "step": 908 + }, + { + "epoch": 0.8991097922848664, + "grad_norm": 0.6412020698124778, + "learning_rate": 3.8915353609380725e-05, + "loss": 0.5268, + "step": 909 + }, + { + "epoch": 0.9000989119683481, + "grad_norm": 1.0383233848694802, + "learning_rate": 3.889703187980945e-05, + "loss": 0.5414, + "step": 910 + }, + { + "epoch": 0.9010880316518298, + "grad_norm": 0.43251475437881326, + "learning_rate": 3.8878710150238185e-05, + "loss": 0.4797, + "step": 911 + }, + { + "epoch": 0.9020771513353115, + "grad_norm": 0.5157606933923597, + "learning_rate": 3.886038842066692e-05, + "loss": 0.5654, + "step": 912 + }, + { + "epoch": 0.9030662710187932, + "grad_norm": 0.3743540399979926, + "learning_rate": 3.8842066691095644e-05, + "loss": 0.5119, + "step": 913 + }, + { + "epoch": 0.904055390702275, + "grad_norm": 0.42098522740414857, + "learning_rate": 3.882374496152437e-05, + "loss": 0.489, + "step": 914 + }, + { + "epoch": 0.9050445103857567, + "grad_norm": 0.5243073127176688, + "learning_rate": 3.88054232319531e-05, + "loss": 0.5039, + "step": 915 + }, + { + "epoch": 0.9060336300692384, + "grad_norm": 0.3986349967328183, + "learning_rate": 3.878710150238183e-05, + "loss": 0.5337, + "step": 916 + }, + { + "epoch": 0.9070227497527201, + "grad_norm": 0.5429828786199771, + "learning_rate": 3.876877977281056e-05, + "loss": 0.5475, + "step": 917 + }, + { + "epoch": 0.9080118694362018, + "grad_norm": 0.44994465926714916, + "learning_rate": 3.875045804323928e-05, + "loss": 0.5136, + "step": 918 + }, + { + "epoch": 0.9090009891196835, + "grad_norm": 0.3415738750821067, + "learning_rate": 3.873213631366801e-05, + "loss": 0.5346, + "step": 919 + }, + { + "epoch": 0.9099901088031652, + "grad_norm": 0.3937349524036801, + "learning_rate": 3.871381458409674e-05, + "loss": 0.466, + "step": 920 + }, + { + "epoch": 0.9109792284866469, + "grad_norm": 0.5131695235804504, + "learning_rate": 3.869549285452547e-05, + "loss": 0.6701, + "step": 921 + }, + { + "epoch": 0.9119683481701286, + "grad_norm": 0.33653367131443224, + "learning_rate": 3.8677171124954196e-05, + "loss": 0.4473, + "step": 922 + }, + { + "epoch": 0.9129574678536103, + "grad_norm": 0.5446871845384862, + "learning_rate": 3.865884939538292e-05, + "loss": 0.5528, + "step": 923 + }, + { + "epoch": 0.913946587537092, + "grad_norm": 0.5015814740270789, + "learning_rate": 3.8640527665811655e-05, + "loss": 0.5226, + "step": 924 + }, + { + "epoch": 0.9149357072205737, + "grad_norm": 0.7983585338010668, + "learning_rate": 3.862220593624039e-05, + "loss": 0.4371, + "step": 925 + }, + { + "epoch": 0.9159248269040554, + "grad_norm": 0.48904182262923174, + "learning_rate": 3.860388420666911e-05, + "loss": 0.5594, + "step": 926 + }, + { + "epoch": 0.9169139465875371, + "grad_norm": 0.4503935788610765, + "learning_rate": 3.858556247709784e-05, + "loss": 0.6133, + "step": 927 + }, + { + "epoch": 0.9179030662710188, + "grad_norm": 0.3165947004747236, + "learning_rate": 3.856724074752657e-05, + "loss": 0.4772, + "step": 928 + }, + { + "epoch": 0.9188921859545005, + "grad_norm": 0.5586077488303228, + "learning_rate": 3.85489190179553e-05, + "loss": 0.5199, + "step": 929 + }, + { + "epoch": 0.9198813056379822, + "grad_norm": 0.5085374651276036, + "learning_rate": 3.853059728838402e-05, + "loss": 0.5306, + "step": 930 + }, + { + "epoch": 0.9208704253214639, + "grad_norm": 0.4129244011177407, + "learning_rate": 3.8512275558812754e-05, + "loss": 0.4806, + "step": 931 + }, + { + "epoch": 0.9218595450049456, + "grad_norm": 0.4861459748388243, + "learning_rate": 3.849395382924148e-05, + "loss": 0.4908, + "step": 932 + }, + { + "epoch": 0.9228486646884273, + "grad_norm": 0.41526131109253767, + "learning_rate": 3.847563209967021e-05, + "loss": 0.5276, + "step": 933 + }, + { + "epoch": 0.923837784371909, + "grad_norm": 0.4009408084973653, + "learning_rate": 3.845731037009894e-05, + "loss": 0.5562, + "step": 934 + }, + { + "epoch": 0.9248269040553907, + "grad_norm": 0.5745910621076383, + "learning_rate": 3.8438988640527666e-05, + "loss": 0.5774, + "step": 935 + }, + { + "epoch": 0.9258160237388724, + "grad_norm": 0.4637760415387979, + "learning_rate": 3.84206669109564e-05, + "loss": 0.5435, + "step": 936 + }, + { + "epoch": 0.9268051434223541, + "grad_norm": 0.41864980359230897, + "learning_rate": 3.8402345181385126e-05, + "loss": 0.5391, + "step": 937 + }, + { + "epoch": 0.9277942631058358, + "grad_norm": 1.1869062780052408, + "learning_rate": 3.838402345181385e-05, + "loss": 0.5023, + "step": 938 + }, + { + "epoch": 0.9287833827893175, + "grad_norm": 0.43776779645633507, + "learning_rate": 3.836570172224258e-05, + "loss": 0.5096, + "step": 939 + }, + { + "epoch": 0.9297725024727992, + "grad_norm": 0.5198208217050955, + "learning_rate": 3.834737999267131e-05, + "loss": 0.5335, + "step": 940 + }, + { + "epoch": 0.9307616221562809, + "grad_norm": 0.5235285409308027, + "learning_rate": 3.832905826310004e-05, + "loss": 0.5847, + "step": 941 + }, + { + "epoch": 0.9317507418397626, + "grad_norm": 0.44126732924600726, + "learning_rate": 3.831073653352877e-05, + "loss": 0.4719, + "step": 942 + }, + { + "epoch": 0.9327398615232443, + "grad_norm": 0.5054573511058064, + "learning_rate": 3.829241480395749e-05, + "loss": 0.5096, + "step": 943 + }, + { + "epoch": 0.933728981206726, + "grad_norm": 0.5805833518404947, + "learning_rate": 3.8274093074386224e-05, + "loss": 0.5011, + "step": 944 + }, + { + "epoch": 0.9347181008902077, + "grad_norm": 2.2314124854517408, + "learning_rate": 3.825577134481495e-05, + "loss": 0.5527, + "step": 945 + }, + { + "epoch": 0.9357072205736894, + "grad_norm": 0.5483697457861862, + "learning_rate": 3.8237449615243684e-05, + "loss": 0.5321, + "step": 946 + }, + { + "epoch": 0.9366963402571711, + "grad_norm": 0.47552103451002065, + "learning_rate": 3.8219127885672404e-05, + "loss": 0.5372, + "step": 947 + }, + { + "epoch": 0.9376854599406528, + "grad_norm": 0.31286512825007917, + "learning_rate": 3.820080615610114e-05, + "loss": 0.4985, + "step": 948 + }, + { + "epoch": 0.9386745796241345, + "grad_norm": 1.7242082421798055, + "learning_rate": 3.818248442652987e-05, + "loss": 0.5405, + "step": 949 + }, + { + "epoch": 0.9396636993076162, + "grad_norm": 0.4700031516218264, + "learning_rate": 3.8164162696958596e-05, + "loss": 0.4871, + "step": 950 + }, + { + "epoch": 0.9406528189910979, + "grad_norm": 0.38084464993511985, + "learning_rate": 3.814584096738732e-05, + "loss": 0.4736, + "step": 951 + }, + { + "epoch": 0.9416419386745796, + "grad_norm": 0.4483615144248822, + "learning_rate": 3.812751923781605e-05, + "loss": 0.5035, + "step": 952 + }, + { + "epoch": 0.9426310583580614, + "grad_norm": 0.3741684709974548, + "learning_rate": 3.810919750824478e-05, + "loss": 0.5079, + "step": 953 + }, + { + "epoch": 0.9436201780415431, + "grad_norm": 0.3159720304753471, + "learning_rate": 3.809087577867351e-05, + "loss": 0.5318, + "step": 954 + }, + { + "epoch": 0.9446092977250248, + "grad_norm": 0.4215113817807427, + "learning_rate": 3.8072554049102235e-05, + "loss": 0.5136, + "step": 955 + }, + { + "epoch": 0.9455984174085065, + "grad_norm": 0.37425060046032715, + "learning_rate": 3.805423231953096e-05, + "loss": 0.5254, + "step": 956 + }, + { + "epoch": 0.9465875370919882, + "grad_norm": 0.3257675406702953, + "learning_rate": 3.8035910589959695e-05, + "loss": 0.5079, + "step": 957 + }, + { + "epoch": 0.9475766567754699, + "grad_norm": 0.45095100461312454, + "learning_rate": 3.801758886038842e-05, + "loss": 0.512, + "step": 958 + }, + { + "epoch": 0.9485657764589516, + "grad_norm": 0.4090749619120724, + "learning_rate": 3.7999267130817154e-05, + "loss": 0.5396, + "step": 959 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 0.39265990057296607, + "learning_rate": 3.798094540124588e-05, + "loss": 0.4991, + "step": 960 + }, + { + "epoch": 0.950544015825915, + "grad_norm": 0.4063218758435763, + "learning_rate": 3.796262367167461e-05, + "loss": 0.489, + "step": 961 + }, + { + "epoch": 0.9515331355093967, + "grad_norm": 0.4829638146436522, + "learning_rate": 3.794430194210334e-05, + "loss": 0.5859, + "step": 962 + }, + { + "epoch": 0.9525222551928784, + "grad_norm": 0.33683234605221707, + "learning_rate": 3.792598021253207e-05, + "loss": 0.5402, + "step": 963 + }, + { + "epoch": 0.9535113748763601, + "grad_norm": 0.4136710761072618, + "learning_rate": 3.790765848296079e-05, + "loss": 0.6248, + "step": 964 + }, + { + "epoch": 0.9545004945598418, + "grad_norm": 0.44419096696115934, + "learning_rate": 3.788933675338952e-05, + "loss": 0.5687, + "step": 965 + }, + { + "epoch": 0.9554896142433235, + "grad_norm": 0.314815697263912, + "learning_rate": 3.787101502381825e-05, + "loss": 0.5257, + "step": 966 + }, + { + "epoch": 0.9564787339268052, + "grad_norm": 0.3767859758997778, + "learning_rate": 3.785269329424698e-05, + "loss": 0.4821, + "step": 967 + }, + { + "epoch": 0.9574678536102869, + "grad_norm": 0.3695256393459967, + "learning_rate": 3.7834371564675706e-05, + "loss": 0.5113, + "step": 968 + }, + { + "epoch": 0.9584569732937686, + "grad_norm": 0.29897151173130265, + "learning_rate": 3.781604983510443e-05, + "loss": 0.5127, + "step": 969 + }, + { + "epoch": 0.9594460929772503, + "grad_norm": 0.3406098944483567, + "learning_rate": 3.7797728105533165e-05, + "loss": 0.4787, + "step": 970 + }, + { + "epoch": 0.960435212660732, + "grad_norm": 0.3727822915676798, + "learning_rate": 3.777940637596189e-05, + "loss": 0.5348, + "step": 971 + }, + { + "epoch": 0.9614243323442137, + "grad_norm": 0.3140702134023394, + "learning_rate": 3.776108464639062e-05, + "loss": 0.4602, + "step": 972 + }, + { + "epoch": 0.9624134520276953, + "grad_norm": 0.3848508421746314, + "learning_rate": 3.774276291681935e-05, + "loss": 0.5356, + "step": 973 + }, + { + "epoch": 0.963402571711177, + "grad_norm": 0.32214698366653616, + "learning_rate": 3.772444118724808e-05, + "loss": 0.5184, + "step": 974 + }, + { + "epoch": 0.9643916913946587, + "grad_norm": 0.36714549493193244, + "learning_rate": 3.770611945767681e-05, + "loss": 0.4755, + "step": 975 + }, + { + "epoch": 0.9653808110781404, + "grad_norm": 0.34869975067509607, + "learning_rate": 3.768779772810553e-05, + "loss": 0.6225, + "step": 976 + }, + { + "epoch": 0.9663699307616221, + "grad_norm": 0.3820217670798009, + "learning_rate": 3.7669475998534264e-05, + "loss": 0.4952, + "step": 977 + }, + { + "epoch": 0.9673590504451038, + "grad_norm": 0.3985817080513296, + "learning_rate": 3.765115426896299e-05, + "loss": 0.5567, + "step": 978 + }, + { + "epoch": 0.9683481701285855, + "grad_norm": 0.31651201835869763, + "learning_rate": 3.7632832539391723e-05, + "loss": 0.5074, + "step": 979 + }, + { + "epoch": 0.9693372898120672, + "grad_norm": 0.37619456746828517, + "learning_rate": 3.761451080982045e-05, + "loss": 0.5589, + "step": 980 + }, + { + "epoch": 0.9703264094955489, + "grad_norm": 0.3623920056918156, + "learning_rate": 3.7596189080249176e-05, + "loss": 0.4757, + "step": 981 + }, + { + "epoch": 0.9713155291790306, + "grad_norm": 0.3420143952848393, + "learning_rate": 3.75778673506779e-05, + "loss": 0.4884, + "step": 982 + }, + { + "epoch": 0.9723046488625123, + "grad_norm": 0.32751462019753536, + "learning_rate": 3.7559545621106636e-05, + "loss": 0.4947, + "step": 983 + }, + { + "epoch": 0.973293768545994, + "grad_norm": 0.31318281028088973, + "learning_rate": 3.754122389153536e-05, + "loss": 0.4836, + "step": 984 + }, + { + "epoch": 0.9742828882294757, + "grad_norm": 0.38127965484424997, + "learning_rate": 3.752290216196409e-05, + "loss": 0.5345, + "step": 985 + }, + { + "epoch": 0.9752720079129574, + "grad_norm": 0.37819403084012576, + "learning_rate": 3.750458043239282e-05, + "loss": 0.4964, + "step": 986 + }, + { + "epoch": 0.9762611275964391, + "grad_norm": 0.3560452300150932, + "learning_rate": 3.748625870282155e-05, + "loss": 0.5693, + "step": 987 + }, + { + "epoch": 0.9772502472799208, + "grad_norm": 0.40812765024892683, + "learning_rate": 3.746793697325028e-05, + "loss": 0.5594, + "step": 988 + }, + { + "epoch": 0.9782393669634025, + "grad_norm": 0.39374505335822296, + "learning_rate": 3.7449615243679e-05, + "loss": 0.5583, + "step": 989 + }, + { + "epoch": 0.9792284866468842, + "grad_norm": 0.37540173769820445, + "learning_rate": 3.7431293514107734e-05, + "loss": 0.4881, + "step": 990 + }, + { + "epoch": 0.9802176063303659, + "grad_norm": 0.443962903657654, + "learning_rate": 3.741297178453646e-05, + "loss": 0.5349, + "step": 991 + }, + { + "epoch": 0.9812067260138477, + "grad_norm": 0.3858662193153671, + "learning_rate": 3.7394650054965194e-05, + "loss": 0.5483, + "step": 992 + }, + { + "epoch": 0.9821958456973294, + "grad_norm": 0.8166142256059559, + "learning_rate": 3.7376328325393914e-05, + "loss": 0.591, + "step": 993 + }, + { + "epoch": 0.9831849653808111, + "grad_norm": 0.4159757188981098, + "learning_rate": 3.735800659582265e-05, + "loss": 0.5583, + "step": 994 + }, + { + "epoch": 0.9841740850642928, + "grad_norm": 0.8142513421373967, + "learning_rate": 3.733968486625137e-05, + "loss": 0.5662, + "step": 995 + }, + { + "epoch": 0.9851632047477745, + "grad_norm": 0.491701116849365, + "learning_rate": 3.7321363136680106e-05, + "loss": 0.4751, + "step": 996 + }, + { + "epoch": 0.9861523244312562, + "grad_norm": 0.6257416157107429, + "learning_rate": 3.730304140710883e-05, + "loss": 0.5465, + "step": 997 + }, + { + "epoch": 0.9871414441147379, + "grad_norm": 0.3803596226603389, + "learning_rate": 3.728471967753756e-05, + "loss": 0.5614, + "step": 998 + }, + { + "epoch": 0.9881305637982196, + "grad_norm": 0.48848649838986524, + "learning_rate": 3.726639794796629e-05, + "loss": 0.5025, + "step": 999 + }, + { + "epoch": 0.9891196834817013, + "grad_norm": 0.6067005301320401, + "learning_rate": 3.724807621839502e-05, + "loss": 0.5909, + "step": 1000 + }, + { + "epoch": 0.990108803165183, + "grad_norm": 0.30491012951863644, + "learning_rate": 3.7229754488823745e-05, + "loss": 0.5312, + "step": 1001 + }, + { + "epoch": 0.9910979228486647, + "grad_norm": 0.5263181916386561, + "learning_rate": 3.721143275925247e-05, + "loss": 0.5418, + "step": 1002 + }, + { + "epoch": 0.9920870425321464, + "grad_norm": 0.48669107191244604, + "learning_rate": 3.7193111029681205e-05, + "loss": 0.4754, + "step": 1003 + }, + { + "epoch": 0.9930761622156281, + "grad_norm": 0.32683564491319556, + "learning_rate": 3.717478930010993e-05, + "loss": 0.4993, + "step": 1004 + }, + { + "epoch": 0.9940652818991098, + "grad_norm": 0.41176858367306507, + "learning_rate": 3.7156467570538664e-05, + "loss": 0.5087, + "step": 1005 + }, + { + "epoch": 0.9950544015825915, + "grad_norm": 0.47830883072176333, + "learning_rate": 3.7138145840967384e-05, + "loss": 0.5465, + "step": 1006 + }, + { + "epoch": 0.9960435212660732, + "grad_norm": 0.3751711273770695, + "learning_rate": 3.711982411139612e-05, + "loss": 0.5465, + "step": 1007 + }, + { + "epoch": 0.9970326409495549, + "grad_norm": 0.3863008248417529, + "learning_rate": 3.710150238182485e-05, + "loss": 0.5453, + "step": 1008 + }, + { + "epoch": 0.9980217606330366, + "grad_norm": 0.35255489104921495, + "learning_rate": 3.708318065225358e-05, + "loss": 0.4655, + "step": 1009 + }, + { + "epoch": 0.9990108803165183, + "grad_norm": 0.3485913687147211, + "learning_rate": 3.70648589226823e-05, + "loss": 0.5097, + "step": 1010 + }, + { + "epoch": 1.0, + "grad_norm": 0.4578352977667433, + "learning_rate": 3.704653719311103e-05, + "loss": 0.5795, + "step": 1011 + }, + { + "epoch": 1.0009891196834817, + "grad_norm": 0.38768311863290106, + "learning_rate": 3.702821546353976e-05, + "loss": 0.401, + "step": 1012 + }, + { + "epoch": 1.0019782393669634, + "grad_norm": 0.3714367929643285, + "learning_rate": 3.700989373396849e-05, + "loss": 0.43, + "step": 1013 + }, + { + "epoch": 1.002967359050445, + "grad_norm": 0.39615149577959957, + "learning_rate": 3.6991572004397216e-05, + "loss": 0.5277, + "step": 1014 + }, + { + "epoch": 1.0039564787339268, + "grad_norm": 0.2867304423631312, + "learning_rate": 3.697325027482594e-05, + "loss": 0.4223, + "step": 1015 + }, + { + "epoch": 1.0049455984174085, + "grad_norm": 0.3693181837334904, + "learning_rate": 3.6954928545254675e-05, + "loss": 0.4469, + "step": 1016 + }, + { + "epoch": 1.0059347181008902, + "grad_norm": 0.34055845537716634, + "learning_rate": 3.69366068156834e-05, + "loss": 0.4902, + "step": 1017 + }, + { + "epoch": 1.006923837784372, + "grad_norm": 0.3429833347762005, + "learning_rate": 3.691828508611213e-05, + "loss": 0.4396, + "step": 1018 + }, + { + "epoch": 1.0079129574678536, + "grad_norm": 0.3693590550132719, + "learning_rate": 3.6899963356540855e-05, + "loss": 0.425, + "step": 1019 + }, + { + "epoch": 1.0089020771513353, + "grad_norm": 3.0499675675684976, + "learning_rate": 3.688164162696959e-05, + "loss": 0.5673, + "step": 1020 + }, + { + "epoch": 1.009891196834817, + "grad_norm": 3.45703439958464, + "learning_rate": 3.686331989739832e-05, + "loss": 0.4238, + "step": 1021 + }, + { + "epoch": 1.0108803165182987, + "grad_norm": 1.1262732708801717, + "learning_rate": 3.684499816782704e-05, + "loss": 0.4644, + "step": 1022 + }, + { + "epoch": 1.0118694362017804, + "grad_norm": 0.35784290295603, + "learning_rate": 3.6826676438255774e-05, + "loss": 0.489, + "step": 1023 + }, + { + "epoch": 1.012858555885262, + "grad_norm": 0.4608616029921624, + "learning_rate": 3.68083547086845e-05, + "loss": 0.4936, + "step": 1024 + }, + { + "epoch": 1.0138476755687438, + "grad_norm": 0.424887693355373, + "learning_rate": 3.6790032979113233e-05, + "loss": 0.5167, + "step": 1025 + }, + { + "epoch": 1.0148367952522255, + "grad_norm": 0.447495609689506, + "learning_rate": 3.677171124954196e-05, + "loss": 0.4492, + "step": 1026 + }, + { + "epoch": 1.0158259149357072, + "grad_norm": 0.36530975855838116, + "learning_rate": 3.6753389519970686e-05, + "loss": 0.4778, + "step": 1027 + }, + { + "epoch": 1.0168150346191889, + "grad_norm": 1.9814451602515812, + "learning_rate": 3.673506779039941e-05, + "loss": 0.4786, + "step": 1028 + }, + { + "epoch": 1.0178041543026706, + "grad_norm": 0.5630016104667595, + "learning_rate": 3.6716746060828146e-05, + "loss": 0.4327, + "step": 1029 + }, + { + "epoch": 1.0187932739861523, + "grad_norm": 0.36111632730607574, + "learning_rate": 3.669842433125687e-05, + "loss": 0.4235, + "step": 1030 + }, + { + "epoch": 1.019782393669634, + "grad_norm": 5.436341015640222, + "learning_rate": 3.66801026016856e-05, + "loss": 0.5393, + "step": 1031 + }, + { + "epoch": 1.0207715133531157, + "grad_norm": 0.7375907758295829, + "learning_rate": 3.666178087211433e-05, + "loss": 0.4285, + "step": 1032 + }, + { + "epoch": 1.0217606330365974, + "grad_norm": 0.3037209911593098, + "learning_rate": 3.664345914254306e-05, + "loss": 0.4599, + "step": 1033 + }, + { + "epoch": 1.022749752720079, + "grad_norm": 0.7465649356485863, + "learning_rate": 3.662513741297179e-05, + "loss": 0.4379, + "step": 1034 + }, + { + "epoch": 1.0237388724035608, + "grad_norm": 0.43790277448859827, + "learning_rate": 3.660681568340051e-05, + "loss": 0.4044, + "step": 1035 + }, + { + "epoch": 1.0247279920870425, + "grad_norm": 0.6032981713354464, + "learning_rate": 3.6588493953829244e-05, + "loss": 0.4908, + "step": 1036 + }, + { + "epoch": 1.0257171117705242, + "grad_norm": 0.5761051820115305, + "learning_rate": 3.657017222425797e-05, + "loss": 0.4773, + "step": 1037 + }, + { + "epoch": 1.0267062314540059, + "grad_norm": 0.7647537299042617, + "learning_rate": 3.6551850494686704e-05, + "loss": 0.4242, + "step": 1038 + }, + { + "epoch": 1.0276953511374876, + "grad_norm": 0.5762122743965227, + "learning_rate": 3.6533528765115424e-05, + "loss": 0.4405, + "step": 1039 + }, + { + "epoch": 1.0286844708209693, + "grad_norm": 0.3009983442497535, + "learning_rate": 3.651520703554416e-05, + "loss": 0.4678, + "step": 1040 + }, + { + "epoch": 1.029673590504451, + "grad_norm": 0.562112523671321, + "learning_rate": 3.649688530597288e-05, + "loss": 0.4621, + "step": 1041 + }, + { + "epoch": 1.0306627101879327, + "grad_norm": 0.37989462590052814, + "learning_rate": 3.6478563576401616e-05, + "loss": 0.4267, + "step": 1042 + }, + { + "epoch": 1.0316518298714143, + "grad_norm": 1.0300574287245914, + "learning_rate": 3.646024184683034e-05, + "loss": 0.4476, + "step": 1043 + }, + { + "epoch": 1.032640949554896, + "grad_norm": 0.556363323634819, + "learning_rate": 3.644192011725907e-05, + "loss": 0.4468, + "step": 1044 + }, + { + "epoch": 1.0336300692383777, + "grad_norm": 0.3386788946695046, + "learning_rate": 3.64235983876878e-05, + "loss": 0.4364, + "step": 1045 + }, + { + "epoch": 1.0346191889218594, + "grad_norm": 0.5192836886115852, + "learning_rate": 3.640527665811653e-05, + "loss": 0.4294, + "step": 1046 + }, + { + "epoch": 1.0356083086053411, + "grad_norm": 0.37484641311334366, + "learning_rate": 3.6386954928545255e-05, + "loss": 0.4691, + "step": 1047 + }, + { + "epoch": 1.0365974282888228, + "grad_norm": 0.43983528468602306, + "learning_rate": 3.636863319897398e-05, + "loss": 0.4607, + "step": 1048 + }, + { + "epoch": 1.0375865479723045, + "grad_norm": 1.031961425019673, + "learning_rate": 3.6350311469402715e-05, + "loss": 0.5229, + "step": 1049 + }, + { + "epoch": 1.0385756676557865, + "grad_norm": 3.2210405891971194, + "learning_rate": 3.633198973983144e-05, + "loss": 0.5568, + "step": 1050 + }, + { + "epoch": 1.039564787339268, + "grad_norm": 0.6782949578972237, + "learning_rate": 3.6313668010260175e-05, + "loss": 0.4354, + "step": 1051 + }, + { + "epoch": 1.0405539070227499, + "grad_norm": 0.4127185908000618, + "learning_rate": 3.6295346280688894e-05, + "loss": 0.4387, + "step": 1052 + }, + { + "epoch": 1.0415430267062316, + "grad_norm": 0.490074753069455, + "learning_rate": 3.627702455111763e-05, + "loss": 0.4307, + "step": 1053 + }, + { + "epoch": 1.0425321463897133, + "grad_norm": 0.6921248034479954, + "learning_rate": 3.6258702821546354e-05, + "loss": 0.4723, + "step": 1054 + }, + { + "epoch": 1.043521266073195, + "grad_norm": 0.393153265610656, + "learning_rate": 3.624038109197509e-05, + "loss": 0.4716, + "step": 1055 + }, + { + "epoch": 1.0445103857566767, + "grad_norm": 0.5649667904822261, + "learning_rate": 3.6222059362403813e-05, + "loss": 0.4793, + "step": 1056 + }, + { + "epoch": 1.0454995054401583, + "grad_norm": 1.8299145260790468, + "learning_rate": 3.620373763283254e-05, + "loss": 0.4621, + "step": 1057 + }, + { + "epoch": 1.04648862512364, + "grad_norm": 1.725090631699042, + "learning_rate": 3.618541590326127e-05, + "loss": 0.4317, + "step": 1058 + }, + { + "epoch": 1.0474777448071217, + "grad_norm": 0.537785509003733, + "learning_rate": 3.616709417369e-05, + "loss": 0.4513, + "step": 1059 + }, + { + "epoch": 1.0484668644906034, + "grad_norm": 0.33792548688856516, + "learning_rate": 3.6148772444118726e-05, + "loss": 0.4657, + "step": 1060 + }, + { + "epoch": 1.0494559841740851, + "grad_norm": 0.8248367654993363, + "learning_rate": 3.613045071454745e-05, + "loss": 0.4434, + "step": 1061 + }, + { + "epoch": 1.0504451038575668, + "grad_norm": 0.4266254409159955, + "learning_rate": 3.6112128984976185e-05, + "loss": 0.4681, + "step": 1062 + }, + { + "epoch": 1.0514342235410485, + "grad_norm": 0.42576334310945435, + "learning_rate": 3.609380725540491e-05, + "loss": 0.4561, + "step": 1063 + }, + { + "epoch": 1.0524233432245302, + "grad_norm": 0.5266659764246051, + "learning_rate": 3.607548552583364e-05, + "loss": 0.4802, + "step": 1064 + }, + { + "epoch": 1.053412462908012, + "grad_norm": 0.810147632482765, + "learning_rate": 3.6057163796262365e-05, + "loss": 0.4591, + "step": 1065 + }, + { + "epoch": 1.0544015825914936, + "grad_norm": 0.413804915322679, + "learning_rate": 3.60388420666911e-05, + "loss": 0.3825, + "step": 1066 + }, + { + "epoch": 1.0553907022749753, + "grad_norm": 0.44504381838193996, + "learning_rate": 3.602052033711983e-05, + "loss": 0.4876, + "step": 1067 + }, + { + "epoch": 1.056379821958457, + "grad_norm": 0.4439897157440488, + "learning_rate": 3.600219860754855e-05, + "loss": 0.4253, + "step": 1068 + }, + { + "epoch": 1.0573689416419387, + "grad_norm": 0.3500704299271919, + "learning_rate": 3.5983876877977284e-05, + "loss": 0.4336, + "step": 1069 + }, + { + "epoch": 1.0583580613254204, + "grad_norm": 0.33953086838015223, + "learning_rate": 3.596555514840601e-05, + "loss": 0.4255, + "step": 1070 + }, + { + "epoch": 1.0593471810089021, + "grad_norm": 0.48346975571239686, + "learning_rate": 3.5947233418834744e-05, + "loss": 0.4609, + "step": 1071 + }, + { + "epoch": 1.0603363006923838, + "grad_norm": 0.3187757649005785, + "learning_rate": 3.592891168926347e-05, + "loss": 0.4665, + "step": 1072 + }, + { + "epoch": 1.0613254203758655, + "grad_norm": 0.42833406748043845, + "learning_rate": 3.5910589959692196e-05, + "loss": 0.3678, + "step": 1073 + }, + { + "epoch": 1.0623145400593472, + "grad_norm": 0.4566593411257765, + "learning_rate": 3.589226823012092e-05, + "loss": 0.4284, + "step": 1074 + }, + { + "epoch": 1.063303659742829, + "grad_norm": 0.3468485072829719, + "learning_rate": 3.5873946500549656e-05, + "loss": 0.4746, + "step": 1075 + }, + { + "epoch": 1.0642927794263106, + "grad_norm": 0.510162571679684, + "learning_rate": 3.585562477097838e-05, + "loss": 0.4224, + "step": 1076 + }, + { + "epoch": 1.0652818991097923, + "grad_norm": 0.3979585894735656, + "learning_rate": 3.583730304140711e-05, + "loss": 0.4646, + "step": 1077 + }, + { + "epoch": 1.066271018793274, + "grad_norm": 0.36760884964977675, + "learning_rate": 3.5818981311835835e-05, + "loss": 0.4491, + "step": 1078 + }, + { + "epoch": 1.0672601384767557, + "grad_norm": 0.5145237916838321, + "learning_rate": 3.580065958226457e-05, + "loss": 0.49, + "step": 1079 + }, + { + "epoch": 1.0682492581602374, + "grad_norm": 0.3129933723610313, + "learning_rate": 3.57823378526933e-05, + "loss": 0.443, + "step": 1080 + }, + { + "epoch": 1.0692383778437191, + "grad_norm": 0.44086395340998813, + "learning_rate": 3.576401612312202e-05, + "loss": 0.4624, + "step": 1081 + }, + { + "epoch": 1.0702274975272008, + "grad_norm": 0.3521648112115606, + "learning_rate": 3.5745694393550754e-05, + "loss": 0.492, + "step": 1082 + }, + { + "epoch": 1.0712166172106825, + "grad_norm": 0.34957384405320735, + "learning_rate": 3.572737266397948e-05, + "loss": 0.4699, + "step": 1083 + }, + { + "epoch": 1.0722057368941642, + "grad_norm": 0.4051017779258357, + "learning_rate": 3.5709050934408214e-05, + "loss": 0.4751, + "step": 1084 + }, + { + "epoch": 1.073194856577646, + "grad_norm": 0.3301743394383747, + "learning_rate": 3.5690729204836934e-05, + "loss": 0.4765, + "step": 1085 + }, + { + "epoch": 1.0741839762611276, + "grad_norm": 0.37660803140457133, + "learning_rate": 3.567240747526567e-05, + "loss": 0.4697, + "step": 1086 + }, + { + "epoch": 1.0751730959446093, + "grad_norm": 0.33541330344202874, + "learning_rate": 3.565408574569439e-05, + "loss": 0.3767, + "step": 1087 + }, + { + "epoch": 1.076162215628091, + "grad_norm": 0.31431860272850104, + "learning_rate": 3.5635764016123127e-05, + "loss": 0.4464, + "step": 1088 + }, + { + "epoch": 1.0771513353115727, + "grad_norm": 0.3822236919960425, + "learning_rate": 3.561744228655185e-05, + "loss": 0.417, + "step": 1089 + }, + { + "epoch": 1.0781404549950544, + "grad_norm": 0.3436590951082615, + "learning_rate": 3.559912055698058e-05, + "loss": 0.4275, + "step": 1090 + }, + { + "epoch": 1.079129574678536, + "grad_norm": 0.3085576056896924, + "learning_rate": 3.558079882740931e-05, + "loss": 0.3597, + "step": 1091 + }, + { + "epoch": 1.0801186943620178, + "grad_norm": 0.27756477292049936, + "learning_rate": 3.556247709783804e-05, + "loss": 0.3784, + "step": 1092 + }, + { + "epoch": 1.0811078140454995, + "grad_norm": 0.41722858436924626, + "learning_rate": 3.5544155368266765e-05, + "loss": 0.4965, + "step": 1093 + }, + { + "epoch": 1.0820969337289812, + "grad_norm": 0.3379090288767247, + "learning_rate": 3.552583363869549e-05, + "loss": 0.4804, + "step": 1094 + }, + { + "epoch": 1.083086053412463, + "grad_norm": 0.352934575359697, + "learning_rate": 3.5507511909124225e-05, + "loss": 0.388, + "step": 1095 + }, + { + "epoch": 1.0840751730959446, + "grad_norm": 2.8132481274950853, + "learning_rate": 3.548919017955295e-05, + "loss": 0.5017, + "step": 1096 + }, + { + "epoch": 1.0850642927794263, + "grad_norm": 0.45069457909979543, + "learning_rate": 3.547086844998168e-05, + "loss": 0.4375, + "step": 1097 + }, + { + "epoch": 1.086053412462908, + "grad_norm": 0.3169100654997534, + "learning_rate": 3.5452546720410404e-05, + "loss": 0.4386, + "step": 1098 + }, + { + "epoch": 1.0870425321463897, + "grad_norm": 3.015261219455845, + "learning_rate": 3.543422499083914e-05, + "loss": 0.6241, + "step": 1099 + }, + { + "epoch": 1.0880316518298714, + "grad_norm": 0.7258230030251859, + "learning_rate": 3.5415903261267864e-05, + "loss": 0.4545, + "step": 1100 + }, + { + "epoch": 1.089020771513353, + "grad_norm": 0.3799505369853889, + "learning_rate": 3.53975815316966e-05, + "loss": 0.4984, + "step": 1101 + }, + { + "epoch": 1.0900098911968348, + "grad_norm": 0.4198479739903225, + "learning_rate": 3.537925980212532e-05, + "loss": 0.4321, + "step": 1102 + }, + { + "epoch": 1.0909990108803165, + "grad_norm": 0.44474996311910875, + "learning_rate": 3.536093807255405e-05, + "loss": 0.385, + "step": 1103 + }, + { + "epoch": 1.0919881305637982, + "grad_norm": 0.34423848805072893, + "learning_rate": 3.534261634298278e-05, + "loss": 0.3839, + "step": 1104 + }, + { + "epoch": 1.0929772502472799, + "grad_norm": 0.4385335696893382, + "learning_rate": 3.532429461341151e-05, + "loss": 0.5029, + "step": 1105 + }, + { + "epoch": 1.0939663699307616, + "grad_norm": 0.44073911702739826, + "learning_rate": 3.5305972883840236e-05, + "loss": 0.4517, + "step": 1106 + }, + { + "epoch": 1.0949554896142433, + "grad_norm": 0.3661390388337049, + "learning_rate": 3.528765115426896e-05, + "loss": 0.4409, + "step": 1107 + }, + { + "epoch": 1.095944609297725, + "grad_norm": 0.3778751285178957, + "learning_rate": 3.5269329424697696e-05, + "loss": 0.4174, + "step": 1108 + }, + { + "epoch": 1.0969337289812067, + "grad_norm": 0.3484208155588471, + "learning_rate": 3.525100769512642e-05, + "loss": 0.4595, + "step": 1109 + }, + { + "epoch": 1.0979228486646884, + "grad_norm": 0.3721575495834725, + "learning_rate": 3.523268596555515e-05, + "loss": 0.4108, + "step": 1110 + }, + { + "epoch": 1.09891196834817, + "grad_norm": 0.3260714299366415, + "learning_rate": 3.5214364235983875e-05, + "loss": 0.4622, + "step": 1111 + }, + { + "epoch": 1.0999010880316518, + "grad_norm": 0.3354899051869184, + "learning_rate": 3.519604250641261e-05, + "loss": 0.4113, + "step": 1112 + }, + { + "epoch": 1.1008902077151335, + "grad_norm": 0.2888023111039527, + "learning_rate": 3.5177720776841334e-05, + "loss": 0.3783, + "step": 1113 + }, + { + "epoch": 1.1018793273986152, + "grad_norm": 0.3379822216843918, + "learning_rate": 3.515939904727006e-05, + "loss": 0.5117, + "step": 1114 + }, + { + "epoch": 1.1028684470820969, + "grad_norm": 0.3591859612245374, + "learning_rate": 3.5141077317698794e-05, + "loss": 0.487, + "step": 1115 + }, + { + "epoch": 1.1038575667655786, + "grad_norm": 0.30210957464850935, + "learning_rate": 3.512275558812752e-05, + "loss": 0.4574, + "step": 1116 + }, + { + "epoch": 1.1048466864490603, + "grad_norm": 0.3056806916327237, + "learning_rate": 3.5104433858556254e-05, + "loss": 0.4382, + "step": 1117 + }, + { + "epoch": 1.105835806132542, + "grad_norm": 0.3406702445819911, + "learning_rate": 3.508611212898498e-05, + "loss": 0.4476, + "step": 1118 + }, + { + "epoch": 1.1068249258160237, + "grad_norm": 0.31733420049096794, + "learning_rate": 3.5067790399413706e-05, + "loss": 0.454, + "step": 1119 + }, + { + "epoch": 1.1078140454995054, + "grad_norm": 0.3577310727863552, + "learning_rate": 3.504946866984243e-05, + "loss": 0.486, + "step": 1120 + }, + { + "epoch": 1.108803165182987, + "grad_norm": 0.39669890919733924, + "learning_rate": 3.5031146940271166e-05, + "loss": 0.5032, + "step": 1121 + }, + { + "epoch": 1.1097922848664687, + "grad_norm": 0.36109756118213504, + "learning_rate": 3.501282521069989e-05, + "loss": 0.4855, + "step": 1122 + }, + { + "epoch": 1.1107814045499504, + "grad_norm": 0.41157327887209877, + "learning_rate": 3.499450348112862e-05, + "loss": 0.492, + "step": 1123 + }, + { + "epoch": 1.1117705242334321, + "grad_norm": 0.8883017039989023, + "learning_rate": 3.4976181751557345e-05, + "loss": 0.4755, + "step": 1124 + }, + { + "epoch": 1.1127596439169138, + "grad_norm": 0.4045680018041389, + "learning_rate": 3.495786002198608e-05, + "loss": 0.5405, + "step": 1125 + }, + { + "epoch": 1.1137487636003955, + "grad_norm": 0.46285838105173954, + "learning_rate": 3.4939538292414805e-05, + "loss": 0.4135, + "step": 1126 + }, + { + "epoch": 1.1147378832838775, + "grad_norm": 0.4009969222725594, + "learning_rate": 3.492121656284353e-05, + "loss": 0.4772, + "step": 1127 + }, + { + "epoch": 1.115727002967359, + "grad_norm": 0.3744825716220794, + "learning_rate": 3.4902894833272265e-05, + "loss": 0.4132, + "step": 1128 + }, + { + "epoch": 1.1167161226508409, + "grad_norm": 0.4811370559022165, + "learning_rate": 3.488457310370099e-05, + "loss": 0.4486, + "step": 1129 + }, + { + "epoch": 1.1177052423343223, + "grad_norm": 0.32402321215164476, + "learning_rate": 3.4866251374129724e-05, + "loss": 0.4404, + "step": 1130 + }, + { + "epoch": 1.1186943620178043, + "grad_norm": 0.35736672487594145, + "learning_rate": 3.4847929644558444e-05, + "loss": 0.4517, + "step": 1131 + }, + { + "epoch": 1.119683481701286, + "grad_norm": 0.43284952844544416, + "learning_rate": 3.482960791498718e-05, + "loss": 0.432, + "step": 1132 + }, + { + "epoch": 1.1206726013847677, + "grad_norm": 0.33367942093658537, + "learning_rate": 3.4811286185415903e-05, + "loss": 0.4439, + "step": 1133 + }, + { + "epoch": 1.1216617210682494, + "grad_norm": 0.34533013092078974, + "learning_rate": 3.479296445584464e-05, + "loss": 0.4537, + "step": 1134 + }, + { + "epoch": 1.122650840751731, + "grad_norm": 0.4103596180059782, + "learning_rate": 3.477464272627336e-05, + "loss": 0.4401, + "step": 1135 + }, + { + "epoch": 1.1236399604352127, + "grad_norm": 0.3508858030722054, + "learning_rate": 3.475632099670209e-05, + "loss": 0.4513, + "step": 1136 + }, + { + "epoch": 1.1246290801186944, + "grad_norm": 0.2644357297408127, + "learning_rate": 3.4737999267130816e-05, + "loss": 0.4285, + "step": 1137 + }, + { + "epoch": 1.1256181998021761, + "grad_norm": 0.8993722168956318, + "learning_rate": 3.471967753755955e-05, + "loss": 0.5024, + "step": 1138 + }, + { + "epoch": 1.1266073194856578, + "grad_norm": 0.40831707081279583, + "learning_rate": 3.4701355807988275e-05, + "loss": 0.455, + "step": 1139 + }, + { + "epoch": 1.1275964391691395, + "grad_norm": 0.3182793210526495, + "learning_rate": 3.4683034078417e-05, + "loss": 0.4162, + "step": 1140 + }, + { + "epoch": 1.1285855588526212, + "grad_norm": 0.3821840881244759, + "learning_rate": 3.4664712348845735e-05, + "loss": 0.4608, + "step": 1141 + }, + { + "epoch": 1.129574678536103, + "grad_norm": 0.4427568558577197, + "learning_rate": 3.464639061927446e-05, + "loss": 0.5133, + "step": 1142 + }, + { + "epoch": 1.1305637982195846, + "grad_norm": 0.9566817035370797, + "learning_rate": 3.462806888970319e-05, + "loss": 0.4638, + "step": 1143 + }, + { + "epoch": 1.1315529179030663, + "grad_norm": 0.6038194624696839, + "learning_rate": 3.4609747160131914e-05, + "loss": 0.3681, + "step": 1144 + }, + { + "epoch": 1.132542037586548, + "grad_norm": 0.6807613022411125, + "learning_rate": 3.459142543056065e-05, + "loss": 0.4679, + "step": 1145 + }, + { + "epoch": 1.1335311572700297, + "grad_norm": 0.4115234393020162, + "learning_rate": 3.4573103700989374e-05, + "loss": 0.4422, + "step": 1146 + }, + { + "epoch": 1.1345202769535114, + "grad_norm": 0.4341844759408288, + "learning_rate": 3.455478197141811e-05, + "loss": 0.4423, + "step": 1147 + }, + { + "epoch": 1.1355093966369931, + "grad_norm": 0.5051556302844019, + "learning_rate": 3.453646024184683e-05, + "loss": 0.4541, + "step": 1148 + }, + { + "epoch": 1.1364985163204748, + "grad_norm": 0.3758242355747177, + "learning_rate": 3.451813851227556e-05, + "loss": 0.4276, + "step": 1149 + }, + { + "epoch": 1.1374876360039565, + "grad_norm": 0.3320259649587403, + "learning_rate": 3.4499816782704286e-05, + "loss": 0.4316, + "step": 1150 + }, + { + "epoch": 1.1384767556874382, + "grad_norm": 0.45695875442585837, + "learning_rate": 3.448149505313302e-05, + "loss": 0.4378, + "step": 1151 + }, + { + "epoch": 1.13946587537092, + "grad_norm": 0.43478750136447, + "learning_rate": 3.4463173323561746e-05, + "loss": 0.5218, + "step": 1152 + }, + { + "epoch": 1.1404549950544016, + "grad_norm": 0.8782929825237592, + "learning_rate": 3.444485159399047e-05, + "loss": 0.4065, + "step": 1153 + }, + { + "epoch": 1.1414441147378833, + "grad_norm": 0.48626073683964544, + "learning_rate": 3.4426529864419206e-05, + "loss": 0.4389, + "step": 1154 + }, + { + "epoch": 1.142433234421365, + "grad_norm": 0.49062321657072966, + "learning_rate": 3.440820813484793e-05, + "loss": 0.4402, + "step": 1155 + }, + { + "epoch": 1.1434223541048467, + "grad_norm": 3.9259528695103088, + "learning_rate": 3.438988640527666e-05, + "loss": 0.5033, + "step": 1156 + }, + { + "epoch": 1.1444114737883284, + "grad_norm": 0.4908247634768901, + "learning_rate": 3.4371564675705385e-05, + "loss": 0.4815, + "step": 1157 + }, + { + "epoch": 1.1454005934718101, + "grad_norm": 0.44304978825609653, + "learning_rate": 3.435324294613412e-05, + "loss": 0.3978, + "step": 1158 + }, + { + "epoch": 1.1463897131552918, + "grad_norm": 0.32374507014467085, + "learning_rate": 3.4334921216562844e-05, + "loss": 0.4137, + "step": 1159 + }, + { + "epoch": 1.1473788328387735, + "grad_norm": 0.42036545626914656, + "learning_rate": 3.431659948699157e-05, + "loss": 0.4604, + "step": 1160 + }, + { + "epoch": 1.1483679525222552, + "grad_norm": 0.4166874208449986, + "learning_rate": 3.42982777574203e-05, + "loss": 0.4776, + "step": 1161 + }, + { + "epoch": 1.149357072205737, + "grad_norm": 0.31668752094571023, + "learning_rate": 3.427995602784903e-05, + "loss": 0.4456, + "step": 1162 + }, + { + "epoch": 1.1503461918892186, + "grad_norm": 0.43054522427624803, + "learning_rate": 3.4261634298277764e-05, + "loss": 0.443, + "step": 1163 + }, + { + "epoch": 1.1513353115727003, + "grad_norm": 0.42615533383169335, + "learning_rate": 3.424331256870649e-05, + "loss": 0.4032, + "step": 1164 + }, + { + "epoch": 1.152324431256182, + "grad_norm": 0.3387153691463948, + "learning_rate": 3.4224990839135217e-05, + "loss": 0.4377, + "step": 1165 + }, + { + "epoch": 1.1533135509396637, + "grad_norm": 0.36898697794603996, + "learning_rate": 3.420666910956394e-05, + "loss": 0.3753, + "step": 1166 + }, + { + "epoch": 1.1543026706231454, + "grad_norm": 0.3421939288510364, + "learning_rate": 3.4188347379992676e-05, + "loss": 0.4382, + "step": 1167 + }, + { + "epoch": 1.155291790306627, + "grad_norm": 0.31926334513776805, + "learning_rate": 3.41700256504214e-05, + "loss": 0.4165, + "step": 1168 + }, + { + "epoch": 1.1562809099901088, + "grad_norm": 0.40484584927634126, + "learning_rate": 3.415170392085013e-05, + "loss": 0.4012, + "step": 1169 + }, + { + "epoch": 1.1572700296735905, + "grad_norm": 0.38104083917952464, + "learning_rate": 3.4133382191278855e-05, + "loss": 0.4316, + "step": 1170 + }, + { + "epoch": 1.1582591493570722, + "grad_norm": 0.3709082882174054, + "learning_rate": 3.411506046170759e-05, + "loss": 0.4243, + "step": 1171 + }, + { + "epoch": 1.159248269040554, + "grad_norm": 0.38180703630514035, + "learning_rate": 3.4096738732136315e-05, + "loss": 0.4204, + "step": 1172 + }, + { + "epoch": 1.1602373887240356, + "grad_norm": 0.35066602597324176, + "learning_rate": 3.407841700256504e-05, + "loss": 0.4517, + "step": 1173 + }, + { + "epoch": 1.1612265084075173, + "grad_norm": 0.3409612624272448, + "learning_rate": 3.4060095272993775e-05, + "loss": 0.4681, + "step": 1174 + }, + { + "epoch": 1.162215628090999, + "grad_norm": 0.3242800885803111, + "learning_rate": 3.40417735434225e-05, + "loss": 0.5142, + "step": 1175 + }, + { + "epoch": 1.1632047477744807, + "grad_norm": 0.3999158161657895, + "learning_rate": 3.4023451813851234e-05, + "loss": 0.4721, + "step": 1176 + }, + { + "epoch": 1.1641938674579624, + "grad_norm": 0.3057813262892474, + "learning_rate": 3.4005130084279954e-05, + "loss": 0.4081, + "step": 1177 + }, + { + "epoch": 1.165182987141444, + "grad_norm": 0.29265432398856944, + "learning_rate": 3.398680835470869e-05, + "loss": 0.4741, + "step": 1178 + }, + { + "epoch": 1.1661721068249258, + "grad_norm": 0.35031812706583215, + "learning_rate": 3.3968486625137414e-05, + "loss": 0.4069, + "step": 1179 + }, + { + "epoch": 1.1671612265084075, + "grad_norm": 0.3199265954315745, + "learning_rate": 3.395016489556615e-05, + "loss": 0.4153, + "step": 1180 + }, + { + "epoch": 1.1681503461918892, + "grad_norm": 0.3322579671121011, + "learning_rate": 3.393184316599487e-05, + "loss": 0.4966, + "step": 1181 + }, + { + "epoch": 1.1691394658753709, + "grad_norm": 0.31674223480365804, + "learning_rate": 3.39135214364236e-05, + "loss": 0.4019, + "step": 1182 + }, + { + "epoch": 1.1701285855588526, + "grad_norm": 0.3452256992238975, + "learning_rate": 3.3895199706852326e-05, + "loss": 0.508, + "step": 1183 + }, + { + "epoch": 1.1711177052423343, + "grad_norm": 0.3126306353162765, + "learning_rate": 3.387687797728106e-05, + "loss": 0.4411, + "step": 1184 + }, + { + "epoch": 1.172106824925816, + "grad_norm": 0.3504430898459003, + "learning_rate": 3.3858556247709786e-05, + "loss": 0.4409, + "step": 1185 + }, + { + "epoch": 1.1730959446092977, + "grad_norm": 0.32158432591053415, + "learning_rate": 3.384023451813851e-05, + "loss": 0.4216, + "step": 1186 + }, + { + "epoch": 1.1740850642927794, + "grad_norm": 0.33575292344710606, + "learning_rate": 3.3821912788567245e-05, + "loss": 0.4653, + "step": 1187 + }, + { + "epoch": 1.175074183976261, + "grad_norm": 0.2953528980384244, + "learning_rate": 3.380359105899597e-05, + "loss": 0.49, + "step": 1188 + }, + { + "epoch": 1.1760633036597428, + "grad_norm": 0.44904372451288704, + "learning_rate": 3.37852693294247e-05, + "loss": 0.49, + "step": 1189 + }, + { + "epoch": 1.1770524233432245, + "grad_norm": 0.3985352253667881, + "learning_rate": 3.3766947599853424e-05, + "loss": 0.4351, + "step": 1190 + }, + { + "epoch": 1.1780415430267062, + "grad_norm": 0.36529013646478736, + "learning_rate": 3.374862587028216e-05, + "loss": 0.3994, + "step": 1191 + }, + { + "epoch": 1.1790306627101879, + "grad_norm": 0.4181274052477301, + "learning_rate": 3.3730304140710884e-05, + "loss": 0.461, + "step": 1192 + }, + { + "epoch": 1.1800197823936696, + "grad_norm": 0.37986297109107886, + "learning_rate": 3.371198241113962e-05, + "loss": 0.4425, + "step": 1193 + }, + { + "epoch": 1.1810089020771513, + "grad_norm": 0.38662532285644546, + "learning_rate": 3.369366068156834e-05, + "loss": 0.4557, + "step": 1194 + }, + { + "epoch": 1.181998021760633, + "grad_norm": 0.320518534529602, + "learning_rate": 3.367533895199707e-05, + "loss": 0.387, + "step": 1195 + }, + { + "epoch": 1.1829871414441147, + "grad_norm": 0.3794192278894525, + "learning_rate": 3.3657017222425796e-05, + "loss": 0.4355, + "step": 1196 + }, + { + "epoch": 1.1839762611275964, + "grad_norm": 0.3258255894383896, + "learning_rate": 3.363869549285453e-05, + "loss": 0.4617, + "step": 1197 + }, + { + "epoch": 1.184965380811078, + "grad_norm": 0.3753454976012276, + "learning_rate": 3.3620373763283256e-05, + "loss": 0.4796, + "step": 1198 + }, + { + "epoch": 1.1859545004945597, + "grad_norm": 0.31066105843247516, + "learning_rate": 3.360205203371198e-05, + "loss": 0.4841, + "step": 1199 + }, + { + "epoch": 1.1869436201780414, + "grad_norm": 0.37065175137983325, + "learning_rate": 3.3583730304140716e-05, + "loss": 0.4436, + "step": 1200 + }, + { + "epoch": 1.1879327398615231, + "grad_norm": 0.38153937820459927, + "learning_rate": 3.356540857456944e-05, + "loss": 0.4449, + "step": 1201 + }, + { + "epoch": 1.188921859545005, + "grad_norm": 0.31499452837981, + "learning_rate": 3.354708684499817e-05, + "loss": 0.3774, + "step": 1202 + }, + { + "epoch": 1.1899109792284865, + "grad_norm": 0.36683386599623874, + "learning_rate": 3.3528765115426895e-05, + "loss": 0.4489, + "step": 1203 + }, + { + "epoch": 1.1909000989119685, + "grad_norm": 0.3929092276173119, + "learning_rate": 3.351044338585563e-05, + "loss": 0.4983, + "step": 1204 + }, + { + "epoch": 1.19188921859545, + "grad_norm": 0.33361040009672116, + "learning_rate": 3.3492121656284355e-05, + "loss": 0.404, + "step": 1205 + }, + { + "epoch": 1.1928783382789319, + "grad_norm": 0.32846256380043704, + "learning_rate": 3.347379992671308e-05, + "loss": 0.4868, + "step": 1206 + }, + { + "epoch": 1.1938674579624133, + "grad_norm": 0.3340382987952063, + "learning_rate": 3.345547819714181e-05, + "loss": 0.4268, + "step": 1207 + }, + { + "epoch": 1.1948565776458953, + "grad_norm": 0.37162462421580916, + "learning_rate": 3.343715646757054e-05, + "loss": 0.4391, + "step": 1208 + }, + { + "epoch": 1.1958456973293767, + "grad_norm": 0.3365741665506994, + "learning_rate": 3.341883473799927e-05, + "loss": 0.4853, + "step": 1209 + }, + { + "epoch": 1.1968348170128587, + "grad_norm": 0.38473036798920535, + "learning_rate": 3.3400513008428e-05, + "loss": 0.4402, + "step": 1210 + }, + { + "epoch": 1.1978239366963404, + "grad_norm": 0.2943755821058674, + "learning_rate": 3.338219127885673e-05, + "loss": 0.4238, + "step": 1211 + }, + { + "epoch": 1.198813056379822, + "grad_norm": 0.3355562973386804, + "learning_rate": 3.336386954928545e-05, + "loss": 0.3917, + "step": 1212 + }, + { + "epoch": 1.1998021760633037, + "grad_norm": 0.317119955078073, + "learning_rate": 3.3345547819714186e-05, + "loss": 0.4691, + "step": 1213 + }, + { + "epoch": 1.2007912957467854, + "grad_norm": 0.33025445538947157, + "learning_rate": 3.332722609014291e-05, + "loss": 0.5038, + "step": 1214 + }, + { + "epoch": 1.2017804154302671, + "grad_norm": 0.3340396874499203, + "learning_rate": 3.330890436057164e-05, + "loss": 0.4274, + "step": 1215 + }, + { + "epoch": 1.2027695351137488, + "grad_norm": 0.34655165596355814, + "learning_rate": 3.3290582631000365e-05, + "loss": 0.4066, + "step": 1216 + }, + { + "epoch": 1.2037586547972305, + "grad_norm": 0.6174225922607776, + "learning_rate": 3.32722609014291e-05, + "loss": 0.4073, + "step": 1217 + }, + { + "epoch": 1.2047477744807122, + "grad_norm": 0.3327762274517774, + "learning_rate": 3.3253939171857825e-05, + "loss": 0.446, + "step": 1218 + }, + { + "epoch": 1.205736894164194, + "grad_norm": 0.32324569403115144, + "learning_rate": 3.323561744228655e-05, + "loss": 0.3973, + "step": 1219 + }, + { + "epoch": 1.2067260138476756, + "grad_norm": 0.3183062423357114, + "learning_rate": 3.321729571271528e-05, + "loss": 0.3924, + "step": 1220 + }, + { + "epoch": 1.2077151335311573, + "grad_norm": 0.3100295021920569, + "learning_rate": 3.319897398314401e-05, + "loss": 0.4126, + "step": 1221 + }, + { + "epoch": 1.208704253214639, + "grad_norm": 0.3373361154076598, + "learning_rate": 3.3180652253572744e-05, + "loss": 0.4281, + "step": 1222 + }, + { + "epoch": 1.2096933728981207, + "grad_norm": 0.3765180938062049, + "learning_rate": 3.3162330524001464e-05, + "loss": 0.5078, + "step": 1223 + }, + { + "epoch": 1.2106824925816024, + "grad_norm": 0.3143717459812389, + "learning_rate": 3.31440087944302e-05, + "loss": 0.4406, + "step": 1224 + }, + { + "epoch": 1.2116716122650841, + "grad_norm": 0.43796853097237903, + "learning_rate": 3.3125687064858924e-05, + "loss": 0.4843, + "step": 1225 + }, + { + "epoch": 1.2126607319485658, + "grad_norm": 0.27756885414505256, + "learning_rate": 3.310736533528766e-05, + "loss": 0.3495, + "step": 1226 + }, + { + "epoch": 1.2136498516320475, + "grad_norm": 0.34846044043895186, + "learning_rate": 3.308904360571638e-05, + "loss": 0.4714, + "step": 1227 + }, + { + "epoch": 1.2146389713155292, + "grad_norm": 0.2792141653842612, + "learning_rate": 3.307072187614511e-05, + "loss": 0.435, + "step": 1228 + }, + { + "epoch": 1.215628090999011, + "grad_norm": 0.3247383857479883, + "learning_rate": 3.3052400146573836e-05, + "loss": 0.4369, + "step": 1229 + }, + { + "epoch": 1.2166172106824926, + "grad_norm": 0.31934633509474236, + "learning_rate": 3.303407841700257e-05, + "loss": 0.4247, + "step": 1230 + }, + { + "epoch": 1.2176063303659743, + "grad_norm": 0.3117420006063186, + "learning_rate": 3.3015756687431296e-05, + "loss": 0.4221, + "step": 1231 + }, + { + "epoch": 1.218595450049456, + "grad_norm": 0.37143959870180393, + "learning_rate": 3.299743495786002e-05, + "loss": 0.4451, + "step": 1232 + }, + { + "epoch": 1.2195845697329377, + "grad_norm": 0.2747122535928208, + "learning_rate": 3.297911322828875e-05, + "loss": 0.4168, + "step": 1233 + }, + { + "epoch": 1.2205736894164194, + "grad_norm": 0.32821875211096874, + "learning_rate": 3.296079149871748e-05, + "loss": 0.4876, + "step": 1234 + }, + { + "epoch": 1.2215628090999011, + "grad_norm": 0.2848558407621968, + "learning_rate": 3.294246976914621e-05, + "loss": 0.4402, + "step": 1235 + }, + { + "epoch": 1.2225519287833828, + "grad_norm": 0.2885008544507463, + "learning_rate": 3.2924148039574935e-05, + "loss": 0.4349, + "step": 1236 + }, + { + "epoch": 1.2235410484668645, + "grad_norm": 0.33320909099352525, + "learning_rate": 3.290582631000367e-05, + "loss": 0.4433, + "step": 1237 + }, + { + "epoch": 1.2245301681503462, + "grad_norm": 0.25857871277266625, + "learning_rate": 3.2887504580432394e-05, + "loss": 0.4339, + "step": 1238 + }, + { + "epoch": 1.225519287833828, + "grad_norm": 0.32250835248537585, + "learning_rate": 3.286918285086113e-05, + "loss": 0.4378, + "step": 1239 + }, + { + "epoch": 1.2265084075173096, + "grad_norm": 0.30737128663542473, + "learning_rate": 3.285086112128985e-05, + "loss": 0.4577, + "step": 1240 + }, + { + "epoch": 1.2274975272007913, + "grad_norm": 0.2742545175524849, + "learning_rate": 3.283253939171858e-05, + "loss": 0.428, + "step": 1241 + }, + { + "epoch": 1.228486646884273, + "grad_norm": 0.28951560409808336, + "learning_rate": 3.2814217662147307e-05, + "loss": 0.3863, + "step": 1242 + }, + { + "epoch": 1.2294757665677547, + "grad_norm": 0.7199122350538452, + "learning_rate": 3.279589593257604e-05, + "loss": 0.4365, + "step": 1243 + }, + { + "epoch": 1.2304648862512364, + "grad_norm": 0.25600755117681134, + "learning_rate": 3.277757420300476e-05, + "loss": 0.4074, + "step": 1244 + }, + { + "epoch": 1.231454005934718, + "grad_norm": 0.8811383887062425, + "learning_rate": 3.275925247343349e-05, + "loss": 0.481, + "step": 1245 + }, + { + "epoch": 1.2324431256181998, + "grad_norm": 0.3023345814779684, + "learning_rate": 3.2740930743862226e-05, + "loss": 0.4768, + "step": 1246 + }, + { + "epoch": 1.2334322453016815, + "grad_norm": 0.2755925407169624, + "learning_rate": 3.272260901429095e-05, + "loss": 0.4027, + "step": 1247 + }, + { + "epoch": 1.2344213649851632, + "grad_norm": 11.463153475362521, + "learning_rate": 3.270428728471968e-05, + "loss": 0.4636, + "step": 1248 + }, + { + "epoch": 1.235410484668645, + "grad_norm": 0.45187999656619815, + "learning_rate": 3.2685965555148405e-05, + "loss": 0.4927, + "step": 1249 + }, + { + "epoch": 1.2363996043521266, + "grad_norm": 0.32892957675978074, + "learning_rate": 3.266764382557714e-05, + "loss": 0.4548, + "step": 1250 + }, + { + "epoch": 1.2373887240356083, + "grad_norm": 0.3303929130848598, + "learning_rate": 3.2649322096005865e-05, + "loss": 0.4709, + "step": 1251 + }, + { + "epoch": 1.23837784371909, + "grad_norm": 0.3373802532562506, + "learning_rate": 3.263100036643459e-05, + "loss": 0.4582, + "step": 1252 + }, + { + "epoch": 1.2393669634025717, + "grad_norm": 0.3779420909570119, + "learning_rate": 3.261267863686332e-05, + "loss": 0.4451, + "step": 1253 + }, + { + "epoch": 1.2403560830860534, + "grad_norm": 0.291216947790094, + "learning_rate": 3.259435690729205e-05, + "loss": 0.4013, + "step": 1254 + }, + { + "epoch": 1.241345202769535, + "grad_norm": 0.34985117379943337, + "learning_rate": 3.257603517772078e-05, + "loss": 0.4513, + "step": 1255 + }, + { + "epoch": 1.2423343224530168, + "grad_norm": 0.31095271457516216, + "learning_rate": 3.255771344814951e-05, + "loss": 0.4411, + "step": 1256 + }, + { + "epoch": 1.2433234421364985, + "grad_norm": 1.047735080448275, + "learning_rate": 3.253939171857823e-05, + "loss": 0.5013, + "step": 1257 + }, + { + "epoch": 1.2443125618199802, + "grad_norm": 0.27825085687486434, + "learning_rate": 3.252106998900696e-05, + "loss": 0.4161, + "step": 1258 + }, + { + "epoch": 1.2453016815034619, + "grad_norm": 0.3636283861995447, + "learning_rate": 3.2502748259435696e-05, + "loss": 0.5048, + "step": 1259 + }, + { + "epoch": 1.2462908011869436, + "grad_norm": 0.2972461553564986, + "learning_rate": 3.248442652986442e-05, + "loss": 0.4577, + "step": 1260 + }, + { + "epoch": 1.2472799208704253, + "grad_norm": 0.4663431964179042, + "learning_rate": 3.246610480029315e-05, + "loss": 0.4355, + "step": 1261 + }, + { + "epoch": 1.248269040553907, + "grad_norm": 0.7010092875578634, + "learning_rate": 3.2447783070721876e-05, + "loss": 0.4501, + "step": 1262 + }, + { + "epoch": 1.2492581602373887, + "grad_norm": 0.301261398514134, + "learning_rate": 3.242946134115061e-05, + "loss": 0.4805, + "step": 1263 + }, + { + "epoch": 1.2502472799208704, + "grad_norm": 0.3418549326196357, + "learning_rate": 3.2411139611579335e-05, + "loss": 0.4795, + "step": 1264 + }, + { + "epoch": 1.251236399604352, + "grad_norm": 0.29526440353783767, + "learning_rate": 3.239281788200806e-05, + "loss": 0.4332, + "step": 1265 + }, + { + "epoch": 1.2522255192878338, + "grad_norm": 0.2967530321345406, + "learning_rate": 3.237449615243679e-05, + "loss": 0.3992, + "step": 1266 + }, + { + "epoch": 1.2532146389713155, + "grad_norm": 0.30327817973420274, + "learning_rate": 3.235617442286552e-05, + "loss": 0.4832, + "step": 1267 + }, + { + "epoch": 1.2542037586547972, + "grad_norm": 0.28390309436771854, + "learning_rate": 3.233785269329425e-05, + "loss": 0.394, + "step": 1268 + }, + { + "epoch": 1.2551928783382789, + "grad_norm": 0.3640976735315917, + "learning_rate": 3.2319530963722974e-05, + "loss": 0.4319, + "step": 1269 + }, + { + "epoch": 1.2561819980217606, + "grad_norm": 0.3039414854494338, + "learning_rate": 3.230120923415171e-05, + "loss": 0.4777, + "step": 1270 + }, + { + "epoch": 1.2571711177052423, + "grad_norm": 0.2797669473882524, + "learning_rate": 3.2282887504580434e-05, + "loss": 0.4259, + "step": 1271 + }, + { + "epoch": 1.258160237388724, + "grad_norm": 0.3431193474424225, + "learning_rate": 3.226456577500917e-05, + "loss": 0.4484, + "step": 1272 + }, + { + "epoch": 1.2591493570722057, + "grad_norm": 0.2748018185129936, + "learning_rate": 3.224624404543789e-05, + "loss": 0.4019, + "step": 1273 + }, + { + "epoch": 1.2601384767556874, + "grad_norm": 0.3078022305195795, + "learning_rate": 3.222792231586662e-05, + "loss": 0.4357, + "step": 1274 + }, + { + "epoch": 1.2611275964391693, + "grad_norm": 0.28817276925435575, + "learning_rate": 3.2209600586295346e-05, + "loss": 0.4581, + "step": 1275 + }, + { + "epoch": 1.2621167161226508, + "grad_norm": 0.28915115293238075, + "learning_rate": 3.219127885672408e-05, + "loss": 0.4113, + "step": 1276 + }, + { + "epoch": 1.2631058358061327, + "grad_norm": 0.29431101862681536, + "learning_rate": 3.2172957127152806e-05, + "loss": 0.4844, + "step": 1277 + }, + { + "epoch": 1.2640949554896141, + "grad_norm": 0.31417644441909515, + "learning_rate": 3.215463539758153e-05, + "loss": 0.4359, + "step": 1278 + }, + { + "epoch": 1.265084075173096, + "grad_norm": 0.2932996151523536, + "learning_rate": 3.213631366801026e-05, + "loss": 0.4566, + "step": 1279 + }, + { + "epoch": 1.2660731948565775, + "grad_norm": 0.3544874502119, + "learning_rate": 3.211799193843899e-05, + "loss": 0.4528, + "step": 1280 + }, + { + "epoch": 1.2670623145400595, + "grad_norm": 0.31431737172579327, + "learning_rate": 3.209967020886772e-05, + "loss": 0.4873, + "step": 1281 + }, + { + "epoch": 1.268051434223541, + "grad_norm": 0.32083957972618893, + "learning_rate": 3.2081348479296445e-05, + "loss": 0.4405, + "step": 1282 + }, + { + "epoch": 1.2690405539070229, + "grad_norm": 0.28531911086752, + "learning_rate": 3.206302674972518e-05, + "loss": 0.4763, + "step": 1283 + }, + { + "epoch": 1.2700296735905043, + "grad_norm": 0.39038530630890933, + "learning_rate": 3.2044705020153904e-05, + "loss": 0.487, + "step": 1284 + }, + { + "epoch": 1.2710187932739863, + "grad_norm": 0.3231694114187269, + "learning_rate": 3.202638329058264e-05, + "loss": 0.4575, + "step": 1285 + }, + { + "epoch": 1.2720079129574677, + "grad_norm": 0.39483284030242527, + "learning_rate": 3.200806156101136e-05, + "loss": 0.5078, + "step": 1286 + }, + { + "epoch": 1.2729970326409497, + "grad_norm": 0.2624573775627927, + "learning_rate": 3.198973983144009e-05, + "loss": 0.4189, + "step": 1287 + }, + { + "epoch": 1.2739861523244311, + "grad_norm": 0.3539064045837323, + "learning_rate": 3.197141810186882e-05, + "loss": 0.4083, + "step": 1288 + }, + { + "epoch": 1.274975272007913, + "grad_norm": 0.41113892601872426, + "learning_rate": 3.195309637229755e-05, + "loss": 0.4359, + "step": 1289 + }, + { + "epoch": 1.2759643916913945, + "grad_norm": 0.3263835404973625, + "learning_rate": 3.193477464272627e-05, + "loss": 0.4466, + "step": 1290 + }, + { + "epoch": 1.2769535113748764, + "grad_norm": 0.3521424288990003, + "learning_rate": 3.1916452913155e-05, + "loss": 0.4346, + "step": 1291 + }, + { + "epoch": 1.277942631058358, + "grad_norm": 0.32960482803892793, + "learning_rate": 3.189813118358373e-05, + "loss": 0.4081, + "step": 1292 + }, + { + "epoch": 1.2789317507418398, + "grad_norm": 0.37621318875682574, + "learning_rate": 3.187980945401246e-05, + "loss": 0.5071, + "step": 1293 + }, + { + "epoch": 1.2799208704253215, + "grad_norm": 0.2842212763877933, + "learning_rate": 3.186148772444119e-05, + "loss": 0.4383, + "step": 1294 + }, + { + "epoch": 1.2809099901088032, + "grad_norm": 0.4185836752845537, + "learning_rate": 3.1843165994869915e-05, + "loss": 0.4605, + "step": 1295 + }, + { + "epoch": 1.281899109792285, + "grad_norm": 0.2864590173818887, + "learning_rate": 3.182484426529865e-05, + "loss": 0.4115, + "step": 1296 + }, + { + "epoch": 1.2828882294757666, + "grad_norm": 0.31251818275646565, + "learning_rate": 3.1806522535727375e-05, + "loss": 0.4749, + "step": 1297 + }, + { + "epoch": 1.2838773491592483, + "grad_norm": 0.40375308662312326, + "learning_rate": 3.17882008061561e-05, + "loss": 0.4805, + "step": 1298 + }, + { + "epoch": 1.28486646884273, + "grad_norm": 0.2787733861619942, + "learning_rate": 3.176987907658483e-05, + "loss": 0.4695, + "step": 1299 + }, + { + "epoch": 1.2858555885262117, + "grad_norm": 0.3390155103152818, + "learning_rate": 3.175155734701356e-05, + "loss": 0.4799, + "step": 1300 + }, + { + "epoch": 1.2868447082096934, + "grad_norm": 0.39915923407897047, + "learning_rate": 3.173323561744229e-05, + "loss": 0.4911, + "step": 1301 + }, + { + "epoch": 1.2878338278931751, + "grad_norm": 0.2829054197112506, + "learning_rate": 3.171491388787102e-05, + "loss": 0.3704, + "step": 1302 + }, + { + "epoch": 1.2888229475766568, + "grad_norm": 0.3354351078071799, + "learning_rate": 3.169659215829974e-05, + "loss": 0.4108, + "step": 1303 + }, + { + "epoch": 1.2898120672601385, + "grad_norm": 0.2977538229373433, + "learning_rate": 3.167827042872847e-05, + "loss": 0.4279, + "step": 1304 + }, + { + "epoch": 1.2908011869436202, + "grad_norm": 0.328567314688651, + "learning_rate": 3.16599486991572e-05, + "loss": 0.4056, + "step": 1305 + }, + { + "epoch": 1.291790306627102, + "grad_norm": 0.3262674751990915, + "learning_rate": 3.164162696958593e-05, + "loss": 0.4404, + "step": 1306 + }, + { + "epoch": 1.2927794263105836, + "grad_norm": 0.38924960367345507, + "learning_rate": 3.162330524001466e-05, + "loss": 0.4364, + "step": 1307 + }, + { + "epoch": 1.2937685459940653, + "grad_norm": 0.28787499890259116, + "learning_rate": 3.1604983510443386e-05, + "loss": 0.5439, + "step": 1308 + }, + { + "epoch": 1.294757665677547, + "grad_norm": 0.3356946307141818, + "learning_rate": 3.158666178087212e-05, + "loss": 0.4324, + "step": 1309 + }, + { + "epoch": 1.2957467853610287, + "grad_norm": 0.4879317138260975, + "learning_rate": 3.1568340051300845e-05, + "loss": 0.4807, + "step": 1310 + }, + { + "epoch": 1.2967359050445104, + "grad_norm": 0.2955688732632104, + "learning_rate": 3.155001832172957e-05, + "loss": 0.4036, + "step": 1311 + }, + { + "epoch": 1.2977250247279921, + "grad_norm": 0.4032945847602782, + "learning_rate": 3.15316965921583e-05, + "loss": 0.4401, + "step": 1312 + }, + { + "epoch": 1.2987141444114738, + "grad_norm": 0.3717861992107911, + "learning_rate": 3.151337486258703e-05, + "loss": 0.412, + "step": 1313 + }, + { + "epoch": 1.2997032640949555, + "grad_norm": 0.36230212240482274, + "learning_rate": 3.149505313301576e-05, + "loss": 0.4756, + "step": 1314 + }, + { + "epoch": 1.3006923837784372, + "grad_norm": 0.3219831393538455, + "learning_rate": 3.1476731403444484e-05, + "loss": 0.4307, + "step": 1315 + }, + { + "epoch": 1.301681503461919, + "grad_norm": 0.2980114239679098, + "learning_rate": 3.145840967387321e-05, + "loss": 0.3987, + "step": 1316 + }, + { + "epoch": 1.3026706231454006, + "grad_norm": 0.8151806056234506, + "learning_rate": 3.1440087944301944e-05, + "loss": 0.4737, + "step": 1317 + }, + { + "epoch": 1.3036597428288823, + "grad_norm": 0.4605566888952793, + "learning_rate": 3.142176621473068e-05, + "loss": 0.4641, + "step": 1318 + }, + { + "epoch": 1.304648862512364, + "grad_norm": 0.3980859249064098, + "learning_rate": 3.14034444851594e-05, + "loss": 0.459, + "step": 1319 + }, + { + "epoch": 1.3056379821958457, + "grad_norm": 0.2841614140383684, + "learning_rate": 3.138512275558813e-05, + "loss": 0.4057, + "step": 1320 + }, + { + "epoch": 1.3066271018793274, + "grad_norm": 1.9073972336742402, + "learning_rate": 3.1366801026016856e-05, + "loss": 0.4767, + "step": 1321 + }, + { + "epoch": 1.307616221562809, + "grad_norm": 0.5592709384548382, + "learning_rate": 3.134847929644559e-05, + "loss": 0.4819, + "step": 1322 + }, + { + "epoch": 1.3086053412462908, + "grad_norm": 0.3222748433424541, + "learning_rate": 3.1330157566874316e-05, + "loss": 0.4371, + "step": 1323 + }, + { + "epoch": 1.3095944609297725, + "grad_norm": 0.4650918381065197, + "learning_rate": 3.131183583730304e-05, + "loss": 0.415, + "step": 1324 + }, + { + "epoch": 1.3105835806132542, + "grad_norm": 0.4224117379256276, + "learning_rate": 3.129351410773177e-05, + "loss": 0.4566, + "step": 1325 + }, + { + "epoch": 1.311572700296736, + "grad_norm": 0.46207236337386226, + "learning_rate": 3.12751923781605e-05, + "loss": 0.5005, + "step": 1326 + }, + { + "epoch": 1.3125618199802176, + "grad_norm": 0.4683411564216293, + "learning_rate": 3.125687064858923e-05, + "loss": 0.4825, + "step": 1327 + }, + { + "epoch": 1.3135509396636993, + "grad_norm": 0.40426282504932587, + "learning_rate": 3.1238548919017955e-05, + "loss": 0.4031, + "step": 1328 + }, + { + "epoch": 1.314540059347181, + "grad_norm": 0.4937917446065795, + "learning_rate": 3.122022718944669e-05, + "loss": 0.4375, + "step": 1329 + }, + { + "epoch": 1.3155291790306627, + "grad_norm": 0.44951168903509353, + "learning_rate": 3.1201905459875414e-05, + "loss": 0.4931, + "step": 1330 + }, + { + "epoch": 1.3165182987141444, + "grad_norm": 0.391635440933602, + "learning_rate": 3.118358373030415e-05, + "loss": 0.3879, + "step": 1331 + }, + { + "epoch": 1.317507418397626, + "grad_norm": 0.5580933846109488, + "learning_rate": 3.116526200073287e-05, + "loss": 0.4064, + "step": 1332 + }, + { + "epoch": 1.3184965380811078, + "grad_norm": 0.26495237831546364, + "learning_rate": 3.11469402711616e-05, + "loss": 0.3427, + "step": 1333 + }, + { + "epoch": 1.3194856577645895, + "grad_norm": 0.4754556883385079, + "learning_rate": 3.112861854159033e-05, + "loss": 0.4223, + "step": 1334 + }, + { + "epoch": 1.3204747774480712, + "grad_norm": 0.3881819412366177, + "learning_rate": 3.111029681201906e-05, + "loss": 0.4623, + "step": 1335 + }, + { + "epoch": 1.3214638971315529, + "grad_norm": 0.27166678881531925, + "learning_rate": 3.109197508244778e-05, + "loss": 0.4131, + "step": 1336 + }, + { + "epoch": 1.3224530168150346, + "grad_norm": 0.40190887482300114, + "learning_rate": 3.107365335287651e-05, + "loss": 0.492, + "step": 1337 + }, + { + "epoch": 1.3234421364985163, + "grad_norm": 0.3539039447670423, + "learning_rate": 3.105533162330524e-05, + "loss": 0.4205, + "step": 1338 + }, + { + "epoch": 1.324431256181998, + "grad_norm": 0.35944898679004156, + "learning_rate": 3.103700989373397e-05, + "loss": 0.4791, + "step": 1339 + }, + { + "epoch": 1.3254203758654797, + "grad_norm": 0.405897222512928, + "learning_rate": 3.10186881641627e-05, + "loss": 0.5133, + "step": 1340 + }, + { + "epoch": 1.3264094955489614, + "grad_norm": 0.3149489860402367, + "learning_rate": 3.1000366434591425e-05, + "loss": 0.4603, + "step": 1341 + }, + { + "epoch": 1.327398615232443, + "grad_norm": 0.45252664436277323, + "learning_rate": 3.098204470502016e-05, + "loss": 0.5322, + "step": 1342 + }, + { + "epoch": 1.3283877349159248, + "grad_norm": 0.2880720148908769, + "learning_rate": 3.0963722975448885e-05, + "loss": 0.4097, + "step": 1343 + }, + { + "epoch": 1.3293768545994065, + "grad_norm": 0.43858458830839114, + "learning_rate": 3.094540124587761e-05, + "loss": 0.4331, + "step": 1344 + }, + { + "epoch": 1.3303659742828882, + "grad_norm": 0.2965447563557795, + "learning_rate": 3.092707951630634e-05, + "loss": 0.4524, + "step": 1345 + }, + { + "epoch": 1.3313550939663699, + "grad_norm": 0.3131743992651659, + "learning_rate": 3.090875778673507e-05, + "loss": 0.4266, + "step": 1346 + }, + { + "epoch": 1.3323442136498516, + "grad_norm": 0.35252494183868, + "learning_rate": 3.08904360571638e-05, + "loss": 0.4365, + "step": 1347 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.3138151302405501, + "learning_rate": 3.087211432759253e-05, + "loss": 0.4451, + "step": 1348 + }, + { + "epoch": 1.334322453016815, + "grad_norm": 0.3115085002074383, + "learning_rate": 3.085379259802125e-05, + "loss": 0.4425, + "step": 1349 + }, + { + "epoch": 1.3353115727002967, + "grad_norm": 0.30006114039678367, + "learning_rate": 3.083547086844998e-05, + "loss": 0.3935, + "step": 1350 + }, + { + "epoch": 1.3363006923837784, + "grad_norm": 0.28834125078240946, + "learning_rate": 3.081714913887871e-05, + "loss": 0.413, + "step": 1351 + }, + { + "epoch": 1.3372898120672603, + "grad_norm": 0.3561705460960808, + "learning_rate": 3.079882740930744e-05, + "loss": 0.4968, + "step": 1352 + }, + { + "epoch": 1.3382789317507418, + "grad_norm": 0.3260716628155875, + "learning_rate": 3.078050567973617e-05, + "loss": 0.4487, + "step": 1353 + }, + { + "epoch": 1.3392680514342237, + "grad_norm": 0.32982555741060104, + "learning_rate": 3.0762183950164896e-05, + "loss": 0.441, + "step": 1354 + }, + { + "epoch": 1.3402571711177051, + "grad_norm": 0.30807762347144296, + "learning_rate": 3.074386222059363e-05, + "loss": 0.4121, + "step": 1355 + }, + { + "epoch": 1.341246290801187, + "grad_norm": 0.3532020602660648, + "learning_rate": 3.0725540491022355e-05, + "loss": 0.505, + "step": 1356 + }, + { + "epoch": 1.3422354104846685, + "grad_norm": 0.2876611640436799, + "learning_rate": 3.070721876145108e-05, + "loss": 0.4096, + "step": 1357 + }, + { + "epoch": 1.3432245301681505, + "grad_norm": 0.3186642457128235, + "learning_rate": 3.068889703187981e-05, + "loss": 0.4585, + "step": 1358 + }, + { + "epoch": 1.344213649851632, + "grad_norm": 0.31673282289780236, + "learning_rate": 3.067057530230854e-05, + "loss": 0.4235, + "step": 1359 + }, + { + "epoch": 1.3452027695351139, + "grad_norm": 0.35253205951974537, + "learning_rate": 3.065225357273727e-05, + "loss": 0.4569, + "step": 1360 + }, + { + "epoch": 1.3461918892185953, + "grad_norm": 0.3349663982529888, + "learning_rate": 3.0633931843165994e-05, + "loss": 0.3936, + "step": 1361 + }, + { + "epoch": 1.3471810089020773, + "grad_norm": 0.33498838963025845, + "learning_rate": 3.061561011359472e-05, + "loss": 0.4983, + "step": 1362 + }, + { + "epoch": 1.3481701285855587, + "grad_norm": 0.3400374587506467, + "learning_rate": 3.0597288384023454e-05, + "loss": 0.4503, + "step": 1363 + }, + { + "epoch": 1.3491592482690407, + "grad_norm": 0.3203012395834267, + "learning_rate": 3.057896665445218e-05, + "loss": 0.3944, + "step": 1364 + }, + { + "epoch": 1.3501483679525221, + "grad_norm": 0.3920156373833764, + "learning_rate": 3.0560644924880913e-05, + "loss": 0.4813, + "step": 1365 + }, + { + "epoch": 1.351137487636004, + "grad_norm": 0.30315401884490906, + "learning_rate": 3.054232319530964e-05, + "loss": 0.4454, + "step": 1366 + }, + { + "epoch": 1.3521266073194855, + "grad_norm": 0.35739212453761815, + "learning_rate": 3.0524001465738366e-05, + "loss": 0.439, + "step": 1367 + }, + { + "epoch": 1.3531157270029674, + "grad_norm": 0.3417886435859495, + "learning_rate": 3.0505679736167096e-05, + "loss": 0.4665, + "step": 1368 + }, + { + "epoch": 1.354104846686449, + "grad_norm": 0.3235348071287897, + "learning_rate": 3.0487358006595822e-05, + "loss": 0.4226, + "step": 1369 + }, + { + "epoch": 1.3550939663699308, + "grad_norm": 0.3159309730114347, + "learning_rate": 3.0469036277024556e-05, + "loss": 0.4466, + "step": 1370 + }, + { + "epoch": 1.3560830860534125, + "grad_norm": 0.324193279571812, + "learning_rate": 3.045071454745328e-05, + "loss": 0.4501, + "step": 1371 + }, + { + "epoch": 1.3570722057368942, + "grad_norm": 0.3071795678407246, + "learning_rate": 3.0432392817882012e-05, + "loss": 0.4487, + "step": 1372 + }, + { + "epoch": 1.358061325420376, + "grad_norm": 0.30058020510609845, + "learning_rate": 3.0414071088310735e-05, + "loss": 0.4561, + "step": 1373 + }, + { + "epoch": 1.3590504451038576, + "grad_norm": 0.3505053014597225, + "learning_rate": 3.0395749358739468e-05, + "loss": 0.4025, + "step": 1374 + }, + { + "epoch": 1.3600395647873393, + "grad_norm": 0.3111111301895231, + "learning_rate": 3.037742762916819e-05, + "loss": 0.4098, + "step": 1375 + }, + { + "epoch": 1.361028684470821, + "grad_norm": 0.2601225263616035, + "learning_rate": 3.0359105899596924e-05, + "loss": 0.4243, + "step": 1376 + }, + { + "epoch": 1.3620178041543027, + "grad_norm": 0.316696497556371, + "learning_rate": 3.0340784170025654e-05, + "loss": 0.3969, + "step": 1377 + }, + { + "epoch": 1.3630069238377844, + "grad_norm": 0.34847389358503794, + "learning_rate": 3.032246244045438e-05, + "loss": 0.4551, + "step": 1378 + }, + { + "epoch": 1.3639960435212661, + "grad_norm": 0.2902604562577088, + "learning_rate": 3.030414071088311e-05, + "loss": 0.407, + "step": 1379 + }, + { + "epoch": 1.3649851632047478, + "grad_norm": 0.27781817505418216, + "learning_rate": 3.0285818981311837e-05, + "loss": 0.4328, + "step": 1380 + }, + { + "epoch": 1.3659742828882295, + "grad_norm": 5.942774002381965, + "learning_rate": 3.0267497251740567e-05, + "loss": 0.4695, + "step": 1381 + }, + { + "epoch": 1.3669634025717112, + "grad_norm": 0.38926658156561217, + "learning_rate": 3.0249175522169293e-05, + "loss": 0.4174, + "step": 1382 + }, + { + "epoch": 1.367952522255193, + "grad_norm": 0.4415126867453672, + "learning_rate": 3.0230853792598023e-05, + "loss": 0.4475, + "step": 1383 + }, + { + "epoch": 1.3689416419386746, + "grad_norm": 0.3120038844745485, + "learning_rate": 3.021253206302675e-05, + "loss": 0.469, + "step": 1384 + }, + { + "epoch": 1.3699307616221563, + "grad_norm": 0.41969819981367795, + "learning_rate": 3.019421033345548e-05, + "loss": 0.4923, + "step": 1385 + }, + { + "epoch": 1.370919881305638, + "grad_norm": 0.38256379980543986, + "learning_rate": 3.0175888603884205e-05, + "loss": 0.4336, + "step": 1386 + }, + { + "epoch": 1.3719090009891197, + "grad_norm": 0.3767270435686749, + "learning_rate": 3.0157566874312935e-05, + "loss": 0.4539, + "step": 1387 + }, + { + "epoch": 1.3728981206726014, + "grad_norm": 0.3157034106065137, + "learning_rate": 3.013924514474166e-05, + "loss": 0.445, + "step": 1388 + }, + { + "epoch": 1.3738872403560831, + "grad_norm": 0.36045163968750676, + "learning_rate": 3.0120923415170395e-05, + "loss": 0.4121, + "step": 1389 + }, + { + "epoch": 1.3748763600395648, + "grad_norm": 0.31479023453240734, + "learning_rate": 3.0102601685599125e-05, + "loss": 0.4216, + "step": 1390 + }, + { + "epoch": 1.3758654797230465, + "grad_norm": 0.2997878080503291, + "learning_rate": 3.008427995602785e-05, + "loss": 0.3644, + "step": 1391 + }, + { + "epoch": 1.3768545994065282, + "grad_norm": 0.3320290130778573, + "learning_rate": 3.006595822645658e-05, + "loss": 0.4513, + "step": 1392 + }, + { + "epoch": 1.37784371909001, + "grad_norm": 0.29477176122877125, + "learning_rate": 3.0047636496885307e-05, + "loss": 0.4601, + "step": 1393 + }, + { + "epoch": 1.3788328387734916, + "grad_norm": 0.3412218077170071, + "learning_rate": 3.0029314767314037e-05, + "loss": 0.4154, + "step": 1394 + }, + { + "epoch": 1.3798219584569733, + "grad_norm": 0.3275380780138223, + "learning_rate": 3.0010993037742764e-05, + "loss": 0.4176, + "step": 1395 + }, + { + "epoch": 1.380811078140455, + "grad_norm": 0.2837253095050283, + "learning_rate": 2.9992671308171493e-05, + "loss": 0.4187, + "step": 1396 + }, + { + "epoch": 1.3818001978239367, + "grad_norm": 0.4176886469982826, + "learning_rate": 2.997434957860022e-05, + "loss": 0.4724, + "step": 1397 + }, + { + "epoch": 1.3827893175074184, + "grad_norm": 0.31073598352099785, + "learning_rate": 2.995602784902895e-05, + "loss": 0.4342, + "step": 1398 + }, + { + "epoch": 1.3837784371909, + "grad_norm": 0.3543815361213989, + "learning_rate": 2.9937706119457676e-05, + "loss": 0.5104, + "step": 1399 + }, + { + "epoch": 1.3847675568743818, + "grad_norm": 0.33664475436986274, + "learning_rate": 2.9919384389886406e-05, + "loss": 0.4302, + "step": 1400 + }, + { + "epoch": 1.3857566765578635, + "grad_norm": 0.36124601508500437, + "learning_rate": 2.990106266031514e-05, + "loss": 0.4512, + "step": 1401 + }, + { + "epoch": 1.3867457962413452, + "grad_norm": 0.35574503741610786, + "learning_rate": 2.9882740930743862e-05, + "loss": 0.4337, + "step": 1402 + }, + { + "epoch": 1.387734915924827, + "grad_norm": 0.36123605275941045, + "learning_rate": 2.9864419201172595e-05, + "loss": 0.4312, + "step": 1403 + }, + { + "epoch": 1.3887240356083086, + "grad_norm": 0.3363751995869784, + "learning_rate": 2.9846097471601318e-05, + "loss": 0.4351, + "step": 1404 + }, + { + "epoch": 1.3897131552917903, + "grad_norm": 0.34547691242406064, + "learning_rate": 2.982777574203005e-05, + "loss": 0.4442, + "step": 1405 + }, + { + "epoch": 1.390702274975272, + "grad_norm": 0.364044074816971, + "learning_rate": 2.9809454012458778e-05, + "loss": 0.4708, + "step": 1406 + }, + { + "epoch": 1.3916913946587537, + "grad_norm": 0.3049021711534288, + "learning_rate": 2.9791132282887508e-05, + "loss": 0.477, + "step": 1407 + }, + { + "epoch": 1.3926805143422354, + "grad_norm": 0.38211284345526886, + "learning_rate": 2.9772810553316234e-05, + "loss": 0.43, + "step": 1408 + }, + { + "epoch": 1.393669634025717, + "grad_norm": 0.41386742466147336, + "learning_rate": 2.9754488823744964e-05, + "loss": 0.4561, + "step": 1409 + }, + { + "epoch": 1.3946587537091988, + "grad_norm": 0.3386310090552587, + "learning_rate": 2.973616709417369e-05, + "loss": 0.4387, + "step": 1410 + }, + { + "epoch": 1.3956478733926805, + "grad_norm": 0.3687161941753462, + "learning_rate": 2.971784536460242e-05, + "loss": 0.4446, + "step": 1411 + }, + { + "epoch": 1.3966369930761622, + "grad_norm": 0.3763623220861594, + "learning_rate": 2.9699523635031147e-05, + "loss": 0.4598, + "step": 1412 + }, + { + "epoch": 1.3976261127596439, + "grad_norm": 0.3938022952660516, + "learning_rate": 2.9681201905459876e-05, + "loss": 0.3538, + "step": 1413 + }, + { + "epoch": 1.3986152324431256, + "grad_norm": 0.36478061032823716, + "learning_rate": 2.9662880175888606e-05, + "loss": 0.4526, + "step": 1414 + }, + { + "epoch": 1.3996043521266073, + "grad_norm": 0.35983777332657113, + "learning_rate": 2.9644558446317333e-05, + "loss": 0.3961, + "step": 1415 + }, + { + "epoch": 1.400593471810089, + "grad_norm": 0.4361337967780688, + "learning_rate": 2.9626236716746066e-05, + "loss": 0.46, + "step": 1416 + }, + { + "epoch": 1.4015825914935707, + "grad_norm": 0.31864426898365455, + "learning_rate": 2.960791498717479e-05, + "loss": 0.4578, + "step": 1417 + }, + { + "epoch": 1.4025717111770524, + "grad_norm": 4.370523949146226, + "learning_rate": 2.9589593257603522e-05, + "loss": 0.5349, + "step": 1418 + }, + { + "epoch": 1.403560830860534, + "grad_norm": 0.7799236801274586, + "learning_rate": 2.9571271528032245e-05, + "loss": 0.4095, + "step": 1419 + }, + { + "epoch": 1.4045499505440158, + "grad_norm": 0.36466940872257664, + "learning_rate": 2.9552949798460978e-05, + "loss": 0.3791, + "step": 1420 + }, + { + "epoch": 1.4055390702274975, + "grad_norm": 0.5175092771422681, + "learning_rate": 2.95346280688897e-05, + "loss": 0.399, + "step": 1421 + }, + { + "epoch": 1.4065281899109792, + "grad_norm": 1.7636557470614664, + "learning_rate": 2.9516306339318434e-05, + "loss": 0.4801, + "step": 1422 + }, + { + "epoch": 1.4075173095944609, + "grad_norm": 0.4215106804422865, + "learning_rate": 2.9497984609747157e-05, + "loss": 0.4709, + "step": 1423 + }, + { + "epoch": 1.4085064292779426, + "grad_norm": 0.4032698691829994, + "learning_rate": 2.947966288017589e-05, + "loss": 0.4624, + "step": 1424 + }, + { + "epoch": 1.4094955489614243, + "grad_norm": 0.4050242307208399, + "learning_rate": 2.946134115060462e-05, + "loss": 0.5015, + "step": 1425 + }, + { + "epoch": 1.410484668644906, + "grad_norm": 0.31172007206763763, + "learning_rate": 2.9443019421033347e-05, + "loss": 0.442, + "step": 1426 + }, + { + "epoch": 1.4114737883283877, + "grad_norm": 0.36677022548571625, + "learning_rate": 2.9424697691462077e-05, + "loss": 0.4427, + "step": 1427 + }, + { + "epoch": 1.4124629080118694, + "grad_norm": 0.3546662288262847, + "learning_rate": 2.9406375961890803e-05, + "loss": 0.4413, + "step": 1428 + }, + { + "epoch": 1.413452027695351, + "grad_norm": 0.36917339334462623, + "learning_rate": 2.9388054232319533e-05, + "loss": 0.4743, + "step": 1429 + }, + { + "epoch": 1.4144411473788328, + "grad_norm": 0.3675002506888929, + "learning_rate": 2.936973250274826e-05, + "loss": 0.46, + "step": 1430 + }, + { + "epoch": 1.4154302670623147, + "grad_norm": 0.28465523267849413, + "learning_rate": 2.935141077317699e-05, + "loss": 0.418, + "step": 1431 + }, + { + "epoch": 1.4164193867457961, + "grad_norm": 0.4170643740008079, + "learning_rate": 2.9333089043605716e-05, + "loss": 0.4097, + "step": 1432 + }, + { + "epoch": 1.417408506429278, + "grad_norm": 0.3046886523834855, + "learning_rate": 2.9314767314034445e-05, + "loss": 0.4429, + "step": 1433 + }, + { + "epoch": 1.4183976261127595, + "grad_norm": 0.40000915818868443, + "learning_rate": 2.9296445584463172e-05, + "loss": 0.4261, + "step": 1434 + }, + { + "epoch": 1.4193867457962415, + "grad_norm": 0.38918672454921527, + "learning_rate": 2.9278123854891905e-05, + "loss": 0.4585, + "step": 1435 + }, + { + "epoch": 1.420375865479723, + "grad_norm": 0.29249532179463994, + "learning_rate": 2.9259802125320628e-05, + "loss": 0.3921, + "step": 1436 + }, + { + "epoch": 1.4213649851632049, + "grad_norm": 0.3243674617976441, + "learning_rate": 2.924148039574936e-05, + "loss": 0.511, + "step": 1437 + }, + { + "epoch": 1.4223541048466863, + "grad_norm": 0.31778105502346576, + "learning_rate": 2.922315866617809e-05, + "loss": 0.4576, + "step": 1438 + }, + { + "epoch": 1.4233432245301683, + "grad_norm": 0.3062041751866015, + "learning_rate": 2.9204836936606817e-05, + "loss": 0.4189, + "step": 1439 + }, + { + "epoch": 1.4243323442136497, + "grad_norm": 0.309264172194014, + "learning_rate": 2.9186515207035547e-05, + "loss": 0.4606, + "step": 1440 + }, + { + "epoch": 1.4253214638971317, + "grad_norm": 0.2801605149580132, + "learning_rate": 2.9168193477464274e-05, + "loss": 0.3993, + "step": 1441 + }, + { + "epoch": 1.4263105835806131, + "grad_norm": 0.30985960538013585, + "learning_rate": 2.9149871747893003e-05, + "loss": 0.4407, + "step": 1442 + }, + { + "epoch": 1.427299703264095, + "grad_norm": 0.3388775836866452, + "learning_rate": 2.913155001832173e-05, + "loss": 0.4429, + "step": 1443 + }, + { + "epoch": 1.4282888229475765, + "grad_norm": 0.2889630554202242, + "learning_rate": 2.911322828875046e-05, + "loss": 0.432, + "step": 1444 + }, + { + "epoch": 1.4292779426310585, + "grad_norm": 0.315605316728071, + "learning_rate": 2.9094906559179186e-05, + "loss": 0.4516, + "step": 1445 + }, + { + "epoch": 1.43026706231454, + "grad_norm": 0.3457337439226024, + "learning_rate": 2.9076584829607916e-05, + "loss": 0.4674, + "step": 1446 + }, + { + "epoch": 1.4312561819980218, + "grad_norm": 0.2952032263895049, + "learning_rate": 2.9058263100036642e-05, + "loss": 0.4521, + "step": 1447 + }, + { + "epoch": 1.4322453016815033, + "grad_norm": 0.28620576457716407, + "learning_rate": 2.9039941370465372e-05, + "loss": 0.4621, + "step": 1448 + }, + { + "epoch": 1.4332344213649852, + "grad_norm": 0.36744980164748403, + "learning_rate": 2.9021619640894105e-05, + "loss": 0.4888, + "step": 1449 + }, + { + "epoch": 1.434223541048467, + "grad_norm": 0.339812788180591, + "learning_rate": 2.900329791132283e-05, + "loss": 0.5289, + "step": 1450 + }, + { + "epoch": 1.4352126607319486, + "grad_norm": 0.31566827118837226, + "learning_rate": 2.898497618175156e-05, + "loss": 0.4492, + "step": 1451 + }, + { + "epoch": 1.4362017804154303, + "grad_norm": 0.31330393196868006, + "learning_rate": 2.8966654452180288e-05, + "loss": 0.3981, + "step": 1452 + }, + { + "epoch": 1.437190900098912, + "grad_norm": 0.3067900583902443, + "learning_rate": 2.8948332722609018e-05, + "loss": 0.4488, + "step": 1453 + }, + { + "epoch": 1.4381800197823937, + "grad_norm": 0.2968129574510213, + "learning_rate": 2.8930010993037744e-05, + "loss": 0.447, + "step": 1454 + }, + { + "epoch": 1.4391691394658754, + "grad_norm": 0.2864682872837869, + "learning_rate": 2.8911689263466474e-05, + "loss": 0.4034, + "step": 1455 + }, + { + "epoch": 1.4401582591493571, + "grad_norm": 0.6880039761239137, + "learning_rate": 2.88933675338952e-05, + "loss": 0.4305, + "step": 1456 + }, + { + "epoch": 1.4411473788328388, + "grad_norm": 0.27383935188184777, + "learning_rate": 2.887504580432393e-05, + "loss": 0.4472, + "step": 1457 + }, + { + "epoch": 1.4421364985163205, + "grad_norm": 0.29593151735190626, + "learning_rate": 2.8856724074752657e-05, + "loss": 0.4261, + "step": 1458 + }, + { + "epoch": 1.4431256181998022, + "grad_norm": 0.28273928682471794, + "learning_rate": 2.8838402345181386e-05, + "loss": 0.4034, + "step": 1459 + }, + { + "epoch": 1.444114737883284, + "grad_norm": 0.3165369665953018, + "learning_rate": 2.8820080615610116e-05, + "loss": 0.4266, + "step": 1460 + }, + { + "epoch": 1.4451038575667656, + "grad_norm": 0.3012146535540838, + "learning_rate": 2.8801758886038843e-05, + "loss": 0.4315, + "step": 1461 + }, + { + "epoch": 1.4460929772502473, + "grad_norm": 0.2711981424629738, + "learning_rate": 2.8783437156467576e-05, + "loss": 0.4184, + "step": 1462 + }, + { + "epoch": 1.447082096933729, + "grad_norm": 0.31002008293801026, + "learning_rate": 2.87651154268963e-05, + "loss": 0.4222, + "step": 1463 + }, + { + "epoch": 1.4480712166172107, + "grad_norm": 0.3539283902685812, + "learning_rate": 2.8746793697325032e-05, + "loss": 0.4322, + "step": 1464 + }, + { + "epoch": 1.4490603363006924, + "grad_norm": 0.3197987270904071, + "learning_rate": 2.8728471967753755e-05, + "loss": 0.4626, + "step": 1465 + }, + { + "epoch": 1.4500494559841741, + "grad_norm": 0.294396776618464, + "learning_rate": 2.8710150238182488e-05, + "loss": 0.4294, + "step": 1466 + }, + { + "epoch": 1.4510385756676558, + "grad_norm": 0.31250638103474127, + "learning_rate": 2.869182850861121e-05, + "loss": 0.491, + "step": 1467 + }, + { + "epoch": 1.4520276953511375, + "grad_norm": 0.29465729172509825, + "learning_rate": 2.8673506779039945e-05, + "loss": 0.4182, + "step": 1468 + }, + { + "epoch": 1.4530168150346192, + "grad_norm": 0.27683736051729835, + "learning_rate": 2.8655185049468668e-05, + "loss": 0.4176, + "step": 1469 + }, + { + "epoch": 1.454005934718101, + "grad_norm": 0.3763143498743748, + "learning_rate": 2.86368633198974e-05, + "loss": 0.4864, + "step": 1470 + }, + { + "epoch": 1.4549950544015826, + "grad_norm": 0.2951254080913707, + "learning_rate": 2.8618541590326127e-05, + "loss": 0.4084, + "step": 1471 + }, + { + "epoch": 1.4559841740850643, + "grad_norm": 0.2874094634392024, + "learning_rate": 2.8600219860754857e-05, + "loss": 0.4525, + "step": 1472 + }, + { + "epoch": 1.456973293768546, + "grad_norm": 0.34781596007553534, + "learning_rate": 2.8581898131183587e-05, + "loss": 0.4446, + "step": 1473 + }, + { + "epoch": 1.4579624134520277, + "grad_norm": 0.3118284461430052, + "learning_rate": 2.8563576401612313e-05, + "loss": 0.4084, + "step": 1474 + }, + { + "epoch": 1.4589515331355094, + "grad_norm": 0.3162361256418991, + "learning_rate": 2.8545254672041043e-05, + "loss": 0.4418, + "step": 1475 + }, + { + "epoch": 1.459940652818991, + "grad_norm": 0.38053633547795723, + "learning_rate": 2.852693294246977e-05, + "loss": 0.3583, + "step": 1476 + }, + { + "epoch": 1.4609297725024728, + "grad_norm": 0.30505081841976306, + "learning_rate": 2.85086112128985e-05, + "loss": 0.4295, + "step": 1477 + }, + { + "epoch": 1.4619188921859545, + "grad_norm": 0.3700760611954873, + "learning_rate": 2.8490289483327226e-05, + "loss": 0.4091, + "step": 1478 + }, + { + "epoch": 1.4629080118694362, + "grad_norm": 0.29239279326719575, + "learning_rate": 2.8471967753755955e-05, + "loss": 0.4535, + "step": 1479 + }, + { + "epoch": 1.463897131552918, + "grad_norm": 0.3096224600613784, + "learning_rate": 2.8453646024184682e-05, + "loss": 0.4517, + "step": 1480 + }, + { + "epoch": 1.4648862512363996, + "grad_norm": 0.29982859930862565, + "learning_rate": 2.8435324294613415e-05, + "loss": 0.3935, + "step": 1481 + }, + { + "epoch": 1.4658753709198813, + "grad_norm": 0.2972910923176461, + "learning_rate": 2.8417002565042138e-05, + "loss": 0.4544, + "step": 1482 + }, + { + "epoch": 1.466864490603363, + "grad_norm": 0.2926067116253536, + "learning_rate": 2.839868083547087e-05, + "loss": 0.415, + "step": 1483 + }, + { + "epoch": 1.4678536102868447, + "grad_norm": 0.3328585626789144, + "learning_rate": 2.83803591058996e-05, + "loss": 0.4579, + "step": 1484 + }, + { + "epoch": 1.4688427299703264, + "grad_norm": 0.3481307578459911, + "learning_rate": 2.8362037376328327e-05, + "loss": 0.4605, + "step": 1485 + }, + { + "epoch": 1.469831849653808, + "grad_norm": 0.31652707929635426, + "learning_rate": 2.8343715646757057e-05, + "loss": 0.4656, + "step": 1486 + }, + { + "epoch": 1.4708209693372898, + "grad_norm": 0.29953398282105115, + "learning_rate": 2.8325393917185784e-05, + "loss": 0.3825, + "step": 1487 + }, + { + "epoch": 1.4718100890207715, + "grad_norm": 0.3097931361873477, + "learning_rate": 2.8307072187614514e-05, + "loss": 0.3972, + "step": 1488 + }, + { + "epoch": 1.4727992087042532, + "grad_norm": 0.33071430642317845, + "learning_rate": 2.828875045804324e-05, + "loss": 0.4285, + "step": 1489 + }, + { + "epoch": 1.4737883283877349, + "grad_norm": 0.2859188800768853, + "learning_rate": 2.827042872847197e-05, + "loss": 0.418, + "step": 1490 + }, + { + "epoch": 1.4747774480712166, + "grad_norm": 0.2881058247271708, + "learning_rate": 2.8252106998900696e-05, + "loss": 0.4643, + "step": 1491 + }, + { + "epoch": 1.4757665677546983, + "grad_norm": 0.38974799158353357, + "learning_rate": 2.8233785269329426e-05, + "loss": 0.4169, + "step": 1492 + }, + { + "epoch": 1.47675568743818, + "grad_norm": 0.3275131922732535, + "learning_rate": 2.8215463539758152e-05, + "loss": 0.446, + "step": 1493 + }, + { + "epoch": 1.4777448071216617, + "grad_norm": 0.3299066979177711, + "learning_rate": 2.8197141810186882e-05, + "loss": 0.4617, + "step": 1494 + }, + { + "epoch": 1.4787339268051434, + "grad_norm": 0.364073130150921, + "learning_rate": 2.817882008061561e-05, + "loss": 0.5335, + "step": 1495 + }, + { + "epoch": 1.479723046488625, + "grad_norm": 0.3277284148931501, + "learning_rate": 2.816049835104434e-05, + "loss": 0.4456, + "step": 1496 + }, + { + "epoch": 1.4807121661721068, + "grad_norm": 0.43802618542564276, + "learning_rate": 2.814217662147307e-05, + "loss": 0.4724, + "step": 1497 + }, + { + "epoch": 1.4817012858555885, + "grad_norm": 0.2765090527927701, + "learning_rate": 2.8123854891901798e-05, + "loss": 0.3961, + "step": 1498 + }, + { + "epoch": 1.4826904055390702, + "grad_norm": 0.3416485689233916, + "learning_rate": 2.8105533162330528e-05, + "loss": 0.4166, + "step": 1499 + }, + { + "epoch": 1.4836795252225519, + "grad_norm": 0.37839623878952566, + "learning_rate": 2.8087211432759254e-05, + "loss": 0.4906, + "step": 1500 + }, + { + "epoch": 1.4846686449060336, + "grad_norm": 0.3518638957400736, + "learning_rate": 2.8068889703187984e-05, + "loss": 0.4069, + "step": 1501 + }, + { + "epoch": 1.4856577645895153, + "grad_norm": 0.33111446442715076, + "learning_rate": 2.805056797361671e-05, + "loss": 0.4293, + "step": 1502 + }, + { + "epoch": 1.486646884272997, + "grad_norm": 0.295453610129648, + "learning_rate": 2.803224624404544e-05, + "loss": 0.4292, + "step": 1503 + }, + { + "epoch": 1.4876360039564787, + "grad_norm": 0.2756874683521067, + "learning_rate": 2.8013924514474167e-05, + "loss": 0.4104, + "step": 1504 + }, + { + "epoch": 1.4886251236399604, + "grad_norm": 0.3798280905192116, + "learning_rate": 2.7995602784902896e-05, + "loss": 0.4483, + "step": 1505 + }, + { + "epoch": 1.489614243323442, + "grad_norm": 0.3414752636312388, + "learning_rate": 2.7977281055331623e-05, + "loss": 0.4637, + "step": 1506 + }, + { + "epoch": 1.4906033630069238, + "grad_norm": 0.35252371421106593, + "learning_rate": 2.7958959325760353e-05, + "loss": 0.4697, + "step": 1507 + }, + { + "epoch": 1.4915924826904057, + "grad_norm": 0.35553900870734595, + "learning_rate": 2.7940637596189086e-05, + "loss": 0.4427, + "step": 1508 + }, + { + "epoch": 1.4925816023738872, + "grad_norm": 0.27479809910211267, + "learning_rate": 2.792231586661781e-05, + "loss": 0.4329, + "step": 1509 + }, + { + "epoch": 1.493570722057369, + "grad_norm": 0.3169137202038055, + "learning_rate": 2.7903994137046542e-05, + "loss": 0.4504, + "step": 1510 + }, + { + "epoch": 1.4945598417408505, + "grad_norm": 0.3611773054393912, + "learning_rate": 2.7885672407475265e-05, + "loss": 0.3944, + "step": 1511 + }, + { + "epoch": 1.4955489614243325, + "grad_norm": 0.27848717086943314, + "learning_rate": 2.7867350677904e-05, + "loss": 0.4297, + "step": 1512 + }, + { + "epoch": 1.496538081107814, + "grad_norm": 0.2890433618635751, + "learning_rate": 2.784902894833272e-05, + "loss": 0.4256, + "step": 1513 + }, + { + "epoch": 1.4975272007912959, + "grad_norm": 0.29233252685415784, + "learning_rate": 2.7830707218761455e-05, + "loss": 0.5019, + "step": 1514 + }, + { + "epoch": 1.4985163204747773, + "grad_norm": 0.2808355518258454, + "learning_rate": 2.7812385489190178e-05, + "loss": 0.4533, + "step": 1515 + }, + { + "epoch": 1.4995054401582593, + "grad_norm": 0.26776681113551326, + "learning_rate": 2.779406375961891e-05, + "loss": 0.4154, + "step": 1516 + }, + { + "epoch": 1.5004945598417407, + "grad_norm": 0.318650708286605, + "learning_rate": 2.7775742030047637e-05, + "loss": 0.4203, + "step": 1517 + }, + { + "epoch": 1.5014836795252227, + "grad_norm": 0.26460015417559507, + "learning_rate": 2.7757420300476367e-05, + "loss": 0.4147, + "step": 1518 + }, + { + "epoch": 1.5024727992087041, + "grad_norm": 0.25425666410240005, + "learning_rate": 2.7739098570905093e-05, + "loss": 0.3944, + "step": 1519 + }, + { + "epoch": 1.503461918892186, + "grad_norm": 0.35103080270975756, + "learning_rate": 2.7720776841333823e-05, + "loss": 0.417, + "step": 1520 + }, + { + "epoch": 1.5044510385756675, + "grad_norm": 0.31288047385282897, + "learning_rate": 2.7702455111762553e-05, + "loss": 0.4376, + "step": 1521 + }, + { + "epoch": 1.5054401582591495, + "grad_norm": 0.3026304301870927, + "learning_rate": 2.768413338219128e-05, + "loss": 0.4141, + "step": 1522 + }, + { + "epoch": 1.506429277942631, + "grad_norm": 0.29749849928337113, + "learning_rate": 2.766581165262001e-05, + "loss": 0.4098, + "step": 1523 + }, + { + "epoch": 1.5074183976261128, + "grad_norm": 0.29038320359051056, + "learning_rate": 2.7647489923048736e-05, + "loss": 0.4431, + "step": 1524 + }, + { + "epoch": 1.5084075173095943, + "grad_norm": 0.2907976228534865, + "learning_rate": 2.7629168193477466e-05, + "loss": 0.4574, + "step": 1525 + }, + { + "epoch": 1.5093966369930762, + "grad_norm": 0.25183312695543336, + "learning_rate": 2.7610846463906192e-05, + "loss": 0.3744, + "step": 1526 + }, + { + "epoch": 1.5103857566765577, + "grad_norm": 0.2855912969143611, + "learning_rate": 2.7592524734334925e-05, + "loss": 0.4661, + "step": 1527 + }, + { + "epoch": 1.5113748763600396, + "grad_norm": 0.2732293198137018, + "learning_rate": 2.7574203004763648e-05, + "loss": 0.4355, + "step": 1528 + }, + { + "epoch": 1.5123639960435211, + "grad_norm": 0.3213776545316619, + "learning_rate": 2.755588127519238e-05, + "loss": 0.455, + "step": 1529 + }, + { + "epoch": 1.513353115727003, + "grad_norm": 0.29145234783713037, + "learning_rate": 2.7537559545621104e-05, + "loss": 0.4878, + "step": 1530 + }, + { + "epoch": 1.5143422354104845, + "grad_norm": 0.266071594136441, + "learning_rate": 2.7519237816049838e-05, + "loss": 0.4167, + "step": 1531 + }, + { + "epoch": 1.5153313550939664, + "grad_norm": 0.2731003398932023, + "learning_rate": 2.7500916086478567e-05, + "loss": 0.4768, + "step": 1532 + }, + { + "epoch": 1.516320474777448, + "grad_norm": 0.3097341831479785, + "learning_rate": 2.7482594356907294e-05, + "loss": 0.4474, + "step": 1533 + }, + { + "epoch": 1.5173095944609298, + "grad_norm": 0.34027538016329345, + "learning_rate": 2.7464272627336024e-05, + "loss": 0.4393, + "step": 1534 + }, + { + "epoch": 1.5182987141444113, + "grad_norm": 1.2356940989603606, + "learning_rate": 2.744595089776475e-05, + "loss": 0.4578, + "step": 1535 + }, + { + "epoch": 1.5192878338278932, + "grad_norm": 0.5459791028027153, + "learning_rate": 2.742762916819348e-05, + "loss": 0.4724, + "step": 1536 + }, + { + "epoch": 1.520276953511375, + "grad_norm": 0.31844571348235073, + "learning_rate": 2.7409307438622206e-05, + "loss": 0.4125, + "step": 1537 + }, + { + "epoch": 1.5212660731948566, + "grad_norm": 0.31015078316170824, + "learning_rate": 2.7390985709050936e-05, + "loss": 0.433, + "step": 1538 + }, + { + "epoch": 1.5222551928783383, + "grad_norm": 0.29396592937434984, + "learning_rate": 2.7372663979479662e-05, + "loss": 0.4376, + "step": 1539 + }, + { + "epoch": 1.52324431256182, + "grad_norm": 0.2911564550284436, + "learning_rate": 2.7354342249908392e-05, + "loss": 0.4112, + "step": 1540 + }, + { + "epoch": 1.5242334322453017, + "grad_norm": 0.3141108762736842, + "learning_rate": 2.733602052033712e-05, + "loss": 0.4333, + "step": 1541 + }, + { + "epoch": 1.5252225519287834, + "grad_norm": 0.28602791405930644, + "learning_rate": 2.731769879076585e-05, + "loss": 0.4535, + "step": 1542 + }, + { + "epoch": 1.5262116716122651, + "grad_norm": 0.2559604367618264, + "learning_rate": 2.7299377061194575e-05, + "loss": 0.4126, + "step": 1543 + }, + { + "epoch": 1.5272007912957468, + "grad_norm": 0.27275667409056814, + "learning_rate": 2.7281055331623305e-05, + "loss": 0.3398, + "step": 1544 + }, + { + "epoch": 1.5281899109792285, + "grad_norm": 0.27888080177130276, + "learning_rate": 2.7262733602052038e-05, + "loss": 0.396, + "step": 1545 + }, + { + "epoch": 1.5291790306627102, + "grad_norm": 0.29457676257202875, + "learning_rate": 2.7244411872480764e-05, + "loss": 0.4142, + "step": 1546 + }, + { + "epoch": 1.530168150346192, + "grad_norm": 0.2657669136258467, + "learning_rate": 2.7226090142909494e-05, + "loss": 0.3811, + "step": 1547 + }, + { + "epoch": 1.5311572700296736, + "grad_norm": 0.2787061006660649, + "learning_rate": 2.720776841333822e-05, + "loss": 0.3936, + "step": 1548 + }, + { + "epoch": 1.5321463897131553, + "grad_norm": 0.37452021826356185, + "learning_rate": 2.718944668376695e-05, + "loss": 0.4526, + "step": 1549 + }, + { + "epoch": 1.533135509396637, + "grad_norm": 0.24566974329827732, + "learning_rate": 2.7171124954195677e-05, + "loss": 0.3751, + "step": 1550 + }, + { + "epoch": 1.5341246290801187, + "grad_norm": 0.2757225886209204, + "learning_rate": 2.7152803224624407e-05, + "loss": 0.466, + "step": 1551 + }, + { + "epoch": 1.5351137487636004, + "grad_norm": 0.360202782237012, + "learning_rate": 2.7134481495053133e-05, + "loss": 0.5028, + "step": 1552 + }, + { + "epoch": 1.536102868447082, + "grad_norm": 0.28217047010151747, + "learning_rate": 2.7116159765481863e-05, + "loss": 0.4216, + "step": 1553 + }, + { + "epoch": 1.5370919881305638, + "grad_norm": 0.3369657062476821, + "learning_rate": 2.709783803591059e-05, + "loss": 0.4474, + "step": 1554 + }, + { + "epoch": 1.5380811078140455, + "grad_norm": 0.32643487155724593, + "learning_rate": 2.707951630633932e-05, + "loss": 0.4275, + "step": 1555 + }, + { + "epoch": 1.5390702274975272, + "grad_norm": 0.2817779215324222, + "learning_rate": 2.7061194576768052e-05, + "loss": 0.4371, + "step": 1556 + }, + { + "epoch": 1.540059347181009, + "grad_norm": 0.3063323029729112, + "learning_rate": 2.7042872847196775e-05, + "loss": 0.4084, + "step": 1557 + }, + { + "epoch": 1.5410484668644906, + "grad_norm": 0.26570368053936894, + "learning_rate": 2.702455111762551e-05, + "loss": 0.4377, + "step": 1558 + }, + { + "epoch": 1.5420375865479723, + "grad_norm": 0.28478093201712384, + "learning_rate": 2.700622938805423e-05, + "loss": 0.3711, + "step": 1559 + }, + { + "epoch": 1.543026706231454, + "grad_norm": 0.3527584738458635, + "learning_rate": 2.6987907658482965e-05, + "loss": 0.4467, + "step": 1560 + }, + { + "epoch": 1.5440158259149357, + "grad_norm": 0.2998161804452798, + "learning_rate": 2.6969585928911688e-05, + "loss": 0.4022, + "step": 1561 + }, + { + "epoch": 1.5450049455984174, + "grad_norm": 0.30287586677925865, + "learning_rate": 2.695126419934042e-05, + "loss": 0.4276, + "step": 1562 + }, + { + "epoch": 1.545994065281899, + "grad_norm": 0.29066288076793795, + "learning_rate": 2.6932942469769147e-05, + "loss": 0.4298, + "step": 1563 + }, + { + "epoch": 1.5469831849653808, + "grad_norm": 0.36634590156409147, + "learning_rate": 2.6914620740197877e-05, + "loss": 0.4333, + "step": 1564 + }, + { + "epoch": 1.5479723046488625, + "grad_norm": 0.3516684553628714, + "learning_rate": 2.6896299010626604e-05, + "loss": 0.474, + "step": 1565 + }, + { + "epoch": 1.5489614243323442, + "grad_norm": 0.3315473412552291, + "learning_rate": 2.6877977281055333e-05, + "loss": 0.4199, + "step": 1566 + }, + { + "epoch": 1.5499505440158259, + "grad_norm": 0.304443458898562, + "learning_rate": 2.685965555148406e-05, + "loss": 0.4685, + "step": 1567 + }, + { + "epoch": 1.5509396636993076, + "grad_norm": 0.34939766241844106, + "learning_rate": 2.684133382191279e-05, + "loss": 0.3867, + "step": 1568 + }, + { + "epoch": 1.5519287833827893, + "grad_norm": 0.3364698970448001, + "learning_rate": 2.682301209234152e-05, + "loss": 0.4314, + "step": 1569 + }, + { + "epoch": 1.552917903066271, + "grad_norm": 0.34050583019646397, + "learning_rate": 2.6804690362770246e-05, + "loss": 0.4741, + "step": 1570 + }, + { + "epoch": 1.5539070227497527, + "grad_norm": 0.26237037768227367, + "learning_rate": 2.6786368633198976e-05, + "loss": 0.4107, + "step": 1571 + }, + { + "epoch": 1.5548961424332344, + "grad_norm": 0.3461301127225192, + "learning_rate": 2.6768046903627702e-05, + "loss": 0.4541, + "step": 1572 + }, + { + "epoch": 1.555885262116716, + "grad_norm": 0.2511422428233448, + "learning_rate": 2.6749725174056435e-05, + "loss": 0.4286, + "step": 1573 + }, + { + "epoch": 1.5568743818001978, + "grad_norm": 0.6854382606786477, + "learning_rate": 2.6731403444485158e-05, + "loss": 0.5168, + "step": 1574 + }, + { + "epoch": 1.5578635014836797, + "grad_norm": 0.29587534482934763, + "learning_rate": 2.671308171491389e-05, + "loss": 0.4379, + "step": 1575 + }, + { + "epoch": 1.5588526211671612, + "grad_norm": 0.30965230188237924, + "learning_rate": 2.6694759985342614e-05, + "loss": 0.4372, + "step": 1576 + }, + { + "epoch": 1.559841740850643, + "grad_norm": 0.29630073934251444, + "learning_rate": 2.6676438255771348e-05, + "loss": 0.4082, + "step": 1577 + }, + { + "epoch": 1.5608308605341246, + "grad_norm": 0.3254718398772897, + "learning_rate": 2.665811652620007e-05, + "loss": 0.446, + "step": 1578 + }, + { + "epoch": 1.5618199802176065, + "grad_norm": 0.35742974851957743, + "learning_rate": 2.6639794796628804e-05, + "loss": 0.4064, + "step": 1579 + }, + { + "epoch": 1.562809099901088, + "grad_norm": 0.297566684364922, + "learning_rate": 2.6621473067057534e-05, + "loss": 0.3941, + "step": 1580 + }, + { + "epoch": 1.5637982195845699, + "grad_norm": 0.34824595226140137, + "learning_rate": 2.660315133748626e-05, + "loss": 0.4817, + "step": 1581 + }, + { + "epoch": 1.5647873392680514, + "grad_norm": 0.37702204793009536, + "learning_rate": 2.658482960791499e-05, + "loss": 0.4767, + "step": 1582 + }, + { + "epoch": 1.5657764589515333, + "grad_norm": 0.35461074860740005, + "learning_rate": 2.6566507878343716e-05, + "loss": 0.4296, + "step": 1583 + }, + { + "epoch": 1.5667655786350148, + "grad_norm": 0.3200818420727633, + "learning_rate": 2.6548186148772446e-05, + "loss": 0.381, + "step": 1584 + }, + { + "epoch": 1.5677546983184967, + "grad_norm": 0.425899730481969, + "learning_rate": 2.6529864419201173e-05, + "loss": 0.4158, + "step": 1585 + }, + { + "epoch": 1.5687438180019782, + "grad_norm": 1.1299623445949691, + "learning_rate": 2.6511542689629902e-05, + "loss": 0.3845, + "step": 1586 + }, + { + "epoch": 1.56973293768546, + "grad_norm": 0.2782420434679356, + "learning_rate": 2.649322096005863e-05, + "loss": 0.4361, + "step": 1587 + }, + { + "epoch": 1.5707220573689415, + "grad_norm": 0.3363493977754832, + "learning_rate": 2.647489923048736e-05, + "loss": 0.3991, + "step": 1588 + }, + { + "epoch": 1.5717111770524235, + "grad_norm": 0.3204275823714799, + "learning_rate": 2.6456577500916085e-05, + "loss": 0.5167, + "step": 1589 + }, + { + "epoch": 1.572700296735905, + "grad_norm": 0.3366195485123245, + "learning_rate": 2.6438255771344815e-05, + "loss": 0.4971, + "step": 1590 + }, + { + "epoch": 1.5736894164193869, + "grad_norm": 0.3067388676397528, + "learning_rate": 2.641993404177354e-05, + "loss": 0.4899, + "step": 1591 + }, + { + "epoch": 1.5746785361028683, + "grad_norm": 0.27590309427573345, + "learning_rate": 2.6401612312202274e-05, + "loss": 0.4055, + "step": 1592 + }, + { + "epoch": 1.5756676557863503, + "grad_norm": 0.3184897304138686, + "learning_rate": 2.6383290582631004e-05, + "loss": 0.4245, + "step": 1593 + }, + { + "epoch": 1.5766567754698317, + "grad_norm": 0.2753393949803328, + "learning_rate": 2.636496885305973e-05, + "loss": 0.4415, + "step": 1594 + }, + { + "epoch": 1.5776458951533137, + "grad_norm": 0.2696948403053459, + "learning_rate": 2.634664712348846e-05, + "loss": 0.4367, + "step": 1595 + }, + { + "epoch": 1.5786350148367951, + "grad_norm": 0.291084848882316, + "learning_rate": 2.6328325393917187e-05, + "loss": 0.4221, + "step": 1596 + }, + { + "epoch": 1.579624134520277, + "grad_norm": 0.29012800052048404, + "learning_rate": 2.6310003664345917e-05, + "loss": 0.407, + "step": 1597 + }, + { + "epoch": 1.5806132542037585, + "grad_norm": 0.32837973620046834, + "learning_rate": 2.6291681934774643e-05, + "loss": 0.4483, + "step": 1598 + }, + { + "epoch": 1.5816023738872405, + "grad_norm": 0.28011551139081314, + "learning_rate": 2.6273360205203373e-05, + "loss": 0.4711, + "step": 1599 + }, + { + "epoch": 1.582591493570722, + "grad_norm": 0.3282890674772606, + "learning_rate": 2.62550384756321e-05, + "loss": 0.3873, + "step": 1600 + }, + { + "epoch": 1.5835806132542039, + "grad_norm": 0.33875347479904816, + "learning_rate": 2.623671674606083e-05, + "loss": 0.4552, + "step": 1601 + }, + { + "epoch": 1.5845697329376853, + "grad_norm": 0.29670107000989415, + "learning_rate": 2.6218395016489556e-05, + "loss": 0.4795, + "step": 1602 + }, + { + "epoch": 1.5855588526211672, + "grad_norm": 0.265294490890646, + "learning_rate": 2.6200073286918285e-05, + "loss": 0.3973, + "step": 1603 + }, + { + "epoch": 1.5865479723046487, + "grad_norm": 0.33359772528312437, + "learning_rate": 2.618175155734702e-05, + "loss": 0.4843, + "step": 1604 + }, + { + "epoch": 1.5875370919881306, + "grad_norm": 0.4026752678559618, + "learning_rate": 2.616342982777574e-05, + "loss": 0.4425, + "step": 1605 + }, + { + "epoch": 1.5885262116716121, + "grad_norm": 0.3058367116238238, + "learning_rate": 2.6145108098204475e-05, + "loss": 0.4316, + "step": 1606 + }, + { + "epoch": 1.589515331355094, + "grad_norm": 0.30426608314843634, + "learning_rate": 2.6126786368633198e-05, + "loss": 0.4927, + "step": 1607 + }, + { + "epoch": 1.5905044510385755, + "grad_norm": 0.37917329704685165, + "learning_rate": 2.610846463906193e-05, + "loss": 0.5052, + "step": 1608 + }, + { + "epoch": 1.5914935707220574, + "grad_norm": 0.33134630981635044, + "learning_rate": 2.6090142909490657e-05, + "loss": 0.4995, + "step": 1609 + }, + { + "epoch": 1.592482690405539, + "grad_norm": 0.2991963652688359, + "learning_rate": 2.6071821179919387e-05, + "loss": 0.3927, + "step": 1610 + }, + { + "epoch": 1.5934718100890208, + "grad_norm": 0.3570948011063814, + "learning_rate": 2.6053499450348114e-05, + "loss": 0.4624, + "step": 1611 + }, + { + "epoch": 1.5944609297725023, + "grad_norm": 0.3199568594829708, + "learning_rate": 2.6035177720776843e-05, + "loss": 0.4605, + "step": 1612 + }, + { + "epoch": 1.5954500494559842, + "grad_norm": 0.33280196155936287, + "learning_rate": 2.601685599120557e-05, + "loss": 0.4746, + "step": 1613 + }, + { + "epoch": 1.596439169139466, + "grad_norm": 0.39417035896603214, + "learning_rate": 2.59985342616343e-05, + "loss": 0.4208, + "step": 1614 + }, + { + "epoch": 1.5974282888229476, + "grad_norm": 0.33066363581898495, + "learning_rate": 2.598021253206303e-05, + "loss": 0.504, + "step": 1615 + }, + { + "epoch": 1.5984174085064293, + "grad_norm": 0.3461460215044843, + "learning_rate": 2.5961890802491756e-05, + "loss": 0.4331, + "step": 1616 + }, + { + "epoch": 1.599406528189911, + "grad_norm": 0.3652275383465214, + "learning_rate": 2.5943569072920486e-05, + "loss": 0.4299, + "step": 1617 + }, + { + "epoch": 1.6003956478733927, + "grad_norm": 0.30120482055467496, + "learning_rate": 2.5925247343349212e-05, + "loss": 0.4453, + "step": 1618 + }, + { + "epoch": 1.6013847675568744, + "grad_norm": 0.30615378277352595, + "learning_rate": 2.5906925613777945e-05, + "loss": 0.4507, + "step": 1619 + }, + { + "epoch": 1.6023738872403561, + "grad_norm": 0.32283104348568503, + "learning_rate": 2.5888603884206668e-05, + "loss": 0.3887, + "step": 1620 + }, + { + "epoch": 1.6033630069238378, + "grad_norm": 0.38745300511964836, + "learning_rate": 2.58702821546354e-05, + "loss": 0.4475, + "step": 1621 + }, + { + "epoch": 1.6043521266073195, + "grad_norm": 0.29046077388027175, + "learning_rate": 2.5851960425064125e-05, + "loss": 0.4147, + "step": 1622 + }, + { + "epoch": 1.6053412462908012, + "grad_norm": 0.440454936747926, + "learning_rate": 2.5833638695492858e-05, + "loss": 0.4652, + "step": 1623 + }, + { + "epoch": 1.606330365974283, + "grad_norm": 0.6507104848117019, + "learning_rate": 2.581531696592158e-05, + "loss": 0.4032, + "step": 1624 + }, + { + "epoch": 1.6073194856577646, + "grad_norm": 0.3264866453501732, + "learning_rate": 2.5796995236350314e-05, + "loss": 0.4031, + "step": 1625 + }, + { + "epoch": 1.6083086053412463, + "grad_norm": 0.3023992104795454, + "learning_rate": 2.5778673506779037e-05, + "loss": 0.4203, + "step": 1626 + }, + { + "epoch": 1.609297725024728, + "grad_norm": 0.359657753419691, + "learning_rate": 2.576035177720777e-05, + "loss": 0.4624, + "step": 1627 + }, + { + "epoch": 1.6102868447082097, + "grad_norm": 0.3372351206140866, + "learning_rate": 2.57420300476365e-05, + "loss": 0.4056, + "step": 1628 + }, + { + "epoch": 1.6112759643916914, + "grad_norm": 0.30039862972108095, + "learning_rate": 2.5723708318065226e-05, + "loss": 0.4399, + "step": 1629 + }, + { + "epoch": 1.612265084075173, + "grad_norm": 0.30065834303511957, + "learning_rate": 2.5705386588493956e-05, + "loss": 0.4663, + "step": 1630 + }, + { + "epoch": 1.6132542037586548, + "grad_norm": 0.3410392017721668, + "learning_rate": 2.5687064858922683e-05, + "loss": 0.4458, + "step": 1631 + }, + { + "epoch": 1.6142433234421365, + "grad_norm": 0.31764577949936024, + "learning_rate": 2.5668743129351412e-05, + "loss": 0.4946, + "step": 1632 + }, + { + "epoch": 1.6152324431256182, + "grad_norm": 0.3139161488552942, + "learning_rate": 2.565042139978014e-05, + "loss": 0.45, + "step": 1633 + }, + { + "epoch": 1.6162215628091, + "grad_norm": 0.2838563703036641, + "learning_rate": 2.563209967020887e-05, + "loss": 0.3615, + "step": 1634 + }, + { + "epoch": 1.6172106824925816, + "grad_norm": 0.280910691075243, + "learning_rate": 2.5613777940637595e-05, + "loss": 0.3664, + "step": 1635 + }, + { + "epoch": 1.6181998021760633, + "grad_norm": 0.3442161215592867, + "learning_rate": 2.5595456211066325e-05, + "loss": 0.45, + "step": 1636 + }, + { + "epoch": 1.619188921859545, + "grad_norm": 0.2953185716419831, + "learning_rate": 2.557713448149505e-05, + "loss": 0.4172, + "step": 1637 + }, + { + "epoch": 1.6201780415430267, + "grad_norm": 0.2654726745970755, + "learning_rate": 2.5558812751923784e-05, + "loss": 0.3943, + "step": 1638 + }, + { + "epoch": 1.6211671612265084, + "grad_norm": 0.3200576472766354, + "learning_rate": 2.5540491022352514e-05, + "loss": 0.4265, + "step": 1639 + }, + { + "epoch": 1.62215628090999, + "grad_norm": 0.29628265645606083, + "learning_rate": 2.552216929278124e-05, + "loss": 0.4116, + "step": 1640 + }, + { + "epoch": 1.6231454005934718, + "grad_norm": 0.2841844484198089, + "learning_rate": 2.550384756320997e-05, + "loss": 0.44, + "step": 1641 + }, + { + "epoch": 1.6241345202769535, + "grad_norm": 0.2557201167123379, + "learning_rate": 2.5485525833638697e-05, + "loss": 0.3717, + "step": 1642 + }, + { + "epoch": 1.6251236399604352, + "grad_norm": 0.33250075592922157, + "learning_rate": 2.5467204104067427e-05, + "loss": 0.4857, + "step": 1643 + }, + { + "epoch": 1.6261127596439169, + "grad_norm": 0.2800845720099231, + "learning_rate": 2.5448882374496153e-05, + "loss": 0.3958, + "step": 1644 + }, + { + "epoch": 1.6271018793273986, + "grad_norm": 0.31456766982731915, + "learning_rate": 2.5430560644924883e-05, + "loss": 0.4571, + "step": 1645 + }, + { + "epoch": 1.6280909990108803, + "grad_norm": 0.277187648256086, + "learning_rate": 2.541223891535361e-05, + "loss": 0.3837, + "step": 1646 + }, + { + "epoch": 1.629080118694362, + "grad_norm": 0.26160671112794515, + "learning_rate": 2.539391718578234e-05, + "loss": 0.4197, + "step": 1647 + }, + { + "epoch": 1.6300692383778437, + "grad_norm": 0.3224446440259182, + "learning_rate": 2.5375595456211066e-05, + "loss": 0.44, + "step": 1648 + }, + { + "epoch": 1.6310583580613254, + "grad_norm": 0.2627299707116854, + "learning_rate": 2.5357273726639795e-05, + "loss": 0.4301, + "step": 1649 + }, + { + "epoch": 1.632047477744807, + "grad_norm": 0.2959085159607189, + "learning_rate": 2.5338951997068522e-05, + "loss": 0.4882, + "step": 1650 + }, + { + "epoch": 1.6330365974282888, + "grad_norm": 0.28662674449197795, + "learning_rate": 2.532063026749725e-05, + "loss": 0.4183, + "step": 1651 + }, + { + "epoch": 1.6340257171117705, + "grad_norm": 0.30546067103773544, + "learning_rate": 2.5302308537925985e-05, + "loss": 0.413, + "step": 1652 + }, + { + "epoch": 1.6350148367952522, + "grad_norm": 0.276370031240087, + "learning_rate": 2.5283986808354708e-05, + "loss": 0.4232, + "step": 1653 + }, + { + "epoch": 1.636003956478734, + "grad_norm": 0.2706115264434037, + "learning_rate": 2.526566507878344e-05, + "loss": 0.4293, + "step": 1654 + }, + { + "epoch": 1.6369930761622156, + "grad_norm": 0.30532651247215364, + "learning_rate": 2.5247343349212167e-05, + "loss": 0.4428, + "step": 1655 + }, + { + "epoch": 1.6379821958456975, + "grad_norm": 0.3187299875860798, + "learning_rate": 2.5229021619640897e-05, + "loss": 0.4933, + "step": 1656 + }, + { + "epoch": 1.638971315529179, + "grad_norm": 0.2719115674229351, + "learning_rate": 2.5210699890069624e-05, + "loss": 0.3899, + "step": 1657 + }, + { + "epoch": 1.6399604352126609, + "grad_norm": 0.27197612915344427, + "learning_rate": 2.5192378160498353e-05, + "loss": 0.3803, + "step": 1658 + }, + { + "epoch": 1.6409495548961424, + "grad_norm": 0.30000048728325507, + "learning_rate": 2.517405643092708e-05, + "loss": 0.4025, + "step": 1659 + }, + { + "epoch": 1.6419386745796243, + "grad_norm": 0.2644120267282455, + "learning_rate": 2.515573470135581e-05, + "loss": 0.3988, + "step": 1660 + }, + { + "epoch": 1.6429277942631058, + "grad_norm": 0.2741960980354914, + "learning_rate": 2.5137412971784536e-05, + "loss": 0.4587, + "step": 1661 + }, + { + "epoch": 1.6439169139465877, + "grad_norm": 0.2930096159187744, + "learning_rate": 2.5119091242213266e-05, + "loss": 0.4697, + "step": 1662 + }, + { + "epoch": 1.6449060336300692, + "grad_norm": 0.35317847881770087, + "learning_rate": 2.5100769512641996e-05, + "loss": 0.4459, + "step": 1663 + }, + { + "epoch": 1.645895153313551, + "grad_norm": 0.2870305099438775, + "learning_rate": 2.5082447783070722e-05, + "loss": 0.4329, + "step": 1664 + }, + { + "epoch": 1.6468842729970326, + "grad_norm": 0.31469807580481884, + "learning_rate": 2.5064126053499455e-05, + "loss": 0.4459, + "step": 1665 + }, + { + "epoch": 1.6478733926805145, + "grad_norm": 0.32969026971793497, + "learning_rate": 2.504580432392818e-05, + "loss": 0.4279, + "step": 1666 + }, + { + "epoch": 1.648862512363996, + "grad_norm": 0.331096333283008, + "learning_rate": 2.502748259435691e-05, + "loss": 0.4519, + "step": 1667 + }, + { + "epoch": 1.6498516320474779, + "grad_norm": 0.3441582936309551, + "learning_rate": 2.5009160864785635e-05, + "loss": 0.4669, + "step": 1668 + }, + { + "epoch": 1.6508407517309593, + "grad_norm": 0.3160027414594968, + "learning_rate": 2.4990839135214368e-05, + "loss": 0.4374, + "step": 1669 + }, + { + "epoch": 1.6518298714144413, + "grad_norm": 0.2995831571732668, + "learning_rate": 2.4972517405643094e-05, + "loss": 0.4105, + "step": 1670 + }, + { + "epoch": 1.6528189910979227, + "grad_norm": 0.4442058450146274, + "learning_rate": 2.4954195676071824e-05, + "loss": 0.4731, + "step": 1671 + }, + { + "epoch": 1.6538081107814047, + "grad_norm": 0.33416266456418664, + "learning_rate": 2.493587394650055e-05, + "loss": 0.4784, + "step": 1672 + }, + { + "epoch": 1.6547972304648861, + "grad_norm": 0.3146001816140936, + "learning_rate": 2.491755221692928e-05, + "loss": 0.4262, + "step": 1673 + }, + { + "epoch": 1.655786350148368, + "grad_norm": 0.3743726426973562, + "learning_rate": 2.4899230487358007e-05, + "loss": 0.522, + "step": 1674 + }, + { + "epoch": 1.6567754698318495, + "grad_norm": 0.37599411126000926, + "learning_rate": 2.4880908757786736e-05, + "loss": 0.4539, + "step": 1675 + }, + { + "epoch": 1.6577645895153315, + "grad_norm": 0.2971196338181042, + "learning_rate": 2.4862587028215463e-05, + "loss": 0.4425, + "step": 1676 + }, + { + "epoch": 1.658753709198813, + "grad_norm": 0.3185181610361161, + "learning_rate": 2.4844265298644193e-05, + "loss": 0.4499, + "step": 1677 + }, + { + "epoch": 1.6597428288822949, + "grad_norm": 0.34756786268548084, + "learning_rate": 2.4825943569072922e-05, + "loss": 0.4892, + "step": 1678 + }, + { + "epoch": 1.6607319485657763, + "grad_norm": 0.30828529971670104, + "learning_rate": 2.480762183950165e-05, + "loss": 0.408, + "step": 1679 + }, + { + "epoch": 1.6617210682492582, + "grad_norm": 0.3146532160887173, + "learning_rate": 2.478930010993038e-05, + "loss": 0.4639, + "step": 1680 + }, + { + "epoch": 1.6627101879327397, + "grad_norm": 0.30842421088723865, + "learning_rate": 2.477097838035911e-05, + "loss": 0.4362, + "step": 1681 + }, + { + "epoch": 1.6636993076162216, + "grad_norm": 0.3318964750018238, + "learning_rate": 2.4752656650787835e-05, + "loss": 0.4272, + "step": 1682 + }, + { + "epoch": 1.6646884272997031, + "grad_norm": 0.2719805738190358, + "learning_rate": 2.4734334921216565e-05, + "loss": 0.3888, + "step": 1683 + }, + { + "epoch": 1.665677546983185, + "grad_norm": 0.310777229938151, + "learning_rate": 2.4716013191645295e-05, + "loss": 0.4384, + "step": 1684 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3443093523477439, + "learning_rate": 2.469769146207402e-05, + "loss": 0.4107, + "step": 1685 + }, + { + "epoch": 1.6676557863501484, + "grad_norm": 0.27732248345294597, + "learning_rate": 2.467936973250275e-05, + "loss": 0.4154, + "step": 1686 + }, + { + "epoch": 1.66864490603363, + "grad_norm": 0.3394675785807699, + "learning_rate": 2.4661048002931477e-05, + "loss": 0.5017, + "step": 1687 + }, + { + "epoch": 1.6696340257171118, + "grad_norm": 0.33530005890339165, + "learning_rate": 2.4642726273360207e-05, + "loss": 0.4788, + "step": 1688 + }, + { + "epoch": 1.6706231454005933, + "grad_norm": 0.3242349395154102, + "learning_rate": 2.4624404543788933e-05, + "loss": 0.4241, + "step": 1689 + }, + { + "epoch": 1.6716122650840752, + "grad_norm": 0.30760783104340705, + "learning_rate": 2.4606082814217663e-05, + "loss": 0.4403, + "step": 1690 + }, + { + "epoch": 1.6726013847675567, + "grad_norm": 0.33533199396862406, + "learning_rate": 2.458776108464639e-05, + "loss": 0.4568, + "step": 1691 + }, + { + "epoch": 1.6735905044510386, + "grad_norm": 0.26868937017240524, + "learning_rate": 2.456943935507512e-05, + "loss": 0.4603, + "step": 1692 + }, + { + "epoch": 1.6745796241345203, + "grad_norm": 0.27887094020722614, + "learning_rate": 2.455111762550385e-05, + "loss": 0.3977, + "step": 1693 + }, + { + "epoch": 1.675568743818002, + "grad_norm": 0.2952611325359872, + "learning_rate": 2.453279589593258e-05, + "loss": 0.4352, + "step": 1694 + }, + { + "epoch": 1.6765578635014837, + "grad_norm": 0.26972392989431493, + "learning_rate": 2.4514474166361305e-05, + "loss": 0.4404, + "step": 1695 + }, + { + "epoch": 1.6775469831849654, + "grad_norm": 0.26276099105184353, + "learning_rate": 2.4496152436790035e-05, + "loss": 0.4099, + "step": 1696 + }, + { + "epoch": 1.6785361028684471, + "grad_norm": 0.2834003933123841, + "learning_rate": 2.4477830707218762e-05, + "loss": 0.4044, + "step": 1697 + }, + { + "epoch": 1.6795252225519288, + "grad_norm": 0.28788506496878463, + "learning_rate": 2.445950897764749e-05, + "loss": 0.3827, + "step": 1698 + }, + { + "epoch": 1.6805143422354105, + "grad_norm": 0.31665468599783186, + "learning_rate": 2.4441187248076218e-05, + "loss": 0.4364, + "step": 1699 + }, + { + "epoch": 1.6815034619188922, + "grad_norm": 0.2578234089735481, + "learning_rate": 2.4422865518504948e-05, + "loss": 0.3777, + "step": 1700 + }, + { + "epoch": 1.682492581602374, + "grad_norm": 0.2979943738077502, + "learning_rate": 2.4404543788933678e-05, + "loss": 0.3914, + "step": 1701 + }, + { + "epoch": 1.6834817012858556, + "grad_norm": 3.020828917743508, + "learning_rate": 2.4386222059362404e-05, + "loss": 0.3896, + "step": 1702 + }, + { + "epoch": 1.6844708209693373, + "grad_norm": 0.3359718058064896, + "learning_rate": 2.4367900329791134e-05, + "loss": 0.4185, + "step": 1703 + }, + { + "epoch": 1.685459940652819, + "grad_norm": 0.2994177996182077, + "learning_rate": 2.434957860021986e-05, + "loss": 0.4086, + "step": 1704 + }, + { + "epoch": 1.6864490603363007, + "grad_norm": 0.2991038976312438, + "learning_rate": 2.433125687064859e-05, + "loss": 0.435, + "step": 1705 + }, + { + "epoch": 1.6874381800197824, + "grad_norm": 0.28661212810149267, + "learning_rate": 2.431293514107732e-05, + "loss": 0.4143, + "step": 1706 + }, + { + "epoch": 1.688427299703264, + "grad_norm": 0.37509122239648596, + "learning_rate": 2.429461341150605e-05, + "loss": 0.4484, + "step": 1707 + }, + { + "epoch": 1.6894164193867458, + "grad_norm": 0.27836601910888614, + "learning_rate": 2.4276291681934776e-05, + "loss": 0.4142, + "step": 1708 + }, + { + "epoch": 1.6904055390702275, + "grad_norm": 0.2782146839258225, + "learning_rate": 2.4257969952363506e-05, + "loss": 0.4015, + "step": 1709 + }, + { + "epoch": 1.6913946587537092, + "grad_norm": 0.313916326384413, + "learning_rate": 2.4239648222792232e-05, + "loss": 0.4149, + "step": 1710 + }, + { + "epoch": 1.692383778437191, + "grad_norm": 0.29787858936299944, + "learning_rate": 2.4221326493220962e-05, + "loss": 0.4177, + "step": 1711 + }, + { + "epoch": 1.6933728981206726, + "grad_norm": 0.2902230168937217, + "learning_rate": 2.420300476364969e-05, + "loss": 0.4459, + "step": 1712 + }, + { + "epoch": 1.6943620178041543, + "grad_norm": 0.2764799202205296, + "learning_rate": 2.4184683034078418e-05, + "loss": 0.4803, + "step": 1713 + }, + { + "epoch": 1.695351137487636, + "grad_norm": 0.3045330992531217, + "learning_rate": 2.4166361304507145e-05, + "loss": 0.4842, + "step": 1714 + }, + { + "epoch": 1.6963402571711177, + "grad_norm": 0.3029379637137786, + "learning_rate": 2.4148039574935874e-05, + "loss": 0.4554, + "step": 1715 + }, + { + "epoch": 1.6973293768545994, + "grad_norm": 0.28227347456024454, + "learning_rate": 2.41297178453646e-05, + "loss": 0.4703, + "step": 1716 + }, + { + "epoch": 1.698318496538081, + "grad_norm": 0.2943760092371024, + "learning_rate": 2.4111396115793334e-05, + "loss": 0.4384, + "step": 1717 + }, + { + "epoch": 1.6993076162215628, + "grad_norm": 0.2985609138611823, + "learning_rate": 2.409307438622206e-05, + "loss": 0.4287, + "step": 1718 + }, + { + "epoch": 1.7002967359050445, + "grad_norm": 0.26327465676428885, + "learning_rate": 2.407475265665079e-05, + "loss": 0.4177, + "step": 1719 + }, + { + "epoch": 1.7012858555885262, + "grad_norm": 0.3178230796077746, + "learning_rate": 2.4056430927079517e-05, + "loss": 0.4423, + "step": 1720 + }, + { + "epoch": 1.7022749752720079, + "grad_norm": 0.2814544147392274, + "learning_rate": 2.4038109197508247e-05, + "loss": 0.4657, + "step": 1721 + }, + { + "epoch": 1.7032640949554896, + "grad_norm": 0.3002239090911129, + "learning_rate": 2.4019787467936973e-05, + "loss": 0.4673, + "step": 1722 + }, + { + "epoch": 1.7042532146389713, + "grad_norm": 0.31781217494252273, + "learning_rate": 2.4001465738365703e-05, + "loss": 0.527, + "step": 1723 + }, + { + "epoch": 1.705242334322453, + "grad_norm": 0.3498801258975987, + "learning_rate": 2.3983144008794433e-05, + "loss": 0.4487, + "step": 1724 + }, + { + "epoch": 1.7062314540059347, + "grad_norm": 0.9087648133583388, + "learning_rate": 2.396482227922316e-05, + "loss": 0.3576, + "step": 1725 + }, + { + "epoch": 1.7072205736894164, + "grad_norm": 0.3719846637041946, + "learning_rate": 2.394650054965189e-05, + "loss": 0.4582, + "step": 1726 + }, + { + "epoch": 1.708209693372898, + "grad_norm": 0.35660510630355013, + "learning_rate": 2.3928178820080615e-05, + "loss": 0.4612, + "step": 1727 + }, + { + "epoch": 1.7091988130563798, + "grad_norm": 0.33981496021373514, + "learning_rate": 2.3909857090509345e-05, + "loss": 0.4273, + "step": 1728 + }, + { + "epoch": 1.7101879327398615, + "grad_norm": 0.3742138810240101, + "learning_rate": 2.3891535360938075e-05, + "loss": 0.4248, + "step": 1729 + }, + { + "epoch": 1.7111770524233432, + "grad_norm": 0.36187927647193935, + "learning_rate": 2.3873213631366805e-05, + "loss": 0.4379, + "step": 1730 + }, + { + "epoch": 1.712166172106825, + "grad_norm": 0.3017112790609432, + "learning_rate": 2.385489190179553e-05, + "loss": 0.4641, + "step": 1731 + }, + { + "epoch": 1.7131552917903066, + "grad_norm": 0.3798548828676941, + "learning_rate": 2.383657017222426e-05, + "loss": 0.4185, + "step": 1732 + }, + { + "epoch": 1.7141444114737885, + "grad_norm": 0.289413116025503, + "learning_rate": 2.3818248442652987e-05, + "loss": 0.3906, + "step": 1733 + }, + { + "epoch": 1.71513353115727, + "grad_norm": 0.32233281303153377, + "learning_rate": 2.3799926713081717e-05, + "loss": 0.4263, + "step": 1734 + }, + { + "epoch": 1.7161226508407519, + "grad_norm": 0.3006262895335519, + "learning_rate": 2.3781604983510443e-05, + "loss": 0.4834, + "step": 1735 + }, + { + "epoch": 1.7171117705242334, + "grad_norm": 0.2618376951089462, + "learning_rate": 2.3763283253939173e-05, + "loss": 0.3999, + "step": 1736 + }, + { + "epoch": 1.7181008902077153, + "grad_norm": 0.29424619057293433, + "learning_rate": 2.37449615243679e-05, + "loss": 0.4073, + "step": 1737 + }, + { + "epoch": 1.7190900098911968, + "grad_norm": 0.3246152750212638, + "learning_rate": 2.372663979479663e-05, + "loss": 0.4299, + "step": 1738 + }, + { + "epoch": 1.7200791295746787, + "grad_norm": 0.31160852845216713, + "learning_rate": 2.3708318065225356e-05, + "loss": 0.3979, + "step": 1739 + }, + { + "epoch": 1.7210682492581602, + "grad_norm": 0.2755114326711097, + "learning_rate": 2.3689996335654086e-05, + "loss": 0.4641, + "step": 1740 + }, + { + "epoch": 1.722057368941642, + "grad_norm": 0.30346825870958033, + "learning_rate": 2.3671674606082816e-05, + "loss": 0.4408, + "step": 1741 + }, + { + "epoch": 1.7230464886251236, + "grad_norm": 0.30129194966583844, + "learning_rate": 2.3653352876511545e-05, + "loss": 0.3837, + "step": 1742 + }, + { + "epoch": 1.7240356083086055, + "grad_norm": 0.2775790052611303, + "learning_rate": 2.3635031146940272e-05, + "loss": 0.4221, + "step": 1743 + }, + { + "epoch": 1.725024727992087, + "grad_norm": 0.265693373028082, + "learning_rate": 2.3616709417369e-05, + "loss": 0.4168, + "step": 1744 + }, + { + "epoch": 1.7260138476755689, + "grad_norm": 0.3022427971745998, + "learning_rate": 2.3598387687797728e-05, + "loss": 0.4021, + "step": 1745 + }, + { + "epoch": 1.7270029673590503, + "grad_norm": 0.308247068877542, + "learning_rate": 2.3580065958226458e-05, + "loss": 0.4473, + "step": 1746 + }, + { + "epoch": 1.7279920870425323, + "grad_norm": 0.29663390988995886, + "learning_rate": 2.3561744228655188e-05, + "loss": 0.4039, + "step": 1747 + }, + { + "epoch": 1.7289812067260137, + "grad_norm": 0.2736150704186472, + "learning_rate": 2.3543422499083914e-05, + "loss": 0.4025, + "step": 1748 + }, + { + "epoch": 1.7299703264094957, + "grad_norm": 2.823911700520109, + "learning_rate": 2.3525100769512644e-05, + "loss": 0.475, + "step": 1749 + }, + { + "epoch": 1.7309594460929771, + "grad_norm": 0.34298388253711626, + "learning_rate": 2.350677903994137e-05, + "loss": 0.4595, + "step": 1750 + }, + { + "epoch": 1.731948565776459, + "grad_norm": 0.30530109475341805, + "learning_rate": 2.34884573103701e-05, + "loss": 0.4368, + "step": 1751 + }, + { + "epoch": 1.7329376854599405, + "grad_norm": 0.32890647032684833, + "learning_rate": 2.347013558079883e-05, + "loss": 0.4635, + "step": 1752 + }, + { + "epoch": 1.7339268051434225, + "grad_norm": 0.2888669372975163, + "learning_rate": 2.345181385122756e-05, + "loss": 0.3912, + "step": 1753 + }, + { + "epoch": 1.734915924826904, + "grad_norm": 0.29633911242582683, + "learning_rate": 2.3433492121656286e-05, + "loss": 0.3674, + "step": 1754 + }, + { + "epoch": 1.7359050445103859, + "grad_norm": 0.26951749995769536, + "learning_rate": 2.3415170392085016e-05, + "loss": 0.4199, + "step": 1755 + }, + { + "epoch": 1.7368941641938673, + "grad_norm": 0.3749803881291908, + "learning_rate": 2.3396848662513742e-05, + "loss": 0.483, + "step": 1756 + }, + { + "epoch": 1.7378832838773492, + "grad_norm": 0.32853671133944534, + "learning_rate": 2.3378526932942472e-05, + "loss": 0.4501, + "step": 1757 + }, + { + "epoch": 1.7388724035608307, + "grad_norm": 0.3443696483010131, + "learning_rate": 2.33602052033712e-05, + "loss": 0.499, + "step": 1758 + }, + { + "epoch": 1.7398615232443126, + "grad_norm": 0.3145778345363139, + "learning_rate": 2.334188347379993e-05, + "loss": 0.3949, + "step": 1759 + }, + { + "epoch": 1.7408506429277941, + "grad_norm": 0.319931054528546, + "learning_rate": 2.3323561744228655e-05, + "loss": 0.3834, + "step": 1760 + }, + { + "epoch": 1.741839762611276, + "grad_norm": 0.2862423758324553, + "learning_rate": 2.3305240014657385e-05, + "loss": 0.4465, + "step": 1761 + }, + { + "epoch": 1.7428288822947575, + "grad_norm": 0.30485977879966464, + "learning_rate": 2.328691828508611e-05, + "loss": 0.401, + "step": 1762 + }, + { + "epoch": 1.7438180019782394, + "grad_norm": 0.3165930256651749, + "learning_rate": 2.326859655551484e-05, + "loss": 0.4678, + "step": 1763 + }, + { + "epoch": 1.744807121661721, + "grad_norm": 0.29246729099227736, + "learning_rate": 2.325027482594357e-05, + "loss": 0.3993, + "step": 1764 + }, + { + "epoch": 1.7457962413452028, + "grad_norm": 0.31561117841951253, + "learning_rate": 2.32319530963723e-05, + "loss": 0.4925, + "step": 1765 + }, + { + "epoch": 1.7467853610286843, + "grad_norm": 0.2888481565602988, + "learning_rate": 2.3213631366801027e-05, + "loss": 0.4381, + "step": 1766 + }, + { + "epoch": 1.7477744807121662, + "grad_norm": 0.2868705156659709, + "learning_rate": 2.3195309637229757e-05, + "loss": 0.3683, + "step": 1767 + }, + { + "epoch": 1.7487636003956477, + "grad_norm": 0.3782601564035021, + "learning_rate": 2.3176987907658483e-05, + "loss": 0.4532, + "step": 1768 + }, + { + "epoch": 1.7497527200791296, + "grad_norm": 0.2704871280008665, + "learning_rate": 2.3158666178087213e-05, + "loss": 0.4201, + "step": 1769 + }, + { + "epoch": 1.7507418397626113, + "grad_norm": 0.3875138252219907, + "learning_rate": 2.3140344448515943e-05, + "loss": 0.4805, + "step": 1770 + }, + { + "epoch": 1.751730959446093, + "grad_norm": 0.5492745343372892, + "learning_rate": 2.312202271894467e-05, + "loss": 0.4156, + "step": 1771 + }, + { + "epoch": 1.7527200791295747, + "grad_norm": 0.3204756187656851, + "learning_rate": 2.31037009893734e-05, + "loss": 0.4988, + "step": 1772 + }, + { + "epoch": 1.7537091988130564, + "grad_norm": 0.35278429265769184, + "learning_rate": 2.3085379259802125e-05, + "loss": 0.4668, + "step": 1773 + }, + { + "epoch": 1.7546983184965381, + "grad_norm": 0.2818358328142832, + "learning_rate": 2.3067057530230855e-05, + "loss": 0.4493, + "step": 1774 + }, + { + "epoch": 1.7556874381800198, + "grad_norm": 0.3770205606309304, + "learning_rate": 2.304873580065958e-05, + "loss": 0.4706, + "step": 1775 + }, + { + "epoch": 1.7566765578635015, + "grad_norm": 0.2887530974485894, + "learning_rate": 2.3030414071088315e-05, + "loss": 0.4208, + "step": 1776 + }, + { + "epoch": 1.7576656775469832, + "grad_norm": 0.38165004798947394, + "learning_rate": 2.301209234151704e-05, + "loss": 0.4299, + "step": 1777 + }, + { + "epoch": 1.758654797230465, + "grad_norm": 0.30821868136973407, + "learning_rate": 2.299377061194577e-05, + "loss": 0.4569, + "step": 1778 + }, + { + "epoch": 1.7596439169139466, + "grad_norm": 0.24167543739660413, + "learning_rate": 2.2975448882374497e-05, + "loss": 0.3777, + "step": 1779 + }, + { + "epoch": 1.7606330365974283, + "grad_norm": 0.40409075639658476, + "learning_rate": 2.2957127152803227e-05, + "loss": 0.4594, + "step": 1780 + }, + { + "epoch": 1.76162215628091, + "grad_norm": 0.3254914242691898, + "learning_rate": 2.2938805423231954e-05, + "loss": 0.4206, + "step": 1781 + }, + { + "epoch": 1.7626112759643917, + "grad_norm": 2.509631711790463, + "learning_rate": 2.2920483693660683e-05, + "loss": 0.4511, + "step": 1782 + }, + { + "epoch": 1.7636003956478734, + "grad_norm": 0.3808689413851784, + "learning_rate": 2.290216196408941e-05, + "loss": 0.4229, + "step": 1783 + }, + { + "epoch": 1.764589515331355, + "grad_norm": 0.318175279111379, + "learning_rate": 2.288384023451814e-05, + "loss": 0.4416, + "step": 1784 + }, + { + "epoch": 1.7655786350148368, + "grad_norm": 0.26964885093490887, + "learning_rate": 2.2865518504946866e-05, + "loss": 0.4192, + "step": 1785 + }, + { + "epoch": 1.7665677546983185, + "grad_norm": 0.2632332792155176, + "learning_rate": 2.2847196775375596e-05, + "loss": 0.4172, + "step": 1786 + }, + { + "epoch": 1.7675568743818002, + "grad_norm": 0.4047127793846293, + "learning_rate": 2.2828875045804322e-05, + "loss": 0.4712, + "step": 1787 + }, + { + "epoch": 1.768545994065282, + "grad_norm": 0.27723739287491106, + "learning_rate": 2.2810553316233055e-05, + "loss": 0.4164, + "step": 1788 + }, + { + "epoch": 1.7695351137487636, + "grad_norm": 0.3030859442151421, + "learning_rate": 2.2792231586661782e-05, + "loss": 0.3961, + "step": 1789 + }, + { + "epoch": 1.7705242334322453, + "grad_norm": 0.8577754720604038, + "learning_rate": 2.277390985709051e-05, + "loss": 0.504, + "step": 1790 + }, + { + "epoch": 1.771513353115727, + "grad_norm": 0.3242162051396364, + "learning_rate": 2.2755588127519238e-05, + "loss": 0.4342, + "step": 1791 + }, + { + "epoch": 1.7725024727992087, + "grad_norm": 0.3408702815957067, + "learning_rate": 2.2737266397947968e-05, + "loss": 0.5057, + "step": 1792 + }, + { + "epoch": 1.7734915924826904, + "grad_norm": 0.3542183072173877, + "learning_rate": 2.2718944668376698e-05, + "loss": 0.4232, + "step": 1793 + }, + { + "epoch": 1.774480712166172, + "grad_norm": 0.4235624797543405, + "learning_rate": 2.2700622938805424e-05, + "loss": 0.437, + "step": 1794 + }, + { + "epoch": 1.7754698318496538, + "grad_norm": 0.2789629789323769, + "learning_rate": 2.2682301209234154e-05, + "loss": 0.4422, + "step": 1795 + }, + { + "epoch": 1.7764589515331355, + "grad_norm": 0.31066306585084447, + "learning_rate": 2.266397947966288e-05, + "loss": 0.4424, + "step": 1796 + }, + { + "epoch": 1.7774480712166172, + "grad_norm": 0.33933834935357965, + "learning_rate": 2.264565775009161e-05, + "loss": 0.4203, + "step": 1797 + }, + { + "epoch": 1.7784371909000989, + "grad_norm": 0.32712238177562997, + "learning_rate": 2.2627336020520337e-05, + "loss": 0.4319, + "step": 1798 + }, + { + "epoch": 1.7794263105835806, + "grad_norm": 0.3040834782849549, + "learning_rate": 2.2609014290949066e-05, + "loss": 0.4453, + "step": 1799 + }, + { + "epoch": 1.7804154302670623, + "grad_norm": 0.30022108803178027, + "learning_rate": 2.2590692561377796e-05, + "loss": 0.4445, + "step": 1800 + }, + { + "epoch": 1.781404549950544, + "grad_norm": 0.3088237766977971, + "learning_rate": 2.2572370831806526e-05, + "loss": 0.425, + "step": 1801 + }, + { + "epoch": 1.7823936696340257, + "grad_norm": 0.25065884893036056, + "learning_rate": 2.2554049102235252e-05, + "loss": 0.4144, + "step": 1802 + }, + { + "epoch": 1.7833827893175074, + "grad_norm": 0.2975123258434565, + "learning_rate": 2.2535727372663982e-05, + "loss": 0.4606, + "step": 1803 + }, + { + "epoch": 1.784371909000989, + "grad_norm": 3.0916522188183118, + "learning_rate": 2.251740564309271e-05, + "loss": 0.5796, + "step": 1804 + }, + { + "epoch": 1.7853610286844708, + "grad_norm": 0.4161541547275752, + "learning_rate": 2.249908391352144e-05, + "loss": 0.4224, + "step": 1805 + }, + { + "epoch": 1.7863501483679525, + "grad_norm": 0.4190616672823314, + "learning_rate": 2.2480762183950165e-05, + "loss": 0.4956, + "step": 1806 + }, + { + "epoch": 1.7873392680514342, + "grad_norm": 0.7468045943261583, + "learning_rate": 2.2462440454378895e-05, + "loss": 0.506, + "step": 1807 + }, + { + "epoch": 1.7883283877349159, + "grad_norm": 0.39878637991681054, + "learning_rate": 2.244411872480762e-05, + "loss": 0.4582, + "step": 1808 + }, + { + "epoch": 1.7893175074183976, + "grad_norm": 0.3939410722134704, + "learning_rate": 2.242579699523635e-05, + "loss": 0.4213, + "step": 1809 + }, + { + "epoch": 1.7903066271018795, + "grad_norm": 0.35893540238509397, + "learning_rate": 2.2407475265665077e-05, + "loss": 0.4452, + "step": 1810 + }, + { + "epoch": 1.791295746785361, + "grad_norm": 0.3179290020697894, + "learning_rate": 2.2389153536093807e-05, + "loss": 0.4307, + "step": 1811 + }, + { + "epoch": 1.7922848664688429, + "grad_norm": 0.37222861438373844, + "learning_rate": 2.2370831806522537e-05, + "loss": 0.4481, + "step": 1812 + }, + { + "epoch": 1.7932739861523244, + "grad_norm": 0.4758358116814864, + "learning_rate": 2.2352510076951267e-05, + "loss": 0.4565, + "step": 1813 + }, + { + "epoch": 1.7942631058358063, + "grad_norm": 0.30087252373022794, + "learning_rate": 2.2334188347379993e-05, + "loss": 0.4506, + "step": 1814 + }, + { + "epoch": 1.7952522255192878, + "grad_norm": 0.38966070557634785, + "learning_rate": 2.2315866617808723e-05, + "loss": 0.4129, + "step": 1815 + }, + { + "epoch": 1.7962413452027697, + "grad_norm": 0.4977116302980233, + "learning_rate": 2.229754488823745e-05, + "loss": 0.4218, + "step": 1816 + }, + { + "epoch": 1.7972304648862512, + "grad_norm": 0.3265049192175431, + "learning_rate": 2.227922315866618e-05, + "loss": 0.418, + "step": 1817 + }, + { + "epoch": 1.798219584569733, + "grad_norm": 0.3832944235691656, + "learning_rate": 2.226090142909491e-05, + "loss": 0.4684, + "step": 1818 + }, + { + "epoch": 1.7992087042532146, + "grad_norm": 0.2958689650940704, + "learning_rate": 2.2242579699523635e-05, + "loss": 0.4074, + "step": 1819 + }, + { + "epoch": 1.8001978239366965, + "grad_norm": 0.37038788365942676, + "learning_rate": 2.2224257969952365e-05, + "loss": 0.4259, + "step": 1820 + }, + { + "epoch": 1.801186943620178, + "grad_norm": 0.3553791418334394, + "learning_rate": 2.220593624038109e-05, + "loss": 0.4669, + "step": 1821 + }, + { + "epoch": 1.8021760633036599, + "grad_norm": 0.29718199661262223, + "learning_rate": 2.218761451080982e-05, + "loss": 0.4221, + "step": 1822 + }, + { + "epoch": 1.8031651829871413, + "grad_norm": 0.3359429382037479, + "learning_rate": 2.2169292781238548e-05, + "loss": 0.397, + "step": 1823 + }, + { + "epoch": 1.8041543026706233, + "grad_norm": 0.3048041433497428, + "learning_rate": 2.215097105166728e-05, + "loss": 0.4191, + "step": 1824 + }, + { + "epoch": 1.8051434223541047, + "grad_norm": 3.3517379572451342, + "learning_rate": 2.2132649322096007e-05, + "loss": 0.4819, + "step": 1825 + }, + { + "epoch": 1.8061325420375867, + "grad_norm": 0.2867123953220445, + "learning_rate": 2.2114327592524737e-05, + "loss": 0.4172, + "step": 1826 + }, + { + "epoch": 1.8071216617210681, + "grad_norm": 0.2871478819593749, + "learning_rate": 2.2096005862953464e-05, + "loss": 0.3971, + "step": 1827 + }, + { + "epoch": 1.80811078140455, + "grad_norm": 0.3245949025237984, + "learning_rate": 2.2077684133382193e-05, + "loss": 0.4968, + "step": 1828 + }, + { + "epoch": 1.8090999010880315, + "grad_norm": 25.78091323351991, + "learning_rate": 2.205936240381092e-05, + "loss": 0.693, + "step": 1829 + }, + { + "epoch": 1.8100890207715135, + "grad_norm": 0.49724123565643946, + "learning_rate": 2.204104067423965e-05, + "loss": 0.4492, + "step": 1830 + }, + { + "epoch": 1.811078140454995, + "grad_norm": 0.3303838283801352, + "learning_rate": 2.2022718944668376e-05, + "loss": 0.4216, + "step": 1831 + }, + { + "epoch": 1.8120672601384769, + "grad_norm": 0.3510098031005022, + "learning_rate": 2.2004397215097106e-05, + "loss": 0.4758, + "step": 1832 + }, + { + "epoch": 1.8130563798219583, + "grad_norm": 0.2829388974092525, + "learning_rate": 2.1986075485525832e-05, + "loss": 0.4332, + "step": 1833 + }, + { + "epoch": 1.8140454995054403, + "grad_norm": 0.29997263346647157, + "learning_rate": 2.1967753755954562e-05, + "loss": 0.4289, + "step": 1834 + }, + { + "epoch": 1.8150346191889217, + "grad_norm": 0.31669510053999383, + "learning_rate": 2.1949432026383292e-05, + "loss": 0.4411, + "step": 1835 + }, + { + "epoch": 1.8160237388724036, + "grad_norm": 0.29684511655355245, + "learning_rate": 2.1931110296812022e-05, + "loss": 0.407, + "step": 1836 + }, + { + "epoch": 1.8170128585558851, + "grad_norm": 0.6958017772881732, + "learning_rate": 2.1912788567240748e-05, + "loss": 0.4468, + "step": 1837 + }, + { + "epoch": 1.818001978239367, + "grad_norm": 0.336206373063671, + "learning_rate": 2.1894466837669478e-05, + "loss": 0.4296, + "step": 1838 + }, + { + "epoch": 1.8189910979228485, + "grad_norm": 0.38504542640419415, + "learning_rate": 2.1876145108098204e-05, + "loss": 0.4059, + "step": 1839 + }, + { + "epoch": 1.8199802176063304, + "grad_norm": 0.32789250753473015, + "learning_rate": 2.1857823378526934e-05, + "loss": 0.4773, + "step": 1840 + }, + { + "epoch": 1.820969337289812, + "grad_norm": 0.3200012365592021, + "learning_rate": 2.1839501648955664e-05, + "loss": 0.3923, + "step": 1841 + }, + { + "epoch": 1.8219584569732938, + "grad_norm": 0.45827677015168833, + "learning_rate": 2.182117991938439e-05, + "loss": 0.4699, + "step": 1842 + }, + { + "epoch": 1.8229475766567753, + "grad_norm": 0.30103049669909526, + "learning_rate": 2.180285818981312e-05, + "loss": 0.4359, + "step": 1843 + }, + { + "epoch": 1.8239366963402572, + "grad_norm": 0.297495772208957, + "learning_rate": 2.1784536460241847e-05, + "loss": 0.4252, + "step": 1844 + }, + { + "epoch": 1.8249258160237387, + "grad_norm": 0.3680654701793383, + "learning_rate": 2.1766214730670576e-05, + "loss": 0.458, + "step": 1845 + }, + { + "epoch": 1.8259149357072206, + "grad_norm": 0.3403708954708426, + "learning_rate": 2.1747893001099303e-05, + "loss": 0.444, + "step": 1846 + }, + { + "epoch": 1.826904055390702, + "grad_norm": 0.2803891604745382, + "learning_rate": 2.1729571271528033e-05, + "loss": 0.4229, + "step": 1847 + }, + { + "epoch": 1.827893175074184, + "grad_norm": 0.2997125783852745, + "learning_rate": 2.1711249541956762e-05, + "loss": 0.4057, + "step": 1848 + }, + { + "epoch": 1.8288822947576657, + "grad_norm": 0.3371508195438753, + "learning_rate": 2.1692927812385492e-05, + "loss": 0.4581, + "step": 1849 + }, + { + "epoch": 1.8298714144411474, + "grad_norm": 0.2822053601448072, + "learning_rate": 2.167460608281422e-05, + "loss": 0.4262, + "step": 1850 + }, + { + "epoch": 1.8308605341246291, + "grad_norm": 0.33119023555410904, + "learning_rate": 2.165628435324295e-05, + "loss": 0.4673, + "step": 1851 + }, + { + "epoch": 1.8318496538081108, + "grad_norm": 0.2982548956831065, + "learning_rate": 2.1637962623671675e-05, + "loss": 0.4265, + "step": 1852 + }, + { + "epoch": 1.8328387734915925, + "grad_norm": 0.2861624181648438, + "learning_rate": 2.1619640894100405e-05, + "loss": 0.4637, + "step": 1853 + }, + { + "epoch": 1.8338278931750742, + "grad_norm": 0.27624749764651596, + "learning_rate": 2.160131916452913e-05, + "loss": 0.378, + "step": 1854 + }, + { + "epoch": 1.834817012858556, + "grad_norm": 0.30172044665288217, + "learning_rate": 2.158299743495786e-05, + "loss": 0.442, + "step": 1855 + }, + { + "epoch": 1.8358061325420376, + "grad_norm": 0.8175953310300249, + "learning_rate": 2.1564675705386587e-05, + "loss": 0.5055, + "step": 1856 + }, + { + "epoch": 1.8367952522255193, + "grad_norm": 0.30746248179532876, + "learning_rate": 2.1546353975815317e-05, + "loss": 0.45, + "step": 1857 + }, + { + "epoch": 1.837784371909001, + "grad_norm": 0.3408014710812103, + "learning_rate": 2.1528032246244047e-05, + "loss": 0.4448, + "step": 1858 + }, + { + "epoch": 1.8387734915924827, + "grad_norm": 0.6207727087629815, + "learning_rate": 2.1509710516672773e-05, + "loss": 0.4727, + "step": 1859 + }, + { + "epoch": 1.8397626112759644, + "grad_norm": 0.4225278421377986, + "learning_rate": 2.1491388787101503e-05, + "loss": 0.4452, + "step": 1860 + }, + { + "epoch": 1.840751730959446, + "grad_norm": 0.4685555138093697, + "learning_rate": 2.1473067057530233e-05, + "loss": 0.4202, + "step": 1861 + }, + { + "epoch": 1.8417408506429278, + "grad_norm": 0.2891269631180874, + "learning_rate": 2.145474532795896e-05, + "loss": 0.4393, + "step": 1862 + }, + { + "epoch": 1.8427299703264095, + "grad_norm": 0.37223238502196465, + "learning_rate": 2.143642359838769e-05, + "loss": 0.3935, + "step": 1863 + }, + { + "epoch": 1.8437190900098912, + "grad_norm": 0.3478084518953676, + "learning_rate": 2.141810186881642e-05, + "loss": 0.4112, + "step": 1864 + }, + { + "epoch": 1.844708209693373, + "grad_norm": 0.2905645567287115, + "learning_rate": 2.1399780139245145e-05, + "loss": 0.4286, + "step": 1865 + }, + { + "epoch": 1.8456973293768546, + "grad_norm": 0.37515837566952076, + "learning_rate": 2.1381458409673875e-05, + "loss": 0.5235, + "step": 1866 + }, + { + "epoch": 1.8466864490603363, + "grad_norm": 0.2951545143923494, + "learning_rate": 2.13631366801026e-05, + "loss": 0.5067, + "step": 1867 + }, + { + "epoch": 1.847675568743818, + "grad_norm": 0.2948346426391745, + "learning_rate": 2.134481495053133e-05, + "loss": 0.3966, + "step": 1868 + }, + { + "epoch": 1.8486646884272997, + "grad_norm": 0.41530734276356635, + "learning_rate": 2.1326493220960058e-05, + "loss": 0.4632, + "step": 1869 + }, + { + "epoch": 1.8496538081107814, + "grad_norm": 0.3147529275156093, + "learning_rate": 2.1308171491388788e-05, + "loss": 0.4547, + "step": 1870 + }, + { + "epoch": 1.850642927794263, + "grad_norm": 0.2927562274139951, + "learning_rate": 2.1289849761817514e-05, + "loss": 0.3962, + "step": 1871 + }, + { + "epoch": 1.8516320474777448, + "grad_norm": 0.3657012517786323, + "learning_rate": 2.1271528032246247e-05, + "loss": 0.4178, + "step": 1872 + }, + { + "epoch": 1.8526211671612265, + "grad_norm": 0.2740740734494427, + "learning_rate": 2.1253206302674974e-05, + "loss": 0.4467, + "step": 1873 + }, + { + "epoch": 1.8536102868447082, + "grad_norm": 0.3075673256037017, + "learning_rate": 2.1234884573103704e-05, + "loss": 0.4186, + "step": 1874 + }, + { + "epoch": 1.8545994065281899, + "grad_norm": 0.2920268194596183, + "learning_rate": 2.121656284353243e-05, + "loss": 0.3896, + "step": 1875 + }, + { + "epoch": 1.8555885262116716, + "grad_norm": 0.27201039572059627, + "learning_rate": 2.119824111396116e-05, + "loss": 0.4085, + "step": 1876 + }, + { + "epoch": 1.8565776458951533, + "grad_norm": 0.353148381804933, + "learning_rate": 2.1179919384389886e-05, + "loss": 0.4426, + "step": 1877 + }, + { + "epoch": 1.857566765578635, + "grad_norm": 8.40808607244907, + "learning_rate": 2.1161597654818616e-05, + "loss": 0.4798, + "step": 1878 + }, + { + "epoch": 1.8585558852621167, + "grad_norm": 0.267897511367695, + "learning_rate": 2.1143275925247342e-05, + "loss": 0.4339, + "step": 1879 + }, + { + "epoch": 1.8595450049455984, + "grad_norm": 0.31654957685044094, + "learning_rate": 2.1124954195676072e-05, + "loss": 0.4733, + "step": 1880 + }, + { + "epoch": 1.86053412462908, + "grad_norm": 0.3027075365016889, + "learning_rate": 2.1106632466104802e-05, + "loss": 0.4363, + "step": 1881 + }, + { + "epoch": 1.8615232443125618, + "grad_norm": 0.31790990073853925, + "learning_rate": 2.108831073653353e-05, + "loss": 0.4105, + "step": 1882 + }, + { + "epoch": 1.8625123639960435, + "grad_norm": 0.2843270321617449, + "learning_rate": 2.1069989006962258e-05, + "loss": 0.4514, + "step": 1883 + }, + { + "epoch": 1.8635014836795252, + "grad_norm": 0.3044166271116939, + "learning_rate": 2.1051667277390988e-05, + "loss": 0.4842, + "step": 1884 + }, + { + "epoch": 1.8644906033630069, + "grad_norm": 0.3358497677903399, + "learning_rate": 2.1033345547819714e-05, + "loss": 0.4455, + "step": 1885 + }, + { + "epoch": 1.8654797230464886, + "grad_norm": 0.30731251689927197, + "learning_rate": 2.1015023818248444e-05, + "loss": 0.4451, + "step": 1886 + }, + { + "epoch": 1.8664688427299705, + "grad_norm": 0.3143692937530264, + "learning_rate": 2.0996702088677174e-05, + "loss": 0.4269, + "step": 1887 + }, + { + "epoch": 1.867457962413452, + "grad_norm": 0.2834940816390246, + "learning_rate": 2.09783803591059e-05, + "loss": 0.412, + "step": 1888 + }, + { + "epoch": 1.8684470820969339, + "grad_norm": 0.2788270677440903, + "learning_rate": 2.096005862953463e-05, + "loss": 0.3973, + "step": 1889 + }, + { + "epoch": 1.8694362017804154, + "grad_norm": 0.3078569868447692, + "learning_rate": 2.0941736899963357e-05, + "loss": 0.4802, + "step": 1890 + }, + { + "epoch": 1.8704253214638973, + "grad_norm": 0.2882271462085658, + "learning_rate": 2.0923415170392087e-05, + "loss": 0.4506, + "step": 1891 + }, + { + "epoch": 1.8714144411473788, + "grad_norm": 0.2768064460294965, + "learning_rate": 2.0905093440820813e-05, + "loss": 0.4525, + "step": 1892 + }, + { + "epoch": 1.8724035608308607, + "grad_norm": 0.30177959635692414, + "learning_rate": 2.0886771711249543e-05, + "loss": 0.4446, + "step": 1893 + }, + { + "epoch": 1.8733926805143422, + "grad_norm": 0.30727441186455273, + "learning_rate": 2.086844998167827e-05, + "loss": 0.4138, + "step": 1894 + }, + { + "epoch": 1.874381800197824, + "grad_norm": 0.3029843395912454, + "learning_rate": 2.0850128252107002e-05, + "loss": 0.467, + "step": 1895 + }, + { + "epoch": 1.8753709198813056, + "grad_norm": 0.28610816219099716, + "learning_rate": 2.083180652253573e-05, + "loss": 0.4103, + "step": 1896 + }, + { + "epoch": 1.8763600395647875, + "grad_norm": 0.26564180151467914, + "learning_rate": 2.081348479296446e-05, + "loss": 0.413, + "step": 1897 + }, + { + "epoch": 1.877349159248269, + "grad_norm": 0.32891431091901296, + "learning_rate": 2.0795163063393185e-05, + "loss": 0.4131, + "step": 1898 + }, + { + "epoch": 1.8783382789317509, + "grad_norm": 0.2655039622432033, + "learning_rate": 2.0776841333821915e-05, + "loss": 0.421, + "step": 1899 + }, + { + "epoch": 1.8793273986152323, + "grad_norm": 0.3009633138938526, + "learning_rate": 2.075851960425064e-05, + "loss": 0.4188, + "step": 1900 + }, + { + "epoch": 1.8803165182987143, + "grad_norm": 0.29124656986978736, + "learning_rate": 2.074019787467937e-05, + "loss": 0.4162, + "step": 1901 + }, + { + "epoch": 1.8813056379821957, + "grad_norm": 0.30884019091654086, + "learning_rate": 2.0721876145108097e-05, + "loss": 0.4565, + "step": 1902 + }, + { + "epoch": 1.8822947576656777, + "grad_norm": 0.30522532353130594, + "learning_rate": 2.0703554415536827e-05, + "loss": 0.5036, + "step": 1903 + }, + { + "epoch": 1.8832838773491591, + "grad_norm": 0.3076309070429283, + "learning_rate": 2.0685232685965557e-05, + "loss": 0.4343, + "step": 1904 + }, + { + "epoch": 1.884272997032641, + "grad_norm": 0.3451728446476686, + "learning_rate": 2.0666910956394283e-05, + "loss": 0.4339, + "step": 1905 + }, + { + "epoch": 1.8852621167161225, + "grad_norm": 0.28959111272819593, + "learning_rate": 2.0648589226823013e-05, + "loss": 0.4249, + "step": 1906 + }, + { + "epoch": 1.8862512363996045, + "grad_norm": 0.30354422997001196, + "learning_rate": 2.0630267497251743e-05, + "loss": 0.4156, + "step": 1907 + }, + { + "epoch": 1.887240356083086, + "grad_norm": 0.3085053722428896, + "learning_rate": 2.061194576768047e-05, + "loss": 0.4465, + "step": 1908 + }, + { + "epoch": 1.8882294757665679, + "grad_norm": 0.3437045799968937, + "learning_rate": 2.05936240381092e-05, + "loss": 0.4523, + "step": 1909 + }, + { + "epoch": 1.8892185954500493, + "grad_norm": 0.28581058747896737, + "learning_rate": 2.057530230853793e-05, + "loss": 0.427, + "step": 1910 + }, + { + "epoch": 1.8902077151335313, + "grad_norm": 0.271508864393249, + "learning_rate": 2.0556980578966656e-05, + "loss": 0.3641, + "step": 1911 + }, + { + "epoch": 1.8911968348170127, + "grad_norm": 0.35199224588959227, + "learning_rate": 2.0538658849395385e-05, + "loss": 0.4209, + "step": 1912 + }, + { + "epoch": 1.8921859545004946, + "grad_norm": 0.2953225794878225, + "learning_rate": 2.0520337119824112e-05, + "loss": 0.3718, + "step": 1913 + }, + { + "epoch": 1.8931750741839761, + "grad_norm": 0.2738162952704808, + "learning_rate": 2.050201539025284e-05, + "loss": 0.3926, + "step": 1914 + }, + { + "epoch": 1.894164193867458, + "grad_norm": 0.26534375180654185, + "learning_rate": 2.0483693660681568e-05, + "loss": 0.3923, + "step": 1915 + }, + { + "epoch": 1.8951533135509395, + "grad_norm": 0.2631897845252683, + "learning_rate": 2.0465371931110298e-05, + "loss": 0.3955, + "step": 1916 + }, + { + "epoch": 1.8961424332344214, + "grad_norm": 0.2942843582306155, + "learning_rate": 2.0447050201539024e-05, + "loss": 0.419, + "step": 1917 + }, + { + "epoch": 1.897131552917903, + "grad_norm": 0.30867929917338793, + "learning_rate": 2.0428728471967754e-05, + "loss": 0.4338, + "step": 1918 + }, + { + "epoch": 1.8981206726013848, + "grad_norm": 0.2990621787095512, + "learning_rate": 2.0410406742396484e-05, + "loss": 0.4383, + "step": 1919 + }, + { + "epoch": 1.8991097922848663, + "grad_norm": 0.27810473804617325, + "learning_rate": 2.0392085012825214e-05, + "loss": 0.4491, + "step": 1920 + }, + { + "epoch": 1.9000989119683482, + "grad_norm": 0.33382076803749, + "learning_rate": 2.037376328325394e-05, + "loss": 0.4381, + "step": 1921 + }, + { + "epoch": 1.9010880316518297, + "grad_norm": 0.3304112930354128, + "learning_rate": 2.035544155368267e-05, + "loss": 0.5064, + "step": 1922 + }, + { + "epoch": 1.9020771513353116, + "grad_norm": 0.25783866601539174, + "learning_rate": 2.0337119824111396e-05, + "loss": 0.3897, + "step": 1923 + }, + { + "epoch": 1.903066271018793, + "grad_norm": 0.27328280868082716, + "learning_rate": 2.0318798094540126e-05, + "loss": 0.4383, + "step": 1924 + }, + { + "epoch": 1.904055390702275, + "grad_norm": 0.33204613407624645, + "learning_rate": 2.0300476364968852e-05, + "loss": 0.5073, + "step": 1925 + }, + { + "epoch": 1.9050445103857567, + "grad_norm": 0.2766857216599759, + "learning_rate": 2.0282154635397582e-05, + "loss": 0.4149, + "step": 1926 + }, + { + "epoch": 1.9060336300692384, + "grad_norm": 0.266461589601163, + "learning_rate": 2.0263832905826312e-05, + "loss": 0.4314, + "step": 1927 + }, + { + "epoch": 1.9070227497527201, + "grad_norm": 0.2512857349198211, + "learning_rate": 2.024551117625504e-05, + "loss": 0.4336, + "step": 1928 + }, + { + "epoch": 1.9080118694362018, + "grad_norm": 0.33637530648765923, + "learning_rate": 2.022718944668377e-05, + "loss": 0.4225, + "step": 1929 + }, + { + "epoch": 1.9090009891196835, + "grad_norm": 0.28208217764150173, + "learning_rate": 2.0208867717112495e-05, + "loss": 0.4725, + "step": 1930 + }, + { + "epoch": 1.9099901088031652, + "grad_norm": 0.26065398592400135, + "learning_rate": 2.0190545987541225e-05, + "loss": 0.4567, + "step": 1931 + }, + { + "epoch": 1.910979228486647, + "grad_norm": 0.28479843394920357, + "learning_rate": 2.0172224257969954e-05, + "loss": 0.4287, + "step": 1932 + }, + { + "epoch": 1.9119683481701286, + "grad_norm": 3.294579825793158, + "learning_rate": 2.0153902528398684e-05, + "loss": 0.5882, + "step": 1933 + }, + { + "epoch": 1.9129574678536103, + "grad_norm": 0.28936302581206536, + "learning_rate": 2.013558079882741e-05, + "loss": 0.5019, + "step": 1934 + }, + { + "epoch": 1.913946587537092, + "grad_norm": 0.28047234749246863, + "learning_rate": 2.011725906925614e-05, + "loss": 0.4326, + "step": 1935 + }, + { + "epoch": 1.9149357072205737, + "grad_norm": 0.3100419020654506, + "learning_rate": 2.0098937339684867e-05, + "loss": 0.4336, + "step": 1936 + }, + { + "epoch": 1.9159248269040554, + "grad_norm": 0.30316063562527656, + "learning_rate": 2.0080615610113597e-05, + "loss": 0.4482, + "step": 1937 + }, + { + "epoch": 1.916913946587537, + "grad_norm": 0.3203458184728088, + "learning_rate": 2.0062293880542323e-05, + "loss": 0.4761, + "step": 1938 + }, + { + "epoch": 1.9179030662710188, + "grad_norm": 0.319115811593246, + "learning_rate": 2.0043972150971053e-05, + "loss": 0.4387, + "step": 1939 + }, + { + "epoch": 1.9188921859545005, + "grad_norm": 0.29266033140236186, + "learning_rate": 2.002565042139978e-05, + "loss": 0.4128, + "step": 1940 + }, + { + "epoch": 1.9198813056379822, + "grad_norm": 0.2934869021796227, + "learning_rate": 2.000732869182851e-05, + "loss": 0.4266, + "step": 1941 + }, + { + "epoch": 1.920870425321464, + "grad_norm": 0.2954310231819758, + "learning_rate": 1.9989006962257235e-05, + "loss": 0.4862, + "step": 1942 + }, + { + "epoch": 1.9218595450049456, + "grad_norm": 0.2989192690361351, + "learning_rate": 1.997068523268597e-05, + "loss": 0.4284, + "step": 1943 + }, + { + "epoch": 1.9228486646884273, + "grad_norm": 0.32532736480103575, + "learning_rate": 1.9952363503114695e-05, + "loss": 0.5123, + "step": 1944 + }, + { + "epoch": 1.923837784371909, + "grad_norm": 0.279809147331516, + "learning_rate": 1.9934041773543425e-05, + "loss": 0.4593, + "step": 1945 + }, + { + "epoch": 1.9248269040553907, + "grad_norm": 0.3043919655682651, + "learning_rate": 1.991572004397215e-05, + "loss": 0.4417, + "step": 1946 + }, + { + "epoch": 1.9258160237388724, + "grad_norm": 0.2795337887573538, + "learning_rate": 1.989739831440088e-05, + "loss": 0.4287, + "step": 1947 + }, + { + "epoch": 1.926805143422354, + "grad_norm": 0.26009593666483405, + "learning_rate": 1.9879076584829608e-05, + "loss": 0.4479, + "step": 1948 + }, + { + "epoch": 1.9277942631058358, + "grad_norm": 0.25306492114507956, + "learning_rate": 1.9860754855258337e-05, + "loss": 0.4074, + "step": 1949 + }, + { + "epoch": 1.9287833827893175, + "grad_norm": 0.2706763584509426, + "learning_rate": 1.9842433125687067e-05, + "loss": 0.4687, + "step": 1950 + }, + { + "epoch": 1.9297725024727992, + "grad_norm": 0.2546219655402372, + "learning_rate": 1.9824111396115794e-05, + "loss": 0.3809, + "step": 1951 + }, + { + "epoch": 1.9307616221562809, + "grad_norm": 0.2917574146301412, + "learning_rate": 1.9805789666544523e-05, + "loss": 0.4002, + "step": 1952 + }, + { + "epoch": 1.9317507418397626, + "grad_norm": 0.914811674910379, + "learning_rate": 1.978746793697325e-05, + "loss": 0.4656, + "step": 1953 + }, + { + "epoch": 1.9327398615232443, + "grad_norm": 0.2783013811561288, + "learning_rate": 1.976914620740198e-05, + "loss": 0.4792, + "step": 1954 + }, + { + "epoch": 1.933728981206726, + "grad_norm": 0.28602656481136324, + "learning_rate": 1.975082447783071e-05, + "loss": 0.4541, + "step": 1955 + }, + { + "epoch": 1.9347181008902077, + "grad_norm": 0.2592620006394712, + "learning_rate": 1.973250274825944e-05, + "loss": 0.4348, + "step": 1956 + }, + { + "epoch": 1.9357072205736894, + "grad_norm": 0.2773762643584451, + "learning_rate": 1.9714181018688166e-05, + "loss": 0.4063, + "step": 1957 + }, + { + "epoch": 1.936696340257171, + "grad_norm": 0.28599821633612305, + "learning_rate": 1.9695859289116895e-05, + "loss": 0.4824, + "step": 1958 + }, + { + "epoch": 1.9376854599406528, + "grad_norm": 0.26672204401496125, + "learning_rate": 1.9677537559545622e-05, + "loss": 0.4232, + "step": 1959 + }, + { + "epoch": 1.9386745796241345, + "grad_norm": 0.2892625352799071, + "learning_rate": 1.965921582997435e-05, + "loss": 0.4796, + "step": 1960 + }, + { + "epoch": 1.9396636993076162, + "grad_norm": 0.2817788199429448, + "learning_rate": 1.9640894100403078e-05, + "loss": 0.402, + "step": 1961 + }, + { + "epoch": 1.9406528189910979, + "grad_norm": 0.2653710321950093, + "learning_rate": 1.9622572370831808e-05, + "loss": 0.4031, + "step": 1962 + }, + { + "epoch": 1.9416419386745796, + "grad_norm": 0.313468529227229, + "learning_rate": 1.9604250641260534e-05, + "loss": 0.469, + "step": 1963 + }, + { + "epoch": 1.9426310583580615, + "grad_norm": 0.30116180498202555, + "learning_rate": 1.9585928911689264e-05, + "loss": 0.4781, + "step": 1964 + }, + { + "epoch": 1.943620178041543, + "grad_norm": 0.36835947832919974, + "learning_rate": 1.956760718211799e-05, + "loss": 0.3505, + "step": 1965 + }, + { + "epoch": 1.9446092977250249, + "grad_norm": 0.2916022025393715, + "learning_rate": 1.954928545254672e-05, + "loss": 0.4331, + "step": 1966 + }, + { + "epoch": 1.9455984174085064, + "grad_norm": 0.24576757883657926, + "learning_rate": 1.953096372297545e-05, + "loss": 0.3918, + "step": 1967 + }, + { + "epoch": 1.9465875370919883, + "grad_norm": 0.30131909213793245, + "learning_rate": 1.951264199340418e-05, + "loss": 0.461, + "step": 1968 + }, + { + "epoch": 1.9475766567754698, + "grad_norm": 0.3339260654475112, + "learning_rate": 1.9494320263832906e-05, + "loss": 0.436, + "step": 1969 + }, + { + "epoch": 1.9485657764589517, + "grad_norm": 0.27524157139256206, + "learning_rate": 1.9475998534261636e-05, + "loss": 0.4545, + "step": 1970 + }, + { + "epoch": 1.9495548961424332, + "grad_norm": 0.2874593210645949, + "learning_rate": 1.9457676804690363e-05, + "loss": 0.4127, + "step": 1971 + }, + { + "epoch": 1.950544015825915, + "grad_norm": 0.31215326151701583, + "learning_rate": 1.9439355075119092e-05, + "loss": 0.3706, + "step": 1972 + }, + { + "epoch": 1.9515331355093966, + "grad_norm": 0.2780337765875761, + "learning_rate": 1.9421033345547822e-05, + "loss": 0.3934, + "step": 1973 + }, + { + "epoch": 1.9525222551928785, + "grad_norm": 0.3453992054984105, + "learning_rate": 1.940271161597655e-05, + "loss": 0.4671, + "step": 1974 + }, + { + "epoch": 1.95351137487636, + "grad_norm": 0.3321376938331568, + "learning_rate": 1.938438988640528e-05, + "loss": 0.4228, + "step": 1975 + }, + { + "epoch": 1.9545004945598419, + "grad_norm": 0.2928057230254324, + "learning_rate": 1.9366068156834005e-05, + "loss": 0.469, + "step": 1976 + }, + { + "epoch": 1.9554896142433233, + "grad_norm": 0.30394150855107643, + "learning_rate": 1.9347746427262735e-05, + "loss": 0.4653, + "step": 1977 + }, + { + "epoch": 1.9564787339268053, + "grad_norm": 0.2894909046020456, + "learning_rate": 1.932942469769146e-05, + "loss": 0.3932, + "step": 1978 + }, + { + "epoch": 1.9574678536102867, + "grad_norm": 0.2711875527085813, + "learning_rate": 1.9311102968120194e-05, + "loss": 0.3798, + "step": 1979 + }, + { + "epoch": 1.9584569732937687, + "grad_norm": 0.2725315165989554, + "learning_rate": 1.929278123854892e-05, + "loss": 0.3549, + "step": 1980 + }, + { + "epoch": 1.9594460929772501, + "grad_norm": 0.26597443326290904, + "learning_rate": 1.927445950897765e-05, + "loss": 0.3765, + "step": 1981 + }, + { + "epoch": 1.960435212660732, + "grad_norm": 0.27613823878296645, + "learning_rate": 1.9256137779406377e-05, + "loss": 0.3882, + "step": 1982 + }, + { + "epoch": 1.9614243323442135, + "grad_norm": 0.2851650579144139, + "learning_rate": 1.9237816049835107e-05, + "loss": 0.4073, + "step": 1983 + }, + { + "epoch": 1.9624134520276955, + "grad_norm": 0.2855817930760619, + "learning_rate": 1.9219494320263833e-05, + "loss": 0.4373, + "step": 1984 + }, + { + "epoch": 1.963402571711177, + "grad_norm": 0.2727240403962694, + "learning_rate": 1.9201172590692563e-05, + "loss": 0.4231, + "step": 1985 + }, + { + "epoch": 1.9643916913946589, + "grad_norm": 0.26875071309544885, + "learning_rate": 1.918285086112129e-05, + "loss": 0.4155, + "step": 1986 + }, + { + "epoch": 1.9653808110781403, + "grad_norm": 0.24915569601701976, + "learning_rate": 1.916452913155002e-05, + "loss": 0.3926, + "step": 1987 + }, + { + "epoch": 1.9663699307616223, + "grad_norm": 1.6073228027788682, + "learning_rate": 1.9146207401978746e-05, + "loss": 0.4279, + "step": 1988 + }, + { + "epoch": 1.9673590504451037, + "grad_norm": 0.2792233401337717, + "learning_rate": 1.9127885672407475e-05, + "loss": 0.4021, + "step": 1989 + }, + { + "epoch": 1.9683481701285857, + "grad_norm": 0.25494389574541926, + "learning_rate": 1.9109563942836202e-05, + "loss": 0.4267, + "step": 1990 + }, + { + "epoch": 1.9693372898120671, + "grad_norm": 0.2898106866186205, + "learning_rate": 1.9091242213264935e-05, + "loss": 0.4323, + "step": 1991 + }, + { + "epoch": 1.970326409495549, + "grad_norm": 0.2728511736986319, + "learning_rate": 1.907292048369366e-05, + "loss": 0.3848, + "step": 1992 + }, + { + "epoch": 1.9713155291790305, + "grad_norm": 0.24292208041411942, + "learning_rate": 1.905459875412239e-05, + "loss": 0.4054, + "step": 1993 + }, + { + "epoch": 1.9723046488625124, + "grad_norm": 0.25921473245660914, + "learning_rate": 1.9036277024551118e-05, + "loss": 0.393, + "step": 1994 + }, + { + "epoch": 1.973293768545994, + "grad_norm": 0.27928092261140336, + "learning_rate": 1.9017955294979847e-05, + "loss": 0.4445, + "step": 1995 + }, + { + "epoch": 1.9742828882294758, + "grad_norm": 0.27406269020220453, + "learning_rate": 1.8999633565408577e-05, + "loss": 0.454, + "step": 1996 + }, + { + "epoch": 1.9752720079129573, + "grad_norm": 0.2614651016454685, + "learning_rate": 1.8981311835837304e-05, + "loss": 0.3824, + "step": 1997 + }, + { + "epoch": 1.9762611275964392, + "grad_norm": 0.27737401784240373, + "learning_rate": 1.8962990106266033e-05, + "loss": 0.4434, + "step": 1998 + }, + { + "epoch": 1.9772502472799207, + "grad_norm": 0.2604415643950939, + "learning_rate": 1.894466837669476e-05, + "loss": 0.4093, + "step": 1999 + }, + { + "epoch": 1.9782393669634026, + "grad_norm": 0.24222620275410053, + "learning_rate": 1.892634664712349e-05, + "loss": 0.4325, + "step": 2000 + }, + { + "epoch": 1.979228486646884, + "grad_norm": 0.23224541289636652, + "learning_rate": 1.8908024917552216e-05, + "loss": 0.3769, + "step": 2001 + }, + { + "epoch": 1.980217606330366, + "grad_norm": 0.2723348778036034, + "learning_rate": 1.8889703187980946e-05, + "loss": 0.3764, + "step": 2002 + }, + { + "epoch": 1.9812067260138477, + "grad_norm": 0.31673784091219626, + "learning_rate": 1.8871381458409676e-05, + "loss": 0.4378, + "step": 2003 + }, + { + "epoch": 1.9821958456973294, + "grad_norm": 0.2928301587474028, + "learning_rate": 1.8853059728838405e-05, + "loss": 0.4119, + "step": 2004 + }, + { + "epoch": 1.9831849653808111, + "grad_norm": 0.24968201113913796, + "learning_rate": 1.8834737999267132e-05, + "loss": 0.4273, + "step": 2005 + }, + { + "epoch": 1.9841740850642928, + "grad_norm": 0.3322683975213439, + "learning_rate": 1.8816416269695862e-05, + "loss": 0.4857, + "step": 2006 + }, + { + "epoch": 1.9851632047477745, + "grad_norm": 0.2951583445750594, + "learning_rate": 1.8798094540124588e-05, + "loss": 0.4307, + "step": 2007 + }, + { + "epoch": 1.9861523244312562, + "grad_norm": 0.33937998935052427, + "learning_rate": 1.8779772810553318e-05, + "loss": 0.4664, + "step": 2008 + }, + { + "epoch": 1.987141444114738, + "grad_norm": 0.2923057755122759, + "learning_rate": 1.8761451080982044e-05, + "loss": 0.5091, + "step": 2009 + }, + { + "epoch": 1.9881305637982196, + "grad_norm": 0.2788462746112801, + "learning_rate": 1.8743129351410774e-05, + "loss": 0.386, + "step": 2010 + }, + { + "epoch": 1.9891196834817013, + "grad_norm": 0.2516597878121542, + "learning_rate": 1.87248076218395e-05, + "loss": 0.3403, + "step": 2011 + }, + { + "epoch": 1.990108803165183, + "grad_norm": 0.30376516700631484, + "learning_rate": 1.870648589226823e-05, + "loss": 0.4669, + "step": 2012 + }, + { + "epoch": 1.9910979228486647, + "grad_norm": 0.28097309920042796, + "learning_rate": 1.8688164162696957e-05, + "loss": 0.4018, + "step": 2013 + }, + { + "epoch": 1.9920870425321464, + "grad_norm": 0.27334460203651567, + "learning_rate": 1.8669842433125687e-05, + "loss": 0.4525, + "step": 2014 + }, + { + "epoch": 1.993076162215628, + "grad_norm": 0.29672124438553915, + "learning_rate": 1.8651520703554416e-05, + "loss": 0.454, + "step": 2015 + }, + { + "epoch": 1.9940652818991098, + "grad_norm": 0.2888799406442104, + "learning_rate": 1.8633198973983146e-05, + "loss": 0.4324, + "step": 2016 + }, + { + "epoch": 1.9950544015825915, + "grad_norm": 0.25475149445910006, + "learning_rate": 1.8614877244411873e-05, + "loss": 0.4482, + "step": 2017 + }, + { + "epoch": 1.9960435212660732, + "grad_norm": 0.29167348715567726, + "learning_rate": 1.8596555514840602e-05, + "loss": 0.399, + "step": 2018 + }, + { + "epoch": 1.997032640949555, + "grad_norm": 0.2692693993748144, + "learning_rate": 1.8578233785269332e-05, + "loss": 0.4011, + "step": 2019 + }, + { + "epoch": 1.9980217606330366, + "grad_norm": 0.2589196204328078, + "learning_rate": 1.855991205569806e-05, + "loss": 0.3859, + "step": 2020 + }, + { + "epoch": 1.9990108803165183, + "grad_norm": 0.6296008364535154, + "learning_rate": 1.854159032612679e-05, + "loss": 0.4505, + "step": 2021 + }, + { + "epoch": 2.0, + "grad_norm": 0.3007094377136532, + "learning_rate": 1.8523268596555515e-05, + "loss": 0.3938, + "step": 2022 + }, + { + "epoch": 2.000989119683482, + "grad_norm": 0.33376807762869504, + "learning_rate": 1.8504946866984245e-05, + "loss": 0.362, + "step": 2023 + }, + { + "epoch": 2.0019782393669634, + "grad_norm": 0.2601375793910521, + "learning_rate": 1.848662513741297e-05, + "loss": 0.3527, + "step": 2024 + }, + { + "epoch": 2.0029673590504453, + "grad_norm": 0.2808677816345753, + "learning_rate": 1.84683034078417e-05, + "loss": 0.3309, + "step": 2025 + }, + { + "epoch": 2.003956478733927, + "grad_norm": 0.3021748972184023, + "learning_rate": 1.8449981678270427e-05, + "loss": 0.3377, + "step": 2026 + }, + { + "epoch": 2.0049455984174087, + "grad_norm": 0.25396738481241404, + "learning_rate": 1.843165994869916e-05, + "loss": 0.3214, + "step": 2027 + }, + { + "epoch": 2.00593471810089, + "grad_norm": 0.2522846151099154, + "learning_rate": 1.8413338219127887e-05, + "loss": 0.33, + "step": 2028 + }, + { + "epoch": 2.006923837784372, + "grad_norm": 0.2888004385011141, + "learning_rate": 1.8395016489556617e-05, + "loss": 0.3479, + "step": 2029 + }, + { + "epoch": 2.0079129574678536, + "grad_norm": 0.2587138992354896, + "learning_rate": 1.8376694759985343e-05, + "loss": 0.2811, + "step": 2030 + }, + { + "epoch": 2.0089020771513355, + "grad_norm": 0.3155016932703736, + "learning_rate": 1.8358373030414073e-05, + "loss": 0.3416, + "step": 2031 + }, + { + "epoch": 2.009891196834817, + "grad_norm": 0.2857913643432993, + "learning_rate": 1.83400513008428e-05, + "loss": 0.3571, + "step": 2032 + }, + { + "epoch": 2.010880316518299, + "grad_norm": 0.2678200050894992, + "learning_rate": 1.832172957127153e-05, + "loss": 0.3581, + "step": 2033 + }, + { + "epoch": 2.0118694362017804, + "grad_norm": 0.25644421751373253, + "learning_rate": 1.8303407841700256e-05, + "loss": 0.313, + "step": 2034 + }, + { + "epoch": 2.0128585558852623, + "grad_norm": 0.4465486558136206, + "learning_rate": 1.8285086112128985e-05, + "loss": 0.4098, + "step": 2035 + }, + { + "epoch": 2.013847675568744, + "grad_norm": 0.2833668117731138, + "learning_rate": 1.8266764382557712e-05, + "loss": 0.3211, + "step": 2036 + }, + { + "epoch": 2.0148367952522257, + "grad_norm": 0.23426073423687663, + "learning_rate": 1.824844265298644e-05, + "loss": 0.3165, + "step": 2037 + }, + { + "epoch": 2.015825914935707, + "grad_norm": 0.32693965911004064, + "learning_rate": 1.823012092341517e-05, + "loss": 0.4052, + "step": 2038 + }, + { + "epoch": 2.016815034619189, + "grad_norm": 0.3380208112798089, + "learning_rate": 1.82117991938439e-05, + "loss": 0.3321, + "step": 2039 + }, + { + "epoch": 2.0178041543026706, + "grad_norm": 0.24941143626683626, + "learning_rate": 1.8193477464272628e-05, + "loss": 0.3336, + "step": 2040 + }, + { + "epoch": 2.0187932739861525, + "grad_norm": 0.277234094183247, + "learning_rate": 1.8175155734701357e-05, + "loss": 0.353, + "step": 2041 + }, + { + "epoch": 2.019782393669634, + "grad_norm": 0.2726781199448609, + "learning_rate": 1.8156834005130087e-05, + "loss": 0.331, + "step": 2042 + }, + { + "epoch": 2.020771513353116, + "grad_norm": 0.2617540569925556, + "learning_rate": 1.8138512275558814e-05, + "loss": 0.3276, + "step": 2043 + }, + { + "epoch": 2.0217606330365974, + "grad_norm": 0.23504401210710754, + "learning_rate": 1.8120190545987544e-05, + "loss": 0.3109, + "step": 2044 + }, + { + "epoch": 2.0227497527200793, + "grad_norm": 0.2407046086285793, + "learning_rate": 1.810186881641627e-05, + "loss": 0.3374, + "step": 2045 + }, + { + "epoch": 2.0237388724035608, + "grad_norm": 0.2641964761821478, + "learning_rate": 1.8083547086845e-05, + "loss": 0.376, + "step": 2046 + }, + { + "epoch": 2.0247279920870427, + "grad_norm": 0.2882718123838396, + "learning_rate": 1.8065225357273726e-05, + "loss": 0.3328, + "step": 2047 + }, + { + "epoch": 2.025717111770524, + "grad_norm": 0.2786807025896711, + "learning_rate": 1.8046903627702456e-05, + "loss": 0.3459, + "step": 2048 + }, + { + "epoch": 2.026706231454006, + "grad_norm": 0.2539432049765403, + "learning_rate": 1.8028581898131182e-05, + "loss": 0.3647, + "step": 2049 + }, + { + "epoch": 2.0276953511374876, + "grad_norm": 0.2556731150500688, + "learning_rate": 1.8010260168559916e-05, + "loss": 0.3777, + "step": 2050 + }, + { + "epoch": 2.0286844708209695, + "grad_norm": 0.25486739832116906, + "learning_rate": 1.7991938438988642e-05, + "loss": 0.3138, + "step": 2051 + }, + { + "epoch": 2.029673590504451, + "grad_norm": 0.2774415544605617, + "learning_rate": 1.7973616709417372e-05, + "loss": 0.3343, + "step": 2052 + }, + { + "epoch": 2.030662710187933, + "grad_norm": 0.25646240486523647, + "learning_rate": 1.7955294979846098e-05, + "loss": 0.3676, + "step": 2053 + }, + { + "epoch": 2.0316518298714143, + "grad_norm": 0.23627554159821249, + "learning_rate": 1.7936973250274828e-05, + "loss": 0.297, + "step": 2054 + }, + { + "epoch": 2.0326409495548963, + "grad_norm": 0.9750055399356007, + "learning_rate": 1.7918651520703554e-05, + "loss": 0.3385, + "step": 2055 + }, + { + "epoch": 2.0336300692383777, + "grad_norm": 0.3259195566861495, + "learning_rate": 1.7900329791132284e-05, + "loss": 0.3283, + "step": 2056 + }, + { + "epoch": 2.0346191889218597, + "grad_norm": 0.34994469632467384, + "learning_rate": 1.788200806156101e-05, + "loss": 0.3335, + "step": 2057 + }, + { + "epoch": 2.035608308605341, + "grad_norm": 0.3874809670228957, + "learning_rate": 1.786368633198974e-05, + "loss": 0.3423, + "step": 2058 + }, + { + "epoch": 2.036597428288823, + "grad_norm": 0.4397999002338133, + "learning_rate": 1.7845364602418467e-05, + "loss": 0.3911, + "step": 2059 + }, + { + "epoch": 2.0375865479723045, + "grad_norm": 0.2965068166961372, + "learning_rate": 1.7827042872847197e-05, + "loss": 0.368, + "step": 2060 + }, + { + "epoch": 2.0385756676557865, + "grad_norm": 0.33913465022866834, + "learning_rate": 1.7808721143275926e-05, + "loss": 0.3485, + "step": 2061 + }, + { + "epoch": 2.039564787339268, + "grad_norm": 0.41953455881087176, + "learning_rate": 1.7790399413704656e-05, + "loss": 0.3337, + "step": 2062 + }, + { + "epoch": 2.04055390702275, + "grad_norm": 0.24963906693283444, + "learning_rate": 1.7772077684133383e-05, + "loss": 0.3388, + "step": 2063 + }, + { + "epoch": 2.0415430267062313, + "grad_norm": 0.24427178402898375, + "learning_rate": 1.7753755954562113e-05, + "loss": 0.3758, + "step": 2064 + }, + { + "epoch": 2.0425321463897133, + "grad_norm": 0.3391522202251561, + "learning_rate": 1.773543422499084e-05, + "loss": 0.3954, + "step": 2065 + }, + { + "epoch": 2.0435212660731947, + "grad_norm": 0.305105117952119, + "learning_rate": 1.771711249541957e-05, + "loss": 0.3189, + "step": 2066 + }, + { + "epoch": 2.0445103857566767, + "grad_norm": 0.2872800239846866, + "learning_rate": 1.76987907658483e-05, + "loss": 0.3776, + "step": 2067 + }, + { + "epoch": 2.045499505440158, + "grad_norm": 0.2504019307251769, + "learning_rate": 1.7680469036277025e-05, + "loss": 0.356, + "step": 2068 + }, + { + "epoch": 2.04648862512364, + "grad_norm": 0.25901454727456, + "learning_rate": 1.7662147306705755e-05, + "loss": 0.3776, + "step": 2069 + }, + { + "epoch": 2.0474777448071215, + "grad_norm": 0.29939939557269285, + "learning_rate": 1.764382557713448e-05, + "loss": 0.3126, + "step": 2070 + }, + { + "epoch": 2.0484668644906034, + "grad_norm": 0.3231004145272702, + "learning_rate": 1.762550384756321e-05, + "loss": 0.3393, + "step": 2071 + }, + { + "epoch": 2.049455984174085, + "grad_norm": 0.2476325861585833, + "learning_rate": 1.7607182117991937e-05, + "loss": 0.3463, + "step": 2072 + }, + { + "epoch": 2.050445103857567, + "grad_norm": 0.2979710367960843, + "learning_rate": 1.7588860388420667e-05, + "loss": 0.3726, + "step": 2073 + }, + { + "epoch": 2.0514342235410483, + "grad_norm": 0.2762787234379862, + "learning_rate": 1.7570538658849397e-05, + "loss": 0.3372, + "step": 2074 + }, + { + "epoch": 2.0524233432245302, + "grad_norm": 0.2525936147732162, + "learning_rate": 1.7552216929278127e-05, + "loss": 0.3224, + "step": 2075 + }, + { + "epoch": 2.0534124629080117, + "grad_norm": 0.2584635923287862, + "learning_rate": 1.7533895199706853e-05, + "loss": 0.3592, + "step": 2076 + }, + { + "epoch": 2.0544015825914936, + "grad_norm": 0.24758511782034934, + "learning_rate": 1.7515573470135583e-05, + "loss": 0.3228, + "step": 2077 + }, + { + "epoch": 2.055390702274975, + "grad_norm": 0.2883921683823937, + "learning_rate": 1.749725174056431e-05, + "loss": 0.3278, + "step": 2078 + }, + { + "epoch": 2.056379821958457, + "grad_norm": 0.3758824821850759, + "learning_rate": 1.747893001099304e-05, + "loss": 0.3069, + "step": 2079 + }, + { + "epoch": 2.0573689416419385, + "grad_norm": 0.26898488904546836, + "learning_rate": 1.7460608281421766e-05, + "loss": 0.3374, + "step": 2080 + }, + { + "epoch": 2.0583580613254204, + "grad_norm": 0.22348922505676247, + "learning_rate": 1.7442286551850495e-05, + "loss": 0.297, + "step": 2081 + }, + { + "epoch": 2.059347181008902, + "grad_norm": 0.2431620389642761, + "learning_rate": 1.7423964822279222e-05, + "loss": 0.3452, + "step": 2082 + }, + { + "epoch": 2.060336300692384, + "grad_norm": 0.2715649731140133, + "learning_rate": 1.7405643092707952e-05, + "loss": 0.3588, + "step": 2083 + }, + { + "epoch": 2.0613254203758653, + "grad_norm": 0.24583847764861538, + "learning_rate": 1.738732136313668e-05, + "loss": 0.3515, + "step": 2084 + }, + { + "epoch": 2.0623145400593472, + "grad_norm": 6.147132786755756, + "learning_rate": 1.7368999633565408e-05, + "loss": 0.6556, + "step": 2085 + }, + { + "epoch": 2.0633036597428287, + "grad_norm": 0.2739117127003737, + "learning_rate": 1.7350677903994138e-05, + "loss": 0.4016, + "step": 2086 + }, + { + "epoch": 2.0642927794263106, + "grad_norm": 0.2735119068979336, + "learning_rate": 1.7332356174422868e-05, + "loss": 0.3623, + "step": 2087 + }, + { + "epoch": 2.065281899109792, + "grad_norm": 0.2384520866225091, + "learning_rate": 1.7314034444851594e-05, + "loss": 0.3101, + "step": 2088 + }, + { + "epoch": 2.066271018793274, + "grad_norm": 0.2396852378862046, + "learning_rate": 1.7295712715280324e-05, + "loss": 0.3292, + "step": 2089 + }, + { + "epoch": 2.0672601384767555, + "grad_norm": 0.9802498145057038, + "learning_rate": 1.7277390985709054e-05, + "loss": 0.4025, + "step": 2090 + }, + { + "epoch": 2.0682492581602374, + "grad_norm": 0.26601631784836316, + "learning_rate": 1.725906925613778e-05, + "loss": 0.3529, + "step": 2091 + }, + { + "epoch": 2.069238377843719, + "grad_norm": 0.2914785211803027, + "learning_rate": 1.724074752656651e-05, + "loss": 0.3318, + "step": 2092 + }, + { + "epoch": 2.070227497527201, + "grad_norm": 0.27346245434038796, + "learning_rate": 1.7222425796995236e-05, + "loss": 0.3645, + "step": 2093 + }, + { + "epoch": 2.0712166172106823, + "grad_norm": 0.24874170019953046, + "learning_rate": 1.7204104067423966e-05, + "loss": 0.3408, + "step": 2094 + }, + { + "epoch": 2.072205736894164, + "grad_norm": 0.297906315031933, + "learning_rate": 1.7185782337852692e-05, + "loss": 0.3444, + "step": 2095 + }, + { + "epoch": 2.0731948565776457, + "grad_norm": 0.2744309054019997, + "learning_rate": 1.7167460608281422e-05, + "loss": 0.3133, + "step": 2096 + }, + { + "epoch": 2.0741839762611276, + "grad_norm": 0.24571474785996675, + "learning_rate": 1.714913887871015e-05, + "loss": 0.3276, + "step": 2097 + }, + { + "epoch": 2.075173095944609, + "grad_norm": 0.26936390193781384, + "learning_rate": 1.7130817149138882e-05, + "loss": 0.3773, + "step": 2098 + }, + { + "epoch": 2.076162215628091, + "grad_norm": 0.27132705449446876, + "learning_rate": 1.7112495419567608e-05, + "loss": 0.3354, + "step": 2099 + }, + { + "epoch": 2.077151335311573, + "grad_norm": 0.26429280776805036, + "learning_rate": 1.7094173689996338e-05, + "loss": 0.3582, + "step": 2100 + }, + { + "epoch": 2.0781404549950544, + "grad_norm": 0.2312406462125731, + "learning_rate": 1.7075851960425065e-05, + "loss": 0.3097, + "step": 2101 + }, + { + "epoch": 2.079129574678536, + "grad_norm": 0.24469123822007616, + "learning_rate": 1.7057530230853794e-05, + "loss": 0.3485, + "step": 2102 + }, + { + "epoch": 2.080118694362018, + "grad_norm": 0.2319289399183783, + "learning_rate": 1.703920850128252e-05, + "loss": 0.3115, + "step": 2103 + }, + { + "epoch": 2.0811078140454997, + "grad_norm": 0.2528130883133997, + "learning_rate": 1.702088677171125e-05, + "loss": 0.3154, + "step": 2104 + }, + { + "epoch": 2.082096933728981, + "grad_norm": 0.2530788330356479, + "learning_rate": 1.7002565042139977e-05, + "loss": 0.3571, + "step": 2105 + }, + { + "epoch": 2.083086053412463, + "grad_norm": 0.25288858939964987, + "learning_rate": 1.6984243312568707e-05, + "loss": 0.3261, + "step": 2106 + }, + { + "epoch": 2.0840751730959446, + "grad_norm": 0.2573111175053986, + "learning_rate": 1.6965921582997437e-05, + "loss": 0.3312, + "step": 2107 + }, + { + "epoch": 2.0850642927794265, + "grad_norm": 0.24283051150823262, + "learning_rate": 1.6947599853426163e-05, + "loss": 0.331, + "step": 2108 + }, + { + "epoch": 2.086053412462908, + "grad_norm": 0.7752505829926737, + "learning_rate": 1.6929278123854893e-05, + "loss": 0.343, + "step": 2109 + }, + { + "epoch": 2.08704253214639, + "grad_norm": 0.29407470466934255, + "learning_rate": 1.6910956394283623e-05, + "loss": 0.3481, + "step": 2110 + }, + { + "epoch": 2.0880316518298714, + "grad_norm": 0.24568189642774338, + "learning_rate": 1.689263466471235e-05, + "loss": 0.3132, + "step": 2111 + }, + { + "epoch": 2.0890207715133533, + "grad_norm": 0.23470405276109677, + "learning_rate": 1.687431293514108e-05, + "loss": 0.305, + "step": 2112 + }, + { + "epoch": 2.090009891196835, + "grad_norm": 0.26268923309496633, + "learning_rate": 1.685599120556981e-05, + "loss": 0.2962, + "step": 2113 + }, + { + "epoch": 2.0909990108803167, + "grad_norm": 0.26191345194398796, + "learning_rate": 1.6837669475998535e-05, + "loss": 0.3463, + "step": 2114 + }, + { + "epoch": 2.091988130563798, + "grad_norm": 1.0669555775783601, + "learning_rate": 1.6819347746427265e-05, + "loss": 0.3839, + "step": 2115 + }, + { + "epoch": 2.09297725024728, + "grad_norm": 0.2443401607557005, + "learning_rate": 1.680102601685599e-05, + "loss": 0.3496, + "step": 2116 + }, + { + "epoch": 2.0939663699307616, + "grad_norm": 0.2788783097347, + "learning_rate": 1.678270428728472e-05, + "loss": 0.3646, + "step": 2117 + }, + { + "epoch": 2.0949554896142435, + "grad_norm": 0.2808482996134958, + "learning_rate": 1.6764382557713447e-05, + "loss": 0.3222, + "step": 2118 + }, + { + "epoch": 2.095944609297725, + "grad_norm": 0.27568916240187386, + "learning_rate": 1.6746060828142177e-05, + "loss": 0.3844, + "step": 2119 + }, + { + "epoch": 2.096933728981207, + "grad_norm": 0.2581105035106014, + "learning_rate": 1.6727739098570904e-05, + "loss": 0.3115, + "step": 2120 + }, + { + "epoch": 2.0979228486646884, + "grad_norm": 0.2541721070612834, + "learning_rate": 1.6709417368999634e-05, + "loss": 0.3393, + "step": 2121 + }, + { + "epoch": 2.0989119683481703, + "grad_norm": 0.23819664141370792, + "learning_rate": 1.6691095639428363e-05, + "loss": 0.3133, + "step": 2122 + }, + { + "epoch": 2.0999010880316518, + "grad_norm": 0.2557320947503553, + "learning_rate": 1.6672773909857093e-05, + "loss": 0.3477, + "step": 2123 + }, + { + "epoch": 2.1008902077151337, + "grad_norm": 0.2271503017475219, + "learning_rate": 1.665445218028582e-05, + "loss": 0.324, + "step": 2124 + }, + { + "epoch": 2.101879327398615, + "grad_norm": 0.2863216334585617, + "learning_rate": 1.663613045071455e-05, + "loss": 0.3486, + "step": 2125 + }, + { + "epoch": 2.102868447082097, + "grad_norm": 0.2855347267238924, + "learning_rate": 1.6617808721143276e-05, + "loss": 0.3575, + "step": 2126 + }, + { + "epoch": 2.1038575667655786, + "grad_norm": 0.23052832309475305, + "learning_rate": 1.6599486991572006e-05, + "loss": 0.3211, + "step": 2127 + }, + { + "epoch": 2.1048466864490605, + "grad_norm": 0.24044171148061522, + "learning_rate": 1.6581165262000732e-05, + "loss": 0.3517, + "step": 2128 + }, + { + "epoch": 2.105835806132542, + "grad_norm": 0.26424268988397426, + "learning_rate": 1.6562843532429462e-05, + "loss": 0.3354, + "step": 2129 + }, + { + "epoch": 2.106824925816024, + "grad_norm": 0.32985801683730653, + "learning_rate": 1.654452180285819e-05, + "loss": 0.3937, + "step": 2130 + }, + { + "epoch": 2.1078140454995054, + "grad_norm": 0.2851148291510856, + "learning_rate": 1.6526200073286918e-05, + "loss": 0.369, + "step": 2131 + }, + { + "epoch": 2.1088031651829873, + "grad_norm": 0.2631999833247007, + "learning_rate": 1.6507878343715648e-05, + "loss": 0.3141, + "step": 2132 + }, + { + "epoch": 2.1097922848664687, + "grad_norm": 0.27216770136236385, + "learning_rate": 1.6489556614144374e-05, + "loss": 0.3707, + "step": 2133 + }, + { + "epoch": 2.1107814045499507, + "grad_norm": 0.26372284319992984, + "learning_rate": 1.6471234884573104e-05, + "loss": 0.3432, + "step": 2134 + }, + { + "epoch": 2.111770524233432, + "grad_norm": 2.7801144799732995, + "learning_rate": 1.6452913155001834e-05, + "loss": 0.4697, + "step": 2135 + }, + { + "epoch": 2.112759643916914, + "grad_norm": 0.3911000168082124, + "learning_rate": 1.6434591425430564e-05, + "loss": 0.3456, + "step": 2136 + }, + { + "epoch": 2.1137487636003955, + "grad_norm": 0.39596832117969594, + "learning_rate": 1.641626969585929e-05, + "loss": 0.2952, + "step": 2137 + }, + { + "epoch": 2.1147378832838775, + "grad_norm": 0.29291151380810526, + "learning_rate": 1.639794796628802e-05, + "loss": 0.3477, + "step": 2138 + }, + { + "epoch": 2.115727002967359, + "grad_norm": 0.2601124431784706, + "learning_rate": 1.6379626236716746e-05, + "loss": 0.4013, + "step": 2139 + }, + { + "epoch": 2.116716122650841, + "grad_norm": 0.3076184020471208, + "learning_rate": 1.6361304507145476e-05, + "loss": 0.3481, + "step": 2140 + }, + { + "epoch": 2.1177052423343223, + "grad_norm": 0.31115479146294667, + "learning_rate": 1.6342982777574203e-05, + "loss": 0.3502, + "step": 2141 + }, + { + "epoch": 2.1186943620178043, + "grad_norm": 0.2729157234349751, + "learning_rate": 1.6324661048002932e-05, + "loss": 0.3569, + "step": 2142 + }, + { + "epoch": 2.1196834817012857, + "grad_norm": 0.2522458263760746, + "learning_rate": 1.630633931843166e-05, + "loss": 0.3964, + "step": 2143 + }, + { + "epoch": 2.1206726013847677, + "grad_norm": 0.26441364119229976, + "learning_rate": 1.628801758886039e-05, + "loss": 0.3243, + "step": 2144 + }, + { + "epoch": 2.121661721068249, + "grad_norm": 0.26478323771121154, + "learning_rate": 1.6269695859289115e-05, + "loss": 0.3417, + "step": 2145 + }, + { + "epoch": 2.122650840751731, + "grad_norm": 5.200418924621773, + "learning_rate": 1.6251374129717848e-05, + "loss": 0.9442, + "step": 2146 + }, + { + "epoch": 2.1236399604352125, + "grad_norm": 0.26631129417116767, + "learning_rate": 1.6233052400146575e-05, + "loss": 0.3734, + "step": 2147 + }, + { + "epoch": 2.1246290801186944, + "grad_norm": 0.2397373984722219, + "learning_rate": 1.6214730670575304e-05, + "loss": 0.3142, + "step": 2148 + }, + { + "epoch": 2.125618199802176, + "grad_norm": 0.24875529064629065, + "learning_rate": 1.619640894100403e-05, + "loss": 0.3487, + "step": 2149 + }, + { + "epoch": 2.126607319485658, + "grad_norm": 0.265519222625967, + "learning_rate": 1.617808721143276e-05, + "loss": 0.319, + "step": 2150 + }, + { + "epoch": 2.1275964391691393, + "grad_norm": 0.2842533650962704, + "learning_rate": 1.6159765481861487e-05, + "loss": 0.3824, + "step": 2151 + }, + { + "epoch": 2.1285855588526212, + "grad_norm": 0.22298739828635666, + "learning_rate": 1.6141443752290217e-05, + "loss": 0.2756, + "step": 2152 + }, + { + "epoch": 2.1295746785361027, + "grad_norm": 0.25735506163209704, + "learning_rate": 1.6123122022718947e-05, + "loss": 0.3611, + "step": 2153 + }, + { + "epoch": 2.1305637982195846, + "grad_norm": 0.2818421075725635, + "learning_rate": 1.6104800293147673e-05, + "loss": 0.3923, + "step": 2154 + }, + { + "epoch": 2.131552917903066, + "grad_norm": 0.2754159452458974, + "learning_rate": 1.6086478563576403e-05, + "loss": 0.3704, + "step": 2155 + }, + { + "epoch": 2.132542037586548, + "grad_norm": 0.23952536288599155, + "learning_rate": 1.606815683400513e-05, + "loss": 0.318, + "step": 2156 + }, + { + "epoch": 2.1335311572700295, + "grad_norm": 0.25008759238683226, + "learning_rate": 1.604983510443386e-05, + "loss": 0.3357, + "step": 2157 + }, + { + "epoch": 2.1345202769535114, + "grad_norm": 0.27461726260777686, + "learning_rate": 1.603151337486259e-05, + "loss": 0.3535, + "step": 2158 + }, + { + "epoch": 2.135509396636993, + "grad_norm": 0.27788201287671715, + "learning_rate": 1.601319164529132e-05, + "loss": 0.3793, + "step": 2159 + }, + { + "epoch": 2.136498516320475, + "grad_norm": 0.26150100655181496, + "learning_rate": 1.5994869915720045e-05, + "loss": 0.3318, + "step": 2160 + }, + { + "epoch": 2.1374876360039563, + "grad_norm": 0.277747766506713, + "learning_rate": 1.5976548186148775e-05, + "loss": 0.356, + "step": 2161 + }, + { + "epoch": 2.1384767556874382, + "grad_norm": 0.25722497139475575, + "learning_rate": 1.59582264565775e-05, + "loss": 0.3627, + "step": 2162 + }, + { + "epoch": 2.1394658753709197, + "grad_norm": 0.23571960846153608, + "learning_rate": 1.593990472700623e-05, + "loss": 0.3348, + "step": 2163 + }, + { + "epoch": 2.1404549950544016, + "grad_norm": 0.2968932627364013, + "learning_rate": 1.5921582997434958e-05, + "loss": 0.3604, + "step": 2164 + }, + { + "epoch": 2.141444114737883, + "grad_norm": 0.2755392915657586, + "learning_rate": 1.5903261267863687e-05, + "loss": 0.3541, + "step": 2165 + }, + { + "epoch": 2.142433234421365, + "grad_norm": 0.23899316297498144, + "learning_rate": 1.5884939538292414e-05, + "loss": 0.3257, + "step": 2166 + }, + { + "epoch": 2.1434223541048465, + "grad_norm": 0.25067453063367884, + "learning_rate": 1.5866617808721144e-05, + "loss": 0.3296, + "step": 2167 + }, + { + "epoch": 2.1444114737883284, + "grad_norm": 0.24598510268818005, + "learning_rate": 1.584829607914987e-05, + "loss": 0.3175, + "step": 2168 + }, + { + "epoch": 2.14540059347181, + "grad_norm": 0.24842567923116413, + "learning_rate": 1.58299743495786e-05, + "loss": 0.3386, + "step": 2169 + }, + { + "epoch": 2.146389713155292, + "grad_norm": 0.279977258093171, + "learning_rate": 1.581165262000733e-05, + "loss": 0.3838, + "step": 2170 + }, + { + "epoch": 2.1473788328387733, + "grad_norm": 0.30234421674111595, + "learning_rate": 1.579333089043606e-05, + "loss": 0.3861, + "step": 2171 + }, + { + "epoch": 2.148367952522255, + "grad_norm": 0.23537629559910506, + "learning_rate": 1.5775009160864786e-05, + "loss": 0.329, + "step": 2172 + }, + { + "epoch": 2.1493570722057367, + "grad_norm": 0.27012138733219354, + "learning_rate": 1.5756687431293516e-05, + "loss": 0.3325, + "step": 2173 + }, + { + "epoch": 2.1503461918892186, + "grad_norm": 0.24908855134556376, + "learning_rate": 1.5738365701722242e-05, + "loss": 0.3055, + "step": 2174 + }, + { + "epoch": 2.1513353115727005, + "grad_norm": 0.22465760736037585, + "learning_rate": 1.5720043972150972e-05, + "loss": 0.3314, + "step": 2175 + }, + { + "epoch": 2.152324431256182, + "grad_norm": 0.24248671493308285, + "learning_rate": 1.57017222425797e-05, + "loss": 0.3247, + "step": 2176 + }, + { + "epoch": 2.1533135509396635, + "grad_norm": 0.22953998434558445, + "learning_rate": 1.5683400513008428e-05, + "loss": 0.2973, + "step": 2177 + }, + { + "epoch": 2.1543026706231454, + "grad_norm": 0.22595938467456078, + "learning_rate": 1.5665078783437158e-05, + "loss": 0.3005, + "step": 2178 + }, + { + "epoch": 2.1552917903066273, + "grad_norm": 0.2506828155398757, + "learning_rate": 1.5646757053865884e-05, + "loss": 0.3772, + "step": 2179 + }, + { + "epoch": 2.156280909990109, + "grad_norm": 0.25791925829552964, + "learning_rate": 1.5628435324294614e-05, + "loss": 0.3562, + "step": 2180 + }, + { + "epoch": 2.1572700296735903, + "grad_norm": 0.24209818620770746, + "learning_rate": 1.5610113594723344e-05, + "loss": 0.2983, + "step": 2181 + }, + { + "epoch": 2.158259149357072, + "grad_norm": 0.23728500789942555, + "learning_rate": 1.5591791865152074e-05, + "loss": 0.3153, + "step": 2182 + }, + { + "epoch": 2.159248269040554, + "grad_norm": 0.26976125221293046, + "learning_rate": 1.55734701355808e-05, + "loss": 0.3283, + "step": 2183 + }, + { + "epoch": 2.1602373887240356, + "grad_norm": 0.2401119717618975, + "learning_rate": 1.555514840600953e-05, + "loss": 0.3377, + "step": 2184 + }, + { + "epoch": 2.1612265084075175, + "grad_norm": 0.2321954749389243, + "learning_rate": 1.5536826676438256e-05, + "loss": 0.3271, + "step": 2185 + }, + { + "epoch": 2.162215628090999, + "grad_norm": 0.23834291539778332, + "learning_rate": 1.5518504946866986e-05, + "loss": 0.3227, + "step": 2186 + }, + { + "epoch": 2.163204747774481, + "grad_norm": 0.3017365420246128, + "learning_rate": 1.5500183217295713e-05, + "loss": 0.3912, + "step": 2187 + }, + { + "epoch": 2.1641938674579624, + "grad_norm": 0.25939756442962536, + "learning_rate": 1.5481861487724442e-05, + "loss": 0.3645, + "step": 2188 + }, + { + "epoch": 2.1651829871414443, + "grad_norm": 0.2671540694432964, + "learning_rate": 1.546353975815317e-05, + "loss": 0.3464, + "step": 2189 + }, + { + "epoch": 2.166172106824926, + "grad_norm": 0.24114920876599572, + "learning_rate": 1.54452180285819e-05, + "loss": 0.3318, + "step": 2190 + }, + { + "epoch": 2.1671612265084077, + "grad_norm": 0.23250896713005623, + "learning_rate": 1.5426896299010625e-05, + "loss": 0.3018, + "step": 2191 + }, + { + "epoch": 2.168150346191889, + "grad_norm": 0.25900429796077135, + "learning_rate": 1.5408574569439355e-05, + "loss": 0.3794, + "step": 2192 + }, + { + "epoch": 2.169139465875371, + "grad_norm": 0.2534934876573256, + "learning_rate": 1.5390252839868085e-05, + "loss": 0.3412, + "step": 2193 + }, + { + "epoch": 2.1701285855588526, + "grad_norm": 0.3237099924662489, + "learning_rate": 1.5371931110296814e-05, + "loss": 0.3876, + "step": 2194 + }, + { + "epoch": 2.1711177052423345, + "grad_norm": 0.2263325734121825, + "learning_rate": 1.535360938072554e-05, + "loss": 0.3421, + "step": 2195 + }, + { + "epoch": 2.172106824925816, + "grad_norm": 0.27015210617952656, + "learning_rate": 1.533528765115427e-05, + "loss": 0.325, + "step": 2196 + }, + { + "epoch": 2.173095944609298, + "grad_norm": 0.24041857827848195, + "learning_rate": 1.5316965921582997e-05, + "loss": 0.3352, + "step": 2197 + }, + { + "epoch": 2.1740850642927794, + "grad_norm": 0.2303276894642009, + "learning_rate": 1.5298644192011727e-05, + "loss": 0.3082, + "step": 2198 + }, + { + "epoch": 2.1750741839762613, + "grad_norm": 0.22582439744525307, + "learning_rate": 1.5280322462440457e-05, + "loss": 0.3294, + "step": 2199 + }, + { + "epoch": 2.1760633036597428, + "grad_norm": 0.24683399509790616, + "learning_rate": 1.5262000732869183e-05, + "loss": 0.3577, + "step": 2200 + }, + { + "epoch": 2.1770524233432247, + "grad_norm": 0.2370261131813352, + "learning_rate": 1.5243679003297911e-05, + "loss": 0.3411, + "step": 2201 + }, + { + "epoch": 2.178041543026706, + "grad_norm": 0.24781478654496875, + "learning_rate": 1.522535727372664e-05, + "loss": 0.3316, + "step": 2202 + }, + { + "epoch": 2.179030662710188, + "grad_norm": 0.23239061025797889, + "learning_rate": 1.5207035544155367e-05, + "loss": 0.3263, + "step": 2203 + }, + { + "epoch": 2.1800197823936696, + "grad_norm": 0.2232722530248141, + "learning_rate": 1.5188713814584096e-05, + "loss": 0.3081, + "step": 2204 + }, + { + "epoch": 2.1810089020771515, + "grad_norm": 0.2594820299682084, + "learning_rate": 1.5170392085012827e-05, + "loss": 0.3653, + "step": 2205 + }, + { + "epoch": 2.181998021760633, + "grad_norm": 0.24820776329147887, + "learning_rate": 1.5152070355441555e-05, + "loss": 0.3115, + "step": 2206 + }, + { + "epoch": 2.182987141444115, + "grad_norm": 0.4308536837119438, + "learning_rate": 1.5133748625870283e-05, + "loss": 0.3583, + "step": 2207 + }, + { + "epoch": 2.1839762611275964, + "grad_norm": 0.2447733062324265, + "learning_rate": 1.5115426896299011e-05, + "loss": 0.3242, + "step": 2208 + }, + { + "epoch": 2.1849653808110783, + "grad_norm": 0.255212859520126, + "learning_rate": 1.509710516672774e-05, + "loss": 0.3407, + "step": 2209 + }, + { + "epoch": 2.1859545004945597, + "grad_norm": 0.24780445254507258, + "learning_rate": 1.5078783437156468e-05, + "loss": 0.3777, + "step": 2210 + }, + { + "epoch": 2.1869436201780417, + "grad_norm": 0.22667950198277595, + "learning_rate": 1.5060461707585197e-05, + "loss": 0.322, + "step": 2211 + }, + { + "epoch": 2.187932739861523, + "grad_norm": 0.2736208075026148, + "learning_rate": 1.5042139978013926e-05, + "loss": 0.3435, + "step": 2212 + }, + { + "epoch": 2.188921859545005, + "grad_norm": 0.25076062434160956, + "learning_rate": 1.5023818248442654e-05, + "loss": 0.3341, + "step": 2213 + }, + { + "epoch": 2.1899109792284865, + "grad_norm": 0.2516337328032655, + "learning_rate": 1.5005496518871382e-05, + "loss": 0.3767, + "step": 2214 + }, + { + "epoch": 2.1909000989119685, + "grad_norm": 0.24007131228024153, + "learning_rate": 1.498717478930011e-05, + "loss": 0.3304, + "step": 2215 + }, + { + "epoch": 2.19188921859545, + "grad_norm": 0.2653627206351916, + "learning_rate": 1.4968853059728838e-05, + "loss": 0.3581, + "step": 2216 + }, + { + "epoch": 2.192878338278932, + "grad_norm": 0.23740704954394445, + "learning_rate": 1.495053133015757e-05, + "loss": 0.3322, + "step": 2217 + }, + { + "epoch": 2.1938674579624133, + "grad_norm": 0.26183283644070443, + "learning_rate": 1.4932209600586298e-05, + "loss": 0.3058, + "step": 2218 + }, + { + "epoch": 2.1948565776458953, + "grad_norm": 0.25535363753862483, + "learning_rate": 1.4913887871015026e-05, + "loss": 0.3867, + "step": 2219 + }, + { + "epoch": 2.1958456973293767, + "grad_norm": 0.2410531896962311, + "learning_rate": 1.4895566141443754e-05, + "loss": 0.3287, + "step": 2220 + }, + { + "epoch": 2.1968348170128587, + "grad_norm": 0.23864241175081596, + "learning_rate": 1.4877244411872482e-05, + "loss": 0.2957, + "step": 2221 + }, + { + "epoch": 2.19782393669634, + "grad_norm": 0.23152858294826056, + "learning_rate": 1.485892268230121e-05, + "loss": 0.3267, + "step": 2222 + }, + { + "epoch": 2.198813056379822, + "grad_norm": 0.25038544912526106, + "learning_rate": 1.4840600952729938e-05, + "loss": 0.3727, + "step": 2223 + }, + { + "epoch": 2.1998021760633035, + "grad_norm": 0.24575360880677702, + "learning_rate": 1.4822279223158666e-05, + "loss": 0.3204, + "step": 2224 + }, + { + "epoch": 2.2007912957467854, + "grad_norm": 0.2617185419544845, + "learning_rate": 1.4803957493587394e-05, + "loss": 0.3344, + "step": 2225 + }, + { + "epoch": 2.201780415430267, + "grad_norm": 0.23621554765095076, + "learning_rate": 1.4785635764016123e-05, + "loss": 0.3194, + "step": 2226 + }, + { + "epoch": 2.202769535113749, + "grad_norm": 0.2656714021948276, + "learning_rate": 1.476731403444485e-05, + "loss": 0.3251, + "step": 2227 + }, + { + "epoch": 2.2037586547972303, + "grad_norm": 0.2352678555408159, + "learning_rate": 1.4748992304873579e-05, + "loss": 0.3474, + "step": 2228 + }, + { + "epoch": 2.2047477744807122, + "grad_norm": 0.23365121088195945, + "learning_rate": 1.473067057530231e-05, + "loss": 0.3214, + "step": 2229 + }, + { + "epoch": 2.2057368941641937, + "grad_norm": 0.2455996352228685, + "learning_rate": 1.4712348845731038e-05, + "loss": 0.4093, + "step": 2230 + }, + { + "epoch": 2.2067260138476756, + "grad_norm": 0.2740975848282929, + "learning_rate": 1.4694027116159766e-05, + "loss": 0.3399, + "step": 2231 + }, + { + "epoch": 2.207715133531157, + "grad_norm": 0.2598627296070607, + "learning_rate": 1.4675705386588495e-05, + "loss": 0.3083, + "step": 2232 + }, + { + "epoch": 2.208704253214639, + "grad_norm": 0.23897597759120234, + "learning_rate": 1.4657383657017223e-05, + "loss": 0.3551, + "step": 2233 + }, + { + "epoch": 2.2096933728981205, + "grad_norm": 0.28717030628471557, + "learning_rate": 1.4639061927445952e-05, + "loss": 0.36, + "step": 2234 + }, + { + "epoch": 2.2106824925816024, + "grad_norm": 0.2551734546245449, + "learning_rate": 1.462074019787468e-05, + "loss": 0.2981, + "step": 2235 + }, + { + "epoch": 2.211671612265084, + "grad_norm": 0.22660711986812593, + "learning_rate": 1.4602418468303409e-05, + "loss": 0.3241, + "step": 2236 + }, + { + "epoch": 2.212660731948566, + "grad_norm": 0.22072364142005796, + "learning_rate": 1.4584096738732137e-05, + "loss": 0.3397, + "step": 2237 + }, + { + "epoch": 2.2136498516320473, + "grad_norm": 0.25077102419811953, + "learning_rate": 1.4565775009160865e-05, + "loss": 0.3007, + "step": 2238 + }, + { + "epoch": 2.2146389713155292, + "grad_norm": 0.23275953912915226, + "learning_rate": 1.4547453279589593e-05, + "loss": 0.3301, + "step": 2239 + }, + { + "epoch": 2.2156280909990107, + "grad_norm": 0.2552844764105821, + "learning_rate": 1.4529131550018321e-05, + "loss": 0.3359, + "step": 2240 + }, + { + "epoch": 2.2166172106824926, + "grad_norm": 0.2209011298450527, + "learning_rate": 1.4510809820447053e-05, + "loss": 0.3598, + "step": 2241 + }, + { + "epoch": 2.217606330365974, + "grad_norm": 1.480125764071416, + "learning_rate": 1.449248809087578e-05, + "loss": 0.3688, + "step": 2242 + }, + { + "epoch": 2.218595450049456, + "grad_norm": 0.30985712503457974, + "learning_rate": 1.4474166361304509e-05, + "loss": 0.3408, + "step": 2243 + }, + { + "epoch": 2.2195845697329375, + "grad_norm": 0.24892275955536927, + "learning_rate": 1.4455844631733237e-05, + "loss": 0.3085, + "step": 2244 + }, + { + "epoch": 2.2205736894164194, + "grad_norm": 0.24101322361416325, + "learning_rate": 1.4437522902161965e-05, + "loss": 0.3437, + "step": 2245 + }, + { + "epoch": 2.221562809099901, + "grad_norm": 0.2910916695987308, + "learning_rate": 1.4419201172590693e-05, + "loss": 0.342, + "step": 2246 + }, + { + "epoch": 2.222551928783383, + "grad_norm": 0.2761723063225733, + "learning_rate": 1.4400879443019421e-05, + "loss": 0.3252, + "step": 2247 + }, + { + "epoch": 2.2235410484668643, + "grad_norm": 0.23723078949172213, + "learning_rate": 1.438255771344815e-05, + "loss": 0.3535, + "step": 2248 + }, + { + "epoch": 2.224530168150346, + "grad_norm": 0.23083729839007824, + "learning_rate": 1.4364235983876878e-05, + "loss": 0.3462, + "step": 2249 + }, + { + "epoch": 2.2255192878338277, + "grad_norm": 0.24923526336688007, + "learning_rate": 1.4345914254305606e-05, + "loss": 0.3479, + "step": 2250 + }, + { + "epoch": 2.2265084075173096, + "grad_norm": 0.2365620936428639, + "learning_rate": 1.4327592524734334e-05, + "loss": 0.3341, + "step": 2251 + }, + { + "epoch": 2.227497527200791, + "grad_norm": 0.26201081319476877, + "learning_rate": 1.4309270795163064e-05, + "loss": 0.3605, + "step": 2252 + }, + { + "epoch": 2.228486646884273, + "grad_norm": 0.23329839147411013, + "learning_rate": 1.4290949065591793e-05, + "loss": 0.3275, + "step": 2253 + }, + { + "epoch": 2.229475766567755, + "grad_norm": 0.23449570191396432, + "learning_rate": 1.4272627336020521e-05, + "loss": 0.3344, + "step": 2254 + }, + { + "epoch": 2.2304648862512364, + "grad_norm": 0.25077592463879617, + "learning_rate": 1.425430560644925e-05, + "loss": 0.3158, + "step": 2255 + }, + { + "epoch": 2.231454005934718, + "grad_norm": 0.2638212095862189, + "learning_rate": 1.4235983876877978e-05, + "loss": 0.3905, + "step": 2256 + }, + { + "epoch": 2.2324431256182, + "grad_norm": 0.2281528691605368, + "learning_rate": 1.4217662147306708e-05, + "loss": 0.3068, + "step": 2257 + }, + { + "epoch": 2.2334322453016817, + "grad_norm": 0.24868299339572644, + "learning_rate": 1.4199340417735436e-05, + "loss": 0.3352, + "step": 2258 + }, + { + "epoch": 2.234421364985163, + "grad_norm": 0.25226182191275837, + "learning_rate": 1.4181018688164164e-05, + "loss": 0.3799, + "step": 2259 + }, + { + "epoch": 2.2354104846686447, + "grad_norm": 0.23719055969275504, + "learning_rate": 1.4162696958592892e-05, + "loss": 0.285, + "step": 2260 + }, + { + "epoch": 2.2363996043521266, + "grad_norm": 0.22645086719819674, + "learning_rate": 1.414437522902162e-05, + "loss": 0.3535, + "step": 2261 + }, + { + "epoch": 2.2373887240356085, + "grad_norm": 0.2378652475705354, + "learning_rate": 1.4126053499450348e-05, + "loss": 0.322, + "step": 2262 + }, + { + "epoch": 2.23837784371909, + "grad_norm": 0.2446078244591778, + "learning_rate": 1.4107731769879076e-05, + "loss": 0.3008, + "step": 2263 + }, + { + "epoch": 2.239366963402572, + "grad_norm": 0.23473892675122612, + "learning_rate": 1.4089410040307804e-05, + "loss": 0.3174, + "step": 2264 + }, + { + "epoch": 2.2403560830860534, + "grad_norm": 0.2740082445935662, + "learning_rate": 1.4071088310736536e-05, + "loss": 0.3673, + "step": 2265 + }, + { + "epoch": 2.2413452027695353, + "grad_norm": 0.3047335384705569, + "learning_rate": 1.4052766581165264e-05, + "loss": 0.3862, + "step": 2266 + }, + { + "epoch": 2.242334322453017, + "grad_norm": 0.22953115502891203, + "learning_rate": 1.4034444851593992e-05, + "loss": 0.3544, + "step": 2267 + }, + { + "epoch": 2.2433234421364987, + "grad_norm": 0.2381048121745753, + "learning_rate": 1.401612312202272e-05, + "loss": 0.385, + "step": 2268 + }, + { + "epoch": 2.24431256181998, + "grad_norm": 0.22695002295754346, + "learning_rate": 1.3997801392451448e-05, + "loss": 0.3113, + "step": 2269 + }, + { + "epoch": 2.245301681503462, + "grad_norm": 0.23494139839113953, + "learning_rate": 1.3979479662880176e-05, + "loss": 0.3006, + "step": 2270 + }, + { + "epoch": 2.2462908011869436, + "grad_norm": 0.2508107946864829, + "learning_rate": 1.3961157933308904e-05, + "loss": 0.3602, + "step": 2271 + }, + { + "epoch": 2.2472799208704255, + "grad_norm": 0.22515857517930243, + "learning_rate": 1.3942836203737633e-05, + "loss": 0.3296, + "step": 2272 + }, + { + "epoch": 2.248269040553907, + "grad_norm": 0.25467938360409764, + "learning_rate": 1.392451447416636e-05, + "loss": 0.3448, + "step": 2273 + }, + { + "epoch": 2.249258160237389, + "grad_norm": 0.30210334820145374, + "learning_rate": 1.3906192744595089e-05, + "loss": 0.3662, + "step": 2274 + }, + { + "epoch": 2.2502472799208704, + "grad_norm": 0.22040906536404206, + "learning_rate": 1.3887871015023819e-05, + "loss": 0.3222, + "step": 2275 + }, + { + "epoch": 2.2512363996043523, + "grad_norm": 0.24078053098136895, + "learning_rate": 1.3869549285452547e-05, + "loss": 0.3578, + "step": 2276 + }, + { + "epoch": 2.2522255192878338, + "grad_norm": 0.23682978022586784, + "learning_rate": 1.3851227555881277e-05, + "loss": 0.3418, + "step": 2277 + }, + { + "epoch": 2.2532146389713157, + "grad_norm": 0.25235210494112936, + "learning_rate": 1.3832905826310005e-05, + "loss": 0.342, + "step": 2278 + }, + { + "epoch": 2.254203758654797, + "grad_norm": 0.21464531728228006, + "learning_rate": 1.3814584096738733e-05, + "loss": 0.2858, + "step": 2279 + }, + { + "epoch": 2.255192878338279, + "grad_norm": 0.24947023802823698, + "learning_rate": 1.3796262367167463e-05, + "loss": 0.3581, + "step": 2280 + }, + { + "epoch": 2.2561819980217606, + "grad_norm": 0.24126311074885234, + "learning_rate": 1.377794063759619e-05, + "loss": 0.3372, + "step": 2281 + }, + { + "epoch": 2.2571711177052425, + "grad_norm": 0.24381172903295012, + "learning_rate": 1.3759618908024919e-05, + "loss": 0.3309, + "step": 2282 + }, + { + "epoch": 2.258160237388724, + "grad_norm": 0.2216901316551094, + "learning_rate": 1.3741297178453647e-05, + "loss": 0.2906, + "step": 2283 + }, + { + "epoch": 2.259149357072206, + "grad_norm": 0.22751523042939115, + "learning_rate": 1.3722975448882375e-05, + "loss": 0.327, + "step": 2284 + }, + { + "epoch": 2.2601384767556874, + "grad_norm": 0.2253518058336741, + "learning_rate": 1.3704653719311103e-05, + "loss": 0.3119, + "step": 2285 + }, + { + "epoch": 2.2611275964391693, + "grad_norm": 0.2479817150971885, + "learning_rate": 1.3686331989739831e-05, + "loss": 0.3448, + "step": 2286 + }, + { + "epoch": 2.2621167161226508, + "grad_norm": 0.2201794942968985, + "learning_rate": 1.366801026016856e-05, + "loss": 0.3606, + "step": 2287 + }, + { + "epoch": 2.2631058358061327, + "grad_norm": 0.21364590272690012, + "learning_rate": 1.3649688530597287e-05, + "loss": 0.3305, + "step": 2288 + }, + { + "epoch": 2.264094955489614, + "grad_norm": 0.24392483676889015, + "learning_rate": 1.3631366801026019e-05, + "loss": 0.3393, + "step": 2289 + }, + { + "epoch": 2.265084075173096, + "grad_norm": 0.24221769435099127, + "learning_rate": 1.3613045071454747e-05, + "loss": 0.3369, + "step": 2290 + }, + { + "epoch": 2.2660731948565775, + "grad_norm": 0.24861588082223832, + "learning_rate": 1.3594723341883475e-05, + "loss": 0.3626, + "step": 2291 + }, + { + "epoch": 2.2670623145400595, + "grad_norm": 0.23052081299132418, + "learning_rate": 1.3576401612312203e-05, + "loss": 0.3063, + "step": 2292 + }, + { + "epoch": 2.268051434223541, + "grad_norm": 0.24275686147196318, + "learning_rate": 1.3558079882740931e-05, + "loss": 0.3038, + "step": 2293 + }, + { + "epoch": 2.269040553907023, + "grad_norm": 0.22272342553004223, + "learning_rate": 1.353975815316966e-05, + "loss": 0.3094, + "step": 2294 + }, + { + "epoch": 2.2700296735905043, + "grad_norm": 0.23299197318354145, + "learning_rate": 1.3521436423598388e-05, + "loss": 0.3396, + "step": 2295 + }, + { + "epoch": 2.2710187932739863, + "grad_norm": 0.22615622555168388, + "learning_rate": 1.3503114694027116e-05, + "loss": 0.3336, + "step": 2296 + }, + { + "epoch": 2.2720079129574677, + "grad_norm": 0.2252293872395291, + "learning_rate": 1.3484792964455844e-05, + "loss": 0.2977, + "step": 2297 + }, + { + "epoch": 2.2729970326409497, + "grad_norm": 0.23457296789984822, + "learning_rate": 1.3466471234884574e-05, + "loss": 0.3399, + "step": 2298 + }, + { + "epoch": 2.273986152324431, + "grad_norm": 0.24129016983914284, + "learning_rate": 1.3448149505313302e-05, + "loss": 0.3219, + "step": 2299 + }, + { + "epoch": 2.274975272007913, + "grad_norm": 0.22141667622956007, + "learning_rate": 1.342982777574203e-05, + "loss": 0.3082, + "step": 2300 + }, + { + "epoch": 2.2759643916913945, + "grad_norm": 0.2413245232232439, + "learning_rate": 1.341150604617076e-05, + "loss": 0.3426, + "step": 2301 + }, + { + "epoch": 2.2769535113748764, + "grad_norm": 0.23696458764674602, + "learning_rate": 1.3393184316599488e-05, + "loss": 0.3759, + "step": 2302 + }, + { + "epoch": 2.277942631058358, + "grad_norm": 0.2518570901503138, + "learning_rate": 1.3374862587028218e-05, + "loss": 0.3695, + "step": 2303 + }, + { + "epoch": 2.27893175074184, + "grad_norm": 0.23086745437243067, + "learning_rate": 1.3356540857456946e-05, + "loss": 0.3369, + "step": 2304 + }, + { + "epoch": 2.2799208704253213, + "grad_norm": 0.26169884977451946, + "learning_rate": 1.3338219127885674e-05, + "loss": 0.3373, + "step": 2305 + }, + { + "epoch": 2.2809099901088032, + "grad_norm": 0.2587088888423314, + "learning_rate": 1.3319897398314402e-05, + "loss": 0.3574, + "step": 2306 + }, + { + "epoch": 2.2818991097922847, + "grad_norm": 0.22977702824971338, + "learning_rate": 1.330157566874313e-05, + "loss": 0.3457, + "step": 2307 + }, + { + "epoch": 2.2828882294757666, + "grad_norm": 0.22721408632878823, + "learning_rate": 1.3283253939171858e-05, + "loss": 0.308, + "step": 2308 + }, + { + "epoch": 2.283877349159248, + "grad_norm": 0.2519093156052609, + "learning_rate": 1.3264932209600586e-05, + "loss": 0.3589, + "step": 2309 + }, + { + "epoch": 2.28486646884273, + "grad_norm": 0.26712020676598086, + "learning_rate": 1.3246610480029314e-05, + "loss": 0.3624, + "step": 2310 + }, + { + "epoch": 2.2858555885262115, + "grad_norm": 0.2326596051044427, + "learning_rate": 1.3228288750458042e-05, + "loss": 0.3335, + "step": 2311 + }, + { + "epoch": 2.2868447082096934, + "grad_norm": 0.2765369772968104, + "learning_rate": 1.320996702088677e-05, + "loss": 0.3757, + "step": 2312 + }, + { + "epoch": 2.287833827893175, + "grad_norm": 0.25058824305887195, + "learning_rate": 1.3191645291315502e-05, + "loss": 0.3808, + "step": 2313 + }, + { + "epoch": 2.288822947576657, + "grad_norm": 0.2248644826213857, + "learning_rate": 1.317332356174423e-05, + "loss": 0.3213, + "step": 2314 + }, + { + "epoch": 2.2898120672601383, + "grad_norm": 0.23779476179693096, + "learning_rate": 1.3155001832172958e-05, + "loss": 0.3293, + "step": 2315 + }, + { + "epoch": 2.2908011869436202, + "grad_norm": 0.2723138257723175, + "learning_rate": 1.3136680102601686e-05, + "loss": 0.3428, + "step": 2316 + }, + { + "epoch": 2.2917903066271017, + "grad_norm": 0.2616699975935992, + "learning_rate": 1.3118358373030415e-05, + "loss": 0.3454, + "step": 2317 + }, + { + "epoch": 2.2927794263105836, + "grad_norm": 0.24102049258430722, + "learning_rate": 1.3100036643459143e-05, + "loss": 0.3333, + "step": 2318 + }, + { + "epoch": 2.293768545994065, + "grad_norm": 0.23650754741019725, + "learning_rate": 1.308171491388787e-05, + "loss": 0.3123, + "step": 2319 + }, + { + "epoch": 2.294757665677547, + "grad_norm": 0.37538745041783184, + "learning_rate": 1.3063393184316599e-05, + "loss": 0.369, + "step": 2320 + }, + { + "epoch": 2.2957467853610285, + "grad_norm": 0.23666732747554203, + "learning_rate": 1.3045071454745329e-05, + "loss": 0.316, + "step": 2321 + }, + { + "epoch": 2.2967359050445104, + "grad_norm": 0.22954722561453486, + "learning_rate": 1.3026749725174057e-05, + "loss": 0.3216, + "step": 2322 + }, + { + "epoch": 2.297725024727992, + "grad_norm": 0.2490149729269961, + "learning_rate": 1.3008427995602785e-05, + "loss": 0.3144, + "step": 2323 + }, + { + "epoch": 2.298714144411474, + "grad_norm": 0.25850900249259257, + "learning_rate": 1.2990106266031515e-05, + "loss": 0.3166, + "step": 2324 + }, + { + "epoch": 2.2997032640949557, + "grad_norm": 0.26057122713059194, + "learning_rate": 1.2971784536460243e-05, + "loss": 0.3526, + "step": 2325 + }, + { + "epoch": 2.300692383778437, + "grad_norm": 0.24491367081125778, + "learning_rate": 1.2953462806888973e-05, + "loss": 0.3525, + "step": 2326 + }, + { + "epoch": 2.3016815034619187, + "grad_norm": 0.2760339712557652, + "learning_rate": 1.29351410773177e-05, + "loss": 0.3596, + "step": 2327 + }, + { + "epoch": 2.3026706231454006, + "grad_norm": 0.24073219741729474, + "learning_rate": 1.2916819347746429e-05, + "loss": 0.3269, + "step": 2328 + }, + { + "epoch": 2.3036597428288825, + "grad_norm": 0.2703548932443427, + "learning_rate": 1.2898497618175157e-05, + "loss": 0.3219, + "step": 2329 + }, + { + "epoch": 2.304648862512364, + "grad_norm": 0.2687127704366253, + "learning_rate": 1.2880175888603885e-05, + "loss": 0.3417, + "step": 2330 + }, + { + "epoch": 2.3056379821958455, + "grad_norm": 0.25270161397798035, + "learning_rate": 1.2861854159032613e-05, + "loss": 0.3714, + "step": 2331 + }, + { + "epoch": 2.3066271018793274, + "grad_norm": 0.22146824391487552, + "learning_rate": 1.2843532429461341e-05, + "loss": 0.3217, + "step": 2332 + }, + { + "epoch": 2.3076162215628093, + "grad_norm": 0.2558572987685534, + "learning_rate": 1.282521069989007e-05, + "loss": 0.3753, + "step": 2333 + }, + { + "epoch": 2.308605341246291, + "grad_norm": 0.25523094828283194, + "learning_rate": 1.2806888970318798e-05, + "loss": 0.3419, + "step": 2334 + }, + { + "epoch": 2.3095944609297723, + "grad_norm": 0.24281022901050803, + "learning_rate": 1.2788567240747526e-05, + "loss": 0.3487, + "step": 2335 + }, + { + "epoch": 2.310583580613254, + "grad_norm": 0.2748946608527433, + "learning_rate": 1.2770245511176257e-05, + "loss": 0.3927, + "step": 2336 + }, + { + "epoch": 2.311572700296736, + "grad_norm": 0.24688576678949595, + "learning_rate": 1.2751923781604985e-05, + "loss": 0.3877, + "step": 2337 + }, + { + "epoch": 2.3125618199802176, + "grad_norm": 0.27181775950372444, + "learning_rate": 1.2733602052033713e-05, + "loss": 0.3822, + "step": 2338 + }, + { + "epoch": 2.313550939663699, + "grad_norm": 0.22921656595976417, + "learning_rate": 1.2715280322462441e-05, + "loss": 0.2951, + "step": 2339 + }, + { + "epoch": 2.314540059347181, + "grad_norm": 0.2575726813551104, + "learning_rate": 1.269695859289117e-05, + "loss": 0.3406, + "step": 2340 + }, + { + "epoch": 2.315529179030663, + "grad_norm": 0.2555589633497033, + "learning_rate": 1.2678636863319898e-05, + "loss": 0.3396, + "step": 2341 + }, + { + "epoch": 2.3165182987141444, + "grad_norm": 0.2364187403064698, + "learning_rate": 1.2660315133748626e-05, + "loss": 0.377, + "step": 2342 + }, + { + "epoch": 2.3175074183976263, + "grad_norm": 0.2302096798238319, + "learning_rate": 1.2641993404177354e-05, + "loss": 0.3319, + "step": 2343 + }, + { + "epoch": 2.318496538081108, + "grad_norm": 0.24888650385331362, + "learning_rate": 1.2623671674606084e-05, + "loss": 0.3066, + "step": 2344 + }, + { + "epoch": 2.3194856577645897, + "grad_norm": 0.2773610707597904, + "learning_rate": 1.2605349945034812e-05, + "loss": 0.3414, + "step": 2345 + }, + { + "epoch": 2.320474777448071, + "grad_norm": 0.2154430712098479, + "learning_rate": 1.258702821546354e-05, + "loss": 0.2968, + "step": 2346 + }, + { + "epoch": 2.321463897131553, + "grad_norm": 0.2338043867905404, + "learning_rate": 1.2568706485892268e-05, + "loss": 0.3118, + "step": 2347 + }, + { + "epoch": 2.3224530168150346, + "grad_norm": 0.24602167457573554, + "learning_rate": 1.2550384756320998e-05, + "loss": 0.3702, + "step": 2348 + }, + { + "epoch": 2.3234421364985165, + "grad_norm": 0.2327990964511457, + "learning_rate": 1.2532063026749728e-05, + "loss": 0.3523, + "step": 2349 + }, + { + "epoch": 2.324431256181998, + "grad_norm": 0.2589528675798955, + "learning_rate": 1.2513741297178456e-05, + "loss": 0.3864, + "step": 2350 + }, + { + "epoch": 2.32542037586548, + "grad_norm": 0.26176208543972257, + "learning_rate": 1.2495419567607184e-05, + "loss": 0.3336, + "step": 2351 + }, + { + "epoch": 2.3264094955489614, + "grad_norm": 0.2844298379253604, + "learning_rate": 1.2477097838035912e-05, + "loss": 0.3492, + "step": 2352 + }, + { + "epoch": 2.3273986152324433, + "grad_norm": 0.2546167667799685, + "learning_rate": 1.245877610846464e-05, + "loss": 0.3555, + "step": 2353 + }, + { + "epoch": 2.3283877349159248, + "grad_norm": 0.2696964327271954, + "learning_rate": 1.2440454378893368e-05, + "loss": 0.3706, + "step": 2354 + }, + { + "epoch": 2.3293768545994067, + "grad_norm": 0.31827984227427614, + "learning_rate": 1.2422132649322096e-05, + "loss": 0.332, + "step": 2355 + }, + { + "epoch": 2.330365974282888, + "grad_norm": 0.2240056160701879, + "learning_rate": 1.2403810919750824e-05, + "loss": 0.3328, + "step": 2356 + }, + { + "epoch": 2.33135509396637, + "grad_norm": 0.2517740200145697, + "learning_rate": 1.2385489190179554e-05, + "loss": 0.372, + "step": 2357 + }, + { + "epoch": 2.3323442136498516, + "grad_norm": 0.24940675998881365, + "learning_rate": 1.2367167460608282e-05, + "loss": 0.3959, + "step": 2358 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.2459642937733617, + "learning_rate": 1.234884573103701e-05, + "loss": 0.3492, + "step": 2359 + }, + { + "epoch": 2.334322453016815, + "grad_norm": 0.22768174832698604, + "learning_rate": 1.2330524001465739e-05, + "loss": 0.3404, + "step": 2360 + }, + { + "epoch": 2.335311572700297, + "grad_norm": 0.22086915383890057, + "learning_rate": 1.2312202271894467e-05, + "loss": 0.3099, + "step": 2361 + }, + { + "epoch": 2.3363006923837784, + "grad_norm": 0.22113520177366824, + "learning_rate": 1.2293880542323195e-05, + "loss": 0.3393, + "step": 2362 + }, + { + "epoch": 2.3372898120672603, + "grad_norm": 2.991113459889468, + "learning_rate": 1.2275558812751925e-05, + "loss": 0.3309, + "step": 2363 + }, + { + "epoch": 2.3382789317507418, + "grad_norm": 0.29054610148791177, + "learning_rate": 1.2257237083180653e-05, + "loss": 0.3655, + "step": 2364 + }, + { + "epoch": 2.3392680514342237, + "grad_norm": 0.24093202591978707, + "learning_rate": 1.2238915353609381e-05, + "loss": 0.3428, + "step": 2365 + }, + { + "epoch": 2.340257171117705, + "grad_norm": 0.23736300980535247, + "learning_rate": 1.2220593624038109e-05, + "loss": 0.3229, + "step": 2366 + }, + { + "epoch": 2.341246290801187, + "grad_norm": 0.35258591516934035, + "learning_rate": 1.2202271894466839e-05, + "loss": 0.3363, + "step": 2367 + }, + { + "epoch": 2.3422354104846685, + "grad_norm": 0.26676374709095013, + "learning_rate": 1.2183950164895567e-05, + "loss": 0.3211, + "step": 2368 + }, + { + "epoch": 2.3432245301681505, + "grad_norm": 0.2351851020565668, + "learning_rate": 1.2165628435324295e-05, + "loss": 0.35, + "step": 2369 + }, + { + "epoch": 2.344213649851632, + "grad_norm": 0.22232852533104622, + "learning_rate": 1.2147306705753025e-05, + "loss": 0.31, + "step": 2370 + }, + { + "epoch": 2.345202769535114, + "grad_norm": 0.23680359247744562, + "learning_rate": 1.2128984976181753e-05, + "loss": 0.3478, + "step": 2371 + }, + { + "epoch": 2.3461918892185953, + "grad_norm": 0.24988403986844843, + "learning_rate": 1.2110663246610481e-05, + "loss": 0.3635, + "step": 2372 + }, + { + "epoch": 2.3471810089020773, + "grad_norm": 0.24767501874015097, + "learning_rate": 1.2092341517039209e-05, + "loss": 0.3883, + "step": 2373 + }, + { + "epoch": 2.3481701285855587, + "grad_norm": 0.2521283236697681, + "learning_rate": 1.2074019787467937e-05, + "loss": 0.4225, + "step": 2374 + }, + { + "epoch": 2.3491592482690407, + "grad_norm": 0.24675041358192287, + "learning_rate": 1.2055698057896667e-05, + "loss": 0.3663, + "step": 2375 + }, + { + "epoch": 2.350148367952522, + "grad_norm": 0.24311636349663465, + "learning_rate": 1.2037376328325395e-05, + "loss": 0.3759, + "step": 2376 + }, + { + "epoch": 2.351137487636004, + "grad_norm": 0.24263994864791805, + "learning_rate": 1.2019054598754123e-05, + "loss": 0.38, + "step": 2377 + }, + { + "epoch": 2.3521266073194855, + "grad_norm": 0.25402272877139787, + "learning_rate": 1.2000732869182851e-05, + "loss": 0.31, + "step": 2378 + }, + { + "epoch": 2.3531157270029674, + "grad_norm": 0.2495073709635195, + "learning_rate": 1.198241113961158e-05, + "loss": 0.3659, + "step": 2379 + }, + { + "epoch": 2.354104846686449, + "grad_norm": 0.2298654377866694, + "learning_rate": 1.1964089410040308e-05, + "loss": 0.3209, + "step": 2380 + }, + { + "epoch": 2.355093966369931, + "grad_norm": 0.28863897336682676, + "learning_rate": 1.1945767680469037e-05, + "loss": 0.3417, + "step": 2381 + }, + { + "epoch": 2.3560830860534123, + "grad_norm": 0.41489670024603986, + "learning_rate": 1.1927445950897766e-05, + "loss": 0.3503, + "step": 2382 + }, + { + "epoch": 2.3570722057368942, + "grad_norm": 0.2205422492533083, + "learning_rate": 1.1909124221326494e-05, + "loss": 0.3529, + "step": 2383 + }, + { + "epoch": 2.3580613254203757, + "grad_norm": 0.2364704954092965, + "learning_rate": 1.1890802491755222e-05, + "loss": 0.3625, + "step": 2384 + }, + { + "epoch": 2.3590504451038576, + "grad_norm": 0.2476419419748873, + "learning_rate": 1.187248076218395e-05, + "loss": 0.3181, + "step": 2385 + }, + { + "epoch": 2.360039564787339, + "grad_norm": 0.24541438121836218, + "learning_rate": 1.1854159032612678e-05, + "loss": 0.3931, + "step": 2386 + }, + { + "epoch": 2.361028684470821, + "grad_norm": 0.23259123379943467, + "learning_rate": 1.1835837303041408e-05, + "loss": 0.3599, + "step": 2387 + }, + { + "epoch": 2.3620178041543025, + "grad_norm": 0.24998847686378414, + "learning_rate": 1.1817515573470136e-05, + "loss": 0.3454, + "step": 2388 + }, + { + "epoch": 2.3630069238377844, + "grad_norm": 0.2488071258698388, + "learning_rate": 1.1799193843898864e-05, + "loss": 0.3633, + "step": 2389 + }, + { + "epoch": 2.363996043521266, + "grad_norm": 0.22105819528108095, + "learning_rate": 1.1780872114327594e-05, + "loss": 0.332, + "step": 2390 + }, + { + "epoch": 2.364985163204748, + "grad_norm": 0.2334569251699502, + "learning_rate": 1.1762550384756322e-05, + "loss": 0.3415, + "step": 2391 + }, + { + "epoch": 2.3659742828882293, + "grad_norm": 0.27195122783585196, + "learning_rate": 1.174422865518505e-05, + "loss": 0.3633, + "step": 2392 + }, + { + "epoch": 2.3669634025717112, + "grad_norm": 0.2687234363535735, + "learning_rate": 1.172590692561378e-05, + "loss": 0.3223, + "step": 2393 + }, + { + "epoch": 2.3679525222551927, + "grad_norm": 0.27418695750053923, + "learning_rate": 1.1707585196042508e-05, + "loss": 0.348, + "step": 2394 + }, + { + "epoch": 2.3689416419386746, + "grad_norm": 0.2250392951845017, + "learning_rate": 1.1689263466471236e-05, + "loss": 0.3367, + "step": 2395 + }, + { + "epoch": 2.369930761622156, + "grad_norm": 0.2428040281312013, + "learning_rate": 1.1670941736899964e-05, + "loss": 0.3316, + "step": 2396 + }, + { + "epoch": 2.370919881305638, + "grad_norm": 0.257913044877425, + "learning_rate": 1.1652620007328692e-05, + "loss": 0.3541, + "step": 2397 + }, + { + "epoch": 2.3719090009891195, + "grad_norm": 0.24888673843156847, + "learning_rate": 1.163429827775742e-05, + "loss": 0.2872, + "step": 2398 + }, + { + "epoch": 2.3728981206726014, + "grad_norm": 0.22992256914847725, + "learning_rate": 1.161597654818615e-05, + "loss": 0.3668, + "step": 2399 + }, + { + "epoch": 2.373887240356083, + "grad_norm": 0.23105699693670317, + "learning_rate": 1.1597654818614878e-05, + "loss": 0.3354, + "step": 2400 + }, + { + "epoch": 2.374876360039565, + "grad_norm": 0.23878238019277057, + "learning_rate": 1.1579333089043606e-05, + "loss": 0.3339, + "step": 2401 + }, + { + "epoch": 2.3758654797230463, + "grad_norm": 0.23787914735351112, + "learning_rate": 1.1561011359472335e-05, + "loss": 0.3409, + "step": 2402 + }, + { + "epoch": 2.376854599406528, + "grad_norm": 0.21777157204428282, + "learning_rate": 1.1542689629901063e-05, + "loss": 0.3152, + "step": 2403 + }, + { + "epoch": 2.37784371909001, + "grad_norm": 0.23084255588965574, + "learning_rate": 1.152436790032979e-05, + "loss": 0.3406, + "step": 2404 + }, + { + "epoch": 2.3788328387734916, + "grad_norm": 0.23000546672119904, + "learning_rate": 1.150604617075852e-05, + "loss": 0.3362, + "step": 2405 + }, + { + "epoch": 2.379821958456973, + "grad_norm": 0.24929393096074584, + "learning_rate": 1.1487724441187249e-05, + "loss": 0.3395, + "step": 2406 + }, + { + "epoch": 2.380811078140455, + "grad_norm": 0.2253847100064449, + "learning_rate": 1.1469402711615977e-05, + "loss": 0.2843, + "step": 2407 + }, + { + "epoch": 2.381800197823937, + "grad_norm": 0.21736855996444068, + "learning_rate": 1.1451080982044705e-05, + "loss": 0.3224, + "step": 2408 + }, + { + "epoch": 2.3827893175074184, + "grad_norm": 0.26077964199405224, + "learning_rate": 1.1432759252473433e-05, + "loss": 0.3754, + "step": 2409 + }, + { + "epoch": 2.3837784371909, + "grad_norm": 0.23441329142827347, + "learning_rate": 1.1414437522902161e-05, + "loss": 0.3193, + "step": 2410 + }, + { + "epoch": 2.384767556874382, + "grad_norm": 0.22460852238892978, + "learning_rate": 1.1396115793330891e-05, + "loss": 0.3019, + "step": 2411 + }, + { + "epoch": 2.3857566765578637, + "grad_norm": 0.22061710031425286, + "learning_rate": 1.1377794063759619e-05, + "loss": 0.329, + "step": 2412 + }, + { + "epoch": 2.386745796241345, + "grad_norm": 0.23283333531597325, + "learning_rate": 1.1359472334188349e-05, + "loss": 0.3475, + "step": 2413 + }, + { + "epoch": 2.3877349159248267, + "grad_norm": 0.23549258278998214, + "learning_rate": 1.1341150604617077e-05, + "loss": 0.3681, + "step": 2414 + }, + { + "epoch": 2.3887240356083086, + "grad_norm": 0.23746707960246552, + "learning_rate": 1.1322828875045805e-05, + "loss": 0.3218, + "step": 2415 + }, + { + "epoch": 2.3897131552917905, + "grad_norm": 0.21232937070558, + "learning_rate": 1.1304507145474533e-05, + "loss": 0.3096, + "step": 2416 + }, + { + "epoch": 2.390702274975272, + "grad_norm": 0.22158582934961538, + "learning_rate": 1.1286185415903263e-05, + "loss": 0.3158, + "step": 2417 + }, + { + "epoch": 2.3916913946587535, + "grad_norm": 0.2481025761788115, + "learning_rate": 1.1267863686331991e-05, + "loss": 0.3196, + "step": 2418 + }, + { + "epoch": 2.3926805143422354, + "grad_norm": 0.24337539698694738, + "learning_rate": 1.124954195676072e-05, + "loss": 0.3259, + "step": 2419 + }, + { + "epoch": 2.3936696340257173, + "grad_norm": 0.228969243738579, + "learning_rate": 1.1231220227189447e-05, + "loss": 0.3427, + "step": 2420 + }, + { + "epoch": 2.394658753709199, + "grad_norm": 0.22025974633504555, + "learning_rate": 1.1212898497618175e-05, + "loss": 0.3483, + "step": 2421 + }, + { + "epoch": 2.3956478733926807, + "grad_norm": 0.26184165101054785, + "learning_rate": 1.1194576768046904e-05, + "loss": 0.3828, + "step": 2422 + }, + { + "epoch": 2.396636993076162, + "grad_norm": 0.24478098303647366, + "learning_rate": 1.1176255038475633e-05, + "loss": 0.3599, + "step": 2423 + }, + { + "epoch": 2.397626112759644, + "grad_norm": 0.24261159243488178, + "learning_rate": 1.1157933308904361e-05, + "loss": 0.3112, + "step": 2424 + }, + { + "epoch": 2.3986152324431256, + "grad_norm": 0.22301944660446946, + "learning_rate": 1.113961157933309e-05, + "loss": 0.3214, + "step": 2425 + }, + { + "epoch": 2.3996043521266075, + "grad_norm": 0.24202065872251996, + "learning_rate": 1.1121289849761818e-05, + "loss": 0.3373, + "step": 2426 + }, + { + "epoch": 2.400593471810089, + "grad_norm": 0.26782140193309023, + "learning_rate": 1.1102968120190546e-05, + "loss": 0.3926, + "step": 2427 + }, + { + "epoch": 2.401582591493571, + "grad_norm": 0.24029910736845295, + "learning_rate": 1.1084646390619274e-05, + "loss": 0.368, + "step": 2428 + }, + { + "epoch": 2.4025717111770524, + "grad_norm": 0.2407671258225506, + "learning_rate": 1.1066324661048004e-05, + "loss": 0.3423, + "step": 2429 + }, + { + "epoch": 2.4035608308605343, + "grad_norm": 0.24572982261117407, + "learning_rate": 1.1048002931476732e-05, + "loss": 0.3482, + "step": 2430 + }, + { + "epoch": 2.4045499505440158, + "grad_norm": 0.2788166813071352, + "learning_rate": 1.102968120190546e-05, + "loss": 0.3574, + "step": 2431 + }, + { + "epoch": 2.4055390702274977, + "grad_norm": 0.23296254668053415, + "learning_rate": 1.1011359472334188e-05, + "loss": 0.3137, + "step": 2432 + }, + { + "epoch": 2.406528189910979, + "grad_norm": 0.24801668753216083, + "learning_rate": 1.0993037742762916e-05, + "loss": 0.3509, + "step": 2433 + }, + { + "epoch": 2.407517309594461, + "grad_norm": 0.25891621196791104, + "learning_rate": 1.0974716013191646e-05, + "loss": 0.3266, + "step": 2434 + }, + { + "epoch": 2.4085064292779426, + "grad_norm": 0.25721058330749236, + "learning_rate": 1.0956394283620374e-05, + "loss": 0.3299, + "step": 2435 + }, + { + "epoch": 2.4094955489614245, + "grad_norm": 0.23910805211819738, + "learning_rate": 1.0938072554049102e-05, + "loss": 0.3222, + "step": 2436 + }, + { + "epoch": 2.410484668644906, + "grad_norm": 0.7585722434930996, + "learning_rate": 1.0919750824477832e-05, + "loss": 0.3307, + "step": 2437 + }, + { + "epoch": 2.411473788328388, + "grad_norm": 0.2622094628177202, + "learning_rate": 1.090142909490656e-05, + "loss": 0.3657, + "step": 2438 + }, + { + "epoch": 2.4124629080118694, + "grad_norm": 0.24659250738864202, + "learning_rate": 1.0883107365335288e-05, + "loss": 0.3411, + "step": 2439 + }, + { + "epoch": 2.4134520276953513, + "grad_norm": 0.24754083987784184, + "learning_rate": 1.0864785635764016e-05, + "loss": 0.3949, + "step": 2440 + }, + { + "epoch": 2.4144411473788328, + "grad_norm": 0.2300523326797196, + "learning_rate": 1.0846463906192746e-05, + "loss": 0.3217, + "step": 2441 + }, + { + "epoch": 2.4154302670623147, + "grad_norm": 0.268740945166187, + "learning_rate": 1.0828142176621474e-05, + "loss": 0.3416, + "step": 2442 + }, + { + "epoch": 2.416419386745796, + "grad_norm": 0.25252939297829813, + "learning_rate": 1.0809820447050202e-05, + "loss": 0.3397, + "step": 2443 + }, + { + "epoch": 2.417408506429278, + "grad_norm": 0.2530969379846874, + "learning_rate": 1.079149871747893e-05, + "loss": 0.3864, + "step": 2444 + }, + { + "epoch": 2.4183976261127595, + "grad_norm": 0.24562207385794602, + "learning_rate": 1.0773176987907659e-05, + "loss": 0.3548, + "step": 2445 + }, + { + "epoch": 2.4193867457962415, + "grad_norm": 0.2441336356073428, + "learning_rate": 1.0754855258336387e-05, + "loss": 0.3508, + "step": 2446 + }, + { + "epoch": 2.420375865479723, + "grad_norm": 0.33254572988738407, + "learning_rate": 1.0736533528765116e-05, + "loss": 0.3308, + "step": 2447 + }, + { + "epoch": 2.421364985163205, + "grad_norm": 0.33598359004717754, + "learning_rate": 1.0718211799193845e-05, + "loss": 0.3371, + "step": 2448 + }, + { + "epoch": 2.4223541048466863, + "grad_norm": 0.24536171632195142, + "learning_rate": 1.0699890069622573e-05, + "loss": 0.346, + "step": 2449 + }, + { + "epoch": 2.4233432245301683, + "grad_norm": 0.220072693747127, + "learning_rate": 1.06815683400513e-05, + "loss": 0.3051, + "step": 2450 + }, + { + "epoch": 2.4243323442136497, + "grad_norm": 0.26313981909407597, + "learning_rate": 1.0663246610480029e-05, + "loss": 0.3351, + "step": 2451 + }, + { + "epoch": 2.4253214638971317, + "grad_norm": 0.2710866024316405, + "learning_rate": 1.0644924880908757e-05, + "loss": 0.3164, + "step": 2452 + }, + { + "epoch": 2.426310583580613, + "grad_norm": 0.25396458294554114, + "learning_rate": 1.0626603151337487e-05, + "loss": 0.3463, + "step": 2453 + }, + { + "epoch": 2.427299703264095, + "grad_norm": 0.2252749311775426, + "learning_rate": 1.0608281421766215e-05, + "loss": 0.354, + "step": 2454 + }, + { + "epoch": 2.4282888229475765, + "grad_norm": 0.2606999859832173, + "learning_rate": 1.0589959692194943e-05, + "loss": 0.3511, + "step": 2455 + }, + { + "epoch": 2.4292779426310585, + "grad_norm": 0.37868283734687525, + "learning_rate": 1.0571637962623671e-05, + "loss": 0.361, + "step": 2456 + }, + { + "epoch": 2.43026706231454, + "grad_norm": 0.23778601344518252, + "learning_rate": 1.0553316233052401e-05, + "loss": 0.3375, + "step": 2457 + }, + { + "epoch": 2.431256181998022, + "grad_norm": 0.23759219268753948, + "learning_rate": 1.0534994503481129e-05, + "loss": 0.3371, + "step": 2458 + }, + { + "epoch": 2.4322453016815033, + "grad_norm": 0.22579121090246385, + "learning_rate": 1.0516672773909857e-05, + "loss": 0.3615, + "step": 2459 + }, + { + "epoch": 2.4332344213649852, + "grad_norm": 0.23369877437828138, + "learning_rate": 1.0498351044338587e-05, + "loss": 0.375, + "step": 2460 + }, + { + "epoch": 2.4342235410484667, + "grad_norm": 0.22961846338817127, + "learning_rate": 1.0480029314767315e-05, + "loss": 0.3484, + "step": 2461 + }, + { + "epoch": 2.4352126607319486, + "grad_norm": 0.22887264558327763, + "learning_rate": 1.0461707585196043e-05, + "loss": 0.3385, + "step": 2462 + }, + { + "epoch": 2.43620178041543, + "grad_norm": 0.25032092756268276, + "learning_rate": 1.0443385855624771e-05, + "loss": 0.3151, + "step": 2463 + }, + { + "epoch": 2.437190900098912, + "grad_norm": 0.23728928057357426, + "learning_rate": 1.0425064126053501e-05, + "loss": 0.3813, + "step": 2464 + }, + { + "epoch": 2.4381800197823935, + "grad_norm": 0.2438827034129295, + "learning_rate": 1.040674239648223e-05, + "loss": 0.3659, + "step": 2465 + }, + { + "epoch": 2.4391691394658754, + "grad_norm": 0.2778214140943854, + "learning_rate": 1.0388420666910957e-05, + "loss": 0.3513, + "step": 2466 + }, + { + "epoch": 2.440158259149357, + "grad_norm": 0.22815945841865654, + "learning_rate": 1.0370098937339686e-05, + "loss": 0.3447, + "step": 2467 + }, + { + "epoch": 2.441147378832839, + "grad_norm": 0.23528405331897878, + "learning_rate": 1.0351777207768414e-05, + "loss": 0.3352, + "step": 2468 + }, + { + "epoch": 2.4421364985163203, + "grad_norm": 0.235995080768111, + "learning_rate": 1.0333455478197142e-05, + "loss": 0.3263, + "step": 2469 + }, + { + "epoch": 2.4431256181998022, + "grad_norm": 0.21678114600038997, + "learning_rate": 1.0315133748625872e-05, + "loss": 0.3112, + "step": 2470 + }, + { + "epoch": 2.4441147378832837, + "grad_norm": 0.25806394891487444, + "learning_rate": 1.02968120190546e-05, + "loss": 0.3551, + "step": 2471 + }, + { + "epoch": 2.4451038575667656, + "grad_norm": 0.26401170188018913, + "learning_rate": 1.0278490289483328e-05, + "loss": 0.39, + "step": 2472 + }, + { + "epoch": 2.446092977250247, + "grad_norm": 0.21740446151393822, + "learning_rate": 1.0260168559912056e-05, + "loss": 0.3286, + "step": 2473 + }, + { + "epoch": 2.447082096933729, + "grad_norm": 0.23443927897272304, + "learning_rate": 1.0241846830340784e-05, + "loss": 0.3652, + "step": 2474 + }, + { + "epoch": 2.4480712166172105, + "grad_norm": 0.218480918457683, + "learning_rate": 1.0223525100769512e-05, + "loss": 0.3383, + "step": 2475 + }, + { + "epoch": 2.4490603363006924, + "grad_norm": 0.2203346851930569, + "learning_rate": 1.0205203371198242e-05, + "loss": 0.3401, + "step": 2476 + }, + { + "epoch": 2.450049455984174, + "grad_norm": 0.22869849971032385, + "learning_rate": 1.018688164162697e-05, + "loss": 0.3308, + "step": 2477 + }, + { + "epoch": 2.451038575667656, + "grad_norm": 0.2363096068659232, + "learning_rate": 1.0168559912055698e-05, + "loss": 0.3628, + "step": 2478 + }, + { + "epoch": 2.4520276953511377, + "grad_norm": 0.21724450496142547, + "learning_rate": 1.0150238182484426e-05, + "loss": 0.3435, + "step": 2479 + }, + { + "epoch": 2.453016815034619, + "grad_norm": 0.24107977424377153, + "learning_rate": 1.0131916452913156e-05, + "loss": 0.365, + "step": 2480 + }, + { + "epoch": 2.4540059347181007, + "grad_norm": 0.24828390832939212, + "learning_rate": 1.0113594723341884e-05, + "loss": 0.3366, + "step": 2481 + }, + { + "epoch": 2.4549950544015826, + "grad_norm": 0.20569807648782604, + "learning_rate": 1.0095272993770612e-05, + "loss": 0.2761, + "step": 2482 + }, + { + "epoch": 2.4559841740850645, + "grad_norm": 0.2322910616182908, + "learning_rate": 1.0076951264199342e-05, + "loss": 0.3741, + "step": 2483 + }, + { + "epoch": 2.456973293768546, + "grad_norm": 0.24079586769347716, + "learning_rate": 1.005862953462807e-05, + "loss": 0.3657, + "step": 2484 + }, + { + "epoch": 2.4579624134520275, + "grad_norm": 0.24451423706174355, + "learning_rate": 1.0040307805056798e-05, + "loss": 0.3375, + "step": 2485 + }, + { + "epoch": 2.4589515331355094, + "grad_norm": 0.21789521602688885, + "learning_rate": 1.0021986075485526e-05, + "loss": 0.3115, + "step": 2486 + }, + { + "epoch": 2.4599406528189913, + "grad_norm": 0.2367007818716319, + "learning_rate": 1.0003664345914255e-05, + "loss": 0.3434, + "step": 2487 + }, + { + "epoch": 2.460929772502473, + "grad_norm": 0.2307538273441414, + "learning_rate": 9.985342616342984e-06, + "loss": 0.3279, + "step": 2488 + }, + { + "epoch": 2.4619188921859543, + "grad_norm": 0.2366577365331995, + "learning_rate": 9.967020886771712e-06, + "loss": 0.3405, + "step": 2489 + }, + { + "epoch": 2.462908011869436, + "grad_norm": 0.21292878456865053, + "learning_rate": 9.94869915720044e-06, + "loss": 0.3271, + "step": 2490 + }, + { + "epoch": 2.463897131552918, + "grad_norm": 0.21294135453400145, + "learning_rate": 9.930377427629169e-06, + "loss": 0.312, + "step": 2491 + }, + { + "epoch": 2.4648862512363996, + "grad_norm": 0.2406136238914857, + "learning_rate": 9.912055698057897e-06, + "loss": 0.3537, + "step": 2492 + }, + { + "epoch": 2.465875370919881, + "grad_norm": 0.25484410374642147, + "learning_rate": 9.893733968486625e-06, + "loss": 0.3134, + "step": 2493 + }, + { + "epoch": 2.466864490603363, + "grad_norm": 0.2620293766154584, + "learning_rate": 9.875412238915355e-06, + "loss": 0.4176, + "step": 2494 + }, + { + "epoch": 2.467853610286845, + "grad_norm": 0.23944770550967767, + "learning_rate": 9.857090509344083e-06, + "loss": 0.356, + "step": 2495 + }, + { + "epoch": 2.4688427299703264, + "grad_norm": 0.25086364175393056, + "learning_rate": 9.838768779772811e-06, + "loss": 0.3575, + "step": 2496 + }, + { + "epoch": 2.469831849653808, + "grad_norm": 0.23501454699640065, + "learning_rate": 9.820447050201539e-06, + "loss": 0.3876, + "step": 2497 + }, + { + "epoch": 2.47082096933729, + "grad_norm": 0.2551766847463871, + "learning_rate": 9.802125320630267e-06, + "loss": 0.3367, + "step": 2498 + }, + { + "epoch": 2.4718100890207717, + "grad_norm": 0.2361580571158426, + "learning_rate": 9.783803591058995e-06, + "loss": 0.4009, + "step": 2499 + }, + { + "epoch": 2.472799208704253, + "grad_norm": 1.4900062991141605, + "learning_rate": 9.765481861487725e-06, + "loss": 0.3581, + "step": 2500 + }, + { + "epoch": 2.473788328387735, + "grad_norm": 0.23073936660230762, + "learning_rate": 9.747160131916453e-06, + "loss": 0.3565, + "step": 2501 + }, + { + "epoch": 2.4747774480712166, + "grad_norm": 0.24682663116703085, + "learning_rate": 9.728838402345181e-06, + "loss": 0.3535, + "step": 2502 + }, + { + "epoch": 2.4757665677546985, + "grad_norm": 0.27034996608934037, + "learning_rate": 9.710516672773911e-06, + "loss": 0.3916, + "step": 2503 + }, + { + "epoch": 2.47675568743818, + "grad_norm": 0.2303037854565598, + "learning_rate": 9.69219494320264e-06, + "loss": 0.3436, + "step": 2504 + }, + { + "epoch": 2.477744807121662, + "grad_norm": 0.23585596331964975, + "learning_rate": 9.673873213631367e-06, + "loss": 0.3118, + "step": 2505 + }, + { + "epoch": 2.4787339268051434, + "grad_norm": 0.23726271220235548, + "learning_rate": 9.655551484060097e-06, + "loss": 0.3394, + "step": 2506 + }, + { + "epoch": 2.4797230464886253, + "grad_norm": 0.2487286182774457, + "learning_rate": 9.637229754488825e-06, + "loss": 0.3573, + "step": 2507 + }, + { + "epoch": 2.4807121661721068, + "grad_norm": 0.2698429096752246, + "learning_rate": 9.618908024917553e-06, + "loss": 0.3876, + "step": 2508 + }, + { + "epoch": 2.4817012858555887, + "grad_norm": 0.24070333522567572, + "learning_rate": 9.600586295346281e-06, + "loss": 0.3465, + "step": 2509 + }, + { + "epoch": 2.48269040553907, + "grad_norm": 0.24206724387701353, + "learning_rate": 9.58226456577501e-06, + "loss": 0.3652, + "step": 2510 + }, + { + "epoch": 2.483679525222552, + "grad_norm": 0.22545629834132364, + "learning_rate": 9.563942836203738e-06, + "loss": 0.3275, + "step": 2511 + }, + { + "epoch": 2.4846686449060336, + "grad_norm": 0.23858808844243973, + "learning_rate": 9.545621106632467e-06, + "loss": 0.3505, + "step": 2512 + }, + { + "epoch": 2.4856577645895155, + "grad_norm": 0.2728863075026196, + "learning_rate": 9.527299377061196e-06, + "loss": 0.3541, + "step": 2513 + }, + { + "epoch": 2.486646884272997, + "grad_norm": 0.23294695154773748, + "learning_rate": 9.508977647489924e-06, + "loss": 0.3574, + "step": 2514 + }, + { + "epoch": 2.487636003956479, + "grad_norm": 0.24124966388283223, + "learning_rate": 9.490655917918652e-06, + "loss": 0.3243, + "step": 2515 + }, + { + "epoch": 2.4886251236399604, + "grad_norm": 0.2320389035173641, + "learning_rate": 9.47233418834738e-06, + "loss": 0.3245, + "step": 2516 + }, + { + "epoch": 2.4896142433234423, + "grad_norm": 0.26700657514143444, + "learning_rate": 9.454012458776108e-06, + "loss": 0.3852, + "step": 2517 + }, + { + "epoch": 2.4906033630069238, + "grad_norm": 0.2507593747565587, + "learning_rate": 9.435690729204838e-06, + "loss": 0.3476, + "step": 2518 + }, + { + "epoch": 2.4915924826904057, + "grad_norm": 0.21748192964113497, + "learning_rate": 9.417368999633566e-06, + "loss": 0.3334, + "step": 2519 + }, + { + "epoch": 2.492581602373887, + "grad_norm": 0.25264072777527896, + "learning_rate": 9.399047270062294e-06, + "loss": 0.3685, + "step": 2520 + }, + { + "epoch": 2.493570722057369, + "grad_norm": 0.24839443510243983, + "learning_rate": 9.380725540491022e-06, + "loss": 0.3523, + "step": 2521 + }, + { + "epoch": 2.4945598417408505, + "grad_norm": 0.22023654916565122, + "learning_rate": 9.36240381091975e-06, + "loss": 0.32, + "step": 2522 + }, + { + "epoch": 2.4955489614243325, + "grad_norm": 0.2441603761072515, + "learning_rate": 9.344082081348478e-06, + "loss": 0.3677, + "step": 2523 + }, + { + "epoch": 2.496538081107814, + "grad_norm": 0.23410429303428584, + "learning_rate": 9.325760351777208e-06, + "loss": 0.3282, + "step": 2524 + }, + { + "epoch": 2.497527200791296, + "grad_norm": 0.5284404116687907, + "learning_rate": 9.307438622205936e-06, + "loss": 0.3842, + "step": 2525 + }, + { + "epoch": 2.4985163204747773, + "grad_norm": 0.2597369164991658, + "learning_rate": 9.289116892634666e-06, + "loss": 0.3564, + "step": 2526 + }, + { + "epoch": 2.4995054401582593, + "grad_norm": 1.5905916143954588, + "learning_rate": 9.270795163063394e-06, + "loss": 0.4346, + "step": 2527 + }, + { + "epoch": 2.5004945598417407, + "grad_norm": 0.2406750160968987, + "learning_rate": 9.252473433492122e-06, + "loss": 0.3277, + "step": 2528 + }, + { + "epoch": 2.5014836795252227, + "grad_norm": 0.25687044415825083, + "learning_rate": 9.23415170392085e-06, + "loss": 0.3453, + "step": 2529 + }, + { + "epoch": 2.502472799208704, + "grad_norm": 0.25135048181774106, + "learning_rate": 9.21582997434958e-06, + "loss": 0.3547, + "step": 2530 + }, + { + "epoch": 2.503461918892186, + "grad_norm": 0.24560280016284752, + "learning_rate": 9.197508244778308e-06, + "loss": 0.3661, + "step": 2531 + }, + { + "epoch": 2.5044510385756675, + "grad_norm": 0.49064750366441084, + "learning_rate": 9.179186515207036e-06, + "loss": 0.3812, + "step": 2532 + }, + { + "epoch": 2.5054401582591495, + "grad_norm": 0.22993415291097227, + "learning_rate": 9.160864785635765e-06, + "loss": 0.3637, + "step": 2533 + }, + { + "epoch": 2.506429277942631, + "grad_norm": 0.21741000097273247, + "learning_rate": 9.142543056064493e-06, + "loss": 0.3319, + "step": 2534 + }, + { + "epoch": 2.507418397626113, + "grad_norm": 0.22510085134096802, + "learning_rate": 9.12422132649322e-06, + "loss": 0.3054, + "step": 2535 + }, + { + "epoch": 2.5084075173095943, + "grad_norm": 0.23033682251844267, + "learning_rate": 9.10589959692195e-06, + "loss": 0.3684, + "step": 2536 + }, + { + "epoch": 2.5093966369930762, + "grad_norm": 0.23336881930405867, + "learning_rate": 9.087577867350679e-06, + "loss": 0.336, + "step": 2537 + }, + { + "epoch": 2.5103857566765577, + "grad_norm": 0.25835519741672547, + "learning_rate": 9.069256137779407e-06, + "loss": 0.3665, + "step": 2538 + }, + { + "epoch": 2.5113748763600396, + "grad_norm": 0.21527278722575882, + "learning_rate": 9.050934408208135e-06, + "loss": 0.3271, + "step": 2539 + }, + { + "epoch": 2.512363996043521, + "grad_norm": 0.22846804776774987, + "learning_rate": 9.032612678636863e-06, + "loss": 0.3634, + "step": 2540 + }, + { + "epoch": 2.513353115727003, + "grad_norm": 0.2461420001985178, + "learning_rate": 9.014290949065591e-06, + "loss": 0.3618, + "step": 2541 + }, + { + "epoch": 2.5143422354104845, + "grad_norm": 0.24677056500039793, + "learning_rate": 8.995969219494321e-06, + "loss": 0.3474, + "step": 2542 + }, + { + "epoch": 2.5153313550939664, + "grad_norm": 0.23552827235851204, + "learning_rate": 8.977647489923049e-06, + "loss": 0.3477, + "step": 2543 + }, + { + "epoch": 2.516320474777448, + "grad_norm": 0.23544874721670853, + "learning_rate": 8.959325760351777e-06, + "loss": 0.3665, + "step": 2544 + }, + { + "epoch": 2.51730959446093, + "grad_norm": 0.25589519603994987, + "learning_rate": 8.941004030780505e-06, + "loss": 0.3676, + "step": 2545 + }, + { + "epoch": 2.5182987141444113, + "grad_norm": 0.22529131811822634, + "learning_rate": 8.922682301209233e-06, + "loss": 0.342, + "step": 2546 + }, + { + "epoch": 2.5192878338278932, + "grad_norm": 0.22830805413949087, + "learning_rate": 8.904360571637963e-06, + "loss": 0.3228, + "step": 2547 + }, + { + "epoch": 2.5202769535113747, + "grad_norm": 0.24283043796243683, + "learning_rate": 8.886038842066691e-06, + "loss": 0.3521, + "step": 2548 + }, + { + "epoch": 2.5212660731948566, + "grad_norm": 0.23539048865709836, + "learning_rate": 8.86771711249542e-06, + "loss": 0.3232, + "step": 2549 + }, + { + "epoch": 2.5222551928783385, + "grad_norm": 0.22416815087474565, + "learning_rate": 8.84939538292415e-06, + "loss": 0.3502, + "step": 2550 + }, + { + "epoch": 2.52324431256182, + "grad_norm": 0.24356963148470098, + "learning_rate": 8.831073653352877e-06, + "loss": 0.3041, + "step": 2551 + }, + { + "epoch": 2.5242334322453015, + "grad_norm": 0.2395352023996427, + "learning_rate": 8.812751923781605e-06, + "loss": 0.3787, + "step": 2552 + }, + { + "epoch": 2.5252225519287834, + "grad_norm": 0.23644859570044596, + "learning_rate": 8.794430194210334e-06, + "loss": 0.3657, + "step": 2553 + }, + { + "epoch": 2.5262116716122653, + "grad_norm": 0.253996407977228, + "learning_rate": 8.776108464639063e-06, + "loss": 0.3445, + "step": 2554 + }, + { + "epoch": 2.527200791295747, + "grad_norm": 0.251282958120267, + "learning_rate": 8.757786735067792e-06, + "loss": 0.3536, + "step": 2555 + }, + { + "epoch": 2.5281899109792283, + "grad_norm": 0.45586735138497564, + "learning_rate": 8.73946500549652e-06, + "loss": 0.3641, + "step": 2556 + }, + { + "epoch": 2.52917903066271, + "grad_norm": 0.25052516019444043, + "learning_rate": 8.721143275925248e-06, + "loss": 0.3669, + "step": 2557 + }, + { + "epoch": 2.530168150346192, + "grad_norm": 0.23437819091702222, + "learning_rate": 8.702821546353976e-06, + "loss": 0.3283, + "step": 2558 + }, + { + "epoch": 2.5311572700296736, + "grad_norm": 0.22284762353121085, + "learning_rate": 8.684499816782704e-06, + "loss": 0.3145, + "step": 2559 + }, + { + "epoch": 2.532146389713155, + "grad_norm": 0.22449486144864736, + "learning_rate": 8.666178087211434e-06, + "loss": 0.3553, + "step": 2560 + }, + { + "epoch": 2.533135509396637, + "grad_norm": 0.2431824560198144, + "learning_rate": 8.647856357640162e-06, + "loss": 0.353, + "step": 2561 + }, + { + "epoch": 2.534124629080119, + "grad_norm": 0.24220316126556765, + "learning_rate": 8.62953462806889e-06, + "loss": 0.3698, + "step": 2562 + }, + { + "epoch": 2.5351137487636004, + "grad_norm": 0.24561350691928416, + "learning_rate": 8.611212898497618e-06, + "loss": 0.3541, + "step": 2563 + }, + { + "epoch": 2.536102868447082, + "grad_norm": 0.26058782429161903, + "learning_rate": 8.592891168926346e-06, + "loss": 0.358, + "step": 2564 + }, + { + "epoch": 2.537091988130564, + "grad_norm": 0.24483285029632448, + "learning_rate": 8.574569439355074e-06, + "loss": 0.3722, + "step": 2565 + }, + { + "epoch": 2.5380811078140457, + "grad_norm": 0.22463143941644045, + "learning_rate": 8.556247709783804e-06, + "loss": 0.3344, + "step": 2566 + }, + { + "epoch": 2.539070227497527, + "grad_norm": 0.25857496371914634, + "learning_rate": 8.537925980212532e-06, + "loss": 0.3955, + "step": 2567 + }, + { + "epoch": 2.5400593471810087, + "grad_norm": 0.24794143406441915, + "learning_rate": 8.51960425064126e-06, + "loss": 0.3279, + "step": 2568 + }, + { + "epoch": 2.5410484668644906, + "grad_norm": 0.23454478908544515, + "learning_rate": 8.501282521069988e-06, + "loss": 0.3387, + "step": 2569 + }, + { + "epoch": 2.5420375865479725, + "grad_norm": 0.218775146778518, + "learning_rate": 8.482960791498718e-06, + "loss": 0.3149, + "step": 2570 + }, + { + "epoch": 2.543026706231454, + "grad_norm": 0.256130200586046, + "learning_rate": 8.464639061927446e-06, + "loss": 0.3491, + "step": 2571 + }, + { + "epoch": 2.5440158259149355, + "grad_norm": 0.21696788649308552, + "learning_rate": 8.446317332356175e-06, + "loss": 0.3194, + "step": 2572 + }, + { + "epoch": 2.5450049455984174, + "grad_norm": 0.214921895223905, + "learning_rate": 8.427995602784904e-06, + "loss": 0.3066, + "step": 2573 + }, + { + "epoch": 2.5459940652818993, + "grad_norm": 0.222786093431417, + "learning_rate": 8.409673873213632e-06, + "loss": 0.3633, + "step": 2574 + }, + { + "epoch": 2.546983184965381, + "grad_norm": 0.23510985038285132, + "learning_rate": 8.39135214364236e-06, + "loss": 0.3647, + "step": 2575 + }, + { + "epoch": 2.5479723046488623, + "grad_norm": 0.2192600372412503, + "learning_rate": 8.373030414071089e-06, + "loss": 0.3552, + "step": 2576 + }, + { + "epoch": 2.548961424332344, + "grad_norm": 0.21452070249266664, + "learning_rate": 8.354708684499817e-06, + "loss": 0.3031, + "step": 2577 + }, + { + "epoch": 2.549950544015826, + "grad_norm": 0.2286175179188525, + "learning_rate": 8.336386954928547e-06, + "loss": 0.3338, + "step": 2578 + }, + { + "epoch": 2.5509396636993076, + "grad_norm": 0.24433394067425576, + "learning_rate": 8.318065225357275e-06, + "loss": 0.3519, + "step": 2579 + }, + { + "epoch": 2.551928783382789, + "grad_norm": 0.2188927821523692, + "learning_rate": 8.299743495786003e-06, + "loss": 0.2997, + "step": 2580 + }, + { + "epoch": 2.552917903066271, + "grad_norm": 0.23903928008474262, + "learning_rate": 8.281421766214731e-06, + "loss": 0.3582, + "step": 2581 + }, + { + "epoch": 2.553907022749753, + "grad_norm": 0.2252037417774991, + "learning_rate": 8.263100036643459e-06, + "loss": 0.3158, + "step": 2582 + }, + { + "epoch": 2.5548961424332344, + "grad_norm": 0.22547563235668888, + "learning_rate": 8.244778307072187e-06, + "loss": 0.3161, + "step": 2583 + }, + { + "epoch": 2.555885262116716, + "grad_norm": 0.212708651970397, + "learning_rate": 8.226456577500917e-06, + "loss": 0.3021, + "step": 2584 + }, + { + "epoch": 2.5568743818001978, + "grad_norm": 0.2211913667645755, + "learning_rate": 8.208134847929645e-06, + "loss": 0.3355, + "step": 2585 + }, + { + "epoch": 2.5578635014836797, + "grad_norm": 0.2422032730001762, + "learning_rate": 8.189813118358373e-06, + "loss": 0.3753, + "step": 2586 + }, + { + "epoch": 2.558852621167161, + "grad_norm": 0.26729432792289365, + "learning_rate": 8.171491388787101e-06, + "loss": 0.399, + "step": 2587 + }, + { + "epoch": 2.559841740850643, + "grad_norm": 0.25427480702090216, + "learning_rate": 8.15316965921583e-06, + "loss": 0.3876, + "step": 2588 + }, + { + "epoch": 2.5608308605341246, + "grad_norm": 0.23376302569182988, + "learning_rate": 8.134847929644557e-06, + "loss": 0.3499, + "step": 2589 + }, + { + "epoch": 2.5618199802176065, + "grad_norm": 0.21819970135465813, + "learning_rate": 8.116526200073287e-06, + "loss": 0.3337, + "step": 2590 + }, + { + "epoch": 2.562809099901088, + "grad_norm": 0.249529986101218, + "learning_rate": 8.098204470502015e-06, + "loss": 0.358, + "step": 2591 + }, + { + "epoch": 2.56379821958457, + "grad_norm": 0.2365993729449896, + "learning_rate": 8.079882740930744e-06, + "loss": 0.3484, + "step": 2592 + }, + { + "epoch": 2.5647873392680514, + "grad_norm": 0.23483393877228428, + "learning_rate": 8.061561011359473e-06, + "loss": 0.3566, + "step": 2593 + }, + { + "epoch": 2.5657764589515333, + "grad_norm": 0.222182938428324, + "learning_rate": 8.043239281788201e-06, + "loss": 0.3106, + "step": 2594 + }, + { + "epoch": 2.5667655786350148, + "grad_norm": 0.2094235992786249, + "learning_rate": 8.02491755221693e-06, + "loss": 0.3308, + "step": 2595 + }, + { + "epoch": 2.5677546983184967, + "grad_norm": 0.22678718769630118, + "learning_rate": 8.00659582264566e-06, + "loss": 0.3054, + "step": 2596 + }, + { + "epoch": 2.568743818001978, + "grad_norm": 0.2309026367438794, + "learning_rate": 7.988274093074387e-06, + "loss": 0.3136, + "step": 2597 + }, + { + "epoch": 2.56973293768546, + "grad_norm": 0.22667618009371604, + "learning_rate": 7.969952363503116e-06, + "loss": 0.349, + "step": 2598 + }, + { + "epoch": 2.5707220573689415, + "grad_norm": 0.21117846126384685, + "learning_rate": 7.951630633931844e-06, + "loss": 0.327, + "step": 2599 + }, + { + "epoch": 2.5717111770524235, + "grad_norm": 0.21092844530263816, + "learning_rate": 7.933308904360572e-06, + "loss": 0.2968, + "step": 2600 + }, + { + "epoch": 2.572700296735905, + "grad_norm": 0.22503274613024515, + "learning_rate": 7.9149871747893e-06, + "loss": 0.348, + "step": 2601 + }, + { + "epoch": 2.573689416419387, + "grad_norm": 0.22080704154411201, + "learning_rate": 7.89666544521803e-06, + "loss": 0.3173, + "step": 2602 + }, + { + "epoch": 2.5746785361028683, + "grad_norm": 0.20732028945319075, + "learning_rate": 7.878343715646758e-06, + "loss": 0.3168, + "step": 2603 + }, + { + "epoch": 2.5756676557863503, + "grad_norm": 0.2380503672520683, + "learning_rate": 7.860021986075486e-06, + "loss": 0.3353, + "step": 2604 + }, + { + "epoch": 2.5766567754698317, + "grad_norm": 0.21944010685108145, + "learning_rate": 7.841700256504214e-06, + "loss": 0.3154, + "step": 2605 + }, + { + "epoch": 2.5776458951533137, + "grad_norm": 0.5291657134084721, + "learning_rate": 7.823378526932942e-06, + "loss": 0.3486, + "step": 2606 + }, + { + "epoch": 2.578635014836795, + "grad_norm": 0.2287926036524713, + "learning_rate": 7.805056797361672e-06, + "loss": 0.3367, + "step": 2607 + }, + { + "epoch": 2.579624134520277, + "grad_norm": 0.24607517669943713, + "learning_rate": 7.7867350677904e-06, + "loss": 0.3822, + "step": 2608 + }, + { + "epoch": 2.5806132542037585, + "grad_norm": 0.30753285728195096, + "learning_rate": 7.768413338219128e-06, + "loss": 0.331, + "step": 2609 + }, + { + "epoch": 2.5816023738872405, + "grad_norm": 0.23264272610772255, + "learning_rate": 7.750091608647856e-06, + "loss": 0.3177, + "step": 2610 + }, + { + "epoch": 2.582591493570722, + "grad_norm": 0.24746556520732035, + "learning_rate": 7.731769879076584e-06, + "loss": 0.3255, + "step": 2611 + }, + { + "epoch": 2.583580613254204, + "grad_norm": 0.2586316776392973, + "learning_rate": 7.713448149505313e-06, + "loss": 0.3122, + "step": 2612 + }, + { + "epoch": 2.5845697329376853, + "grad_norm": 0.22370719892578067, + "learning_rate": 7.695126419934042e-06, + "loss": 0.3386, + "step": 2613 + }, + { + "epoch": 2.5855588526211672, + "grad_norm": 0.22463072855787217, + "learning_rate": 7.67680469036277e-06, + "loss": 0.3192, + "step": 2614 + }, + { + "epoch": 2.5865479723046487, + "grad_norm": 0.23719441390072404, + "learning_rate": 7.658482960791499e-06, + "loss": 0.3573, + "step": 2615 + }, + { + "epoch": 2.5875370919881306, + "grad_norm": 0.607168367699493, + "learning_rate": 7.640161231220228e-06, + "loss": 0.4113, + "step": 2616 + }, + { + "epoch": 2.588526211671612, + "grad_norm": 0.21084308256123116, + "learning_rate": 7.621839501648956e-06, + "loss": 0.3162, + "step": 2617 + }, + { + "epoch": 2.589515331355094, + "grad_norm": 0.22956176268509806, + "learning_rate": 7.603517772077684e-06, + "loss": 0.3383, + "step": 2618 + }, + { + "epoch": 2.5905044510385755, + "grad_norm": 0.22913294281991195, + "learning_rate": 7.5851960425064135e-06, + "loss": 0.3425, + "step": 2619 + }, + { + "epoch": 2.5914935707220574, + "grad_norm": 0.23260244421957282, + "learning_rate": 7.566874312935142e-06, + "loss": 0.3409, + "step": 2620 + }, + { + "epoch": 2.592482690405539, + "grad_norm": 0.2436083162884712, + "learning_rate": 7.54855258336387e-06, + "loss": 0.3385, + "step": 2621 + }, + { + "epoch": 2.593471810089021, + "grad_norm": 0.21262219531288126, + "learning_rate": 7.530230853792599e-06, + "loss": 0.3135, + "step": 2622 + }, + { + "epoch": 2.5944609297725023, + "grad_norm": 0.2368450650064414, + "learning_rate": 7.511909124221327e-06, + "loss": 0.385, + "step": 2623 + }, + { + "epoch": 2.5954500494559842, + "grad_norm": 0.215786394243701, + "learning_rate": 7.493587394650055e-06, + "loss": 0.3208, + "step": 2624 + }, + { + "epoch": 2.596439169139466, + "grad_norm": 0.238337529875864, + "learning_rate": 7.475265665078785e-06, + "loss": 0.3352, + "step": 2625 + }, + { + "epoch": 2.5974282888229476, + "grad_norm": 0.20979045932789106, + "learning_rate": 7.456943935507513e-06, + "loss": 0.3257, + "step": 2626 + }, + { + "epoch": 2.598417408506429, + "grad_norm": 0.24130139437210824, + "learning_rate": 7.438622205936241e-06, + "loss": 0.3548, + "step": 2627 + }, + { + "epoch": 2.599406528189911, + "grad_norm": 0.2258956518218783, + "learning_rate": 7.420300476364969e-06, + "loss": 0.3752, + "step": 2628 + }, + { + "epoch": 2.600395647873393, + "grad_norm": 0.23656689766867856, + "learning_rate": 7.401978746793697e-06, + "loss": 0.3919, + "step": 2629 + }, + { + "epoch": 2.6013847675568744, + "grad_norm": 0.20797130671298258, + "learning_rate": 7.383657017222425e-06, + "loss": 0.295, + "step": 2630 + }, + { + "epoch": 2.602373887240356, + "grad_norm": 0.2123407700433041, + "learning_rate": 7.365335287651155e-06, + "loss": 0.3318, + "step": 2631 + }, + { + "epoch": 2.603363006923838, + "grad_norm": 0.25028617103387574, + "learning_rate": 7.347013558079883e-06, + "loss": 0.3695, + "step": 2632 + }, + { + "epoch": 2.6043521266073197, + "grad_norm": 0.23738737755032072, + "learning_rate": 7.328691828508611e-06, + "loss": 0.3469, + "step": 2633 + }, + { + "epoch": 2.605341246290801, + "grad_norm": 0.230389214849109, + "learning_rate": 7.31037009893734e-06, + "loss": 0.3563, + "step": 2634 + }, + { + "epoch": 2.6063303659742827, + "grad_norm": 0.21597950801778004, + "learning_rate": 7.292048369366068e-06, + "loss": 0.3339, + "step": 2635 + }, + { + "epoch": 2.6073194856577646, + "grad_norm": 0.2349603015018135, + "learning_rate": 7.2737266397947965e-06, + "loss": 0.3586, + "step": 2636 + }, + { + "epoch": 2.6083086053412465, + "grad_norm": 0.2083573369468888, + "learning_rate": 7.255404910223526e-06, + "loss": 0.3004, + "step": 2637 + }, + { + "epoch": 2.609297725024728, + "grad_norm": 0.2375417304044579, + "learning_rate": 7.2370831806522544e-06, + "loss": 0.3667, + "step": 2638 + }, + { + "epoch": 2.6102868447082095, + "grad_norm": 0.21283724617762306, + "learning_rate": 7.2187614510809825e-06, + "loss": 0.3242, + "step": 2639 + }, + { + "epoch": 2.6112759643916914, + "grad_norm": 0.24402416859387327, + "learning_rate": 7.200439721509711e-06, + "loss": 0.3715, + "step": 2640 + }, + { + "epoch": 2.6122650840751733, + "grad_norm": 0.22429238323329195, + "learning_rate": 7.182117991938439e-06, + "loss": 0.3266, + "step": 2641 + }, + { + "epoch": 2.613254203758655, + "grad_norm": 0.21784385253259106, + "learning_rate": 7.163796262367167e-06, + "loss": 0.3087, + "step": 2642 + }, + { + "epoch": 2.6142433234421363, + "grad_norm": 0.22124966929792098, + "learning_rate": 7.145474532795897e-06, + "loss": 0.3418, + "step": 2643 + }, + { + "epoch": 2.615232443125618, + "grad_norm": 0.21365105225452002, + "learning_rate": 7.127152803224625e-06, + "loss": 0.3277, + "step": 2644 + }, + { + "epoch": 2.6162215628091, + "grad_norm": 0.22073371217981072, + "learning_rate": 7.108831073653354e-06, + "loss": 0.3436, + "step": 2645 + }, + { + "epoch": 2.6172106824925816, + "grad_norm": 0.23269085111573848, + "learning_rate": 7.090509344082082e-06, + "loss": 0.3352, + "step": 2646 + }, + { + "epoch": 2.618199802176063, + "grad_norm": 0.20633980623132578, + "learning_rate": 7.07218761451081e-06, + "loss": 0.3013, + "step": 2647 + }, + { + "epoch": 2.619188921859545, + "grad_norm": 0.2241331658702146, + "learning_rate": 7.053865884939538e-06, + "loss": 0.3542, + "step": 2648 + }, + { + "epoch": 2.620178041543027, + "grad_norm": 0.2289694050769472, + "learning_rate": 7.035544155368268e-06, + "loss": 0.3261, + "step": 2649 + }, + { + "epoch": 2.6211671612265084, + "grad_norm": 0.22213256212288926, + "learning_rate": 7.017222425796996e-06, + "loss": 0.3227, + "step": 2650 + }, + { + "epoch": 2.62215628090999, + "grad_norm": 0.22863504343907765, + "learning_rate": 6.998900696225724e-06, + "loss": 0.3628, + "step": 2651 + }, + { + "epoch": 2.623145400593472, + "grad_norm": 0.2166138187893422, + "learning_rate": 6.980578966654452e-06, + "loss": 0.3213, + "step": 2652 + }, + { + "epoch": 2.6241345202769537, + "grad_norm": 0.22061018086704073, + "learning_rate": 6.96225723708318e-06, + "loss": 0.362, + "step": 2653 + }, + { + "epoch": 2.625123639960435, + "grad_norm": 0.24304965680542823, + "learning_rate": 6.943935507511909e-06, + "loss": 0.3757, + "step": 2654 + }, + { + "epoch": 2.6261127596439167, + "grad_norm": 0.2166210140761748, + "learning_rate": 6.925613777940638e-06, + "loss": 0.3167, + "step": 2655 + }, + { + "epoch": 2.6271018793273986, + "grad_norm": 0.21031218581565844, + "learning_rate": 6.907292048369366e-06, + "loss": 0.3391, + "step": 2656 + }, + { + "epoch": 2.6280909990108805, + "grad_norm": 0.20543467160833437, + "learning_rate": 6.888970318798095e-06, + "loss": 0.3058, + "step": 2657 + }, + { + "epoch": 2.629080118694362, + "grad_norm": 0.23810595397235643, + "learning_rate": 6.8706485892268234e-06, + "loss": 0.3524, + "step": 2658 + }, + { + "epoch": 2.6300692383778435, + "grad_norm": 0.21740713887368684, + "learning_rate": 6.8523268596555516e-06, + "loss": 0.3436, + "step": 2659 + }, + { + "epoch": 2.6310583580613254, + "grad_norm": 0.20995936719262198, + "learning_rate": 6.83400513008428e-06, + "loss": 0.3251, + "step": 2660 + }, + { + "epoch": 2.6320474777448073, + "grad_norm": 0.2303528115218695, + "learning_rate": 6.8156834005130095e-06, + "loss": 0.3675, + "step": 2661 + }, + { + "epoch": 2.6330365974282888, + "grad_norm": 0.22458667063645485, + "learning_rate": 6.797361670941738e-06, + "loss": 0.3362, + "step": 2662 + }, + { + "epoch": 2.6340257171117702, + "grad_norm": 0.20914465008907085, + "learning_rate": 6.779039941370466e-06, + "loss": 0.3254, + "step": 2663 + }, + { + "epoch": 2.635014836795252, + "grad_norm": 0.23181001869723658, + "learning_rate": 6.760718211799194e-06, + "loss": 0.3387, + "step": 2664 + }, + { + "epoch": 2.636003956478734, + "grad_norm": 0.2223165254541124, + "learning_rate": 6.742396482227922e-06, + "loss": 0.3498, + "step": 2665 + }, + { + "epoch": 2.6369930761622156, + "grad_norm": 0.22334071001886052, + "learning_rate": 6.724074752656651e-06, + "loss": 0.3115, + "step": 2666 + }, + { + "epoch": 2.6379821958456975, + "grad_norm": 0.23398152025847294, + "learning_rate": 6.70575302308538e-06, + "loss": 0.3507, + "step": 2667 + }, + { + "epoch": 2.638971315529179, + "grad_norm": 0.22539172159114598, + "learning_rate": 6.687431293514109e-06, + "loss": 0.3611, + "step": 2668 + }, + { + "epoch": 2.639960435212661, + "grad_norm": 0.21097633122850826, + "learning_rate": 6.669109563942837e-06, + "loss": 0.3183, + "step": 2669 + }, + { + "epoch": 2.6409495548961424, + "grad_norm": 0.2445964426483477, + "learning_rate": 6.650787834371565e-06, + "loss": 0.3399, + "step": 2670 + }, + { + "epoch": 2.6419386745796243, + "grad_norm": 0.23695631235504286, + "learning_rate": 6.632466104800293e-06, + "loss": 0.3412, + "step": 2671 + }, + { + "epoch": 2.6429277942631058, + "grad_norm": 0.2534807066261069, + "learning_rate": 6.614144375229021e-06, + "loss": 0.3811, + "step": 2672 + }, + { + "epoch": 2.6439169139465877, + "grad_norm": 0.2352492192731664, + "learning_rate": 6.595822645657751e-06, + "loss": 0.3265, + "step": 2673 + }, + { + "epoch": 2.644906033630069, + "grad_norm": 0.22325401378241108, + "learning_rate": 6.577500916086479e-06, + "loss": 0.338, + "step": 2674 + }, + { + "epoch": 2.645895153313551, + "grad_norm": 0.2355023381434876, + "learning_rate": 6.559179186515207e-06, + "loss": 0.3439, + "step": 2675 + }, + { + "epoch": 2.6468842729970326, + "grad_norm": 0.25844539764836716, + "learning_rate": 6.540857456943935e-06, + "loss": 0.3465, + "step": 2676 + }, + { + "epoch": 2.6478733926805145, + "grad_norm": 0.20204122613105294, + "learning_rate": 6.522535727372664e-06, + "loss": 0.326, + "step": 2677 + }, + { + "epoch": 2.648862512363996, + "grad_norm": 0.20091765692132063, + "learning_rate": 6.5042139978013925e-06, + "loss": 0.3146, + "step": 2678 + }, + { + "epoch": 2.649851632047478, + "grad_norm": 0.2151974965408436, + "learning_rate": 6.485892268230121e-06, + "loss": 0.3508, + "step": 2679 + }, + { + "epoch": 2.6508407517309593, + "grad_norm": 0.25164157106976864, + "learning_rate": 6.46757053865885e-06, + "loss": 0.3793, + "step": 2680 + }, + { + "epoch": 2.6518298714144413, + "grad_norm": 0.23370693030036446, + "learning_rate": 6.4492488090875785e-06, + "loss": 0.4006, + "step": 2681 + }, + { + "epoch": 2.6528189910979227, + "grad_norm": 0.23987905015984629, + "learning_rate": 6.430927079516307e-06, + "loss": 0.3613, + "step": 2682 + }, + { + "epoch": 2.6538081107814047, + "grad_norm": 0.21594181062915788, + "learning_rate": 6.412605349945035e-06, + "loss": 0.3307, + "step": 2683 + }, + { + "epoch": 2.654797230464886, + "grad_norm": 0.2290158725540692, + "learning_rate": 6.394283620373763e-06, + "loss": 0.3062, + "step": 2684 + }, + { + "epoch": 2.655786350148368, + "grad_norm": 0.22948262260424798, + "learning_rate": 6.375961890802493e-06, + "loss": 0.3722, + "step": 2685 + }, + { + "epoch": 2.6567754698318495, + "grad_norm": 0.22027398416939878, + "learning_rate": 6.357640161231221e-06, + "loss": 0.3253, + "step": 2686 + }, + { + "epoch": 2.6577645895153315, + "grad_norm": 0.22047131128891237, + "learning_rate": 6.339318431659949e-06, + "loss": 0.3447, + "step": 2687 + }, + { + "epoch": 2.658753709198813, + "grad_norm": 0.21264120578875886, + "learning_rate": 6.320996702088677e-06, + "loss": 0.3178, + "step": 2688 + }, + { + "epoch": 2.659742828882295, + "grad_norm": 0.21403466908930086, + "learning_rate": 6.302674972517406e-06, + "loss": 0.3263, + "step": 2689 + }, + { + "epoch": 2.6607319485657763, + "grad_norm": 0.2340128370458701, + "learning_rate": 6.284353242946134e-06, + "loss": 0.344, + "step": 2690 + }, + { + "epoch": 2.6617210682492582, + "grad_norm": 0.21709601415825422, + "learning_rate": 6.266031513374864e-06, + "loss": 0.332, + "step": 2691 + }, + { + "epoch": 2.6627101879327397, + "grad_norm": 0.20573493768846976, + "learning_rate": 6.247709783803592e-06, + "loss": 0.2896, + "step": 2692 + }, + { + "epoch": 2.6636993076162216, + "grad_norm": 0.2175633594597353, + "learning_rate": 6.22938805423232e-06, + "loss": 0.3255, + "step": 2693 + }, + { + "epoch": 2.664688427299703, + "grad_norm": 0.2208890382225565, + "learning_rate": 6.211066324661048e-06, + "loss": 0.3241, + "step": 2694 + }, + { + "epoch": 2.665677546983185, + "grad_norm": 0.22648814414163115, + "learning_rate": 6.192744595089777e-06, + "loss": 0.3803, + "step": 2695 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.2065203676472046, + "learning_rate": 6.174422865518505e-06, + "loss": 0.3179, + "step": 2696 + }, + { + "epoch": 2.6676557863501484, + "grad_norm": 0.22119342098303224, + "learning_rate": 6.156101135947233e-06, + "loss": 0.3375, + "step": 2697 + }, + { + "epoch": 2.66864490603363, + "grad_norm": 0.22093559612372538, + "learning_rate": 6.137779406375962e-06, + "loss": 0.3514, + "step": 2698 + }, + { + "epoch": 2.669634025717112, + "grad_norm": 0.4155710402624548, + "learning_rate": 6.1194576768046904e-06, + "loss": 0.4384, + "step": 2699 + }, + { + "epoch": 2.6706231454005933, + "grad_norm": 0.21197542896961055, + "learning_rate": 6.101135947233419e-06, + "loss": 0.3247, + "step": 2700 + }, + { + "epoch": 2.6716122650840752, + "grad_norm": 0.20867372426418582, + "learning_rate": 6.0828142176621475e-06, + "loss": 0.3162, + "step": 2701 + }, + { + "epoch": 2.6726013847675567, + "grad_norm": 0.21283459025003898, + "learning_rate": 6.0644924880908765e-06, + "loss": 0.3088, + "step": 2702 + }, + { + "epoch": 2.6735905044510386, + "grad_norm": 0.27512316233142525, + "learning_rate": 6.0461707585196046e-06, + "loss": 0.3543, + "step": 2703 + }, + { + "epoch": 2.6745796241345206, + "grad_norm": 0.23578251994420446, + "learning_rate": 6.0278490289483335e-06, + "loss": 0.3408, + "step": 2704 + }, + { + "epoch": 2.675568743818002, + "grad_norm": 0.23108104721689834, + "learning_rate": 6.009527299377062e-06, + "loss": 0.3547, + "step": 2705 + }, + { + "epoch": 2.6765578635014835, + "grad_norm": 0.22990919690448694, + "learning_rate": 5.99120556980579e-06, + "loss": 0.3583, + "step": 2706 + }, + { + "epoch": 2.6775469831849654, + "grad_norm": 0.22608119799904844, + "learning_rate": 5.972883840234519e-06, + "loss": 0.3468, + "step": 2707 + }, + { + "epoch": 2.6785361028684473, + "grad_norm": 0.23420424171951532, + "learning_rate": 5.954562110663247e-06, + "loss": 0.3711, + "step": 2708 + }, + { + "epoch": 2.679525222551929, + "grad_norm": 0.22283499002319432, + "learning_rate": 5.936240381091975e-06, + "loss": 0.3433, + "step": 2709 + }, + { + "epoch": 2.6805143422354103, + "grad_norm": 0.25165983826778154, + "learning_rate": 5.917918651520704e-06, + "loss": 0.3577, + "step": 2710 + }, + { + "epoch": 2.681503461918892, + "grad_norm": 0.21381831548899768, + "learning_rate": 5.899596921949432e-06, + "loss": 0.3243, + "step": 2711 + }, + { + "epoch": 2.682492581602374, + "grad_norm": 0.24859770910561865, + "learning_rate": 5.881275192378161e-06, + "loss": 0.3932, + "step": 2712 + }, + { + "epoch": 2.6834817012858556, + "grad_norm": 0.22835997974466907, + "learning_rate": 5.86295346280689e-06, + "loss": 0.358, + "step": 2713 + }, + { + "epoch": 2.684470820969337, + "grad_norm": 0.21603309757069702, + "learning_rate": 5.844631733235618e-06, + "loss": 0.3227, + "step": 2714 + }, + { + "epoch": 2.685459940652819, + "grad_norm": 0.22420474785394226, + "learning_rate": 5.826310003664346e-06, + "loss": 0.3219, + "step": 2715 + }, + { + "epoch": 2.686449060336301, + "grad_norm": 0.211435200550928, + "learning_rate": 5.807988274093075e-06, + "loss": 0.3027, + "step": 2716 + }, + { + "epoch": 2.6874381800197824, + "grad_norm": 0.2244513347827355, + "learning_rate": 5.789666544521803e-06, + "loss": 0.3397, + "step": 2717 + }, + { + "epoch": 2.688427299703264, + "grad_norm": 0.21717002471268115, + "learning_rate": 5.771344814950531e-06, + "loss": 0.3334, + "step": 2718 + }, + { + "epoch": 2.689416419386746, + "grad_norm": 0.23016598747660916, + "learning_rate": 5.75302308537926e-06, + "loss": 0.3375, + "step": 2719 + }, + { + "epoch": 2.6904055390702277, + "grad_norm": 0.21015267166960389, + "learning_rate": 5.734701355807988e-06, + "loss": 0.3159, + "step": 2720 + }, + { + "epoch": 2.691394658753709, + "grad_norm": 0.2108694801384789, + "learning_rate": 5.7163796262367165e-06, + "loss": 0.3308, + "step": 2721 + }, + { + "epoch": 2.6923837784371907, + "grad_norm": 0.23368408762172113, + "learning_rate": 5.6980578966654455e-06, + "loss": 0.3768, + "step": 2722 + }, + { + "epoch": 2.6933728981206726, + "grad_norm": 0.23439430393969224, + "learning_rate": 5.679736167094174e-06, + "loss": 0.3017, + "step": 2723 + }, + { + "epoch": 2.6943620178041545, + "grad_norm": 0.23942635675606497, + "learning_rate": 5.6614144375229025e-06, + "loss": 0.3689, + "step": 2724 + }, + { + "epoch": 2.695351137487636, + "grad_norm": 0.22177373379695609, + "learning_rate": 5.6430927079516315e-06, + "loss": 0.3521, + "step": 2725 + }, + { + "epoch": 2.6963402571711175, + "grad_norm": 0.21547233338756122, + "learning_rate": 5.62477097838036e-06, + "loss": 0.3416, + "step": 2726 + }, + { + "epoch": 2.6973293768545994, + "grad_norm": 0.21774106460684997, + "learning_rate": 5.606449248809088e-06, + "loss": 0.3218, + "step": 2727 + }, + { + "epoch": 2.6983184965380813, + "grad_norm": 0.2334321077543478, + "learning_rate": 5.588127519237817e-06, + "loss": 0.358, + "step": 2728 + }, + { + "epoch": 2.699307616221563, + "grad_norm": 0.24139566192301815, + "learning_rate": 5.569805789666545e-06, + "loss": 0.3771, + "step": 2729 + }, + { + "epoch": 2.7002967359050443, + "grad_norm": 0.2540431339935207, + "learning_rate": 5.551484060095273e-06, + "loss": 0.3657, + "step": 2730 + }, + { + "epoch": 2.701285855588526, + "grad_norm": 0.5909800839666434, + "learning_rate": 5.533162330524002e-06, + "loss": 0.3952, + "step": 2731 + }, + { + "epoch": 2.702274975272008, + "grad_norm": 0.2503607288953179, + "learning_rate": 5.51484060095273e-06, + "loss": 0.3793, + "step": 2732 + }, + { + "epoch": 2.7032640949554896, + "grad_norm": 0.2223796649689281, + "learning_rate": 5.496518871381458e-06, + "loss": 0.3031, + "step": 2733 + }, + { + "epoch": 2.704253214638971, + "grad_norm": 0.23538524556735074, + "learning_rate": 5.478197141810187e-06, + "loss": 0.3706, + "step": 2734 + }, + { + "epoch": 2.705242334322453, + "grad_norm": 0.2229912934237323, + "learning_rate": 5.459875412238916e-06, + "loss": 0.3253, + "step": 2735 + }, + { + "epoch": 2.706231454005935, + "grad_norm": 0.22679198948267829, + "learning_rate": 5.441553682667644e-06, + "loss": 0.3297, + "step": 2736 + }, + { + "epoch": 2.7072205736894164, + "grad_norm": 0.21590619745768205, + "learning_rate": 5.423231953096373e-06, + "loss": 0.3169, + "step": 2737 + }, + { + "epoch": 2.708209693372898, + "grad_norm": 0.2217628704155836, + "learning_rate": 5.404910223525101e-06, + "loss": 0.3624, + "step": 2738 + }, + { + "epoch": 2.7091988130563798, + "grad_norm": 0.2169003913914285, + "learning_rate": 5.386588493953829e-06, + "loss": 0.3112, + "step": 2739 + }, + { + "epoch": 2.7101879327398617, + "grad_norm": 0.23975530450787874, + "learning_rate": 5.368266764382558e-06, + "loss": 0.371, + "step": 2740 + }, + { + "epoch": 2.711177052423343, + "grad_norm": 0.22760491329000107, + "learning_rate": 5.349945034811286e-06, + "loss": 0.3033, + "step": 2741 + }, + { + "epoch": 2.712166172106825, + "grad_norm": 0.2126172357592932, + "learning_rate": 5.3316233052400145e-06, + "loss": 0.3459, + "step": 2742 + }, + { + "epoch": 2.7131552917903066, + "grad_norm": 0.24447754746061037, + "learning_rate": 5.3133015756687434e-06, + "loss": 0.3579, + "step": 2743 + }, + { + "epoch": 2.7141444114737885, + "grad_norm": 0.21923248450136007, + "learning_rate": 5.2949798460974715e-06, + "loss": 0.3553, + "step": 2744 + }, + { + "epoch": 2.71513353115727, + "grad_norm": 0.23324322920894322, + "learning_rate": 5.2766581165262005e-06, + "loss": 0.3876, + "step": 2745 + }, + { + "epoch": 2.716122650840752, + "grad_norm": 0.21651127289013336, + "learning_rate": 5.258336386954929e-06, + "loss": 0.3271, + "step": 2746 + }, + { + "epoch": 2.7171117705242334, + "grad_norm": 0.2278600935466184, + "learning_rate": 5.2400146573836576e-06, + "loss": 0.3767, + "step": 2747 + }, + { + "epoch": 2.7181008902077153, + "grad_norm": 0.2102279454382535, + "learning_rate": 5.221692927812386e-06, + "loss": 0.2996, + "step": 2748 + }, + { + "epoch": 2.7190900098911968, + "grad_norm": 0.19846329542181648, + "learning_rate": 5.203371198241115e-06, + "loss": 0.2969, + "step": 2749 + }, + { + "epoch": 2.7200791295746787, + "grad_norm": 0.23474124954742392, + "learning_rate": 5.185049468669843e-06, + "loss": 0.3563, + "step": 2750 + }, + { + "epoch": 2.72106824925816, + "grad_norm": 0.2133239490347135, + "learning_rate": 5.166727739098571e-06, + "loss": 0.3625, + "step": 2751 + }, + { + "epoch": 2.722057368941642, + "grad_norm": 0.2358371050129765, + "learning_rate": 5.1484060095273e-06, + "loss": 0.3468, + "step": 2752 + }, + { + "epoch": 2.7230464886251236, + "grad_norm": 0.20255887262973588, + "learning_rate": 5.130084279956028e-06, + "loss": 0.3045, + "step": 2753 + }, + { + "epoch": 2.7240356083086055, + "grad_norm": 0.22868689020977842, + "learning_rate": 5.111762550384756e-06, + "loss": 0.3525, + "step": 2754 + }, + { + "epoch": 2.725024727992087, + "grad_norm": 0.24113962478912296, + "learning_rate": 5.093440820813485e-06, + "loss": 0.3564, + "step": 2755 + }, + { + "epoch": 2.726013847675569, + "grad_norm": 0.2442213322729859, + "learning_rate": 5.075119091242213e-06, + "loss": 0.3272, + "step": 2756 + }, + { + "epoch": 2.7270029673590503, + "grad_norm": 0.22564109275269484, + "learning_rate": 5.056797361670942e-06, + "loss": 0.3294, + "step": 2757 + }, + { + "epoch": 2.7279920870425323, + "grad_norm": 0.2187439459162259, + "learning_rate": 5.038475632099671e-06, + "loss": 0.3428, + "step": 2758 + }, + { + "epoch": 2.7289812067260137, + "grad_norm": 0.22198247455708378, + "learning_rate": 5.020153902528399e-06, + "loss": 0.3523, + "step": 2759 + }, + { + "epoch": 2.7299703264094957, + "grad_norm": 0.20648216241698295, + "learning_rate": 5.001832172957127e-06, + "loss": 0.2998, + "step": 2760 + }, + { + "epoch": 2.730959446092977, + "grad_norm": 0.21591111811349886, + "learning_rate": 4.983510443385856e-06, + "loss": 0.3502, + "step": 2761 + }, + { + "epoch": 2.731948565776459, + "grad_norm": 0.20875229932327685, + "learning_rate": 4.965188713814584e-06, + "loss": 0.319, + "step": 2762 + }, + { + "epoch": 2.7329376854599405, + "grad_norm": 0.2309618563362465, + "learning_rate": 4.9468669842433124e-06, + "loss": 0.3301, + "step": 2763 + }, + { + "epoch": 2.7339268051434225, + "grad_norm": 0.21705046113121196, + "learning_rate": 4.928545254672041e-06, + "loss": 0.3328, + "step": 2764 + }, + { + "epoch": 2.734915924826904, + "grad_norm": 0.2027787922422683, + "learning_rate": 4.9102235251007695e-06, + "loss": 0.3022, + "step": 2765 + }, + { + "epoch": 2.735905044510386, + "grad_norm": 0.2133690055551515, + "learning_rate": 4.891901795529498e-06, + "loss": 0.3357, + "step": 2766 + }, + { + "epoch": 2.7368941641938673, + "grad_norm": 0.22559371081701649, + "learning_rate": 4.873580065958227e-06, + "loss": 0.3545, + "step": 2767 + }, + { + "epoch": 2.7378832838773492, + "grad_norm": 0.21203557203334927, + "learning_rate": 4.8552583363869555e-06, + "loss": 0.3328, + "step": 2768 + }, + { + "epoch": 2.7388724035608307, + "grad_norm": 0.21294747265250044, + "learning_rate": 4.836936606815684e-06, + "loss": 0.332, + "step": 2769 + }, + { + "epoch": 2.7398615232443126, + "grad_norm": 0.21123677919400954, + "learning_rate": 4.818614877244413e-06, + "loss": 0.3414, + "step": 2770 + }, + { + "epoch": 2.740850642927794, + "grad_norm": 0.21659310977778334, + "learning_rate": 4.800293147673141e-06, + "loss": 0.3264, + "step": 2771 + }, + { + "epoch": 2.741839762611276, + "grad_norm": 0.22960667507714003, + "learning_rate": 4.781971418101869e-06, + "loss": 0.374, + "step": 2772 + }, + { + "epoch": 2.7428288822947575, + "grad_norm": 0.21339735395164217, + "learning_rate": 4.763649688530598e-06, + "loss": 0.3446, + "step": 2773 + }, + { + "epoch": 2.7438180019782394, + "grad_norm": 0.23992675800975746, + "learning_rate": 4.745327958959326e-06, + "loss": 0.3627, + "step": 2774 + }, + { + "epoch": 2.744807121661721, + "grad_norm": 0.21653441504113724, + "learning_rate": 4.727006229388054e-06, + "loss": 0.3292, + "step": 2775 + }, + { + "epoch": 2.745796241345203, + "grad_norm": 0.20765781658155638, + "learning_rate": 4.708684499816783e-06, + "loss": 0.3369, + "step": 2776 + }, + { + "epoch": 2.7467853610286843, + "grad_norm": 0.2137923966682356, + "learning_rate": 4.690362770245511e-06, + "loss": 0.3316, + "step": 2777 + }, + { + "epoch": 2.7477744807121662, + "grad_norm": 0.22951511263981078, + "learning_rate": 4.672041040674239e-06, + "loss": 0.3232, + "step": 2778 + }, + { + "epoch": 2.7487636003956477, + "grad_norm": 0.21668420363059104, + "learning_rate": 4.653719311102968e-06, + "loss": 0.3133, + "step": 2779 + }, + { + "epoch": 2.7497527200791296, + "grad_norm": 0.20870298094145542, + "learning_rate": 4.635397581531697e-06, + "loss": 0.3079, + "step": 2780 + }, + { + "epoch": 2.750741839762611, + "grad_norm": 0.21695592576488162, + "learning_rate": 4.617075851960425e-06, + "loss": 0.3365, + "step": 2781 + }, + { + "epoch": 2.751730959446093, + "grad_norm": 0.23452800703138157, + "learning_rate": 4.598754122389154e-06, + "loss": 0.3605, + "step": 2782 + }, + { + "epoch": 2.752720079129575, + "grad_norm": 0.22386479424333205, + "learning_rate": 4.580432392817882e-06, + "loss": 0.3766, + "step": 2783 + }, + { + "epoch": 2.7537091988130564, + "grad_norm": 0.2500059904791562, + "learning_rate": 4.56211066324661e-06, + "loss": 0.3552, + "step": 2784 + }, + { + "epoch": 2.754698318496538, + "grad_norm": 0.2120523623817133, + "learning_rate": 4.543788933675339e-06, + "loss": 0.3178, + "step": 2785 + }, + { + "epoch": 2.75568743818002, + "grad_norm": 0.2151292399138165, + "learning_rate": 4.5254672041040675e-06, + "loss": 0.3671, + "step": 2786 + }, + { + "epoch": 2.7566765578635017, + "grad_norm": 0.21872770644925504, + "learning_rate": 4.507145474532796e-06, + "loss": 0.3641, + "step": 2787 + }, + { + "epoch": 2.757665677546983, + "grad_norm": 0.24772664908497757, + "learning_rate": 4.4888237449615246e-06, + "loss": 0.3734, + "step": 2788 + }, + { + "epoch": 2.7586547972304647, + "grad_norm": 0.2642668933133229, + "learning_rate": 4.470502015390253e-06, + "loss": 0.3684, + "step": 2789 + }, + { + "epoch": 2.7596439169139466, + "grad_norm": 0.2221777432007733, + "learning_rate": 4.452180285818982e-06, + "loss": 0.349, + "step": 2790 + }, + { + "epoch": 2.7606330365974285, + "grad_norm": 0.2196073440373851, + "learning_rate": 4.43385855624771e-06, + "loss": 0.3489, + "step": 2791 + }, + { + "epoch": 2.76162215628091, + "grad_norm": 0.23391467333346672, + "learning_rate": 4.415536826676439e-06, + "loss": 0.3598, + "step": 2792 + }, + { + "epoch": 2.7626112759643915, + "grad_norm": 0.2115533472124657, + "learning_rate": 4.397215097105167e-06, + "loss": 0.3081, + "step": 2793 + }, + { + "epoch": 2.7636003956478734, + "grad_norm": 0.2271678820385135, + "learning_rate": 4.378893367533896e-06, + "loss": 0.3716, + "step": 2794 + }, + { + "epoch": 2.7645895153313553, + "grad_norm": 0.23767597252465578, + "learning_rate": 4.360571637962624e-06, + "loss": 0.3591, + "step": 2795 + }, + { + "epoch": 2.765578635014837, + "grad_norm": 0.2066536302237579, + "learning_rate": 4.342249908391352e-06, + "loss": 0.3413, + "step": 2796 + }, + { + "epoch": 2.7665677546983183, + "grad_norm": 0.21845630072729733, + "learning_rate": 4.323928178820081e-06, + "loss": 0.3577, + "step": 2797 + }, + { + "epoch": 2.7675568743818, + "grad_norm": 0.21779485602289012, + "learning_rate": 4.305606449248809e-06, + "loss": 0.3468, + "step": 2798 + }, + { + "epoch": 2.768545994065282, + "grad_norm": 0.22232401786465894, + "learning_rate": 4.287284719677537e-06, + "loss": 0.3144, + "step": 2799 + }, + { + "epoch": 2.7695351137487636, + "grad_norm": 0.23008435815779882, + "learning_rate": 4.268962990106266e-06, + "loss": 0.3433, + "step": 2800 + }, + { + "epoch": 2.770524233432245, + "grad_norm": 0.2226756211496331, + "learning_rate": 4.250641260534994e-06, + "loss": 0.375, + "step": 2801 + }, + { + "epoch": 2.771513353115727, + "grad_norm": 0.21695887700850297, + "learning_rate": 4.232319530963723e-06, + "loss": 0.3465, + "step": 2802 + }, + { + "epoch": 2.772502472799209, + "grad_norm": 0.22223208742172174, + "learning_rate": 4.213997801392452e-06, + "loss": 0.2978, + "step": 2803 + }, + { + "epoch": 2.7734915924826904, + "grad_norm": 0.2170797804111916, + "learning_rate": 4.19567607182118e-06, + "loss": 0.3213, + "step": 2804 + }, + { + "epoch": 2.774480712166172, + "grad_norm": 0.22027515254777197, + "learning_rate": 4.177354342249908e-06, + "loss": 0.3443, + "step": 2805 + }, + { + "epoch": 2.775469831849654, + "grad_norm": 0.22207428166028975, + "learning_rate": 4.159032612678637e-06, + "loss": 0.3573, + "step": 2806 + }, + { + "epoch": 2.7764589515331357, + "grad_norm": 0.2087620444374386, + "learning_rate": 4.1407108831073654e-06, + "loss": 0.3414, + "step": 2807 + }, + { + "epoch": 2.777448071216617, + "grad_norm": 0.2078913845907797, + "learning_rate": 4.1223891535360936e-06, + "loss": 0.3213, + "step": 2808 + }, + { + "epoch": 2.7784371909000987, + "grad_norm": 0.23077801939274703, + "learning_rate": 4.1040674239648225e-06, + "loss": 0.3524, + "step": 2809 + }, + { + "epoch": 2.7794263105835806, + "grad_norm": 0.21731853730142003, + "learning_rate": 4.085745694393551e-06, + "loss": 0.3441, + "step": 2810 + }, + { + "epoch": 2.7804154302670625, + "grad_norm": 0.216573556930499, + "learning_rate": 4.067423964822279e-06, + "loss": 0.3576, + "step": 2811 + }, + { + "epoch": 2.781404549950544, + "grad_norm": 0.21120665151959334, + "learning_rate": 4.049102235251008e-06, + "loss": 0.3431, + "step": 2812 + }, + { + "epoch": 2.7823936696340255, + "grad_norm": 0.23307858510646606, + "learning_rate": 4.030780505679737e-06, + "loss": 0.3603, + "step": 2813 + }, + { + "epoch": 2.7833827893175074, + "grad_norm": 0.2107086413941475, + "learning_rate": 4.012458776108465e-06, + "loss": 0.3332, + "step": 2814 + }, + { + "epoch": 2.7843719090009893, + "grad_norm": 0.23230553012383037, + "learning_rate": 3.994137046537194e-06, + "loss": 0.3313, + "step": 2815 + }, + { + "epoch": 2.7853610286844708, + "grad_norm": 0.25073644328468253, + "learning_rate": 3.975815316965922e-06, + "loss": 0.4522, + "step": 2816 + }, + { + "epoch": 2.7863501483679523, + "grad_norm": 0.22850963515675785, + "learning_rate": 3.95749358739465e-06, + "loss": 0.3318, + "step": 2817 + }, + { + "epoch": 2.787339268051434, + "grad_norm": 0.22594036641456755, + "learning_rate": 3.939171857823379e-06, + "loss": 0.3078, + "step": 2818 + }, + { + "epoch": 2.788328387734916, + "grad_norm": 0.23726950060282387, + "learning_rate": 3.920850128252107e-06, + "loss": 0.3667, + "step": 2819 + }, + { + "epoch": 2.7893175074183976, + "grad_norm": 0.21765453645873728, + "learning_rate": 3.902528398680836e-06, + "loss": 0.3253, + "step": 2820 + }, + { + "epoch": 2.7903066271018795, + "grad_norm": 0.21731756160867208, + "learning_rate": 3.884206669109564e-06, + "loss": 0.3354, + "step": 2821 + }, + { + "epoch": 2.791295746785361, + "grad_norm": 0.20040051432164183, + "learning_rate": 3.865884939538292e-06, + "loss": 0.3006, + "step": 2822 + }, + { + "epoch": 2.792284866468843, + "grad_norm": 0.22186912403842526, + "learning_rate": 3.847563209967021e-06, + "loss": 0.3171, + "step": 2823 + }, + { + "epoch": 2.7932739861523244, + "grad_norm": 0.22337977682594518, + "learning_rate": 3.829241480395749e-06, + "loss": 0.3805, + "step": 2824 + }, + { + "epoch": 2.7942631058358063, + "grad_norm": 0.22332412301847868, + "learning_rate": 3.810919750824478e-06, + "loss": 0.3363, + "step": 2825 + }, + { + "epoch": 2.7952522255192878, + "grad_norm": 0.2184887090680327, + "learning_rate": 3.7925980212532068e-06, + "loss": 0.3253, + "step": 2826 + }, + { + "epoch": 2.7962413452027697, + "grad_norm": 0.2326535028536087, + "learning_rate": 3.774276291681935e-06, + "loss": 0.3413, + "step": 2827 + }, + { + "epoch": 2.797230464886251, + "grad_norm": 0.22537246867937816, + "learning_rate": 3.7559545621106634e-06, + "loss": 0.3787, + "step": 2828 + }, + { + "epoch": 2.798219584569733, + "grad_norm": 0.21512167988752373, + "learning_rate": 3.7376328325393924e-06, + "loss": 0.3419, + "step": 2829 + }, + { + "epoch": 2.7992087042532146, + "grad_norm": 0.21070682115918318, + "learning_rate": 3.7193111029681205e-06, + "loss": 0.3435, + "step": 2830 + }, + { + "epoch": 2.8001978239366965, + "grad_norm": 0.23029319903291864, + "learning_rate": 3.7009893733968486e-06, + "loss": 0.3486, + "step": 2831 + }, + { + "epoch": 2.801186943620178, + "grad_norm": 0.2138444420546789, + "learning_rate": 3.6826676438255776e-06, + "loss": 0.3415, + "step": 2832 + }, + { + "epoch": 2.80217606330366, + "grad_norm": 0.20357209160670728, + "learning_rate": 3.6643459142543057e-06, + "loss": 0.3293, + "step": 2833 + }, + { + "epoch": 2.8031651829871413, + "grad_norm": 0.25026208901493, + "learning_rate": 3.646024184683034e-06, + "loss": 0.3354, + "step": 2834 + }, + { + "epoch": 2.8041543026706233, + "grad_norm": 0.2011884823421506, + "learning_rate": 3.627702455111763e-06, + "loss": 0.3096, + "step": 2835 + }, + { + "epoch": 2.8051434223541047, + "grad_norm": 0.2394174392476079, + "learning_rate": 3.6093807255404913e-06, + "loss": 0.37, + "step": 2836 + }, + { + "epoch": 2.8061325420375867, + "grad_norm": 0.2255311976493184, + "learning_rate": 3.5910589959692194e-06, + "loss": 0.3478, + "step": 2837 + }, + { + "epoch": 2.807121661721068, + "grad_norm": 0.24237047406156542, + "learning_rate": 3.5727372663979483e-06, + "loss": 0.3575, + "step": 2838 + }, + { + "epoch": 2.80811078140455, + "grad_norm": 0.21059337790432422, + "learning_rate": 3.554415536826677e-06, + "loss": 0.3284, + "step": 2839 + }, + { + "epoch": 2.8090999010880315, + "grad_norm": 0.20503847170160883, + "learning_rate": 3.536093807255405e-06, + "loss": 0.3233, + "step": 2840 + }, + { + "epoch": 2.8100890207715135, + "grad_norm": 0.21119961563953477, + "learning_rate": 3.517772077684134e-06, + "loss": 0.3272, + "step": 2841 + }, + { + "epoch": 2.811078140454995, + "grad_norm": 0.19978901112842018, + "learning_rate": 3.499450348112862e-06, + "loss": 0.3026, + "step": 2842 + }, + { + "epoch": 2.812067260138477, + "grad_norm": 0.21915361223504046, + "learning_rate": 3.48112861854159e-06, + "loss": 0.3428, + "step": 2843 + }, + { + "epoch": 2.8130563798219583, + "grad_norm": 0.2235259457896506, + "learning_rate": 3.462806888970319e-06, + "loss": 0.3043, + "step": 2844 + }, + { + "epoch": 2.8140454995054403, + "grad_norm": 0.2178345130349941, + "learning_rate": 3.4444851593990477e-06, + "loss": 0.3428, + "step": 2845 + }, + { + "epoch": 2.8150346191889217, + "grad_norm": 0.21372597462198137, + "learning_rate": 3.4261634298277758e-06, + "loss": 0.2926, + "step": 2846 + }, + { + "epoch": 2.8160237388724036, + "grad_norm": 0.22720651208550413, + "learning_rate": 3.4078417002565047e-06, + "loss": 0.3482, + "step": 2847 + }, + { + "epoch": 2.817012858555885, + "grad_norm": 0.22006740899272062, + "learning_rate": 3.389519970685233e-06, + "loss": 0.3427, + "step": 2848 + }, + { + "epoch": 2.818001978239367, + "grad_norm": 0.2159529003979577, + "learning_rate": 3.371198241113961e-06, + "loss": 0.3423, + "step": 2849 + }, + { + "epoch": 2.8189910979228485, + "grad_norm": 0.20214518327063988, + "learning_rate": 3.35287651154269e-06, + "loss": 0.3143, + "step": 2850 + }, + { + "epoch": 2.8199802176063304, + "grad_norm": 0.21793790452801143, + "learning_rate": 3.3345547819714185e-06, + "loss": 0.3113, + "step": 2851 + }, + { + "epoch": 2.820969337289812, + "grad_norm": 0.20335863728522915, + "learning_rate": 3.3162330524001466e-06, + "loss": 0.3059, + "step": 2852 + }, + { + "epoch": 2.821958456973294, + "grad_norm": 0.20270595996542862, + "learning_rate": 3.2979113228288755e-06, + "loss": 0.3272, + "step": 2853 + }, + { + "epoch": 2.8229475766567753, + "grad_norm": 1.1077539805296455, + "learning_rate": 3.2795895932576036e-06, + "loss": 0.3856, + "step": 2854 + }, + { + "epoch": 2.8239366963402572, + "grad_norm": 0.24132309930564733, + "learning_rate": 3.261267863686332e-06, + "loss": 0.3957, + "step": 2855 + }, + { + "epoch": 2.8249258160237387, + "grad_norm": 0.2275466649899499, + "learning_rate": 3.2429461341150607e-06, + "loss": 0.3769, + "step": 2856 + }, + { + "epoch": 2.8259149357072206, + "grad_norm": 0.25008670282611023, + "learning_rate": 3.2246244045437892e-06, + "loss": 0.35, + "step": 2857 + }, + { + "epoch": 2.826904055390702, + "grad_norm": 0.22773541395218408, + "learning_rate": 3.2063026749725174e-06, + "loss": 0.3784, + "step": 2858 + }, + { + "epoch": 2.827893175074184, + "grad_norm": 0.20609422624498253, + "learning_rate": 3.1879809454012463e-06, + "loss": 0.3239, + "step": 2859 + }, + { + "epoch": 2.8288822947576655, + "grad_norm": 0.20460207000166866, + "learning_rate": 3.1696592158299744e-06, + "loss": 0.3215, + "step": 2860 + }, + { + "epoch": 2.8298714144411474, + "grad_norm": 0.23129338204484665, + "learning_rate": 3.151337486258703e-06, + "loss": 0.3938, + "step": 2861 + }, + { + "epoch": 2.8308605341246293, + "grad_norm": 0.22362717643248406, + "learning_rate": 3.133015756687432e-06, + "loss": 0.3617, + "step": 2862 + }, + { + "epoch": 2.831849653808111, + "grad_norm": 0.21117103548385116, + "learning_rate": 3.11469402711616e-06, + "loss": 0.3237, + "step": 2863 + }, + { + "epoch": 2.8328387734915923, + "grad_norm": 0.20671124421870285, + "learning_rate": 3.0963722975448886e-06, + "loss": 0.3112, + "step": 2864 + }, + { + "epoch": 2.833827893175074, + "grad_norm": 0.22217972694620672, + "learning_rate": 3.0780505679736167e-06, + "loss": 0.3634, + "step": 2865 + }, + { + "epoch": 2.834817012858556, + "grad_norm": 0.2123227501555754, + "learning_rate": 3.0597288384023452e-06, + "loss": 0.3181, + "step": 2866 + }, + { + "epoch": 2.8358061325420376, + "grad_norm": 0.20096780688164737, + "learning_rate": 3.0414071088310737e-06, + "loss": 0.3491, + "step": 2867 + }, + { + "epoch": 2.836795252225519, + "grad_norm": 0.21924673434018688, + "learning_rate": 3.0230853792598023e-06, + "loss": 0.3443, + "step": 2868 + }, + { + "epoch": 2.837784371909001, + "grad_norm": 0.2286827845733992, + "learning_rate": 3.004763649688531e-06, + "loss": 0.3668, + "step": 2869 + }, + { + "epoch": 2.838773491592483, + "grad_norm": 0.21092705025428599, + "learning_rate": 2.9864419201172594e-06, + "loss": 0.338, + "step": 2870 + }, + { + "epoch": 2.8397626112759644, + "grad_norm": 0.2148124154519086, + "learning_rate": 2.9681201905459875e-06, + "loss": 0.3545, + "step": 2871 + }, + { + "epoch": 2.840751730959446, + "grad_norm": 0.2133227511589604, + "learning_rate": 2.949798460974716e-06, + "loss": 0.3347, + "step": 2872 + }, + { + "epoch": 2.841740850642928, + "grad_norm": 0.2208561637151365, + "learning_rate": 2.931476731403445e-06, + "loss": 0.3511, + "step": 2873 + }, + { + "epoch": 2.8427299703264097, + "grad_norm": 0.2211175652037006, + "learning_rate": 2.913155001832173e-06, + "loss": 0.3802, + "step": 2874 + }, + { + "epoch": 2.843719090009891, + "grad_norm": 0.22087087502836644, + "learning_rate": 2.8948332722609016e-06, + "loss": 0.3668, + "step": 2875 + }, + { + "epoch": 2.8447082096933727, + "grad_norm": 0.22955939394628602, + "learning_rate": 2.87651154268963e-06, + "loss": 0.3035, + "step": 2876 + }, + { + "epoch": 2.8456973293768546, + "grad_norm": 0.20911592349861238, + "learning_rate": 2.8581898131183583e-06, + "loss": 0.3487, + "step": 2877 + }, + { + "epoch": 2.8466864490603365, + "grad_norm": 0.20740229129970458, + "learning_rate": 2.839868083547087e-06, + "loss": 0.2983, + "step": 2878 + }, + { + "epoch": 2.847675568743818, + "grad_norm": 0.1982895805702466, + "learning_rate": 2.8215463539758157e-06, + "loss": 0.29, + "step": 2879 + }, + { + "epoch": 2.8486646884272995, + "grad_norm": 0.21506799683461428, + "learning_rate": 2.803224624404544e-06, + "loss": 0.3596, + "step": 2880 + }, + { + "epoch": 2.8496538081107814, + "grad_norm": 0.21618734491043085, + "learning_rate": 2.7849028948332724e-06, + "loss": 0.3111, + "step": 2881 + }, + { + "epoch": 2.8506429277942633, + "grad_norm": 0.2123248042207484, + "learning_rate": 2.766581165262001e-06, + "loss": 0.3545, + "step": 2882 + }, + { + "epoch": 2.851632047477745, + "grad_norm": 0.22214805828542464, + "learning_rate": 2.748259435690729e-06, + "loss": 0.358, + "step": 2883 + }, + { + "epoch": 2.8526211671612263, + "grad_norm": 0.2004977051100376, + "learning_rate": 2.729937706119458e-06, + "loss": 0.3267, + "step": 2884 + }, + { + "epoch": 2.853610286844708, + "grad_norm": 0.21295550797121635, + "learning_rate": 2.7116159765481865e-06, + "loss": 0.3417, + "step": 2885 + }, + { + "epoch": 2.85459940652819, + "grad_norm": 0.2103516015542619, + "learning_rate": 2.6932942469769146e-06, + "loss": 0.3276, + "step": 2886 + }, + { + "epoch": 2.8555885262116716, + "grad_norm": 0.20954170278454848, + "learning_rate": 2.674972517405643e-06, + "loss": 0.3189, + "step": 2887 + }, + { + "epoch": 2.856577645895153, + "grad_norm": 0.20360676369415176, + "learning_rate": 2.6566507878343717e-06, + "loss": 0.319, + "step": 2888 + }, + { + "epoch": 2.857566765578635, + "grad_norm": 0.21277689756209228, + "learning_rate": 2.6383290582631003e-06, + "loss": 0.3326, + "step": 2889 + }, + { + "epoch": 2.858555885262117, + "grad_norm": 0.21752596509030658, + "learning_rate": 2.6200073286918288e-06, + "loss": 0.3731, + "step": 2890 + }, + { + "epoch": 2.8595450049455984, + "grad_norm": 0.2138648178065751, + "learning_rate": 2.6016855991205573e-06, + "loss": 0.3556, + "step": 2891 + }, + { + "epoch": 2.86053412462908, + "grad_norm": 0.21421289059170068, + "learning_rate": 2.5833638695492854e-06, + "loss": 0.3449, + "step": 2892 + }, + { + "epoch": 2.8615232443125618, + "grad_norm": 0.22802713240073696, + "learning_rate": 2.565042139978014e-06, + "loss": 0.339, + "step": 2893 + }, + { + "epoch": 2.8625123639960437, + "grad_norm": 0.20891785110011957, + "learning_rate": 2.5467204104067425e-06, + "loss": 0.3272, + "step": 2894 + }, + { + "epoch": 2.863501483679525, + "grad_norm": 0.2154505874086274, + "learning_rate": 2.528398680835471e-06, + "loss": 0.3493, + "step": 2895 + }, + { + "epoch": 2.8644906033630066, + "grad_norm": 0.21116137690722792, + "learning_rate": 2.5100769512641996e-06, + "loss": 0.3261, + "step": 2896 + }, + { + "epoch": 2.8654797230464886, + "grad_norm": 0.20872464139224442, + "learning_rate": 2.491755221692928e-06, + "loss": 0.3091, + "step": 2897 + }, + { + "epoch": 2.8664688427299705, + "grad_norm": 0.22172791728030142, + "learning_rate": 2.4734334921216562e-06, + "loss": 0.3674, + "step": 2898 + }, + { + "epoch": 2.867457962413452, + "grad_norm": 0.22760995786242147, + "learning_rate": 2.4551117625503848e-06, + "loss": 0.3728, + "step": 2899 + }, + { + "epoch": 2.868447082096934, + "grad_norm": 0.22014957904362617, + "learning_rate": 2.4367900329791133e-06, + "loss": 0.3356, + "step": 2900 + }, + { + "epoch": 2.8694362017804154, + "grad_norm": 0.21304917465749867, + "learning_rate": 2.418468303407842e-06, + "loss": 0.3579, + "step": 2901 + }, + { + "epoch": 2.8704253214638973, + "grad_norm": 0.19788363376546905, + "learning_rate": 2.4001465738365704e-06, + "loss": 0.3202, + "step": 2902 + }, + { + "epoch": 2.8714144411473788, + "grad_norm": 0.21988418201730414, + "learning_rate": 2.381824844265299e-06, + "loss": 0.3181, + "step": 2903 + }, + { + "epoch": 2.8724035608308607, + "grad_norm": 0.20738796103864413, + "learning_rate": 2.363503114694027e-06, + "loss": 0.3477, + "step": 2904 + }, + { + "epoch": 2.873392680514342, + "grad_norm": 0.21762364979569407, + "learning_rate": 2.3451813851227555e-06, + "loss": 0.3439, + "step": 2905 + }, + { + "epoch": 2.874381800197824, + "grad_norm": 0.2143578729116185, + "learning_rate": 2.326859655551484e-06, + "loss": 0.3348, + "step": 2906 + }, + { + "epoch": 2.8753709198813056, + "grad_norm": 0.232885047628153, + "learning_rate": 2.3085379259802126e-06, + "loss": 0.3748, + "step": 2907 + }, + { + "epoch": 2.8763600395647875, + "grad_norm": 0.21077048066014784, + "learning_rate": 2.290216196408941e-06, + "loss": 0.3429, + "step": 2908 + }, + { + "epoch": 2.877349159248269, + "grad_norm": 0.21001095217110033, + "learning_rate": 2.2718944668376697e-06, + "loss": 0.3032, + "step": 2909 + }, + { + "epoch": 2.878338278931751, + "grad_norm": 0.2166268520185779, + "learning_rate": 2.253572737266398e-06, + "loss": 0.3875, + "step": 2910 + }, + { + "epoch": 2.8793273986152323, + "grad_norm": 0.21898528679644236, + "learning_rate": 2.2352510076951263e-06, + "loss": 0.3256, + "step": 2911 + }, + { + "epoch": 2.8803165182987143, + "grad_norm": 0.22735390510825595, + "learning_rate": 2.216929278123855e-06, + "loss": 0.4083, + "step": 2912 + }, + { + "epoch": 2.8813056379821957, + "grad_norm": 0.23062531916202664, + "learning_rate": 2.1986075485525834e-06, + "loss": 0.372, + "step": 2913 + }, + { + "epoch": 2.8822947576656777, + "grad_norm": 0.21477846290717173, + "learning_rate": 2.180285818981312e-06, + "loss": 0.3271, + "step": 2914 + }, + { + "epoch": 2.883283877349159, + "grad_norm": 0.2117459683443643, + "learning_rate": 2.1619640894100405e-06, + "loss": 0.3577, + "step": 2915 + }, + { + "epoch": 2.884272997032641, + "grad_norm": 0.21937476541837353, + "learning_rate": 2.1436423598387686e-06, + "loss": 0.3423, + "step": 2916 + }, + { + "epoch": 2.8852621167161225, + "grad_norm": 0.1976009681830841, + "learning_rate": 2.125320630267497e-06, + "loss": 0.3253, + "step": 2917 + }, + { + "epoch": 2.8862512363996045, + "grad_norm": 0.21302952806939138, + "learning_rate": 2.106998900696226e-06, + "loss": 0.3305, + "step": 2918 + }, + { + "epoch": 2.887240356083086, + "grad_norm": 0.206815293905744, + "learning_rate": 2.088677171124954e-06, + "loss": 0.3496, + "step": 2919 + }, + { + "epoch": 2.888229475766568, + "grad_norm": 0.1995894143318209, + "learning_rate": 2.0703554415536827e-06, + "loss": 0.3112, + "step": 2920 + }, + { + "epoch": 2.8892185954500493, + "grad_norm": 0.19955543207743653, + "learning_rate": 2.0520337119824113e-06, + "loss": 0.3113, + "step": 2921 + }, + { + "epoch": 2.8902077151335313, + "grad_norm": 0.2158562754144166, + "learning_rate": 2.0337119824111394e-06, + "loss": 0.3698, + "step": 2922 + }, + { + "epoch": 2.8911968348170127, + "grad_norm": 0.20864063973766814, + "learning_rate": 2.0153902528398683e-06, + "loss": 0.3441, + "step": 2923 + }, + { + "epoch": 2.8921859545004946, + "grad_norm": 0.2119247973375731, + "learning_rate": 1.997068523268597e-06, + "loss": 0.3437, + "step": 2924 + }, + { + "epoch": 2.893175074183976, + "grad_norm": 0.2232073700071952, + "learning_rate": 1.978746793697325e-06, + "loss": 0.3929, + "step": 2925 + }, + { + "epoch": 2.894164193867458, + "grad_norm": 0.20286778897421978, + "learning_rate": 1.9604250641260535e-06, + "loss": 0.3237, + "step": 2926 + }, + { + "epoch": 2.8951533135509395, + "grad_norm": 0.2092640542037701, + "learning_rate": 1.942103334554782e-06, + "loss": 0.2943, + "step": 2927 + }, + { + "epoch": 2.8961424332344214, + "grad_norm": 0.2088218793092618, + "learning_rate": 1.9237816049835106e-06, + "loss": 0.3332, + "step": 2928 + }, + { + "epoch": 2.897131552917903, + "grad_norm": 0.20119421841154572, + "learning_rate": 1.905459875412239e-06, + "loss": 0.319, + "step": 2929 + }, + { + "epoch": 2.898120672601385, + "grad_norm": 0.20219374342024857, + "learning_rate": 1.8871381458409674e-06, + "loss": 0.3187, + "step": 2930 + }, + { + "epoch": 2.8991097922848663, + "grad_norm": 0.19777287722260406, + "learning_rate": 1.8688164162696962e-06, + "loss": 0.3202, + "step": 2931 + }, + { + "epoch": 2.9000989119683482, + "grad_norm": 0.20427420158126633, + "learning_rate": 1.8504946866984243e-06, + "loss": 0.322, + "step": 2932 + }, + { + "epoch": 2.9010880316518297, + "grad_norm": 0.21551527284059774, + "learning_rate": 1.8321729571271528e-06, + "loss": 0.3069, + "step": 2933 + }, + { + "epoch": 2.9020771513353116, + "grad_norm": 0.199753848958888, + "learning_rate": 1.8138512275558816e-06, + "loss": 0.291, + "step": 2934 + }, + { + "epoch": 2.903066271018793, + "grad_norm": 0.2313478126214845, + "learning_rate": 1.7955294979846097e-06, + "loss": 0.368, + "step": 2935 + }, + { + "epoch": 2.904055390702275, + "grad_norm": 0.20699383705522323, + "learning_rate": 1.7772077684133384e-06, + "loss": 0.3402, + "step": 2936 + }, + { + "epoch": 2.905044510385757, + "grad_norm": 0.2187505761089255, + "learning_rate": 1.758886038842067e-06, + "loss": 0.3623, + "step": 2937 + }, + { + "epoch": 2.9060336300692384, + "grad_norm": 0.19345606798141826, + "learning_rate": 1.740564309270795e-06, + "loss": 0.3104, + "step": 2938 + }, + { + "epoch": 2.90702274975272, + "grad_norm": 0.21864657860270537, + "learning_rate": 1.7222425796995238e-06, + "loss": 0.3415, + "step": 2939 + }, + { + "epoch": 2.908011869436202, + "grad_norm": 0.19683732715540603, + "learning_rate": 1.7039208501282524e-06, + "loss": 0.2874, + "step": 2940 + }, + { + "epoch": 2.9090009891196837, + "grad_norm": 0.21266519134677597, + "learning_rate": 1.6855991205569805e-06, + "loss": 0.3586, + "step": 2941 + }, + { + "epoch": 2.909990108803165, + "grad_norm": 0.21398924276198206, + "learning_rate": 1.6672773909857092e-06, + "loss": 0.3229, + "step": 2942 + }, + { + "epoch": 2.9109792284866467, + "grad_norm": 0.22495179099039847, + "learning_rate": 1.6489556614144378e-06, + "loss": 0.3834, + "step": 2943 + }, + { + "epoch": 2.9119683481701286, + "grad_norm": 0.2024056953522565, + "learning_rate": 1.630633931843166e-06, + "loss": 0.3348, + "step": 2944 + }, + { + "epoch": 2.9129574678536105, + "grad_norm": 0.21753481027936725, + "learning_rate": 1.6123122022718946e-06, + "loss": 0.3871, + "step": 2945 + }, + { + "epoch": 2.913946587537092, + "grad_norm": 0.23414035730320035, + "learning_rate": 1.5939904727006232e-06, + "loss": 0.403, + "step": 2946 + }, + { + "epoch": 2.9149357072205735, + "grad_norm": 0.20610158192122174, + "learning_rate": 1.5756687431293515e-06, + "loss": 0.3576, + "step": 2947 + }, + { + "epoch": 2.9159248269040554, + "grad_norm": 0.2264080372941922, + "learning_rate": 1.55734701355808e-06, + "loss": 0.3619, + "step": 2948 + }, + { + "epoch": 2.9169139465875373, + "grad_norm": 0.20401730729882686, + "learning_rate": 1.5390252839868083e-06, + "loss": 0.3416, + "step": 2949 + }, + { + "epoch": 2.917903066271019, + "grad_norm": 0.20630091994141, + "learning_rate": 1.5207035544155369e-06, + "loss": 0.3122, + "step": 2950 + }, + { + "epoch": 2.9188921859545003, + "grad_norm": 0.2284728589041083, + "learning_rate": 1.5023818248442654e-06, + "loss": 0.3559, + "step": 2951 + }, + { + "epoch": 2.919881305637982, + "grad_norm": 0.21523792128505556, + "learning_rate": 1.4840600952729937e-06, + "loss": 0.3322, + "step": 2952 + }, + { + "epoch": 2.920870425321464, + "grad_norm": 0.21968342654749495, + "learning_rate": 1.4657383657017225e-06, + "loss": 0.3689, + "step": 2953 + }, + { + "epoch": 2.9218595450049456, + "grad_norm": 0.22690637293425686, + "learning_rate": 1.4474166361304508e-06, + "loss": 0.3335, + "step": 2954 + }, + { + "epoch": 2.922848664688427, + "grad_norm": 0.23072015845383623, + "learning_rate": 1.4290949065591791e-06, + "loss": 0.383, + "step": 2955 + }, + { + "epoch": 2.923837784371909, + "grad_norm": 0.2227436812072427, + "learning_rate": 1.4107731769879079e-06, + "loss": 0.3514, + "step": 2956 + }, + { + "epoch": 2.924826904055391, + "grad_norm": 0.23110200374260195, + "learning_rate": 1.3924514474166362e-06, + "loss": 0.4092, + "step": 2957 + }, + { + "epoch": 2.9258160237388724, + "grad_norm": 0.20321476984964387, + "learning_rate": 1.3741297178453645e-06, + "loss": 0.3342, + "step": 2958 + }, + { + "epoch": 2.926805143422354, + "grad_norm": 0.18821044499165784, + "learning_rate": 1.3558079882740933e-06, + "loss": 0.2944, + "step": 2959 + }, + { + "epoch": 2.927794263105836, + "grad_norm": 0.2133703549279981, + "learning_rate": 1.3374862587028216e-06, + "loss": 0.3528, + "step": 2960 + }, + { + "epoch": 2.9287833827893177, + "grad_norm": 0.19949471309765202, + "learning_rate": 1.3191645291315501e-06, + "loss": 0.3012, + "step": 2961 + }, + { + "epoch": 2.929772502472799, + "grad_norm": 0.20980980030921756, + "learning_rate": 1.3008427995602787e-06, + "loss": 0.3459, + "step": 2962 + }, + { + "epoch": 2.9307616221562807, + "grad_norm": 0.20593464559520894, + "learning_rate": 1.282521069989007e-06, + "loss": 0.3328, + "step": 2963 + }, + { + "epoch": 2.9317507418397626, + "grad_norm": 0.20716599210238562, + "learning_rate": 1.2641993404177355e-06, + "loss": 0.3473, + "step": 2964 + }, + { + "epoch": 2.9327398615232445, + "grad_norm": 0.22715584225040256, + "learning_rate": 1.245877610846464e-06, + "loss": 0.3487, + "step": 2965 + }, + { + "epoch": 2.933728981206726, + "grad_norm": 0.19923436499191036, + "learning_rate": 1.2275558812751924e-06, + "loss": 0.3245, + "step": 2966 + }, + { + "epoch": 2.9347181008902075, + "grad_norm": 0.21117769437499914, + "learning_rate": 1.209234151703921e-06, + "loss": 0.3417, + "step": 2967 + }, + { + "epoch": 2.9357072205736894, + "grad_norm": 0.2198013877108107, + "learning_rate": 1.1909124221326494e-06, + "loss": 0.3786, + "step": 2968 + }, + { + "epoch": 2.9366963402571713, + "grad_norm": 0.20015001058595291, + "learning_rate": 1.1725906925613778e-06, + "loss": 0.3256, + "step": 2969 + }, + { + "epoch": 2.9376854599406528, + "grad_norm": 0.21276982029063424, + "learning_rate": 1.1542689629901063e-06, + "loss": 0.3417, + "step": 2970 + }, + { + "epoch": 2.9386745796241343, + "grad_norm": 0.22527713295979443, + "learning_rate": 1.1359472334188348e-06, + "loss": 0.3625, + "step": 2971 + }, + { + "epoch": 2.939663699307616, + "grad_norm": 0.201432544398078, + "learning_rate": 1.1176255038475632e-06, + "loss": 0.3073, + "step": 2972 + }, + { + "epoch": 2.940652818991098, + "grad_norm": 7.943101719255052, + "learning_rate": 1.0993037742762917e-06, + "loss": 0.8247, + "step": 2973 + }, + { + "epoch": 2.9416419386745796, + "grad_norm": 0.21340135227018642, + "learning_rate": 1.0809820447050202e-06, + "loss": 0.3784, + "step": 2974 + }, + { + "epoch": 2.9426310583580615, + "grad_norm": 0.2113735158072839, + "learning_rate": 1.0626603151337486e-06, + "loss": 0.3574, + "step": 2975 + }, + { + "epoch": 2.943620178041543, + "grad_norm": 0.19609645027272843, + "learning_rate": 1.044338585562477e-06, + "loss": 0.3304, + "step": 2976 + }, + { + "epoch": 2.944609297725025, + "grad_norm": 0.21099389986750994, + "learning_rate": 1.0260168559912056e-06, + "loss": 0.346, + "step": 2977 + }, + { + "epoch": 2.9455984174085064, + "grad_norm": 0.21796864077492323, + "learning_rate": 1.0076951264199342e-06, + "loss": 0.3519, + "step": 2978 + }, + { + "epoch": 2.9465875370919883, + "grad_norm": 0.2382189042915159, + "learning_rate": 9.893733968486625e-07, + "loss": 0.3673, + "step": 2979 + }, + { + "epoch": 2.9475766567754698, + "grad_norm": 0.20419821656474832, + "learning_rate": 9.71051667277391e-07, + "loss": 0.3464, + "step": 2980 + }, + { + "epoch": 2.9485657764589517, + "grad_norm": 0.22395363781798322, + "learning_rate": 9.527299377061195e-07, + "loss": 0.3279, + "step": 2981 + }, + { + "epoch": 2.949554896142433, + "grad_norm": 0.20253721100448752, + "learning_rate": 9.344082081348481e-07, + "loss": 0.3368, + "step": 2982 + }, + { + "epoch": 2.950544015825915, + "grad_norm": 0.2022713938179965, + "learning_rate": 9.160864785635764e-07, + "loss": 0.3311, + "step": 2983 + }, + { + "epoch": 2.9515331355093966, + "grad_norm": 0.20595080269509453, + "learning_rate": 8.977647489923048e-07, + "loss": 0.352, + "step": 2984 + }, + { + "epoch": 2.9525222551928785, + "grad_norm": 0.1987103967446778, + "learning_rate": 8.794430194210335e-07, + "loss": 0.3465, + "step": 2985 + }, + { + "epoch": 2.95351137487636, + "grad_norm": 0.2057786134643462, + "learning_rate": 8.611212898497619e-07, + "loss": 0.3466, + "step": 2986 + }, + { + "epoch": 2.954500494559842, + "grad_norm": 0.21967452126576467, + "learning_rate": 8.427995602784902e-07, + "loss": 0.3539, + "step": 2987 + }, + { + "epoch": 2.9554896142433233, + "grad_norm": 0.22985902892440865, + "learning_rate": 8.244778307072189e-07, + "loss": 0.3911, + "step": 2988 + }, + { + "epoch": 2.9564787339268053, + "grad_norm": 0.2201316492538097, + "learning_rate": 8.061561011359473e-07, + "loss": 0.3493, + "step": 2989 + }, + { + "epoch": 2.9574678536102867, + "grad_norm": 0.19954226377424544, + "learning_rate": 7.878343715646757e-07, + "loss": 0.3463, + "step": 2990 + }, + { + "epoch": 2.9584569732937687, + "grad_norm": 0.20222148981802693, + "learning_rate": 7.695126419934042e-07, + "loss": 0.3135, + "step": 2991 + }, + { + "epoch": 2.95944609297725, + "grad_norm": 0.21871651088863459, + "learning_rate": 7.511909124221327e-07, + "loss": 0.3676, + "step": 2992 + }, + { + "epoch": 2.960435212660732, + "grad_norm": 0.21927670774593283, + "learning_rate": 7.328691828508612e-07, + "loss": 0.3351, + "step": 2993 + }, + { + "epoch": 2.9614243323442135, + "grad_norm": 0.208780563559218, + "learning_rate": 7.145474532795896e-07, + "loss": 0.3447, + "step": 2994 + }, + { + "epoch": 2.9624134520276955, + "grad_norm": 0.2126920301550537, + "learning_rate": 6.962257237083181e-07, + "loss": 0.3608, + "step": 2995 + }, + { + "epoch": 2.963402571711177, + "grad_norm": 0.20548958069933887, + "learning_rate": 6.779039941370466e-07, + "loss": 0.3304, + "step": 2996 + }, + { + "epoch": 2.964391691394659, + "grad_norm": 0.2036679215154334, + "learning_rate": 6.595822645657751e-07, + "loss": 0.3463, + "step": 2997 + }, + { + "epoch": 2.9653808110781403, + "grad_norm": 0.21172321613940526, + "learning_rate": 6.412605349945035e-07, + "loss": 0.3599, + "step": 2998 + }, + { + "epoch": 2.9663699307616223, + "grad_norm": 0.21160961055287664, + "learning_rate": 6.22938805423232e-07, + "loss": 0.3891, + "step": 2999 + }, + { + "epoch": 2.9673590504451037, + "grad_norm": 0.21327213001738943, + "learning_rate": 6.046170758519605e-07, + "loss": 0.3691, + "step": 3000 + }, + { + "epoch": 2.9683481701285857, + "grad_norm": 0.21547058365602503, + "learning_rate": 5.862953462806889e-07, + "loss": 0.3385, + "step": 3001 + }, + { + "epoch": 2.969337289812067, + "grad_norm": 0.20224924064139008, + "learning_rate": 5.679736167094174e-07, + "loss": 0.3285, + "step": 3002 + }, + { + "epoch": 2.970326409495549, + "grad_norm": 0.219988718165209, + "learning_rate": 5.496518871381459e-07, + "loss": 0.3431, + "step": 3003 + }, + { + "epoch": 2.9713155291790305, + "grad_norm": 0.22283985943183246, + "learning_rate": 5.313301575668743e-07, + "loss": 0.3386, + "step": 3004 + }, + { + "epoch": 2.9723046488625124, + "grad_norm": 0.20042527762749443, + "learning_rate": 5.130084279956028e-07, + "loss": 0.3339, + "step": 3005 + }, + { + "epoch": 2.973293768545994, + "grad_norm": 0.9140786441671352, + "learning_rate": 4.946866984243312e-07, + "loss": 0.3563, + "step": 3006 + }, + { + "epoch": 2.974282888229476, + "grad_norm": 0.20417408391682282, + "learning_rate": 4.763649688530597e-07, + "loss": 0.3401, + "step": 3007 + }, + { + "epoch": 2.9752720079129573, + "grad_norm": 0.20602866956947444, + "learning_rate": 4.580432392817882e-07, + "loss": 0.3247, + "step": 3008 + }, + { + "epoch": 2.9762611275964392, + "grad_norm": 0.21276857507850871, + "learning_rate": 4.3972150971051674e-07, + "loss": 0.3594, + "step": 3009 + }, + { + "epoch": 2.9772502472799207, + "grad_norm": 0.19943196340758107, + "learning_rate": 4.213997801392451e-07, + "loss": 0.3296, + "step": 3010 + }, + { + "epoch": 2.9782393669634026, + "grad_norm": 0.21268206131905573, + "learning_rate": 4.0307805056797366e-07, + "loss": 0.3423, + "step": 3011 + }, + { + "epoch": 2.979228486646884, + "grad_norm": 0.2057146467485511, + "learning_rate": 3.847563209967021e-07, + "loss": 0.3186, + "step": 3012 + }, + { + "epoch": 2.980217606330366, + "grad_norm": 0.20744284251947426, + "learning_rate": 3.664345914254306e-07, + "loss": 0.3208, + "step": 3013 + }, + { + "epoch": 2.9812067260138475, + "grad_norm": 0.20532924838610409, + "learning_rate": 3.4811286185415905e-07, + "loss": 0.3296, + "step": 3014 + }, + { + "epoch": 2.9821958456973294, + "grad_norm": 0.2033415582326357, + "learning_rate": 3.2979113228288753e-07, + "loss": 0.3257, + "step": 3015 + }, + { + "epoch": 2.9831849653808113, + "grad_norm": 0.20754506694784877, + "learning_rate": 3.11469402711616e-07, + "loss": 0.3296, + "step": 3016 + }, + { + "epoch": 2.984174085064293, + "grad_norm": 0.20629067666131826, + "learning_rate": 2.9314767314034444e-07, + "loss": 0.3189, + "step": 3017 + }, + { + "epoch": 2.9851632047477743, + "grad_norm": 0.22938650262981103, + "learning_rate": 2.748259435690729e-07, + "loss": 0.3406, + "step": 3018 + }, + { + "epoch": 2.986152324431256, + "grad_norm": 0.20636895308921352, + "learning_rate": 2.565042139978014e-07, + "loss": 0.3344, + "step": 3019 + }, + { + "epoch": 2.987141444114738, + "grad_norm": 0.20357483574027163, + "learning_rate": 2.3818248442652986e-07, + "loss": 0.3484, + "step": 3020 + }, + { + "epoch": 2.9881305637982196, + "grad_norm": 0.214430972642966, + "learning_rate": 2.1986075485525837e-07, + "loss": 0.3679, + "step": 3021 + }, + { + "epoch": 2.989119683481701, + "grad_norm": 0.19589997688875407, + "learning_rate": 2.0153902528398683e-07, + "loss": 0.3265, + "step": 3022 + }, + { + "epoch": 2.990108803165183, + "grad_norm": 0.23345971886153838, + "learning_rate": 1.832172957127153e-07, + "loss": 0.3985, + "step": 3023 + }, + { + "epoch": 2.991097922848665, + "grad_norm": 0.1981503597152391, + "learning_rate": 1.6489556614144377e-07, + "loss": 0.3099, + "step": 3024 + }, + { + "epoch": 2.9920870425321464, + "grad_norm": 0.2145800019812682, + "learning_rate": 1.4657383657017222e-07, + "loss": 0.3628, + "step": 3025 + }, + { + "epoch": 2.993076162215628, + "grad_norm": 0.1993615760861862, + "learning_rate": 1.282521069989007e-07, + "loss": 0.3104, + "step": 3026 + }, + { + "epoch": 2.99406528189911, + "grad_norm": 0.22261786031016298, + "learning_rate": 1.0993037742762919e-07, + "loss": 0.373, + "step": 3027 + }, + { + "epoch": 2.9950544015825917, + "grad_norm": 0.2022644187391328, + "learning_rate": 9.160864785635765e-08, + "loss": 0.3516, + "step": 3028 + }, + { + "epoch": 2.996043521266073, + "grad_norm": 0.19548068811302774, + "learning_rate": 7.328691828508611e-08, + "loss": 0.3303, + "step": 3029 + }, + { + "epoch": 2.9970326409495547, + "grad_norm": 0.202037553129905, + "learning_rate": 5.496518871381459e-08, + "loss": 0.3579, + "step": 3030 + }, + { + "epoch": 2.9980217606330366, + "grad_norm": 0.19575234484936066, + "learning_rate": 3.6643459142543055e-08, + "loss": 0.3171, + "step": 3031 + }, + { + "epoch": 2.9990108803165185, + "grad_norm": 0.21172137281490067, + "learning_rate": 1.8321729571271528e-08, + "loss": 0.3511, + "step": 3032 + }, + { + "epoch": 3.0, + "grad_norm": 0.20754262490189151, + "learning_rate": 0.0, + "loss": 0.3455, + "step": 3033 + }, + { + "epoch": 3.0, + "step": 3033, + "total_flos": 2.5830390820744724e+18, + "train_loss": 0.5209268033307966, + "train_runtime": 175746.8725, + "train_samples_per_second": 0.276, + "train_steps_per_second": 0.017 + } + ], + "logging_steps": 1, + "max_steps": 3033, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.5830390820744724e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}