diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16213 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.99675535366645, + "eval_steps": 500, + "global_step": 2310, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012978585334198572, + "grad_norm": 51.281074849235615, + "learning_rate": 0.0, + "loss": 11.2228, + "step": 1 + }, + { + "epoch": 0.0025957170668397143, + "grad_norm": 52.288477767034706, + "learning_rate": 2.1645021645021646e-07, + "loss": 11.2142, + "step": 2 + }, + { + "epoch": 0.003893575600259572, + "grad_norm": 51.690439395204805, + "learning_rate": 4.329004329004329e-07, + "loss": 11.2982, + "step": 3 + }, + { + "epoch": 0.005191434133679429, + "grad_norm": 52.70272970065567, + "learning_rate": 6.493506493506494e-07, + "loss": 11.2202, + "step": 4 + }, + { + "epoch": 0.006489292667099286, + "grad_norm": 52.19249689221791, + "learning_rate": 8.658008658008658e-07, + "loss": 11.223, + "step": 5 + }, + { + "epoch": 0.007787151200519144, + "grad_norm": 52.51499888342824, + "learning_rate": 1.0822510822510822e-06, + "loss": 11.1764, + "step": 6 + }, + { + "epoch": 0.009085009733939001, + "grad_norm": 54.65313049626493, + "learning_rate": 1.2987012987012988e-06, + "loss": 11.1836, + "step": 7 + }, + { + "epoch": 0.010382868267358857, + "grad_norm": 56.26332260448297, + "learning_rate": 1.5151515151515152e-06, + "loss": 10.9434, + "step": 8 + }, + { + "epoch": 0.011680726800778715, + "grad_norm": 56.526613959934075, + "learning_rate": 1.7316017316017317e-06, + "loss": 10.8116, + "step": 9 + }, + { + "epoch": 0.012978585334198572, + "grad_norm": 73.22487510820876, + "learning_rate": 1.948051948051948e-06, + "loss": 10.0099, + "step": 10 + }, + { + "epoch": 0.01427644386761843, + "grad_norm": 78.76237638740871, + "learning_rate": 2.1645021645021643e-06, + "loss": 9.7022, + "step": 11 + }, + { + "epoch": 0.015574302401038288, + "grad_norm": 84.53260376952899, + "learning_rate": 2.3809523809523808e-06, + "loss": 9.2879, + "step": 12 + }, + { + "epoch": 0.016872160934458143, + "grad_norm": 93.89336284136559, + "learning_rate": 2.5974025974025976e-06, + "loss": 9.1068, + "step": 13 + }, + { + "epoch": 0.018170019467878003, + "grad_norm": 67.50935080000205, + "learning_rate": 2.813852813852814e-06, + "loss": 4.1366, + "step": 14 + }, + { + "epoch": 0.01946787800129786, + "grad_norm": 60.451555462271216, + "learning_rate": 3.0303030303030305e-06, + "loss": 3.7321, + "step": 15 + }, + { + "epoch": 0.020765736534717714, + "grad_norm": 48.941978109448335, + "learning_rate": 3.2467532467532465e-06, + "loss": 3.3276, + "step": 16 + }, + { + "epoch": 0.022063595068137574, + "grad_norm": 41.193189217962875, + "learning_rate": 3.4632034632034634e-06, + "loss": 3.0135, + "step": 17 + }, + { + "epoch": 0.02336145360155743, + "grad_norm": 19.806882723512313, + "learning_rate": 3.67965367965368e-06, + "loss": 2.1732, + "step": 18 + }, + { + "epoch": 0.02465931213497729, + "grad_norm": 6.601644234547365, + "learning_rate": 3.896103896103896e-06, + "loss": 1.5589, + "step": 19 + }, + { + "epoch": 0.025957170668397145, + "grad_norm": 5.420878318524882, + "learning_rate": 4.112554112554113e-06, + "loss": 1.4694, + "step": 20 + }, + { + "epoch": 0.027255029201817, + "grad_norm": 4.513535899746422, + "learning_rate": 4.329004329004329e-06, + "loss": 1.4127, + "step": 21 + }, + { + "epoch": 0.02855288773523686, + "grad_norm": 3.4399120070700926, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.308, + "step": 22 + }, + { + "epoch": 0.029850746268656716, + "grad_norm": 2.737903832715275, + "learning_rate": 4.7619047619047615e-06, + "loss": 1.2391, + "step": 23 + }, + { + "epoch": 0.031148604802076575, + "grad_norm": 2.229792362203531, + "learning_rate": 4.978354978354978e-06, + "loss": 1.207, + "step": 24 + }, + { + "epoch": 0.03244646333549643, + "grad_norm": 1.67506952925642, + "learning_rate": 5.194805194805195e-06, + "loss": 1.164, + "step": 25 + }, + { + "epoch": 0.03374432186891629, + "grad_norm": 7.810491533013112, + "learning_rate": 5.411255411255411e-06, + "loss": 1.0519, + "step": 26 + }, + { + "epoch": 0.03504218040233614, + "grad_norm": 1.8418087052814074, + "learning_rate": 5.627705627705628e-06, + "loss": 1.0332, + "step": 27 + }, + { + "epoch": 0.036340038935756006, + "grad_norm": 1.6946651958442733, + "learning_rate": 5.844155844155844e-06, + "loss": 1.0257, + "step": 28 + }, + { + "epoch": 0.03763789746917586, + "grad_norm": 1.0837092117736122, + "learning_rate": 6.060606060606061e-06, + "loss": 0.9995, + "step": 29 + }, + { + "epoch": 0.03893575600259572, + "grad_norm": 1.022247931246769, + "learning_rate": 6.277056277056277e-06, + "loss": 0.9658, + "step": 30 + }, + { + "epoch": 0.04023361453601557, + "grad_norm": 1.0439465825515253, + "learning_rate": 6.493506493506493e-06, + "loss": 0.9258, + "step": 31 + }, + { + "epoch": 0.04153147306943543, + "grad_norm": 0.8476038489892367, + "learning_rate": 6.709956709956711e-06, + "loss": 0.9183, + "step": 32 + }, + { + "epoch": 0.04282933160285529, + "grad_norm": 0.7260505313959857, + "learning_rate": 6.926406926406927e-06, + "loss": 0.8859, + "step": 33 + }, + { + "epoch": 0.04412719013627515, + "grad_norm": 0.9334322529996619, + "learning_rate": 7.142857142857143e-06, + "loss": 0.8775, + "step": 34 + }, + { + "epoch": 0.045425048669695, + "grad_norm": 0.7507371805560344, + "learning_rate": 7.35930735930736e-06, + "loss": 0.8506, + "step": 35 + }, + { + "epoch": 0.04672290720311486, + "grad_norm": 0.7037392218293158, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.8373, + "step": 36 + }, + { + "epoch": 0.048020765736534715, + "grad_norm": 0.7026115757535957, + "learning_rate": 7.792207792207792e-06, + "loss": 0.8148, + "step": 37 + }, + { + "epoch": 0.04931862426995458, + "grad_norm": 0.6329073089403997, + "learning_rate": 8.008658008658008e-06, + "loss": 0.7989, + "step": 38 + }, + { + "epoch": 0.050616482803374434, + "grad_norm": 0.576557174722755, + "learning_rate": 8.225108225108225e-06, + "loss": 0.801, + "step": 39 + }, + { + "epoch": 0.05191434133679429, + "grad_norm": 0.633584321792007, + "learning_rate": 8.441558441558442e-06, + "loss": 0.8154, + "step": 40 + }, + { + "epoch": 0.053212199870214145, + "grad_norm": 0.6357768126157509, + "learning_rate": 8.658008658008657e-06, + "loss": 0.7985, + "step": 41 + }, + { + "epoch": 0.054510058403634, + "grad_norm": 0.4606140950872704, + "learning_rate": 8.874458874458876e-06, + "loss": 0.7875, + "step": 42 + }, + { + "epoch": 0.055807916937053864, + "grad_norm": 0.42579840291728105, + "learning_rate": 9.090909090909091e-06, + "loss": 0.7882, + "step": 43 + }, + { + "epoch": 0.05710577547047372, + "grad_norm": 0.5127047756782175, + "learning_rate": 9.307359307359308e-06, + "loss": 0.7668, + "step": 44 + }, + { + "epoch": 0.058403634003893576, + "grad_norm": 0.5275747680292829, + "learning_rate": 9.523809523809523e-06, + "loss": 0.7556, + "step": 45 + }, + { + "epoch": 0.05970149253731343, + "grad_norm": 0.4422307893352111, + "learning_rate": 9.740259740259742e-06, + "loss": 0.7469, + "step": 46 + }, + { + "epoch": 0.06099935107073329, + "grad_norm": 0.3950972183567316, + "learning_rate": 9.956709956709957e-06, + "loss": 0.7257, + "step": 47 + }, + { + "epoch": 0.06229720960415315, + "grad_norm": 0.4294144227066294, + "learning_rate": 1.0173160173160174e-05, + "loss": 0.7082, + "step": 48 + }, + { + "epoch": 0.063595068137573, + "grad_norm": 0.4261355016492852, + "learning_rate": 1.038961038961039e-05, + "loss": 0.7202, + "step": 49 + }, + { + "epoch": 0.06489292667099286, + "grad_norm": 0.40006881327817506, + "learning_rate": 1.0606060606060607e-05, + "loss": 0.7006, + "step": 50 + }, + { + "epoch": 0.06619078520441272, + "grad_norm": 0.3484479390008924, + "learning_rate": 1.0822510822510823e-05, + "loss": 0.7141, + "step": 51 + }, + { + "epoch": 0.06748864373783257, + "grad_norm": 0.3508582969164509, + "learning_rate": 1.103896103896104e-05, + "loss": 0.7203, + "step": 52 + }, + { + "epoch": 0.06878650227125244, + "grad_norm": 0.3754749738655716, + "learning_rate": 1.1255411255411256e-05, + "loss": 0.7465, + "step": 53 + }, + { + "epoch": 0.07008436080467229, + "grad_norm": 0.33275270814242425, + "learning_rate": 1.1471861471861473e-05, + "loss": 0.6844, + "step": 54 + }, + { + "epoch": 0.07138221933809215, + "grad_norm": 0.29711493887953333, + "learning_rate": 1.1688311688311688e-05, + "loss": 0.668, + "step": 55 + }, + { + "epoch": 0.07268007787151201, + "grad_norm": 0.3254569707924083, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.7234, + "step": 56 + }, + { + "epoch": 0.07397793640493186, + "grad_norm": 0.2925311216603525, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.7004, + "step": 57 + }, + { + "epoch": 0.07527579493835172, + "grad_norm": 0.2781203466736231, + "learning_rate": 1.2337662337662339e-05, + "loss": 0.6845, + "step": 58 + }, + { + "epoch": 0.07657365347177157, + "grad_norm": 0.27946888261667213, + "learning_rate": 1.2554112554112554e-05, + "loss": 0.6999, + "step": 59 + }, + { + "epoch": 0.07787151200519143, + "grad_norm": 0.2728571313678063, + "learning_rate": 1.2770562770562773e-05, + "loss": 0.6639, + "step": 60 + }, + { + "epoch": 0.0791693705386113, + "grad_norm": 0.3093993935391829, + "learning_rate": 1.2987012987012986e-05, + "loss": 0.701, + "step": 61 + }, + { + "epoch": 0.08046722907203115, + "grad_norm": 0.2852724472177098, + "learning_rate": 1.3203463203463205e-05, + "loss": 0.681, + "step": 62 + }, + { + "epoch": 0.08176508760545101, + "grad_norm": 0.2693071822601781, + "learning_rate": 1.3419913419913421e-05, + "loss": 0.6679, + "step": 63 + }, + { + "epoch": 0.08306294613887086, + "grad_norm": 0.2883803733655785, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.6871, + "step": 64 + }, + { + "epoch": 0.08436080467229072, + "grad_norm": 0.27168971444927753, + "learning_rate": 1.3852813852813853e-05, + "loss": 0.6478, + "step": 65 + }, + { + "epoch": 0.08565866320571058, + "grad_norm": 0.2780741659791045, + "learning_rate": 1.406926406926407e-05, + "loss": 0.6654, + "step": 66 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.2669958151004055, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.6494, + "step": 67 + }, + { + "epoch": 0.0882543802725503, + "grad_norm": 0.2645854701003351, + "learning_rate": 1.4502164502164502e-05, + "loss": 0.6343, + "step": 68 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 0.27977755521966374, + "learning_rate": 1.471861471861472e-05, + "loss": 0.6703, + "step": 69 + }, + { + "epoch": 0.09085009733939, + "grad_norm": 0.2701714280796314, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.6657, + "step": 70 + }, + { + "epoch": 0.09214795587280987, + "grad_norm": 0.3340236352400633, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.6654, + "step": 71 + }, + { + "epoch": 0.09344581440622972, + "grad_norm": 0.25125625871192836, + "learning_rate": 1.5367965367965366e-05, + "loss": 0.6829, + "step": 72 + }, + { + "epoch": 0.09474367293964958, + "grad_norm": 0.27623404854696354, + "learning_rate": 1.5584415584415583e-05, + "loss": 0.6864, + "step": 73 + }, + { + "epoch": 0.09604153147306943, + "grad_norm": 0.2855287411905892, + "learning_rate": 1.5800865800865803e-05, + "loss": 0.6669, + "step": 74 + }, + { + "epoch": 0.0973393900064893, + "grad_norm": 0.2544109696892319, + "learning_rate": 1.6017316017316017e-05, + "loss": 0.6441, + "step": 75 + }, + { + "epoch": 0.09863724853990916, + "grad_norm": 0.29021289781813303, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.6664, + "step": 76 + }, + { + "epoch": 0.099935107073329, + "grad_norm": 0.26812240880351107, + "learning_rate": 1.645021645021645e-05, + "loss": 0.6391, + "step": 77 + }, + { + "epoch": 0.10123296560674887, + "grad_norm": 0.27576904300300786, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.6355, + "step": 78 + }, + { + "epoch": 0.10253082414016872, + "grad_norm": 0.2814054287273717, + "learning_rate": 1.6883116883116884e-05, + "loss": 0.6265, + "step": 79 + }, + { + "epoch": 0.10382868267358858, + "grad_norm": 0.3103049215962741, + "learning_rate": 1.70995670995671e-05, + "loss": 0.6497, + "step": 80 + }, + { + "epoch": 0.10512654120700844, + "grad_norm": 0.2728333867240695, + "learning_rate": 1.7316017316017315e-05, + "loss": 0.628, + "step": 81 + }, + { + "epoch": 0.10642439974042829, + "grad_norm": 0.29691347602771223, + "learning_rate": 1.7532467532467535e-05, + "loss": 0.6481, + "step": 82 + }, + { + "epoch": 0.10772225827384815, + "grad_norm": 0.29273954514595735, + "learning_rate": 1.7748917748917752e-05, + "loss": 0.6261, + "step": 83 + }, + { + "epoch": 0.109020116807268, + "grad_norm": 0.3074962736781368, + "learning_rate": 1.7965367965367965e-05, + "loss": 0.6299, + "step": 84 + }, + { + "epoch": 0.11031797534068787, + "grad_norm": 0.29602233662175786, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.6335, + "step": 85 + }, + { + "epoch": 0.11161583387410773, + "grad_norm": 0.2830666870801868, + "learning_rate": 1.83982683982684e-05, + "loss": 0.641, + "step": 86 + }, + { + "epoch": 0.11291369240752758, + "grad_norm": 0.3125259689124729, + "learning_rate": 1.8614718614718616e-05, + "loss": 0.6388, + "step": 87 + }, + { + "epoch": 0.11421155094094744, + "grad_norm": 0.26645549323423784, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.6208, + "step": 88 + }, + { + "epoch": 0.11550940947436729, + "grad_norm": 0.28954783071217016, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.6273, + "step": 89 + }, + { + "epoch": 0.11680726800778715, + "grad_norm": 0.28149679569001645, + "learning_rate": 1.9264069264069266e-05, + "loss": 0.6028, + "step": 90 + }, + { + "epoch": 0.11810512654120701, + "grad_norm": 0.2906262772721881, + "learning_rate": 1.9480519480519483e-05, + "loss": 0.6245, + "step": 91 + }, + { + "epoch": 0.11940298507462686, + "grad_norm": 0.2711185379042277, + "learning_rate": 1.9696969696969697e-05, + "loss": 0.6267, + "step": 92 + }, + { + "epoch": 0.12070084360804673, + "grad_norm": 0.3052664513793837, + "learning_rate": 1.9913419913419914e-05, + "loss": 0.6346, + "step": 93 + }, + { + "epoch": 0.12199870214146658, + "grad_norm": 0.29479074178005676, + "learning_rate": 2.012987012987013e-05, + "loss": 0.6255, + "step": 94 + }, + { + "epoch": 0.12329656067488644, + "grad_norm": 0.3687002197662538, + "learning_rate": 2.0346320346320347e-05, + "loss": 0.6269, + "step": 95 + }, + { + "epoch": 0.1245944192083063, + "grad_norm": 0.26974731920341294, + "learning_rate": 2.0562770562770564e-05, + "loss": 0.6355, + "step": 96 + }, + { + "epoch": 0.12589227774172615, + "grad_norm": 0.35521751114512884, + "learning_rate": 2.077922077922078e-05, + "loss": 0.6293, + "step": 97 + }, + { + "epoch": 0.127190136275146, + "grad_norm": 0.31122119266101045, + "learning_rate": 2.0995670995670998e-05, + "loss": 0.6548, + "step": 98 + }, + { + "epoch": 0.12848799480856588, + "grad_norm": 0.32784103974924345, + "learning_rate": 2.1212121212121215e-05, + "loss": 0.6409, + "step": 99 + }, + { + "epoch": 0.12978585334198572, + "grad_norm": 0.2862191321006967, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.6287, + "step": 100 + }, + { + "epoch": 0.13108371187540557, + "grad_norm": 0.2888970770108121, + "learning_rate": 2.1645021645021645e-05, + "loss": 0.5825, + "step": 101 + }, + { + "epoch": 0.13238157040882545, + "grad_norm": 0.27541204550524634, + "learning_rate": 2.1861471861471862e-05, + "loss": 0.6056, + "step": 102 + }, + { + "epoch": 0.1336794289422453, + "grad_norm": 0.2829745345550545, + "learning_rate": 2.207792207792208e-05, + "loss": 0.6388, + "step": 103 + }, + { + "epoch": 0.13497728747566515, + "grad_norm": 0.31335331278223877, + "learning_rate": 2.2294372294372296e-05, + "loss": 0.6149, + "step": 104 + }, + { + "epoch": 0.136275146009085, + "grad_norm": 0.26183513844983125, + "learning_rate": 2.2510822510822512e-05, + "loss": 0.598, + "step": 105 + }, + { + "epoch": 0.13757300454250487, + "grad_norm": 0.3166303353223508, + "learning_rate": 2.272727272727273e-05, + "loss": 0.6153, + "step": 106 + }, + { + "epoch": 0.13887086307592472, + "grad_norm": 0.2827827597423759, + "learning_rate": 2.2943722943722946e-05, + "loss": 0.5878, + "step": 107 + }, + { + "epoch": 0.14016872160934457, + "grad_norm": 0.27950978868403287, + "learning_rate": 2.3160173160173163e-05, + "loss": 0.6022, + "step": 108 + }, + { + "epoch": 0.14146658014276445, + "grad_norm": 0.31785506543954495, + "learning_rate": 2.3376623376623376e-05, + "loss": 0.6419, + "step": 109 + }, + { + "epoch": 0.1427644386761843, + "grad_norm": 0.2760724448320942, + "learning_rate": 2.3593073593073593e-05, + "loss": 0.5892, + "step": 110 + }, + { + "epoch": 0.14406229720960415, + "grad_norm": 0.31705667668464776, + "learning_rate": 2.380952380952381e-05, + "loss": 0.5828, + "step": 111 + }, + { + "epoch": 0.14536015574302402, + "grad_norm": 0.2786427147511611, + "learning_rate": 2.4025974025974027e-05, + "loss": 0.6189, + "step": 112 + }, + { + "epoch": 0.14665801427644387, + "grad_norm": 0.33800188191224245, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.5867, + "step": 113 + }, + { + "epoch": 0.14795587280986372, + "grad_norm": 0.3183986863565769, + "learning_rate": 2.4458874458874457e-05, + "loss": 0.6244, + "step": 114 + }, + { + "epoch": 0.14925373134328357, + "grad_norm": 0.346611504802979, + "learning_rate": 2.4675324675324678e-05, + "loss": 0.6114, + "step": 115 + }, + { + "epoch": 0.15055158987670345, + "grad_norm": 0.3193746967683076, + "learning_rate": 2.4891774891774894e-05, + "loss": 0.5847, + "step": 116 + }, + { + "epoch": 0.1518494484101233, + "grad_norm": 0.329720331399979, + "learning_rate": 2.5108225108225108e-05, + "loss": 0.6104, + "step": 117 + }, + { + "epoch": 0.15314730694354314, + "grad_norm": 0.30497761214035857, + "learning_rate": 2.5324675324675325e-05, + "loss": 0.6147, + "step": 118 + }, + { + "epoch": 0.15444516547696302, + "grad_norm": 0.3065657873353463, + "learning_rate": 2.5541125541125545e-05, + "loss": 0.5891, + "step": 119 + }, + { + "epoch": 0.15574302401038287, + "grad_norm": 0.3040591111660935, + "learning_rate": 2.575757575757576e-05, + "loss": 0.5874, + "step": 120 + }, + { + "epoch": 0.15704088254380272, + "grad_norm": 0.3176140258251669, + "learning_rate": 2.5974025974025972e-05, + "loss": 0.5891, + "step": 121 + }, + { + "epoch": 0.1583387410772226, + "grad_norm": 0.33129130491628744, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.5754, + "step": 122 + }, + { + "epoch": 0.15963659961064244, + "grad_norm": 0.3400250207622185, + "learning_rate": 2.640692640692641e-05, + "loss": 0.5927, + "step": 123 + }, + { + "epoch": 0.1609344581440623, + "grad_norm": 0.3294442929975534, + "learning_rate": 2.6623376623376623e-05, + "loss": 0.6016, + "step": 124 + }, + { + "epoch": 0.16223231667748214, + "grad_norm": 0.27952039743370355, + "learning_rate": 2.6839826839826843e-05, + "loss": 0.5674, + "step": 125 + }, + { + "epoch": 0.16353017521090202, + "grad_norm": 0.3263152361115472, + "learning_rate": 2.7056277056277056e-05, + "loss": 0.6185, + "step": 126 + }, + { + "epoch": 0.16482803374432187, + "grad_norm": 0.34561117525982527, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.6003, + "step": 127 + }, + { + "epoch": 0.16612589227774172, + "grad_norm": 0.36330136868220264, + "learning_rate": 2.7489177489177493e-05, + "loss": 0.5796, + "step": 128 + }, + { + "epoch": 0.1674237508111616, + "grad_norm": 0.3448144052747857, + "learning_rate": 2.7705627705627707e-05, + "loss": 0.5858, + "step": 129 + }, + { + "epoch": 0.16872160934458144, + "grad_norm": 0.30841385505522906, + "learning_rate": 2.792207792207792e-05, + "loss": 0.5913, + "step": 130 + }, + { + "epoch": 0.1700194678780013, + "grad_norm": 0.3823986000835476, + "learning_rate": 2.813852813852814e-05, + "loss": 0.6089, + "step": 131 + }, + { + "epoch": 0.17131732641142117, + "grad_norm": 0.3183137204294537, + "learning_rate": 2.8354978354978357e-05, + "loss": 0.5974, + "step": 132 + }, + { + "epoch": 0.17261518494484102, + "grad_norm": 0.3375228791953999, + "learning_rate": 2.857142857142857e-05, + "loss": 0.5919, + "step": 133 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.34769553896113353, + "learning_rate": 2.878787878787879e-05, + "loss": 0.587, + "step": 134 + }, + { + "epoch": 0.1752109020116807, + "grad_norm": 0.34053830322587214, + "learning_rate": 2.9004329004329005e-05, + "loss": 0.5846, + "step": 135 + }, + { + "epoch": 0.1765087605451006, + "grad_norm": 0.3327693629121813, + "learning_rate": 2.922077922077922e-05, + "loss": 0.5929, + "step": 136 + }, + { + "epoch": 0.17780661907852044, + "grad_norm": 0.37595317145253215, + "learning_rate": 2.943722943722944e-05, + "loss": 0.5836, + "step": 137 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 0.31124901930305726, + "learning_rate": 2.9653679653679655e-05, + "loss": 0.5946, + "step": 138 + }, + { + "epoch": 0.18040233614536016, + "grad_norm": 0.41500685318923003, + "learning_rate": 2.9870129870129872e-05, + "loss": 0.599, + "step": 139 + }, + { + "epoch": 0.18170019467878, + "grad_norm": 0.4422225800744917, + "learning_rate": 3.0086580086580092e-05, + "loss": 0.6079, + "step": 140 + }, + { + "epoch": 0.18299805321219986, + "grad_norm": 0.3911349391427895, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.5911, + "step": 141 + }, + { + "epoch": 0.18429591174561974, + "grad_norm": 0.3565760473012978, + "learning_rate": 3.051948051948052e-05, + "loss": 0.5874, + "step": 142 + }, + { + "epoch": 0.1855937702790396, + "grad_norm": 0.3316833578762426, + "learning_rate": 3.073593073593073e-05, + "loss": 0.5987, + "step": 143 + }, + { + "epoch": 0.18689162881245944, + "grad_norm": 0.4255792906025628, + "learning_rate": 3.095238095238095e-05, + "loss": 0.5674, + "step": 144 + }, + { + "epoch": 0.18818948734587929, + "grad_norm": 0.3111389344438918, + "learning_rate": 3.1168831168831166e-05, + "loss": 0.5916, + "step": 145 + }, + { + "epoch": 0.18948734587929916, + "grad_norm": 0.40391893328316164, + "learning_rate": 3.1385281385281387e-05, + "loss": 0.5862, + "step": 146 + }, + { + "epoch": 0.190785204412719, + "grad_norm": 0.3571856870514297, + "learning_rate": 3.160173160173161e-05, + "loss": 0.5783, + "step": 147 + }, + { + "epoch": 0.19208306294613886, + "grad_norm": 0.34724535128608686, + "learning_rate": 3.181818181818182e-05, + "loss": 0.593, + "step": 148 + }, + { + "epoch": 0.19338092147955874, + "grad_norm": 0.36623311715616075, + "learning_rate": 3.2034632034632034e-05, + "loss": 0.5791, + "step": 149 + }, + { + "epoch": 0.1946787800129786, + "grad_norm": 0.35421377131407383, + "learning_rate": 3.2251082251082254e-05, + "loss": 0.5869, + "step": 150 + }, + { + "epoch": 0.19597663854639844, + "grad_norm": 0.3580175565804796, + "learning_rate": 3.246753246753247e-05, + "loss": 0.5731, + "step": 151 + }, + { + "epoch": 0.1972744970798183, + "grad_norm": 0.3779107544260428, + "learning_rate": 3.268398268398268e-05, + "loss": 0.5888, + "step": 152 + }, + { + "epoch": 0.19857235561323816, + "grad_norm": 0.381401724832965, + "learning_rate": 3.29004329004329e-05, + "loss": 0.5754, + "step": 153 + }, + { + "epoch": 0.199870214146658, + "grad_norm": 0.3996699371198549, + "learning_rate": 3.311688311688312e-05, + "loss": 0.5878, + "step": 154 + }, + { + "epoch": 0.20116807268007786, + "grad_norm": 0.3498521285804811, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5624, + "step": 155 + }, + { + "epoch": 0.20246593121349774, + "grad_norm": 0.4329402753533996, + "learning_rate": 3.3549783549783555e-05, + "loss": 0.5823, + "step": 156 + }, + { + "epoch": 0.20376378974691758, + "grad_norm": 0.4715275117713498, + "learning_rate": 3.376623376623377e-05, + "loss": 0.5679, + "step": 157 + }, + { + "epoch": 0.20506164828033743, + "grad_norm": 0.4087297995001702, + "learning_rate": 3.398268398268398e-05, + "loss": 0.5492, + "step": 158 + }, + { + "epoch": 0.2063595068137573, + "grad_norm": 0.3963104181486302, + "learning_rate": 3.41991341991342e-05, + "loss": 0.5779, + "step": 159 + }, + { + "epoch": 0.20765736534717716, + "grad_norm": 0.4626784659383467, + "learning_rate": 3.4415584415584416e-05, + "loss": 0.5791, + "step": 160 + }, + { + "epoch": 0.208955223880597, + "grad_norm": 0.4629189431646934, + "learning_rate": 3.463203463203463e-05, + "loss": 0.5709, + "step": 161 + }, + { + "epoch": 0.21025308241401688, + "grad_norm": 0.4327284524192223, + "learning_rate": 3.484848484848485e-05, + "loss": 0.5821, + "step": 162 + }, + { + "epoch": 0.21155094094743673, + "grad_norm": 0.42226923421652224, + "learning_rate": 3.506493506493507e-05, + "loss": 0.579, + "step": 163 + }, + { + "epoch": 0.21284879948085658, + "grad_norm": 0.37986989822155737, + "learning_rate": 3.528138528138528e-05, + "loss": 0.5656, + "step": 164 + }, + { + "epoch": 0.21414665801427643, + "grad_norm": 0.4629547655665463, + "learning_rate": 3.5497835497835503e-05, + "loss": 0.5703, + "step": 165 + }, + { + "epoch": 0.2154445165476963, + "grad_norm": 0.41674661311211725, + "learning_rate": 3.571428571428572e-05, + "loss": 0.5775, + "step": 166 + }, + { + "epoch": 0.21674237508111616, + "grad_norm": 0.37812170353301494, + "learning_rate": 3.593073593073593e-05, + "loss": 0.5647, + "step": 167 + }, + { + "epoch": 0.218040233614536, + "grad_norm": 0.3533683709945352, + "learning_rate": 3.6147186147186144e-05, + "loss": 0.5742, + "step": 168 + }, + { + "epoch": 0.21933809214795588, + "grad_norm": 0.3327311378407231, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.5616, + "step": 169 + }, + { + "epoch": 0.22063595068137573, + "grad_norm": 0.31532044110811386, + "learning_rate": 3.6580086580086584e-05, + "loss": 0.5434, + "step": 170 + }, + { + "epoch": 0.22193380921479558, + "grad_norm": 0.3191051407668251, + "learning_rate": 3.67965367965368e-05, + "loss": 0.5719, + "step": 171 + }, + { + "epoch": 0.22323166774821546, + "grad_norm": 0.36508138364995835, + "learning_rate": 3.701298701298702e-05, + "loss": 0.5496, + "step": 172 + }, + { + "epoch": 0.2245295262816353, + "grad_norm": 0.35917301960844417, + "learning_rate": 3.722943722943723e-05, + "loss": 0.5599, + "step": 173 + }, + { + "epoch": 0.22582738481505515, + "grad_norm": 0.332146935347223, + "learning_rate": 3.7445887445887445e-05, + "loss": 0.5729, + "step": 174 + }, + { + "epoch": 0.227125243348475, + "grad_norm": 0.3761709742507644, + "learning_rate": 3.7662337662337665e-05, + "loss": 0.5395, + "step": 175 + }, + { + "epoch": 0.22842310188189488, + "grad_norm": 0.357015737631827, + "learning_rate": 3.787878787878788e-05, + "loss": 0.5661, + "step": 176 + }, + { + "epoch": 0.22972096041531473, + "grad_norm": 0.34903223053324706, + "learning_rate": 3.809523809523809e-05, + "loss": 0.5633, + "step": 177 + }, + { + "epoch": 0.23101881894873458, + "grad_norm": 0.3592680565530814, + "learning_rate": 3.831168831168831e-05, + "loss": 0.5504, + "step": 178 + }, + { + "epoch": 0.23231667748215445, + "grad_norm": 0.31763219861115954, + "learning_rate": 3.852813852813853e-05, + "loss": 0.5693, + "step": 179 + }, + { + "epoch": 0.2336145360155743, + "grad_norm": 0.37831351950166914, + "learning_rate": 3.8744588744588746e-05, + "loss": 0.5545, + "step": 180 + }, + { + "epoch": 0.23491239454899415, + "grad_norm": 0.3032029950520603, + "learning_rate": 3.8961038961038966e-05, + "loss": 0.5483, + "step": 181 + }, + { + "epoch": 0.23621025308241403, + "grad_norm": 0.45423832636818023, + "learning_rate": 3.917748917748918e-05, + "loss": 0.5646, + "step": 182 + }, + { + "epoch": 0.23750811161583388, + "grad_norm": 0.3885683241280637, + "learning_rate": 3.939393939393939e-05, + "loss": 0.5782, + "step": 183 + }, + { + "epoch": 0.23880597014925373, + "grad_norm": 0.4113001708391826, + "learning_rate": 3.9610389610389614e-05, + "loss": 0.5709, + "step": 184 + }, + { + "epoch": 0.24010382868267358, + "grad_norm": 0.43060691545097807, + "learning_rate": 3.982683982683983e-05, + "loss": 0.5357, + "step": 185 + }, + { + "epoch": 0.24140168721609345, + "grad_norm": 0.48621329563873417, + "learning_rate": 4.004329004329004e-05, + "loss": 0.5438, + "step": 186 + }, + { + "epoch": 0.2426995457495133, + "grad_norm": 0.346819520203559, + "learning_rate": 4.025974025974026e-05, + "loss": 0.5448, + "step": 187 + }, + { + "epoch": 0.24399740428293315, + "grad_norm": 0.5771040244138606, + "learning_rate": 4.047619047619048e-05, + "loss": 0.5609, + "step": 188 + }, + { + "epoch": 0.24529526281635303, + "grad_norm": 0.5691856093486398, + "learning_rate": 4.0692640692640695e-05, + "loss": 0.5509, + "step": 189 + }, + { + "epoch": 0.24659312134977288, + "grad_norm": 0.5658078548327832, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.5457, + "step": 190 + }, + { + "epoch": 0.24789097988319272, + "grad_norm": 0.32565141992028185, + "learning_rate": 4.112554112554113e-05, + "loss": 0.5576, + "step": 191 + }, + { + "epoch": 0.2491888384166126, + "grad_norm": 0.693035116005457, + "learning_rate": 4.134199134199134e-05, + "loss": 0.5818, + "step": 192 + }, + { + "epoch": 0.25048669695003245, + "grad_norm": 0.5767521454545272, + "learning_rate": 4.155844155844156e-05, + "loss": 0.5651, + "step": 193 + }, + { + "epoch": 0.2517845554834523, + "grad_norm": 0.5780821207088752, + "learning_rate": 4.1774891774891775e-05, + "loss": 0.569, + "step": 194 + }, + { + "epoch": 0.25308241401687215, + "grad_norm": 0.37604239901597153, + "learning_rate": 4.1991341991341996e-05, + "loss": 0.543, + "step": 195 + }, + { + "epoch": 0.254380272550292, + "grad_norm": 0.5156588377708116, + "learning_rate": 4.220779220779221e-05, + "loss": 0.5655, + "step": 196 + }, + { + "epoch": 0.2556781310837119, + "grad_norm": 0.547020541236707, + "learning_rate": 4.242424242424243e-05, + "loss": 0.5823, + "step": 197 + }, + { + "epoch": 0.25697598961713175, + "grad_norm": 0.4902045464021542, + "learning_rate": 4.264069264069264e-05, + "loss": 0.5819, + "step": 198 + }, + { + "epoch": 0.2582738481505516, + "grad_norm": 0.43892225186858413, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.5407, + "step": 199 + }, + { + "epoch": 0.25957170668397145, + "grad_norm": 0.35311657422045256, + "learning_rate": 4.3073593073593077e-05, + "loss": 0.5333, + "step": 200 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.550191337628468, + "learning_rate": 4.329004329004329e-05, + "loss": 0.5691, + "step": 201 + }, + { + "epoch": 0.26216742375081115, + "grad_norm": 0.3252801360738619, + "learning_rate": 4.3506493506493503e-05, + "loss": 0.5533, + "step": 202 + }, + { + "epoch": 0.263465282284231, + "grad_norm": 0.4553304513015423, + "learning_rate": 4.3722943722943724e-05, + "loss": 0.5565, + "step": 203 + }, + { + "epoch": 0.2647631408176509, + "grad_norm": 0.3263307722581273, + "learning_rate": 4.3939393939393944e-05, + "loss": 0.5671, + "step": 204 + }, + { + "epoch": 0.26606099935107075, + "grad_norm": 0.4000844274004943, + "learning_rate": 4.415584415584416e-05, + "loss": 0.5399, + "step": 205 + }, + { + "epoch": 0.2673588578844906, + "grad_norm": 0.38431545582799964, + "learning_rate": 4.437229437229438e-05, + "loss": 0.5417, + "step": 206 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 0.40816346897613404, + "learning_rate": 4.458874458874459e-05, + "loss": 0.5592, + "step": 207 + }, + { + "epoch": 0.2699545749513303, + "grad_norm": 0.34584489381728045, + "learning_rate": 4.4805194805194805e-05, + "loss": 0.5438, + "step": 208 + }, + { + "epoch": 0.27125243348475014, + "grad_norm": 0.4537307823944973, + "learning_rate": 4.5021645021645025e-05, + "loss": 0.5399, + "step": 209 + }, + { + "epoch": 0.27255029201817, + "grad_norm": 0.5752635325535591, + "learning_rate": 4.523809523809524e-05, + "loss": 0.5672, + "step": 210 + }, + { + "epoch": 0.2738481505515899, + "grad_norm": 0.4083818210095887, + "learning_rate": 4.545454545454546e-05, + "loss": 0.5617, + "step": 211 + }, + { + "epoch": 0.27514600908500975, + "grad_norm": 0.3170399657064755, + "learning_rate": 4.567099567099568e-05, + "loss": 0.5352, + "step": 212 + }, + { + "epoch": 0.2764438676184296, + "grad_norm": 0.31917717130826856, + "learning_rate": 4.588744588744589e-05, + "loss": 0.5617, + "step": 213 + }, + { + "epoch": 0.27774172615184944, + "grad_norm": 0.3772448329589651, + "learning_rate": 4.6103896103896106e-05, + "loss": 0.5662, + "step": 214 + }, + { + "epoch": 0.2790395846852693, + "grad_norm": 0.3799776585928483, + "learning_rate": 4.6320346320346326e-05, + "loss": 0.5814, + "step": 215 + }, + { + "epoch": 0.28033744321868914, + "grad_norm": 0.376252486652199, + "learning_rate": 4.653679653679654e-05, + "loss": 0.5475, + "step": 216 + }, + { + "epoch": 0.28163530175210905, + "grad_norm": 0.3703755301826731, + "learning_rate": 4.675324675324675e-05, + "loss": 0.5488, + "step": 217 + }, + { + "epoch": 0.2829331602855289, + "grad_norm": 0.32861168704985866, + "learning_rate": 4.696969696969697e-05, + "loss": 0.554, + "step": 218 + }, + { + "epoch": 0.28423101881894874, + "grad_norm": 0.3475845025879547, + "learning_rate": 4.718614718614719e-05, + "loss": 0.5407, + "step": 219 + }, + { + "epoch": 0.2855288773523686, + "grad_norm": 0.3648655973805309, + "learning_rate": 4.740259740259741e-05, + "loss": 0.5466, + "step": 220 + }, + { + "epoch": 0.28682673588578844, + "grad_norm": 0.3350866523035428, + "learning_rate": 4.761904761904762e-05, + "loss": 0.5548, + "step": 221 + }, + { + "epoch": 0.2881245944192083, + "grad_norm": 0.43767143054287594, + "learning_rate": 4.783549783549784e-05, + "loss": 0.5684, + "step": 222 + }, + { + "epoch": 0.28942245295262814, + "grad_norm": 0.421178133286777, + "learning_rate": 4.8051948051948054e-05, + "loss": 0.5639, + "step": 223 + }, + { + "epoch": 0.29072031148604804, + "grad_norm": 0.37835877083504477, + "learning_rate": 4.826839826839827e-05, + "loss": 0.5566, + "step": 224 + }, + { + "epoch": 0.2920181700194679, + "grad_norm": 0.3417724143733512, + "learning_rate": 4.848484848484849e-05, + "loss": 0.5552, + "step": 225 + }, + { + "epoch": 0.29331602855288774, + "grad_norm": 0.3870541340632366, + "learning_rate": 4.87012987012987e-05, + "loss": 0.549, + "step": 226 + }, + { + "epoch": 0.2946138870863076, + "grad_norm": 0.4889598386044001, + "learning_rate": 4.8917748917748915e-05, + "loss": 0.5538, + "step": 227 + }, + { + "epoch": 0.29591174561972744, + "grad_norm": 0.4543222558469965, + "learning_rate": 4.9134199134199135e-05, + "loss": 0.5651, + "step": 228 + }, + { + "epoch": 0.2972096041531473, + "grad_norm": 0.38147571297168936, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.5456, + "step": 229 + }, + { + "epoch": 0.29850746268656714, + "grad_norm": 0.48062886052178266, + "learning_rate": 4.956709956709957e-05, + "loss": 0.5519, + "step": 230 + }, + { + "epoch": 0.29980532121998704, + "grad_norm": 0.3436776708584428, + "learning_rate": 4.978354978354979e-05, + "loss": 0.5572, + "step": 231 + }, + { + "epoch": 0.3011031797534069, + "grad_norm": 0.48075516118965306, + "learning_rate": 5e-05, + "loss": 0.5566, + "step": 232 + }, + { + "epoch": 0.30240103828682674, + "grad_norm": 0.5764784128417795, + "learning_rate": 4.997594997594998e-05, + "loss": 0.5819, + "step": 233 + }, + { + "epoch": 0.3036988968202466, + "grad_norm": 0.396476527818061, + "learning_rate": 4.995189995189995e-05, + "loss": 0.5411, + "step": 234 + }, + { + "epoch": 0.30499675535366644, + "grad_norm": 0.46291378630567925, + "learning_rate": 4.992784992784993e-05, + "loss": 0.5552, + "step": 235 + }, + { + "epoch": 0.3062946138870863, + "grad_norm": 0.44861478710130637, + "learning_rate": 4.990379990379991e-05, + "loss": 0.544, + "step": 236 + }, + { + "epoch": 0.3075924724205062, + "grad_norm": 0.3873611746053732, + "learning_rate": 4.987974987974988e-05, + "loss": 0.5556, + "step": 237 + }, + { + "epoch": 0.30889033095392604, + "grad_norm": 0.41664468323948045, + "learning_rate": 4.985569985569986e-05, + "loss": 0.5771, + "step": 238 + }, + { + "epoch": 0.3101881894873459, + "grad_norm": 0.3859756658142421, + "learning_rate": 4.983164983164983e-05, + "loss": 0.532, + "step": 239 + }, + { + "epoch": 0.31148604802076574, + "grad_norm": 0.44937677319362224, + "learning_rate": 4.980759980759981e-05, + "loss": 0.5426, + "step": 240 + }, + { + "epoch": 0.3127839065541856, + "grad_norm": 0.4349437401518082, + "learning_rate": 4.978354978354979e-05, + "loss": 0.5259, + "step": 241 + }, + { + "epoch": 0.31408176508760544, + "grad_norm": 0.400324790012705, + "learning_rate": 4.9759499759499764e-05, + "loss": 0.5537, + "step": 242 + }, + { + "epoch": 0.3153796236210253, + "grad_norm": 0.43872297509664254, + "learning_rate": 4.973544973544973e-05, + "loss": 0.549, + "step": 243 + }, + { + "epoch": 0.3166774821544452, + "grad_norm": 0.4653708053643151, + "learning_rate": 4.971139971139971e-05, + "loss": 0.5254, + "step": 244 + }, + { + "epoch": 0.31797534068786504, + "grad_norm": 0.40941811760654495, + "learning_rate": 4.968734968734969e-05, + "loss": 0.568, + "step": 245 + }, + { + "epoch": 0.3192731992212849, + "grad_norm": 0.5368348479077355, + "learning_rate": 4.966329966329967e-05, + "loss": 0.5476, + "step": 246 + }, + { + "epoch": 0.32057105775470474, + "grad_norm": 0.4544642184839637, + "learning_rate": 4.963924963924964e-05, + "loss": 0.5566, + "step": 247 + }, + { + "epoch": 0.3218689162881246, + "grad_norm": 0.42978031279738266, + "learning_rate": 4.961519961519962e-05, + "loss": 0.548, + "step": 248 + }, + { + "epoch": 0.32316677482154443, + "grad_norm": 0.41191622365654873, + "learning_rate": 4.9591149591149594e-05, + "loss": 0.5458, + "step": 249 + }, + { + "epoch": 0.3244646333549643, + "grad_norm": 0.6074054124348204, + "learning_rate": 4.956709956709957e-05, + "loss": 0.5519, + "step": 250 + }, + { + "epoch": 0.3257624918883842, + "grad_norm": 0.4651053481351256, + "learning_rate": 4.9543049543049543e-05, + "loss": 0.5811, + "step": 251 + }, + { + "epoch": 0.32706035042180404, + "grad_norm": 0.4240240962916135, + "learning_rate": 4.951899951899952e-05, + "loss": 0.5523, + "step": 252 + }, + { + "epoch": 0.3283582089552239, + "grad_norm": 0.5066208761057746, + "learning_rate": 4.94949494949495e-05, + "loss": 0.544, + "step": 253 + }, + { + "epoch": 0.32965606748864373, + "grad_norm": 0.38109072762259544, + "learning_rate": 4.9470899470899475e-05, + "loss": 0.5538, + "step": 254 + }, + { + "epoch": 0.3309539260220636, + "grad_norm": 0.5117807003713138, + "learning_rate": 4.944684944684945e-05, + "loss": 0.5577, + "step": 255 + }, + { + "epoch": 0.33225178455548343, + "grad_norm": 0.44912086500472626, + "learning_rate": 4.9422799422799424e-05, + "loss": 0.5495, + "step": 256 + }, + { + "epoch": 0.33354964308890334, + "grad_norm": 0.3651331486905666, + "learning_rate": 4.93987493987494e-05, + "loss": 0.5631, + "step": 257 + }, + { + "epoch": 0.3348475016223232, + "grad_norm": 0.5611125950484844, + "learning_rate": 4.937469937469938e-05, + "loss": 0.5465, + "step": 258 + }, + { + "epoch": 0.33614536015574303, + "grad_norm": 0.5300284860526002, + "learning_rate": 4.9350649350649355e-05, + "loss": 0.5425, + "step": 259 + }, + { + "epoch": 0.3374432186891629, + "grad_norm": 0.42241934122178765, + "learning_rate": 4.932659932659932e-05, + "loss": 0.5613, + "step": 260 + }, + { + "epoch": 0.33874107722258273, + "grad_norm": 0.6480707951702842, + "learning_rate": 4.9302549302549305e-05, + "loss": 0.5443, + "step": 261 + }, + { + "epoch": 0.3400389357560026, + "grad_norm": 0.5458559898285835, + "learning_rate": 4.927849927849928e-05, + "loss": 0.5362, + "step": 262 + }, + { + "epoch": 0.34133679428942243, + "grad_norm": 0.4307852761395753, + "learning_rate": 4.925444925444926e-05, + "loss": 0.5405, + "step": 263 + }, + { + "epoch": 0.34263465282284233, + "grad_norm": 0.5693990395449862, + "learning_rate": 4.923039923039923e-05, + "loss": 0.5455, + "step": 264 + }, + { + "epoch": 0.3439325113562622, + "grad_norm": 0.4427765568418805, + "learning_rate": 4.9206349206349204e-05, + "loss": 0.5475, + "step": 265 + }, + { + "epoch": 0.34523036988968203, + "grad_norm": 0.4724926699957873, + "learning_rate": 4.9182299182299185e-05, + "loss": 0.5502, + "step": 266 + }, + { + "epoch": 0.3465282284231019, + "grad_norm": 0.6296467164625645, + "learning_rate": 4.915824915824916e-05, + "loss": 0.555, + "step": 267 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.521325771991002, + "learning_rate": 4.9134199134199135e-05, + "loss": 0.5521, + "step": 268 + }, + { + "epoch": 0.3491239454899416, + "grad_norm": 0.4920183923356473, + "learning_rate": 4.911014911014911e-05, + "loss": 0.5589, + "step": 269 + }, + { + "epoch": 0.3504218040233614, + "grad_norm": 0.6860051439883974, + "learning_rate": 4.908609908609909e-05, + "loss": 0.5396, + "step": 270 + }, + { + "epoch": 0.35171966255678133, + "grad_norm": 0.38098025544839875, + "learning_rate": 4.9062049062049066e-05, + "loss": 0.5439, + "step": 271 + }, + { + "epoch": 0.3530175210902012, + "grad_norm": 0.545523500518169, + "learning_rate": 4.903799903799904e-05, + "loss": 0.5285, + "step": 272 + }, + { + "epoch": 0.35431537962362103, + "grad_norm": 0.4773245042110645, + "learning_rate": 4.9013949013949016e-05, + "loss": 0.5506, + "step": 273 + }, + { + "epoch": 0.3556132381570409, + "grad_norm": 0.41823644467627, + "learning_rate": 4.898989898989899e-05, + "loss": 0.5382, + "step": 274 + }, + { + "epoch": 0.3569110966904607, + "grad_norm": 0.43108861210799143, + "learning_rate": 4.896584896584897e-05, + "loss": 0.5312, + "step": 275 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.35256122918946825, + "learning_rate": 4.894179894179895e-05, + "loss": 0.5507, + "step": 276 + }, + { + "epoch": 0.3595068137573005, + "grad_norm": 0.641603115121163, + "learning_rate": 4.8917748917748915e-05, + "loss": 0.5494, + "step": 277 + }, + { + "epoch": 0.36080467229072033, + "grad_norm": 0.42144449610145046, + "learning_rate": 4.8893698893698896e-05, + "loss": 0.5537, + "step": 278 + }, + { + "epoch": 0.3621025308241402, + "grad_norm": 0.4421221398296794, + "learning_rate": 4.886964886964887e-05, + "loss": 0.5305, + "step": 279 + }, + { + "epoch": 0.36340038935756, + "grad_norm": 0.34904354726043524, + "learning_rate": 4.884559884559885e-05, + "loss": 0.5151, + "step": 280 + }, + { + "epoch": 0.3646982478909799, + "grad_norm": 0.567323138161088, + "learning_rate": 4.882154882154882e-05, + "loss": 0.5394, + "step": 281 + }, + { + "epoch": 0.3659961064243997, + "grad_norm": 0.4275900383373202, + "learning_rate": 4.8797498797498795e-05, + "loss": 0.5793, + "step": 282 + }, + { + "epoch": 0.3672939649578196, + "grad_norm": 0.43590374764579676, + "learning_rate": 4.877344877344878e-05, + "loss": 0.5238, + "step": 283 + }, + { + "epoch": 0.3685918234912395, + "grad_norm": 0.3806185860855704, + "learning_rate": 4.874939874939875e-05, + "loss": 0.5311, + "step": 284 + }, + { + "epoch": 0.3698896820246593, + "grad_norm": 0.36597622680635733, + "learning_rate": 4.8725348725348726e-05, + "loss": 0.5483, + "step": 285 + }, + { + "epoch": 0.3711875405580792, + "grad_norm": 0.39934249219009466, + "learning_rate": 4.87012987012987e-05, + "loss": 0.5323, + "step": 286 + }, + { + "epoch": 0.372485399091499, + "grad_norm": 0.35489673738601485, + "learning_rate": 4.8677248677248676e-05, + "loss": 0.5197, + "step": 287 + }, + { + "epoch": 0.3737832576249189, + "grad_norm": 0.35597996373456253, + "learning_rate": 4.865319865319866e-05, + "loss": 0.541, + "step": 288 + }, + { + "epoch": 0.3750811161583387, + "grad_norm": 0.30995272924377104, + "learning_rate": 4.862914862914863e-05, + "loss": 0.531, + "step": 289 + }, + { + "epoch": 0.37637897469175857, + "grad_norm": 0.3041222657562842, + "learning_rate": 4.860509860509861e-05, + "loss": 0.5263, + "step": 290 + }, + { + "epoch": 0.3776768332251785, + "grad_norm": 0.27479710316885086, + "learning_rate": 4.858104858104858e-05, + "loss": 0.5179, + "step": 291 + }, + { + "epoch": 0.3789746917585983, + "grad_norm": 0.4108809131825242, + "learning_rate": 4.8556998556998563e-05, + "loss": 0.5285, + "step": 292 + }, + { + "epoch": 0.3802725502920182, + "grad_norm": 0.3283706178094482, + "learning_rate": 4.853294853294854e-05, + "loss": 0.5485, + "step": 293 + }, + { + "epoch": 0.381570408825438, + "grad_norm": 0.3628325275789365, + "learning_rate": 4.8508898508898506e-05, + "loss": 0.5342, + "step": 294 + }, + { + "epoch": 0.38286826735885787, + "grad_norm": 0.3545709020214379, + "learning_rate": 4.848484848484849e-05, + "loss": 0.5441, + "step": 295 + }, + { + "epoch": 0.3841661258922777, + "grad_norm": 0.27536849505708144, + "learning_rate": 4.846079846079846e-05, + "loss": 0.5189, + "step": 296 + }, + { + "epoch": 0.3854639844256976, + "grad_norm": 0.31314568760395595, + "learning_rate": 4.8436748436748444e-05, + "loss": 0.5165, + "step": 297 + }, + { + "epoch": 0.3867618429591175, + "grad_norm": 0.31727676668467136, + "learning_rate": 4.841269841269841e-05, + "loss": 0.5185, + "step": 298 + }, + { + "epoch": 0.3880597014925373, + "grad_norm": 0.35285183197833564, + "learning_rate": 4.838864838864839e-05, + "loss": 0.5328, + "step": 299 + }, + { + "epoch": 0.3893575600259572, + "grad_norm": 0.2990420731073892, + "learning_rate": 4.836459836459837e-05, + "loss": 0.532, + "step": 300 + }, + { + "epoch": 0.390655418559377, + "grad_norm": 0.38606448483559813, + "learning_rate": 4.834054834054834e-05, + "loss": 0.5461, + "step": 301 + }, + { + "epoch": 0.39195327709279687, + "grad_norm": 0.37652402001442803, + "learning_rate": 4.831649831649832e-05, + "loss": 0.5379, + "step": 302 + }, + { + "epoch": 0.3932511356262167, + "grad_norm": 0.34953978468725405, + "learning_rate": 4.829244829244829e-05, + "loss": 0.5359, + "step": 303 + }, + { + "epoch": 0.3945489941596366, + "grad_norm": 0.3382778166946982, + "learning_rate": 4.826839826839827e-05, + "loss": 0.5342, + "step": 304 + }, + { + "epoch": 0.3958468526930565, + "grad_norm": 0.34560665492104875, + "learning_rate": 4.824434824434825e-05, + "loss": 0.5324, + "step": 305 + }, + { + "epoch": 0.3971447112264763, + "grad_norm": 0.34496470111641636, + "learning_rate": 4.8220298220298224e-05, + "loss": 0.5339, + "step": 306 + }, + { + "epoch": 0.39844256975989617, + "grad_norm": 0.40001685434062584, + "learning_rate": 4.81962481962482e-05, + "loss": 0.5272, + "step": 307 + }, + { + "epoch": 0.399740428293316, + "grad_norm": 0.366032696592655, + "learning_rate": 4.8172198172198173e-05, + "loss": 0.5306, + "step": 308 + }, + { + "epoch": 0.40103828682673587, + "grad_norm": 0.37927598899770393, + "learning_rate": 4.814814814814815e-05, + "loss": 0.5591, + "step": 309 + }, + { + "epoch": 0.4023361453601557, + "grad_norm": 0.32812121834756386, + "learning_rate": 4.812409812409813e-05, + "loss": 0.5422, + "step": 310 + }, + { + "epoch": 0.4036340038935756, + "grad_norm": 0.35171717899329513, + "learning_rate": 4.81000481000481e-05, + "loss": 0.532, + "step": 311 + }, + { + "epoch": 0.40493186242699547, + "grad_norm": 0.3756784968486016, + "learning_rate": 4.807599807599808e-05, + "loss": 0.5392, + "step": 312 + }, + { + "epoch": 0.4062297209604153, + "grad_norm": 0.3426264703785813, + "learning_rate": 4.8051948051948054e-05, + "loss": 0.5457, + "step": 313 + }, + { + "epoch": 0.40752757949383517, + "grad_norm": 0.39836935230937326, + "learning_rate": 4.8027898027898036e-05, + "loss": 0.5268, + "step": 314 + }, + { + "epoch": 0.408825438027255, + "grad_norm": 0.33486717640072616, + "learning_rate": 4.8003848003848004e-05, + "loss": 0.5298, + "step": 315 + }, + { + "epoch": 0.41012329656067487, + "grad_norm": 0.3463640087410465, + "learning_rate": 4.797979797979798e-05, + "loss": 0.5372, + "step": 316 + }, + { + "epoch": 0.41142115509409477, + "grad_norm": 0.2981951724559669, + "learning_rate": 4.795574795574796e-05, + "loss": 0.5193, + "step": 317 + }, + { + "epoch": 0.4127190136275146, + "grad_norm": 0.37701472504733063, + "learning_rate": 4.7931697931697935e-05, + "loss": 0.5262, + "step": 318 + }, + { + "epoch": 0.41401687216093447, + "grad_norm": 0.2958251594721693, + "learning_rate": 4.790764790764791e-05, + "loss": 0.5262, + "step": 319 + }, + { + "epoch": 0.4153147306943543, + "grad_norm": 0.36512530778352836, + "learning_rate": 4.7883597883597884e-05, + "loss": 0.5451, + "step": 320 + }, + { + "epoch": 0.41661258922777417, + "grad_norm": 0.32275488837011096, + "learning_rate": 4.785954785954786e-05, + "loss": 0.5378, + "step": 321 + }, + { + "epoch": 0.417910447761194, + "grad_norm": 0.29968884353456, + "learning_rate": 4.783549783549784e-05, + "loss": 0.5308, + "step": 322 + }, + { + "epoch": 0.41920830629461386, + "grad_norm": 0.3472967912976659, + "learning_rate": 4.7811447811447815e-05, + "loss": 0.5262, + "step": 323 + }, + { + "epoch": 0.42050616482803377, + "grad_norm": 0.3256673670375662, + "learning_rate": 4.778739778739779e-05, + "loss": 0.5349, + "step": 324 + }, + { + "epoch": 0.4218040233614536, + "grad_norm": 0.3830575202323324, + "learning_rate": 4.7763347763347765e-05, + "loss": 0.5343, + "step": 325 + }, + { + "epoch": 0.42310188189487347, + "grad_norm": 0.34375094386741617, + "learning_rate": 4.773929773929774e-05, + "loss": 0.5295, + "step": 326 + }, + { + "epoch": 0.4243997404282933, + "grad_norm": 0.32100699117380493, + "learning_rate": 4.771524771524772e-05, + "loss": 0.5241, + "step": 327 + }, + { + "epoch": 0.42569759896171316, + "grad_norm": 0.3546414912790039, + "learning_rate": 4.769119769119769e-05, + "loss": 0.5292, + "step": 328 + }, + { + "epoch": 0.426995457495133, + "grad_norm": 0.367282717001635, + "learning_rate": 4.766714766714767e-05, + "loss": 0.525, + "step": 329 + }, + { + "epoch": 0.42829331602855286, + "grad_norm": 0.36332040957365974, + "learning_rate": 4.7643097643097646e-05, + "loss": 0.5395, + "step": 330 + }, + { + "epoch": 0.42959117456197277, + "grad_norm": 0.36242424332632034, + "learning_rate": 4.761904761904762e-05, + "loss": 0.5533, + "step": 331 + }, + { + "epoch": 0.4308890330953926, + "grad_norm": 0.36697609874383924, + "learning_rate": 4.7594997594997595e-05, + "loss": 0.5359, + "step": 332 + }, + { + "epoch": 0.43218689162881246, + "grad_norm": 0.33118162802731466, + "learning_rate": 4.757094757094757e-05, + "loss": 0.5317, + "step": 333 + }, + { + "epoch": 0.4334847501622323, + "grad_norm": 0.30441401984534905, + "learning_rate": 4.754689754689755e-05, + "loss": 0.5117, + "step": 334 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.38992931838953715, + "learning_rate": 4.7522847522847526e-05, + "loss": 0.5461, + "step": 335 + }, + { + "epoch": 0.436080467229072, + "grad_norm": 0.3335314021890073, + "learning_rate": 4.74987974987975e-05, + "loss": 0.521, + "step": 336 + }, + { + "epoch": 0.4373783257624919, + "grad_norm": 0.44568000339670255, + "learning_rate": 4.7474747474747476e-05, + "loss": 0.5351, + "step": 337 + }, + { + "epoch": 0.43867618429591176, + "grad_norm": 0.30521804239806394, + "learning_rate": 4.745069745069745e-05, + "loss": 0.5354, + "step": 338 + }, + { + "epoch": 0.4399740428293316, + "grad_norm": 0.40068879251857975, + "learning_rate": 4.742664742664743e-05, + "loss": 0.519, + "step": 339 + }, + { + "epoch": 0.44127190136275146, + "grad_norm": 0.2988879048992771, + "learning_rate": 4.740259740259741e-05, + "loss": 0.5219, + "step": 340 + }, + { + "epoch": 0.4425697598961713, + "grad_norm": 0.3568374963385924, + "learning_rate": 4.737854737854738e-05, + "loss": 0.5407, + "step": 341 + }, + { + "epoch": 0.44386761842959116, + "grad_norm": 0.3314619863009568, + "learning_rate": 4.7354497354497356e-05, + "loss": 0.5293, + "step": 342 + }, + { + "epoch": 0.445165476963011, + "grad_norm": 0.38859118585963526, + "learning_rate": 4.733044733044733e-05, + "loss": 0.5313, + "step": 343 + }, + { + "epoch": 0.4464633354964309, + "grad_norm": 0.29870660291558937, + "learning_rate": 4.730639730639731e-05, + "loss": 0.5235, + "step": 344 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.33559539703922564, + "learning_rate": 4.728234728234728e-05, + "loss": 0.5165, + "step": 345 + }, + { + "epoch": 0.4490590525632706, + "grad_norm": 0.29408841700456767, + "learning_rate": 4.725829725829726e-05, + "loss": 0.5385, + "step": 346 + }, + { + "epoch": 0.45035691109669046, + "grad_norm": 0.37054598216697, + "learning_rate": 4.723424723424724e-05, + "loss": 0.5107, + "step": 347 + }, + { + "epoch": 0.4516547696301103, + "grad_norm": 0.3250925011044857, + "learning_rate": 4.721019721019721e-05, + "loss": 0.5233, + "step": 348 + }, + { + "epoch": 0.45295262816353016, + "grad_norm": 0.35856402885603006, + "learning_rate": 4.718614718614719e-05, + "loss": 0.5055, + "step": 349 + }, + { + "epoch": 0.45425048669695, + "grad_norm": 0.34612856309129164, + "learning_rate": 4.716209716209716e-05, + "loss": 0.5404, + "step": 350 + }, + { + "epoch": 0.4555483452303699, + "grad_norm": 0.33303853027334285, + "learning_rate": 4.713804713804714e-05, + "loss": 0.5178, + "step": 351 + }, + { + "epoch": 0.45684620376378976, + "grad_norm": 0.32091795432054987, + "learning_rate": 4.711399711399712e-05, + "loss": 0.5222, + "step": 352 + }, + { + "epoch": 0.4581440622972096, + "grad_norm": 0.34934754787554123, + "learning_rate": 4.708994708994709e-05, + "loss": 0.5405, + "step": 353 + }, + { + "epoch": 0.45944192083062946, + "grad_norm": 0.2937202903692653, + "learning_rate": 4.706589706589707e-05, + "loss": 0.5283, + "step": 354 + }, + { + "epoch": 0.4607397793640493, + "grad_norm": 0.3464664667340698, + "learning_rate": 4.704184704184704e-05, + "loss": 0.5257, + "step": 355 + }, + { + "epoch": 0.46203763789746916, + "grad_norm": 0.3021689351056674, + "learning_rate": 4.7017797017797024e-05, + "loss": 0.5256, + "step": 356 + }, + { + "epoch": 0.46333549643088906, + "grad_norm": 0.3373492256075124, + "learning_rate": 4.6993746993747e-05, + "loss": 0.5262, + "step": 357 + }, + { + "epoch": 0.4646333549643089, + "grad_norm": 0.3279466476251607, + "learning_rate": 4.696969696969697e-05, + "loss": 0.5252, + "step": 358 + }, + { + "epoch": 0.46593121349772876, + "grad_norm": 0.3151794533478745, + "learning_rate": 4.694564694564695e-05, + "loss": 0.5355, + "step": 359 + }, + { + "epoch": 0.4672290720311486, + "grad_norm": 0.3676469350203011, + "learning_rate": 4.692159692159692e-05, + "loss": 0.5536, + "step": 360 + }, + { + "epoch": 0.46852693056456846, + "grad_norm": 0.2638129347242171, + "learning_rate": 4.6897546897546904e-05, + "loss": 0.5167, + "step": 361 + }, + { + "epoch": 0.4698247890979883, + "grad_norm": 0.32036105761534295, + "learning_rate": 4.687349687349687e-05, + "loss": 0.5105, + "step": 362 + }, + { + "epoch": 0.47112264763140815, + "grad_norm": 0.3312350329187521, + "learning_rate": 4.6849446849446854e-05, + "loss": 0.5403, + "step": 363 + }, + { + "epoch": 0.47242050616482806, + "grad_norm": 0.2691270807481844, + "learning_rate": 4.682539682539683e-05, + "loss": 0.5129, + "step": 364 + }, + { + "epoch": 0.4737183646982479, + "grad_norm": 0.31512841418214993, + "learning_rate": 4.68013468013468e-05, + "loss": 0.5188, + "step": 365 + }, + { + "epoch": 0.47501622323166776, + "grad_norm": 0.32669598998030175, + "learning_rate": 4.677729677729678e-05, + "loss": 0.5384, + "step": 366 + }, + { + "epoch": 0.4763140817650876, + "grad_norm": 0.3235775681673074, + "learning_rate": 4.675324675324675e-05, + "loss": 0.5194, + "step": 367 + }, + { + "epoch": 0.47761194029850745, + "grad_norm": 0.3095310587265592, + "learning_rate": 4.6729196729196734e-05, + "loss": 0.5323, + "step": 368 + }, + { + "epoch": 0.4789097988319273, + "grad_norm": 0.32341229991033627, + "learning_rate": 4.670514670514671e-05, + "loss": 0.5192, + "step": 369 + }, + { + "epoch": 0.48020765736534715, + "grad_norm": 0.3175500385279334, + "learning_rate": 4.6681096681096684e-05, + "loss": 0.53, + "step": 370 + }, + { + "epoch": 0.48150551589876706, + "grad_norm": 0.34206247662543693, + "learning_rate": 4.665704665704666e-05, + "loss": 0.5314, + "step": 371 + }, + { + "epoch": 0.4828033744321869, + "grad_norm": 0.3185235414836324, + "learning_rate": 4.6632996632996634e-05, + "loss": 0.5367, + "step": 372 + }, + { + "epoch": 0.48410123296560675, + "grad_norm": 0.28453878466972515, + "learning_rate": 4.6608946608946615e-05, + "loss": 0.5223, + "step": 373 + }, + { + "epoch": 0.4853990914990266, + "grad_norm": 0.2957039810409513, + "learning_rate": 4.658489658489659e-05, + "loss": 0.5005, + "step": 374 + }, + { + "epoch": 0.48669695003244645, + "grad_norm": 0.2940626178906619, + "learning_rate": 4.656084656084656e-05, + "loss": 0.5098, + "step": 375 + }, + { + "epoch": 0.4879948085658663, + "grad_norm": 0.33976965417394467, + "learning_rate": 4.653679653679654e-05, + "loss": 0.5155, + "step": 376 + }, + { + "epoch": 0.4892926670992862, + "grad_norm": 0.2993214412594064, + "learning_rate": 4.6512746512746514e-05, + "loss": 0.5183, + "step": 377 + }, + { + "epoch": 0.49059052563270605, + "grad_norm": 0.3550998192684892, + "learning_rate": 4.6488696488696496e-05, + "loss": 0.5404, + "step": 378 + }, + { + "epoch": 0.4918883841661259, + "grad_norm": 0.3961098073492471, + "learning_rate": 4.6464646464646464e-05, + "loss": 0.5411, + "step": 379 + }, + { + "epoch": 0.49318624269954575, + "grad_norm": 0.34269318810223304, + "learning_rate": 4.6440596440596445e-05, + "loss": 0.5206, + "step": 380 + }, + { + "epoch": 0.4944841012329656, + "grad_norm": 0.29509416892424534, + "learning_rate": 4.641654641654642e-05, + "loss": 0.5189, + "step": 381 + }, + { + "epoch": 0.49578195976638545, + "grad_norm": 0.32772442148274133, + "learning_rate": 4.6392496392496395e-05, + "loss": 0.5368, + "step": 382 + }, + { + "epoch": 0.4970798182998053, + "grad_norm": 0.2719733229054414, + "learning_rate": 4.636844636844637e-05, + "loss": 0.5027, + "step": 383 + }, + { + "epoch": 0.4983776768332252, + "grad_norm": 0.32827976870034653, + "learning_rate": 4.6344396344396344e-05, + "loss": 0.5225, + "step": 384 + }, + { + "epoch": 0.49967553536664505, + "grad_norm": 0.36529779846696075, + "learning_rate": 4.6320346320346326e-05, + "loss": 0.4961, + "step": 385 + }, + { + "epoch": 0.5009733939000649, + "grad_norm": 0.34737533192311987, + "learning_rate": 4.62962962962963e-05, + "loss": 0.5056, + "step": 386 + }, + { + "epoch": 0.5022712524334848, + "grad_norm": 0.32570267249669654, + "learning_rate": 4.6272246272246276e-05, + "loss": 0.5114, + "step": 387 + }, + { + "epoch": 0.5035691109669046, + "grad_norm": 0.3419484703073112, + "learning_rate": 4.624819624819625e-05, + "loss": 0.5141, + "step": 388 + }, + { + "epoch": 0.5048669695003245, + "grad_norm": 0.34141193067026915, + "learning_rate": 4.6224146224146225e-05, + "loss": 0.5296, + "step": 389 + }, + { + "epoch": 0.5061648280337443, + "grad_norm": 0.314416021500765, + "learning_rate": 4.620009620009621e-05, + "loss": 0.5202, + "step": 390 + }, + { + "epoch": 0.5074626865671642, + "grad_norm": 0.2608120507481533, + "learning_rate": 4.617604617604618e-05, + "loss": 0.507, + "step": 391 + }, + { + "epoch": 0.508760545100584, + "grad_norm": 0.3415961124944403, + "learning_rate": 4.615199615199615e-05, + "loss": 0.5513, + "step": 392 + }, + { + "epoch": 0.5100584036340039, + "grad_norm": 0.3545772824977214, + "learning_rate": 4.612794612794613e-05, + "loss": 0.5195, + "step": 393 + }, + { + "epoch": 0.5113562621674238, + "grad_norm": 0.2759048918555112, + "learning_rate": 4.6103896103896106e-05, + "loss": 0.5145, + "step": 394 + }, + { + "epoch": 0.5126541207008436, + "grad_norm": 0.29424902451922874, + "learning_rate": 4.607984607984609e-05, + "loss": 0.4965, + "step": 395 + }, + { + "epoch": 0.5139519792342635, + "grad_norm": 0.2865983981532377, + "learning_rate": 4.6055796055796055e-05, + "loss": 0.51, + "step": 396 + }, + { + "epoch": 0.5152498377676833, + "grad_norm": 0.2826395400080094, + "learning_rate": 4.603174603174603e-05, + "loss": 0.5177, + "step": 397 + }, + { + "epoch": 0.5165476963011032, + "grad_norm": 0.3154832346968727, + "learning_rate": 4.600769600769601e-05, + "loss": 0.5167, + "step": 398 + }, + { + "epoch": 0.517845554834523, + "grad_norm": 0.28657500627349963, + "learning_rate": 4.5983645983645986e-05, + "loss": 0.4915, + "step": 399 + }, + { + "epoch": 0.5191434133679429, + "grad_norm": 0.3108878669769025, + "learning_rate": 4.595959595959596e-05, + "loss": 0.5094, + "step": 400 + }, + { + "epoch": 0.5204412719013628, + "grad_norm": 0.3105653890499028, + "learning_rate": 4.5935545935545936e-05, + "loss": 0.5395, + "step": 401 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.3041614944176647, + "learning_rate": 4.591149591149592e-05, + "loss": 0.5015, + "step": 402 + }, + { + "epoch": 0.5230369889682025, + "grad_norm": 0.2774234169618742, + "learning_rate": 4.588744588744589e-05, + "loss": 0.4953, + "step": 403 + }, + { + "epoch": 0.5243348475016223, + "grad_norm": 0.3292325244305617, + "learning_rate": 4.586339586339587e-05, + "loss": 0.5326, + "step": 404 + }, + { + "epoch": 0.5256327060350422, + "grad_norm": 0.280230976708198, + "learning_rate": 4.583934583934584e-05, + "loss": 0.5186, + "step": 405 + }, + { + "epoch": 0.526930564568462, + "grad_norm": 0.29220060360384115, + "learning_rate": 4.5815295815295817e-05, + "loss": 0.5224, + "step": 406 + }, + { + "epoch": 0.5282284231018819, + "grad_norm": 0.2973987484773138, + "learning_rate": 4.57912457912458e-05, + "loss": 0.5198, + "step": 407 + }, + { + "epoch": 0.5295262816353018, + "grad_norm": 0.31013343434720114, + "learning_rate": 4.576719576719577e-05, + "loss": 0.5303, + "step": 408 + }, + { + "epoch": 0.5308241401687216, + "grad_norm": 0.3091402470001352, + "learning_rate": 4.574314574314574e-05, + "loss": 0.5176, + "step": 409 + }, + { + "epoch": 0.5321219987021415, + "grad_norm": 0.2974831281530903, + "learning_rate": 4.571909571909572e-05, + "loss": 0.5153, + "step": 410 + }, + { + "epoch": 0.5334198572355613, + "grad_norm": 0.30558238497100093, + "learning_rate": 4.56950456950457e-05, + "loss": 0.5171, + "step": 411 + }, + { + "epoch": 0.5347177157689812, + "grad_norm": 0.3050417584271822, + "learning_rate": 4.567099567099568e-05, + "loss": 0.51, + "step": 412 + }, + { + "epoch": 0.536015574302401, + "grad_norm": 0.3062163771011129, + "learning_rate": 4.564694564694565e-05, + "loss": 0.5182, + "step": 413 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.2918708356467548, + "learning_rate": 4.562289562289562e-05, + "loss": 0.5372, + "step": 414 + }, + { + "epoch": 0.5386112913692408, + "grad_norm": 0.3100802477130393, + "learning_rate": 4.55988455988456e-05, + "loss": 0.504, + "step": 415 + }, + { + "epoch": 0.5399091499026606, + "grad_norm": 0.28080959444703624, + "learning_rate": 4.557479557479558e-05, + "loss": 0.5133, + "step": 416 + }, + { + "epoch": 0.5412070084360805, + "grad_norm": 0.31510663571529784, + "learning_rate": 4.555074555074555e-05, + "loss": 0.5083, + "step": 417 + }, + { + "epoch": 0.5425048669695003, + "grad_norm": 0.2987074534694197, + "learning_rate": 4.552669552669553e-05, + "loss": 0.5233, + "step": 418 + }, + { + "epoch": 0.5438027255029202, + "grad_norm": 0.3018100646658762, + "learning_rate": 4.55026455026455e-05, + "loss": 0.5125, + "step": 419 + }, + { + "epoch": 0.54510058403634, + "grad_norm": 0.3068942632441547, + "learning_rate": 4.5478595478595484e-05, + "loss": 0.5141, + "step": 420 + }, + { + "epoch": 0.5463984425697599, + "grad_norm": 0.30401355001057445, + "learning_rate": 4.545454545454546e-05, + "loss": 0.5275, + "step": 421 + }, + { + "epoch": 0.5476963011031798, + "grad_norm": 0.3203825407640607, + "learning_rate": 4.543049543049543e-05, + "loss": 0.5374, + "step": 422 + }, + { + "epoch": 0.5489941596365996, + "grad_norm": 0.2890332355349151, + "learning_rate": 4.540644540644541e-05, + "loss": 0.4928, + "step": 423 + }, + { + "epoch": 0.5502920181700195, + "grad_norm": 0.3167375640999411, + "learning_rate": 4.538239538239538e-05, + "loss": 0.5238, + "step": 424 + }, + { + "epoch": 0.5515898767034393, + "grad_norm": 0.2838739525584143, + "learning_rate": 4.535834535834536e-05, + "loss": 0.5193, + "step": 425 + }, + { + "epoch": 0.5528877352368592, + "grad_norm": 0.31600565211015347, + "learning_rate": 4.533429533429533e-05, + "loss": 0.5408, + "step": 426 + }, + { + "epoch": 0.5541855937702791, + "grad_norm": 0.2936881048070723, + "learning_rate": 4.5310245310245314e-05, + "loss": 0.517, + "step": 427 + }, + { + "epoch": 0.5554834523036989, + "grad_norm": 0.2869325695453175, + "learning_rate": 4.528619528619529e-05, + "loss": 0.5272, + "step": 428 + }, + { + "epoch": 0.5567813108371188, + "grad_norm": 0.26396682696755225, + "learning_rate": 4.5262145262145264e-05, + "loss": 0.5092, + "step": 429 + }, + { + "epoch": 0.5580791693705386, + "grad_norm": 0.2778797627866081, + "learning_rate": 4.523809523809524e-05, + "loss": 0.5107, + "step": 430 + }, + { + "epoch": 0.5593770279039585, + "grad_norm": 0.29638166733777366, + "learning_rate": 4.521404521404521e-05, + "loss": 0.5207, + "step": 431 + }, + { + "epoch": 0.5606748864373783, + "grad_norm": 0.31086056140406293, + "learning_rate": 4.5189995189995195e-05, + "loss": 0.5213, + "step": 432 + }, + { + "epoch": 0.5619727449707982, + "grad_norm": 0.2777342628277087, + "learning_rate": 4.516594516594517e-05, + "loss": 0.5225, + "step": 433 + }, + { + "epoch": 0.5632706035042181, + "grad_norm": 0.31153041204212967, + "learning_rate": 4.5141895141895144e-05, + "loss": 0.5029, + "step": 434 + }, + { + "epoch": 0.5645684620376379, + "grad_norm": 0.2833387330391144, + "learning_rate": 4.511784511784512e-05, + "loss": 0.5269, + "step": 435 + }, + { + "epoch": 0.5658663205710578, + "grad_norm": 0.3208653719812624, + "learning_rate": 4.5093795093795094e-05, + "loss": 0.5027, + "step": 436 + }, + { + "epoch": 0.5671641791044776, + "grad_norm": 0.3024059727560176, + "learning_rate": 4.5069745069745075e-05, + "loss": 0.5219, + "step": 437 + }, + { + "epoch": 0.5684620376378975, + "grad_norm": 0.2822549725146957, + "learning_rate": 4.504569504569504e-05, + "loss": 0.5095, + "step": 438 + }, + { + "epoch": 0.5697598961713173, + "grad_norm": 0.3016137705597104, + "learning_rate": 4.5021645021645025e-05, + "loss": 0.5104, + "step": 439 + }, + { + "epoch": 0.5710577547047372, + "grad_norm": 0.3627945641240943, + "learning_rate": 4.4997594997595e-05, + "loss": 0.5264, + "step": 440 + }, + { + "epoch": 0.5723556132381571, + "grad_norm": 0.27507558856590775, + "learning_rate": 4.4973544973544974e-05, + "loss": 0.5139, + "step": 441 + }, + { + "epoch": 0.5736534717715769, + "grad_norm": 0.3593784232319199, + "learning_rate": 4.494949494949495e-05, + "loss": 0.5212, + "step": 442 + }, + { + "epoch": 0.5749513303049968, + "grad_norm": 0.29557452932516204, + "learning_rate": 4.4925444925444924e-05, + "loss": 0.502, + "step": 443 + }, + { + "epoch": 0.5762491888384166, + "grad_norm": 0.3005471775259294, + "learning_rate": 4.4901394901394906e-05, + "loss": 0.4912, + "step": 444 + }, + { + "epoch": 0.5775470473718365, + "grad_norm": 0.26919062978615377, + "learning_rate": 4.487734487734488e-05, + "loss": 0.5313, + "step": 445 + }, + { + "epoch": 0.5788449059052563, + "grad_norm": 0.2556241030912058, + "learning_rate": 4.4853294853294855e-05, + "loss": 0.5085, + "step": 446 + }, + { + "epoch": 0.5801427644386762, + "grad_norm": 0.2733433348389189, + "learning_rate": 4.482924482924483e-05, + "loss": 0.5232, + "step": 447 + }, + { + "epoch": 0.5814406229720961, + "grad_norm": 0.2699232360629045, + "learning_rate": 4.4805194805194805e-05, + "loss": 0.5049, + "step": 448 + }, + { + "epoch": 0.5827384815055159, + "grad_norm": 0.28747431650418886, + "learning_rate": 4.4781144781144786e-05, + "loss": 0.5112, + "step": 449 + }, + { + "epoch": 0.5840363400389358, + "grad_norm": 0.28804716410878617, + "learning_rate": 4.475709475709476e-05, + "loss": 0.4905, + "step": 450 + }, + { + "epoch": 0.5853341985723556, + "grad_norm": 0.2919584848714507, + "learning_rate": 4.4733044733044736e-05, + "loss": 0.5272, + "step": 451 + }, + { + "epoch": 0.5866320571057755, + "grad_norm": 0.27840735576549713, + "learning_rate": 4.470899470899471e-05, + "loss": 0.4885, + "step": 452 + }, + { + "epoch": 0.5879299156391953, + "grad_norm": 0.29490953800516345, + "learning_rate": 4.4684944684944685e-05, + "loss": 0.5164, + "step": 453 + }, + { + "epoch": 0.5892277741726152, + "grad_norm": 0.3240170627979527, + "learning_rate": 4.466089466089467e-05, + "loss": 0.5173, + "step": 454 + }, + { + "epoch": 0.5905256327060351, + "grad_norm": 0.2665880580848304, + "learning_rate": 4.4636844636844635e-05, + "loss": 0.5045, + "step": 455 + }, + { + "epoch": 0.5918234912394549, + "grad_norm": 0.36453608305554464, + "learning_rate": 4.4612794612794616e-05, + "loss": 0.5176, + "step": 456 + }, + { + "epoch": 0.5931213497728748, + "grad_norm": 0.2971475504780928, + "learning_rate": 4.458874458874459e-05, + "loss": 0.5047, + "step": 457 + }, + { + "epoch": 0.5944192083062946, + "grad_norm": 0.34100322893736984, + "learning_rate": 4.4564694564694566e-05, + "loss": 0.5111, + "step": 458 + }, + { + "epoch": 0.5957170668397145, + "grad_norm": 0.27240121549823265, + "learning_rate": 4.454064454064454e-05, + "loss": 0.4885, + "step": 459 + }, + { + "epoch": 0.5970149253731343, + "grad_norm": 0.31589767714076145, + "learning_rate": 4.4516594516594515e-05, + "loss": 0.4994, + "step": 460 + }, + { + "epoch": 0.5983127839065542, + "grad_norm": 0.2801464661937106, + "learning_rate": 4.44925444925445e-05, + "loss": 0.497, + "step": 461 + }, + { + "epoch": 0.5996106424399741, + "grad_norm": 0.33064691940201346, + "learning_rate": 4.446849446849447e-05, + "loss": 0.5336, + "step": 462 + }, + { + "epoch": 0.6009085009733939, + "grad_norm": 0.2866479133025442, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.5346, + "step": 463 + }, + { + "epoch": 0.6022063595068138, + "grad_norm": 0.33165659226246563, + "learning_rate": 4.442039442039442e-05, + "loss": 0.5145, + "step": 464 + }, + { + "epoch": 0.6035042180402336, + "grad_norm": 0.4080281603274731, + "learning_rate": 4.4396344396344396e-05, + "loss": 0.5083, + "step": 465 + }, + { + "epoch": 0.6048020765736535, + "grad_norm": 0.32997017809734314, + "learning_rate": 4.437229437229438e-05, + "loss": 0.5108, + "step": 466 + }, + { + "epoch": 0.6060999351070734, + "grad_norm": 0.4193305220535223, + "learning_rate": 4.434824434824435e-05, + "loss": 0.5333, + "step": 467 + }, + { + "epoch": 0.6073977936404932, + "grad_norm": 0.33536790460482346, + "learning_rate": 4.432419432419432e-05, + "loss": 0.5315, + "step": 468 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.30331774787903265, + "learning_rate": 4.43001443001443e-05, + "loss": 0.5044, + "step": 469 + }, + { + "epoch": 0.6099935107073329, + "grad_norm": 0.3332494229182582, + "learning_rate": 4.427609427609428e-05, + "loss": 0.5271, + "step": 470 + }, + { + "epoch": 0.6112913692407528, + "grad_norm": 0.27631817841293893, + "learning_rate": 4.425204425204426e-05, + "loss": 0.4869, + "step": 471 + }, + { + "epoch": 0.6125892277741726, + "grad_norm": 0.30864904652906816, + "learning_rate": 4.4227994227994226e-05, + "loss": 0.5189, + "step": 472 + }, + { + "epoch": 0.6138870863075925, + "grad_norm": 0.25286124571299967, + "learning_rate": 4.420394420394421e-05, + "loss": 0.5243, + "step": 473 + }, + { + "epoch": 0.6151849448410124, + "grad_norm": 0.32194768257527906, + "learning_rate": 4.417989417989418e-05, + "loss": 0.517, + "step": 474 + }, + { + "epoch": 0.6164828033744322, + "grad_norm": 0.25295912491765415, + "learning_rate": 4.415584415584416e-05, + "loss": 0.5119, + "step": 475 + }, + { + "epoch": 0.6177806619078521, + "grad_norm": 0.30031239529360704, + "learning_rate": 4.413179413179413e-05, + "loss": 0.5167, + "step": 476 + }, + { + "epoch": 0.6190785204412719, + "grad_norm": 0.3457458465491688, + "learning_rate": 4.410774410774411e-05, + "loss": 0.5122, + "step": 477 + }, + { + "epoch": 0.6203763789746918, + "grad_norm": 0.3091494265523315, + "learning_rate": 4.408369408369409e-05, + "loss": 0.4996, + "step": 478 + }, + { + "epoch": 0.6216742375081116, + "grad_norm": 0.34841455852307157, + "learning_rate": 4.405964405964406e-05, + "loss": 0.5187, + "step": 479 + }, + { + "epoch": 0.6229720960415315, + "grad_norm": 0.28466491288804874, + "learning_rate": 4.403559403559404e-05, + "loss": 0.5081, + "step": 480 + }, + { + "epoch": 0.6242699545749514, + "grad_norm": 0.31239695738713696, + "learning_rate": 4.401154401154401e-05, + "loss": 0.5188, + "step": 481 + }, + { + "epoch": 0.6255678131083712, + "grad_norm": 0.2906342979686575, + "learning_rate": 4.398749398749399e-05, + "loss": 0.4979, + "step": 482 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 0.32742410431546853, + "learning_rate": 4.396344396344397e-05, + "loss": 0.5096, + "step": 483 + }, + { + "epoch": 0.6281635301752109, + "grad_norm": 0.28587236180759673, + "learning_rate": 4.3939393939393944e-05, + "loss": 0.5282, + "step": 484 + }, + { + "epoch": 0.6294613887086308, + "grad_norm": 0.2939115603096443, + "learning_rate": 4.391534391534391e-05, + "loss": 0.5172, + "step": 485 + }, + { + "epoch": 0.6307592472420506, + "grad_norm": 0.2873210431560928, + "learning_rate": 4.3891293891293894e-05, + "loss": 0.5229, + "step": 486 + }, + { + "epoch": 0.6320571057754705, + "grad_norm": 0.342818284796367, + "learning_rate": 4.386724386724387e-05, + "loss": 0.5272, + "step": 487 + }, + { + "epoch": 0.6333549643088904, + "grad_norm": 0.2663532951872564, + "learning_rate": 4.384319384319385e-05, + "loss": 0.5087, + "step": 488 + }, + { + "epoch": 0.6346528228423102, + "grad_norm": 0.25849575306366, + "learning_rate": 4.381914381914382e-05, + "loss": 0.503, + "step": 489 + }, + { + "epoch": 0.6359506813757301, + "grad_norm": 0.2682428237326465, + "learning_rate": 4.379509379509379e-05, + "loss": 0.5178, + "step": 490 + }, + { + "epoch": 0.6372485399091499, + "grad_norm": 0.2899634415594277, + "learning_rate": 4.3771043771043774e-05, + "loss": 0.4964, + "step": 491 + }, + { + "epoch": 0.6385463984425698, + "grad_norm": 0.3453086828842896, + "learning_rate": 4.374699374699375e-05, + "loss": 0.53, + "step": 492 + }, + { + "epoch": 0.6398442569759896, + "grad_norm": 0.34399408107909996, + "learning_rate": 4.3722943722943724e-05, + "loss": 0.5193, + "step": 493 + }, + { + "epoch": 0.6411421155094095, + "grad_norm": 0.3610030879163129, + "learning_rate": 4.36988936988937e-05, + "loss": 0.5144, + "step": 494 + }, + { + "epoch": 0.6424399740428294, + "grad_norm": 0.23843044383570788, + "learning_rate": 4.367484367484368e-05, + "loss": 0.4999, + "step": 495 + }, + { + "epoch": 0.6437378325762492, + "grad_norm": 0.3654439591676623, + "learning_rate": 4.3650793650793655e-05, + "loss": 0.4888, + "step": 496 + }, + { + "epoch": 0.6450356911096691, + "grad_norm": 0.28776010247656836, + "learning_rate": 4.362674362674363e-05, + "loss": 0.5022, + "step": 497 + }, + { + "epoch": 0.6463335496430889, + "grad_norm": 0.3647131705869751, + "learning_rate": 4.3602693602693604e-05, + "loss": 0.5092, + "step": 498 + }, + { + "epoch": 0.6476314081765088, + "grad_norm": 0.30736812776643446, + "learning_rate": 4.357864357864358e-05, + "loss": 0.5171, + "step": 499 + }, + { + "epoch": 0.6489292667099286, + "grad_norm": 0.36290147629484104, + "learning_rate": 4.355459355459356e-05, + "loss": 0.5223, + "step": 500 + }, + { + "epoch": 0.6502271252433485, + "grad_norm": 0.32228223382695725, + "learning_rate": 4.3530543530543535e-05, + "loss": 0.5099, + "step": 501 + }, + { + "epoch": 0.6515249837767684, + "grad_norm": 0.31393689007483594, + "learning_rate": 4.3506493506493503e-05, + "loss": 0.496, + "step": 502 + }, + { + "epoch": 0.6528228423101882, + "grad_norm": 0.2966759326603879, + "learning_rate": 4.3482443482443485e-05, + "loss": 0.5173, + "step": 503 + }, + { + "epoch": 0.6541207008436081, + "grad_norm": 0.2864744517114858, + "learning_rate": 4.345839345839346e-05, + "loss": 0.503, + "step": 504 + }, + { + "epoch": 0.6554185593770279, + "grad_norm": 0.28016826596559247, + "learning_rate": 4.343434343434344e-05, + "loss": 0.5095, + "step": 505 + }, + { + "epoch": 0.6567164179104478, + "grad_norm": 0.3045274480234983, + "learning_rate": 4.341029341029341e-05, + "loss": 0.5234, + "step": 506 + }, + { + "epoch": 0.6580142764438677, + "grad_norm": 0.2865539821179636, + "learning_rate": 4.3386243386243384e-05, + "loss": 0.5057, + "step": 507 + }, + { + "epoch": 0.6593121349772875, + "grad_norm": 0.28016725527352626, + "learning_rate": 4.3362193362193366e-05, + "loss": 0.5054, + "step": 508 + }, + { + "epoch": 0.6606099935107074, + "grad_norm": 0.2779087438851858, + "learning_rate": 4.333814333814334e-05, + "loss": 0.4844, + "step": 509 + }, + { + "epoch": 0.6619078520441272, + "grad_norm": 0.29308593442315034, + "learning_rate": 4.3314093314093315e-05, + "loss": 0.5026, + "step": 510 + }, + { + "epoch": 0.6632057105775471, + "grad_norm": 0.24617150101353785, + "learning_rate": 4.329004329004329e-05, + "loss": 0.505, + "step": 511 + }, + { + "epoch": 0.6645035691109669, + "grad_norm": 0.2801536462432465, + "learning_rate": 4.3265993265993265e-05, + "loss": 0.4957, + "step": 512 + }, + { + "epoch": 0.6658014276443868, + "grad_norm": 0.2590262249669081, + "learning_rate": 4.3241943241943246e-05, + "loss": 0.4976, + "step": 513 + }, + { + "epoch": 0.6670992861778067, + "grad_norm": 0.27675213215164485, + "learning_rate": 4.321789321789322e-05, + "loss": 0.5016, + "step": 514 + }, + { + "epoch": 0.6683971447112265, + "grad_norm": 0.3211262394859621, + "learning_rate": 4.3193843193843196e-05, + "loss": 0.5285, + "step": 515 + }, + { + "epoch": 0.6696950032446464, + "grad_norm": 0.2847594895492132, + "learning_rate": 4.316979316979317e-05, + "loss": 0.5174, + "step": 516 + }, + { + "epoch": 0.6709928617780662, + "grad_norm": 0.31731208678548406, + "learning_rate": 4.314574314574315e-05, + "loss": 0.5287, + "step": 517 + }, + { + "epoch": 0.6722907203114861, + "grad_norm": 0.26600293134050695, + "learning_rate": 4.312169312169313e-05, + "loss": 0.5105, + "step": 518 + }, + { + "epoch": 0.6735885788449059, + "grad_norm": 0.29880462234281113, + "learning_rate": 4.3097643097643095e-05, + "loss": 0.5375, + "step": 519 + }, + { + "epoch": 0.6748864373783258, + "grad_norm": 0.2652094878668775, + "learning_rate": 4.3073593073593077e-05, + "loss": 0.5033, + "step": 520 + }, + { + "epoch": 0.6761842959117457, + "grad_norm": 0.315140738606816, + "learning_rate": 4.304954304954305e-05, + "loss": 0.5238, + "step": 521 + }, + { + "epoch": 0.6774821544451655, + "grad_norm": 0.2852888179467452, + "learning_rate": 4.302549302549303e-05, + "loss": 0.5125, + "step": 522 + }, + { + "epoch": 0.6787800129785854, + "grad_norm": 0.3217782609108167, + "learning_rate": 4.3001443001443e-05, + "loss": 0.5084, + "step": 523 + }, + { + "epoch": 0.6800778715120052, + "grad_norm": 0.3067930968649758, + "learning_rate": 4.2977392977392976e-05, + "loss": 0.4999, + "step": 524 + }, + { + "epoch": 0.6813757300454251, + "grad_norm": 0.2937819263154037, + "learning_rate": 4.295334295334296e-05, + "loss": 0.5256, + "step": 525 + }, + { + "epoch": 0.6826735885788449, + "grad_norm": 0.32438054578281567, + "learning_rate": 4.292929292929293e-05, + "loss": 0.4907, + "step": 526 + }, + { + "epoch": 0.6839714471122648, + "grad_norm": 0.2742147889295781, + "learning_rate": 4.290524290524291e-05, + "loss": 0.5068, + "step": 527 + }, + { + "epoch": 0.6852693056456847, + "grad_norm": 0.35488588986537717, + "learning_rate": 4.288119288119288e-05, + "loss": 0.5248, + "step": 528 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 0.26229530604678386, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.5037, + "step": 529 + }, + { + "epoch": 0.6878650227125244, + "grad_norm": 0.3461696681941986, + "learning_rate": 4.283309283309284e-05, + "loss": 0.5025, + "step": 530 + }, + { + "epoch": 0.6891628812459442, + "grad_norm": 0.266178675206237, + "learning_rate": 4.280904280904281e-05, + "loss": 0.4896, + "step": 531 + }, + { + "epoch": 0.6904607397793641, + "grad_norm": 0.34686998824653287, + "learning_rate": 4.278499278499279e-05, + "loss": 0.5034, + "step": 532 + }, + { + "epoch": 0.6917585983127839, + "grad_norm": 0.3320503579783302, + "learning_rate": 4.276094276094276e-05, + "loss": 0.5178, + "step": 533 + }, + { + "epoch": 0.6930564568462038, + "grad_norm": 0.3083799644603529, + "learning_rate": 4.273689273689274e-05, + "loss": 0.526, + "step": 534 + }, + { + "epoch": 0.6943543153796237, + "grad_norm": 0.36890093348582187, + "learning_rate": 4.271284271284272e-05, + "loss": 0.5311, + "step": 535 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.30993196929952377, + "learning_rate": 4.2688792688792686e-05, + "loss": 0.5246, + "step": 536 + }, + { + "epoch": 0.6969500324464634, + "grad_norm": 0.3135809737260554, + "learning_rate": 4.266474266474267e-05, + "loss": 0.5042, + "step": 537 + }, + { + "epoch": 0.6982478909798832, + "grad_norm": 0.32235641085299527, + "learning_rate": 4.264069264069264e-05, + "loss": 0.5226, + "step": 538 + }, + { + "epoch": 0.6995457495133031, + "grad_norm": 0.29955767973991637, + "learning_rate": 4.2616642616642624e-05, + "loss": 0.5003, + "step": 539 + }, + { + "epoch": 0.7008436080467229, + "grad_norm": 0.2728001277044658, + "learning_rate": 4.259259259259259e-05, + "loss": 0.5035, + "step": 540 + }, + { + "epoch": 0.7021414665801428, + "grad_norm": 0.3125500639615984, + "learning_rate": 4.256854256854257e-05, + "loss": 0.4982, + "step": 541 + }, + { + "epoch": 0.7034393251135627, + "grad_norm": 0.3214238247187388, + "learning_rate": 4.254449254449255e-05, + "loss": 0.5015, + "step": 542 + }, + { + "epoch": 0.7047371836469825, + "grad_norm": 0.34677033141949526, + "learning_rate": 4.2520442520442523e-05, + "loss": 0.5071, + "step": 543 + }, + { + "epoch": 0.7060350421804024, + "grad_norm": 0.320737794563556, + "learning_rate": 4.24963924963925e-05, + "loss": 0.5036, + "step": 544 + }, + { + "epoch": 0.7073329007138222, + "grad_norm": 0.28140631509820974, + "learning_rate": 4.247234247234247e-05, + "loss": 0.472, + "step": 545 + }, + { + "epoch": 0.7086307592472421, + "grad_norm": 0.2876262450309547, + "learning_rate": 4.244829244829245e-05, + "loss": 0.5033, + "step": 546 + }, + { + "epoch": 0.7099286177806619, + "grad_norm": 0.28203604302358143, + "learning_rate": 4.242424242424243e-05, + "loss": 0.4933, + "step": 547 + }, + { + "epoch": 0.7112264763140818, + "grad_norm": 0.3106404772330975, + "learning_rate": 4.2400192400192404e-05, + "loss": 0.4972, + "step": 548 + }, + { + "epoch": 0.7125243348475017, + "grad_norm": 0.25531650904916336, + "learning_rate": 4.237614237614238e-05, + "loss": 0.524, + "step": 549 + }, + { + "epoch": 0.7138221933809215, + "grad_norm": 0.3694832680055122, + "learning_rate": 4.2352092352092354e-05, + "loss": 0.5215, + "step": 550 + }, + { + "epoch": 0.7151200519143414, + "grad_norm": 0.29317455967258776, + "learning_rate": 4.232804232804233e-05, + "loss": 0.4978, + "step": 551 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.36952833950680053, + "learning_rate": 4.230399230399231e-05, + "loss": 0.5091, + "step": 552 + }, + { + "epoch": 0.7177157689811811, + "grad_norm": 0.3458300719165068, + "learning_rate": 4.227994227994228e-05, + "loss": 0.5179, + "step": 553 + }, + { + "epoch": 0.719013627514601, + "grad_norm": 0.35910338257547214, + "learning_rate": 4.225589225589226e-05, + "loss": 0.4967, + "step": 554 + }, + { + "epoch": 0.7203114860480208, + "grad_norm": 0.3832017565467235, + "learning_rate": 4.2231842231842234e-05, + "loss": 0.506, + "step": 555 + }, + { + "epoch": 0.7216093445814407, + "grad_norm": 0.3270524496685099, + "learning_rate": 4.220779220779221e-05, + "loss": 0.4956, + "step": 556 + }, + { + "epoch": 0.7229072031148605, + "grad_norm": 0.31306380662178745, + "learning_rate": 4.2183742183742184e-05, + "loss": 0.516, + "step": 557 + }, + { + "epoch": 0.7242050616482804, + "grad_norm": 0.294604631221543, + "learning_rate": 4.215969215969216e-05, + "loss": 0.5055, + "step": 558 + }, + { + "epoch": 0.7255029201817002, + "grad_norm": 0.3534780044338388, + "learning_rate": 4.213564213564214e-05, + "loss": 0.4975, + "step": 559 + }, + { + "epoch": 0.72680077871512, + "grad_norm": 0.33032987931239965, + "learning_rate": 4.2111592111592115e-05, + "loss": 0.5118, + "step": 560 + }, + { + "epoch": 0.72809863724854, + "grad_norm": 0.3196832192635056, + "learning_rate": 4.208754208754209e-05, + "loss": 0.5093, + "step": 561 + }, + { + "epoch": 0.7293964957819598, + "grad_norm": 0.36785704594666774, + "learning_rate": 4.2063492063492065e-05, + "loss": 0.5069, + "step": 562 + }, + { + "epoch": 0.7306943543153797, + "grad_norm": 0.3702503469527744, + "learning_rate": 4.203944203944204e-05, + "loss": 0.513, + "step": 563 + }, + { + "epoch": 0.7319922128487995, + "grad_norm": 0.3070674221331164, + "learning_rate": 4.201539201539202e-05, + "loss": 0.497, + "step": 564 + }, + { + "epoch": 0.7332900713822194, + "grad_norm": 0.35750959007798994, + "learning_rate": 4.1991341991341996e-05, + "loss": 0.5085, + "step": 565 + }, + { + "epoch": 0.7345879299156391, + "grad_norm": 0.2835364219292076, + "learning_rate": 4.196729196729197e-05, + "loss": 0.5091, + "step": 566 + }, + { + "epoch": 0.735885788449059, + "grad_norm": 0.2884098082465498, + "learning_rate": 4.1943241943241945e-05, + "loss": 0.4884, + "step": 567 + }, + { + "epoch": 0.737183646982479, + "grad_norm": 0.3203510406175552, + "learning_rate": 4.191919191919192e-05, + "loss": 0.4971, + "step": 568 + }, + { + "epoch": 0.7384815055158988, + "grad_norm": 0.27371373687668255, + "learning_rate": 4.18951418951419e-05, + "loss": 0.5095, + "step": 569 + }, + { + "epoch": 0.7397793640493187, + "grad_norm": 0.34717402203397457, + "learning_rate": 4.187109187109187e-05, + "loss": 0.5014, + "step": 570 + }, + { + "epoch": 0.7410772225827384, + "grad_norm": 0.30582393639621713, + "learning_rate": 4.184704184704185e-05, + "loss": 0.5181, + "step": 571 + }, + { + "epoch": 0.7423750811161584, + "grad_norm": 0.32112393843480735, + "learning_rate": 4.1822991822991826e-05, + "loss": 0.5006, + "step": 572 + }, + { + "epoch": 0.7436729396495781, + "grad_norm": 0.33979137877685406, + "learning_rate": 4.17989417989418e-05, + "loss": 0.5248, + "step": 573 + }, + { + "epoch": 0.744970798182998, + "grad_norm": 0.3209001348833202, + "learning_rate": 4.1774891774891775e-05, + "loss": 0.5013, + "step": 574 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 0.3266878409508907, + "learning_rate": 4.175084175084175e-05, + "loss": 0.4915, + "step": 575 + }, + { + "epoch": 0.7475665152498377, + "grad_norm": 0.30503612561210064, + "learning_rate": 4.172679172679173e-05, + "loss": 0.5046, + "step": 576 + }, + { + "epoch": 0.7488643737832577, + "grad_norm": 0.28971405798539174, + "learning_rate": 4.1702741702741707e-05, + "loss": 0.494, + "step": 577 + }, + { + "epoch": 0.7501622323166774, + "grad_norm": 0.29780493895562776, + "learning_rate": 4.167869167869168e-05, + "loss": 0.492, + "step": 578 + }, + { + "epoch": 0.7514600908500974, + "grad_norm": 0.32685361493691256, + "learning_rate": 4.1654641654641656e-05, + "loss": 0.5101, + "step": 579 + }, + { + "epoch": 0.7527579493835171, + "grad_norm": 0.34614664346128227, + "learning_rate": 4.163059163059163e-05, + "loss": 0.4953, + "step": 580 + }, + { + "epoch": 0.754055807916937, + "grad_norm": 0.24964382812915295, + "learning_rate": 4.160654160654161e-05, + "loss": 0.4974, + "step": 581 + }, + { + "epoch": 0.755353666450357, + "grad_norm": 0.3266391465805975, + "learning_rate": 4.158249158249159e-05, + "loss": 0.5039, + "step": 582 + }, + { + "epoch": 0.7566515249837767, + "grad_norm": 0.34857873431761155, + "learning_rate": 4.155844155844156e-05, + "loss": 0.5056, + "step": 583 + }, + { + "epoch": 0.7579493835171967, + "grad_norm": 0.2921239047290261, + "learning_rate": 4.153439153439154e-05, + "loss": 0.4967, + "step": 584 + }, + { + "epoch": 0.7592472420506164, + "grad_norm": 0.33268372260105683, + "learning_rate": 4.151034151034151e-05, + "loss": 0.5066, + "step": 585 + }, + { + "epoch": 0.7605451005840363, + "grad_norm": 0.318064660501317, + "learning_rate": 4.148629148629149e-05, + "loss": 0.5014, + "step": 586 + }, + { + "epoch": 0.7618429591174561, + "grad_norm": 0.40789097570888044, + "learning_rate": 4.146224146224146e-05, + "loss": 0.5236, + "step": 587 + }, + { + "epoch": 0.763140817650876, + "grad_norm": 0.36460994866717067, + "learning_rate": 4.143819143819144e-05, + "loss": 0.5028, + "step": 588 + }, + { + "epoch": 0.764438676184296, + "grad_norm": 0.32621095461004207, + "learning_rate": 4.141414141414142e-05, + "loss": 0.5, + "step": 589 + }, + { + "epoch": 0.7657365347177157, + "grad_norm": 0.35949920016397585, + "learning_rate": 4.139009139009139e-05, + "loss": 0.5028, + "step": 590 + }, + { + "epoch": 0.7670343932511356, + "grad_norm": 0.26064162839844207, + "learning_rate": 4.136604136604137e-05, + "loss": 0.4926, + "step": 591 + }, + { + "epoch": 0.7683322517845554, + "grad_norm": 0.33955569046497985, + "learning_rate": 4.134199134199134e-05, + "loss": 0.491, + "step": 592 + }, + { + "epoch": 0.7696301103179753, + "grad_norm": 0.302141896432662, + "learning_rate": 4.131794131794132e-05, + "loss": 0.4884, + "step": 593 + }, + { + "epoch": 0.7709279688513953, + "grad_norm": 0.30640851435457384, + "learning_rate": 4.12938912938913e-05, + "loss": 0.4993, + "step": 594 + }, + { + "epoch": 0.772225827384815, + "grad_norm": 0.3040997259974209, + "learning_rate": 4.126984126984127e-05, + "loss": 0.5207, + "step": 595 + }, + { + "epoch": 0.773523685918235, + "grad_norm": 0.3146837697580934, + "learning_rate": 4.124579124579125e-05, + "loss": 0.5041, + "step": 596 + }, + { + "epoch": 0.7748215444516547, + "grad_norm": 0.2837662015282775, + "learning_rate": 4.122174122174122e-05, + "loss": 0.4958, + "step": 597 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 0.3005344336762094, + "learning_rate": 4.1197691197691204e-05, + "loss": 0.4995, + "step": 598 + }, + { + "epoch": 0.7774172615184944, + "grad_norm": 0.2593994091267606, + "learning_rate": 4.117364117364118e-05, + "loss": 0.4949, + "step": 599 + }, + { + "epoch": 0.7787151200519143, + "grad_norm": 0.31127336265884026, + "learning_rate": 4.114959114959115e-05, + "loss": 0.5026, + "step": 600 + }, + { + "epoch": 0.7800129785853342, + "grad_norm": 0.24332809263192706, + "learning_rate": 4.112554112554113e-05, + "loss": 0.506, + "step": 601 + }, + { + "epoch": 0.781310837118754, + "grad_norm": 0.268091615023721, + "learning_rate": 4.11014911014911e-05, + "loss": 0.5148, + "step": 602 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.27606994914293354, + "learning_rate": 4.1077441077441085e-05, + "loss": 0.5058, + "step": 603 + }, + { + "epoch": 0.7839065541855937, + "grad_norm": 0.25091695629276883, + "learning_rate": 4.105339105339105e-05, + "loss": 0.4939, + "step": 604 + }, + { + "epoch": 0.7852044127190136, + "grad_norm": 0.25969299604058604, + "learning_rate": 4.1029341029341034e-05, + "loss": 0.5045, + "step": 605 + }, + { + "epoch": 0.7865022712524334, + "grad_norm": 0.28645937532292653, + "learning_rate": 4.100529100529101e-05, + "loss": 0.51, + "step": 606 + }, + { + "epoch": 0.7878001297858533, + "grad_norm": 0.2867588479387004, + "learning_rate": 4.0981240981240984e-05, + "loss": 0.4849, + "step": 607 + }, + { + "epoch": 0.7890979883192732, + "grad_norm": 0.2646936431763157, + "learning_rate": 4.095719095719096e-05, + "loss": 0.5139, + "step": 608 + }, + { + "epoch": 0.790395846852693, + "grad_norm": 0.3083485975619241, + "learning_rate": 4.093314093314093e-05, + "loss": 0.52, + "step": 609 + }, + { + "epoch": 0.791693705386113, + "grad_norm": 0.29615253606758346, + "learning_rate": 4.0909090909090915e-05, + "loss": 0.491, + "step": 610 + }, + { + "epoch": 0.7929915639195327, + "grad_norm": 0.29803496441037525, + "learning_rate": 4.088504088504089e-05, + "loss": 0.5054, + "step": 611 + }, + { + "epoch": 0.7942894224529526, + "grad_norm": 0.2854912264040868, + "learning_rate": 4.0860990860990864e-05, + "loss": 0.4838, + "step": 612 + }, + { + "epoch": 0.7955872809863724, + "grad_norm": 0.2861142997625756, + "learning_rate": 4.083694083694084e-05, + "loss": 0.4777, + "step": 613 + }, + { + "epoch": 0.7968851395197923, + "grad_norm": 0.28878974958497106, + "learning_rate": 4.0812890812890814e-05, + "loss": 0.5043, + "step": 614 + }, + { + "epoch": 0.7981829980532122, + "grad_norm": 0.28908126130671624, + "learning_rate": 4.0788840788840795e-05, + "loss": 0.5077, + "step": 615 + }, + { + "epoch": 0.799480856586632, + "grad_norm": 0.31059225731020423, + "learning_rate": 4.0764790764790763e-05, + "loss": 0.4846, + "step": 616 + }, + { + "epoch": 0.8007787151200519, + "grad_norm": 0.2990367658353648, + "learning_rate": 4.074074074074074e-05, + "loss": 0.5075, + "step": 617 + }, + { + "epoch": 0.8020765736534717, + "grad_norm": 0.29529728033655306, + "learning_rate": 4.071669071669072e-05, + "loss": 0.4876, + "step": 618 + }, + { + "epoch": 0.8033744321868916, + "grad_norm": 0.299331177429087, + "learning_rate": 4.0692640692640695e-05, + "loss": 0.5032, + "step": 619 + }, + { + "epoch": 0.8046722907203114, + "grad_norm": 0.2842657684401892, + "learning_rate": 4.066859066859067e-05, + "loss": 0.4889, + "step": 620 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 0.2692733717815561, + "learning_rate": 4.0644540644540644e-05, + "loss": 0.5009, + "step": 621 + }, + { + "epoch": 0.8072680077871512, + "grad_norm": 0.3436883319030681, + "learning_rate": 4.062049062049062e-05, + "loss": 0.5185, + "step": 622 + }, + { + "epoch": 0.808565866320571, + "grad_norm": 0.2653175278056993, + "learning_rate": 4.05964405964406e-05, + "loss": 0.4739, + "step": 623 + }, + { + "epoch": 0.8098637248539909, + "grad_norm": 0.31209657077735303, + "learning_rate": 4.0572390572390575e-05, + "loss": 0.4944, + "step": 624 + }, + { + "epoch": 0.8111615833874107, + "grad_norm": 0.3396825057908641, + "learning_rate": 4.054834054834055e-05, + "loss": 0.5203, + "step": 625 + }, + { + "epoch": 0.8124594419208306, + "grad_norm": 0.31652376647001546, + "learning_rate": 4.0524290524290525e-05, + "loss": 0.4973, + "step": 626 + }, + { + "epoch": 0.8137573004542504, + "grad_norm": 0.41987696956302806, + "learning_rate": 4.05002405002405e-05, + "loss": 0.5008, + "step": 627 + }, + { + "epoch": 0.8150551589876703, + "grad_norm": 0.32125784926625567, + "learning_rate": 4.047619047619048e-05, + "loss": 0.5107, + "step": 628 + }, + { + "epoch": 0.8163530175210902, + "grad_norm": 0.43302794720660975, + "learning_rate": 4.045214045214045e-05, + "loss": 0.5174, + "step": 629 + }, + { + "epoch": 0.81765087605451, + "grad_norm": 0.29529734876987174, + "learning_rate": 4.042809042809043e-05, + "loss": 0.4881, + "step": 630 + }, + { + "epoch": 0.8189487345879299, + "grad_norm": 0.4076264173563411, + "learning_rate": 4.0404040404040405e-05, + "loss": 0.5034, + "step": 631 + }, + { + "epoch": 0.8202465931213497, + "grad_norm": 0.30337707563686833, + "learning_rate": 4.037999037999039e-05, + "loss": 0.5119, + "step": 632 + }, + { + "epoch": 0.8215444516547696, + "grad_norm": 0.39849923453663594, + "learning_rate": 4.0355940355940355e-05, + "loss": 0.5216, + "step": 633 + }, + { + "epoch": 0.8228423101881895, + "grad_norm": 0.26447226558452136, + "learning_rate": 4.033189033189033e-05, + "loss": 0.5122, + "step": 634 + }, + { + "epoch": 0.8241401687216093, + "grad_norm": 0.36530243282807756, + "learning_rate": 4.030784030784031e-05, + "loss": 0.4918, + "step": 635 + }, + { + "epoch": 0.8254380272550292, + "grad_norm": 0.3160155549438362, + "learning_rate": 4.0283790283790286e-05, + "loss": 0.5024, + "step": 636 + }, + { + "epoch": 0.826735885788449, + "grad_norm": 0.33636766065888035, + "learning_rate": 4.025974025974026e-05, + "loss": 0.5075, + "step": 637 + }, + { + "epoch": 0.8280337443218689, + "grad_norm": 0.29456591212102723, + "learning_rate": 4.0235690235690236e-05, + "loss": 0.5241, + "step": 638 + }, + { + "epoch": 0.8293316028552887, + "grad_norm": 0.3220137418817016, + "learning_rate": 4.021164021164021e-05, + "loss": 0.5028, + "step": 639 + }, + { + "epoch": 0.8306294613887086, + "grad_norm": 0.279849046005973, + "learning_rate": 4.018759018759019e-05, + "loss": 0.4937, + "step": 640 + }, + { + "epoch": 0.8319273199221285, + "grad_norm": 0.34243863539028374, + "learning_rate": 4.016354016354017e-05, + "loss": 0.4992, + "step": 641 + }, + { + "epoch": 0.8332251784555483, + "grad_norm": 0.3077281111260478, + "learning_rate": 4.013949013949014e-05, + "loss": 0.5013, + "step": 642 + }, + { + "epoch": 0.8345230369889682, + "grad_norm": 0.2917135387110751, + "learning_rate": 4.0115440115440116e-05, + "loss": 0.5084, + "step": 643 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 0.4035461806364624, + "learning_rate": 4.009139009139009e-05, + "loss": 0.5075, + "step": 644 + }, + { + "epoch": 0.8371187540558079, + "grad_norm": 0.28209498726622767, + "learning_rate": 4.006734006734007e-05, + "loss": 0.5018, + "step": 645 + }, + { + "epoch": 0.8384166125892277, + "grad_norm": 0.32365984928312647, + "learning_rate": 4.004329004329004e-05, + "loss": 0.5059, + "step": 646 + }, + { + "epoch": 0.8397144711226476, + "grad_norm": 0.27577530319773297, + "learning_rate": 4.001924001924002e-05, + "loss": 0.4833, + "step": 647 + }, + { + "epoch": 0.8410123296560675, + "grad_norm": 0.28322226416250573, + "learning_rate": 3.999518999519e-05, + "loss": 0.4918, + "step": 648 + }, + { + "epoch": 0.8423101881894873, + "grad_norm": 0.2855631382527533, + "learning_rate": 3.997113997113997e-05, + "loss": 0.4865, + "step": 649 + }, + { + "epoch": 0.8436080467229072, + "grad_norm": 0.32968743054146016, + "learning_rate": 3.9947089947089946e-05, + "loss": 0.4995, + "step": 650 + }, + { + "epoch": 0.844905905256327, + "grad_norm": 0.2546632505302031, + "learning_rate": 3.992303992303992e-05, + "loss": 0.5007, + "step": 651 + }, + { + "epoch": 0.8462037637897469, + "grad_norm": 0.3379202280608477, + "learning_rate": 3.98989898989899e-05, + "loss": 0.5191, + "step": 652 + }, + { + "epoch": 0.8475016223231667, + "grad_norm": 0.2828398301217406, + "learning_rate": 3.987493987493988e-05, + "loss": 0.511, + "step": 653 + }, + { + "epoch": 0.8487994808565866, + "grad_norm": 0.2759621170821239, + "learning_rate": 3.985088985088985e-05, + "loss": 0.4833, + "step": 654 + }, + { + "epoch": 0.8500973393900065, + "grad_norm": 0.30549235671106895, + "learning_rate": 3.982683982683983e-05, + "loss": 0.5041, + "step": 655 + }, + { + "epoch": 0.8513951979234263, + "grad_norm": 0.27394626074529427, + "learning_rate": 3.98027898027898e-05, + "loss": 0.4765, + "step": 656 + }, + { + "epoch": 0.8526930564568462, + "grad_norm": 0.28778374402990897, + "learning_rate": 3.9778739778739783e-05, + "loss": 0.5204, + "step": 657 + }, + { + "epoch": 0.853990914990266, + "grad_norm": 0.34038234379097493, + "learning_rate": 3.975468975468976e-05, + "loss": 0.4974, + "step": 658 + }, + { + "epoch": 0.8552887735236859, + "grad_norm": 0.2866059277868264, + "learning_rate": 3.973063973063973e-05, + "loss": 0.5054, + "step": 659 + }, + { + "epoch": 0.8565866320571057, + "grad_norm": 0.2927739560082091, + "learning_rate": 3.970658970658971e-05, + "loss": 0.4974, + "step": 660 + }, + { + "epoch": 0.8578844905905256, + "grad_norm": 0.3262341662863849, + "learning_rate": 3.968253968253968e-05, + "loss": 0.4989, + "step": 661 + }, + { + "epoch": 0.8591823491239455, + "grad_norm": 0.2845679103896212, + "learning_rate": 3.9658489658489664e-05, + "loss": 0.4874, + "step": 662 + }, + { + "epoch": 0.8604802076573653, + "grad_norm": 0.2655596774616536, + "learning_rate": 3.963443963443963e-05, + "loss": 0.4843, + "step": 663 + }, + { + "epoch": 0.8617780661907852, + "grad_norm": 0.2820305028454277, + "learning_rate": 3.9610389610389614e-05, + "loss": 0.5164, + "step": 664 + }, + { + "epoch": 0.863075924724205, + "grad_norm": 0.2940016391705861, + "learning_rate": 3.958633958633959e-05, + "loss": 0.4886, + "step": 665 + }, + { + "epoch": 0.8643737832576249, + "grad_norm": 0.2547644694514986, + "learning_rate": 3.956228956228956e-05, + "loss": 0.5051, + "step": 666 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 0.26023827863988136, + "learning_rate": 3.953823953823954e-05, + "loss": 0.5217, + "step": 667 + }, + { + "epoch": 0.8669695003244646, + "grad_norm": 0.27927530276749113, + "learning_rate": 3.951418951418951e-05, + "loss": 0.5092, + "step": 668 + }, + { + "epoch": 0.8682673588578845, + "grad_norm": 0.26218361367284654, + "learning_rate": 3.9490139490139494e-05, + "loss": 0.4889, + "step": 669 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.2574083587232944, + "learning_rate": 3.946608946608947e-05, + "loss": 0.498, + "step": 670 + }, + { + "epoch": 0.8708630759247242, + "grad_norm": 0.28970396117550484, + "learning_rate": 3.9442039442039444e-05, + "loss": 0.5005, + "step": 671 + }, + { + "epoch": 0.872160934458144, + "grad_norm": 0.253672603517478, + "learning_rate": 3.941798941798942e-05, + "loss": 0.4802, + "step": 672 + }, + { + "epoch": 0.8734587929915639, + "grad_norm": 0.28176527595198714, + "learning_rate": 3.939393939393939e-05, + "loss": 0.5009, + "step": 673 + }, + { + "epoch": 0.8747566515249838, + "grad_norm": 0.29970103183262975, + "learning_rate": 3.9369889369889375e-05, + "loss": 0.494, + "step": 674 + }, + { + "epoch": 0.8760545100584036, + "grad_norm": 0.28208623298136165, + "learning_rate": 3.934583934583935e-05, + "loss": 0.4784, + "step": 675 + }, + { + "epoch": 0.8773523685918235, + "grad_norm": 0.31291964058774646, + "learning_rate": 3.9321789321789324e-05, + "loss": 0.4869, + "step": 676 + }, + { + "epoch": 0.8786502271252433, + "grad_norm": 0.2979349183006068, + "learning_rate": 3.92977392977393e-05, + "loss": 0.5046, + "step": 677 + }, + { + "epoch": 0.8799480856586632, + "grad_norm": 0.2908014815865184, + "learning_rate": 3.9273689273689274e-05, + "loss": 0.5265, + "step": 678 + }, + { + "epoch": 0.881245944192083, + "grad_norm": 0.26416143914819024, + "learning_rate": 3.9249639249639256e-05, + "loss": 0.4818, + "step": 679 + }, + { + "epoch": 0.8825438027255029, + "grad_norm": 0.2933751247833131, + "learning_rate": 3.9225589225589224e-05, + "loss": 0.5012, + "step": 680 + }, + { + "epoch": 0.8838416612589228, + "grad_norm": 0.3126465117694497, + "learning_rate": 3.9201539201539205e-05, + "loss": 0.5146, + "step": 681 + }, + { + "epoch": 0.8851395197923426, + "grad_norm": 0.2524418661547154, + "learning_rate": 3.917748917748918e-05, + "loss": 0.5016, + "step": 682 + }, + { + "epoch": 0.8864373783257625, + "grad_norm": 0.25838371662824994, + "learning_rate": 3.9153439153439155e-05, + "loss": 0.4788, + "step": 683 + }, + { + "epoch": 0.8877352368591823, + "grad_norm": 0.26415640627712256, + "learning_rate": 3.912938912938913e-05, + "loss": 0.5026, + "step": 684 + }, + { + "epoch": 0.8890330953926022, + "grad_norm": 0.24247698201137077, + "learning_rate": 3.9105339105339104e-05, + "loss": 0.4785, + "step": 685 + }, + { + "epoch": 0.890330953926022, + "grad_norm": 0.2592072694481307, + "learning_rate": 3.9081289081289086e-05, + "loss": 0.4977, + "step": 686 + }, + { + "epoch": 0.8916288124594419, + "grad_norm": 0.2542832037438216, + "learning_rate": 3.905723905723906e-05, + "loss": 0.4965, + "step": 687 + }, + { + "epoch": 0.8929266709928618, + "grad_norm": 0.262249098843731, + "learning_rate": 3.9033189033189035e-05, + "loss": 0.4973, + "step": 688 + }, + { + "epoch": 0.8942245295262816, + "grad_norm": 0.31610300163969496, + "learning_rate": 3.900913900913901e-05, + "loss": 0.4825, + "step": 689 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.2740619461513036, + "learning_rate": 3.8985088985088985e-05, + "loss": 0.4975, + "step": 690 + }, + { + "epoch": 0.8968202465931213, + "grad_norm": 0.2741811791339238, + "learning_rate": 3.8961038961038966e-05, + "loss": 0.4959, + "step": 691 + }, + { + "epoch": 0.8981181051265412, + "grad_norm": 0.271574424484243, + "learning_rate": 3.893698893698894e-05, + "loss": 0.4871, + "step": 692 + }, + { + "epoch": 0.899415963659961, + "grad_norm": 0.23758978275848563, + "learning_rate": 3.891293891293891e-05, + "loss": 0.4862, + "step": 693 + }, + { + "epoch": 0.9007138221933809, + "grad_norm": 0.27263432443106744, + "learning_rate": 3.888888888888889e-05, + "loss": 0.5041, + "step": 694 + }, + { + "epoch": 0.9020116807268008, + "grad_norm": 0.2707424886921677, + "learning_rate": 3.8864838864838866e-05, + "loss": 0.4669, + "step": 695 + }, + { + "epoch": 0.9033095392602206, + "grad_norm": 0.25600324849109557, + "learning_rate": 3.884078884078885e-05, + "loss": 0.4915, + "step": 696 + }, + { + "epoch": 0.9046073977936405, + "grad_norm": 0.28010923150865535, + "learning_rate": 3.8816738816738815e-05, + "loss": 0.5038, + "step": 697 + }, + { + "epoch": 0.9059052563270603, + "grad_norm": 0.28506679888273495, + "learning_rate": 3.87926887926888e-05, + "loss": 0.4965, + "step": 698 + }, + { + "epoch": 0.9072031148604802, + "grad_norm": 0.26956128070889057, + "learning_rate": 3.876863876863877e-05, + "loss": 0.5141, + "step": 699 + }, + { + "epoch": 0.9085009733939, + "grad_norm": 0.25947152657252537, + "learning_rate": 3.8744588744588746e-05, + "loss": 0.477, + "step": 700 + }, + { + "epoch": 0.9097988319273199, + "grad_norm": 0.29443853949197607, + "learning_rate": 3.872053872053872e-05, + "loss": 0.5059, + "step": 701 + }, + { + "epoch": 0.9110966904607398, + "grad_norm": 0.23371391184316104, + "learning_rate": 3.8696488696488696e-05, + "loss": 0.4783, + "step": 702 + }, + { + "epoch": 0.9123945489941596, + "grad_norm": 0.2960003000238748, + "learning_rate": 3.867243867243868e-05, + "loss": 0.4975, + "step": 703 + }, + { + "epoch": 0.9136924075275795, + "grad_norm": 0.2447202791191107, + "learning_rate": 3.864838864838865e-05, + "loss": 0.4922, + "step": 704 + }, + { + "epoch": 0.9149902660609993, + "grad_norm": 0.26642870018861, + "learning_rate": 3.862433862433863e-05, + "loss": 0.5252, + "step": 705 + }, + { + "epoch": 0.9162881245944192, + "grad_norm": 0.2560976171455726, + "learning_rate": 3.86002886002886e-05, + "loss": 0.5012, + "step": 706 + }, + { + "epoch": 0.917585983127839, + "grad_norm": 0.28883925682227224, + "learning_rate": 3.8576238576238576e-05, + "loss": 0.509, + "step": 707 + }, + { + "epoch": 0.9188838416612589, + "grad_norm": 0.2513723680500846, + "learning_rate": 3.855218855218856e-05, + "loss": 0.5064, + "step": 708 + }, + { + "epoch": 0.9201817001946788, + "grad_norm": 0.26385183299541554, + "learning_rate": 3.852813852813853e-05, + "loss": 0.4844, + "step": 709 + }, + { + "epoch": 0.9214795587280986, + "grad_norm": 0.2739471730680778, + "learning_rate": 3.85040885040885e-05, + "loss": 0.5062, + "step": 710 + }, + { + "epoch": 0.9227774172615185, + "grad_norm": 0.29349547383156327, + "learning_rate": 3.848003848003848e-05, + "loss": 0.4663, + "step": 711 + }, + { + "epoch": 0.9240752757949383, + "grad_norm": 0.24335538681766872, + "learning_rate": 3.845598845598846e-05, + "loss": 0.4802, + "step": 712 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 0.26992374528269225, + "learning_rate": 3.843193843193844e-05, + "loss": 0.4737, + "step": 713 + }, + { + "epoch": 0.9266709928617781, + "grad_norm": 0.2601068353604271, + "learning_rate": 3.8407888407888407e-05, + "loss": 0.4871, + "step": 714 + }, + { + "epoch": 0.9279688513951979, + "grad_norm": 0.2758032793334676, + "learning_rate": 3.838383838383838e-05, + "loss": 0.497, + "step": 715 + }, + { + "epoch": 0.9292667099286178, + "grad_norm": 0.28995105804236226, + "learning_rate": 3.835978835978836e-05, + "loss": 0.4954, + "step": 716 + }, + { + "epoch": 0.9305645684620376, + "grad_norm": 0.2711126929498793, + "learning_rate": 3.833573833573834e-05, + "loss": 0.4899, + "step": 717 + }, + { + "epoch": 0.9318624269954575, + "grad_norm": 0.27659146601067053, + "learning_rate": 3.831168831168831e-05, + "loss": 0.4724, + "step": 718 + }, + { + "epoch": 0.9331602855288773, + "grad_norm": 0.27572149711082233, + "learning_rate": 3.828763828763829e-05, + "loss": 0.4771, + "step": 719 + }, + { + "epoch": 0.9344581440622972, + "grad_norm": 0.26747547869691946, + "learning_rate": 3.826358826358827e-05, + "loss": 0.508, + "step": 720 + }, + { + "epoch": 0.9357560025957171, + "grad_norm": 0.25233671196063495, + "learning_rate": 3.8239538239538244e-05, + "loss": 0.5112, + "step": 721 + }, + { + "epoch": 0.9370538611291369, + "grad_norm": 0.25520294945614, + "learning_rate": 3.821548821548822e-05, + "loss": 0.5015, + "step": 722 + }, + { + "epoch": 0.9383517196625568, + "grad_norm": 0.2638285175394357, + "learning_rate": 3.819143819143819e-05, + "loss": 0.5159, + "step": 723 + }, + { + "epoch": 0.9396495781959766, + "grad_norm": 0.2424275513941752, + "learning_rate": 3.816738816738817e-05, + "loss": 0.4934, + "step": 724 + }, + { + "epoch": 0.9409474367293965, + "grad_norm": 0.25978321355889966, + "learning_rate": 3.814333814333815e-05, + "loss": 0.493, + "step": 725 + }, + { + "epoch": 0.9422452952628163, + "grad_norm": 0.2717237370055125, + "learning_rate": 3.8119288119288124e-05, + "loss": 0.4848, + "step": 726 + }, + { + "epoch": 0.9435431537962362, + "grad_norm": 0.2649262510228781, + "learning_rate": 3.809523809523809e-05, + "loss": 0.5106, + "step": 727 + }, + { + "epoch": 0.9448410123296561, + "grad_norm": 0.2630223525297261, + "learning_rate": 3.8071188071188074e-05, + "loss": 0.4819, + "step": 728 + }, + { + "epoch": 0.9461388708630759, + "grad_norm": 0.2552060740231092, + "learning_rate": 3.804713804713805e-05, + "loss": 0.4846, + "step": 729 + }, + { + "epoch": 0.9474367293964958, + "grad_norm": 0.27269809006023926, + "learning_rate": 3.802308802308803e-05, + "loss": 0.4646, + "step": 730 + }, + { + "epoch": 0.9487345879299156, + "grad_norm": 0.26086007457601984, + "learning_rate": 3.7999037999038e-05, + "loss": 0.4881, + "step": 731 + }, + { + "epoch": 0.9500324464633355, + "grad_norm": 0.261121106269547, + "learning_rate": 3.797498797498797e-05, + "loss": 0.4676, + "step": 732 + }, + { + "epoch": 0.9513303049967553, + "grad_norm": 0.24120860308268785, + "learning_rate": 3.7950937950937954e-05, + "loss": 0.475, + "step": 733 + }, + { + "epoch": 0.9526281635301752, + "grad_norm": 0.23338097940225563, + "learning_rate": 3.792688792688793e-05, + "loss": 0.4913, + "step": 734 + }, + { + "epoch": 0.9539260220635951, + "grad_norm": 0.27856238721903515, + "learning_rate": 3.7902837902837904e-05, + "loss": 0.4817, + "step": 735 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 0.23489180306981414, + "learning_rate": 3.787878787878788e-05, + "loss": 0.4876, + "step": 736 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.24747777349253483, + "learning_rate": 3.7854737854737854e-05, + "loss": 0.4926, + "step": 737 + }, + { + "epoch": 0.9578195976638546, + "grad_norm": 0.2464671646367973, + "learning_rate": 3.7830687830687835e-05, + "loss": 0.4988, + "step": 738 + }, + { + "epoch": 0.9591174561972745, + "grad_norm": 0.2444290486715431, + "learning_rate": 3.780663780663781e-05, + "loss": 0.5102, + "step": 739 + }, + { + "epoch": 0.9604153147306943, + "grad_norm": 0.23262037897628499, + "learning_rate": 3.7782587782587785e-05, + "loss": 0.4928, + "step": 740 + }, + { + "epoch": 0.9617131732641142, + "grad_norm": 0.2320844900099328, + "learning_rate": 3.775853775853776e-05, + "loss": 0.4858, + "step": 741 + }, + { + "epoch": 0.9630110317975341, + "grad_norm": 0.25468927720147666, + "learning_rate": 3.773448773448774e-05, + "loss": 0.4978, + "step": 742 + }, + { + "epoch": 0.9643088903309539, + "grad_norm": 0.2686348081196404, + "learning_rate": 3.7710437710437716e-05, + "loss": 0.4774, + "step": 743 + }, + { + "epoch": 0.9656067488643738, + "grad_norm": 0.25462323553264127, + "learning_rate": 3.7686387686387684e-05, + "loss": 0.4967, + "step": 744 + }, + { + "epoch": 0.9669046073977936, + "grad_norm": 0.23826397601414423, + "learning_rate": 3.7662337662337665e-05, + "loss": 0.4803, + "step": 745 + }, + { + "epoch": 0.9682024659312135, + "grad_norm": 0.28828391446426194, + "learning_rate": 3.763828763828764e-05, + "loss": 0.4853, + "step": 746 + }, + { + "epoch": 0.9695003244646333, + "grad_norm": 0.28433469305996517, + "learning_rate": 3.761423761423762e-05, + "loss": 0.4952, + "step": 747 + }, + { + "epoch": 0.9707981829980532, + "grad_norm": 0.23492438563324666, + "learning_rate": 3.759018759018759e-05, + "loss": 0.4707, + "step": 748 + }, + { + "epoch": 0.9720960415314731, + "grad_norm": 0.24099143399264922, + "learning_rate": 3.7566137566137564e-05, + "loss": 0.5025, + "step": 749 + }, + { + "epoch": 0.9733939000648929, + "grad_norm": 0.26521862280904784, + "learning_rate": 3.7542087542087546e-05, + "loss": 0.4801, + "step": 750 + }, + { + "epoch": 0.9746917585983128, + "grad_norm": 0.25301181696938563, + "learning_rate": 3.751803751803752e-05, + "loss": 0.4812, + "step": 751 + }, + { + "epoch": 0.9759896171317326, + "grad_norm": 0.27340749938479236, + "learning_rate": 3.7493987493987495e-05, + "loss": 0.5218, + "step": 752 + }, + { + "epoch": 0.9772874756651525, + "grad_norm": 0.2758375543775843, + "learning_rate": 3.746993746993747e-05, + "loss": 0.5003, + "step": 753 + }, + { + "epoch": 0.9785853341985724, + "grad_norm": 0.2894871095962033, + "learning_rate": 3.7445887445887445e-05, + "loss": 0.4898, + "step": 754 + }, + { + "epoch": 0.9798831927319922, + "grad_norm": 0.24603375955455123, + "learning_rate": 3.7421837421837427e-05, + "loss": 0.4975, + "step": 755 + }, + { + "epoch": 0.9811810512654121, + "grad_norm": 0.3430084532330243, + "learning_rate": 3.73977873977874e-05, + "loss": 0.4986, + "step": 756 + }, + { + "epoch": 0.9824789097988319, + "grad_norm": 0.26260792839174424, + "learning_rate": 3.7373737373737376e-05, + "loss": 0.4948, + "step": 757 + }, + { + "epoch": 0.9837767683322518, + "grad_norm": 0.3417705691604155, + "learning_rate": 3.734968734968735e-05, + "loss": 0.4892, + "step": 758 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.2367132738219532, + "learning_rate": 3.7325637325637326e-05, + "loss": 0.4837, + "step": 759 + }, + { + "epoch": 0.9863724853990915, + "grad_norm": 0.35539307871553294, + "learning_rate": 3.730158730158731e-05, + "loss": 0.5081, + "step": 760 + }, + { + "epoch": 0.9876703439325114, + "grad_norm": 0.22735379818962181, + "learning_rate": 3.7277537277537275e-05, + "loss": 0.4791, + "step": 761 + }, + { + "epoch": 0.9889682024659312, + "grad_norm": 0.30959128828024085, + "learning_rate": 3.725348725348726e-05, + "loss": 0.4977, + "step": 762 + }, + { + "epoch": 0.9902660609993511, + "grad_norm": 0.24407830637335193, + "learning_rate": 3.722943722943723e-05, + "loss": 0.4889, + "step": 763 + }, + { + "epoch": 0.9915639195327709, + "grad_norm": 0.37047588851002045, + "learning_rate": 3.720538720538721e-05, + "loss": 0.4887, + "step": 764 + }, + { + "epoch": 0.9928617780661908, + "grad_norm": 0.3132029888056198, + "learning_rate": 3.718133718133718e-05, + "loss": 0.4808, + "step": 765 + }, + { + "epoch": 0.9941596365996106, + "grad_norm": 0.336725063578875, + "learning_rate": 3.7157287157287156e-05, + "loss": 0.4944, + "step": 766 + }, + { + "epoch": 0.9954574951330305, + "grad_norm": 0.29559283526551605, + "learning_rate": 3.713323713323714e-05, + "loss": 0.485, + "step": 767 + }, + { + "epoch": 0.9967553536664504, + "grad_norm": 0.3181505432986373, + "learning_rate": 3.710918710918711e-05, + "loss": 0.5109, + "step": 768 + }, + { + "epoch": 0.9980532121998702, + "grad_norm": 0.25792486030536, + "learning_rate": 3.708513708513709e-05, + "loss": 0.4837, + "step": 769 + }, + { + "epoch": 0.9993510707332901, + "grad_norm": 0.2662559802060241, + "learning_rate": 3.706108706108706e-05, + "loss": 0.4768, + "step": 770 + }, + { + "epoch": 1.0, + "grad_norm": 0.2662559802060241, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.4749, + "step": 771 + }, + { + "epoch": 1.0012978585334198, + "grad_norm": 0.45710604360462226, + "learning_rate": 3.701298701298702e-05, + "loss": 0.4295, + "step": 772 + }, + { + "epoch": 1.0025957170668398, + "grad_norm": 0.3984763478659025, + "learning_rate": 3.698893698893699e-05, + "loss": 0.4211, + "step": 773 + }, + { + "epoch": 1.0038935756002596, + "grad_norm": 0.33604362226653633, + "learning_rate": 3.696488696488697e-05, + "loss": 0.4472, + "step": 774 + }, + { + "epoch": 1.0051914341336794, + "grad_norm": 0.35492193650921217, + "learning_rate": 3.694083694083694e-05, + "loss": 0.4144, + "step": 775 + }, + { + "epoch": 1.0064892926670992, + "grad_norm": 0.34395756575343545, + "learning_rate": 3.691678691678692e-05, + "loss": 0.4371, + "step": 776 + }, + { + "epoch": 1.0077871512005192, + "grad_norm": 0.3027073670465442, + "learning_rate": 3.68927368927369e-05, + "loss": 0.4418, + "step": 777 + }, + { + "epoch": 1.009085009733939, + "grad_norm": 0.33164997403681556, + "learning_rate": 3.686868686868687e-05, + "loss": 0.4236, + "step": 778 + }, + { + "epoch": 1.0103828682673588, + "grad_norm": 0.29836741552546153, + "learning_rate": 3.684463684463685e-05, + "loss": 0.4291, + "step": 779 + }, + { + "epoch": 1.0116807268007788, + "grad_norm": 0.31908724983675907, + "learning_rate": 3.682058682058682e-05, + "loss": 0.4444, + "step": 780 + }, + { + "epoch": 1.0129785853341986, + "grad_norm": 0.3063522863393575, + "learning_rate": 3.67965367965368e-05, + "loss": 0.4235, + "step": 781 + }, + { + "epoch": 1.0142764438676184, + "grad_norm": 0.286969078111983, + "learning_rate": 3.677248677248677e-05, + "loss": 0.4273, + "step": 782 + }, + { + "epoch": 1.0155743024010382, + "grad_norm": 0.30969412667904017, + "learning_rate": 3.674843674843675e-05, + "loss": 0.4179, + "step": 783 + }, + { + "epoch": 1.0168721609344582, + "grad_norm": 0.28846243749065087, + "learning_rate": 3.672438672438673e-05, + "loss": 0.445, + "step": 784 + }, + { + "epoch": 1.018170019467878, + "grad_norm": 0.3052289303187025, + "learning_rate": 3.6700336700336704e-05, + "loss": 0.4282, + "step": 785 + }, + { + "epoch": 1.0194678780012978, + "grad_norm": 0.28776824445687055, + "learning_rate": 3.667628667628668e-05, + "loss": 0.4361, + "step": 786 + }, + { + "epoch": 1.0207657365347178, + "grad_norm": 0.25471106959244577, + "learning_rate": 3.665223665223665e-05, + "loss": 0.4177, + "step": 787 + }, + { + "epoch": 1.0220635950681376, + "grad_norm": 0.2740291049864792, + "learning_rate": 3.662818662818663e-05, + "loss": 0.4394, + "step": 788 + }, + { + "epoch": 1.0233614536015574, + "grad_norm": 0.3017972640732574, + "learning_rate": 3.660413660413661e-05, + "loss": 0.4386, + "step": 789 + }, + { + "epoch": 1.0246593121349772, + "grad_norm": 0.23597430678446688, + "learning_rate": 3.6580086580086584e-05, + "loss": 0.4499, + "step": 790 + }, + { + "epoch": 1.0259571706683972, + "grad_norm": 0.2879673662024183, + "learning_rate": 3.655603655603656e-05, + "loss": 0.4293, + "step": 791 + }, + { + "epoch": 1.027255029201817, + "grad_norm": 0.2700731107191033, + "learning_rate": 3.6531986531986534e-05, + "loss": 0.4258, + "step": 792 + }, + { + "epoch": 1.0285528877352368, + "grad_norm": 0.25488494703667464, + "learning_rate": 3.650793650793651e-05, + "loss": 0.4074, + "step": 793 + }, + { + "epoch": 1.0298507462686568, + "grad_norm": 0.26394634723070287, + "learning_rate": 3.648388648388649e-05, + "loss": 0.4508, + "step": 794 + }, + { + "epoch": 1.0311486048020766, + "grad_norm": 0.27582489513269015, + "learning_rate": 3.645983645983646e-05, + "loss": 0.4412, + "step": 795 + }, + { + "epoch": 1.0324464633354964, + "grad_norm": 0.24726232362927064, + "learning_rate": 3.643578643578644e-05, + "loss": 0.4168, + "step": 796 + }, + { + "epoch": 1.0337443218689162, + "grad_norm": 0.28259272497475907, + "learning_rate": 3.6411736411736415e-05, + "loss": 0.4154, + "step": 797 + }, + { + "epoch": 1.0350421804023362, + "grad_norm": 0.2663149191606831, + "learning_rate": 3.638768638768639e-05, + "loss": 0.4189, + "step": 798 + }, + { + "epoch": 1.036340038935756, + "grad_norm": 0.23665837923986385, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.4098, + "step": 799 + }, + { + "epoch": 1.0376378974691758, + "grad_norm": 0.26291163598038214, + "learning_rate": 3.633958633958634e-05, + "loss": 0.4311, + "step": 800 + }, + { + "epoch": 1.0389357560025958, + "grad_norm": 0.27970192834528934, + "learning_rate": 3.631553631553632e-05, + "loss": 0.4145, + "step": 801 + }, + { + "epoch": 1.0402336145360156, + "grad_norm": 0.2710942408225925, + "learning_rate": 3.6291486291486295e-05, + "loss": 0.4277, + "step": 802 + }, + { + "epoch": 1.0415314730694354, + "grad_norm": 0.2727015101923209, + "learning_rate": 3.626743626743627e-05, + "loss": 0.4232, + "step": 803 + }, + { + "epoch": 1.0428293316028552, + "grad_norm": 0.3128392157246963, + "learning_rate": 3.6243386243386245e-05, + "loss": 0.4335, + "step": 804 + }, + { + "epoch": 1.0441271901362752, + "grad_norm": 0.2349817412715045, + "learning_rate": 3.621933621933622e-05, + "loss": 0.4253, + "step": 805 + }, + { + "epoch": 1.045425048669695, + "grad_norm": 0.28497577605120694, + "learning_rate": 3.61952861952862e-05, + "loss": 0.4168, + "step": 806 + }, + { + "epoch": 1.0467229072031148, + "grad_norm": 0.2587789310002607, + "learning_rate": 3.617123617123617e-05, + "loss": 0.4217, + "step": 807 + }, + { + "epoch": 1.0480207657365348, + "grad_norm": 0.26828544455125314, + "learning_rate": 3.6147186147186144e-05, + "loss": 0.4074, + "step": 808 + }, + { + "epoch": 1.0493186242699546, + "grad_norm": 0.2598287776154231, + "learning_rate": 3.6123136123136125e-05, + "loss": 0.4184, + "step": 809 + }, + { + "epoch": 1.0506164828033744, + "grad_norm": 0.2703287966299121, + "learning_rate": 3.60990860990861e-05, + "loss": 0.4323, + "step": 810 + }, + { + "epoch": 1.0519143413367944, + "grad_norm": 0.318496639053465, + "learning_rate": 3.6075036075036075e-05, + "loss": 0.4209, + "step": 811 + }, + { + "epoch": 1.0532121998702142, + "grad_norm": 0.2697909678170961, + "learning_rate": 3.605098605098605e-05, + "loss": 0.4244, + "step": 812 + }, + { + "epoch": 1.054510058403634, + "grad_norm": 0.2969724315201258, + "learning_rate": 3.602693602693603e-05, + "loss": 0.4172, + "step": 813 + }, + { + "epoch": 1.0558079169370538, + "grad_norm": 0.30303096771353466, + "learning_rate": 3.6002886002886006e-05, + "loss": 0.404, + "step": 814 + }, + { + "epoch": 1.0571057754704738, + "grad_norm": 0.240870306849622, + "learning_rate": 3.597883597883598e-05, + "loss": 0.4226, + "step": 815 + }, + { + "epoch": 1.0584036340038936, + "grad_norm": 0.31791173210740425, + "learning_rate": 3.5954785954785956e-05, + "loss": 0.411, + "step": 816 + }, + { + "epoch": 1.0597014925373134, + "grad_norm": 0.2804167030463435, + "learning_rate": 3.593073593073593e-05, + "loss": 0.4297, + "step": 817 + }, + { + "epoch": 1.0609993510707332, + "grad_norm": 0.24716383688085644, + "learning_rate": 3.590668590668591e-05, + "loss": 0.434, + "step": 818 + }, + { + "epoch": 1.0622972096041532, + "grad_norm": 0.3039180273884032, + "learning_rate": 3.588263588263589e-05, + "loss": 0.4212, + "step": 819 + }, + { + "epoch": 1.063595068137573, + "grad_norm": 0.319488059258111, + "learning_rate": 3.5858585858585855e-05, + "loss": 0.4068, + "step": 820 + }, + { + "epoch": 1.0648929266709928, + "grad_norm": 0.22619788557964504, + "learning_rate": 3.5834535834535836e-05, + "loss": 0.4311, + "step": 821 + }, + { + "epoch": 1.0661907852044128, + "grad_norm": 0.34410773893661634, + "learning_rate": 3.581048581048581e-05, + "loss": 0.4349, + "step": 822 + }, + { + "epoch": 1.0674886437378326, + "grad_norm": 0.3100368353729728, + "learning_rate": 3.578643578643579e-05, + "loss": 0.4352, + "step": 823 + }, + { + "epoch": 1.0687865022712524, + "grad_norm": 0.2901826811884039, + "learning_rate": 3.576238576238576e-05, + "loss": 0.4157, + "step": 824 + }, + { + "epoch": 1.0700843608046724, + "grad_norm": 0.31611344846131356, + "learning_rate": 3.5738335738335735e-05, + "loss": 0.4693, + "step": 825 + }, + { + "epoch": 1.0713822193380922, + "grad_norm": 0.35227684302990314, + "learning_rate": 3.571428571428572e-05, + "loss": 0.4356, + "step": 826 + }, + { + "epoch": 1.072680077871512, + "grad_norm": 0.25990758753916315, + "learning_rate": 3.569023569023569e-05, + "loss": 0.4149, + "step": 827 + }, + { + "epoch": 1.0739779364049318, + "grad_norm": 0.33795998379210196, + "learning_rate": 3.5666185666185667e-05, + "loss": 0.4231, + "step": 828 + }, + { + "epoch": 1.0752757949383518, + "grad_norm": 0.260416289520159, + "learning_rate": 3.564213564213564e-05, + "loss": 0.4439, + "step": 829 + }, + { + "epoch": 1.0765736534717716, + "grad_norm": 0.2745403629951124, + "learning_rate": 3.5618085618085616e-05, + "loss": 0.4363, + "step": 830 + }, + { + "epoch": 1.0778715120051914, + "grad_norm": 0.30247544618833483, + "learning_rate": 3.55940355940356e-05, + "loss": 0.4195, + "step": 831 + }, + { + "epoch": 1.0791693705386114, + "grad_norm": 0.32082708667036386, + "learning_rate": 3.556998556998557e-05, + "loss": 0.4207, + "step": 832 + }, + { + "epoch": 1.0804672290720312, + "grad_norm": 0.2897448897920795, + "learning_rate": 3.554593554593555e-05, + "loss": 0.4256, + "step": 833 + }, + { + "epoch": 1.081765087605451, + "grad_norm": 0.2799359981594651, + "learning_rate": 3.552188552188552e-05, + "loss": 0.4566, + "step": 834 + }, + { + "epoch": 1.0830629461388708, + "grad_norm": 0.3069382837366587, + "learning_rate": 3.5497835497835503e-05, + "loss": 0.4422, + "step": 835 + }, + { + "epoch": 1.0843608046722908, + "grad_norm": 0.2432398771659819, + "learning_rate": 3.547378547378548e-05, + "loss": 0.4058, + "step": 836 + }, + { + "epoch": 1.0856586632057106, + "grad_norm": 0.2946884230374921, + "learning_rate": 3.5449735449735446e-05, + "loss": 0.4212, + "step": 837 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.2833055873065403, + "learning_rate": 3.542568542568543e-05, + "loss": 0.4236, + "step": 838 + }, + { + "epoch": 1.0882543802725504, + "grad_norm": 0.24633104520706475, + "learning_rate": 3.54016354016354e-05, + "loss": 0.4296, + "step": 839 + }, + { + "epoch": 1.0895522388059702, + "grad_norm": 0.30305714876494183, + "learning_rate": 3.5377585377585384e-05, + "loss": 0.4125, + "step": 840 + }, + { + "epoch": 1.09085009733939, + "grad_norm": 0.27515539449115956, + "learning_rate": 3.535353535353535e-05, + "loss": 0.4343, + "step": 841 + }, + { + "epoch": 1.0921479558728098, + "grad_norm": 0.2566570554342314, + "learning_rate": 3.532948532948533e-05, + "loss": 0.4259, + "step": 842 + }, + { + "epoch": 1.0934458144062298, + "grad_norm": 0.27580070959828384, + "learning_rate": 3.530543530543531e-05, + "loss": 0.4429, + "step": 843 + }, + { + "epoch": 1.0947436729396496, + "grad_norm": 0.40316368454416424, + "learning_rate": 3.528138528138528e-05, + "loss": 0.4471, + "step": 844 + }, + { + "epoch": 1.0960415314730694, + "grad_norm": 0.2693638650733015, + "learning_rate": 3.525733525733526e-05, + "loss": 0.4259, + "step": 845 + }, + { + "epoch": 1.0973393900064894, + "grad_norm": 0.3044405084823207, + "learning_rate": 3.523328523328523e-05, + "loss": 0.4187, + "step": 846 + }, + { + "epoch": 1.0986372485399092, + "grad_norm": 0.24933029657871267, + "learning_rate": 3.520923520923521e-05, + "loss": 0.4328, + "step": 847 + }, + { + "epoch": 1.099935107073329, + "grad_norm": 0.26906838606368316, + "learning_rate": 3.518518518518519e-05, + "loss": 0.4368, + "step": 848 + }, + { + "epoch": 1.1012329656067488, + "grad_norm": 0.26971278689485273, + "learning_rate": 3.5161135161135164e-05, + "loss": 0.4297, + "step": 849 + }, + { + "epoch": 1.1025308241401688, + "grad_norm": 0.2747922938561738, + "learning_rate": 3.513708513708514e-05, + "loss": 0.4373, + "step": 850 + }, + { + "epoch": 1.1038286826735886, + "grad_norm": 0.2761210141034917, + "learning_rate": 3.5113035113035113e-05, + "loss": 0.4382, + "step": 851 + }, + { + "epoch": 1.1051265412070084, + "grad_norm": 0.2964790930440498, + "learning_rate": 3.508898508898509e-05, + "loss": 0.4231, + "step": 852 + }, + { + "epoch": 1.1064243997404284, + "grad_norm": 0.25620765135533435, + "learning_rate": 3.506493506493507e-05, + "loss": 0.4408, + "step": 853 + }, + { + "epoch": 1.1077222582738482, + "grad_norm": 0.21064152286484897, + "learning_rate": 3.504088504088504e-05, + "loss": 0.417, + "step": 854 + }, + { + "epoch": 1.109020116807268, + "grad_norm": 0.2958430063839789, + "learning_rate": 3.501683501683502e-05, + "loss": 0.4339, + "step": 855 + }, + { + "epoch": 1.1103179753406878, + "grad_norm": 0.22482067437635359, + "learning_rate": 3.4992784992784994e-05, + "loss": 0.4295, + "step": 856 + }, + { + "epoch": 1.1116158338741078, + "grad_norm": 0.2955967782295721, + "learning_rate": 3.4968734968734976e-05, + "loss": 0.4015, + "step": 857 + }, + { + "epoch": 1.1129136924075276, + "grad_norm": 0.2843718668047481, + "learning_rate": 3.4944684944684944e-05, + "loss": 0.4218, + "step": 858 + }, + { + "epoch": 1.1142115509409474, + "grad_norm": 0.2764727160970055, + "learning_rate": 3.492063492063492e-05, + "loss": 0.4246, + "step": 859 + }, + { + "epoch": 1.1155094094743674, + "grad_norm": 0.2624405170772858, + "learning_rate": 3.48965848965849e-05, + "loss": 0.407, + "step": 860 + }, + { + "epoch": 1.1168072680077872, + "grad_norm": 0.2666157900698976, + "learning_rate": 3.4872534872534875e-05, + "loss": 0.4402, + "step": 861 + }, + { + "epoch": 1.118105126541207, + "grad_norm": 1.2301966564667228, + "learning_rate": 3.484848484848485e-05, + "loss": 0.4452, + "step": 862 + }, + { + "epoch": 1.1194029850746268, + "grad_norm": 0.3054038468605886, + "learning_rate": 3.4824434824434824e-05, + "loss": 0.4354, + "step": 863 + }, + { + "epoch": 1.1207008436080468, + "grad_norm": 0.2582864922377692, + "learning_rate": 3.48003848003848e-05, + "loss": 0.4495, + "step": 864 + }, + { + "epoch": 1.1219987021414666, + "grad_norm": 0.2500252146157672, + "learning_rate": 3.477633477633478e-05, + "loss": 0.4395, + "step": 865 + }, + { + "epoch": 1.1232965606748864, + "grad_norm": 0.29631159663934065, + "learning_rate": 3.4752284752284755e-05, + "loss": 0.4514, + "step": 866 + }, + { + "epoch": 1.1245944192083064, + "grad_norm": 0.28152136468600736, + "learning_rate": 3.472823472823473e-05, + "loss": 0.4397, + "step": 867 + }, + { + "epoch": 1.1258922777417262, + "grad_norm": 0.21131772623693806, + "learning_rate": 3.4704184704184705e-05, + "loss": 0.4285, + "step": 868 + }, + { + "epoch": 1.127190136275146, + "grad_norm": 0.25615102365106246, + "learning_rate": 3.468013468013468e-05, + "loss": 0.4133, + "step": 869 + }, + { + "epoch": 1.128487994808566, + "grad_norm": 0.23917269321305726, + "learning_rate": 3.465608465608466e-05, + "loss": 0.4435, + "step": 870 + }, + { + "epoch": 1.1297858533419858, + "grad_norm": 0.27161646725256705, + "learning_rate": 3.463203463203463e-05, + "loss": 0.4233, + "step": 871 + }, + { + "epoch": 1.1310837118754056, + "grad_norm": 0.2412152469942244, + "learning_rate": 3.460798460798461e-05, + "loss": 0.4461, + "step": 872 + }, + { + "epoch": 1.1323815704088254, + "grad_norm": 0.3179450028674149, + "learning_rate": 3.4583934583934586e-05, + "loss": 0.4119, + "step": 873 + }, + { + "epoch": 1.1336794289422454, + "grad_norm": 0.23384786069007624, + "learning_rate": 3.455988455988456e-05, + "loss": 0.4103, + "step": 874 + }, + { + "epoch": 1.1349772874756652, + "grad_norm": 0.2576089230129144, + "learning_rate": 3.4535834535834535e-05, + "loss": 0.4256, + "step": 875 + }, + { + "epoch": 1.136275146009085, + "grad_norm": 0.25522949186712174, + "learning_rate": 3.451178451178451e-05, + "loss": 0.4306, + "step": 876 + }, + { + "epoch": 1.1375730045425048, + "grad_norm": 0.30146067353056377, + "learning_rate": 3.448773448773449e-05, + "loss": 0.4373, + "step": 877 + }, + { + "epoch": 1.1388708630759248, + "grad_norm": 0.2821696115546317, + "learning_rate": 3.4463684463684466e-05, + "loss": 0.4262, + "step": 878 + }, + { + "epoch": 1.1401687216093446, + "grad_norm": 0.23559687187392536, + "learning_rate": 3.443963443963444e-05, + "loss": 0.4128, + "step": 879 + }, + { + "epoch": 1.1414665801427644, + "grad_norm": 0.3096443772818808, + "learning_rate": 3.4415584415584416e-05, + "loss": 0.424, + "step": 880 + }, + { + "epoch": 1.1427644386761844, + "grad_norm": 0.27131628167000316, + "learning_rate": 3.439153439153439e-05, + "loss": 0.4243, + "step": 881 + }, + { + "epoch": 1.1440622972096042, + "grad_norm": 0.3321877999366987, + "learning_rate": 3.436748436748437e-05, + "loss": 0.3962, + "step": 882 + }, + { + "epoch": 1.145360155743024, + "grad_norm": 0.252836911633108, + "learning_rate": 3.434343434343435e-05, + "loss": 0.4364, + "step": 883 + }, + { + "epoch": 1.146658014276444, + "grad_norm": 0.28472809342059574, + "learning_rate": 3.431938431938432e-05, + "loss": 0.4265, + "step": 884 + }, + { + "epoch": 1.1479558728098638, + "grad_norm": 0.25893718977038643, + "learning_rate": 3.4295334295334296e-05, + "loss": 0.4239, + "step": 885 + }, + { + "epoch": 1.1492537313432836, + "grad_norm": 0.29888823830438355, + "learning_rate": 3.427128427128427e-05, + "loss": 0.4641, + "step": 886 + }, + { + "epoch": 1.1505515898767034, + "grad_norm": 0.2336271423650757, + "learning_rate": 3.424723424723425e-05, + "loss": 0.4363, + "step": 887 + }, + { + "epoch": 1.1518494484101234, + "grad_norm": 0.28355868807379, + "learning_rate": 3.422318422318422e-05, + "loss": 0.4446, + "step": 888 + }, + { + "epoch": 1.1531473069435432, + "grad_norm": 0.25482834362533163, + "learning_rate": 3.41991341991342e-05, + "loss": 0.432, + "step": 889 + }, + { + "epoch": 1.154445165476963, + "grad_norm": 0.257058818612092, + "learning_rate": 3.417508417508418e-05, + "loss": 0.4238, + "step": 890 + }, + { + "epoch": 1.1557430240103828, + "grad_norm": 0.2964878759739716, + "learning_rate": 3.415103415103415e-05, + "loss": 0.4341, + "step": 891 + }, + { + "epoch": 1.1570408825438028, + "grad_norm": 0.24581976687613294, + "learning_rate": 3.412698412698413e-05, + "loss": 0.445, + "step": 892 + }, + { + "epoch": 1.1583387410772226, + "grad_norm": 0.2672951398900844, + "learning_rate": 3.41029341029341e-05, + "loss": 0.4146, + "step": 893 + }, + { + "epoch": 1.1596365996106424, + "grad_norm": 0.29744313702383335, + "learning_rate": 3.407888407888408e-05, + "loss": 0.4291, + "step": 894 + }, + { + "epoch": 1.1609344581440624, + "grad_norm": 0.2594025370257348, + "learning_rate": 3.405483405483406e-05, + "loss": 0.4393, + "step": 895 + }, + { + "epoch": 1.1622323166774822, + "grad_norm": 0.27322558327059043, + "learning_rate": 3.403078403078403e-05, + "loss": 0.413, + "step": 896 + }, + { + "epoch": 1.163530175210902, + "grad_norm": 0.27895427053368943, + "learning_rate": 3.400673400673401e-05, + "loss": 0.4282, + "step": 897 + }, + { + "epoch": 1.164828033744322, + "grad_norm": 0.3278143440045291, + "learning_rate": 3.398268398268398e-05, + "loss": 0.4503, + "step": 898 + }, + { + "epoch": 1.1661258922777418, + "grad_norm": 0.2878645741851875, + "learning_rate": 3.3958633958633964e-05, + "loss": 0.418, + "step": 899 + }, + { + "epoch": 1.1674237508111616, + "grad_norm": 0.279091054078343, + "learning_rate": 3.393458393458394e-05, + "loss": 0.4351, + "step": 900 + }, + { + "epoch": 1.1687216093445814, + "grad_norm": 0.300972554323965, + "learning_rate": 3.391053391053391e-05, + "loss": 0.4177, + "step": 901 + }, + { + "epoch": 1.1700194678780014, + "grad_norm": 0.2912604255538886, + "learning_rate": 3.388648388648389e-05, + "loss": 0.4239, + "step": 902 + }, + { + "epoch": 1.1713173264114212, + "grad_norm": 0.28729371845984225, + "learning_rate": 3.386243386243386e-05, + "loss": 0.4498, + "step": 903 + }, + { + "epoch": 1.172615184944841, + "grad_norm": 0.2983707424965568, + "learning_rate": 3.3838383838383844e-05, + "loss": 0.4093, + "step": 904 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.2396146655592429, + "learning_rate": 3.381433381433381e-05, + "loss": 0.4134, + "step": 905 + }, + { + "epoch": 1.1752109020116808, + "grad_norm": 0.25743340902304185, + "learning_rate": 3.3790283790283794e-05, + "loss": 0.4024, + "step": 906 + }, + { + "epoch": 1.1765087605451006, + "grad_norm": 0.27027531302973373, + "learning_rate": 3.376623376623377e-05, + "loss": 0.4518, + "step": 907 + }, + { + "epoch": 1.1778066190785204, + "grad_norm": 0.25280300819232365, + "learning_rate": 3.3742183742183743e-05, + "loss": 0.4185, + "step": 908 + }, + { + "epoch": 1.1791044776119404, + "grad_norm": 0.22682160703006224, + "learning_rate": 3.371813371813372e-05, + "loss": 0.4174, + "step": 909 + }, + { + "epoch": 1.1804023361453602, + "grad_norm": 0.23204503630025836, + "learning_rate": 3.369408369408369e-05, + "loss": 0.4177, + "step": 910 + }, + { + "epoch": 1.18170019467878, + "grad_norm": 0.25880635574030375, + "learning_rate": 3.3670033670033675e-05, + "loss": 0.4179, + "step": 911 + }, + { + "epoch": 1.1829980532122, + "grad_norm": 0.2522597708371833, + "learning_rate": 3.364598364598365e-05, + "loss": 0.4283, + "step": 912 + }, + { + "epoch": 1.1842959117456198, + "grad_norm": 0.2883869624140782, + "learning_rate": 3.3621933621933624e-05, + "loss": 0.4372, + "step": 913 + }, + { + "epoch": 1.1855937702790396, + "grad_norm": 0.25106486957221746, + "learning_rate": 3.35978835978836e-05, + "loss": 0.4281, + "step": 914 + }, + { + "epoch": 1.1868916288124594, + "grad_norm": 0.292526125260076, + "learning_rate": 3.3573833573833574e-05, + "loss": 0.4365, + "step": 915 + }, + { + "epoch": 1.1881894873458794, + "grad_norm": 0.2676690874911841, + "learning_rate": 3.3549783549783555e-05, + "loss": 0.4326, + "step": 916 + }, + { + "epoch": 1.1894873458792992, + "grad_norm": 0.26481922535161423, + "learning_rate": 3.352573352573353e-05, + "loss": 0.4274, + "step": 917 + }, + { + "epoch": 1.190785204412719, + "grad_norm": 0.2798203995195467, + "learning_rate": 3.35016835016835e-05, + "loss": 0.4368, + "step": 918 + }, + { + "epoch": 1.1920830629461387, + "grad_norm": 0.2861488916957064, + "learning_rate": 3.347763347763348e-05, + "loss": 0.4137, + "step": 919 + }, + { + "epoch": 1.1933809214795588, + "grad_norm": 0.29631412123985384, + "learning_rate": 3.3453583453583454e-05, + "loss": 0.4569, + "step": 920 + }, + { + "epoch": 1.1946787800129786, + "grad_norm": 0.2342734368682864, + "learning_rate": 3.3429533429533436e-05, + "loss": 0.4161, + "step": 921 + }, + { + "epoch": 1.1959766385463984, + "grad_norm": 0.2701444926397203, + "learning_rate": 3.3405483405483404e-05, + "loss": 0.4436, + "step": 922 + }, + { + "epoch": 1.1972744970798184, + "grad_norm": 0.24533441345926324, + "learning_rate": 3.3381433381433385e-05, + "loss": 0.4169, + "step": 923 + }, + { + "epoch": 1.1985723556132382, + "grad_norm": 0.2540671392711727, + "learning_rate": 3.335738335738336e-05, + "loss": 0.4157, + "step": 924 + }, + { + "epoch": 1.199870214146658, + "grad_norm": 0.24799812272450378, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4185, + "step": 925 + }, + { + "epoch": 1.201168072680078, + "grad_norm": 0.22598144682861446, + "learning_rate": 3.330928330928331e-05, + "loss": 0.4316, + "step": 926 + }, + { + "epoch": 1.2024659312134978, + "grad_norm": 0.2448180172195478, + "learning_rate": 3.3285233285233284e-05, + "loss": 0.4224, + "step": 927 + }, + { + "epoch": 1.2037637897469176, + "grad_norm": 0.2765575264990623, + "learning_rate": 3.3261183261183266e-05, + "loss": 0.4152, + "step": 928 + }, + { + "epoch": 1.2050616482803373, + "grad_norm": 0.2804413742079038, + "learning_rate": 3.323713323713324e-05, + "loss": 0.4268, + "step": 929 + }, + { + "epoch": 1.2063595068137574, + "grad_norm": 0.241273559139525, + "learning_rate": 3.3213083213083216e-05, + "loss": 0.44, + "step": 930 + }, + { + "epoch": 1.2076573653471772, + "grad_norm": 0.26424527656796193, + "learning_rate": 3.318903318903319e-05, + "loss": 0.423, + "step": 931 + }, + { + "epoch": 1.208955223880597, + "grad_norm": 0.2618836109205494, + "learning_rate": 3.3164983164983165e-05, + "loss": 0.4263, + "step": 932 + }, + { + "epoch": 1.210253082414017, + "grad_norm": 0.232277628692469, + "learning_rate": 3.314093314093315e-05, + "loss": 0.4285, + "step": 933 + }, + { + "epoch": 1.2115509409474368, + "grad_norm": 0.22920838397957313, + "learning_rate": 3.311688311688312e-05, + "loss": 0.4222, + "step": 934 + }, + { + "epoch": 1.2128487994808566, + "grad_norm": 0.22768841564535697, + "learning_rate": 3.309283309283309e-05, + "loss": 0.4343, + "step": 935 + }, + { + "epoch": 1.2141466580142763, + "grad_norm": 0.2503532795435805, + "learning_rate": 3.306878306878307e-05, + "loss": 0.439, + "step": 936 + }, + { + "epoch": 1.2154445165476964, + "grad_norm": 0.22461109640165802, + "learning_rate": 3.3044733044733046e-05, + "loss": 0.4188, + "step": 937 + }, + { + "epoch": 1.2167423750811162, + "grad_norm": 0.209011116297864, + "learning_rate": 3.302068302068303e-05, + "loss": 0.4095, + "step": 938 + }, + { + "epoch": 1.218040233614536, + "grad_norm": 0.2266971578892572, + "learning_rate": 3.2996632996632995e-05, + "loss": 0.4354, + "step": 939 + }, + { + "epoch": 1.219338092147956, + "grad_norm": 0.22300168989786548, + "learning_rate": 3.297258297258297e-05, + "loss": 0.4225, + "step": 940 + }, + { + "epoch": 1.2206359506813758, + "grad_norm": 0.24047207294507855, + "learning_rate": 3.294853294853295e-05, + "loss": 0.4485, + "step": 941 + }, + { + "epoch": 1.2219338092147956, + "grad_norm": 0.26188839036093997, + "learning_rate": 3.2924482924482926e-05, + "loss": 0.4338, + "step": 942 + }, + { + "epoch": 1.2232316677482156, + "grad_norm": 0.2235845694258825, + "learning_rate": 3.29004329004329e-05, + "loss": 0.4242, + "step": 943 + }, + { + "epoch": 1.2245295262816354, + "grad_norm": 0.21723162921446287, + "learning_rate": 3.2876382876382876e-05, + "loss": 0.4241, + "step": 944 + }, + { + "epoch": 1.2258273848150552, + "grad_norm": 0.25526775092171644, + "learning_rate": 3.285233285233286e-05, + "loss": 0.4253, + "step": 945 + }, + { + "epoch": 1.227125243348475, + "grad_norm": 0.20573746450142508, + "learning_rate": 3.282828282828283e-05, + "loss": 0.4479, + "step": 946 + }, + { + "epoch": 1.228423101881895, + "grad_norm": 0.2510082428750361, + "learning_rate": 3.280423280423281e-05, + "loss": 0.4347, + "step": 947 + }, + { + "epoch": 1.2297209604153148, + "grad_norm": 0.2669964257731318, + "learning_rate": 3.278018278018278e-05, + "loss": 0.4291, + "step": 948 + }, + { + "epoch": 1.2310188189487346, + "grad_norm": 0.23768286255343224, + "learning_rate": 3.275613275613276e-05, + "loss": 0.4348, + "step": 949 + }, + { + "epoch": 1.2323166774821543, + "grad_norm": 0.253301068721141, + "learning_rate": 3.273208273208274e-05, + "loss": 0.4165, + "step": 950 + }, + { + "epoch": 1.2336145360155744, + "grad_norm": 0.24211175208943894, + "learning_rate": 3.270803270803271e-05, + "loss": 0.4525, + "step": 951 + }, + { + "epoch": 1.2349123945489942, + "grad_norm": 0.20694813799256812, + "learning_rate": 3.268398268398268e-05, + "loss": 0.4353, + "step": 952 + }, + { + "epoch": 1.236210253082414, + "grad_norm": 0.24139016385768045, + "learning_rate": 3.265993265993266e-05, + "loss": 0.4255, + "step": 953 + }, + { + "epoch": 1.237508111615834, + "grad_norm": 0.23298523425827472, + "learning_rate": 3.263588263588264e-05, + "loss": 0.4453, + "step": 954 + }, + { + "epoch": 1.2388059701492538, + "grad_norm": 0.22089226803029272, + "learning_rate": 3.261183261183262e-05, + "loss": 0.4142, + "step": 955 + }, + { + "epoch": 1.2401038286826735, + "grad_norm": 0.21279087400923866, + "learning_rate": 3.258778258778259e-05, + "loss": 0.3967, + "step": 956 + }, + { + "epoch": 1.2414016872160936, + "grad_norm": 0.21365588686190673, + "learning_rate": 3.256373256373256e-05, + "loss": 0.4265, + "step": 957 + }, + { + "epoch": 1.2426995457495134, + "grad_norm": 0.2297103141691502, + "learning_rate": 3.253968253968254e-05, + "loss": 0.4247, + "step": 958 + }, + { + "epoch": 1.2439974042829332, + "grad_norm": 0.21237733020443594, + "learning_rate": 3.251563251563252e-05, + "loss": 0.4187, + "step": 959 + }, + { + "epoch": 1.245295262816353, + "grad_norm": 0.25205287449171737, + "learning_rate": 3.249158249158249e-05, + "loss": 0.4195, + "step": 960 + }, + { + "epoch": 1.246593121349773, + "grad_norm": 0.2275565335826296, + "learning_rate": 3.246753246753247e-05, + "loss": 0.4222, + "step": 961 + }, + { + "epoch": 1.2478909798831928, + "grad_norm": 0.24497964692242122, + "learning_rate": 3.244348244348244e-05, + "loss": 0.4282, + "step": 962 + }, + { + "epoch": 1.2491888384166125, + "grad_norm": 0.2154307123634933, + "learning_rate": 3.2419432419432424e-05, + "loss": 0.4354, + "step": 963 + }, + { + "epoch": 1.2504866969500323, + "grad_norm": 0.21744389583650917, + "learning_rate": 3.23953823953824e-05, + "loss": 0.4138, + "step": 964 + }, + { + "epoch": 1.2517845554834524, + "grad_norm": 0.2430698493626977, + "learning_rate": 3.237133237133237e-05, + "loss": 0.4342, + "step": 965 + }, + { + "epoch": 1.2530824140168721, + "grad_norm": 0.2426482535744958, + "learning_rate": 3.234728234728235e-05, + "loss": 0.4253, + "step": 966 + }, + { + "epoch": 1.254380272550292, + "grad_norm": 0.22495203456758703, + "learning_rate": 3.232323232323233e-05, + "loss": 0.4308, + "step": 967 + }, + { + "epoch": 1.255678131083712, + "grad_norm": 0.24355897774937213, + "learning_rate": 3.2299182299182304e-05, + "loss": 0.4284, + "step": 968 + }, + { + "epoch": 1.2569759896171318, + "grad_norm": 0.2417579603003613, + "learning_rate": 3.227513227513227e-05, + "loss": 0.4236, + "step": 969 + }, + { + "epoch": 1.2582738481505515, + "grad_norm": 0.21950714489690643, + "learning_rate": 3.2251082251082254e-05, + "loss": 0.4319, + "step": 970 + }, + { + "epoch": 1.2595717066839716, + "grad_norm": 0.248967863409087, + "learning_rate": 3.222703222703223e-05, + "loss": 0.414, + "step": 971 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.2320971622059916, + "learning_rate": 3.220298220298221e-05, + "loss": 0.4396, + "step": 972 + }, + { + "epoch": 1.2621674237508111, + "grad_norm": 0.23309901348515835, + "learning_rate": 3.217893217893218e-05, + "loss": 0.423, + "step": 973 + }, + { + "epoch": 1.263465282284231, + "grad_norm": 0.2227973619365033, + "learning_rate": 3.215488215488215e-05, + "loss": 0.4256, + "step": 974 + }, + { + "epoch": 1.264763140817651, + "grad_norm": 0.245760384681702, + "learning_rate": 3.2130832130832135e-05, + "loss": 0.4326, + "step": 975 + }, + { + "epoch": 1.2660609993510707, + "grad_norm": 0.2789130551480554, + "learning_rate": 3.210678210678211e-05, + "loss": 0.4158, + "step": 976 + }, + { + "epoch": 1.2673588578844905, + "grad_norm": 0.24466337361794802, + "learning_rate": 3.2082732082732084e-05, + "loss": 0.4132, + "step": 977 + }, + { + "epoch": 1.2686567164179103, + "grad_norm": 0.26994667632692604, + "learning_rate": 3.205868205868206e-05, + "loss": 0.4193, + "step": 978 + }, + { + "epoch": 1.2699545749513304, + "grad_norm": 0.27229427206268936, + "learning_rate": 3.2034632034632034e-05, + "loss": 0.4438, + "step": 979 + }, + { + "epoch": 1.2712524334847501, + "grad_norm": 0.2562325137645907, + "learning_rate": 3.2010582010582015e-05, + "loss": 0.4356, + "step": 980 + }, + { + "epoch": 1.27255029201817, + "grad_norm": 0.25248121832020193, + "learning_rate": 3.198653198653199e-05, + "loss": 0.4143, + "step": 981 + }, + { + "epoch": 1.27384815055159, + "grad_norm": 0.2656639346583922, + "learning_rate": 3.1962481962481965e-05, + "loss": 0.4116, + "step": 982 + }, + { + "epoch": 1.2751460090850097, + "grad_norm": 0.22210045965124164, + "learning_rate": 3.193843193843194e-05, + "loss": 0.4194, + "step": 983 + }, + { + "epoch": 1.2764438676184295, + "grad_norm": 0.29306826782606415, + "learning_rate": 3.1914381914381914e-05, + "loss": 0.4148, + "step": 984 + }, + { + "epoch": 1.2777417261518496, + "grad_norm": 0.24612787413957143, + "learning_rate": 3.1890331890331896e-05, + "loss": 0.419, + "step": 985 + }, + { + "epoch": 1.2790395846852693, + "grad_norm": 0.24144328983707405, + "learning_rate": 3.1866281866281864e-05, + "loss": 0.4412, + "step": 986 + }, + { + "epoch": 1.2803374432186891, + "grad_norm": 0.24433831385926233, + "learning_rate": 3.1842231842231846e-05, + "loss": 0.4237, + "step": 987 + }, + { + "epoch": 1.2816353017521092, + "grad_norm": 0.29652784983616687, + "learning_rate": 3.181818181818182e-05, + "loss": 0.4052, + "step": 988 + }, + { + "epoch": 1.282933160285529, + "grad_norm": 0.22517076903481237, + "learning_rate": 3.1794131794131795e-05, + "loss": 0.437, + "step": 989 + }, + { + "epoch": 1.2842310188189487, + "grad_norm": 0.2695004176632199, + "learning_rate": 3.177008177008177e-05, + "loss": 0.4051, + "step": 990 + }, + { + "epoch": 1.2855288773523685, + "grad_norm": 0.2831795101586642, + "learning_rate": 3.1746031746031745e-05, + "loss": 0.4142, + "step": 991 + }, + { + "epoch": 1.2868267358857883, + "grad_norm": 0.2305605000016992, + "learning_rate": 3.1721981721981726e-05, + "loss": 0.4318, + "step": 992 + }, + { + "epoch": 1.2881245944192083, + "grad_norm": 0.27421133904277795, + "learning_rate": 3.16979316979317e-05, + "loss": 0.4303, + "step": 993 + }, + { + "epoch": 1.2894224529526281, + "grad_norm": 0.28015539417195207, + "learning_rate": 3.1673881673881676e-05, + "loss": 0.4332, + "step": 994 + }, + { + "epoch": 1.290720311486048, + "grad_norm": 0.2456153256375182, + "learning_rate": 3.164983164983165e-05, + "loss": 0.4377, + "step": 995 + }, + { + "epoch": 1.292018170019468, + "grad_norm": 0.2875091277813538, + "learning_rate": 3.1625781625781625e-05, + "loss": 0.4306, + "step": 996 + }, + { + "epoch": 1.2933160285528877, + "grad_norm": 0.24534524749679693, + "learning_rate": 3.160173160173161e-05, + "loss": 0.4389, + "step": 997 + }, + { + "epoch": 1.2946138870863075, + "grad_norm": 0.2604745981282834, + "learning_rate": 3.1577681577681575e-05, + "loss": 0.4221, + "step": 998 + }, + { + "epoch": 1.2959117456197276, + "grad_norm": 0.2519193028255322, + "learning_rate": 3.1553631553631556e-05, + "loss": 0.4613, + "step": 999 + }, + { + "epoch": 1.2972096041531473, + "grad_norm": 0.28305211286597437, + "learning_rate": 3.152958152958153e-05, + "loss": 0.4364, + "step": 1000 + }, + { + "epoch": 1.2985074626865671, + "grad_norm": 0.31302239991262554, + "learning_rate": 3.1505531505531506e-05, + "loss": 0.4315, + "step": 1001 + }, + { + "epoch": 1.2998053212199872, + "grad_norm": 0.21929034373943385, + "learning_rate": 3.148148148148148e-05, + "loss": 0.4389, + "step": 1002 + }, + { + "epoch": 1.301103179753407, + "grad_norm": 0.33967578917657276, + "learning_rate": 3.1457431457431456e-05, + "loss": 0.4245, + "step": 1003 + }, + { + "epoch": 1.3024010382868267, + "grad_norm": 0.26773599652804664, + "learning_rate": 3.143338143338144e-05, + "loss": 0.4322, + "step": 1004 + }, + { + "epoch": 1.3036988968202465, + "grad_norm": 0.3377829113222901, + "learning_rate": 3.140933140933141e-05, + "loss": 0.4156, + "step": 1005 + }, + { + "epoch": 1.3049967553536663, + "grad_norm": 0.28733563773070486, + "learning_rate": 3.1385281385281387e-05, + "loss": 0.4153, + "step": 1006 + }, + { + "epoch": 1.3062946138870863, + "grad_norm": 0.29064269118004, + "learning_rate": 3.136123136123136e-05, + "loss": 0.4248, + "step": 1007 + }, + { + "epoch": 1.3075924724205061, + "grad_norm": 0.3578706488624722, + "learning_rate": 3.1337181337181336e-05, + "loss": 0.4103, + "step": 1008 + }, + { + "epoch": 1.308890330953926, + "grad_norm": 0.2506804986945498, + "learning_rate": 3.131313131313132e-05, + "loss": 0.4455, + "step": 1009 + }, + { + "epoch": 1.310188189487346, + "grad_norm": 0.26163932223113945, + "learning_rate": 3.128908128908129e-05, + "loss": 0.418, + "step": 1010 + }, + { + "epoch": 1.3114860480207657, + "grad_norm": 0.2968145670132288, + "learning_rate": 3.126503126503126e-05, + "loss": 0.4419, + "step": 1011 + }, + { + "epoch": 1.3127839065541855, + "grad_norm": 0.29985902672925774, + "learning_rate": 3.124098124098124e-05, + "loss": 0.4397, + "step": 1012 + }, + { + "epoch": 1.3140817650876055, + "grad_norm": 0.2693824680991638, + "learning_rate": 3.121693121693122e-05, + "loss": 0.4477, + "step": 1013 + }, + { + "epoch": 1.3153796236210253, + "grad_norm": 0.2703405621428287, + "learning_rate": 3.11928811928812e-05, + "loss": 0.4325, + "step": 1014 + }, + { + "epoch": 1.3166774821544451, + "grad_norm": 0.2721038339798775, + "learning_rate": 3.1168831168831166e-05, + "loss": 0.4373, + "step": 1015 + }, + { + "epoch": 1.3179753406878651, + "grad_norm": 0.26849320227585655, + "learning_rate": 3.114478114478115e-05, + "loss": 0.4246, + "step": 1016 + }, + { + "epoch": 1.319273199221285, + "grad_norm": 0.28294666170474586, + "learning_rate": 3.112073112073112e-05, + "loss": 0.4405, + "step": 1017 + }, + { + "epoch": 1.3205710577547047, + "grad_norm": 0.7257885246786743, + "learning_rate": 3.10966810966811e-05, + "loss": 0.419, + "step": 1018 + }, + { + "epoch": 1.3218689162881245, + "grad_norm": 0.26474834284364107, + "learning_rate": 3.107263107263107e-05, + "loss": 0.4561, + "step": 1019 + }, + { + "epoch": 1.3231667748215443, + "grad_norm": 0.2836196696373746, + "learning_rate": 3.104858104858105e-05, + "loss": 0.4187, + "step": 1020 + }, + { + "epoch": 1.3244646333549643, + "grad_norm": 0.2978444678442113, + "learning_rate": 3.102453102453103e-05, + "loss": 0.4245, + "step": 1021 + }, + { + "epoch": 1.3257624918883841, + "grad_norm": 0.27039863643097406, + "learning_rate": 3.1000481000481e-05, + "loss": 0.4492, + "step": 1022 + }, + { + "epoch": 1.327060350421804, + "grad_norm": 0.28245910629768817, + "learning_rate": 3.097643097643098e-05, + "loss": 0.4205, + "step": 1023 + }, + { + "epoch": 1.328358208955224, + "grad_norm": 0.235926922135542, + "learning_rate": 3.095238095238095e-05, + "loss": 0.4176, + "step": 1024 + }, + { + "epoch": 1.3296560674886437, + "grad_norm": 0.24582624312732296, + "learning_rate": 3.092833092833093e-05, + "loss": 0.4409, + "step": 1025 + }, + { + "epoch": 1.3309539260220635, + "grad_norm": 0.25618143979663144, + "learning_rate": 3.090428090428091e-05, + "loss": 0.4411, + "step": 1026 + }, + { + "epoch": 1.3322517845554835, + "grad_norm": 0.3044844628215498, + "learning_rate": 3.0880230880230884e-05, + "loss": 0.4628, + "step": 1027 + }, + { + "epoch": 1.3335496430889033, + "grad_norm": 0.25846935424140755, + "learning_rate": 3.085618085618085e-05, + "loss": 0.431, + "step": 1028 + }, + { + "epoch": 1.3348475016223231, + "grad_norm": 0.2299843153442688, + "learning_rate": 3.0832130832130834e-05, + "loss": 0.4091, + "step": 1029 + }, + { + "epoch": 1.3361453601557431, + "grad_norm": 0.2461059374323843, + "learning_rate": 3.080808080808081e-05, + "loss": 0.4095, + "step": 1030 + }, + { + "epoch": 1.337443218689163, + "grad_norm": 0.2591709406665044, + "learning_rate": 3.078403078403079e-05, + "loss": 0.4311, + "step": 1031 + }, + { + "epoch": 1.3387410772225827, + "grad_norm": 0.22916233734032224, + "learning_rate": 3.075998075998076e-05, + "loss": 0.4485, + "step": 1032 + }, + { + "epoch": 1.3400389357560025, + "grad_norm": 0.2600469770283129, + "learning_rate": 3.073593073593073e-05, + "loss": 0.4356, + "step": 1033 + }, + { + "epoch": 1.3413367942894223, + "grad_norm": 0.24066487258683386, + "learning_rate": 3.0711880711880714e-05, + "loss": 0.4185, + "step": 1034 + }, + { + "epoch": 1.3426346528228423, + "grad_norm": 0.27145997155071, + "learning_rate": 3.068783068783069e-05, + "loss": 0.4145, + "step": 1035 + }, + { + "epoch": 1.3439325113562621, + "grad_norm": 0.22303833825329822, + "learning_rate": 3.0663780663780664e-05, + "loss": 0.4544, + "step": 1036 + }, + { + "epoch": 1.345230369889682, + "grad_norm": 0.2631308244918525, + "learning_rate": 3.063973063973064e-05, + "loss": 0.4171, + "step": 1037 + }, + { + "epoch": 1.346528228423102, + "grad_norm": 0.24013765031743725, + "learning_rate": 3.061568061568062e-05, + "loss": 0.4162, + "step": 1038 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.2563444998106366, + "learning_rate": 3.0591630591630595e-05, + "loss": 0.4228, + "step": 1039 + }, + { + "epoch": 1.3491239454899415, + "grad_norm": 0.26755087682195894, + "learning_rate": 3.056758056758057e-05, + "loss": 0.4393, + "step": 1040 + }, + { + "epoch": 1.3504218040233615, + "grad_norm": 0.2651622032346258, + "learning_rate": 3.0543530543530544e-05, + "loss": 0.4149, + "step": 1041 + }, + { + "epoch": 1.3517196625567813, + "grad_norm": 0.25334754508930496, + "learning_rate": 3.051948051948052e-05, + "loss": 0.3989, + "step": 1042 + }, + { + "epoch": 1.3530175210902011, + "grad_norm": 0.23721003539310276, + "learning_rate": 3.04954304954305e-05, + "loss": 0.4221, + "step": 1043 + }, + { + "epoch": 1.3543153796236211, + "grad_norm": 0.23037112505555338, + "learning_rate": 3.0471380471380472e-05, + "loss": 0.4182, + "step": 1044 + }, + { + "epoch": 1.355613238157041, + "grad_norm": 0.2436111206549788, + "learning_rate": 3.0447330447330447e-05, + "loss": 0.4109, + "step": 1045 + }, + { + "epoch": 1.3569110966904607, + "grad_norm": 0.24241659874410296, + "learning_rate": 3.0423280423280425e-05, + "loss": 0.4223, + "step": 1046 + }, + { + "epoch": 1.3582089552238805, + "grad_norm": 0.25749535429929715, + "learning_rate": 3.03992303992304e-05, + "loss": 0.4182, + "step": 1047 + }, + { + "epoch": 1.3595068137573005, + "grad_norm": 0.22344908087958898, + "learning_rate": 3.0375180375180378e-05, + "loss": 0.4321, + "step": 1048 + }, + { + "epoch": 1.3608046722907203, + "grad_norm": 0.23197019372372432, + "learning_rate": 3.0351130351130353e-05, + "loss": 0.4399, + "step": 1049 + }, + { + "epoch": 1.3621025308241401, + "grad_norm": 0.237479688071686, + "learning_rate": 3.0327080327080328e-05, + "loss": 0.4395, + "step": 1050 + }, + { + "epoch": 1.36340038935756, + "grad_norm": 0.2607112703183096, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.4379, + "step": 1051 + }, + { + "epoch": 1.36469824789098, + "grad_norm": 0.2367521283120222, + "learning_rate": 3.027898027898028e-05, + "loss": 0.4102, + "step": 1052 + }, + { + "epoch": 1.3659961064243997, + "grad_norm": 0.29194250880671824, + "learning_rate": 3.025493025493026e-05, + "loss": 0.443, + "step": 1053 + }, + { + "epoch": 1.3672939649578195, + "grad_norm": 0.26648481874523033, + "learning_rate": 3.0230880230880233e-05, + "loss": 0.434, + "step": 1054 + }, + { + "epoch": 1.3685918234912395, + "grad_norm": 0.26285311775511155, + "learning_rate": 3.0206830206830205e-05, + "loss": 0.4094, + "step": 1055 + }, + { + "epoch": 1.3698896820246593, + "grad_norm": 0.2564406034453145, + "learning_rate": 3.0182780182780186e-05, + "loss": 0.4398, + "step": 1056 + }, + { + "epoch": 1.3711875405580791, + "grad_norm": 0.2796151000660689, + "learning_rate": 3.0158730158730158e-05, + "loss": 0.4276, + "step": 1057 + }, + { + "epoch": 1.3724853990914991, + "grad_norm": 0.26631625570118417, + "learning_rate": 3.013468013468014e-05, + "loss": 0.4343, + "step": 1058 + }, + { + "epoch": 1.373783257624919, + "grad_norm": 0.23806771079205116, + "learning_rate": 3.011063011063011e-05, + "loss": 0.4334, + "step": 1059 + }, + { + "epoch": 1.3750811161583387, + "grad_norm": 0.27717084828254007, + "learning_rate": 3.0086580086580092e-05, + "loss": 0.4445, + "step": 1060 + }, + { + "epoch": 1.3763789746917585, + "grad_norm": 0.25955745890460635, + "learning_rate": 3.0062530062530064e-05, + "loss": 0.4339, + "step": 1061 + }, + { + "epoch": 1.3776768332251785, + "grad_norm": 0.27833631860695396, + "learning_rate": 3.003848003848004e-05, + "loss": 0.4236, + "step": 1062 + }, + { + "epoch": 1.3789746917585983, + "grad_norm": 0.23547677910993067, + "learning_rate": 3.0014430014430017e-05, + "loss": 0.4451, + "step": 1063 + }, + { + "epoch": 1.3802725502920181, + "grad_norm": 0.2818218077693615, + "learning_rate": 2.999037999037999e-05, + "loss": 0.444, + "step": 1064 + }, + { + "epoch": 1.381570408825438, + "grad_norm": 0.22221888861761538, + "learning_rate": 2.996632996632997e-05, + "loss": 0.403, + "step": 1065 + }, + { + "epoch": 1.382868267358858, + "grad_norm": 0.2445641007844883, + "learning_rate": 2.9942279942279944e-05, + "loss": 0.4163, + "step": 1066 + }, + { + "epoch": 1.3841661258922777, + "grad_norm": 0.24596043479534688, + "learning_rate": 2.991822991822992e-05, + "loss": 0.4227, + "step": 1067 + }, + { + "epoch": 1.3854639844256975, + "grad_norm": 0.2278867553746751, + "learning_rate": 2.9894179894179897e-05, + "loss": 0.4252, + "step": 1068 + }, + { + "epoch": 1.3867618429591175, + "grad_norm": 0.2275831703154012, + "learning_rate": 2.9870129870129872e-05, + "loss": 0.4242, + "step": 1069 + }, + { + "epoch": 1.3880597014925373, + "grad_norm": 0.2792734033461531, + "learning_rate": 2.984607984607985e-05, + "loss": 0.4117, + "step": 1070 + }, + { + "epoch": 1.3893575600259571, + "grad_norm": 0.22328171573584032, + "learning_rate": 2.9822029822029825e-05, + "loss": 0.4115, + "step": 1071 + }, + { + "epoch": 1.3906554185593771, + "grad_norm": 0.2554385321095662, + "learning_rate": 2.9797979797979796e-05, + "loss": 0.4193, + "step": 1072 + }, + { + "epoch": 1.391953277092797, + "grad_norm": 0.23411526096024632, + "learning_rate": 2.9773929773929778e-05, + "loss": 0.401, + "step": 1073 + }, + { + "epoch": 1.3932511356262167, + "grad_norm": 0.2417797864860212, + "learning_rate": 2.974987974987975e-05, + "loss": 0.4175, + "step": 1074 + }, + { + "epoch": 1.3945489941596367, + "grad_norm": 0.22909915502634573, + "learning_rate": 2.972582972582973e-05, + "loss": 0.4145, + "step": 1075 + }, + { + "epoch": 1.3958468526930565, + "grad_norm": 0.24654124988803922, + "learning_rate": 2.9701779701779702e-05, + "loss": 0.4363, + "step": 1076 + }, + { + "epoch": 1.3971447112264763, + "grad_norm": 0.23841085503876774, + "learning_rate": 2.9677729677729677e-05, + "loss": 0.4263, + "step": 1077 + }, + { + "epoch": 1.3984425697598961, + "grad_norm": 0.2372620155067687, + "learning_rate": 2.9653679653679655e-05, + "loss": 0.4164, + "step": 1078 + }, + { + "epoch": 1.399740428293316, + "grad_norm": 0.2174655800712752, + "learning_rate": 2.962962962962963e-05, + "loss": 0.4131, + "step": 1079 + }, + { + "epoch": 1.401038286826736, + "grad_norm": 0.25800767555211196, + "learning_rate": 2.9605579605579608e-05, + "loss": 0.4353, + "step": 1080 + }, + { + "epoch": 1.4023361453601557, + "grad_norm": 0.22921820584725752, + "learning_rate": 2.9581529581529583e-05, + "loss": 0.4048, + "step": 1081 + }, + { + "epoch": 1.4036340038935755, + "grad_norm": 0.23402586035822698, + "learning_rate": 2.955747955747956e-05, + "loss": 0.4424, + "step": 1082 + }, + { + "epoch": 1.4049318624269955, + "grad_norm": 0.2352437377010652, + "learning_rate": 2.9533429533429536e-05, + "loss": 0.4026, + "step": 1083 + }, + { + "epoch": 1.4062297209604153, + "grad_norm": 0.2247412452603025, + "learning_rate": 2.950937950937951e-05, + "loss": 0.4336, + "step": 1084 + }, + { + "epoch": 1.4075275794938351, + "grad_norm": 0.22277806225908384, + "learning_rate": 2.948532948532949e-05, + "loss": 0.4114, + "step": 1085 + }, + { + "epoch": 1.4088254380272551, + "grad_norm": 0.241538295583898, + "learning_rate": 2.946127946127946e-05, + "loss": 0.4394, + "step": 1086 + }, + { + "epoch": 1.410123296560675, + "grad_norm": 0.2610774457770794, + "learning_rate": 2.943722943722944e-05, + "loss": 0.4078, + "step": 1087 + }, + { + "epoch": 1.4114211550940947, + "grad_norm": 0.28762989292521274, + "learning_rate": 2.9413179413179413e-05, + "loss": 0.4109, + "step": 1088 + }, + { + "epoch": 1.4127190136275147, + "grad_norm": 0.22764926899208376, + "learning_rate": 2.9389129389129388e-05, + "loss": 0.4113, + "step": 1089 + }, + { + "epoch": 1.4140168721609345, + "grad_norm": 0.2951748901233817, + "learning_rate": 2.9365079365079366e-05, + "loss": 0.4112, + "step": 1090 + }, + { + "epoch": 1.4153147306943543, + "grad_norm": 0.2491523290558, + "learning_rate": 2.934102934102934e-05, + "loss": 0.4061, + "step": 1091 + }, + { + "epoch": 1.416612589227774, + "grad_norm": 0.24774585578620387, + "learning_rate": 2.931697931697932e-05, + "loss": 0.4208, + "step": 1092 + }, + { + "epoch": 1.417910447761194, + "grad_norm": 0.2383954293588994, + "learning_rate": 2.9292929292929294e-05, + "loss": 0.4243, + "step": 1093 + }, + { + "epoch": 1.419208306294614, + "grad_norm": 0.2551439406874695, + "learning_rate": 2.926887926887927e-05, + "loss": 0.4254, + "step": 1094 + }, + { + "epoch": 1.4205061648280337, + "grad_norm": 0.24117464674091157, + "learning_rate": 2.9244829244829247e-05, + "loss": 0.4082, + "step": 1095 + }, + { + "epoch": 1.4218040233614535, + "grad_norm": 0.216503714519975, + "learning_rate": 2.922077922077922e-05, + "loss": 0.4242, + "step": 1096 + }, + { + "epoch": 1.4231018818948735, + "grad_norm": 0.2422351302114647, + "learning_rate": 2.91967291967292e-05, + "loss": 0.4181, + "step": 1097 + }, + { + "epoch": 1.4243997404282933, + "grad_norm": 0.22788810318626124, + "learning_rate": 2.9172679172679174e-05, + "loss": 0.4203, + "step": 1098 + }, + { + "epoch": 1.425697598961713, + "grad_norm": 0.24325054021695863, + "learning_rate": 2.9148629148629146e-05, + "loss": 0.423, + "step": 1099 + }, + { + "epoch": 1.4269954574951331, + "grad_norm": 0.23344217723128893, + "learning_rate": 2.9124579124579127e-05, + "loss": 0.4162, + "step": 1100 + }, + { + "epoch": 1.428293316028553, + "grad_norm": 0.2543879197815251, + "learning_rate": 2.91005291005291e-05, + "loss": 0.4376, + "step": 1101 + }, + { + "epoch": 1.4295911745619727, + "grad_norm": 0.2501682851968916, + "learning_rate": 2.907647907647908e-05, + "loss": 0.4164, + "step": 1102 + }, + { + "epoch": 1.4308890330953927, + "grad_norm": 0.21492688667239696, + "learning_rate": 2.905242905242905e-05, + "loss": 0.4113, + "step": 1103 + }, + { + "epoch": 1.4321868916288125, + "grad_norm": 0.2658354623358409, + "learning_rate": 2.9028379028379033e-05, + "loss": 0.4315, + "step": 1104 + }, + { + "epoch": 1.4334847501622323, + "grad_norm": 0.2831492165913101, + "learning_rate": 2.9004329004329005e-05, + "loss": 0.4338, + "step": 1105 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.22471574523786844, + "learning_rate": 2.898027898027898e-05, + "loss": 0.4157, + "step": 1106 + }, + { + "epoch": 1.436080467229072, + "grad_norm": 0.24385415494263882, + "learning_rate": 2.8956228956228958e-05, + "loss": 0.4172, + "step": 1107 + }, + { + "epoch": 1.437378325762492, + "grad_norm": 0.2722198360023313, + "learning_rate": 2.8932178932178932e-05, + "loss": 0.407, + "step": 1108 + }, + { + "epoch": 1.4386761842959117, + "grad_norm": 0.21019395837235147, + "learning_rate": 2.890812890812891e-05, + "loss": 0.4189, + "step": 1109 + }, + { + "epoch": 1.4399740428293315, + "grad_norm": 0.23984115871051997, + "learning_rate": 2.8884078884078885e-05, + "loss": 0.4261, + "step": 1110 + }, + { + "epoch": 1.4412719013627515, + "grad_norm": 0.25738974208155574, + "learning_rate": 2.886002886002886e-05, + "loss": 0.4694, + "step": 1111 + }, + { + "epoch": 1.4425697598961713, + "grad_norm": 0.2739742760545878, + "learning_rate": 2.8835978835978838e-05, + "loss": 0.4284, + "step": 1112 + }, + { + "epoch": 1.443867618429591, + "grad_norm": 0.2563615388623274, + "learning_rate": 2.8811928811928813e-05, + "loss": 0.4187, + "step": 1113 + }, + { + "epoch": 1.4451654769630111, + "grad_norm": 0.2355873474417628, + "learning_rate": 2.878787878787879e-05, + "loss": 0.4201, + "step": 1114 + }, + { + "epoch": 1.446463335496431, + "grad_norm": 0.3037078461896459, + "learning_rate": 2.8763828763828766e-05, + "loss": 0.4164, + "step": 1115 + }, + { + "epoch": 1.4477611940298507, + "grad_norm": 0.2778889853166693, + "learning_rate": 2.8739778739778737e-05, + "loss": 0.4263, + "step": 1116 + }, + { + "epoch": 1.4490590525632707, + "grad_norm": 0.25304453875189337, + "learning_rate": 2.871572871572872e-05, + "loss": 0.4238, + "step": 1117 + }, + { + "epoch": 1.4503569110966905, + "grad_norm": 0.2617845594600046, + "learning_rate": 2.869167869167869e-05, + "loss": 0.4325, + "step": 1118 + }, + { + "epoch": 1.4516547696301103, + "grad_norm": 0.28455794197858575, + "learning_rate": 2.8667628667628672e-05, + "loss": 0.4156, + "step": 1119 + }, + { + "epoch": 1.45295262816353, + "grad_norm": 0.23740191563596869, + "learning_rate": 2.8643578643578643e-05, + "loss": 0.4261, + "step": 1120 + }, + { + "epoch": 1.45425048669695, + "grad_norm": 0.24072843737266889, + "learning_rate": 2.8619528619528618e-05, + "loss": 0.4151, + "step": 1121 + }, + { + "epoch": 1.45554834523037, + "grad_norm": 0.2700405419220064, + "learning_rate": 2.8595478595478596e-05, + "loss": 0.4252, + "step": 1122 + }, + { + "epoch": 1.4568462037637897, + "grad_norm": 0.2488359814096937, + "learning_rate": 2.857142857142857e-05, + "loss": 0.4428, + "step": 1123 + }, + { + "epoch": 1.4581440622972095, + "grad_norm": 0.23487079361910798, + "learning_rate": 2.854737854737855e-05, + "loss": 0.4216, + "step": 1124 + }, + { + "epoch": 1.4594419208306295, + "grad_norm": 0.2466576788103327, + "learning_rate": 2.8523328523328524e-05, + "loss": 0.4226, + "step": 1125 + }, + { + "epoch": 1.4607397793640493, + "grad_norm": 0.2391996649480345, + "learning_rate": 2.8499278499278502e-05, + "loss": 0.4096, + "step": 1126 + }, + { + "epoch": 1.462037637897469, + "grad_norm": 0.24258796808928063, + "learning_rate": 2.8475228475228477e-05, + "loss": 0.4145, + "step": 1127 + }, + { + "epoch": 1.4633354964308891, + "grad_norm": 0.2560406117346898, + "learning_rate": 2.845117845117845e-05, + "loss": 0.4312, + "step": 1128 + }, + { + "epoch": 1.464633354964309, + "grad_norm": 0.2861667925744788, + "learning_rate": 2.842712842712843e-05, + "loss": 0.4328, + "step": 1129 + }, + { + "epoch": 1.4659312134977287, + "grad_norm": 0.25402106108095945, + "learning_rate": 2.8403078403078404e-05, + "loss": 0.4258, + "step": 1130 + }, + { + "epoch": 1.4672290720311487, + "grad_norm": 0.24075563965908323, + "learning_rate": 2.8379028379028383e-05, + "loss": 0.4397, + "step": 1131 + }, + { + "epoch": 1.4685269305645685, + "grad_norm": 0.28522807177447185, + "learning_rate": 2.8354978354978357e-05, + "loss": 0.4303, + "step": 1132 + }, + { + "epoch": 1.4698247890979883, + "grad_norm": 0.26004049344709895, + "learning_rate": 2.833092833092833e-05, + "loss": 0.4294, + "step": 1133 + }, + { + "epoch": 1.471122647631408, + "grad_norm": 0.29853418709486346, + "learning_rate": 2.830687830687831e-05, + "loss": 0.4323, + "step": 1134 + }, + { + "epoch": 1.4724205061648281, + "grad_norm": 0.2633187635030568, + "learning_rate": 2.8282828282828282e-05, + "loss": 0.439, + "step": 1135 + }, + { + "epoch": 1.473718364698248, + "grad_norm": 0.3157910451013907, + "learning_rate": 2.8258778258778263e-05, + "loss": 0.4337, + "step": 1136 + }, + { + "epoch": 1.4750162232316677, + "grad_norm": 0.27203466331005977, + "learning_rate": 2.8234728234728235e-05, + "loss": 0.4198, + "step": 1137 + }, + { + "epoch": 1.4763140817650875, + "grad_norm": 0.2508023724498563, + "learning_rate": 2.821067821067821e-05, + "loss": 0.4267, + "step": 1138 + }, + { + "epoch": 1.4776119402985075, + "grad_norm": 0.2808852939089359, + "learning_rate": 2.8186628186628188e-05, + "loss": 0.4281, + "step": 1139 + }, + { + "epoch": 1.4789097988319273, + "grad_norm": 0.24576060851892864, + "learning_rate": 2.8162578162578162e-05, + "loss": 0.4206, + "step": 1140 + }, + { + "epoch": 1.480207657365347, + "grad_norm": 0.29052064234787545, + "learning_rate": 2.813852813852814e-05, + "loss": 0.413, + "step": 1141 + }, + { + "epoch": 1.4815055158987671, + "grad_norm": 0.23398439998245094, + "learning_rate": 2.8114478114478115e-05, + "loss": 0.4352, + "step": 1142 + }, + { + "epoch": 1.482803374432187, + "grad_norm": 0.25678491256047153, + "learning_rate": 2.809042809042809e-05, + "loss": 0.4346, + "step": 1143 + }, + { + "epoch": 1.4841012329656067, + "grad_norm": 0.2826048101734635, + "learning_rate": 2.8066378066378068e-05, + "loss": 0.4262, + "step": 1144 + }, + { + "epoch": 1.4853990914990267, + "grad_norm": 0.25015775061708817, + "learning_rate": 2.8042328042328043e-05, + "loss": 0.4129, + "step": 1145 + }, + { + "epoch": 1.4866969500324465, + "grad_norm": 0.2561977557181458, + "learning_rate": 2.801827801827802e-05, + "loss": 0.417, + "step": 1146 + }, + { + "epoch": 1.4879948085658663, + "grad_norm": 0.25036370521793533, + "learning_rate": 2.7994227994227996e-05, + "loss": 0.4079, + "step": 1147 + }, + { + "epoch": 1.4892926670992863, + "grad_norm": 0.28901223175805674, + "learning_rate": 2.7970177970177974e-05, + "loss": 0.4094, + "step": 1148 + }, + { + "epoch": 1.490590525632706, + "grad_norm": 0.23134484811007663, + "learning_rate": 2.794612794612795e-05, + "loss": 0.4476, + "step": 1149 + }, + { + "epoch": 1.491888384166126, + "grad_norm": 0.25137689970727467, + "learning_rate": 2.792207792207792e-05, + "loss": 0.4416, + "step": 1150 + }, + { + "epoch": 1.4931862426995457, + "grad_norm": 0.2524284266331274, + "learning_rate": 2.7898027898027902e-05, + "loss": 0.4295, + "step": 1151 + }, + { + "epoch": 1.4944841012329655, + "grad_norm": 0.22266682751444122, + "learning_rate": 2.7873977873977873e-05, + "loss": 0.4115, + "step": 1152 + }, + { + "epoch": 1.4957819597663855, + "grad_norm": 0.2085505106465029, + "learning_rate": 2.7849927849927855e-05, + "loss": 0.4271, + "step": 1153 + }, + { + "epoch": 1.4970798182998053, + "grad_norm": 0.2352572065506912, + "learning_rate": 2.7825877825877826e-05, + "loss": 0.4129, + "step": 1154 + }, + { + "epoch": 1.498377676833225, + "grad_norm": 0.2322270923460416, + "learning_rate": 2.78018278018278e-05, + "loss": 0.4404, + "step": 1155 + }, + { + "epoch": 1.499675535366645, + "grad_norm": 0.20327079840186968, + "learning_rate": 2.777777777777778e-05, + "loss": 0.3992, + "step": 1156 + }, + { + "epoch": 1.500973393900065, + "grad_norm": 0.22409767153079405, + "learning_rate": 2.7753727753727754e-05, + "loss": 0.4351, + "step": 1157 + }, + { + "epoch": 1.5022712524334847, + "grad_norm": 0.21789564363803948, + "learning_rate": 2.7729677729677732e-05, + "loss": 0.4263, + "step": 1158 + }, + { + "epoch": 1.5035691109669047, + "grad_norm": 0.23289144485137522, + "learning_rate": 2.7705627705627707e-05, + "loss": 0.4296, + "step": 1159 + }, + { + "epoch": 1.5048669695003245, + "grad_norm": 0.22790992420912343, + "learning_rate": 2.768157768157768e-05, + "loss": 0.4275, + "step": 1160 + }, + { + "epoch": 1.5061648280337443, + "grad_norm": 0.2180550660231808, + "learning_rate": 2.765752765752766e-05, + "loss": 0.4242, + "step": 1161 + }, + { + "epoch": 1.5074626865671643, + "grad_norm": 0.23490836544769822, + "learning_rate": 2.7633477633477635e-05, + "loss": 0.43, + "step": 1162 + }, + { + "epoch": 1.5087605451005839, + "grad_norm": 0.22192430223007012, + "learning_rate": 2.7609427609427613e-05, + "loss": 0.429, + "step": 1163 + }, + { + "epoch": 1.510058403634004, + "grad_norm": 0.2311428363766677, + "learning_rate": 2.7585377585377587e-05, + "loss": 0.4414, + "step": 1164 + }, + { + "epoch": 1.511356262167424, + "grad_norm": 0.2001408824188442, + "learning_rate": 2.756132756132756e-05, + "loss": 0.4225, + "step": 1165 + }, + { + "epoch": 1.5126541207008435, + "grad_norm": 0.2279619365636095, + "learning_rate": 2.753727753727754e-05, + "loss": 0.4149, + "step": 1166 + }, + { + "epoch": 1.5139519792342635, + "grad_norm": 0.24302952170040615, + "learning_rate": 2.7513227513227512e-05, + "loss": 0.4153, + "step": 1167 + }, + { + "epoch": 1.5152498377676833, + "grad_norm": 0.2252876935922963, + "learning_rate": 2.7489177489177493e-05, + "loss": 0.4135, + "step": 1168 + }, + { + "epoch": 1.516547696301103, + "grad_norm": 0.22162900859128726, + "learning_rate": 2.7465127465127465e-05, + "loss": 0.4283, + "step": 1169 + }, + { + "epoch": 1.517845554834523, + "grad_norm": 0.23728269459202284, + "learning_rate": 2.7441077441077446e-05, + "loss": 0.4284, + "step": 1170 + }, + { + "epoch": 1.519143413367943, + "grad_norm": 0.2073068726020532, + "learning_rate": 2.7417027417027418e-05, + "loss": 0.419, + "step": 1171 + }, + { + "epoch": 1.5204412719013627, + "grad_norm": 0.2308870482056988, + "learning_rate": 2.7392977392977392e-05, + "loss": 0.4409, + "step": 1172 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.21898533880032697, + "learning_rate": 2.736892736892737e-05, + "loss": 0.4171, + "step": 1173 + }, + { + "epoch": 1.5230369889682025, + "grad_norm": 0.21000995819843474, + "learning_rate": 2.7344877344877345e-05, + "loss": 0.417, + "step": 1174 + }, + { + "epoch": 1.5243348475016223, + "grad_norm": 0.2150245170655777, + "learning_rate": 2.7320827320827324e-05, + "loss": 0.4365, + "step": 1175 + }, + { + "epoch": 1.5256327060350423, + "grad_norm": 0.24290565598308295, + "learning_rate": 2.72967772967773e-05, + "loss": 0.4201, + "step": 1176 + }, + { + "epoch": 1.5269305645684619, + "grad_norm": 0.2304464719146474, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.4149, + "step": 1177 + }, + { + "epoch": 1.528228423101882, + "grad_norm": 0.23523933221515506, + "learning_rate": 2.724867724867725e-05, + "loss": 0.453, + "step": 1178 + }, + { + "epoch": 1.529526281635302, + "grad_norm": 0.2253825255615944, + "learning_rate": 2.7224627224627226e-05, + "loss": 0.4209, + "step": 1179 + }, + { + "epoch": 1.5308241401687215, + "grad_norm": 0.2742775834013937, + "learning_rate": 2.7200577200577204e-05, + "loss": 0.442, + "step": 1180 + }, + { + "epoch": 1.5321219987021415, + "grad_norm": 0.2176528388600847, + "learning_rate": 2.717652717652718e-05, + "loss": 0.4329, + "step": 1181 + }, + { + "epoch": 1.5334198572355613, + "grad_norm": 0.23818415433225926, + "learning_rate": 2.715247715247715e-05, + "loss": 0.4187, + "step": 1182 + }, + { + "epoch": 1.534717715768981, + "grad_norm": 0.26109881547859903, + "learning_rate": 2.7128427128427132e-05, + "loss": 0.4251, + "step": 1183 + }, + { + "epoch": 1.536015574302401, + "grad_norm": 0.2196942384869763, + "learning_rate": 2.7104377104377103e-05, + "loss": 0.418, + "step": 1184 + }, + { + "epoch": 1.537313432835821, + "grad_norm": 0.2400322015222156, + "learning_rate": 2.7080327080327085e-05, + "loss": 0.4109, + "step": 1185 + }, + { + "epoch": 1.5386112913692407, + "grad_norm": 0.23150552647711828, + "learning_rate": 2.7056277056277056e-05, + "loss": 0.4264, + "step": 1186 + }, + { + "epoch": 1.5399091499026607, + "grad_norm": 0.22005403208488783, + "learning_rate": 2.703222703222703e-05, + "loss": 0.4039, + "step": 1187 + }, + { + "epoch": 1.5412070084360805, + "grad_norm": 0.22581597634393283, + "learning_rate": 2.700817700817701e-05, + "loss": 0.425, + "step": 1188 + }, + { + "epoch": 1.5425048669695003, + "grad_norm": 0.2382695341310579, + "learning_rate": 2.6984126984126984e-05, + "loss": 0.4496, + "step": 1189 + }, + { + "epoch": 1.5438027255029203, + "grad_norm": 0.2203961917107305, + "learning_rate": 2.6960076960076962e-05, + "loss": 0.4155, + "step": 1190 + }, + { + "epoch": 1.5451005840363399, + "grad_norm": 0.25210372953982285, + "learning_rate": 2.6936026936026937e-05, + "loss": 0.4342, + "step": 1191 + }, + { + "epoch": 1.54639844256976, + "grad_norm": 0.238604523146027, + "learning_rate": 2.691197691197691e-05, + "loss": 0.4323, + "step": 1192 + }, + { + "epoch": 1.54769630110318, + "grad_norm": 0.23138471132633792, + "learning_rate": 2.688792688792689e-05, + "loss": 0.4242, + "step": 1193 + }, + { + "epoch": 1.5489941596365995, + "grad_norm": 0.2320529813667351, + "learning_rate": 2.6863876863876865e-05, + "loss": 0.4363, + "step": 1194 + }, + { + "epoch": 1.5502920181700195, + "grad_norm": 0.22679612862184145, + "learning_rate": 2.6839826839826843e-05, + "loss": 0.4253, + "step": 1195 + }, + { + "epoch": 1.5515898767034393, + "grad_norm": 0.2665688161045152, + "learning_rate": 2.6815776815776818e-05, + "loss": 0.4222, + "step": 1196 + }, + { + "epoch": 1.552887735236859, + "grad_norm": 0.21178913986030537, + "learning_rate": 2.6791726791726796e-05, + "loss": 0.422, + "step": 1197 + }, + { + "epoch": 1.554185593770279, + "grad_norm": 0.24464931528999015, + "learning_rate": 2.676767676767677e-05, + "loss": 0.4241, + "step": 1198 + }, + { + "epoch": 1.5554834523036989, + "grad_norm": 0.22319718290311183, + "learning_rate": 2.6743626743626742e-05, + "loss": 0.4168, + "step": 1199 + }, + { + "epoch": 1.5567813108371187, + "grad_norm": 0.2302808693777694, + "learning_rate": 2.6719576719576723e-05, + "loss": 0.41, + "step": 1200 + }, + { + "epoch": 1.5580791693705387, + "grad_norm": 0.2317544115600513, + "learning_rate": 2.6695526695526695e-05, + "loss": 0.4555, + "step": 1201 + }, + { + "epoch": 1.5593770279039585, + "grad_norm": 0.2554067046842974, + "learning_rate": 2.6671476671476676e-05, + "loss": 0.4075, + "step": 1202 + }, + { + "epoch": 1.5606748864373783, + "grad_norm": 0.23832375199078534, + "learning_rate": 2.6647426647426648e-05, + "loss": 0.4149, + "step": 1203 + }, + { + "epoch": 1.5619727449707983, + "grad_norm": 0.2387421511606967, + "learning_rate": 2.6623376623376623e-05, + "loss": 0.4349, + "step": 1204 + }, + { + "epoch": 1.563270603504218, + "grad_norm": 0.24466721743899011, + "learning_rate": 2.65993265993266e-05, + "loss": 0.4158, + "step": 1205 + }, + { + "epoch": 1.5645684620376379, + "grad_norm": 0.23174513616055498, + "learning_rate": 2.6575276575276575e-05, + "loss": 0.4443, + "step": 1206 + }, + { + "epoch": 1.565866320571058, + "grad_norm": 0.218582346579111, + "learning_rate": 2.6551226551226554e-05, + "loss": 0.4228, + "step": 1207 + }, + { + "epoch": 1.5671641791044775, + "grad_norm": 0.23236180907143378, + "learning_rate": 2.652717652717653e-05, + "loss": 0.4198, + "step": 1208 + }, + { + "epoch": 1.5684620376378975, + "grad_norm": 0.2461597550351122, + "learning_rate": 2.6503126503126503e-05, + "loss": 0.4388, + "step": 1209 + }, + { + "epoch": 1.5697598961713173, + "grad_norm": 0.24135274528584466, + "learning_rate": 2.647907647907648e-05, + "loss": 0.4182, + "step": 1210 + }, + { + "epoch": 1.571057754704737, + "grad_norm": 0.23011430180334824, + "learning_rate": 2.6455026455026456e-05, + "loss": 0.4345, + "step": 1211 + }, + { + "epoch": 1.572355613238157, + "grad_norm": 0.25813925411615873, + "learning_rate": 2.6430976430976434e-05, + "loss": 0.4152, + "step": 1212 + }, + { + "epoch": 1.5736534717715769, + "grad_norm": 0.2361569395941438, + "learning_rate": 2.640692640692641e-05, + "loss": 0.4107, + "step": 1213 + }, + { + "epoch": 1.5749513303049967, + "grad_norm": 0.26363884372789825, + "learning_rate": 2.638287638287638e-05, + "loss": 0.4392, + "step": 1214 + }, + { + "epoch": 1.5762491888384167, + "grad_norm": 0.24244610329485705, + "learning_rate": 2.6358826358826362e-05, + "loss": 0.4164, + "step": 1215 + }, + { + "epoch": 1.5775470473718365, + "grad_norm": 0.2552987758465308, + "learning_rate": 2.6334776334776333e-05, + "loss": 0.4339, + "step": 1216 + }, + { + "epoch": 1.5788449059052563, + "grad_norm": 0.2622601300659554, + "learning_rate": 2.6310726310726315e-05, + "loss": 0.4081, + "step": 1217 + }, + { + "epoch": 1.5801427644386763, + "grad_norm": 0.23435950487013313, + "learning_rate": 2.6286676286676286e-05, + "loss": 0.4266, + "step": 1218 + }, + { + "epoch": 1.581440622972096, + "grad_norm": 0.31150362868262865, + "learning_rate": 2.6262626262626268e-05, + "loss": 0.4205, + "step": 1219 + }, + { + "epoch": 1.5827384815055159, + "grad_norm": 0.2356568945579236, + "learning_rate": 2.623857623857624e-05, + "loss": 0.4235, + "step": 1220 + }, + { + "epoch": 1.584036340038936, + "grad_norm": 0.2636851026847217, + "learning_rate": 2.6214526214526214e-05, + "loss": 0.4194, + "step": 1221 + }, + { + "epoch": 1.5853341985723555, + "grad_norm": 0.2609824789762705, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.4386, + "step": 1222 + }, + { + "epoch": 1.5866320571057755, + "grad_norm": 0.2503475112982072, + "learning_rate": 2.6166426166426167e-05, + "loss": 0.4295, + "step": 1223 + }, + { + "epoch": 1.5879299156391953, + "grad_norm": 0.2748789264904923, + "learning_rate": 2.6142376142376145e-05, + "loss": 0.431, + "step": 1224 + }, + { + "epoch": 1.589227774172615, + "grad_norm": 0.2122856536086439, + "learning_rate": 2.611832611832612e-05, + "loss": 0.4151, + "step": 1225 + }, + { + "epoch": 1.590525632706035, + "grad_norm": 0.2882371321327433, + "learning_rate": 2.6094276094276095e-05, + "loss": 0.4242, + "step": 1226 + }, + { + "epoch": 1.5918234912394549, + "grad_norm": 0.22024360438567706, + "learning_rate": 2.6070226070226073e-05, + "loss": 0.4173, + "step": 1227 + }, + { + "epoch": 1.5931213497728747, + "grad_norm": 0.23708353175014626, + "learning_rate": 2.6046176046176048e-05, + "loss": 0.4251, + "step": 1228 + }, + { + "epoch": 1.5944192083062947, + "grad_norm": 0.2658200863217972, + "learning_rate": 2.6022126022126026e-05, + "loss": 0.4408, + "step": 1229 + }, + { + "epoch": 1.5957170668397145, + "grad_norm": 0.21583066555363375, + "learning_rate": 2.5998075998076e-05, + "loss": 0.4191, + "step": 1230 + }, + { + "epoch": 1.5970149253731343, + "grad_norm": 0.2777242614566809, + "learning_rate": 2.5974025974025972e-05, + "loss": 0.4393, + "step": 1231 + }, + { + "epoch": 1.5983127839065543, + "grad_norm": 0.23219187892619703, + "learning_rate": 2.5949975949975954e-05, + "loss": 0.4265, + "step": 1232 + }, + { + "epoch": 1.599610642439974, + "grad_norm": 0.29387387169794943, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.4244, + "step": 1233 + }, + { + "epoch": 1.6009085009733939, + "grad_norm": 0.22151955032464254, + "learning_rate": 2.5901875901875906e-05, + "loss": 0.4085, + "step": 1234 + }, + { + "epoch": 1.602206359506814, + "grad_norm": 0.24242248455059523, + "learning_rate": 2.5877825877825878e-05, + "loss": 0.4285, + "step": 1235 + }, + { + "epoch": 1.6035042180402335, + "grad_norm": 0.2621217435997206, + "learning_rate": 2.5853775853775853e-05, + "loss": 0.4379, + "step": 1236 + }, + { + "epoch": 1.6048020765736535, + "grad_norm": 0.22823804591889496, + "learning_rate": 2.582972582972583e-05, + "loss": 0.4211, + "step": 1237 + }, + { + "epoch": 1.6060999351070735, + "grad_norm": 0.2353758128022499, + "learning_rate": 2.5805675805675806e-05, + "loss": 0.4127, + "step": 1238 + }, + { + "epoch": 1.607397793640493, + "grad_norm": 0.22811130965496038, + "learning_rate": 2.5781625781625784e-05, + "loss": 0.4246, + "step": 1239 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.2366434232412805, + "learning_rate": 2.575757575757576e-05, + "loss": 0.4109, + "step": 1240 + }, + { + "epoch": 1.6099935107073329, + "grad_norm": 0.20375567441674386, + "learning_rate": 2.5733525733525737e-05, + "loss": 0.4146, + "step": 1241 + }, + { + "epoch": 1.6112913692407527, + "grad_norm": 0.2436449942466404, + "learning_rate": 2.570947570947571e-05, + "loss": 0.4162, + "step": 1242 + }, + { + "epoch": 1.6125892277741727, + "grad_norm": 0.22023021001348508, + "learning_rate": 2.5685425685425686e-05, + "loss": 0.4136, + "step": 1243 + }, + { + "epoch": 1.6138870863075925, + "grad_norm": 0.2069116265186359, + "learning_rate": 2.5661375661375664e-05, + "loss": 0.3999, + "step": 1244 + }, + { + "epoch": 1.6151849448410123, + "grad_norm": 0.24450308671714907, + "learning_rate": 2.563732563732564e-05, + "loss": 0.4352, + "step": 1245 + }, + { + "epoch": 1.6164828033744323, + "grad_norm": 0.2361666753423955, + "learning_rate": 2.5613275613275617e-05, + "loss": 0.4215, + "step": 1246 + }, + { + "epoch": 1.617780661907852, + "grad_norm": 0.24709753794900446, + "learning_rate": 2.5589225589225592e-05, + "loss": 0.4095, + "step": 1247 + }, + { + "epoch": 1.6190785204412719, + "grad_norm": 0.22770211218246428, + "learning_rate": 2.5565175565175563e-05, + "loss": 0.4208, + "step": 1248 + }, + { + "epoch": 1.6203763789746919, + "grad_norm": 0.24442985342584414, + "learning_rate": 2.5541125541125545e-05, + "loss": 0.4048, + "step": 1249 + }, + { + "epoch": 1.6216742375081115, + "grad_norm": 0.2449341182023967, + "learning_rate": 2.5517075517075516e-05, + "loss": 0.429, + "step": 1250 + }, + { + "epoch": 1.6229720960415315, + "grad_norm": 0.22314422338157636, + "learning_rate": 2.5493025493025498e-05, + "loss": 0.4161, + "step": 1251 + }, + { + "epoch": 1.6242699545749515, + "grad_norm": 0.22271710889727703, + "learning_rate": 2.546897546897547e-05, + "loss": 0.4223, + "step": 1252 + }, + { + "epoch": 1.625567813108371, + "grad_norm": 0.23943855813232637, + "learning_rate": 2.5444925444925444e-05, + "loss": 0.4263, + "step": 1253 + }, + { + "epoch": 1.626865671641791, + "grad_norm": 0.22346829290932305, + "learning_rate": 2.5420875420875422e-05, + "loss": 0.4002, + "step": 1254 + }, + { + "epoch": 1.6281635301752109, + "grad_norm": 0.21819410830608127, + "learning_rate": 2.5396825396825397e-05, + "loss": 0.4228, + "step": 1255 + }, + { + "epoch": 1.6294613887086307, + "grad_norm": 0.2487542450136884, + "learning_rate": 2.5372775372775375e-05, + "loss": 0.431, + "step": 1256 + }, + { + "epoch": 1.6307592472420507, + "grad_norm": 0.22276858066653343, + "learning_rate": 2.534872534872535e-05, + "loss": 0.3975, + "step": 1257 + }, + { + "epoch": 1.6320571057754705, + "grad_norm": 0.20406534653386582, + "learning_rate": 2.5324675324675325e-05, + "loss": 0.4308, + "step": 1258 + }, + { + "epoch": 1.6333549643088903, + "grad_norm": 0.2369459882014465, + "learning_rate": 2.5300625300625303e-05, + "loss": 0.4434, + "step": 1259 + }, + { + "epoch": 1.6346528228423103, + "grad_norm": 0.23054872564198348, + "learning_rate": 2.5276575276575278e-05, + "loss": 0.4296, + "step": 1260 + }, + { + "epoch": 1.63595068137573, + "grad_norm": 0.21314688817002478, + "learning_rate": 2.5252525252525256e-05, + "loss": 0.4234, + "step": 1261 + }, + { + "epoch": 1.6372485399091499, + "grad_norm": 0.22937591574682323, + "learning_rate": 2.522847522847523e-05, + "loss": 0.425, + "step": 1262 + }, + { + "epoch": 1.6385463984425699, + "grad_norm": 0.23974213218799267, + "learning_rate": 2.520442520442521e-05, + "loss": 0.4393, + "step": 1263 + }, + { + "epoch": 1.6398442569759895, + "grad_norm": 0.23441342590653153, + "learning_rate": 2.5180375180375184e-05, + "loss": 0.4474, + "step": 1264 + }, + { + "epoch": 1.6411421155094095, + "grad_norm": 0.22460634450789943, + "learning_rate": 2.5156325156325155e-05, + "loss": 0.4405, + "step": 1265 + }, + { + "epoch": 1.6424399740428295, + "grad_norm": 0.21099257965853147, + "learning_rate": 2.5132275132275137e-05, + "loss": 0.4147, + "step": 1266 + }, + { + "epoch": 1.643737832576249, + "grad_norm": 0.24160346011701583, + "learning_rate": 2.5108225108225108e-05, + "loss": 0.4397, + "step": 1267 + }, + { + "epoch": 1.645035691109669, + "grad_norm": 0.21504387068528427, + "learning_rate": 2.5084175084175086e-05, + "loss": 0.4134, + "step": 1268 + }, + { + "epoch": 1.6463335496430889, + "grad_norm": 0.20136235310740322, + "learning_rate": 2.506012506012506e-05, + "loss": 0.4352, + "step": 1269 + }, + { + "epoch": 1.6476314081765087, + "grad_norm": 0.20297036044525715, + "learning_rate": 2.5036075036075036e-05, + "loss": 0.4181, + "step": 1270 + }, + { + "epoch": 1.6489292667099287, + "grad_norm": 0.22303019507601843, + "learning_rate": 2.5012025012025014e-05, + "loss": 0.4028, + "step": 1271 + }, + { + "epoch": 1.6502271252433485, + "grad_norm": 0.22166881997259968, + "learning_rate": 2.498797498797499e-05, + "loss": 0.4347, + "step": 1272 + }, + { + "epoch": 1.6515249837767683, + "grad_norm": 0.20648838786480744, + "learning_rate": 2.4963924963924963e-05, + "loss": 0.4236, + "step": 1273 + }, + { + "epoch": 1.6528228423101883, + "grad_norm": 0.23349839066379247, + "learning_rate": 2.493987493987494e-05, + "loss": 0.419, + "step": 1274 + }, + { + "epoch": 1.654120700843608, + "grad_norm": 0.23063394385414213, + "learning_rate": 2.4915824915824916e-05, + "loss": 0.4152, + "step": 1275 + }, + { + "epoch": 1.6554185593770279, + "grad_norm": 0.2190005315364852, + "learning_rate": 2.4891774891774894e-05, + "loss": 0.4115, + "step": 1276 + }, + { + "epoch": 1.6567164179104479, + "grad_norm": 0.20078160624348626, + "learning_rate": 2.4867724867724866e-05, + "loss": 0.4245, + "step": 1277 + }, + { + "epoch": 1.6580142764438677, + "grad_norm": 0.24133729159661466, + "learning_rate": 2.4843674843674844e-05, + "loss": 0.4293, + "step": 1278 + }, + { + "epoch": 1.6593121349772875, + "grad_norm": 0.23794916923086656, + "learning_rate": 2.481962481962482e-05, + "loss": 0.4271, + "step": 1279 + }, + { + "epoch": 1.6606099935107075, + "grad_norm": 0.2574981267536903, + "learning_rate": 2.4795574795574797e-05, + "loss": 0.447, + "step": 1280 + }, + { + "epoch": 1.661907852044127, + "grad_norm": 0.23168835516119046, + "learning_rate": 2.4771524771524772e-05, + "loss": 0.4193, + "step": 1281 + }, + { + "epoch": 1.663205710577547, + "grad_norm": 0.23742732882318857, + "learning_rate": 2.474747474747475e-05, + "loss": 0.4476, + "step": 1282 + }, + { + "epoch": 1.6645035691109669, + "grad_norm": 0.2514178670168895, + "learning_rate": 2.4723424723424725e-05, + "loss": 0.4204, + "step": 1283 + }, + { + "epoch": 1.6658014276443867, + "grad_norm": 0.24624362103838568, + "learning_rate": 2.46993746993747e-05, + "loss": 0.4212, + "step": 1284 + }, + { + "epoch": 1.6670992861778067, + "grad_norm": 0.23275179943867672, + "learning_rate": 2.4675324675324678e-05, + "loss": 0.4312, + "step": 1285 + }, + { + "epoch": 1.6683971447112265, + "grad_norm": 0.24951766353093746, + "learning_rate": 2.4651274651274652e-05, + "loss": 0.4068, + "step": 1286 + }, + { + "epoch": 1.6696950032446463, + "grad_norm": 0.2052390086188538, + "learning_rate": 2.462722462722463e-05, + "loss": 0.4173, + "step": 1287 + }, + { + "epoch": 1.6709928617780663, + "grad_norm": 0.23198703435419166, + "learning_rate": 2.4603174603174602e-05, + "loss": 0.4177, + "step": 1288 + }, + { + "epoch": 1.672290720311486, + "grad_norm": 0.212107274947473, + "learning_rate": 2.457912457912458e-05, + "loss": 0.4166, + "step": 1289 + }, + { + "epoch": 1.6735885788449059, + "grad_norm": 0.2322378778891487, + "learning_rate": 2.4555074555074555e-05, + "loss": 0.434, + "step": 1290 + }, + { + "epoch": 1.6748864373783259, + "grad_norm": 0.21435317963286998, + "learning_rate": 2.4531024531024533e-05, + "loss": 0.4133, + "step": 1291 + }, + { + "epoch": 1.6761842959117457, + "grad_norm": 0.2156907157084962, + "learning_rate": 2.4506974506974508e-05, + "loss": 0.4285, + "step": 1292 + }, + { + "epoch": 1.6774821544451655, + "grad_norm": 0.2568679217265247, + "learning_rate": 2.4482924482924486e-05, + "loss": 0.4134, + "step": 1293 + }, + { + "epoch": 1.6787800129785855, + "grad_norm": 0.23974841708540973, + "learning_rate": 2.4458874458874457e-05, + "loss": 0.432, + "step": 1294 + }, + { + "epoch": 1.680077871512005, + "grad_norm": 0.24031934038845462, + "learning_rate": 2.4434824434824436e-05, + "loss": 0.4444, + "step": 1295 + }, + { + "epoch": 1.681375730045425, + "grad_norm": 0.2672955058745279, + "learning_rate": 2.441077441077441e-05, + "loss": 0.4369, + "step": 1296 + }, + { + "epoch": 1.6826735885788449, + "grad_norm": 0.250150232180256, + "learning_rate": 2.438672438672439e-05, + "loss": 0.4273, + "step": 1297 + }, + { + "epoch": 1.6839714471122647, + "grad_norm": 0.23626717443698833, + "learning_rate": 2.4362674362674363e-05, + "loss": 0.4236, + "step": 1298 + }, + { + "epoch": 1.6852693056456847, + "grad_norm": 0.22422589742898313, + "learning_rate": 2.4338624338624338e-05, + "loss": 0.4338, + "step": 1299 + }, + { + "epoch": 1.6865671641791045, + "grad_norm": 0.2308927071463409, + "learning_rate": 2.4314574314574316e-05, + "loss": 0.4386, + "step": 1300 + }, + { + "epoch": 1.6878650227125243, + "grad_norm": 0.2564472802977678, + "learning_rate": 2.429052429052429e-05, + "loss": 0.4117, + "step": 1301 + }, + { + "epoch": 1.6891628812459443, + "grad_norm": 0.23392174813654176, + "learning_rate": 2.426647426647427e-05, + "loss": 0.4154, + "step": 1302 + }, + { + "epoch": 1.690460739779364, + "grad_norm": 0.2397393509201778, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.4117, + "step": 1303 + }, + { + "epoch": 1.6917585983127839, + "grad_norm": 0.22912904331451653, + "learning_rate": 2.4218374218374222e-05, + "loss": 0.4273, + "step": 1304 + }, + { + "epoch": 1.6930564568462039, + "grad_norm": 0.23255973129828944, + "learning_rate": 2.4194324194324193e-05, + "loss": 0.4199, + "step": 1305 + }, + { + "epoch": 1.6943543153796237, + "grad_norm": 0.2459474867528304, + "learning_rate": 2.417027417027417e-05, + "loss": 0.3879, + "step": 1306 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.21055785182005404, + "learning_rate": 2.4146224146224146e-05, + "loss": 0.4284, + "step": 1307 + }, + { + "epoch": 1.6969500324464635, + "grad_norm": 0.23246957442423627, + "learning_rate": 2.4122174122174125e-05, + "loss": 0.4131, + "step": 1308 + }, + { + "epoch": 1.698247890979883, + "grad_norm": 0.23994403266599254, + "learning_rate": 2.40981240981241e-05, + "loss": 0.4333, + "step": 1309 + }, + { + "epoch": 1.699545749513303, + "grad_norm": 0.2316301617751929, + "learning_rate": 2.4074074074074074e-05, + "loss": 0.4486, + "step": 1310 + }, + { + "epoch": 1.7008436080467229, + "grad_norm": 0.22426960542300423, + "learning_rate": 2.405002405002405e-05, + "loss": 0.4087, + "step": 1311 + }, + { + "epoch": 1.7021414665801426, + "grad_norm": 0.2150721322470821, + "learning_rate": 2.4025974025974027e-05, + "loss": 0.4073, + "step": 1312 + }, + { + "epoch": 1.7034393251135627, + "grad_norm": 0.2519319561232589, + "learning_rate": 2.4001924001924002e-05, + "loss": 0.4312, + "step": 1313 + }, + { + "epoch": 1.7047371836469825, + "grad_norm": 0.21097482449181498, + "learning_rate": 2.397787397787398e-05, + "loss": 0.4319, + "step": 1314 + }, + { + "epoch": 1.7060350421804023, + "grad_norm": 0.24220719812817199, + "learning_rate": 2.3953823953823955e-05, + "loss": 0.4203, + "step": 1315 + }, + { + "epoch": 1.7073329007138223, + "grad_norm": 0.22812751540222995, + "learning_rate": 2.392977392977393e-05, + "loss": 0.4325, + "step": 1316 + }, + { + "epoch": 1.708630759247242, + "grad_norm": 0.21991172303538978, + "learning_rate": 2.3905723905723908e-05, + "loss": 0.4307, + "step": 1317 + }, + { + "epoch": 1.7099286177806619, + "grad_norm": 0.24924033452167085, + "learning_rate": 2.3881673881673882e-05, + "loss": 0.4049, + "step": 1318 + }, + { + "epoch": 1.7112264763140819, + "grad_norm": 0.20641915282626047, + "learning_rate": 2.385762385762386e-05, + "loss": 0.4171, + "step": 1319 + }, + { + "epoch": 1.7125243348475017, + "grad_norm": 0.24635070927670322, + "learning_rate": 2.3833573833573835e-05, + "loss": 0.4273, + "step": 1320 + }, + { + "epoch": 1.7138221933809215, + "grad_norm": 0.22793016083957607, + "learning_rate": 2.380952380952381e-05, + "loss": 0.4028, + "step": 1321 + }, + { + "epoch": 1.7151200519143415, + "grad_norm": 0.21686313598641418, + "learning_rate": 2.3785473785473785e-05, + "loss": 0.4216, + "step": 1322 + }, + { + "epoch": 1.716417910447761, + "grad_norm": 0.20935343473455395, + "learning_rate": 2.3761423761423763e-05, + "loss": 0.4002, + "step": 1323 + }, + { + "epoch": 1.717715768981181, + "grad_norm": 0.22891020801302248, + "learning_rate": 2.3737373737373738e-05, + "loss": 0.411, + "step": 1324 + }, + { + "epoch": 1.719013627514601, + "grad_norm": 0.20360937762091913, + "learning_rate": 2.3713323713323716e-05, + "loss": 0.4445, + "step": 1325 + }, + { + "epoch": 1.7203114860480206, + "grad_norm": 0.21662567258205914, + "learning_rate": 2.368927368927369e-05, + "loss": 0.4182, + "step": 1326 + }, + { + "epoch": 1.7216093445814407, + "grad_norm": 0.2112365052652544, + "learning_rate": 2.3665223665223666e-05, + "loss": 0.4119, + "step": 1327 + }, + { + "epoch": 1.7229072031148605, + "grad_norm": 0.24045539800451038, + "learning_rate": 2.364117364117364e-05, + "loss": 0.4294, + "step": 1328 + }, + { + "epoch": 1.7242050616482802, + "grad_norm": 0.22344351793204972, + "learning_rate": 2.361712361712362e-05, + "loss": 0.4159, + "step": 1329 + }, + { + "epoch": 1.7255029201817003, + "grad_norm": 0.21385701142195507, + "learning_rate": 2.3593073593073593e-05, + "loss": 0.4208, + "step": 1330 + }, + { + "epoch": 1.72680077871512, + "grad_norm": 0.22306282993754703, + "learning_rate": 2.356902356902357e-05, + "loss": 0.417, + "step": 1331 + }, + { + "epoch": 1.7280986372485398, + "grad_norm": 0.23450328976859844, + "learning_rate": 2.3544973544973546e-05, + "loss": 0.397, + "step": 1332 + }, + { + "epoch": 1.7293964957819599, + "grad_norm": 0.23314049600175976, + "learning_rate": 2.352092352092352e-05, + "loss": 0.4297, + "step": 1333 + }, + { + "epoch": 1.7306943543153797, + "grad_norm": 0.21399972133644776, + "learning_rate": 2.34968734968735e-05, + "loss": 0.4366, + "step": 1334 + }, + { + "epoch": 1.7319922128487995, + "grad_norm": 0.22426899700350952, + "learning_rate": 2.3472823472823474e-05, + "loss": 0.433, + "step": 1335 + }, + { + "epoch": 1.7332900713822195, + "grad_norm": 0.21720443031252623, + "learning_rate": 2.3448773448773452e-05, + "loss": 0.4135, + "step": 1336 + }, + { + "epoch": 1.734587929915639, + "grad_norm": 0.22022369229968872, + "learning_rate": 2.3424723424723427e-05, + "loss": 0.4035, + "step": 1337 + }, + { + "epoch": 1.735885788449059, + "grad_norm": 0.2432882987842844, + "learning_rate": 2.34006734006734e-05, + "loss": 0.4274, + "step": 1338 + }, + { + "epoch": 1.737183646982479, + "grad_norm": 0.22954645223280482, + "learning_rate": 2.3376623376623376e-05, + "loss": 0.4265, + "step": 1339 + }, + { + "epoch": 1.7384815055158986, + "grad_norm": 0.23456332298959323, + "learning_rate": 2.3352573352573355e-05, + "loss": 0.4261, + "step": 1340 + }, + { + "epoch": 1.7397793640493187, + "grad_norm": 0.23090513352220413, + "learning_rate": 2.332852332852333e-05, + "loss": 0.4343, + "step": 1341 + }, + { + "epoch": 1.7410772225827384, + "grad_norm": 0.22635007188997747, + "learning_rate": 2.3304473304473308e-05, + "loss": 0.434, + "step": 1342 + }, + { + "epoch": 1.7423750811161582, + "grad_norm": 0.24328716551223983, + "learning_rate": 2.328042328042328e-05, + "loss": 0.4329, + "step": 1343 + }, + { + "epoch": 1.7436729396495783, + "grad_norm": 0.2245296632717372, + "learning_rate": 2.3256373256373257e-05, + "loss": 0.4135, + "step": 1344 + }, + { + "epoch": 1.744970798182998, + "grad_norm": 0.23430249945874695, + "learning_rate": 2.3232323232323232e-05, + "loss": 0.4178, + "step": 1345 + }, + { + "epoch": 1.7462686567164178, + "grad_norm": 0.21397181948116892, + "learning_rate": 2.320827320827321e-05, + "loss": 0.4236, + "step": 1346 + }, + { + "epoch": 1.7475665152498379, + "grad_norm": 0.21776737560072357, + "learning_rate": 2.3184223184223185e-05, + "loss": 0.425, + "step": 1347 + }, + { + "epoch": 1.7488643737832577, + "grad_norm": 0.23739059292954565, + "learning_rate": 2.3160173160173163e-05, + "loss": 0.4262, + "step": 1348 + }, + { + "epoch": 1.7501622323166774, + "grad_norm": 0.2207747605074272, + "learning_rate": 2.3136123136123138e-05, + "loss": 0.4097, + "step": 1349 + }, + { + "epoch": 1.7514600908500975, + "grad_norm": 0.19291564676436485, + "learning_rate": 2.3112073112073113e-05, + "loss": 0.4222, + "step": 1350 + }, + { + "epoch": 1.752757949383517, + "grad_norm": 0.20208738954938904, + "learning_rate": 2.308802308802309e-05, + "loss": 0.4199, + "step": 1351 + }, + { + "epoch": 1.754055807916937, + "grad_norm": 0.22066527169458836, + "learning_rate": 2.3063973063973065e-05, + "loss": 0.426, + "step": 1352 + }, + { + "epoch": 1.755353666450357, + "grad_norm": 0.22615489279435733, + "learning_rate": 2.3039923039923044e-05, + "loss": 0.4103, + "step": 1353 + }, + { + "epoch": 1.7566515249837766, + "grad_norm": 0.24657435823356594, + "learning_rate": 2.3015873015873015e-05, + "loss": 0.4006, + "step": 1354 + }, + { + "epoch": 1.7579493835171967, + "grad_norm": 0.2287984884377898, + "learning_rate": 2.2991822991822993e-05, + "loss": 0.4481, + "step": 1355 + }, + { + "epoch": 1.7592472420506164, + "grad_norm": 0.21060281438071618, + "learning_rate": 2.2967772967772968e-05, + "loss": 0.399, + "step": 1356 + }, + { + "epoch": 1.7605451005840362, + "grad_norm": 0.2265028463512503, + "learning_rate": 2.2943722943722946e-05, + "loss": 0.4258, + "step": 1357 + }, + { + "epoch": 1.7618429591174563, + "grad_norm": 0.21362689950493072, + "learning_rate": 2.291967291967292e-05, + "loss": 0.4178, + "step": 1358 + }, + { + "epoch": 1.763140817650876, + "grad_norm": 0.21905319629937445, + "learning_rate": 2.28956228956229e-05, + "loss": 0.435, + "step": 1359 + }, + { + "epoch": 1.7644386761842958, + "grad_norm": 0.2390674190592371, + "learning_rate": 2.287157287157287e-05, + "loss": 0.4374, + "step": 1360 + }, + { + "epoch": 1.7657365347177159, + "grad_norm": 0.262047575243414, + "learning_rate": 2.284752284752285e-05, + "loss": 0.4249, + "step": 1361 + }, + { + "epoch": 1.7670343932511356, + "grad_norm": 0.215263905999125, + "learning_rate": 2.2823472823472823e-05, + "loss": 0.4358, + "step": 1362 + }, + { + "epoch": 1.7683322517845554, + "grad_norm": 0.2566335364538136, + "learning_rate": 2.27994227994228e-05, + "loss": 0.4358, + "step": 1363 + }, + { + "epoch": 1.7696301103179755, + "grad_norm": 0.23396394290955702, + "learning_rate": 2.2775372775372776e-05, + "loss": 0.4345, + "step": 1364 + }, + { + "epoch": 1.7709279688513953, + "grad_norm": 0.2997951835182788, + "learning_rate": 2.275132275132275e-05, + "loss": 0.417, + "step": 1365 + }, + { + "epoch": 1.772225827384815, + "grad_norm": 0.25908281100055225, + "learning_rate": 2.272727272727273e-05, + "loss": 0.4283, + "step": 1366 + }, + { + "epoch": 1.773523685918235, + "grad_norm": 0.2710663340672724, + "learning_rate": 2.2703222703222704e-05, + "loss": 0.4405, + "step": 1367 + }, + { + "epoch": 1.7748215444516546, + "grad_norm": 0.25367420601807966, + "learning_rate": 2.267917267917268e-05, + "loss": 0.4149, + "step": 1368 + }, + { + "epoch": 1.7761194029850746, + "grad_norm": 0.2872738552914004, + "learning_rate": 2.2655122655122657e-05, + "loss": 0.4339, + "step": 1369 + }, + { + "epoch": 1.7774172615184944, + "grad_norm": 0.21919850447737751, + "learning_rate": 2.2631072631072632e-05, + "loss": 0.4445, + "step": 1370 + }, + { + "epoch": 1.7787151200519142, + "grad_norm": 0.27892242525176375, + "learning_rate": 2.2607022607022607e-05, + "loss": 0.4131, + "step": 1371 + }, + { + "epoch": 1.7800129785853342, + "grad_norm": 0.24026730555070555, + "learning_rate": 2.2582972582972585e-05, + "loss": 0.427, + "step": 1372 + }, + { + "epoch": 1.781310837118754, + "grad_norm": 0.2145688542497997, + "learning_rate": 2.255892255892256e-05, + "loss": 0.4391, + "step": 1373 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.23661442477067585, + "learning_rate": 2.2534872534872538e-05, + "loss": 0.41, + "step": 1374 + }, + { + "epoch": 1.7839065541855939, + "grad_norm": 0.2610547392581578, + "learning_rate": 2.2510822510822512e-05, + "loss": 0.4188, + "step": 1375 + }, + { + "epoch": 1.7852044127190136, + "grad_norm": 0.2493506467256105, + "learning_rate": 2.2486772486772487e-05, + "loss": 0.4206, + "step": 1376 + }, + { + "epoch": 1.7865022712524334, + "grad_norm": 0.2649167628997299, + "learning_rate": 2.2462722462722462e-05, + "loss": 0.4237, + "step": 1377 + }, + { + "epoch": 1.7878001297858535, + "grad_norm": 0.2534895267108062, + "learning_rate": 2.243867243867244e-05, + "loss": 0.4283, + "step": 1378 + }, + { + "epoch": 1.7890979883192732, + "grad_norm": 0.2709616156112994, + "learning_rate": 2.2414622414622415e-05, + "loss": 0.4113, + "step": 1379 + }, + { + "epoch": 1.790395846852693, + "grad_norm": 0.22792963880042075, + "learning_rate": 2.2390572390572393e-05, + "loss": 0.4267, + "step": 1380 + }, + { + "epoch": 1.791693705386113, + "grad_norm": 0.24622268955355062, + "learning_rate": 2.2366522366522368e-05, + "loss": 0.4051, + "step": 1381 + }, + { + "epoch": 1.7929915639195326, + "grad_norm": 0.27590035842972194, + "learning_rate": 2.2342472342472343e-05, + "loss": 0.4378, + "step": 1382 + }, + { + "epoch": 1.7942894224529526, + "grad_norm": 0.2264860712514965, + "learning_rate": 2.2318422318422317e-05, + "loss": 0.4171, + "step": 1383 + }, + { + "epoch": 1.7955872809863724, + "grad_norm": 0.27527712703496315, + "learning_rate": 2.2294372294372296e-05, + "loss": 0.4136, + "step": 1384 + }, + { + "epoch": 1.7968851395197922, + "grad_norm": 0.27052531643386396, + "learning_rate": 2.227032227032227e-05, + "loss": 0.3935, + "step": 1385 + }, + { + "epoch": 1.7981829980532122, + "grad_norm": 0.2479444281803134, + "learning_rate": 2.224627224627225e-05, + "loss": 0.4331, + "step": 1386 + }, + { + "epoch": 1.799480856586632, + "grad_norm": 0.2373284631481721, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.417, + "step": 1387 + }, + { + "epoch": 1.8007787151200518, + "grad_norm": 0.2575638652052547, + "learning_rate": 2.2198172198172198e-05, + "loss": 0.4323, + "step": 1388 + }, + { + "epoch": 1.8020765736534718, + "grad_norm": 0.2407980171885747, + "learning_rate": 2.2174122174122176e-05, + "loss": 0.4127, + "step": 1389 + }, + { + "epoch": 1.8033744321868916, + "grad_norm": 0.21117036443387086, + "learning_rate": 2.215007215007215e-05, + "loss": 0.4191, + "step": 1390 + }, + { + "epoch": 1.8046722907203114, + "grad_norm": 0.20129164818193102, + "learning_rate": 2.212602212602213e-05, + "loss": 0.4115, + "step": 1391 + }, + { + "epoch": 1.8059701492537314, + "grad_norm": 0.2540001501490721, + "learning_rate": 2.2101972101972104e-05, + "loss": 0.431, + "step": 1392 + }, + { + "epoch": 1.8072680077871512, + "grad_norm": 0.24804686013462887, + "learning_rate": 2.207792207792208e-05, + "loss": 0.4228, + "step": 1393 + }, + { + "epoch": 1.808565866320571, + "grad_norm": 0.2022520818624456, + "learning_rate": 2.2053872053872053e-05, + "loss": 0.4203, + "step": 1394 + }, + { + "epoch": 1.809863724853991, + "grad_norm": 0.23238075295532062, + "learning_rate": 2.202982202982203e-05, + "loss": 0.425, + "step": 1395 + }, + { + "epoch": 1.8111615833874106, + "grad_norm": 0.21664815140422355, + "learning_rate": 2.2005772005772006e-05, + "loss": 0.4299, + "step": 1396 + }, + { + "epoch": 1.8124594419208306, + "grad_norm": 0.21744531033366538, + "learning_rate": 2.1981721981721985e-05, + "loss": 0.4019, + "step": 1397 + }, + { + "epoch": 1.8137573004542504, + "grad_norm": 0.2138946808987489, + "learning_rate": 2.1957671957671956e-05, + "loss": 0.4256, + "step": 1398 + }, + { + "epoch": 1.8150551589876702, + "grad_norm": 0.2224576706896047, + "learning_rate": 2.1933621933621934e-05, + "loss": 0.4147, + "step": 1399 + }, + { + "epoch": 1.8163530175210902, + "grad_norm": 0.2002966133486591, + "learning_rate": 2.190957190957191e-05, + "loss": 0.426, + "step": 1400 + }, + { + "epoch": 1.81765087605451, + "grad_norm": 0.202531126088113, + "learning_rate": 2.1885521885521887e-05, + "loss": 0.4429, + "step": 1401 + }, + { + "epoch": 1.8189487345879298, + "grad_norm": 0.20516498421820234, + "learning_rate": 2.1861471861471862e-05, + "loss": 0.4208, + "step": 1402 + }, + { + "epoch": 1.8202465931213498, + "grad_norm": 0.23389135616856488, + "learning_rate": 2.183742183742184e-05, + "loss": 0.4261, + "step": 1403 + }, + { + "epoch": 1.8215444516547696, + "grad_norm": 0.21459274849252136, + "learning_rate": 2.1813371813371815e-05, + "loss": 0.4346, + "step": 1404 + }, + { + "epoch": 1.8228423101881894, + "grad_norm": 0.2078147272516738, + "learning_rate": 2.178932178932179e-05, + "loss": 0.4092, + "step": 1405 + }, + { + "epoch": 1.8241401687216094, + "grad_norm": 0.24580373752808737, + "learning_rate": 2.1765271765271768e-05, + "loss": 0.4233, + "step": 1406 + }, + { + "epoch": 1.8254380272550292, + "grad_norm": 0.22454106978014404, + "learning_rate": 2.1741221741221743e-05, + "loss": 0.4147, + "step": 1407 + }, + { + "epoch": 1.826735885788449, + "grad_norm": 0.21599295500558674, + "learning_rate": 2.171717171717172e-05, + "loss": 0.4349, + "step": 1408 + }, + { + "epoch": 1.828033744321869, + "grad_norm": 0.2127033614651673, + "learning_rate": 2.1693121693121692e-05, + "loss": 0.4206, + "step": 1409 + }, + { + "epoch": 1.8293316028552886, + "grad_norm": 0.21925596786696352, + "learning_rate": 2.166907166907167e-05, + "loss": 0.4399, + "step": 1410 + }, + { + "epoch": 1.8306294613887086, + "grad_norm": 0.21016324905145667, + "learning_rate": 2.1645021645021645e-05, + "loss": 0.4222, + "step": 1411 + }, + { + "epoch": 1.8319273199221286, + "grad_norm": 0.22520381391920555, + "learning_rate": 2.1620971620971623e-05, + "loss": 0.4258, + "step": 1412 + }, + { + "epoch": 1.8332251784555482, + "grad_norm": 0.22141690462102792, + "learning_rate": 2.1596921596921598e-05, + "loss": 0.412, + "step": 1413 + }, + { + "epoch": 1.8345230369889682, + "grad_norm": 0.2429839281627191, + "learning_rate": 2.1572871572871576e-05, + "loss": 0.4269, + "step": 1414 + }, + { + "epoch": 1.835820895522388, + "grad_norm": 0.2160140354835784, + "learning_rate": 2.1548821548821547e-05, + "loss": 0.4205, + "step": 1415 + }, + { + "epoch": 1.8371187540558078, + "grad_norm": 0.2402260672982623, + "learning_rate": 2.1524771524771526e-05, + "loss": 0.4193, + "step": 1416 + }, + { + "epoch": 1.8384166125892278, + "grad_norm": 0.29744843810112265, + "learning_rate": 2.15007215007215e-05, + "loss": 0.4325, + "step": 1417 + }, + { + "epoch": 1.8397144711226476, + "grad_norm": 0.22530015703559994, + "learning_rate": 2.147667147667148e-05, + "loss": 0.4318, + "step": 1418 + }, + { + "epoch": 1.8410123296560674, + "grad_norm": 0.29390956909610316, + "learning_rate": 2.1452621452621453e-05, + "loss": 0.4288, + "step": 1419 + }, + { + "epoch": 1.8423101881894874, + "grad_norm": 0.23358318964698258, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.4084, + "step": 1420 + }, + { + "epoch": 1.8436080467229072, + "grad_norm": 0.21167664114993395, + "learning_rate": 2.1404521404521406e-05, + "loss": 0.4313, + "step": 1421 + }, + { + "epoch": 1.844905905256327, + "grad_norm": 0.2460698519801602, + "learning_rate": 2.138047138047138e-05, + "loss": 0.4155, + "step": 1422 + }, + { + "epoch": 1.846203763789747, + "grad_norm": 0.23025941782631765, + "learning_rate": 2.135642135642136e-05, + "loss": 0.4057, + "step": 1423 + }, + { + "epoch": 1.8475016223231666, + "grad_norm": 0.19865359681586736, + "learning_rate": 2.1332371332371334e-05, + "loss": 0.4075, + "step": 1424 + }, + { + "epoch": 1.8487994808565866, + "grad_norm": 0.22150167838157933, + "learning_rate": 2.1308321308321312e-05, + "loss": 0.4338, + "step": 1425 + }, + { + "epoch": 1.8500973393900066, + "grad_norm": 0.27381218064289997, + "learning_rate": 2.1284271284271284e-05, + "loss": 0.4385, + "step": 1426 + }, + { + "epoch": 1.8513951979234262, + "grad_norm": 0.2386126810899565, + "learning_rate": 2.1260221260221262e-05, + "loss": 0.4138, + "step": 1427 + }, + { + "epoch": 1.8526930564568462, + "grad_norm": 0.23844253499070778, + "learning_rate": 2.1236171236171237e-05, + "loss": 0.417, + "step": 1428 + }, + { + "epoch": 1.853990914990266, + "grad_norm": 0.24734871284649604, + "learning_rate": 2.1212121212121215e-05, + "loss": 0.4266, + "step": 1429 + }, + { + "epoch": 1.8552887735236858, + "grad_norm": 0.2581372866509555, + "learning_rate": 2.118807118807119e-05, + "loss": 0.4073, + "step": 1430 + }, + { + "epoch": 1.8565866320571058, + "grad_norm": 0.20591243236055737, + "learning_rate": 2.1164021164021164e-05, + "loss": 0.3973, + "step": 1431 + }, + { + "epoch": 1.8578844905905256, + "grad_norm": 0.25393718244850216, + "learning_rate": 2.113997113997114e-05, + "loss": 0.4237, + "step": 1432 + }, + { + "epoch": 1.8591823491239454, + "grad_norm": 0.256757051595813, + "learning_rate": 2.1115921115921117e-05, + "loss": 0.4276, + "step": 1433 + }, + { + "epoch": 1.8604802076573654, + "grad_norm": 0.2199746107316156, + "learning_rate": 2.1091871091871092e-05, + "loss": 0.4027, + "step": 1434 + }, + { + "epoch": 1.8617780661907852, + "grad_norm": 0.22993418151409517, + "learning_rate": 2.106782106782107e-05, + "loss": 0.4258, + "step": 1435 + }, + { + "epoch": 1.863075924724205, + "grad_norm": 0.23986794245337564, + "learning_rate": 2.1043771043771045e-05, + "loss": 0.4092, + "step": 1436 + }, + { + "epoch": 1.864373783257625, + "grad_norm": 0.2503767269878855, + "learning_rate": 2.101972101972102e-05, + "loss": 0.4337, + "step": 1437 + }, + { + "epoch": 1.8656716417910446, + "grad_norm": 0.19966379931345576, + "learning_rate": 2.0995670995670998e-05, + "loss": 0.4083, + "step": 1438 + }, + { + "epoch": 1.8669695003244646, + "grad_norm": 0.22975695557758422, + "learning_rate": 2.0971620971620973e-05, + "loss": 0.4155, + "step": 1439 + }, + { + "epoch": 1.8682673588578846, + "grad_norm": 0.26927614268096606, + "learning_rate": 2.094757094757095e-05, + "loss": 0.3885, + "step": 1440 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.20373734252329936, + "learning_rate": 2.0923520923520926e-05, + "loss": 0.4291, + "step": 1441 + }, + { + "epoch": 1.8708630759247242, + "grad_norm": 0.2683958306016899, + "learning_rate": 2.08994708994709e-05, + "loss": 0.433, + "step": 1442 + }, + { + "epoch": 1.872160934458144, + "grad_norm": 0.26568578858883407, + "learning_rate": 2.0875420875420875e-05, + "loss": 0.4379, + "step": 1443 + }, + { + "epoch": 1.8734587929915638, + "grad_norm": 0.2365065973899857, + "learning_rate": 2.0851370851370853e-05, + "loss": 0.4372, + "step": 1444 + }, + { + "epoch": 1.8747566515249838, + "grad_norm": 0.2160536365282337, + "learning_rate": 2.0827320827320828e-05, + "loss": 0.4249, + "step": 1445 + }, + { + "epoch": 1.8760545100584036, + "grad_norm": 0.2698594967367338, + "learning_rate": 2.0803270803270806e-05, + "loss": 0.4251, + "step": 1446 + }, + { + "epoch": 1.8773523685918234, + "grad_norm": 0.240476141319818, + "learning_rate": 2.077922077922078e-05, + "loss": 0.4051, + "step": 1447 + }, + { + "epoch": 1.8786502271252434, + "grad_norm": 0.20313150197250998, + "learning_rate": 2.0755170755170756e-05, + "loss": 0.4142, + "step": 1448 + }, + { + "epoch": 1.8799480856586632, + "grad_norm": 0.2513888218859537, + "learning_rate": 2.073112073112073e-05, + "loss": 0.4235, + "step": 1449 + }, + { + "epoch": 1.881245944192083, + "grad_norm": 0.263020254508393, + "learning_rate": 2.070707070707071e-05, + "loss": 0.4133, + "step": 1450 + }, + { + "epoch": 1.882543802725503, + "grad_norm": 0.19807928758761542, + "learning_rate": 2.0683020683020683e-05, + "loss": 0.4211, + "step": 1451 + }, + { + "epoch": 1.8838416612589228, + "grad_norm": 0.28553479995616016, + "learning_rate": 2.065897065897066e-05, + "loss": 0.4295, + "step": 1452 + }, + { + "epoch": 1.8851395197923426, + "grad_norm": 0.22454512768715873, + "learning_rate": 2.0634920634920636e-05, + "loss": 0.4164, + "step": 1453 + }, + { + "epoch": 1.8864373783257626, + "grad_norm": 0.2212283630425153, + "learning_rate": 2.061087061087061e-05, + "loss": 0.4309, + "step": 1454 + }, + { + "epoch": 1.8877352368591822, + "grad_norm": 0.21880750363041376, + "learning_rate": 2.058682058682059e-05, + "loss": 0.4148, + "step": 1455 + }, + { + "epoch": 1.8890330953926022, + "grad_norm": 0.2698709703952382, + "learning_rate": 2.0562770562770564e-05, + "loss": 0.4339, + "step": 1456 + }, + { + "epoch": 1.890330953926022, + "grad_norm": 0.21631366892137663, + "learning_rate": 2.0538720538720542e-05, + "loss": 0.4209, + "step": 1457 + }, + { + "epoch": 1.8916288124594418, + "grad_norm": 0.22312561756649457, + "learning_rate": 2.0514670514670517e-05, + "loss": 0.4205, + "step": 1458 + }, + { + "epoch": 1.8929266709928618, + "grad_norm": 0.22982817420831553, + "learning_rate": 2.0490620490620492e-05, + "loss": 0.4127, + "step": 1459 + }, + { + "epoch": 1.8942245295262816, + "grad_norm": 0.23011803773822845, + "learning_rate": 2.0466570466570467e-05, + "loss": 0.4018, + "step": 1460 + }, + { + "epoch": 1.8955223880597014, + "grad_norm": 0.2156774757909124, + "learning_rate": 2.0442520442520445e-05, + "loss": 0.448, + "step": 1461 + }, + { + "epoch": 1.8968202465931214, + "grad_norm": 0.2465313564522942, + "learning_rate": 2.041847041847042e-05, + "loss": 0.4433, + "step": 1462 + }, + { + "epoch": 1.8981181051265412, + "grad_norm": 0.21906089100592563, + "learning_rate": 2.0394420394420398e-05, + "loss": 0.4103, + "step": 1463 + }, + { + "epoch": 1.899415963659961, + "grad_norm": 0.21024875722994074, + "learning_rate": 2.037037037037037e-05, + "loss": 0.3951, + "step": 1464 + }, + { + "epoch": 1.900713822193381, + "grad_norm": 0.18955824822069273, + "learning_rate": 2.0346320346320347e-05, + "loss": 0.4174, + "step": 1465 + }, + { + "epoch": 1.9020116807268008, + "grad_norm": 0.23144049792472646, + "learning_rate": 2.0322270322270322e-05, + "loss": 0.3998, + "step": 1466 + }, + { + "epoch": 1.9033095392602206, + "grad_norm": 0.21081067517865779, + "learning_rate": 2.02982202982203e-05, + "loss": 0.4103, + "step": 1467 + }, + { + "epoch": 1.9046073977936406, + "grad_norm": 0.2194010873045385, + "learning_rate": 2.0274170274170275e-05, + "loss": 0.4123, + "step": 1468 + }, + { + "epoch": 1.9059052563270602, + "grad_norm": 0.20757690294910305, + "learning_rate": 2.025012025012025e-05, + "loss": 0.4012, + "step": 1469 + }, + { + "epoch": 1.9072031148604802, + "grad_norm": 0.2628488845364361, + "learning_rate": 2.0226070226070225e-05, + "loss": 0.4118, + "step": 1470 + }, + { + "epoch": 1.9085009733939, + "grad_norm": 0.2503984380267546, + "learning_rate": 2.0202020202020203e-05, + "loss": 0.4277, + "step": 1471 + }, + { + "epoch": 1.9097988319273198, + "grad_norm": 0.20731051341055765, + "learning_rate": 2.0177970177970177e-05, + "loss": 0.4223, + "step": 1472 + }, + { + "epoch": 1.9110966904607398, + "grad_norm": 0.2469892137078129, + "learning_rate": 2.0153920153920156e-05, + "loss": 0.4278, + "step": 1473 + }, + { + "epoch": 1.9123945489941596, + "grad_norm": 0.2302509090632293, + "learning_rate": 2.012987012987013e-05, + "loss": 0.4286, + "step": 1474 + }, + { + "epoch": 1.9136924075275794, + "grad_norm": 0.21560820581873713, + "learning_rate": 2.0105820105820105e-05, + "loss": 0.4036, + "step": 1475 + }, + { + "epoch": 1.9149902660609994, + "grad_norm": 0.21761526673837062, + "learning_rate": 2.0081770081770083e-05, + "loss": 0.4383, + "step": 1476 + }, + { + "epoch": 1.9162881245944192, + "grad_norm": 0.25419859148323953, + "learning_rate": 2.0057720057720058e-05, + "loss": 0.4168, + "step": 1477 + }, + { + "epoch": 1.917585983127839, + "grad_norm": 0.21447148417291215, + "learning_rate": 2.0033670033670036e-05, + "loss": 0.4233, + "step": 1478 + }, + { + "epoch": 1.918883841661259, + "grad_norm": 0.22177181102304355, + "learning_rate": 2.000962000962001e-05, + "loss": 0.3973, + "step": 1479 + }, + { + "epoch": 1.9201817001946788, + "grad_norm": 0.2361964777550035, + "learning_rate": 1.9985569985569986e-05, + "loss": 0.4062, + "step": 1480 + }, + { + "epoch": 1.9214795587280986, + "grad_norm": 0.2268625180335479, + "learning_rate": 1.996151996151996e-05, + "loss": 0.4288, + "step": 1481 + }, + { + "epoch": 1.9227774172615186, + "grad_norm": 0.22109891172640594, + "learning_rate": 1.993746993746994e-05, + "loss": 0.4168, + "step": 1482 + }, + { + "epoch": 1.9240752757949382, + "grad_norm": 0.24199558362942594, + "learning_rate": 1.9913419913419914e-05, + "loss": 0.4368, + "step": 1483 + }, + { + "epoch": 1.9253731343283582, + "grad_norm": 0.23386196057480746, + "learning_rate": 1.9889369889369892e-05, + "loss": 0.4929, + "step": 1484 + }, + { + "epoch": 1.9266709928617782, + "grad_norm": 2.3863223206635507, + "learning_rate": 1.9865319865319866e-05, + "loss": 0.4321, + "step": 1485 + }, + { + "epoch": 1.9279688513951978, + "grad_norm": 0.2117744722116347, + "learning_rate": 1.984126984126984e-05, + "loss": 0.4046, + "step": 1486 + }, + { + "epoch": 1.9292667099286178, + "grad_norm": 0.23753639694866985, + "learning_rate": 1.9817219817219816e-05, + "loss": 0.4088, + "step": 1487 + }, + { + "epoch": 1.9305645684620376, + "grad_norm": 0.2029549567060751, + "learning_rate": 1.9793169793169794e-05, + "loss": 0.4129, + "step": 1488 + }, + { + "epoch": 1.9318624269954574, + "grad_norm": 0.20999056789664505, + "learning_rate": 1.976911976911977e-05, + "loss": 0.4149, + "step": 1489 + }, + { + "epoch": 1.9331602855288774, + "grad_norm": 0.25609740431868805, + "learning_rate": 1.9745069745069747e-05, + "loss": 0.4422, + "step": 1490 + }, + { + "epoch": 1.9344581440622972, + "grad_norm": 0.23240223312760538, + "learning_rate": 1.9721019721019722e-05, + "loss": 0.4376, + "step": 1491 + }, + { + "epoch": 1.935756002595717, + "grad_norm": 0.22288941915151747, + "learning_rate": 1.9696969696969697e-05, + "loss": 0.4031, + "step": 1492 + }, + { + "epoch": 1.937053861129137, + "grad_norm": 0.20823811668022293, + "learning_rate": 1.9672919672919675e-05, + "loss": 0.4141, + "step": 1493 + }, + { + "epoch": 1.9383517196625568, + "grad_norm": 0.22958571482808876, + "learning_rate": 1.964886964886965e-05, + "loss": 0.4205, + "step": 1494 + }, + { + "epoch": 1.9396495781959766, + "grad_norm": 0.24269527070284858, + "learning_rate": 1.9624819624819628e-05, + "loss": 0.414, + "step": 1495 + }, + { + "epoch": 1.9409474367293966, + "grad_norm": 0.2070554761070819, + "learning_rate": 1.9600769600769603e-05, + "loss": 0.4045, + "step": 1496 + }, + { + "epoch": 1.9422452952628162, + "grad_norm": 0.24376293095622897, + "learning_rate": 1.9576719576719577e-05, + "loss": 0.4257, + "step": 1497 + }, + { + "epoch": 1.9435431537962362, + "grad_norm": 0.2254498669697948, + "learning_rate": 1.9552669552669552e-05, + "loss": 0.421, + "step": 1498 + }, + { + "epoch": 1.9448410123296562, + "grad_norm": 0.21748513808130843, + "learning_rate": 1.952861952861953e-05, + "loss": 0.4062, + "step": 1499 + }, + { + "epoch": 1.9461388708630758, + "grad_norm": 0.2148376810996354, + "learning_rate": 1.9504569504569505e-05, + "loss": 0.4203, + "step": 1500 + }, + { + "epoch": 1.9474367293964958, + "grad_norm": 0.25871259714383205, + "learning_rate": 1.9480519480519483e-05, + "loss": 0.4229, + "step": 1501 + }, + { + "epoch": 1.9487345879299156, + "grad_norm": 0.19582347887373358, + "learning_rate": 1.9456469456469455e-05, + "loss": 0.4081, + "step": 1502 + }, + { + "epoch": 1.9500324464633354, + "grad_norm": 0.22789399470009464, + "learning_rate": 1.9432419432419433e-05, + "loss": 0.4245, + "step": 1503 + }, + { + "epoch": 1.9513303049967554, + "grad_norm": 0.23018173092515049, + "learning_rate": 1.9408369408369408e-05, + "loss": 0.4216, + "step": 1504 + }, + { + "epoch": 1.9526281635301752, + "grad_norm": 0.21444832133823605, + "learning_rate": 1.9384319384319386e-05, + "loss": 0.4147, + "step": 1505 + }, + { + "epoch": 1.953926022063595, + "grad_norm": 0.20562647584839736, + "learning_rate": 1.936026936026936e-05, + "loss": 0.4008, + "step": 1506 + }, + { + "epoch": 1.955223880597015, + "grad_norm": 0.23110467483063488, + "learning_rate": 1.933621933621934e-05, + "loss": 0.3972, + "step": 1507 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.21834150518213843, + "learning_rate": 1.9312169312169313e-05, + "loss": 0.4256, + "step": 1508 + }, + { + "epoch": 1.9578195976638546, + "grad_norm": 0.21958450668275112, + "learning_rate": 1.9288119288119288e-05, + "loss": 0.4088, + "step": 1509 + }, + { + "epoch": 1.9591174561972746, + "grad_norm": 0.20052094185224426, + "learning_rate": 1.9264069264069266e-05, + "loss": 0.4121, + "step": 1510 + }, + { + "epoch": 1.9604153147306942, + "grad_norm": 0.24326880305407378, + "learning_rate": 1.924001924001924e-05, + "loss": 0.422, + "step": 1511 + }, + { + "epoch": 1.9617131732641142, + "grad_norm": 0.2410106190975958, + "learning_rate": 1.921596921596922e-05, + "loss": 0.4085, + "step": 1512 + }, + { + "epoch": 1.9630110317975342, + "grad_norm": 0.20377491892233185, + "learning_rate": 1.919191919191919e-05, + "loss": 0.4312, + "step": 1513 + }, + { + "epoch": 1.9643088903309538, + "grad_norm": 0.22992091739225845, + "learning_rate": 1.916786916786917e-05, + "loss": 0.4283, + "step": 1514 + }, + { + "epoch": 1.9656067488643738, + "grad_norm": 0.23320180740415136, + "learning_rate": 1.9143819143819144e-05, + "loss": 0.4167, + "step": 1515 + }, + { + "epoch": 1.9669046073977936, + "grad_norm": 0.21478096347520134, + "learning_rate": 1.9119769119769122e-05, + "loss": 0.4373, + "step": 1516 + }, + { + "epoch": 1.9682024659312134, + "grad_norm": 0.24312143244424492, + "learning_rate": 1.9095719095719097e-05, + "loss": 0.4384, + "step": 1517 + }, + { + "epoch": 1.9695003244646334, + "grad_norm": 0.22013684407762923, + "learning_rate": 1.9071669071669075e-05, + "loss": 0.4244, + "step": 1518 + }, + { + "epoch": 1.9707981829980532, + "grad_norm": 0.22394887240003014, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.419, + "step": 1519 + }, + { + "epoch": 1.972096041531473, + "grad_norm": 0.2319362635386066, + "learning_rate": 1.9023569023569024e-05, + "loss": 0.4059, + "step": 1520 + }, + { + "epoch": 1.973393900064893, + "grad_norm": 0.1980072863895625, + "learning_rate": 1.8999518999519e-05, + "loss": 0.4012, + "step": 1521 + }, + { + "epoch": 1.9746917585983128, + "grad_norm": 0.21340412052310542, + "learning_rate": 1.8975468975468977e-05, + "loss": 0.4252, + "step": 1522 + }, + { + "epoch": 1.9759896171317326, + "grad_norm": 0.20523875562201788, + "learning_rate": 1.8951418951418952e-05, + "loss": 0.3954, + "step": 1523 + }, + { + "epoch": 1.9772874756651526, + "grad_norm": 0.21593958433489607, + "learning_rate": 1.8927368927368927e-05, + "loss": 0.4204, + "step": 1524 + }, + { + "epoch": 1.9785853341985724, + "grad_norm": 0.1982941991349422, + "learning_rate": 1.8903318903318905e-05, + "loss": 0.3982, + "step": 1525 + }, + { + "epoch": 1.9798831927319922, + "grad_norm": 0.20466190375196575, + "learning_rate": 1.887926887926888e-05, + "loss": 0.4134, + "step": 1526 + }, + { + "epoch": 1.9811810512654122, + "grad_norm": 0.21442210205444864, + "learning_rate": 1.8855218855218858e-05, + "loss": 0.4221, + "step": 1527 + }, + { + "epoch": 1.9824789097988318, + "grad_norm": 0.2077434816627499, + "learning_rate": 1.8831168831168833e-05, + "loss": 0.4168, + "step": 1528 + }, + { + "epoch": 1.9837767683322518, + "grad_norm": 0.20932779718622976, + "learning_rate": 1.880711880711881e-05, + "loss": 0.4129, + "step": 1529 + }, + { + "epoch": 1.9850746268656716, + "grad_norm": 0.20073142812922465, + "learning_rate": 1.8783068783068782e-05, + "loss": 0.4282, + "step": 1530 + }, + { + "epoch": 1.9863724853990914, + "grad_norm": 0.21070295646641607, + "learning_rate": 1.875901875901876e-05, + "loss": 0.414, + "step": 1531 + }, + { + "epoch": 1.9876703439325114, + "grad_norm": 0.1983254407503139, + "learning_rate": 1.8734968734968735e-05, + "loss": 0.4246, + "step": 1532 + }, + { + "epoch": 1.9889682024659312, + "grad_norm": 0.2063440750783136, + "learning_rate": 1.8710918710918713e-05, + "loss": 0.4127, + "step": 1533 + }, + { + "epoch": 1.990266060999351, + "grad_norm": 0.20062950578015543, + "learning_rate": 1.8686868686868688e-05, + "loss": 0.404, + "step": 1534 + }, + { + "epoch": 1.991563919532771, + "grad_norm": 0.22191712136507424, + "learning_rate": 1.8662818662818663e-05, + "loss": 0.4119, + "step": 1535 + }, + { + "epoch": 1.9928617780661908, + "grad_norm": 0.2215336165604822, + "learning_rate": 1.8638768638768638e-05, + "loss": 0.4121, + "step": 1536 + }, + { + "epoch": 1.9941596365996106, + "grad_norm": 0.20271253230410582, + "learning_rate": 1.8614718614718616e-05, + "loss": 0.4023, + "step": 1537 + }, + { + "epoch": 1.9954574951330306, + "grad_norm": 0.26159702346568764, + "learning_rate": 1.859066859066859e-05, + "loss": 0.4102, + "step": 1538 + }, + { + "epoch": 1.9967553536664504, + "grad_norm": 0.21830457585192162, + "learning_rate": 1.856661856661857e-05, + "loss": 0.4129, + "step": 1539 + }, + { + "epoch": 1.9980532121998702, + "grad_norm": 0.21634003518886286, + "learning_rate": 1.8542568542568544e-05, + "loss": 0.3979, + "step": 1540 + }, + { + "epoch": 1.9993510707332902, + "grad_norm": 0.21520818627840688, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.4072, + "step": 1541 + }, + { + "epoch": 2.0, + "grad_norm": 0.3162004070666462, + "learning_rate": 1.8494468494468496e-05, + "loss": 0.3731, + "step": 1542 + }, + { + "epoch": 2.00129785853342, + "grad_norm": 0.3002746540509754, + "learning_rate": 1.847041847041847e-05, + "loss": 0.351, + "step": 1543 + }, + { + "epoch": 2.0025957170668396, + "grad_norm": 0.22179389161987628, + "learning_rate": 1.844636844636845e-05, + "loss": 0.3591, + "step": 1544 + }, + { + "epoch": 2.0038935756002596, + "grad_norm": 0.29853309659990307, + "learning_rate": 1.8422318422318424e-05, + "loss": 0.3502, + "step": 1545 + }, + { + "epoch": 2.0051914341336796, + "grad_norm": 0.27887815631774426, + "learning_rate": 1.83982683982684e-05, + "loss": 0.3397, + "step": 1546 + }, + { + "epoch": 2.006489292667099, + "grad_norm": 0.21973812049478386, + "learning_rate": 1.8374218374218374e-05, + "loss": 0.3429, + "step": 1547 + }, + { + "epoch": 2.007787151200519, + "grad_norm": 0.26933885808676494, + "learning_rate": 1.8350168350168352e-05, + "loss": 0.3436, + "step": 1548 + }, + { + "epoch": 2.009085009733939, + "grad_norm": 0.2996171251030684, + "learning_rate": 1.8326118326118327e-05, + "loss": 0.3409, + "step": 1549 + }, + { + "epoch": 2.010382868267359, + "grad_norm": 0.24083124258461439, + "learning_rate": 1.8302068302068305e-05, + "loss": 0.3386, + "step": 1550 + }, + { + "epoch": 2.011680726800779, + "grad_norm": 0.30787012297971555, + "learning_rate": 1.827801827801828e-05, + "loss": 0.3478, + "step": 1551 + }, + { + "epoch": 2.0129785853341984, + "grad_norm": 0.2857849577396285, + "learning_rate": 1.8253968253968254e-05, + "loss": 0.347, + "step": 1552 + }, + { + "epoch": 2.0142764438676184, + "grad_norm": 0.2458814691129703, + "learning_rate": 1.822991822991823e-05, + "loss": 0.338, + "step": 1553 + }, + { + "epoch": 2.0155743024010384, + "grad_norm": 0.24217238914022393, + "learning_rate": 1.8205868205868207e-05, + "loss": 0.3527, + "step": 1554 + }, + { + "epoch": 2.016872160934458, + "grad_norm": 0.2708034381508514, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.343, + "step": 1555 + }, + { + "epoch": 2.018170019467878, + "grad_norm": 0.2771300303365467, + "learning_rate": 1.815776815776816e-05, + "loss": 0.3578, + "step": 1556 + }, + { + "epoch": 2.019467878001298, + "grad_norm": 0.24314761286723605, + "learning_rate": 1.8133718133718135e-05, + "loss": 0.3628, + "step": 1557 + }, + { + "epoch": 2.0207657365347176, + "grad_norm": 0.24528635873262158, + "learning_rate": 1.810966810966811e-05, + "loss": 0.3478, + "step": 1558 + }, + { + "epoch": 2.0220635950681376, + "grad_norm": 0.2441718287388452, + "learning_rate": 1.8085618085618085e-05, + "loss": 0.3463, + "step": 1559 + }, + { + "epoch": 2.0233614536015576, + "grad_norm": 0.2429476875932917, + "learning_rate": 1.8061568061568063e-05, + "loss": 0.3533, + "step": 1560 + }, + { + "epoch": 2.024659312134977, + "grad_norm": 0.2584288721917747, + "learning_rate": 1.8037518037518038e-05, + "loss": 0.3443, + "step": 1561 + }, + { + "epoch": 2.025957170668397, + "grad_norm": 0.21619552093380776, + "learning_rate": 1.8013468013468016e-05, + "loss": 0.3313, + "step": 1562 + }, + { + "epoch": 2.027255029201817, + "grad_norm": 0.22416123032351348, + "learning_rate": 1.798941798941799e-05, + "loss": 0.3396, + "step": 1563 + }, + { + "epoch": 2.028552887735237, + "grad_norm": 0.23762867221262482, + "learning_rate": 1.7965367965367965e-05, + "loss": 0.3328, + "step": 1564 + }, + { + "epoch": 2.029850746268657, + "grad_norm": 0.23910726203342522, + "learning_rate": 1.7941317941317943e-05, + "loss": 0.3373, + "step": 1565 + }, + { + "epoch": 2.0311486048020764, + "grad_norm": 0.2270224344033579, + "learning_rate": 1.7917267917267918e-05, + "loss": 0.3414, + "step": 1566 + }, + { + "epoch": 2.0324464633354964, + "grad_norm": 0.22685492291746387, + "learning_rate": 1.7893217893217896e-05, + "loss": 0.334, + "step": 1567 + }, + { + "epoch": 2.0337443218689164, + "grad_norm": 0.23079013731214076, + "learning_rate": 1.7869167869167868e-05, + "loss": 0.3373, + "step": 1568 + }, + { + "epoch": 2.035042180402336, + "grad_norm": 0.23832074407215584, + "learning_rate": 1.7845117845117846e-05, + "loss": 0.3402, + "step": 1569 + }, + { + "epoch": 2.036340038935756, + "grad_norm": 0.2413146931238051, + "learning_rate": 1.782106782106782e-05, + "loss": 0.3476, + "step": 1570 + }, + { + "epoch": 2.037637897469176, + "grad_norm": 0.23685355784727574, + "learning_rate": 1.77970177970178e-05, + "loss": 0.3397, + "step": 1571 + }, + { + "epoch": 2.0389357560025956, + "grad_norm": 0.24437850977020956, + "learning_rate": 1.7772967772967774e-05, + "loss": 0.3445, + "step": 1572 + }, + { + "epoch": 2.0402336145360156, + "grad_norm": 0.22724458516557208, + "learning_rate": 1.7748917748917752e-05, + "loss": 0.3443, + "step": 1573 + }, + { + "epoch": 2.0415314730694356, + "grad_norm": 0.23475541449011794, + "learning_rate": 1.7724867724867723e-05, + "loss": 0.3432, + "step": 1574 + }, + { + "epoch": 2.042829331602855, + "grad_norm": 0.21469511225658197, + "learning_rate": 1.77008177008177e-05, + "loss": 0.3473, + "step": 1575 + }, + { + "epoch": 2.044127190136275, + "grad_norm": 0.22486022557380209, + "learning_rate": 1.7676767676767676e-05, + "loss": 0.3566, + "step": 1576 + }, + { + "epoch": 2.045425048669695, + "grad_norm": 0.22895350371242218, + "learning_rate": 1.7652717652717654e-05, + "loss": 0.3478, + "step": 1577 + }, + { + "epoch": 2.046722907203115, + "grad_norm": 0.24538812579393815, + "learning_rate": 1.762866762866763e-05, + "loss": 0.3332, + "step": 1578 + }, + { + "epoch": 2.048020765736535, + "grad_norm": 0.25912360209705504, + "learning_rate": 1.7604617604617604e-05, + "loss": 0.344, + "step": 1579 + }, + { + "epoch": 2.0493186242699544, + "grad_norm": 0.19959244601082998, + "learning_rate": 1.7580567580567582e-05, + "loss": 0.3337, + "step": 1580 + }, + { + "epoch": 2.0506164828033744, + "grad_norm": 0.22265382752478494, + "learning_rate": 1.7556517556517557e-05, + "loss": 0.3385, + "step": 1581 + }, + { + "epoch": 2.0519143413367944, + "grad_norm": 0.2165757161328648, + "learning_rate": 1.7532467532467535e-05, + "loss": 0.3339, + "step": 1582 + }, + { + "epoch": 2.053212199870214, + "grad_norm": 0.21372021503164076, + "learning_rate": 1.750841750841751e-05, + "loss": 0.3507, + "step": 1583 + }, + { + "epoch": 2.054510058403634, + "grad_norm": 0.2336377004408556, + "learning_rate": 1.7484367484367488e-05, + "loss": 0.3491, + "step": 1584 + }, + { + "epoch": 2.055807916937054, + "grad_norm": 0.2117993328839063, + "learning_rate": 1.746031746031746e-05, + "loss": 0.3407, + "step": 1585 + }, + { + "epoch": 2.0571057754704736, + "grad_norm": 0.21231266244658922, + "learning_rate": 1.7436267436267437e-05, + "loss": 0.3444, + "step": 1586 + }, + { + "epoch": 2.0584036340038936, + "grad_norm": 0.21032023819015722, + "learning_rate": 1.7412217412217412e-05, + "loss": 0.3398, + "step": 1587 + }, + { + "epoch": 2.0597014925373136, + "grad_norm": 0.2371048058409055, + "learning_rate": 1.738816738816739e-05, + "loss": 0.3445, + "step": 1588 + }, + { + "epoch": 2.060999351070733, + "grad_norm": 0.2059222075267882, + "learning_rate": 1.7364117364117365e-05, + "loss": 0.3443, + "step": 1589 + }, + { + "epoch": 2.062297209604153, + "grad_norm": 0.22719406397240552, + "learning_rate": 1.734006734006734e-05, + "loss": 0.3495, + "step": 1590 + }, + { + "epoch": 2.063595068137573, + "grad_norm": 0.2222707506963988, + "learning_rate": 1.7316017316017315e-05, + "loss": 0.3399, + "step": 1591 + }, + { + "epoch": 2.064892926670993, + "grad_norm": 0.22555682797470167, + "learning_rate": 1.7291967291967293e-05, + "loss": 0.3383, + "step": 1592 + }, + { + "epoch": 2.066190785204413, + "grad_norm": 0.22889368520998704, + "learning_rate": 1.7267917267917268e-05, + "loss": 0.3418, + "step": 1593 + }, + { + "epoch": 2.0674886437378324, + "grad_norm": 0.22102058324621057, + "learning_rate": 1.7243867243867246e-05, + "loss": 0.3325, + "step": 1594 + }, + { + "epoch": 2.0687865022712524, + "grad_norm": 0.23774221641545448, + "learning_rate": 1.721981721981722e-05, + "loss": 0.3369, + "step": 1595 + }, + { + "epoch": 2.0700843608046724, + "grad_norm": 0.24890061582412498, + "learning_rate": 1.7195767195767195e-05, + "loss": 0.3441, + "step": 1596 + }, + { + "epoch": 2.071382219338092, + "grad_norm": 0.2100873376295878, + "learning_rate": 1.7171717171717173e-05, + "loss": 0.3303, + "step": 1597 + }, + { + "epoch": 2.072680077871512, + "grad_norm": 0.22680230775268373, + "learning_rate": 1.7147667147667148e-05, + "loss": 0.3371, + "step": 1598 + }, + { + "epoch": 2.073977936404932, + "grad_norm": 0.20382351560763964, + "learning_rate": 1.7123617123617126e-05, + "loss": 0.3337, + "step": 1599 + }, + { + "epoch": 2.0752757949383516, + "grad_norm": 0.21534525112062658, + "learning_rate": 1.70995670995671e-05, + "loss": 0.3371, + "step": 1600 + }, + { + "epoch": 2.0765736534717716, + "grad_norm": 0.21827103094501965, + "learning_rate": 1.7075517075517076e-05, + "loss": 0.3357, + "step": 1601 + }, + { + "epoch": 2.0778715120051916, + "grad_norm": 0.21047818536264323, + "learning_rate": 1.705146705146705e-05, + "loss": 0.3358, + "step": 1602 + }, + { + "epoch": 2.079169370538611, + "grad_norm": 0.24359393587806577, + "learning_rate": 1.702741702741703e-05, + "loss": 0.3409, + "step": 1603 + }, + { + "epoch": 2.080467229072031, + "grad_norm": 0.32771668536616283, + "learning_rate": 1.7003367003367004e-05, + "loss": 0.3296, + "step": 1604 + }, + { + "epoch": 2.081765087605451, + "grad_norm": 0.2105738510166506, + "learning_rate": 1.6979316979316982e-05, + "loss": 0.3471, + "step": 1605 + }, + { + "epoch": 2.0830629461388708, + "grad_norm": 0.2609805732511619, + "learning_rate": 1.6955266955266957e-05, + "loss": 0.3395, + "step": 1606 + }, + { + "epoch": 2.084360804672291, + "grad_norm": 0.20813077643429093, + "learning_rate": 1.693121693121693e-05, + "loss": 0.3323, + "step": 1607 + }, + { + "epoch": 2.0856586632057104, + "grad_norm": 0.20588845509767667, + "learning_rate": 1.6907166907166906e-05, + "loss": 0.3395, + "step": 1608 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.2231300777445713, + "learning_rate": 1.6883116883116884e-05, + "loss": 0.3448, + "step": 1609 + }, + { + "epoch": 2.0882543802725504, + "grad_norm": 0.2262598516285643, + "learning_rate": 1.685906685906686e-05, + "loss": 0.3524, + "step": 1610 + }, + { + "epoch": 2.08955223880597, + "grad_norm": 0.2112479505042923, + "learning_rate": 1.6835016835016837e-05, + "loss": 0.335, + "step": 1611 + }, + { + "epoch": 2.09085009733939, + "grad_norm": 0.22189847243097133, + "learning_rate": 1.6810966810966812e-05, + "loss": 0.3312, + "step": 1612 + }, + { + "epoch": 2.09214795587281, + "grad_norm": 0.21750458672162346, + "learning_rate": 1.6786916786916787e-05, + "loss": 0.3382, + "step": 1613 + }, + { + "epoch": 2.0934458144062296, + "grad_norm": 0.22791119863516698, + "learning_rate": 1.6762866762866765e-05, + "loss": 0.3346, + "step": 1614 + }, + { + "epoch": 2.0947436729396496, + "grad_norm": 0.2253801306495037, + "learning_rate": 1.673881673881674e-05, + "loss": 0.3587, + "step": 1615 + }, + { + "epoch": 2.0960415314730696, + "grad_norm": 0.21941342953368462, + "learning_rate": 1.6714766714766718e-05, + "loss": 0.322, + "step": 1616 + }, + { + "epoch": 2.097339390006489, + "grad_norm": 0.24442422654973892, + "learning_rate": 1.6690716690716693e-05, + "loss": 0.3613, + "step": 1617 + }, + { + "epoch": 2.098637248539909, + "grad_norm": 0.20892381899190737, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.3372, + "step": 1618 + }, + { + "epoch": 2.099935107073329, + "grad_norm": 0.2204239619365003, + "learning_rate": 1.6642616642616642e-05, + "loss": 0.345, + "step": 1619 + }, + { + "epoch": 2.1012329656067488, + "grad_norm": 0.2433539902885864, + "learning_rate": 1.661856661856662e-05, + "loss": 0.3399, + "step": 1620 + }, + { + "epoch": 2.102530824140169, + "grad_norm": 0.21756713439856754, + "learning_rate": 1.6594516594516595e-05, + "loss": 0.3258, + "step": 1621 + }, + { + "epoch": 2.103828682673589, + "grad_norm": 0.21185754465198486, + "learning_rate": 1.6570466570466573e-05, + "loss": 0.3338, + "step": 1622 + }, + { + "epoch": 2.1051265412070084, + "grad_norm": 0.19886634840357484, + "learning_rate": 1.6546416546416545e-05, + "loss": 0.3379, + "step": 1623 + }, + { + "epoch": 2.1064243997404284, + "grad_norm": 0.23378633487014983, + "learning_rate": 1.6522366522366523e-05, + "loss": 0.3518, + "step": 1624 + }, + { + "epoch": 2.107722258273848, + "grad_norm": 0.22367396102680723, + "learning_rate": 1.6498316498316498e-05, + "loss": 0.3341, + "step": 1625 + }, + { + "epoch": 2.109020116807268, + "grad_norm": 0.2063911182342616, + "learning_rate": 1.6474266474266476e-05, + "loss": 0.3446, + "step": 1626 + }, + { + "epoch": 2.110317975340688, + "grad_norm": 0.22050227302851286, + "learning_rate": 1.645021645021645e-05, + "loss": 0.3359, + "step": 1627 + }, + { + "epoch": 2.1116158338741076, + "grad_norm": 0.24896315784423037, + "learning_rate": 1.642616642616643e-05, + "loss": 0.3311, + "step": 1628 + }, + { + "epoch": 2.1129136924075276, + "grad_norm": 0.24014540903069778, + "learning_rate": 1.6402116402116404e-05, + "loss": 0.3455, + "step": 1629 + }, + { + "epoch": 2.1142115509409476, + "grad_norm": 0.22208241595868494, + "learning_rate": 1.637806637806638e-05, + "loss": 0.3609, + "step": 1630 + }, + { + "epoch": 2.115509409474367, + "grad_norm": 0.25185975208100675, + "learning_rate": 1.6354016354016356e-05, + "loss": 0.3355, + "step": 1631 + }, + { + "epoch": 2.116807268007787, + "grad_norm": 0.25068577932779473, + "learning_rate": 1.632996632996633e-05, + "loss": 0.3378, + "step": 1632 + }, + { + "epoch": 2.118105126541207, + "grad_norm": 0.23743465262529753, + "learning_rate": 1.630591630591631e-05, + "loss": 0.3453, + "step": 1633 + }, + { + "epoch": 2.1194029850746268, + "grad_norm": 0.2873478730260964, + "learning_rate": 1.628186628186628e-05, + "loss": 0.3469, + "step": 1634 + }, + { + "epoch": 2.120700843608047, + "grad_norm": 0.23190098964145786, + "learning_rate": 1.625781625781626e-05, + "loss": 0.3299, + "step": 1635 + }, + { + "epoch": 2.1219987021414664, + "grad_norm": 0.24692649408648704, + "learning_rate": 1.6233766233766234e-05, + "loss": 0.3413, + "step": 1636 + }, + { + "epoch": 2.1232965606748864, + "grad_norm": 0.2601882707422891, + "learning_rate": 1.6209716209716212e-05, + "loss": 0.3237, + "step": 1637 + }, + { + "epoch": 2.1245944192083064, + "grad_norm": 0.2307540235499652, + "learning_rate": 1.6185666185666187e-05, + "loss": 0.3479, + "step": 1638 + }, + { + "epoch": 2.125892277741726, + "grad_norm": 0.24207343026907496, + "learning_rate": 1.6161616161616165e-05, + "loss": 0.342, + "step": 1639 + }, + { + "epoch": 2.127190136275146, + "grad_norm": 0.25704183477977455, + "learning_rate": 1.6137566137566136e-05, + "loss": 0.347, + "step": 1640 + }, + { + "epoch": 2.128487994808566, + "grad_norm": 0.2204667533876293, + "learning_rate": 1.6113516113516114e-05, + "loss": 0.3421, + "step": 1641 + }, + { + "epoch": 2.1297858533419856, + "grad_norm": 0.209970904431735, + "learning_rate": 1.608946608946609e-05, + "loss": 0.3434, + "step": 1642 + }, + { + "epoch": 2.1310837118754056, + "grad_norm": 0.2322772777553896, + "learning_rate": 1.6065416065416067e-05, + "loss": 0.3462, + "step": 1643 + }, + { + "epoch": 2.1323815704088256, + "grad_norm": 0.24539006715088627, + "learning_rate": 1.6041366041366042e-05, + "loss": 0.3474, + "step": 1644 + }, + { + "epoch": 2.133679428942245, + "grad_norm": 0.2071657985362404, + "learning_rate": 1.6017316017316017e-05, + "loss": 0.315, + "step": 1645 + }, + { + "epoch": 2.134977287475665, + "grad_norm": 0.22559015132345064, + "learning_rate": 1.5993265993265995e-05, + "loss": 0.3435, + "step": 1646 + }, + { + "epoch": 2.136275146009085, + "grad_norm": 0.21866568313609377, + "learning_rate": 1.596921596921597e-05, + "loss": 0.3214, + "step": 1647 + }, + { + "epoch": 2.1375730045425048, + "grad_norm": 0.22054227732700918, + "learning_rate": 1.5945165945165948e-05, + "loss": 0.3366, + "step": 1648 + }, + { + "epoch": 2.1388708630759248, + "grad_norm": 0.2513837156975892, + "learning_rate": 1.5921115921115923e-05, + "loss": 0.3661, + "step": 1649 + }, + { + "epoch": 2.140168721609345, + "grad_norm": 0.22217224038750824, + "learning_rate": 1.5897065897065898e-05, + "loss": 0.3377, + "step": 1650 + }, + { + "epoch": 2.1414665801427644, + "grad_norm": 0.21936038896279167, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.3326, + "step": 1651 + }, + { + "epoch": 2.1427644386761844, + "grad_norm": 0.2064910202725993, + "learning_rate": 1.584896584896585e-05, + "loss": 0.3365, + "step": 1652 + }, + { + "epoch": 2.144062297209604, + "grad_norm": 0.20100161814145587, + "learning_rate": 1.5824915824915825e-05, + "loss": 0.3278, + "step": 1653 + }, + { + "epoch": 2.145360155743024, + "grad_norm": 0.21879340248910953, + "learning_rate": 1.5800865800865803e-05, + "loss": 0.3431, + "step": 1654 + }, + { + "epoch": 2.146658014276444, + "grad_norm": 0.2129168285291938, + "learning_rate": 1.5776815776815778e-05, + "loss": 0.3503, + "step": 1655 + }, + { + "epoch": 2.1479558728098636, + "grad_norm": 0.2091423056576906, + "learning_rate": 1.5752765752765753e-05, + "loss": 0.3492, + "step": 1656 + }, + { + "epoch": 2.1492537313432836, + "grad_norm": 0.2828239816378975, + "learning_rate": 1.5728715728715728e-05, + "loss": 0.3564, + "step": 1657 + }, + { + "epoch": 2.1505515898767036, + "grad_norm": 0.21413983501492062, + "learning_rate": 1.5704665704665706e-05, + "loss": 0.3544, + "step": 1658 + }, + { + "epoch": 2.151849448410123, + "grad_norm": 0.21542714936607865, + "learning_rate": 1.568061568061568e-05, + "loss": 0.3399, + "step": 1659 + }, + { + "epoch": 2.153147306943543, + "grad_norm": 0.23220715953725188, + "learning_rate": 1.565656565656566e-05, + "loss": 0.3322, + "step": 1660 + }, + { + "epoch": 2.154445165476963, + "grad_norm": 0.20221618002166372, + "learning_rate": 1.563251563251563e-05, + "loss": 0.3297, + "step": 1661 + }, + { + "epoch": 2.1557430240103828, + "grad_norm": 0.23075242395941706, + "learning_rate": 1.560846560846561e-05, + "loss": 0.3372, + "step": 1662 + }, + { + "epoch": 2.1570408825438028, + "grad_norm": 0.2065260094429175, + "learning_rate": 1.5584415584415583e-05, + "loss": 0.3396, + "step": 1663 + }, + { + "epoch": 2.158338741077223, + "grad_norm": 0.2048397557697161, + "learning_rate": 1.556036556036556e-05, + "loss": 0.338, + "step": 1664 + }, + { + "epoch": 2.1596365996106424, + "grad_norm": 0.21058822127057414, + "learning_rate": 1.5536315536315536e-05, + "loss": 0.3475, + "step": 1665 + }, + { + "epoch": 2.1609344581440624, + "grad_norm": 0.20856117486455514, + "learning_rate": 1.5512265512265514e-05, + "loss": 0.3565, + "step": 1666 + }, + { + "epoch": 2.162232316677482, + "grad_norm": 0.2101578604301147, + "learning_rate": 1.548821548821549e-05, + "loss": 0.3341, + "step": 1667 + }, + { + "epoch": 2.163530175210902, + "grad_norm": 0.1948843188896066, + "learning_rate": 1.5464165464165464e-05, + "loss": 0.3307, + "step": 1668 + }, + { + "epoch": 2.164828033744322, + "grad_norm": 0.22409902293345668, + "learning_rate": 1.5440115440115442e-05, + "loss": 0.3484, + "step": 1669 + }, + { + "epoch": 2.1661258922777415, + "grad_norm": 0.22671547696900604, + "learning_rate": 1.5416065416065417e-05, + "loss": 0.3523, + "step": 1670 + }, + { + "epoch": 2.1674237508111616, + "grad_norm": 0.21140422198147263, + "learning_rate": 1.5392015392015395e-05, + "loss": 0.3455, + "step": 1671 + }, + { + "epoch": 2.1687216093445816, + "grad_norm": 0.21245427765806485, + "learning_rate": 1.5367965367965366e-05, + "loss": 0.3429, + "step": 1672 + }, + { + "epoch": 2.170019467878001, + "grad_norm": 0.20475755840127652, + "learning_rate": 1.5343915343915344e-05, + "loss": 0.334, + "step": 1673 + }, + { + "epoch": 2.171317326411421, + "grad_norm": 0.22161947726300274, + "learning_rate": 1.531986531986532e-05, + "loss": 0.3445, + "step": 1674 + }, + { + "epoch": 2.172615184944841, + "grad_norm": 0.20686901638420155, + "learning_rate": 1.5295815295815297e-05, + "loss": 0.3423, + "step": 1675 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.21241410102955396, + "learning_rate": 1.5271765271765272e-05, + "loss": 0.3272, + "step": 1676 + }, + { + "epoch": 2.1752109020116808, + "grad_norm": 0.20178219626778365, + "learning_rate": 1.524771524771525e-05, + "loss": 0.3429, + "step": 1677 + }, + { + "epoch": 2.176508760545101, + "grad_norm": 0.24450118577688046, + "learning_rate": 1.5223665223665223e-05, + "loss": 0.3481, + "step": 1678 + }, + { + "epoch": 2.1778066190785204, + "grad_norm": 0.20454351303651527, + "learning_rate": 1.51996151996152e-05, + "loss": 0.3581, + "step": 1679 + }, + { + "epoch": 2.1791044776119404, + "grad_norm": 0.21613091983231933, + "learning_rate": 1.5175565175565176e-05, + "loss": 0.3299, + "step": 1680 + }, + { + "epoch": 2.1804023361453604, + "grad_norm": 0.23413467617760164, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.3622, + "step": 1681 + }, + { + "epoch": 2.18170019467878, + "grad_norm": 0.21133908102093862, + "learning_rate": 1.512746512746513e-05, + "loss": 0.3323, + "step": 1682 + }, + { + "epoch": 2.1829980532122, + "grad_norm": 0.2117083520838165, + "learning_rate": 1.5103415103415102e-05, + "loss": 0.3394, + "step": 1683 + }, + { + "epoch": 2.1842959117456195, + "grad_norm": 0.21732760719194733, + "learning_rate": 1.5079365079365079e-05, + "loss": 0.3419, + "step": 1684 + }, + { + "epoch": 2.1855937702790396, + "grad_norm": 0.20428681560868892, + "learning_rate": 1.5055315055315055e-05, + "loss": 0.3346, + "step": 1685 + }, + { + "epoch": 2.1868916288124596, + "grad_norm": 0.21590103314227366, + "learning_rate": 1.5031265031265032e-05, + "loss": 0.3662, + "step": 1686 + }, + { + "epoch": 2.188189487345879, + "grad_norm": 0.21412757670479102, + "learning_rate": 1.5007215007215008e-05, + "loss": 0.3334, + "step": 1687 + }, + { + "epoch": 2.189487345879299, + "grad_norm": 0.2161794768756431, + "learning_rate": 1.4983164983164985e-05, + "loss": 0.334, + "step": 1688 + }, + { + "epoch": 2.190785204412719, + "grad_norm": 0.2121354087724828, + "learning_rate": 1.495911495911496e-05, + "loss": 0.344, + "step": 1689 + }, + { + "epoch": 2.1920830629461387, + "grad_norm": 0.20921645686294296, + "learning_rate": 1.4935064935064936e-05, + "loss": 0.3397, + "step": 1690 + }, + { + "epoch": 2.1933809214795588, + "grad_norm": 0.20966308146376983, + "learning_rate": 1.4911014911014912e-05, + "loss": 0.3464, + "step": 1691 + }, + { + "epoch": 2.194678780012979, + "grad_norm": 0.21908775100267822, + "learning_rate": 1.4886964886964889e-05, + "loss": 0.3317, + "step": 1692 + }, + { + "epoch": 2.1959766385463984, + "grad_norm": 0.22468058324603687, + "learning_rate": 1.4862914862914865e-05, + "loss": 0.3407, + "step": 1693 + }, + { + "epoch": 2.1972744970798184, + "grad_norm": 0.22075013149066078, + "learning_rate": 1.4838864838864838e-05, + "loss": 0.3283, + "step": 1694 + }, + { + "epoch": 2.198572355613238, + "grad_norm": 0.21551594408964253, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.3199, + "step": 1695 + }, + { + "epoch": 2.199870214146658, + "grad_norm": 0.21485802609507748, + "learning_rate": 1.4790764790764791e-05, + "loss": 0.3287, + "step": 1696 + }, + { + "epoch": 2.201168072680078, + "grad_norm": 0.2151673200316436, + "learning_rate": 1.4766714766714768e-05, + "loss": 0.3426, + "step": 1697 + }, + { + "epoch": 2.2024659312134975, + "grad_norm": 0.22612559865377715, + "learning_rate": 1.4742664742664744e-05, + "loss": 0.3573, + "step": 1698 + }, + { + "epoch": 2.2037637897469176, + "grad_norm": 0.21837527709726925, + "learning_rate": 1.471861471861472e-05, + "loss": 0.3325, + "step": 1699 + }, + { + "epoch": 2.2050616482803376, + "grad_norm": 0.21236356177018392, + "learning_rate": 1.4694564694564694e-05, + "loss": 0.3553, + "step": 1700 + }, + { + "epoch": 2.206359506813757, + "grad_norm": 0.19590709294592334, + "learning_rate": 1.467051467051467e-05, + "loss": 0.3215, + "step": 1701 + }, + { + "epoch": 2.207657365347177, + "grad_norm": 0.22771176110161379, + "learning_rate": 1.4646464646464647e-05, + "loss": 0.3465, + "step": 1702 + }, + { + "epoch": 2.208955223880597, + "grad_norm": 0.22229793471063694, + "learning_rate": 1.4622414622414623e-05, + "loss": 0.3432, + "step": 1703 + }, + { + "epoch": 2.2102530824140167, + "grad_norm": 0.21252084935676113, + "learning_rate": 1.45983645983646e-05, + "loss": 0.3508, + "step": 1704 + }, + { + "epoch": 2.2115509409474368, + "grad_norm": 0.2202632087669158, + "learning_rate": 1.4574314574314573e-05, + "loss": 0.3369, + "step": 1705 + }, + { + "epoch": 2.2128487994808568, + "grad_norm": 0.21520394343961766, + "learning_rate": 1.455026455026455e-05, + "loss": 0.3327, + "step": 1706 + }, + { + "epoch": 2.2141466580142763, + "grad_norm": 0.23322877877504564, + "learning_rate": 1.4526214526214526e-05, + "loss": 0.3491, + "step": 1707 + }, + { + "epoch": 2.2154445165476964, + "grad_norm": 0.22519715240573737, + "learning_rate": 1.4502164502164502e-05, + "loss": 0.3403, + "step": 1708 + }, + { + "epoch": 2.2167423750811164, + "grad_norm": 0.22210131676178194, + "learning_rate": 1.4478114478114479e-05, + "loss": 0.344, + "step": 1709 + }, + { + "epoch": 2.218040233614536, + "grad_norm": 0.2119128486555464, + "learning_rate": 1.4454064454064455e-05, + "loss": 0.3562, + "step": 1710 + }, + { + "epoch": 2.219338092147956, + "grad_norm": 0.26673890205097306, + "learning_rate": 1.443001443001443e-05, + "loss": 0.3284, + "step": 1711 + }, + { + "epoch": 2.2206359506813755, + "grad_norm": 0.22633092379431619, + "learning_rate": 1.4405964405964406e-05, + "loss": 0.3383, + "step": 1712 + }, + { + "epoch": 2.2219338092147956, + "grad_norm": 0.21248117786007845, + "learning_rate": 1.4381914381914383e-05, + "loss": 0.3458, + "step": 1713 + }, + { + "epoch": 2.2232316677482156, + "grad_norm": 0.23708025124779677, + "learning_rate": 1.435786435786436e-05, + "loss": 0.3521, + "step": 1714 + }, + { + "epoch": 2.224529526281635, + "grad_norm": 0.2216080470200345, + "learning_rate": 1.4333814333814336e-05, + "loss": 0.338, + "step": 1715 + }, + { + "epoch": 2.225827384815055, + "grad_norm": 0.20064012115244553, + "learning_rate": 1.4309764309764309e-05, + "loss": 0.3323, + "step": 1716 + }, + { + "epoch": 2.227125243348475, + "grad_norm": 0.22946959688466415, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.3533, + "step": 1717 + }, + { + "epoch": 2.2284231018818947, + "grad_norm": 0.23129226313271936, + "learning_rate": 1.4261664261664262e-05, + "loss": 0.3477, + "step": 1718 + }, + { + "epoch": 2.2297209604153148, + "grad_norm": 0.23801346245682223, + "learning_rate": 1.4237614237614238e-05, + "loss": 0.3525, + "step": 1719 + }, + { + "epoch": 2.2310188189487348, + "grad_norm": 0.20874849017749872, + "learning_rate": 1.4213564213564215e-05, + "loss": 0.3414, + "step": 1720 + }, + { + "epoch": 2.2323166774821543, + "grad_norm": 0.21629318124043379, + "learning_rate": 1.4189514189514191e-05, + "loss": 0.3379, + "step": 1721 + }, + { + "epoch": 2.2336145360155744, + "grad_norm": 0.22905806827791111, + "learning_rate": 1.4165464165464164e-05, + "loss": 0.337, + "step": 1722 + }, + { + "epoch": 2.234912394548994, + "grad_norm": 0.2279391640803175, + "learning_rate": 1.4141414141414141e-05, + "loss": 0.3536, + "step": 1723 + }, + { + "epoch": 2.236210253082414, + "grad_norm": 0.20743141809266266, + "learning_rate": 1.4117364117364117e-05, + "loss": 0.3331, + "step": 1724 + }, + { + "epoch": 2.237508111615834, + "grad_norm": 0.21930312729137855, + "learning_rate": 1.4093314093314094e-05, + "loss": 0.3426, + "step": 1725 + }, + { + "epoch": 2.2388059701492535, + "grad_norm": 0.21584060808323735, + "learning_rate": 1.406926406926407e-05, + "loss": 0.3326, + "step": 1726 + }, + { + "epoch": 2.2401038286826735, + "grad_norm": 0.20452984796446286, + "learning_rate": 1.4045214045214045e-05, + "loss": 0.3407, + "step": 1727 + }, + { + "epoch": 2.2414016872160936, + "grad_norm": 0.21278894400954876, + "learning_rate": 1.4021164021164022e-05, + "loss": 0.3272, + "step": 1728 + }, + { + "epoch": 2.242699545749513, + "grad_norm": 0.19989940211340443, + "learning_rate": 1.3997113997113998e-05, + "loss": 0.3268, + "step": 1729 + }, + { + "epoch": 2.243997404282933, + "grad_norm": 0.20756531216735427, + "learning_rate": 1.3973063973063974e-05, + "loss": 0.3428, + "step": 1730 + }, + { + "epoch": 2.245295262816353, + "grad_norm": 0.2289163023889471, + "learning_rate": 1.3949013949013951e-05, + "loss": 0.3531, + "step": 1731 + }, + { + "epoch": 2.2465931213497727, + "grad_norm": 0.19474714109733912, + "learning_rate": 1.3924963924963927e-05, + "loss": 0.3232, + "step": 1732 + }, + { + "epoch": 2.2478909798831928, + "grad_norm": 0.20474938955793603, + "learning_rate": 1.39009139009139e-05, + "loss": 0.3368, + "step": 1733 + }, + { + "epoch": 2.2491888384166128, + "grad_norm": 0.2134278581088506, + "learning_rate": 1.3876863876863877e-05, + "loss": 0.3332, + "step": 1734 + }, + { + "epoch": 2.2504866969500323, + "grad_norm": 0.20557863182364947, + "learning_rate": 1.3852813852813853e-05, + "loss": 0.3189, + "step": 1735 + }, + { + "epoch": 2.2517845554834524, + "grad_norm": 0.20536064005062202, + "learning_rate": 1.382876382876383e-05, + "loss": 0.3432, + "step": 1736 + }, + { + "epoch": 2.2530824140168724, + "grad_norm": 0.20167055126244063, + "learning_rate": 1.3804713804713806e-05, + "loss": 0.3333, + "step": 1737 + }, + { + "epoch": 2.254380272550292, + "grad_norm": 0.21070060115934763, + "learning_rate": 1.378066378066378e-05, + "loss": 0.3447, + "step": 1738 + }, + { + "epoch": 2.255678131083712, + "grad_norm": 0.2207628970020607, + "learning_rate": 1.3756613756613756e-05, + "loss": 0.3583, + "step": 1739 + }, + { + "epoch": 2.256975989617132, + "grad_norm": 0.2127435360633616, + "learning_rate": 1.3732563732563732e-05, + "loss": 0.3348, + "step": 1740 + }, + { + "epoch": 2.2582738481505515, + "grad_norm": 0.21284122659552568, + "learning_rate": 1.3708513708513709e-05, + "loss": 0.3625, + "step": 1741 + }, + { + "epoch": 2.2595717066839716, + "grad_norm": 0.1950008896432417, + "learning_rate": 1.3684463684463685e-05, + "loss": 0.3324, + "step": 1742 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.2020508328430615, + "learning_rate": 1.3660413660413662e-05, + "loss": 0.3479, + "step": 1743 + }, + { + "epoch": 2.262167423750811, + "grad_norm": 0.20145545550530372, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.3499, + "step": 1744 + }, + { + "epoch": 2.263465282284231, + "grad_norm": 0.20947296790103498, + "learning_rate": 1.3612313612313613e-05, + "loss": 0.3499, + "step": 1745 + }, + { + "epoch": 2.2647631408176507, + "grad_norm": 0.20527034801018748, + "learning_rate": 1.358826358826359e-05, + "loss": 0.3311, + "step": 1746 + }, + { + "epoch": 2.2660609993510707, + "grad_norm": 0.21571814965895064, + "learning_rate": 1.3564213564213566e-05, + "loss": 0.3445, + "step": 1747 + }, + { + "epoch": 2.2673588578844908, + "grad_norm": 0.1951070244580519, + "learning_rate": 1.3540163540163542e-05, + "loss": 0.3317, + "step": 1748 + }, + { + "epoch": 2.2686567164179103, + "grad_norm": 0.20164098805440273, + "learning_rate": 1.3516113516113516e-05, + "loss": 0.3489, + "step": 1749 + }, + { + "epoch": 2.2699545749513304, + "grad_norm": 0.2037235219687979, + "learning_rate": 1.3492063492063492e-05, + "loss": 0.3447, + "step": 1750 + }, + { + "epoch": 2.27125243348475, + "grad_norm": 0.21220292515247122, + "learning_rate": 1.3468013468013468e-05, + "loss": 0.3395, + "step": 1751 + }, + { + "epoch": 2.27255029201817, + "grad_norm": 0.2037471406983462, + "learning_rate": 1.3443963443963445e-05, + "loss": 0.3415, + "step": 1752 + }, + { + "epoch": 2.27384815055159, + "grad_norm": 0.20298389579886292, + "learning_rate": 1.3419913419913421e-05, + "loss": 0.3374, + "step": 1753 + }, + { + "epoch": 2.2751460090850095, + "grad_norm": 0.20691493521870025, + "learning_rate": 1.3395863395863398e-05, + "loss": 0.3519, + "step": 1754 + }, + { + "epoch": 2.2764438676184295, + "grad_norm": 0.2013802963124298, + "learning_rate": 1.3371813371813371e-05, + "loss": 0.3411, + "step": 1755 + }, + { + "epoch": 2.2777417261518496, + "grad_norm": 0.20436433898344716, + "learning_rate": 1.3347763347763347e-05, + "loss": 0.3447, + "step": 1756 + }, + { + "epoch": 2.279039584685269, + "grad_norm": 0.1926683227358606, + "learning_rate": 1.3323713323713324e-05, + "loss": 0.3326, + "step": 1757 + }, + { + "epoch": 2.280337443218689, + "grad_norm": 0.22362998880849946, + "learning_rate": 1.32996632996633e-05, + "loss": 0.338, + "step": 1758 + }, + { + "epoch": 2.281635301752109, + "grad_norm": 0.19696154548860742, + "learning_rate": 1.3275613275613277e-05, + "loss": 0.3302, + "step": 1759 + }, + { + "epoch": 2.2829331602855287, + "grad_norm": 0.21290597699073446, + "learning_rate": 1.3251563251563252e-05, + "loss": 0.3341, + "step": 1760 + }, + { + "epoch": 2.2842310188189487, + "grad_norm": 0.2025599154856036, + "learning_rate": 1.3227513227513228e-05, + "loss": 0.3292, + "step": 1761 + }, + { + "epoch": 2.2855288773523688, + "grad_norm": 0.20827197895684288, + "learning_rate": 1.3203463203463205e-05, + "loss": 0.348, + "step": 1762 + }, + { + "epoch": 2.2868267358857883, + "grad_norm": 0.2031367197949487, + "learning_rate": 1.3179413179413181e-05, + "loss": 0.3353, + "step": 1763 + }, + { + "epoch": 2.2881245944192083, + "grad_norm": 0.20119905187117781, + "learning_rate": 1.3155363155363157e-05, + "loss": 0.3291, + "step": 1764 + }, + { + "epoch": 2.2894224529526284, + "grad_norm": 0.19920483051203258, + "learning_rate": 1.3131313131313134e-05, + "loss": 0.3475, + "step": 1765 + }, + { + "epoch": 2.290720311486048, + "grad_norm": 0.3011556934952649, + "learning_rate": 1.3107263107263107e-05, + "loss": 0.3535, + "step": 1766 + }, + { + "epoch": 2.292018170019468, + "grad_norm": 0.21100649854372194, + "learning_rate": 1.3083213083213083e-05, + "loss": 0.3469, + "step": 1767 + }, + { + "epoch": 2.293316028552888, + "grad_norm": 0.1984331828091875, + "learning_rate": 1.305916305916306e-05, + "loss": 0.346, + "step": 1768 + }, + { + "epoch": 2.2946138870863075, + "grad_norm": 0.2128841448679342, + "learning_rate": 1.3035113035113036e-05, + "loss": 0.3426, + "step": 1769 + }, + { + "epoch": 2.2959117456197276, + "grad_norm": 0.20032623160831234, + "learning_rate": 1.3011063011063013e-05, + "loss": 0.3323, + "step": 1770 + }, + { + "epoch": 2.297209604153147, + "grad_norm": 0.21020072070413953, + "learning_rate": 1.2987012987012986e-05, + "loss": 0.345, + "step": 1771 + }, + { + "epoch": 2.298507462686567, + "grad_norm": 0.220140017454239, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.338, + "step": 1772 + }, + { + "epoch": 2.299805321219987, + "grad_norm": 0.20602192883284073, + "learning_rate": 1.2938912938912939e-05, + "loss": 0.3295, + "step": 1773 + }, + { + "epoch": 2.3011031797534067, + "grad_norm": 0.19782214494971023, + "learning_rate": 1.2914862914862915e-05, + "loss": 0.3443, + "step": 1774 + }, + { + "epoch": 2.3024010382868267, + "grad_norm": 0.2212349266901331, + "learning_rate": 1.2890812890812892e-05, + "loss": 0.3372, + "step": 1775 + }, + { + "epoch": 2.3036988968202468, + "grad_norm": 0.20379287350155195, + "learning_rate": 1.2866762866762868e-05, + "loss": 0.3298, + "step": 1776 + }, + { + "epoch": 2.3049967553536663, + "grad_norm": 0.22633645234017427, + "learning_rate": 1.2842712842712843e-05, + "loss": 0.3358, + "step": 1777 + }, + { + "epoch": 2.3062946138870863, + "grad_norm": 0.22571485246122885, + "learning_rate": 1.281866281866282e-05, + "loss": 0.3535, + "step": 1778 + }, + { + "epoch": 2.3075924724205064, + "grad_norm": 0.1971894196957368, + "learning_rate": 1.2794612794612796e-05, + "loss": 0.3437, + "step": 1779 + }, + { + "epoch": 2.308890330953926, + "grad_norm": 0.21039001676553398, + "learning_rate": 1.2770562770562773e-05, + "loss": 0.3436, + "step": 1780 + }, + { + "epoch": 2.310188189487346, + "grad_norm": 0.21586183237415976, + "learning_rate": 1.2746512746512749e-05, + "loss": 0.3371, + "step": 1781 + }, + { + "epoch": 2.3114860480207655, + "grad_norm": 0.22449090420078008, + "learning_rate": 1.2722462722462722e-05, + "loss": 0.3426, + "step": 1782 + }, + { + "epoch": 2.3127839065541855, + "grad_norm": 0.20000822387430392, + "learning_rate": 1.2698412698412699e-05, + "loss": 0.3273, + "step": 1783 + }, + { + "epoch": 2.3140817650876055, + "grad_norm": 0.19796366453881245, + "learning_rate": 1.2674362674362675e-05, + "loss": 0.341, + "step": 1784 + }, + { + "epoch": 2.315379623621025, + "grad_norm": 0.20625398962095803, + "learning_rate": 1.2650312650312651e-05, + "loss": 0.3327, + "step": 1785 + }, + { + "epoch": 2.316677482154445, + "grad_norm": 0.2018595792474862, + "learning_rate": 1.2626262626262628e-05, + "loss": 0.3425, + "step": 1786 + }, + { + "epoch": 2.317975340687865, + "grad_norm": 0.21810379177370443, + "learning_rate": 1.2602212602212604e-05, + "loss": 0.3411, + "step": 1787 + }, + { + "epoch": 2.3192731992212847, + "grad_norm": 0.21122946645750976, + "learning_rate": 1.2578162578162577e-05, + "loss": 0.3519, + "step": 1788 + }, + { + "epoch": 2.3205710577547047, + "grad_norm": 0.21682608106667023, + "learning_rate": 1.2554112554112554e-05, + "loss": 0.3391, + "step": 1789 + }, + { + "epoch": 2.3218689162881248, + "grad_norm": 0.21596334841986267, + "learning_rate": 1.253006253006253e-05, + "loss": 0.3478, + "step": 1790 + }, + { + "epoch": 2.3231667748215443, + "grad_norm": 0.21752697824305056, + "learning_rate": 1.2506012506012507e-05, + "loss": 0.3327, + "step": 1791 + }, + { + "epoch": 2.3244646333549643, + "grad_norm": 0.19911928426673434, + "learning_rate": 1.2481962481962482e-05, + "loss": 0.3337, + "step": 1792 + }, + { + "epoch": 2.3257624918883844, + "grad_norm": 0.21067483324446745, + "learning_rate": 1.2457912457912458e-05, + "loss": 0.3383, + "step": 1793 + }, + { + "epoch": 2.327060350421804, + "grad_norm": 0.21855693528672904, + "learning_rate": 1.2433862433862433e-05, + "loss": 0.3392, + "step": 1794 + }, + { + "epoch": 2.328358208955224, + "grad_norm": 0.2313881551522217, + "learning_rate": 1.240981240981241e-05, + "loss": 0.344, + "step": 1795 + }, + { + "epoch": 2.329656067488644, + "grad_norm": 0.21647217328647403, + "learning_rate": 1.2385762385762386e-05, + "loss": 0.3498, + "step": 1796 + }, + { + "epoch": 2.3309539260220635, + "grad_norm": 0.23563572241047098, + "learning_rate": 1.2361712361712362e-05, + "loss": 0.3492, + "step": 1797 + }, + { + "epoch": 2.3322517845554835, + "grad_norm": 0.22879253600129817, + "learning_rate": 1.2337662337662339e-05, + "loss": 0.3476, + "step": 1798 + }, + { + "epoch": 2.3335496430889036, + "grad_norm": 0.2194770239864021, + "learning_rate": 1.2313612313612315e-05, + "loss": 0.3499, + "step": 1799 + }, + { + "epoch": 2.334847501622323, + "grad_norm": 0.2323432486506921, + "learning_rate": 1.228956228956229e-05, + "loss": 0.3498, + "step": 1800 + }, + { + "epoch": 2.336145360155743, + "grad_norm": 0.2379829541757471, + "learning_rate": 1.2265512265512267e-05, + "loss": 0.3455, + "step": 1801 + }, + { + "epoch": 2.3374432186891627, + "grad_norm": 0.22305188673020696, + "learning_rate": 1.2241462241462243e-05, + "loss": 0.3464, + "step": 1802 + }, + { + "epoch": 2.3387410772225827, + "grad_norm": 0.20626742501221118, + "learning_rate": 1.2217412217412218e-05, + "loss": 0.3383, + "step": 1803 + }, + { + "epoch": 2.3400389357560027, + "grad_norm": 0.20476597434634727, + "learning_rate": 1.2193362193362194e-05, + "loss": 0.3271, + "step": 1804 + }, + { + "epoch": 2.3413367942894223, + "grad_norm": 0.2361285638480301, + "learning_rate": 1.2169312169312169e-05, + "loss": 0.363, + "step": 1805 + }, + { + "epoch": 2.3426346528228423, + "grad_norm": 0.21885176640576573, + "learning_rate": 1.2145262145262145e-05, + "loss": 0.3322, + "step": 1806 + }, + { + "epoch": 2.3439325113562623, + "grad_norm": 0.2186287737946408, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.3517, + "step": 1807 + }, + { + "epoch": 2.345230369889682, + "grad_norm": 0.22242925263530489, + "learning_rate": 1.2097162097162097e-05, + "loss": 0.3556, + "step": 1808 + }, + { + "epoch": 2.346528228423102, + "grad_norm": 0.21839105455863506, + "learning_rate": 1.2073112073112073e-05, + "loss": 0.3537, + "step": 1809 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.19288692731778082, + "learning_rate": 1.204906204906205e-05, + "loss": 0.3267, + "step": 1810 + }, + { + "epoch": 2.3491239454899415, + "grad_norm": 0.21299009964604498, + "learning_rate": 1.2025012025012024e-05, + "loss": 0.3343, + "step": 1811 + }, + { + "epoch": 2.3504218040233615, + "grad_norm": 0.20073387684513502, + "learning_rate": 1.2000962000962001e-05, + "loss": 0.3366, + "step": 1812 + }, + { + "epoch": 2.351719662556781, + "grad_norm": 0.20924433976474296, + "learning_rate": 1.1976911976911977e-05, + "loss": 0.3503, + "step": 1813 + }, + { + "epoch": 2.353017521090201, + "grad_norm": 0.2024959718616962, + "learning_rate": 1.1952861952861954e-05, + "loss": 0.3398, + "step": 1814 + }, + { + "epoch": 2.354315379623621, + "grad_norm": 0.20136147992617448, + "learning_rate": 1.192881192881193e-05, + "loss": 0.3329, + "step": 1815 + }, + { + "epoch": 2.3556132381570407, + "grad_norm": 0.2023856480954257, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.3431, + "step": 1816 + }, + { + "epoch": 2.3569110966904607, + "grad_norm": 0.19571106019001464, + "learning_rate": 1.1880711880711882e-05, + "loss": 0.3409, + "step": 1817 + }, + { + "epoch": 2.3582089552238807, + "grad_norm": 0.20043918045056716, + "learning_rate": 1.1856661856661858e-05, + "loss": 0.3457, + "step": 1818 + }, + { + "epoch": 2.3595068137573003, + "grad_norm": 0.20301799957042976, + "learning_rate": 1.1832611832611833e-05, + "loss": 0.358, + "step": 1819 + }, + { + "epoch": 2.3608046722907203, + "grad_norm": 0.1985053306211951, + "learning_rate": 1.180856180856181e-05, + "loss": 0.3476, + "step": 1820 + }, + { + "epoch": 2.3621025308241403, + "grad_norm": 0.1946833217729552, + "learning_rate": 1.1784511784511786e-05, + "loss": 0.3381, + "step": 1821 + }, + { + "epoch": 2.36340038935756, + "grad_norm": 0.20694128635780762, + "learning_rate": 1.176046176046176e-05, + "loss": 0.3484, + "step": 1822 + }, + { + "epoch": 2.36469824789098, + "grad_norm": 0.19680244677531286, + "learning_rate": 1.1736411736411737e-05, + "loss": 0.3412, + "step": 1823 + }, + { + "epoch": 2.3659961064244, + "grad_norm": 0.19475058414044913, + "learning_rate": 1.1712361712361713e-05, + "loss": 0.3338, + "step": 1824 + }, + { + "epoch": 2.3672939649578195, + "grad_norm": 0.20017845696117334, + "learning_rate": 1.1688311688311688e-05, + "loss": 0.3292, + "step": 1825 + }, + { + "epoch": 2.3685918234912395, + "grad_norm": 0.21405181485690658, + "learning_rate": 1.1664261664261665e-05, + "loss": 0.3319, + "step": 1826 + }, + { + "epoch": 2.3698896820246595, + "grad_norm": 0.2255906580166369, + "learning_rate": 1.164021164021164e-05, + "loss": 0.3583, + "step": 1827 + }, + { + "epoch": 2.371187540558079, + "grad_norm": 0.19990250671178067, + "learning_rate": 1.1616161616161616e-05, + "loss": 0.337, + "step": 1828 + }, + { + "epoch": 2.372485399091499, + "grad_norm": 0.19827952220037648, + "learning_rate": 1.1592111592111592e-05, + "loss": 0.3432, + "step": 1829 + }, + { + "epoch": 2.3737832576249187, + "grad_norm": 0.19939279257051523, + "learning_rate": 1.1568061568061569e-05, + "loss": 0.3374, + "step": 1830 + }, + { + "epoch": 2.3750811161583387, + "grad_norm": 0.1906800360211246, + "learning_rate": 1.1544011544011545e-05, + "loss": 0.3423, + "step": 1831 + }, + { + "epoch": 2.3763789746917587, + "grad_norm": 0.2027119176012166, + "learning_rate": 1.1519961519961522e-05, + "loss": 0.3431, + "step": 1832 + }, + { + "epoch": 2.3776768332251783, + "grad_norm": 0.20771653103434248, + "learning_rate": 1.1495911495911497e-05, + "loss": 0.3621, + "step": 1833 + }, + { + "epoch": 2.3789746917585983, + "grad_norm": 0.18554441127828813, + "learning_rate": 1.1471861471861473e-05, + "loss": 0.326, + "step": 1834 + }, + { + "epoch": 2.3802725502920183, + "grad_norm": 0.19747340923777565, + "learning_rate": 1.144781144781145e-05, + "loss": 0.3247, + "step": 1835 + }, + { + "epoch": 2.381570408825438, + "grad_norm": 0.20647886448091093, + "learning_rate": 1.1423761423761424e-05, + "loss": 0.3341, + "step": 1836 + }, + { + "epoch": 2.382868267358858, + "grad_norm": 0.1957627449624196, + "learning_rate": 1.13997113997114e-05, + "loss": 0.3328, + "step": 1837 + }, + { + "epoch": 2.3841661258922775, + "grad_norm": 0.19525704689585352, + "learning_rate": 1.1375661375661376e-05, + "loss": 0.347, + "step": 1838 + }, + { + "epoch": 2.3854639844256975, + "grad_norm": 0.20804623050610585, + "learning_rate": 1.1351611351611352e-05, + "loss": 0.3313, + "step": 1839 + }, + { + "epoch": 2.3867618429591175, + "grad_norm": 0.21139171271078994, + "learning_rate": 1.1327561327561329e-05, + "loss": 0.3295, + "step": 1840 + }, + { + "epoch": 2.388059701492537, + "grad_norm": 0.19695078516317913, + "learning_rate": 1.1303511303511303e-05, + "loss": 0.3244, + "step": 1841 + }, + { + "epoch": 2.389357560025957, + "grad_norm": 0.21329704668287522, + "learning_rate": 1.127946127946128e-05, + "loss": 0.3496, + "step": 1842 + }, + { + "epoch": 2.390655418559377, + "grad_norm": 0.19692253824786773, + "learning_rate": 1.1255411255411256e-05, + "loss": 0.3414, + "step": 1843 + }, + { + "epoch": 2.3919532770927967, + "grad_norm": 0.20145116731281673, + "learning_rate": 1.1231361231361231e-05, + "loss": 0.3411, + "step": 1844 + }, + { + "epoch": 2.3932511356262167, + "grad_norm": 0.21094040457540925, + "learning_rate": 1.1207311207311207e-05, + "loss": 0.3425, + "step": 1845 + }, + { + "epoch": 2.3945489941596367, + "grad_norm": 0.20933806797553264, + "learning_rate": 1.1183261183261184e-05, + "loss": 0.3447, + "step": 1846 + }, + { + "epoch": 2.3958468526930563, + "grad_norm": 0.22025133979918343, + "learning_rate": 1.1159211159211159e-05, + "loss": 0.3276, + "step": 1847 + }, + { + "epoch": 2.3971447112264763, + "grad_norm": 0.2033688921914023, + "learning_rate": 1.1135161135161135e-05, + "loss": 0.3427, + "step": 1848 + }, + { + "epoch": 2.3984425697598963, + "grad_norm": 0.20001119671379927, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.3291, + "step": 1849 + }, + { + "epoch": 2.399740428293316, + "grad_norm": 0.2096895946679153, + "learning_rate": 1.1087061087061088e-05, + "loss": 0.3529, + "step": 1850 + }, + { + "epoch": 2.401038286826736, + "grad_norm": 0.20086557980683176, + "learning_rate": 1.1063011063011065e-05, + "loss": 0.3386, + "step": 1851 + }, + { + "epoch": 2.402336145360156, + "grad_norm": 0.19654561939004062, + "learning_rate": 1.103896103896104e-05, + "loss": 0.347, + "step": 1852 + }, + { + "epoch": 2.4036340038935755, + "grad_norm": 0.20190747910646842, + "learning_rate": 1.1014911014911016e-05, + "loss": 0.3355, + "step": 1853 + }, + { + "epoch": 2.4049318624269955, + "grad_norm": 0.20073723518377382, + "learning_rate": 1.0990860990860992e-05, + "loss": 0.3506, + "step": 1854 + }, + { + "epoch": 2.4062297209604155, + "grad_norm": 0.19812022485550956, + "learning_rate": 1.0966810966810967e-05, + "loss": 0.3326, + "step": 1855 + }, + { + "epoch": 2.407527579493835, + "grad_norm": 0.20293093391898026, + "learning_rate": 1.0942760942760944e-05, + "loss": 0.3437, + "step": 1856 + }, + { + "epoch": 2.408825438027255, + "grad_norm": 0.18445064925773152, + "learning_rate": 1.091871091871092e-05, + "loss": 0.3412, + "step": 1857 + }, + { + "epoch": 2.4101232965606747, + "grad_norm": 0.18982196010225733, + "learning_rate": 1.0894660894660895e-05, + "loss": 0.3334, + "step": 1858 + }, + { + "epoch": 2.4114211550940947, + "grad_norm": 0.19790842980140105, + "learning_rate": 1.0870610870610871e-05, + "loss": 0.341, + "step": 1859 + }, + { + "epoch": 2.4127190136275147, + "grad_norm": 0.21825470803362326, + "learning_rate": 1.0846560846560846e-05, + "loss": 0.3463, + "step": 1860 + }, + { + "epoch": 2.4140168721609343, + "grad_norm": 0.19742393864907667, + "learning_rate": 1.0822510822510823e-05, + "loss": 0.3275, + "step": 1861 + }, + { + "epoch": 2.4153147306943543, + "grad_norm": 0.19224523357142126, + "learning_rate": 1.0798460798460799e-05, + "loss": 0.3325, + "step": 1862 + }, + { + "epoch": 2.4166125892277743, + "grad_norm": 0.19938049624693138, + "learning_rate": 1.0774410774410774e-05, + "loss": 0.3262, + "step": 1863 + }, + { + "epoch": 2.417910447761194, + "grad_norm": 0.19485806663699845, + "learning_rate": 1.075036075036075e-05, + "loss": 0.3357, + "step": 1864 + }, + { + "epoch": 2.419208306294614, + "grad_norm": 0.20583844260408463, + "learning_rate": 1.0726310726310727e-05, + "loss": 0.3408, + "step": 1865 + }, + { + "epoch": 2.420506164828034, + "grad_norm": 0.19685425374116253, + "learning_rate": 1.0702260702260703e-05, + "loss": 0.34, + "step": 1866 + }, + { + "epoch": 2.4218040233614535, + "grad_norm": 0.19475698760728785, + "learning_rate": 1.067821067821068e-05, + "loss": 0.3415, + "step": 1867 + }, + { + "epoch": 2.4231018818948735, + "grad_norm": 0.197170239994665, + "learning_rate": 1.0654160654160656e-05, + "loss": 0.3328, + "step": 1868 + }, + { + "epoch": 2.424399740428293, + "grad_norm": 0.2312526818007591, + "learning_rate": 1.0630110630110631e-05, + "loss": 0.3517, + "step": 1869 + }, + { + "epoch": 2.425697598961713, + "grad_norm": 0.21684364271507464, + "learning_rate": 1.0606060606060607e-05, + "loss": 0.3383, + "step": 1870 + }, + { + "epoch": 2.426995457495133, + "grad_norm": 0.1985082584320083, + "learning_rate": 1.0582010582010582e-05, + "loss": 0.3246, + "step": 1871 + }, + { + "epoch": 2.4282933160285527, + "grad_norm": 0.21714639429646007, + "learning_rate": 1.0557960557960559e-05, + "loss": 0.3497, + "step": 1872 + }, + { + "epoch": 2.4295911745619727, + "grad_norm": 0.22321524923011035, + "learning_rate": 1.0533910533910535e-05, + "loss": 0.3553, + "step": 1873 + }, + { + "epoch": 2.4308890330953927, + "grad_norm": 0.23372261304917366, + "learning_rate": 1.050986050986051e-05, + "loss": 0.3686, + "step": 1874 + }, + { + "epoch": 2.4321868916288123, + "grad_norm": 0.20001574330722693, + "learning_rate": 1.0485810485810486e-05, + "loss": 0.3351, + "step": 1875 + }, + { + "epoch": 2.4334847501622323, + "grad_norm": 0.20673565072923414, + "learning_rate": 1.0461760461760463e-05, + "loss": 0.3596, + "step": 1876 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.20095320206649409, + "learning_rate": 1.0437710437710438e-05, + "loss": 0.3328, + "step": 1877 + }, + { + "epoch": 2.436080467229072, + "grad_norm": 0.2789190666720269, + "learning_rate": 1.0413660413660414e-05, + "loss": 0.3509, + "step": 1878 + }, + { + "epoch": 2.437378325762492, + "grad_norm": 0.2079166722723446, + "learning_rate": 1.038961038961039e-05, + "loss": 0.3492, + "step": 1879 + }, + { + "epoch": 2.438676184295912, + "grad_norm": 0.21591855675091434, + "learning_rate": 1.0365560365560365e-05, + "loss": 0.3375, + "step": 1880 + }, + { + "epoch": 2.4399740428293315, + "grad_norm": 0.21397550791993689, + "learning_rate": 1.0341510341510342e-05, + "loss": 0.3495, + "step": 1881 + }, + { + "epoch": 2.4412719013627515, + "grad_norm": 0.3807729201699182, + "learning_rate": 1.0317460317460318e-05, + "loss": 0.3449, + "step": 1882 + }, + { + "epoch": 2.4425697598961715, + "grad_norm": 0.19473145374740133, + "learning_rate": 1.0293410293410295e-05, + "loss": 0.3412, + "step": 1883 + }, + { + "epoch": 2.443867618429591, + "grad_norm": 0.20643794857809838, + "learning_rate": 1.0269360269360271e-05, + "loss": 0.3548, + "step": 1884 + }, + { + "epoch": 2.445165476963011, + "grad_norm": 0.19967818102155932, + "learning_rate": 1.0245310245310246e-05, + "loss": 0.3251, + "step": 1885 + }, + { + "epoch": 2.446463335496431, + "grad_norm": 0.2068701964008534, + "learning_rate": 1.0221260221260222e-05, + "loss": 0.3503, + "step": 1886 + }, + { + "epoch": 2.4477611940298507, + "grad_norm": 0.19485423370421984, + "learning_rate": 1.0197210197210199e-05, + "loss": 0.3316, + "step": 1887 + }, + { + "epoch": 2.4490590525632707, + "grad_norm": 0.20945089504608722, + "learning_rate": 1.0173160173160174e-05, + "loss": 0.3723, + "step": 1888 + }, + { + "epoch": 2.4503569110966903, + "grad_norm": 0.21961631414794303, + "learning_rate": 1.014911014911015e-05, + "loss": 0.3555, + "step": 1889 + }, + { + "epoch": 2.4516547696301103, + "grad_norm": 0.20913009484803424, + "learning_rate": 1.0125060125060125e-05, + "loss": 0.3433, + "step": 1890 + }, + { + "epoch": 2.4529526281635303, + "grad_norm": 0.1975805461078145, + "learning_rate": 1.0101010101010101e-05, + "loss": 0.3522, + "step": 1891 + }, + { + "epoch": 2.45425048669695, + "grad_norm": 0.18824749115573988, + "learning_rate": 1.0076960076960078e-05, + "loss": 0.3262, + "step": 1892 + }, + { + "epoch": 2.45554834523037, + "grad_norm": 0.19363390712933798, + "learning_rate": 1.0052910052910053e-05, + "loss": 0.3304, + "step": 1893 + }, + { + "epoch": 2.45684620376379, + "grad_norm": 0.20877531448498393, + "learning_rate": 1.0028860028860029e-05, + "loss": 0.3502, + "step": 1894 + }, + { + "epoch": 2.4581440622972095, + "grad_norm": 0.2061242277033731, + "learning_rate": 1.0004810004810006e-05, + "loss": 0.3382, + "step": 1895 + }, + { + "epoch": 2.4594419208306295, + "grad_norm": 0.20527048632536887, + "learning_rate": 9.98075998075998e-06, + "loss": 0.3391, + "step": 1896 + }, + { + "epoch": 2.460739779364049, + "grad_norm": 0.20055534262640298, + "learning_rate": 9.956709956709957e-06, + "loss": 0.3564, + "step": 1897 + }, + { + "epoch": 2.462037637897469, + "grad_norm": 0.19785197665929594, + "learning_rate": 9.932659932659933e-06, + "loss": 0.3443, + "step": 1898 + }, + { + "epoch": 2.463335496430889, + "grad_norm": 0.2037702638037453, + "learning_rate": 9.908609908609908e-06, + "loss": 0.348, + "step": 1899 + }, + { + "epoch": 2.4646333549643087, + "grad_norm": 0.21220856247877268, + "learning_rate": 9.884559884559884e-06, + "loss": 0.3428, + "step": 1900 + }, + { + "epoch": 2.4659312134977287, + "grad_norm": 0.2219575251336399, + "learning_rate": 9.860509860509861e-06, + "loss": 0.3418, + "step": 1901 + }, + { + "epoch": 2.4672290720311487, + "grad_norm": 0.19647517582803717, + "learning_rate": 9.836459836459837e-06, + "loss": 0.3345, + "step": 1902 + }, + { + "epoch": 2.4685269305645683, + "grad_norm": 0.19654210291881336, + "learning_rate": 9.812409812409814e-06, + "loss": 0.3439, + "step": 1903 + }, + { + "epoch": 2.4698247890979883, + "grad_norm": 0.20215632118085225, + "learning_rate": 9.788359788359789e-06, + "loss": 0.343, + "step": 1904 + }, + { + "epoch": 2.4711226476314083, + "grad_norm": 0.21700717313986337, + "learning_rate": 9.764309764309765e-06, + "loss": 0.3422, + "step": 1905 + }, + { + "epoch": 2.472420506164828, + "grad_norm": 0.22639821333763582, + "learning_rate": 9.740259740259742e-06, + "loss": 0.3545, + "step": 1906 + }, + { + "epoch": 2.473718364698248, + "grad_norm": 0.20194895913978017, + "learning_rate": 9.716209716209716e-06, + "loss": 0.3389, + "step": 1907 + }, + { + "epoch": 2.475016223231668, + "grad_norm": 0.20577729584744323, + "learning_rate": 9.692159692159693e-06, + "loss": 0.3399, + "step": 1908 + }, + { + "epoch": 2.4763140817650875, + "grad_norm": 0.19912299573383097, + "learning_rate": 9.66810966810967e-06, + "loss": 0.3373, + "step": 1909 + }, + { + "epoch": 2.4776119402985075, + "grad_norm": 0.21269657354433175, + "learning_rate": 9.644059644059644e-06, + "loss": 0.3266, + "step": 1910 + }, + { + "epoch": 2.4789097988319275, + "grad_norm": 0.20736564955023581, + "learning_rate": 9.62000962000962e-06, + "loss": 0.3778, + "step": 1911 + }, + { + "epoch": 2.480207657365347, + "grad_norm": 0.20499397029892405, + "learning_rate": 9.595959595959595e-06, + "loss": 0.3476, + "step": 1912 + }, + { + "epoch": 2.481505515898767, + "grad_norm": 0.21222717155575171, + "learning_rate": 9.571909571909572e-06, + "loss": 0.3614, + "step": 1913 + }, + { + "epoch": 2.482803374432187, + "grad_norm": 0.19863505520154515, + "learning_rate": 9.547859547859548e-06, + "loss": 0.3405, + "step": 1914 + }, + { + "epoch": 2.4841012329656067, + "grad_norm": 0.18907377927162114, + "learning_rate": 9.523809523809523e-06, + "loss": 0.3286, + "step": 1915 + }, + { + "epoch": 2.4853990914990267, + "grad_norm": 0.21676854818994787, + "learning_rate": 9.4997594997595e-06, + "loss": 0.3464, + "step": 1916 + }, + { + "epoch": 2.4866969500324463, + "grad_norm": 0.20682249845168457, + "learning_rate": 9.475709475709476e-06, + "loss": 0.3482, + "step": 1917 + }, + { + "epoch": 2.4879948085658663, + "grad_norm": 0.20359288013276863, + "learning_rate": 9.451659451659452e-06, + "loss": 0.3361, + "step": 1918 + }, + { + "epoch": 2.4892926670992863, + "grad_norm": 0.20782718092266766, + "learning_rate": 9.427609427609429e-06, + "loss": 0.3437, + "step": 1919 + }, + { + "epoch": 2.490590525632706, + "grad_norm": 0.19751302818390967, + "learning_rate": 9.403559403559405e-06, + "loss": 0.3291, + "step": 1920 + }, + { + "epoch": 2.491888384166126, + "grad_norm": 0.21156577333606844, + "learning_rate": 9.37950937950938e-06, + "loss": 0.3402, + "step": 1921 + }, + { + "epoch": 2.493186242699546, + "grad_norm": 0.20413132600430678, + "learning_rate": 9.355459355459357e-06, + "loss": 0.3483, + "step": 1922 + }, + { + "epoch": 2.4944841012329655, + "grad_norm": 0.1966838578810161, + "learning_rate": 9.331409331409331e-06, + "loss": 0.3412, + "step": 1923 + }, + { + "epoch": 2.4957819597663855, + "grad_norm": 0.19973328891467468, + "learning_rate": 9.307359307359308e-06, + "loss": 0.3458, + "step": 1924 + }, + { + "epoch": 2.497079818299805, + "grad_norm": 0.2020638046963019, + "learning_rate": 9.283309283309284e-06, + "loss": 0.3393, + "step": 1925 + }, + { + "epoch": 2.498377676833225, + "grad_norm": 0.22766804942928376, + "learning_rate": 9.259259259259259e-06, + "loss": 0.3255, + "step": 1926 + }, + { + "epoch": 2.499675535366645, + "grad_norm": 0.1997890343589566, + "learning_rate": 9.235209235209236e-06, + "loss": 0.3379, + "step": 1927 + }, + { + "epoch": 2.5009733939000647, + "grad_norm": 0.19417057565689014, + "learning_rate": 9.211159211159212e-06, + "loss": 0.3362, + "step": 1928 + }, + { + "epoch": 2.5022712524334847, + "grad_norm": 0.2159121005299441, + "learning_rate": 9.187109187109187e-06, + "loss": 0.333, + "step": 1929 + }, + { + "epoch": 2.5035691109669047, + "grad_norm": 0.2152179356087276, + "learning_rate": 9.163059163059163e-06, + "loss": 0.3299, + "step": 1930 + }, + { + "epoch": 2.5048669695003243, + "grad_norm": 0.19905359149185228, + "learning_rate": 9.13900913900914e-06, + "loss": 0.3402, + "step": 1931 + }, + { + "epoch": 2.5061648280337443, + "grad_norm": 0.20691948697032309, + "learning_rate": 9.114959114959115e-06, + "loss": 0.3513, + "step": 1932 + }, + { + "epoch": 2.5074626865671643, + "grad_norm": 0.19894235448528472, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3288, + "step": 1933 + }, + { + "epoch": 2.508760545100584, + "grad_norm": 0.21082966995035995, + "learning_rate": 9.066859066859068e-06, + "loss": 0.3332, + "step": 1934 + }, + { + "epoch": 2.510058403634004, + "grad_norm": 0.20169901454411296, + "learning_rate": 9.042809042809042e-06, + "loss": 0.3223, + "step": 1935 + }, + { + "epoch": 2.511356262167424, + "grad_norm": 0.21866454354506568, + "learning_rate": 9.018759018759019e-06, + "loss": 0.3392, + "step": 1936 + }, + { + "epoch": 2.5126541207008435, + "grad_norm": 0.21257413698744998, + "learning_rate": 8.994708994708995e-06, + "loss": 0.3452, + "step": 1937 + }, + { + "epoch": 2.5139519792342635, + "grad_norm": 0.19946752360820091, + "learning_rate": 8.970658970658972e-06, + "loss": 0.3426, + "step": 1938 + }, + { + "epoch": 2.5152498377676835, + "grad_norm": 0.2048047114149695, + "learning_rate": 8.946608946608948e-06, + "loss": 0.3496, + "step": 1939 + }, + { + "epoch": 2.516547696301103, + "grad_norm": 0.2060713206544726, + "learning_rate": 8.922558922558923e-06, + "loss": 0.3466, + "step": 1940 + }, + { + "epoch": 2.517845554834523, + "grad_norm": 0.21499517524206285, + "learning_rate": 8.8985088985089e-06, + "loss": 0.3391, + "step": 1941 + }, + { + "epoch": 2.519143413367943, + "grad_norm": 0.2043639715896453, + "learning_rate": 8.874458874458876e-06, + "loss": 0.3463, + "step": 1942 + }, + { + "epoch": 2.5204412719013627, + "grad_norm": 0.2183414837407522, + "learning_rate": 8.85040885040885e-06, + "loss": 0.3427, + "step": 1943 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.20285948319637043, + "learning_rate": 8.826358826358827e-06, + "loss": 0.3377, + "step": 1944 + }, + { + "epoch": 2.5230369889682027, + "grad_norm": 0.19391653715006987, + "learning_rate": 8.802308802308802e-06, + "loss": 0.3431, + "step": 1945 + }, + { + "epoch": 2.5243348475016223, + "grad_norm": 0.19725005677345353, + "learning_rate": 8.778258778258778e-06, + "loss": 0.3402, + "step": 1946 + }, + { + "epoch": 2.5256327060350423, + "grad_norm": 0.20776675755315593, + "learning_rate": 8.754208754208755e-06, + "loss": 0.3432, + "step": 1947 + }, + { + "epoch": 2.526930564568462, + "grad_norm": 0.20770257485418178, + "learning_rate": 8.73015873015873e-06, + "loss": 0.3309, + "step": 1948 + }, + { + "epoch": 2.528228423101882, + "grad_norm": 0.19473645944952384, + "learning_rate": 8.706108706108706e-06, + "loss": 0.3335, + "step": 1949 + }, + { + "epoch": 2.529526281635302, + "grad_norm": 0.20760911826948453, + "learning_rate": 8.682058682058683e-06, + "loss": 0.3418, + "step": 1950 + }, + { + "epoch": 2.5308241401687215, + "grad_norm": 0.2094780858154728, + "learning_rate": 8.658008658008657e-06, + "loss": 0.3425, + "step": 1951 + }, + { + "epoch": 2.5321219987021415, + "grad_norm": 0.19840367931370975, + "learning_rate": 8.633958633958634e-06, + "loss": 0.3417, + "step": 1952 + }, + { + "epoch": 2.533419857235561, + "grad_norm": 0.19624410854082408, + "learning_rate": 8.60990860990861e-06, + "loss": 0.3456, + "step": 1953 + }, + { + "epoch": 2.534717715768981, + "grad_norm": 0.19653675310930965, + "learning_rate": 8.585858585858587e-06, + "loss": 0.3287, + "step": 1954 + }, + { + "epoch": 2.536015574302401, + "grad_norm": 0.2033148019089131, + "learning_rate": 8.561808561808563e-06, + "loss": 0.3416, + "step": 1955 + }, + { + "epoch": 2.5373134328358207, + "grad_norm": 0.19360893151616687, + "learning_rate": 8.537758537758538e-06, + "loss": 0.3554, + "step": 1956 + }, + { + "epoch": 2.5386112913692407, + "grad_norm": 0.18776659365965748, + "learning_rate": 8.513708513708514e-06, + "loss": 0.3394, + "step": 1957 + }, + { + "epoch": 2.5399091499026607, + "grad_norm": 0.21687377756718906, + "learning_rate": 8.489658489658491e-06, + "loss": 0.3447, + "step": 1958 + }, + { + "epoch": 2.5412070084360803, + "grad_norm": 0.19788163493461905, + "learning_rate": 8.465608465608466e-06, + "loss": 0.3383, + "step": 1959 + }, + { + "epoch": 2.5425048669695003, + "grad_norm": 0.20991331629937682, + "learning_rate": 8.441558441558442e-06, + "loss": 0.3405, + "step": 1960 + }, + { + "epoch": 2.5438027255029203, + "grad_norm": 0.20315647768031084, + "learning_rate": 8.417508417508419e-06, + "loss": 0.3454, + "step": 1961 + }, + { + "epoch": 2.54510058403634, + "grad_norm": 0.23523075305367147, + "learning_rate": 8.393458393458393e-06, + "loss": 0.3618, + "step": 1962 + }, + { + "epoch": 2.54639844256976, + "grad_norm": 0.2157971364103013, + "learning_rate": 8.36940836940837e-06, + "loss": 0.3497, + "step": 1963 + }, + { + "epoch": 2.54769630110318, + "grad_norm": 0.20499915960426765, + "learning_rate": 8.345358345358346e-06, + "loss": 0.3309, + "step": 1964 + }, + { + "epoch": 2.5489941596365995, + "grad_norm": 0.20698252650858404, + "learning_rate": 8.321308321308321e-06, + "loss": 0.3331, + "step": 1965 + }, + { + "epoch": 2.5502920181700195, + "grad_norm": 0.20188066664017376, + "learning_rate": 8.297258297258298e-06, + "loss": 0.346, + "step": 1966 + }, + { + "epoch": 2.5515898767034395, + "grad_norm": 0.22092684369504248, + "learning_rate": 8.273208273208272e-06, + "loss": 0.3487, + "step": 1967 + }, + { + "epoch": 2.552887735236859, + "grad_norm": 0.20340970119145418, + "learning_rate": 8.249158249158249e-06, + "loss": 0.3341, + "step": 1968 + }, + { + "epoch": 2.554185593770279, + "grad_norm": 0.20252572395415291, + "learning_rate": 8.225108225108225e-06, + "loss": 0.3332, + "step": 1969 + }, + { + "epoch": 2.555483452303699, + "grad_norm": 0.1879834295873596, + "learning_rate": 8.201058201058202e-06, + "loss": 0.3212, + "step": 1970 + }, + { + "epoch": 2.5567813108371187, + "grad_norm": 0.19299674074621231, + "learning_rate": 8.177008177008178e-06, + "loss": 0.3404, + "step": 1971 + }, + { + "epoch": 2.5580791693705387, + "grad_norm": 0.20796973098772337, + "learning_rate": 8.152958152958155e-06, + "loss": 0.3389, + "step": 1972 + }, + { + "epoch": 2.5593770279039587, + "grad_norm": 0.21058263937992205, + "learning_rate": 8.12890812890813e-06, + "loss": 0.3382, + "step": 1973 + }, + { + "epoch": 2.5606748864373783, + "grad_norm": 0.19698041890440957, + "learning_rate": 8.104858104858106e-06, + "loss": 0.3406, + "step": 1974 + }, + { + "epoch": 2.5619727449707983, + "grad_norm": 0.20441051012340986, + "learning_rate": 8.080808080808082e-06, + "loss": 0.3568, + "step": 1975 + }, + { + "epoch": 2.5632706035042183, + "grad_norm": 0.20023706231769592, + "learning_rate": 8.056758056758057e-06, + "loss": 0.3453, + "step": 1976 + }, + { + "epoch": 2.564568462037638, + "grad_norm": 0.19384483347770198, + "learning_rate": 8.032708032708034e-06, + "loss": 0.3541, + "step": 1977 + }, + { + "epoch": 2.565866320571058, + "grad_norm": 0.1902457140768143, + "learning_rate": 8.008658008658008e-06, + "loss": 0.3259, + "step": 1978 + }, + { + "epoch": 2.5671641791044775, + "grad_norm": 0.2065028347871094, + "learning_rate": 7.984607984607985e-06, + "loss": 0.3543, + "step": 1979 + }, + { + "epoch": 2.5684620376378975, + "grad_norm": 0.19772657046385608, + "learning_rate": 7.960557960557961e-06, + "loss": 0.3353, + "step": 1980 + }, + { + "epoch": 2.569759896171317, + "grad_norm": 0.18849021172503813, + "learning_rate": 7.936507936507936e-06, + "loss": 0.3223, + "step": 1981 + }, + { + "epoch": 2.571057754704737, + "grad_norm": 0.208888741548615, + "learning_rate": 7.912457912457913e-06, + "loss": 0.3444, + "step": 1982 + }, + { + "epoch": 2.572355613238157, + "grad_norm": 0.22525608656131163, + "learning_rate": 7.888407888407889e-06, + "loss": 0.3361, + "step": 1983 + }, + { + "epoch": 2.5736534717715767, + "grad_norm": 0.207515663353734, + "learning_rate": 7.864357864357864e-06, + "loss": 0.3368, + "step": 1984 + }, + { + "epoch": 2.5749513303049967, + "grad_norm": 0.2154517901626009, + "learning_rate": 7.84030784030784e-06, + "loss": 0.3345, + "step": 1985 + }, + { + "epoch": 2.5762491888384167, + "grad_norm": 0.2059272342950417, + "learning_rate": 7.816257816257815e-06, + "loss": 0.3657, + "step": 1986 + }, + { + "epoch": 2.5775470473718363, + "grad_norm": 0.20247815650907755, + "learning_rate": 7.792207792207792e-06, + "loss": 0.3248, + "step": 1987 + }, + { + "epoch": 2.5788449059052563, + "grad_norm": 0.18967243424535923, + "learning_rate": 7.768157768157768e-06, + "loss": 0.3345, + "step": 1988 + }, + { + "epoch": 2.5801427644386763, + "grad_norm": 0.20514510031749197, + "learning_rate": 7.744107744107745e-06, + "loss": 0.3407, + "step": 1989 + }, + { + "epoch": 2.581440622972096, + "grad_norm": 0.2047332936097657, + "learning_rate": 7.720057720057721e-06, + "loss": 0.3404, + "step": 1990 + }, + { + "epoch": 2.582738481505516, + "grad_norm": 0.19001160907023656, + "learning_rate": 7.696007696007697e-06, + "loss": 0.3363, + "step": 1991 + }, + { + "epoch": 2.584036340038936, + "grad_norm": 0.2007791373982576, + "learning_rate": 7.671957671957672e-06, + "loss": 0.345, + "step": 1992 + }, + { + "epoch": 2.5853341985723555, + "grad_norm": 0.1995014007677766, + "learning_rate": 7.647907647907649e-06, + "loss": 0.3373, + "step": 1993 + }, + { + "epoch": 2.5866320571057755, + "grad_norm": 0.22960201823541218, + "learning_rate": 7.623857623857625e-06, + "loss": 0.35, + "step": 1994 + }, + { + "epoch": 2.5879299156391955, + "grad_norm": 0.2041708033798508, + "learning_rate": 7.5998075998076e-06, + "loss": 0.3299, + "step": 1995 + }, + { + "epoch": 2.589227774172615, + "grad_norm": 0.19021132164070584, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.3434, + "step": 1996 + }, + { + "epoch": 2.590525632706035, + "grad_norm": 0.19160256744066273, + "learning_rate": 7.551707551707551e-06, + "loss": 0.3376, + "step": 1997 + }, + { + "epoch": 2.591823491239455, + "grad_norm": 0.1955416018816942, + "learning_rate": 7.527657527657528e-06, + "loss": 0.3407, + "step": 1998 + }, + { + "epoch": 2.5931213497728747, + "grad_norm": 0.1994363091424892, + "learning_rate": 7.503607503607504e-06, + "loss": 0.3271, + "step": 1999 + }, + { + "epoch": 2.5944192083062947, + "grad_norm": 0.19719206828799127, + "learning_rate": 7.47955747955748e-06, + "loss": 0.3491, + "step": 2000 + }, + { + "epoch": 2.5957170668397147, + "grad_norm": 0.21854928408023744, + "learning_rate": 7.455507455507456e-06, + "loss": 0.3559, + "step": 2001 + }, + { + "epoch": 2.5970149253731343, + "grad_norm": 0.2004433126098414, + "learning_rate": 7.431457431457433e-06, + "loss": 0.353, + "step": 2002 + }, + { + "epoch": 2.5983127839065543, + "grad_norm": 0.21521520240395595, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.3533, + "step": 2003 + }, + { + "epoch": 2.5996106424399743, + "grad_norm": 0.1904741360498635, + "learning_rate": 7.383357383357384e-06, + "loss": 0.3375, + "step": 2004 + }, + { + "epoch": 2.600908500973394, + "grad_norm": 0.20148594815991572, + "learning_rate": 7.35930735930736e-06, + "loss": 0.3404, + "step": 2005 + }, + { + "epoch": 2.602206359506814, + "grad_norm": 0.20222565117203797, + "learning_rate": 7.335257335257335e-06, + "loss": 0.3325, + "step": 2006 + }, + { + "epoch": 2.6035042180402335, + "grad_norm": 0.2086022602926794, + "learning_rate": 7.311207311207312e-06, + "loss": 0.3323, + "step": 2007 + }, + { + "epoch": 2.6048020765736535, + "grad_norm": 0.19695846789935645, + "learning_rate": 7.2871572871572864e-06, + "loss": 0.3419, + "step": 2008 + }, + { + "epoch": 2.6060999351070735, + "grad_norm": 0.2003548511519371, + "learning_rate": 7.263107263107263e-06, + "loss": 0.3356, + "step": 2009 + }, + { + "epoch": 2.607397793640493, + "grad_norm": 0.20970980037255266, + "learning_rate": 7.239057239057239e-06, + "loss": 0.3385, + "step": 2010 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.19898685255635887, + "learning_rate": 7.215007215007215e-06, + "loss": 0.3427, + "step": 2011 + }, + { + "epoch": 2.6099935107073327, + "grad_norm": 0.20226511821516943, + "learning_rate": 7.1909571909571915e-06, + "loss": 0.3272, + "step": 2012 + }, + { + "epoch": 2.6112913692407527, + "grad_norm": 0.18832236422091425, + "learning_rate": 7.166907166907168e-06, + "loss": 0.3354, + "step": 2013 + }, + { + "epoch": 2.6125892277741727, + "grad_norm": 0.18327010064211902, + "learning_rate": 7.142857142857143e-06, + "loss": 0.3286, + "step": 2014 + }, + { + "epoch": 2.6138870863075923, + "grad_norm": 0.19895899041097867, + "learning_rate": 7.118807118807119e-06, + "loss": 0.3297, + "step": 2015 + }, + { + "epoch": 2.6151849448410123, + "grad_norm": 0.2069075643758518, + "learning_rate": 7.094757094757096e-06, + "loss": 0.3302, + "step": 2016 + }, + { + "epoch": 2.6164828033744323, + "grad_norm": 0.19524313780033972, + "learning_rate": 7.0707070707070704e-06, + "loss": 0.3365, + "step": 2017 + }, + { + "epoch": 2.617780661907852, + "grad_norm": 0.19938059659766813, + "learning_rate": 7.046657046657047e-06, + "loss": 0.3338, + "step": 2018 + }, + { + "epoch": 2.619078520441272, + "grad_norm": 0.19493636421814658, + "learning_rate": 7.0226070226070225e-06, + "loss": 0.3315, + "step": 2019 + }, + { + "epoch": 2.620376378974692, + "grad_norm": 0.19839871511366805, + "learning_rate": 6.998556998556999e-06, + "loss": 0.3328, + "step": 2020 + }, + { + "epoch": 2.6216742375081115, + "grad_norm": 0.20131069199570611, + "learning_rate": 6.9745069745069755e-06, + "loss": 0.3509, + "step": 2021 + }, + { + "epoch": 2.6229720960415315, + "grad_norm": 0.19462988449384028, + "learning_rate": 6.95045695045695e-06, + "loss": 0.3491, + "step": 2022 + }, + { + "epoch": 2.6242699545749515, + "grad_norm": 0.21282789287292148, + "learning_rate": 6.926406926406927e-06, + "loss": 0.34, + "step": 2023 + }, + { + "epoch": 2.625567813108371, + "grad_norm": 0.1993074561130325, + "learning_rate": 6.902356902356903e-06, + "loss": 0.3466, + "step": 2024 + }, + { + "epoch": 2.626865671641791, + "grad_norm": 0.20351150350698377, + "learning_rate": 6.878306878306878e-06, + "loss": 0.3598, + "step": 2025 + }, + { + "epoch": 2.628163530175211, + "grad_norm": 0.19771094464637703, + "learning_rate": 6.854256854256854e-06, + "loss": 0.336, + "step": 2026 + }, + { + "epoch": 2.6294613887086307, + "grad_norm": 0.21482701918153888, + "learning_rate": 6.830206830206831e-06, + "loss": 0.34, + "step": 2027 + }, + { + "epoch": 2.6307592472420507, + "grad_norm": 0.2003506016053636, + "learning_rate": 6.8061568061568065e-06, + "loss": 0.3337, + "step": 2028 + }, + { + "epoch": 2.6320571057754707, + "grad_norm": 0.19105711872310324, + "learning_rate": 6.782106782106783e-06, + "loss": 0.3384, + "step": 2029 + }, + { + "epoch": 2.6333549643088903, + "grad_norm": 0.20159758006212594, + "learning_rate": 6.758056758056758e-06, + "loss": 0.3375, + "step": 2030 + }, + { + "epoch": 2.6346528228423103, + "grad_norm": 0.20143060805909305, + "learning_rate": 6.734006734006734e-06, + "loss": 0.3491, + "step": 2031 + }, + { + "epoch": 2.6359506813757303, + "grad_norm": 0.19866829942887484, + "learning_rate": 6.709956709956711e-06, + "loss": 0.3376, + "step": 2032 + }, + { + "epoch": 2.63724853990915, + "grad_norm": 0.2037924476485615, + "learning_rate": 6.6859066859066855e-06, + "loss": 0.3353, + "step": 2033 + }, + { + "epoch": 2.63854639844257, + "grad_norm": 0.19840737231971423, + "learning_rate": 6.661856661856662e-06, + "loss": 0.3419, + "step": 2034 + }, + { + "epoch": 2.6398442569759895, + "grad_norm": 0.2012069800711804, + "learning_rate": 6.637806637806638e-06, + "loss": 0.3374, + "step": 2035 + }, + { + "epoch": 2.6411421155094095, + "grad_norm": 0.2083568986903432, + "learning_rate": 6.613756613756614e-06, + "loss": 0.3535, + "step": 2036 + }, + { + "epoch": 2.6424399740428295, + "grad_norm": 0.19697725314574688, + "learning_rate": 6.5897065897065905e-06, + "loss": 0.3449, + "step": 2037 + }, + { + "epoch": 2.643737832576249, + "grad_norm": 0.19818150221871475, + "learning_rate": 6.565656565656567e-06, + "loss": 0.3409, + "step": 2038 + }, + { + "epoch": 2.645035691109669, + "grad_norm": 0.1978527267292534, + "learning_rate": 6.541606541606542e-06, + "loss": 0.3373, + "step": 2039 + }, + { + "epoch": 2.6463335496430886, + "grad_norm": 0.18635435314872623, + "learning_rate": 6.517556517556518e-06, + "loss": 0.3342, + "step": 2040 + }, + { + "epoch": 2.6476314081765087, + "grad_norm": 0.19790356410543516, + "learning_rate": 6.493506493506493e-06, + "loss": 0.3436, + "step": 2041 + }, + { + "epoch": 2.6489292667099287, + "grad_norm": 0.20316872420087156, + "learning_rate": 6.4694564694564695e-06, + "loss": 0.3448, + "step": 2042 + }, + { + "epoch": 2.6502271252433482, + "grad_norm": 0.19838963723019196, + "learning_rate": 6.445406445406446e-06, + "loss": 0.3467, + "step": 2043 + }, + { + "epoch": 2.6515249837767683, + "grad_norm": 0.19768166770591664, + "learning_rate": 6.4213564213564216e-06, + "loss": 0.3412, + "step": 2044 + }, + { + "epoch": 2.6528228423101883, + "grad_norm": 0.18870632230633216, + "learning_rate": 6.397306397306398e-06, + "loss": 0.3451, + "step": 2045 + }, + { + "epoch": 2.654120700843608, + "grad_norm": 0.19129582515190627, + "learning_rate": 6.3732563732563745e-06, + "loss": 0.3454, + "step": 2046 + }, + { + "epoch": 2.655418559377028, + "grad_norm": 0.20596889818566627, + "learning_rate": 6.349206349206349e-06, + "loss": 0.3534, + "step": 2047 + }, + { + "epoch": 2.656716417910448, + "grad_norm": 0.1951677844464869, + "learning_rate": 6.325156325156326e-06, + "loss": 0.3465, + "step": 2048 + }, + { + "epoch": 2.6580142764438675, + "grad_norm": 0.1839023960232542, + "learning_rate": 6.301106301106302e-06, + "loss": 0.3237, + "step": 2049 + }, + { + "epoch": 2.6593121349772875, + "grad_norm": 0.18411994909099402, + "learning_rate": 6.277056277056277e-06, + "loss": 0.3402, + "step": 2050 + }, + { + "epoch": 2.6606099935107075, + "grad_norm": 0.19031776842501638, + "learning_rate": 6.2530062530062535e-06, + "loss": 0.3354, + "step": 2051 + }, + { + "epoch": 2.661907852044127, + "grad_norm": 0.21384183187184516, + "learning_rate": 6.228956228956229e-06, + "loss": 0.342, + "step": 2052 + }, + { + "epoch": 2.663205710577547, + "grad_norm": 0.18478906620074495, + "learning_rate": 6.204906204906205e-06, + "loss": 0.3348, + "step": 2053 + }, + { + "epoch": 2.664503569110967, + "grad_norm": 0.20269895193238666, + "learning_rate": 6.180856180856181e-06, + "loss": 0.3351, + "step": 2054 + }, + { + "epoch": 2.6658014276443867, + "grad_norm": 0.20051102629513898, + "learning_rate": 6.156806156806158e-06, + "loss": 0.3385, + "step": 2055 + }, + { + "epoch": 2.6670992861778067, + "grad_norm": 0.18318606924324213, + "learning_rate": 6.132756132756133e-06, + "loss": 0.3364, + "step": 2056 + }, + { + "epoch": 2.6683971447112267, + "grad_norm": 0.19493171826087835, + "learning_rate": 6.108706108706109e-06, + "loss": 0.3348, + "step": 2057 + }, + { + "epoch": 2.6696950032446463, + "grad_norm": 0.18393836124851373, + "learning_rate": 6.0846560846560845e-06, + "loss": 0.3364, + "step": 2058 + }, + { + "epoch": 2.6709928617780663, + "grad_norm": 0.19573395789299228, + "learning_rate": 6.060606060606061e-06, + "loss": 0.3501, + "step": 2059 + }, + { + "epoch": 2.6722907203114863, + "grad_norm": 0.19289300424261566, + "learning_rate": 6.036556036556037e-06, + "loss": 0.3331, + "step": 2060 + }, + { + "epoch": 2.673588578844906, + "grad_norm": 0.20391450990957627, + "learning_rate": 6.012506012506012e-06, + "loss": 0.3523, + "step": 2061 + }, + { + "epoch": 2.674886437378326, + "grad_norm": 0.20029237980281211, + "learning_rate": 5.988455988455989e-06, + "loss": 0.3333, + "step": 2062 + }, + { + "epoch": 2.676184295911746, + "grad_norm": 0.1866620677693766, + "learning_rate": 5.964405964405965e-06, + "loss": 0.3311, + "step": 2063 + }, + { + "epoch": 2.6774821544451655, + "grad_norm": 0.19121945839733376, + "learning_rate": 5.940355940355941e-06, + "loss": 0.3411, + "step": 2064 + }, + { + "epoch": 2.6787800129785855, + "grad_norm": 0.19768175445567718, + "learning_rate": 5.916305916305916e-06, + "loss": 0.3331, + "step": 2065 + }, + { + "epoch": 2.680077871512005, + "grad_norm": 0.19564286501355985, + "learning_rate": 5.892255892255893e-06, + "loss": 0.3494, + "step": 2066 + }, + { + "epoch": 2.681375730045425, + "grad_norm": 0.18680263316102796, + "learning_rate": 5.8682058682058685e-06, + "loss": 0.3261, + "step": 2067 + }, + { + "epoch": 2.6826735885788446, + "grad_norm": 0.18888920820250896, + "learning_rate": 5.844155844155844e-06, + "loss": 0.3319, + "step": 2068 + }, + { + "epoch": 2.6839714471122647, + "grad_norm": 0.19397052933336428, + "learning_rate": 5.82010582010582e-06, + "loss": 0.3361, + "step": 2069 + }, + { + "epoch": 2.6852693056456847, + "grad_norm": 0.19077138187186174, + "learning_rate": 5.796055796055796e-06, + "loss": 0.3299, + "step": 2070 + }, + { + "epoch": 2.6865671641791042, + "grad_norm": 0.21127811092361148, + "learning_rate": 5.772005772005773e-06, + "loss": 0.3351, + "step": 2071 + }, + { + "epoch": 2.6878650227125243, + "grad_norm": 0.19439235797880994, + "learning_rate": 5.747955747955748e-06, + "loss": 0.3311, + "step": 2072 + }, + { + "epoch": 2.6891628812459443, + "grad_norm": 0.18768581482039637, + "learning_rate": 5.723905723905725e-06, + "loss": 0.3408, + "step": 2073 + }, + { + "epoch": 2.690460739779364, + "grad_norm": 0.20150498398104075, + "learning_rate": 5.6998556998557e-06, + "loss": 0.3297, + "step": 2074 + }, + { + "epoch": 2.691758598312784, + "grad_norm": 0.2030708714806736, + "learning_rate": 5.675805675805676e-06, + "loss": 0.3464, + "step": 2075 + }, + { + "epoch": 2.693056456846204, + "grad_norm": 0.19331515757587614, + "learning_rate": 5.651755651755652e-06, + "loss": 0.341, + "step": 2076 + }, + { + "epoch": 2.6943543153796234, + "grad_norm": 0.20425020368275043, + "learning_rate": 5.627705627705628e-06, + "loss": 0.3533, + "step": 2077 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.2332987934303732, + "learning_rate": 5.603655603655604e-06, + "loss": 0.3395, + "step": 2078 + }, + { + "epoch": 2.6969500324464635, + "grad_norm": 0.19691484544610982, + "learning_rate": 5.579605579605579e-06, + "loss": 0.3493, + "step": 2079 + }, + { + "epoch": 2.698247890979883, + "grad_norm": 0.2052051327790077, + "learning_rate": 5.555555555555556e-06, + "loss": 0.3622, + "step": 2080 + }, + { + "epoch": 2.699545749513303, + "grad_norm": 0.19298325427314583, + "learning_rate": 5.531505531505532e-06, + "loss": 0.3275, + "step": 2081 + }, + { + "epoch": 2.700843608046723, + "grad_norm": 0.19205634748728384, + "learning_rate": 5.507455507455508e-06, + "loss": 0.3364, + "step": 2082 + }, + { + "epoch": 2.7021414665801426, + "grad_norm": 0.199120001609843, + "learning_rate": 5.4834054834054835e-06, + "loss": 0.3604, + "step": 2083 + }, + { + "epoch": 2.7034393251135627, + "grad_norm": 0.19279039644707233, + "learning_rate": 5.45935545935546e-06, + "loss": 0.3531, + "step": 2084 + }, + { + "epoch": 2.7047371836469827, + "grad_norm": 0.19816454924229257, + "learning_rate": 5.435305435305436e-06, + "loss": 0.3386, + "step": 2085 + }, + { + "epoch": 2.7060350421804023, + "grad_norm": 0.1978192250026057, + "learning_rate": 5.411255411255411e-06, + "loss": 0.3353, + "step": 2086 + }, + { + "epoch": 2.7073329007138223, + "grad_norm": 0.1866947813546459, + "learning_rate": 5.387205387205387e-06, + "loss": 0.3361, + "step": 2087 + }, + { + "epoch": 2.7086307592472423, + "grad_norm": 0.19130243354442364, + "learning_rate": 5.363155363155363e-06, + "loss": 0.3339, + "step": 2088 + }, + { + "epoch": 2.709928617780662, + "grad_norm": 0.20123966523314857, + "learning_rate": 5.33910533910534e-06, + "loss": 0.352, + "step": 2089 + }, + { + "epoch": 2.711226476314082, + "grad_norm": 0.204082189254048, + "learning_rate": 5.3150553150553154e-06, + "loss": 0.3465, + "step": 2090 + }, + { + "epoch": 2.712524334847502, + "grad_norm": 0.18978137246025184, + "learning_rate": 5.291005291005291e-06, + "loss": 0.3436, + "step": 2091 + }, + { + "epoch": 2.7138221933809215, + "grad_norm": 0.18714999050678918, + "learning_rate": 5.2669552669552675e-06, + "loss": 0.3191, + "step": 2092 + }, + { + "epoch": 2.7151200519143415, + "grad_norm": 0.19155333104247205, + "learning_rate": 5.242905242905243e-06, + "loss": 0.3298, + "step": 2093 + }, + { + "epoch": 2.716417910447761, + "grad_norm": 0.19864886304787258, + "learning_rate": 5.218855218855219e-06, + "loss": 0.337, + "step": 2094 + }, + { + "epoch": 2.717715768981181, + "grad_norm": 0.20900011549166994, + "learning_rate": 5.194805194805195e-06, + "loss": 0.3351, + "step": 2095 + }, + { + "epoch": 2.719013627514601, + "grad_norm": 0.19283552005981475, + "learning_rate": 5.170755170755171e-06, + "loss": 0.3415, + "step": 2096 + }, + { + "epoch": 2.7203114860480206, + "grad_norm": 0.19735002707288396, + "learning_rate": 5.146705146705147e-06, + "loss": 0.3511, + "step": 2097 + }, + { + "epoch": 2.7216093445814407, + "grad_norm": 0.19007560174944538, + "learning_rate": 5.122655122655123e-06, + "loss": 0.3337, + "step": 2098 + }, + { + "epoch": 2.7229072031148602, + "grad_norm": 0.1945528470127734, + "learning_rate": 5.0986050986050994e-06, + "loss": 0.3333, + "step": 2099 + }, + { + "epoch": 2.7242050616482802, + "grad_norm": 0.19541816607187268, + "learning_rate": 5.074555074555075e-06, + "loss": 0.3375, + "step": 2100 + }, + { + "epoch": 2.7255029201817003, + "grad_norm": 0.1965347736989204, + "learning_rate": 5.050505050505051e-06, + "loss": 0.3374, + "step": 2101 + }, + { + "epoch": 2.72680077871512, + "grad_norm": 0.20176319781405136, + "learning_rate": 5.026455026455026e-06, + "loss": 0.3292, + "step": 2102 + }, + { + "epoch": 2.72809863724854, + "grad_norm": 0.20334136636920452, + "learning_rate": 5.002405002405003e-06, + "loss": 0.3546, + "step": 2103 + }, + { + "epoch": 2.72939649578196, + "grad_norm": 0.20179646454271286, + "learning_rate": 4.978354978354978e-06, + "loss": 0.3401, + "step": 2104 + }, + { + "epoch": 2.7306943543153794, + "grad_norm": 0.20239489126680496, + "learning_rate": 4.954304954304954e-06, + "loss": 0.362, + "step": 2105 + }, + { + "epoch": 2.7319922128487995, + "grad_norm": 0.1870990703654753, + "learning_rate": 4.9302549302549305e-06, + "loss": 0.3443, + "step": 2106 + }, + { + "epoch": 2.7332900713822195, + "grad_norm": 0.18557759206233268, + "learning_rate": 4.906204906204907e-06, + "loss": 0.3313, + "step": 2107 + }, + { + "epoch": 2.734587929915639, + "grad_norm": 0.19525861207810466, + "learning_rate": 4.8821548821548826e-06, + "loss": 0.3471, + "step": 2108 + }, + { + "epoch": 2.735885788449059, + "grad_norm": 0.1863895368451734, + "learning_rate": 4.858104858104858e-06, + "loss": 0.3302, + "step": 2109 + }, + { + "epoch": 2.737183646982479, + "grad_norm": 0.19520260521522104, + "learning_rate": 4.834054834054835e-06, + "loss": 0.3285, + "step": 2110 + }, + { + "epoch": 2.7384815055158986, + "grad_norm": 0.19296947043696586, + "learning_rate": 4.81000481000481e-06, + "loss": 0.3493, + "step": 2111 + }, + { + "epoch": 2.7397793640493187, + "grad_norm": 0.19207052807282113, + "learning_rate": 4.785954785954786e-06, + "loss": 0.3557, + "step": 2112 + }, + { + "epoch": 2.7410772225827387, + "grad_norm": 0.19274032451154754, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.3363, + "step": 2113 + }, + { + "epoch": 2.7423750811161582, + "grad_norm": 0.1991025079179398, + "learning_rate": 4.737854737854738e-06, + "loss": 0.3383, + "step": 2114 + }, + { + "epoch": 2.7436729396495783, + "grad_norm": 0.20322726939044952, + "learning_rate": 4.7138047138047145e-06, + "loss": 0.3309, + "step": 2115 + }, + { + "epoch": 2.7449707981829983, + "grad_norm": 0.1953261545822402, + "learning_rate": 4.68975468975469e-06, + "loss": 0.3324, + "step": 2116 + }, + { + "epoch": 2.746268656716418, + "grad_norm": 0.20572870381079225, + "learning_rate": 4.665704665704666e-06, + "loss": 0.3519, + "step": 2117 + }, + { + "epoch": 2.747566515249838, + "grad_norm": 0.1896514715456808, + "learning_rate": 4.641654641654642e-06, + "loss": 0.3269, + "step": 2118 + }, + { + "epoch": 2.748864373783258, + "grad_norm": 0.20395631514698148, + "learning_rate": 4.617604617604618e-06, + "loss": 0.3378, + "step": 2119 + }, + { + "epoch": 2.7501622323166774, + "grad_norm": 0.18572879939359394, + "learning_rate": 4.5935545935545934e-06, + "loss": 0.3577, + "step": 2120 + }, + { + "epoch": 2.7514600908500975, + "grad_norm": 0.1888314312446457, + "learning_rate": 4.56950456950457e-06, + "loss": 0.3323, + "step": 2121 + }, + { + "epoch": 2.752757949383517, + "grad_norm": 0.19430952155026918, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.3439, + "step": 2122 + }, + { + "epoch": 2.754055807916937, + "grad_norm": 0.42636074237494337, + "learning_rate": 4.521404521404521e-06, + "loss": 0.3473, + "step": 2123 + }, + { + "epoch": 2.755353666450357, + "grad_norm": 0.2086969030917497, + "learning_rate": 4.497354497354498e-06, + "loss": 0.3651, + "step": 2124 + }, + { + "epoch": 2.7566515249837766, + "grad_norm": 0.18713017378956393, + "learning_rate": 4.473304473304474e-06, + "loss": 0.3391, + "step": 2125 + }, + { + "epoch": 2.7579493835171967, + "grad_norm": 0.1997677484748378, + "learning_rate": 4.44925444925445e-06, + "loss": 0.3296, + "step": 2126 + }, + { + "epoch": 2.7592472420506162, + "grad_norm": 0.19085510775569323, + "learning_rate": 4.425204425204425e-06, + "loss": 0.3306, + "step": 2127 + }, + { + "epoch": 2.7605451005840362, + "grad_norm": 0.1897332096553123, + "learning_rate": 4.401154401154401e-06, + "loss": 0.3319, + "step": 2128 + }, + { + "epoch": 2.7618429591174563, + "grad_norm": 0.18447234964642742, + "learning_rate": 4.377104377104377e-06, + "loss": 0.3381, + "step": 2129 + }, + { + "epoch": 2.763140817650876, + "grad_norm": 0.19659685588771536, + "learning_rate": 4.353054353054353e-06, + "loss": 0.3411, + "step": 2130 + }, + { + "epoch": 2.764438676184296, + "grad_norm": 0.19285636988233915, + "learning_rate": 4.329004329004329e-06, + "loss": 0.3363, + "step": 2131 + }, + { + "epoch": 2.765736534717716, + "grad_norm": 0.1873294345390938, + "learning_rate": 4.304954304954305e-06, + "loss": 0.3479, + "step": 2132 + }, + { + "epoch": 2.7670343932511354, + "grad_norm": 0.20641352605961297, + "learning_rate": 4.280904280904282e-06, + "loss": 0.3479, + "step": 2133 + }, + { + "epoch": 2.7683322517845554, + "grad_norm": 0.19611830886976564, + "learning_rate": 4.256854256854257e-06, + "loss": 0.3598, + "step": 2134 + }, + { + "epoch": 2.7696301103179755, + "grad_norm": 0.19362363231720492, + "learning_rate": 4.232804232804233e-06, + "loss": 0.3587, + "step": 2135 + }, + { + "epoch": 2.770927968851395, + "grad_norm": 0.18330247717729053, + "learning_rate": 4.208754208754209e-06, + "loss": 0.3395, + "step": 2136 + }, + { + "epoch": 2.772225827384815, + "grad_norm": 0.194933200111678, + "learning_rate": 4.184704184704185e-06, + "loss": 0.3508, + "step": 2137 + }, + { + "epoch": 2.773523685918235, + "grad_norm": 0.21351784132569623, + "learning_rate": 4.1606541606541606e-06, + "loss": 0.3569, + "step": 2138 + }, + { + "epoch": 2.7748215444516546, + "grad_norm": 0.2022298125861802, + "learning_rate": 4.136604136604136e-06, + "loss": 0.3452, + "step": 2139 + }, + { + "epoch": 2.7761194029850746, + "grad_norm": 0.19008703004655647, + "learning_rate": 4.112554112554113e-06, + "loss": 0.3261, + "step": 2140 + }, + { + "epoch": 2.7774172615184947, + "grad_norm": 0.1996334755494925, + "learning_rate": 4.088504088504089e-06, + "loss": 0.3348, + "step": 2141 + }, + { + "epoch": 2.7787151200519142, + "grad_norm": 0.2016162252782046, + "learning_rate": 4.064454064454065e-06, + "loss": 0.339, + "step": 2142 + }, + { + "epoch": 2.7800129785853342, + "grad_norm": 0.20145152124370821, + "learning_rate": 4.040404040404041e-06, + "loss": 0.3384, + "step": 2143 + }, + { + "epoch": 2.7813108371187543, + "grad_norm": 0.20921251143803626, + "learning_rate": 4.016354016354017e-06, + "loss": 0.3449, + "step": 2144 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.18838640289015987, + "learning_rate": 3.9923039923039925e-06, + "loss": 0.3249, + "step": 2145 + }, + { + "epoch": 2.783906554185594, + "grad_norm": 0.203141862957814, + "learning_rate": 3.968253968253968e-06, + "loss": 0.3474, + "step": 2146 + }, + { + "epoch": 2.785204412719014, + "grad_norm": 0.18959922111635913, + "learning_rate": 3.9442039442039446e-06, + "loss": 0.3358, + "step": 2147 + }, + { + "epoch": 2.7865022712524334, + "grad_norm": 0.2034563222680623, + "learning_rate": 3.92015392015392e-06, + "loss": 0.3399, + "step": 2148 + }, + { + "epoch": 2.7878001297858535, + "grad_norm": 0.197654010004629, + "learning_rate": 3.896103896103896e-06, + "loss": 0.3384, + "step": 2149 + }, + { + "epoch": 2.7890979883192735, + "grad_norm": 0.20680720078235312, + "learning_rate": 3.872053872053872e-06, + "loss": 0.3326, + "step": 2150 + }, + { + "epoch": 2.790395846852693, + "grad_norm": 0.1927827514450044, + "learning_rate": 3.848003848003849e-06, + "loss": 0.3276, + "step": 2151 + }, + { + "epoch": 2.791693705386113, + "grad_norm": 0.1899459712119537, + "learning_rate": 3.823953823953824e-06, + "loss": 0.3368, + "step": 2152 + }, + { + "epoch": 2.7929915639195326, + "grad_norm": 0.18276333170806777, + "learning_rate": 3.7999037999038e-06, + "loss": 0.3252, + "step": 2153 + }, + { + "epoch": 2.7942894224529526, + "grad_norm": 0.19256844716061763, + "learning_rate": 3.7758537758537756e-06, + "loss": 0.3483, + "step": 2154 + }, + { + "epoch": 2.795587280986372, + "grad_norm": 0.18473354170932832, + "learning_rate": 3.751803751803752e-06, + "loss": 0.3329, + "step": 2155 + }, + { + "epoch": 2.7968851395197922, + "grad_norm": 0.20272211241013205, + "learning_rate": 3.727753727753728e-06, + "loss": 0.363, + "step": 2156 + }, + { + "epoch": 2.7981829980532122, + "grad_norm": 0.210319586938317, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.3547, + "step": 2157 + }, + { + "epoch": 2.799480856586632, + "grad_norm": 0.18619320502174647, + "learning_rate": 3.67965367965368e-06, + "loss": 0.3282, + "step": 2158 + }, + { + "epoch": 2.800778715120052, + "grad_norm": 0.1772781571540208, + "learning_rate": 3.655603655603656e-06, + "loss": 0.33, + "step": 2159 + }, + { + "epoch": 2.802076573653472, + "grad_norm": 0.19401337158226317, + "learning_rate": 3.6315536315536315e-06, + "loss": 0.3418, + "step": 2160 + }, + { + "epoch": 2.8033744321868914, + "grad_norm": 0.20093342296511638, + "learning_rate": 3.6075036075036075e-06, + "loss": 0.3324, + "step": 2161 + }, + { + "epoch": 2.8046722907203114, + "grad_norm": 0.18887641527097687, + "learning_rate": 3.583453583453584e-06, + "loss": 0.3256, + "step": 2162 + }, + { + "epoch": 2.8059701492537314, + "grad_norm": 0.17961558157640115, + "learning_rate": 3.5594035594035596e-06, + "loss": 0.3328, + "step": 2163 + }, + { + "epoch": 2.807268007787151, + "grad_norm": 0.20347532828593798, + "learning_rate": 3.5353535353535352e-06, + "loss": 0.3421, + "step": 2164 + }, + { + "epoch": 2.808565866320571, + "grad_norm": 0.18010386527024905, + "learning_rate": 3.5113035113035113e-06, + "loss": 0.326, + "step": 2165 + }, + { + "epoch": 2.809863724853991, + "grad_norm": 0.18682472145471216, + "learning_rate": 3.4872534872534877e-06, + "loss": 0.3277, + "step": 2166 + }, + { + "epoch": 2.8111615833874106, + "grad_norm": 0.18668496331694528, + "learning_rate": 3.4632034632034634e-06, + "loss": 0.3441, + "step": 2167 + }, + { + "epoch": 2.8124594419208306, + "grad_norm": 0.18876447344150002, + "learning_rate": 3.439153439153439e-06, + "loss": 0.3543, + "step": 2168 + }, + { + "epoch": 2.8137573004542507, + "grad_norm": 0.1880026989268264, + "learning_rate": 3.4151034151034154e-06, + "loss": 0.3419, + "step": 2169 + }, + { + "epoch": 2.8150551589876702, + "grad_norm": 0.19326058199934312, + "learning_rate": 3.3910533910533915e-06, + "loss": 0.3332, + "step": 2170 + }, + { + "epoch": 2.8163530175210902, + "grad_norm": 0.18329023377490067, + "learning_rate": 3.367003367003367e-06, + "loss": 0.3632, + "step": 2171 + }, + { + "epoch": 2.8176508760545103, + "grad_norm": 0.19371890235019304, + "learning_rate": 3.3429533429533427e-06, + "loss": 0.3377, + "step": 2172 + }, + { + "epoch": 2.81894873458793, + "grad_norm": 0.18600523979644987, + "learning_rate": 3.318903318903319e-06, + "loss": 0.3469, + "step": 2173 + }, + { + "epoch": 2.82024659312135, + "grad_norm": 0.19389227471137455, + "learning_rate": 3.2948532948532953e-06, + "loss": 0.3387, + "step": 2174 + }, + { + "epoch": 2.82154445165477, + "grad_norm": 0.18954299093028096, + "learning_rate": 3.270803270803271e-06, + "loss": 0.3279, + "step": 2175 + }, + { + "epoch": 2.8228423101881894, + "grad_norm": 0.19152410986871543, + "learning_rate": 3.2467532467532465e-06, + "loss": 0.3374, + "step": 2176 + }, + { + "epoch": 2.8241401687216094, + "grad_norm": 0.196063104407719, + "learning_rate": 3.222703222703223e-06, + "loss": 0.3385, + "step": 2177 + }, + { + "epoch": 2.8254380272550295, + "grad_norm": 0.18850542953314792, + "learning_rate": 3.198653198653199e-06, + "loss": 0.349, + "step": 2178 + }, + { + "epoch": 2.826735885788449, + "grad_norm": 0.20124590955928826, + "learning_rate": 3.1746031746031746e-06, + "loss": 0.3415, + "step": 2179 + }, + { + "epoch": 2.828033744321869, + "grad_norm": 0.20248550914830157, + "learning_rate": 3.150553150553151e-06, + "loss": 0.3461, + "step": 2180 + }, + { + "epoch": 2.8293316028552886, + "grad_norm": 0.20035006058516966, + "learning_rate": 3.1265031265031267e-06, + "loss": 0.3432, + "step": 2181 + }, + { + "epoch": 2.8306294613887086, + "grad_norm": 0.1855009910859687, + "learning_rate": 3.1024531024531023e-06, + "loss": 0.3187, + "step": 2182 + }, + { + "epoch": 2.8319273199221286, + "grad_norm": 0.2143582348750643, + "learning_rate": 3.078403078403079e-06, + "loss": 0.3405, + "step": 2183 + }, + { + "epoch": 2.833225178455548, + "grad_norm": 0.19882736391050926, + "learning_rate": 3.0543530543530544e-06, + "loss": 0.3437, + "step": 2184 + }, + { + "epoch": 2.8345230369889682, + "grad_norm": 0.19373263047996803, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.3391, + "step": 2185 + }, + { + "epoch": 2.835820895522388, + "grad_norm": 0.19378870740750723, + "learning_rate": 3.006253006253006e-06, + "loss": 0.3478, + "step": 2186 + }, + { + "epoch": 2.837118754055808, + "grad_norm": 0.24107367256249349, + "learning_rate": 2.9822029822029826e-06, + "loss": 0.3458, + "step": 2187 + }, + { + "epoch": 2.838416612589228, + "grad_norm": 0.18521053064833032, + "learning_rate": 2.958152958152958e-06, + "loss": 0.3383, + "step": 2188 + }, + { + "epoch": 2.8397144711226474, + "grad_norm": 0.1881021394765785, + "learning_rate": 2.9341029341029342e-06, + "loss": 0.3517, + "step": 2189 + }, + { + "epoch": 2.8410123296560674, + "grad_norm": 0.20171700206005888, + "learning_rate": 2.91005291005291e-06, + "loss": 0.3689, + "step": 2190 + }, + { + "epoch": 2.8423101881894874, + "grad_norm": 0.18467275960567497, + "learning_rate": 2.8860028860028863e-06, + "loss": 0.3271, + "step": 2191 + }, + { + "epoch": 2.843608046722907, + "grad_norm": 0.1935758058556294, + "learning_rate": 2.8619528619528624e-06, + "loss": 0.3457, + "step": 2192 + }, + { + "epoch": 2.844905905256327, + "grad_norm": 0.20313128506718564, + "learning_rate": 2.837902837902838e-06, + "loss": 0.3315, + "step": 2193 + }, + { + "epoch": 2.846203763789747, + "grad_norm": 0.2694894225052093, + "learning_rate": 2.813852813852814e-06, + "loss": 0.3651, + "step": 2194 + }, + { + "epoch": 2.8475016223231666, + "grad_norm": 0.191890934429557, + "learning_rate": 2.7898027898027897e-06, + "loss": 0.3457, + "step": 2195 + }, + { + "epoch": 2.8487994808565866, + "grad_norm": 0.18753053839949832, + "learning_rate": 2.765752765752766e-06, + "loss": 0.3438, + "step": 2196 + }, + { + "epoch": 2.8500973393900066, + "grad_norm": 0.18148036317355992, + "learning_rate": 2.7417027417027418e-06, + "loss": 0.3271, + "step": 2197 + }, + { + "epoch": 2.851395197923426, + "grad_norm": 0.20235355112701617, + "learning_rate": 2.717652717652718e-06, + "loss": 0.3468, + "step": 2198 + }, + { + "epoch": 2.8526930564568462, + "grad_norm": 0.19882040438487505, + "learning_rate": 2.6936026936026934e-06, + "loss": 0.3649, + "step": 2199 + }, + { + "epoch": 2.8539909149902662, + "grad_norm": 0.1919921579971501, + "learning_rate": 2.66955266955267e-06, + "loss": 0.3373, + "step": 2200 + }, + { + "epoch": 2.855288773523686, + "grad_norm": 0.19166967508036267, + "learning_rate": 2.6455026455026455e-06, + "loss": 0.3407, + "step": 2201 + }, + { + "epoch": 2.856586632057106, + "grad_norm": 0.18413982998209266, + "learning_rate": 2.6214526214526216e-06, + "loss": 0.3281, + "step": 2202 + }, + { + "epoch": 2.857884490590526, + "grad_norm": 0.1963183914870018, + "learning_rate": 2.5974025974025976e-06, + "loss": 0.3519, + "step": 2203 + }, + { + "epoch": 2.8591823491239454, + "grad_norm": 0.19124327719338702, + "learning_rate": 2.5733525733525737e-06, + "loss": 0.3546, + "step": 2204 + }, + { + "epoch": 2.8604802076573654, + "grad_norm": 0.1908455405574935, + "learning_rate": 2.5493025493025497e-06, + "loss": 0.3441, + "step": 2205 + }, + { + "epoch": 2.8617780661907855, + "grad_norm": 0.20074510401322135, + "learning_rate": 2.5252525252525253e-06, + "loss": 0.3623, + "step": 2206 + }, + { + "epoch": 2.863075924724205, + "grad_norm": 0.19215908226190542, + "learning_rate": 2.5012025012025014e-06, + "loss": 0.3323, + "step": 2207 + }, + { + "epoch": 2.864373783257625, + "grad_norm": 0.18434791337783116, + "learning_rate": 2.477152477152477e-06, + "loss": 0.3319, + "step": 2208 + }, + { + "epoch": 2.8656716417910446, + "grad_norm": 0.19538722948283108, + "learning_rate": 2.4531024531024535e-06, + "loss": 0.3275, + "step": 2209 + }, + { + "epoch": 2.8669695003244646, + "grad_norm": 0.1882905413491712, + "learning_rate": 2.429052429052429e-06, + "loss": 0.3301, + "step": 2210 + }, + { + "epoch": 2.8682673588578846, + "grad_norm": 0.18413079624889964, + "learning_rate": 2.405002405002405e-06, + "loss": 0.3316, + "step": 2211 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.18114340544426985, + "learning_rate": 2.3809523809523808e-06, + "loss": 0.3293, + "step": 2212 + }, + { + "epoch": 2.8708630759247242, + "grad_norm": 0.18339952929530665, + "learning_rate": 2.3569023569023572e-06, + "loss": 0.3385, + "step": 2213 + }, + { + "epoch": 2.872160934458144, + "grad_norm": 0.19149046899206099, + "learning_rate": 2.332852332852333e-06, + "loss": 0.3362, + "step": 2214 + }, + { + "epoch": 2.873458792991564, + "grad_norm": 0.21144971266274312, + "learning_rate": 2.308802308802309e-06, + "loss": 0.3426, + "step": 2215 + }, + { + "epoch": 2.874756651524984, + "grad_norm": 0.18252025886810355, + "learning_rate": 2.284752284752285e-06, + "loss": 0.3461, + "step": 2216 + }, + { + "epoch": 2.8760545100584034, + "grad_norm": 0.18661014168815393, + "learning_rate": 2.2607022607022606e-06, + "loss": 0.3532, + "step": 2217 + }, + { + "epoch": 2.8773523685918234, + "grad_norm": 0.19921497008286657, + "learning_rate": 2.236652236652237e-06, + "loss": 0.3456, + "step": 2218 + }, + { + "epoch": 2.8786502271252434, + "grad_norm": 0.19199558097836697, + "learning_rate": 2.2126022126022127e-06, + "loss": 0.336, + "step": 2219 + }, + { + "epoch": 2.879948085658663, + "grad_norm": 0.18697397277980365, + "learning_rate": 2.1885521885521887e-06, + "loss": 0.3274, + "step": 2220 + }, + { + "epoch": 2.881245944192083, + "grad_norm": 0.18651465996633548, + "learning_rate": 2.1645021645021643e-06, + "loss": 0.3362, + "step": 2221 + }, + { + "epoch": 2.882543802725503, + "grad_norm": 0.18811253057615676, + "learning_rate": 2.140452140452141e-06, + "loss": 0.3441, + "step": 2222 + }, + { + "epoch": 2.8838416612589226, + "grad_norm": 0.1807114251046355, + "learning_rate": 2.1164021164021164e-06, + "loss": 0.3237, + "step": 2223 + }, + { + "epoch": 2.8851395197923426, + "grad_norm": 0.18668302514006135, + "learning_rate": 2.0923520923520925e-06, + "loss": 0.3556, + "step": 2224 + }, + { + "epoch": 2.8864373783257626, + "grad_norm": 0.1951670359448047, + "learning_rate": 2.068302068302068e-06, + "loss": 0.3181, + "step": 2225 + }, + { + "epoch": 2.887735236859182, + "grad_norm": 0.1874121175894903, + "learning_rate": 2.0442520442520446e-06, + "loss": 0.3447, + "step": 2226 + }, + { + "epoch": 2.8890330953926022, + "grad_norm": 0.18533011275342226, + "learning_rate": 2.0202020202020206e-06, + "loss": 0.3363, + "step": 2227 + }, + { + "epoch": 2.8903309539260222, + "grad_norm": 0.18793474012414535, + "learning_rate": 1.9961519961519962e-06, + "loss": 0.345, + "step": 2228 + }, + { + "epoch": 2.891628812459442, + "grad_norm": 0.18922319032385707, + "learning_rate": 1.9721019721019723e-06, + "loss": 0.3386, + "step": 2229 + }, + { + "epoch": 2.892926670992862, + "grad_norm": 0.19294630111421893, + "learning_rate": 1.948051948051948e-06, + "loss": 0.3513, + "step": 2230 + }, + { + "epoch": 2.894224529526282, + "grad_norm": 0.1831585914628477, + "learning_rate": 1.9240019240019244e-06, + "loss": 0.3185, + "step": 2231 + }, + { + "epoch": 2.8955223880597014, + "grad_norm": 0.18959540248425852, + "learning_rate": 1.8999518999519e-06, + "loss": 0.3448, + "step": 2232 + }, + { + "epoch": 2.8968202465931214, + "grad_norm": 0.18655567267427287, + "learning_rate": 1.875901875901876e-06, + "loss": 0.3262, + "step": 2233 + }, + { + "epoch": 2.8981181051265414, + "grad_norm": 0.18952603893507794, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.3282, + "step": 2234 + }, + { + "epoch": 2.899415963659961, + "grad_norm": 0.1876885696047965, + "learning_rate": 1.827801827801828e-06, + "loss": 0.3435, + "step": 2235 + }, + { + "epoch": 2.900713822193381, + "grad_norm": 0.1865624408496654, + "learning_rate": 1.8037518037518038e-06, + "loss": 0.3331, + "step": 2236 + }, + { + "epoch": 2.902011680726801, + "grad_norm": 0.19122160432352084, + "learning_rate": 1.7797017797017798e-06, + "loss": 0.3371, + "step": 2237 + }, + { + "epoch": 2.9033095392602206, + "grad_norm": 0.19352943277773518, + "learning_rate": 1.7556517556517556e-06, + "loss": 0.3405, + "step": 2238 + }, + { + "epoch": 2.9046073977936406, + "grad_norm": 0.19319000412284978, + "learning_rate": 1.7316017316017317e-06, + "loss": 0.3399, + "step": 2239 + }, + { + "epoch": 2.90590525632706, + "grad_norm": 0.19327464804923486, + "learning_rate": 1.7075517075517077e-06, + "loss": 0.3319, + "step": 2240 + }, + { + "epoch": 2.90720311486048, + "grad_norm": 0.19976992892290674, + "learning_rate": 1.6835016835016836e-06, + "loss": 0.3432, + "step": 2241 + }, + { + "epoch": 2.9085009733939, + "grad_norm": 0.1892168913000648, + "learning_rate": 1.6594516594516596e-06, + "loss": 0.3463, + "step": 2242 + }, + { + "epoch": 2.90979883192732, + "grad_norm": 0.19443589296751324, + "learning_rate": 1.6354016354016354e-06, + "loss": 0.357, + "step": 2243 + }, + { + "epoch": 2.91109669046074, + "grad_norm": 0.18449321307823713, + "learning_rate": 1.6113516113516115e-06, + "loss": 0.345, + "step": 2244 + }, + { + "epoch": 2.9123945489941594, + "grad_norm": 0.1858254182171351, + "learning_rate": 1.5873015873015873e-06, + "loss": 0.3447, + "step": 2245 + }, + { + "epoch": 2.9136924075275794, + "grad_norm": 0.2004597208671706, + "learning_rate": 1.5632515632515634e-06, + "loss": 0.3366, + "step": 2246 + }, + { + "epoch": 2.9149902660609994, + "grad_norm": 0.18322062527491037, + "learning_rate": 1.5392015392015394e-06, + "loss": 0.3375, + "step": 2247 + }, + { + "epoch": 2.916288124594419, + "grad_norm": 0.1905983562403602, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.3411, + "step": 2248 + }, + { + "epoch": 2.917585983127839, + "grad_norm": 0.1867573407190652, + "learning_rate": 1.4911014911014913e-06, + "loss": 0.336, + "step": 2249 + }, + { + "epoch": 2.918883841661259, + "grad_norm": 0.19058206255135468, + "learning_rate": 1.4670514670514671e-06, + "loss": 0.3474, + "step": 2250 + }, + { + "epoch": 2.9201817001946786, + "grad_norm": 0.18685316734046567, + "learning_rate": 1.4430014430014432e-06, + "loss": 0.3502, + "step": 2251 + }, + { + "epoch": 2.9214795587280986, + "grad_norm": 0.18640666171276632, + "learning_rate": 1.418951418951419e-06, + "loss": 0.3317, + "step": 2252 + }, + { + "epoch": 2.9227774172615186, + "grad_norm": 0.19155464358225485, + "learning_rate": 1.3949013949013948e-06, + "loss": 0.3352, + "step": 2253 + }, + { + "epoch": 2.924075275794938, + "grad_norm": 0.1868855983928718, + "learning_rate": 1.3708513708513709e-06, + "loss": 0.328, + "step": 2254 + }, + { + "epoch": 2.925373134328358, + "grad_norm": 0.18433362766279043, + "learning_rate": 1.3468013468013467e-06, + "loss": 0.3452, + "step": 2255 + }, + { + "epoch": 2.9266709928617782, + "grad_norm": 0.19259127006608057, + "learning_rate": 1.3227513227513228e-06, + "loss": 0.3404, + "step": 2256 + }, + { + "epoch": 2.927968851395198, + "grad_norm": 0.18080624245350022, + "learning_rate": 1.2987012987012988e-06, + "loss": 0.3266, + "step": 2257 + }, + { + "epoch": 2.929266709928618, + "grad_norm": 0.17871124072334996, + "learning_rate": 1.2746512746512749e-06, + "loss": 0.3395, + "step": 2258 + }, + { + "epoch": 2.930564568462038, + "grad_norm": 0.18709418548907147, + "learning_rate": 1.2506012506012507e-06, + "loss": 0.346, + "step": 2259 + }, + { + "epoch": 2.9318624269954574, + "grad_norm": 0.18683092960850883, + "learning_rate": 1.2265512265512267e-06, + "loss": 0.336, + "step": 2260 + }, + { + "epoch": 2.9331602855288774, + "grad_norm": 0.18777575130149565, + "learning_rate": 1.2025012025012026e-06, + "loss": 0.3368, + "step": 2261 + }, + { + "epoch": 2.9344581440622974, + "grad_norm": 0.18324636658598714, + "learning_rate": 1.1784511784511786e-06, + "loss": 0.3292, + "step": 2262 + }, + { + "epoch": 2.935756002595717, + "grad_norm": 0.1851227917603969, + "learning_rate": 1.1544011544011545e-06, + "loss": 0.3378, + "step": 2263 + }, + { + "epoch": 2.937053861129137, + "grad_norm": 0.19847788748302606, + "learning_rate": 1.1303511303511303e-06, + "loss": 0.3754, + "step": 2264 + }, + { + "epoch": 2.938351719662557, + "grad_norm": 0.177806134860686, + "learning_rate": 1.1063011063011063e-06, + "loss": 0.327, + "step": 2265 + }, + { + "epoch": 2.9396495781959766, + "grad_norm": 0.197095005642012, + "learning_rate": 1.0822510822510822e-06, + "loss": 0.344, + "step": 2266 + }, + { + "epoch": 2.9409474367293966, + "grad_norm": 0.18894739122645604, + "learning_rate": 1.0582010582010582e-06, + "loss": 0.3437, + "step": 2267 + }, + { + "epoch": 2.942245295262816, + "grad_norm": 0.1763401949490533, + "learning_rate": 1.034151034151034e-06, + "loss": 0.3285, + "step": 2268 + }, + { + "epoch": 2.943543153796236, + "grad_norm": 0.1786777654803748, + "learning_rate": 1.0101010101010103e-06, + "loss": 0.3264, + "step": 2269 + }, + { + "epoch": 2.9448410123296562, + "grad_norm": 0.18511698934832105, + "learning_rate": 9.860509860509861e-07, + "loss": 0.3403, + "step": 2270 + }, + { + "epoch": 2.946138870863076, + "grad_norm": 0.18872890425471747, + "learning_rate": 9.620009620009622e-07, + "loss": 0.3288, + "step": 2271 + }, + { + "epoch": 2.947436729396496, + "grad_norm": 0.18279278752067737, + "learning_rate": 9.37950937950938e-07, + "loss": 0.3422, + "step": 2272 + }, + { + "epoch": 2.9487345879299154, + "grad_norm": 0.18006141885171842, + "learning_rate": 9.13900913900914e-07, + "loss": 0.3446, + "step": 2273 + }, + { + "epoch": 2.9500324464633354, + "grad_norm": 0.19260565452121156, + "learning_rate": 8.898508898508899e-07, + "loss": 0.3398, + "step": 2274 + }, + { + "epoch": 2.9513303049967554, + "grad_norm": 0.21921401490874187, + "learning_rate": 8.658008658008658e-07, + "loss": 0.3568, + "step": 2275 + }, + { + "epoch": 2.952628163530175, + "grad_norm": 0.18842161857636638, + "learning_rate": 8.417508417508418e-07, + "loss": 0.3498, + "step": 2276 + }, + { + "epoch": 2.953926022063595, + "grad_norm": 0.17891619649785445, + "learning_rate": 8.177008177008177e-07, + "loss": 0.3272, + "step": 2277 + }, + { + "epoch": 2.955223880597015, + "grad_norm": 0.17755022636746284, + "learning_rate": 7.936507936507937e-07, + "loss": 0.3231, + "step": 2278 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.1863595372909174, + "learning_rate": 7.696007696007697e-07, + "loss": 0.3408, + "step": 2279 + }, + { + "epoch": 2.9578195976638546, + "grad_norm": 0.17943740925178142, + "learning_rate": 7.455507455507456e-07, + "loss": 0.3222, + "step": 2280 + }, + { + "epoch": 2.9591174561972746, + "grad_norm": 0.1875857112899972, + "learning_rate": 7.215007215007216e-07, + "loss": 0.33, + "step": 2281 + }, + { + "epoch": 2.960415314730694, + "grad_norm": 0.18672707365536773, + "learning_rate": 6.974506974506974e-07, + "loss": 0.3377, + "step": 2282 + }, + { + "epoch": 2.961713173264114, + "grad_norm": 0.18854762948864245, + "learning_rate": 6.734006734006734e-07, + "loss": 0.3511, + "step": 2283 + }, + { + "epoch": 2.9630110317975342, + "grad_norm": 0.1846633145163194, + "learning_rate": 6.493506493506494e-07, + "loss": 0.3301, + "step": 2284 + }, + { + "epoch": 2.964308890330954, + "grad_norm": 0.1782259793697707, + "learning_rate": 6.253006253006253e-07, + "loss": 0.3327, + "step": 2285 + }, + { + "epoch": 2.965606748864374, + "grad_norm": 0.19406554087810526, + "learning_rate": 6.012506012506013e-07, + "loss": 0.3374, + "step": 2286 + }, + { + "epoch": 2.966904607397794, + "grad_norm": 0.18974085077422986, + "learning_rate": 5.772005772005772e-07, + "loss": 0.3193, + "step": 2287 + }, + { + "epoch": 2.9682024659312134, + "grad_norm": 0.19068994562305627, + "learning_rate": 5.531505531505532e-07, + "loss": 0.3387, + "step": 2288 + }, + { + "epoch": 2.9695003244646334, + "grad_norm": 0.17821215477258306, + "learning_rate": 5.291005291005291e-07, + "loss": 0.3328, + "step": 2289 + }, + { + "epoch": 2.9707981829980534, + "grad_norm": 0.18413236462451124, + "learning_rate": 5.050505050505052e-07, + "loss": 0.3362, + "step": 2290 + }, + { + "epoch": 2.972096041531473, + "grad_norm": 0.18085396718066815, + "learning_rate": 4.810004810004811e-07, + "loss": 0.3302, + "step": 2291 + }, + { + "epoch": 2.973393900064893, + "grad_norm": 0.18231587065998014, + "learning_rate": 4.56950456950457e-07, + "loss": 0.3338, + "step": 2292 + }, + { + "epoch": 2.974691758598313, + "grad_norm": 0.18433591352078926, + "learning_rate": 4.329004329004329e-07, + "loss": 0.3373, + "step": 2293 + }, + { + "epoch": 2.9759896171317326, + "grad_norm": 0.17897899773682865, + "learning_rate": 4.0885040885040886e-07, + "loss": 0.3303, + "step": 2294 + }, + { + "epoch": 2.9772874756651526, + "grad_norm": 0.18401504997308174, + "learning_rate": 3.8480038480038485e-07, + "loss": 0.3295, + "step": 2295 + }, + { + "epoch": 2.9785853341985726, + "grad_norm": 0.18601416230069387, + "learning_rate": 3.607503607503608e-07, + "loss": 0.3384, + "step": 2296 + }, + { + "epoch": 2.979883192731992, + "grad_norm": 0.18294283749703283, + "learning_rate": 3.367003367003367e-07, + "loss": 0.3414, + "step": 2297 + }, + { + "epoch": 2.981181051265412, + "grad_norm": 0.18248918456066632, + "learning_rate": 3.1265031265031267e-07, + "loss": 0.3522, + "step": 2298 + }, + { + "epoch": 2.982478909798832, + "grad_norm": 0.18259396477447506, + "learning_rate": 2.886002886002886e-07, + "loss": 0.3194, + "step": 2299 + }, + { + "epoch": 2.983776768332252, + "grad_norm": 0.1885739190189894, + "learning_rate": 2.6455026455026455e-07, + "loss": 0.3407, + "step": 2300 + }, + { + "epoch": 2.9850746268656714, + "grad_norm": 0.18564276265962515, + "learning_rate": 2.4050024050024055e-07, + "loss": 0.3431, + "step": 2301 + }, + { + "epoch": 2.9863724853990914, + "grad_norm": 0.17794361853046303, + "learning_rate": 2.1645021645021646e-07, + "loss": 0.3228, + "step": 2302 + }, + { + "epoch": 2.9876703439325114, + "grad_norm": 0.18130600626680995, + "learning_rate": 1.9240019240019243e-07, + "loss": 0.3336, + "step": 2303 + }, + { + "epoch": 2.988968202465931, + "grad_norm": 0.18435788490293198, + "learning_rate": 1.6835016835016834e-07, + "loss": 0.3308, + "step": 2304 + }, + { + "epoch": 2.990266060999351, + "grad_norm": 0.18127043211913135, + "learning_rate": 1.443001443001443e-07, + "loss": 0.3449, + "step": 2305 + }, + { + "epoch": 2.991563919532771, + "grad_norm": 0.1799885551759602, + "learning_rate": 1.2025012025012027e-07, + "loss": 0.3267, + "step": 2306 + }, + { + "epoch": 2.9928617780661906, + "grad_norm": 0.18020390891152432, + "learning_rate": 9.620009620009621e-08, + "loss": 0.3376, + "step": 2307 + }, + { + "epoch": 2.9941596365996106, + "grad_norm": 0.18299772015112434, + "learning_rate": 7.215007215007215e-08, + "loss": 0.3404, + "step": 2308 + }, + { + "epoch": 2.9954574951330306, + "grad_norm": 0.17991344814248053, + "learning_rate": 4.8100048100048107e-08, + "loss": 0.3293, + "step": 2309 + }, + { + "epoch": 2.99675535366645, + "grad_norm": 0.18545369820383786, + "learning_rate": 2.4050024050024053e-08, + "loss": 0.3376, + "step": 2310 + }, + { + "epoch": 2.99675535366645, + "step": 2310, + "total_flos": 2.5679880641918796e+19, + "train_loss": 0.5017085661361744, + "train_runtime": 66070.1568, + "train_samples_per_second": 0.56, + "train_steps_per_second": 0.035 + } + ], + "logging_steps": 1, + "max_steps": 2310, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.5679880641918796e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}