diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4515 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.996199941537562, + "eval_steps": 500, + "global_step": 639, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00467699503069278, + "grad_norm": 0.5310407876968384, + "learning_rate": 9.98435054773083e-06, + "loss": 15.7421, + "step": 1 + }, + { + "epoch": 0.00935399006138556, + "grad_norm": 0.6928378343582153, + "learning_rate": 9.96870109546166e-06, + "loss": 22.4837, + "step": 2 + }, + { + "epoch": 0.014030985092078339, + "grad_norm": 0.5985817909240723, + "learning_rate": 9.953051643192489e-06, + "loss": 19.2438, + "step": 3 + }, + { + "epoch": 0.01870798012277112, + "grad_norm": 0.3373333215713501, + "learning_rate": 9.937402190923318e-06, + "loss": 17.8898, + "step": 4 + }, + { + "epoch": 0.0233849751534639, + "grad_norm": 0.41831186413764954, + "learning_rate": 9.921752738654147e-06, + "loss": 17.4461, + "step": 5 + }, + { + "epoch": 0.028061970184156678, + "grad_norm": 0.3748932182788849, + "learning_rate": 9.906103286384977e-06, + "loss": 17.7855, + "step": 6 + }, + { + "epoch": 0.03273896521484946, + "grad_norm": 0.3773082494735718, + "learning_rate": 9.890453834115806e-06, + "loss": 16.9238, + "step": 7 + }, + { + "epoch": 0.03741596024554224, + "grad_norm": 0.3397878408432007, + "learning_rate": 9.874804381846637e-06, + "loss": 16.7176, + "step": 8 + }, + { + "epoch": 0.04209295527623502, + "grad_norm": 0.5136957168579102, + "learning_rate": 9.859154929577466e-06, + "loss": 15.4666, + "step": 9 + }, + { + "epoch": 0.0467699503069278, + "grad_norm": 0.3085887134075165, + "learning_rate": 9.843505477308296e-06, + "loss": 13.7508, + "step": 10 + }, + { + "epoch": 0.05144694533762058, + "grad_norm": 0.3942926228046417, + "learning_rate": 9.827856025039125e-06, + "loss": 17.3618, + "step": 11 + }, + { + "epoch": 0.056123940368313356, + "grad_norm": 0.27711015939712524, + "learning_rate": 9.812206572769954e-06, + "loss": 16.2495, + "step": 12 + }, + { + "epoch": 0.06080093539900614, + "grad_norm": 0.27834147214889526, + "learning_rate": 9.796557120500783e-06, + "loss": 15.9543, + "step": 13 + }, + { + "epoch": 0.06547793042969892, + "grad_norm": 0.24677161872386932, + "learning_rate": 9.780907668231613e-06, + "loss": 15.6386, + "step": 14 + }, + { + "epoch": 0.0701549254603917, + "grad_norm": 0.36406269669532776, + "learning_rate": 9.765258215962442e-06, + "loss": 15.6255, + "step": 15 + }, + { + "epoch": 0.07483192049108447, + "grad_norm": 0.307948499917984, + "learning_rate": 9.749608763693271e-06, + "loss": 14.9189, + "step": 16 + }, + { + "epoch": 0.07950891552177726, + "grad_norm": 0.2980886697769165, + "learning_rate": 9.7339593114241e-06, + "loss": 15.0406, + "step": 17 + }, + { + "epoch": 0.08418591055247004, + "grad_norm": 0.412708044052124, + "learning_rate": 9.71830985915493e-06, + "loss": 14.7356, + "step": 18 + }, + { + "epoch": 0.08886290558316282, + "grad_norm": 0.2903729975223541, + "learning_rate": 9.70266040688576e-06, + "loss": 14.8173, + "step": 19 + }, + { + "epoch": 0.0935399006138556, + "grad_norm": 0.2171318084001541, + "learning_rate": 9.687010954616589e-06, + "loss": 15.0565, + "step": 20 + }, + { + "epoch": 0.09821689564454837, + "grad_norm": 0.4166527986526489, + "learning_rate": 9.671361502347418e-06, + "loss": 14.1942, + "step": 21 + }, + { + "epoch": 0.10289389067524116, + "grad_norm": 0.2564053237438202, + "learning_rate": 9.655712050078247e-06, + "loss": 14.3627, + "step": 22 + }, + { + "epoch": 0.10757088570593394, + "grad_norm": 0.254341185092926, + "learning_rate": 9.640062597809078e-06, + "loss": 13.7315, + "step": 23 + }, + { + "epoch": 0.11224788073662671, + "grad_norm": 0.24184982478618622, + "learning_rate": 9.624413145539908e-06, + "loss": 13.9785, + "step": 24 + }, + { + "epoch": 0.1169248757673195, + "grad_norm": 0.3118051588535309, + "learning_rate": 9.608763693270737e-06, + "loss": 15.1744, + "step": 25 + }, + { + "epoch": 0.12160187079801228, + "grad_norm": 0.2545301020145416, + "learning_rate": 9.593114241001566e-06, + "loss": 15.9676, + "step": 26 + }, + { + "epoch": 0.12627886582870507, + "grad_norm": 0.2265356183052063, + "learning_rate": 9.577464788732394e-06, + "loss": 14.8985, + "step": 27 + }, + { + "epoch": 0.13095586085939784, + "grad_norm": 0.2141331285238266, + "learning_rate": 9.561815336463225e-06, + "loss": 13.6498, + "step": 28 + }, + { + "epoch": 0.1356328558900906, + "grad_norm": 0.27572301030158997, + "learning_rate": 9.546165884194054e-06, + "loss": 13.9124, + "step": 29 + }, + { + "epoch": 0.1403098509207834, + "grad_norm": 0.1987282633781433, + "learning_rate": 9.530516431924883e-06, + "loss": 12.9095, + "step": 30 + }, + { + "epoch": 0.14498684595147618, + "grad_norm": 0.2444925159215927, + "learning_rate": 9.514866979655713e-06, + "loss": 13.365, + "step": 31 + }, + { + "epoch": 0.14966384098216895, + "grad_norm": 0.4400818645954132, + "learning_rate": 9.499217527386542e-06, + "loss": 13.2832, + "step": 32 + }, + { + "epoch": 0.15434083601286175, + "grad_norm": 0.2764039933681488, + "learning_rate": 9.483568075117371e-06, + "loss": 14.3228, + "step": 33 + }, + { + "epoch": 0.15901783104355452, + "grad_norm": 0.21101799607276917, + "learning_rate": 9.4679186228482e-06, + "loss": 14.156, + "step": 34 + }, + { + "epoch": 0.1636948260742473, + "grad_norm": 0.267008513212204, + "learning_rate": 9.45226917057903e-06, + "loss": 14.1084, + "step": 35 + }, + { + "epoch": 0.1683718211049401, + "grad_norm": 0.2759203016757965, + "learning_rate": 9.43661971830986e-06, + "loss": 13.5008, + "step": 36 + }, + { + "epoch": 0.17304881613563286, + "grad_norm": 0.2793346643447876, + "learning_rate": 9.42097026604069e-06, + "loss": 14.2747, + "step": 37 + }, + { + "epoch": 0.17772581116632563, + "grad_norm": 0.25120246410369873, + "learning_rate": 9.40532081377152e-06, + "loss": 13.4347, + "step": 38 + }, + { + "epoch": 0.1824028061970184, + "grad_norm": 0.1591794341802597, + "learning_rate": 9.389671361502349e-06, + "loss": 12.619, + "step": 39 + }, + { + "epoch": 0.1870798012277112, + "grad_norm": 0.2054363638162613, + "learning_rate": 9.374021909233178e-06, + "loss": 11.8876, + "step": 40 + }, + { + "epoch": 0.19175679625840397, + "grad_norm": 0.23818843066692352, + "learning_rate": 9.358372456964007e-06, + "loss": 13.4683, + "step": 41 + }, + { + "epoch": 0.19643379128909674, + "grad_norm": 0.32269319891929626, + "learning_rate": 9.342723004694837e-06, + "loss": 12.5351, + "step": 42 + }, + { + "epoch": 0.20111078631978954, + "grad_norm": 0.29193466901779175, + "learning_rate": 9.327073552425666e-06, + "loss": 11.917, + "step": 43 + }, + { + "epoch": 0.2057877813504823, + "grad_norm": 0.20844891667366028, + "learning_rate": 9.311424100156495e-06, + "loss": 12.0984, + "step": 44 + }, + { + "epoch": 0.21046477638117508, + "grad_norm": 0.26920032501220703, + "learning_rate": 9.295774647887325e-06, + "loss": 14.1542, + "step": 45 + }, + { + "epoch": 0.21514177141186788, + "grad_norm": 0.20874425768852234, + "learning_rate": 9.280125195618154e-06, + "loss": 13.9397, + "step": 46 + }, + { + "epoch": 0.21981876644256065, + "grad_norm": 0.28703245520591736, + "learning_rate": 9.264475743348983e-06, + "loss": 12.7704, + "step": 47 + }, + { + "epoch": 0.22449576147325342, + "grad_norm": 0.23402653634548187, + "learning_rate": 9.248826291079813e-06, + "loss": 12.8326, + "step": 48 + }, + { + "epoch": 0.22917275650394622, + "grad_norm": 0.28065574169158936, + "learning_rate": 9.233176838810642e-06, + "loss": 11.4735, + "step": 49 + }, + { + "epoch": 0.233849751534639, + "grad_norm": 0.21932877600193024, + "learning_rate": 9.217527386541471e-06, + "loss": 12.2491, + "step": 50 + }, + { + "epoch": 0.23852674656533177, + "grad_norm": 0.24466539919376373, + "learning_rate": 9.2018779342723e-06, + "loss": 12.3501, + "step": 51 + }, + { + "epoch": 0.24320374159602456, + "grad_norm": 0.17424331605434418, + "learning_rate": 9.186228482003131e-06, + "loss": 12.6445, + "step": 52 + }, + { + "epoch": 0.24788073662671734, + "grad_norm": 0.2298133671283722, + "learning_rate": 9.17057902973396e-06, + "loss": 12.5759, + "step": 53 + }, + { + "epoch": 0.25255773165741013, + "grad_norm": 0.30562305450439453, + "learning_rate": 9.15492957746479e-06, + "loss": 13.4988, + "step": 54 + }, + { + "epoch": 0.2572347266881029, + "grad_norm": 0.21225547790527344, + "learning_rate": 9.13928012519562e-06, + "loss": 13.3909, + "step": 55 + }, + { + "epoch": 0.2619117217187957, + "grad_norm": 0.3120986223220825, + "learning_rate": 9.123630672926449e-06, + "loss": 13.8276, + "step": 56 + }, + { + "epoch": 0.2665887167494885, + "grad_norm": 0.18036110699176788, + "learning_rate": 9.107981220657278e-06, + "loss": 13.7724, + "step": 57 + }, + { + "epoch": 0.2712657117801812, + "grad_norm": 0.22987115383148193, + "learning_rate": 9.092331768388107e-06, + "loss": 12.2669, + "step": 58 + }, + { + "epoch": 0.275942706810874, + "grad_norm": 0.23878921568393707, + "learning_rate": 9.076682316118937e-06, + "loss": 12.7097, + "step": 59 + }, + { + "epoch": 0.2806197018415668, + "grad_norm": 0.20319631695747375, + "learning_rate": 9.061032863849766e-06, + "loss": 12.2795, + "step": 60 + }, + { + "epoch": 0.28529669687225956, + "grad_norm": 0.18609336018562317, + "learning_rate": 9.045383411580595e-06, + "loss": 11.2712, + "step": 61 + }, + { + "epoch": 0.28997369190295236, + "grad_norm": 0.21320512890815735, + "learning_rate": 9.029733959311425e-06, + "loss": 11.6637, + "step": 62 + }, + { + "epoch": 0.29465068693364516, + "grad_norm": 0.23330001533031464, + "learning_rate": 9.014084507042254e-06, + "loss": 12.9509, + "step": 63 + }, + { + "epoch": 0.2993276819643379, + "grad_norm": 0.21313583850860596, + "learning_rate": 8.998435054773083e-06, + "loss": 13.8547, + "step": 64 + }, + { + "epoch": 0.3040046769950307, + "grad_norm": 0.20739194750785828, + "learning_rate": 8.982785602503912e-06, + "loss": 12.735, + "step": 65 + }, + { + "epoch": 0.3086816720257235, + "grad_norm": 0.2453576922416687, + "learning_rate": 8.967136150234742e-06, + "loss": 12.7951, + "step": 66 + }, + { + "epoch": 0.31335866705641624, + "grad_norm": 0.21135878562927246, + "learning_rate": 8.951486697965573e-06, + "loss": 13.7611, + "step": 67 + }, + { + "epoch": 0.31803566208710904, + "grad_norm": 0.2257193773984909, + "learning_rate": 8.935837245696402e-06, + "loss": 11.3833, + "step": 68 + }, + { + "epoch": 0.32271265711780184, + "grad_norm": 0.1934535950422287, + "learning_rate": 8.920187793427231e-06, + "loss": 11.4428, + "step": 69 + }, + { + "epoch": 0.3273896521484946, + "grad_norm": 0.19537678360939026, + "learning_rate": 8.90453834115806e-06, + "loss": 13.1129, + "step": 70 + }, + { + "epoch": 0.3320666471791874, + "grad_norm": 0.2596362233161926, + "learning_rate": 8.888888888888888e-06, + "loss": 11.9323, + "step": 71 + }, + { + "epoch": 0.3367436422098802, + "grad_norm": 0.28119221329689026, + "learning_rate": 8.87323943661972e-06, + "loss": 12.1397, + "step": 72 + }, + { + "epoch": 0.3414206372405729, + "grad_norm": 0.2443932145833969, + "learning_rate": 8.857589984350549e-06, + "loss": 11.1756, + "step": 73 + }, + { + "epoch": 0.3460976322712657, + "grad_norm": 0.23586861789226532, + "learning_rate": 8.841940532081378e-06, + "loss": 12.2808, + "step": 74 + }, + { + "epoch": 0.3507746273019585, + "grad_norm": 0.2984711229801178, + "learning_rate": 8.826291079812207e-06, + "loss": 11.8437, + "step": 75 + }, + { + "epoch": 0.35545162233265126, + "grad_norm": 0.2404984086751938, + "learning_rate": 8.810641627543037e-06, + "loss": 11.6321, + "step": 76 + }, + { + "epoch": 0.36012861736334406, + "grad_norm": 0.22745920717716217, + "learning_rate": 8.794992175273866e-06, + "loss": 12.969, + "step": 77 + }, + { + "epoch": 0.3648056123940368, + "grad_norm": 0.22989057004451752, + "learning_rate": 8.779342723004695e-06, + "loss": 12.1793, + "step": 78 + }, + { + "epoch": 0.3694826074247296, + "grad_norm": 0.22097162902355194, + "learning_rate": 8.763693270735524e-06, + "loss": 12.5693, + "step": 79 + }, + { + "epoch": 0.3741596024554224, + "grad_norm": 0.19985444843769073, + "learning_rate": 8.748043818466354e-06, + "loss": 13.3868, + "step": 80 + }, + { + "epoch": 0.37883659748611515, + "grad_norm": 0.2339348942041397, + "learning_rate": 8.732394366197183e-06, + "loss": 11.5905, + "step": 81 + }, + { + "epoch": 0.38351359251680794, + "grad_norm": 0.28241512179374695, + "learning_rate": 8.716744913928014e-06, + "loss": 12.6998, + "step": 82 + }, + { + "epoch": 0.38819058754750074, + "grad_norm": 0.2848986089229584, + "learning_rate": 8.701095461658843e-06, + "loss": 11.3058, + "step": 83 + }, + { + "epoch": 0.3928675825781935, + "grad_norm": 0.2118872106075287, + "learning_rate": 8.685446009389673e-06, + "loss": 10.4664, + "step": 84 + }, + { + "epoch": 0.3975445776088863, + "grad_norm": 0.16718249022960663, + "learning_rate": 8.669796557120502e-06, + "loss": 13.2492, + "step": 85 + }, + { + "epoch": 0.4022215726395791, + "grad_norm": 0.2131660282611847, + "learning_rate": 8.65414710485133e-06, + "loss": 12.0166, + "step": 86 + }, + { + "epoch": 0.4068985676702718, + "grad_norm": 0.2012370079755783, + "learning_rate": 8.63849765258216e-06, + "loss": 12.5326, + "step": 87 + }, + { + "epoch": 0.4115755627009646, + "grad_norm": 0.2684880793094635, + "learning_rate": 8.62284820031299e-06, + "loss": 12.8071, + "step": 88 + }, + { + "epoch": 0.4162525577316574, + "grad_norm": 0.2500629127025604, + "learning_rate": 8.60719874804382e-06, + "loss": 12.1628, + "step": 89 + }, + { + "epoch": 0.42092955276235017, + "grad_norm": 0.18125677108764648, + "learning_rate": 8.591549295774648e-06, + "loss": 11.3137, + "step": 90 + }, + { + "epoch": 0.42560654779304297, + "grad_norm": 0.1830630898475647, + "learning_rate": 8.575899843505478e-06, + "loss": 11.8507, + "step": 91 + }, + { + "epoch": 0.43028354282373577, + "grad_norm": 0.1481466144323349, + "learning_rate": 8.560250391236307e-06, + "loss": 10.8795, + "step": 92 + }, + { + "epoch": 0.4349605378544285, + "grad_norm": 0.18768347799777985, + "learning_rate": 8.544600938967136e-06, + "loss": 11.2509, + "step": 93 + }, + { + "epoch": 0.4396375328851213, + "grad_norm": 0.22724182903766632, + "learning_rate": 8.528951486697966e-06, + "loss": 11.6564, + "step": 94 + }, + { + "epoch": 0.4443145279158141, + "grad_norm": 0.1806531399488449, + "learning_rate": 8.513302034428795e-06, + "loss": 11.9111, + "step": 95 + }, + { + "epoch": 0.44899152294650685, + "grad_norm": 0.2578674554824829, + "learning_rate": 8.497652582159626e-06, + "loss": 13.1609, + "step": 96 + }, + { + "epoch": 0.45366851797719965, + "grad_norm": 0.21666157245635986, + "learning_rate": 8.482003129890455e-06, + "loss": 12.3285, + "step": 97 + }, + { + "epoch": 0.45834551300789245, + "grad_norm": 0.2574619948863983, + "learning_rate": 8.466353677621285e-06, + "loss": 11.4998, + "step": 98 + }, + { + "epoch": 0.4630225080385852, + "grad_norm": 0.28588882088661194, + "learning_rate": 8.450704225352114e-06, + "loss": 11.0233, + "step": 99 + }, + { + "epoch": 0.467699503069278, + "grad_norm": 0.28356659412384033, + "learning_rate": 8.435054773082943e-06, + "loss": 10.9355, + "step": 100 + }, + { + "epoch": 0.4723764980999708, + "grad_norm": 0.18748782575130463, + "learning_rate": 8.419405320813773e-06, + "loss": 13.5926, + "step": 101 + }, + { + "epoch": 0.47705349313066353, + "grad_norm": 0.17172126471996307, + "learning_rate": 8.403755868544602e-06, + "loss": 11.4017, + "step": 102 + }, + { + "epoch": 0.48173048816135633, + "grad_norm": 0.1956973671913147, + "learning_rate": 8.388106416275431e-06, + "loss": 12.1463, + "step": 103 + }, + { + "epoch": 0.48640748319204913, + "grad_norm": 0.30823975801467896, + "learning_rate": 8.37245696400626e-06, + "loss": 10.2949, + "step": 104 + }, + { + "epoch": 0.49108447822274187, + "grad_norm": 0.23158958554267883, + "learning_rate": 8.35680751173709e-06, + "loss": 11.2003, + "step": 105 + }, + { + "epoch": 0.49576147325343467, + "grad_norm": 0.23977261781692505, + "learning_rate": 8.341158059467919e-06, + "loss": 11.5904, + "step": 106 + }, + { + "epoch": 0.5004384682841274, + "grad_norm": 0.17250728607177734, + "learning_rate": 8.325508607198748e-06, + "loss": 11.2648, + "step": 107 + }, + { + "epoch": 0.5051154633148203, + "grad_norm": 0.23300261795520782, + "learning_rate": 8.309859154929578e-06, + "loss": 11.9646, + "step": 108 + }, + { + "epoch": 0.509792458345513, + "grad_norm": 0.2430488020181656, + "learning_rate": 8.294209702660407e-06, + "loss": 12.046, + "step": 109 + }, + { + "epoch": 0.5144694533762058, + "grad_norm": 0.18206799030303955, + "learning_rate": 8.278560250391236e-06, + "loss": 12.0767, + "step": 110 + }, + { + "epoch": 0.5191464484068986, + "grad_norm": 0.25876322388648987, + "learning_rate": 8.262910798122067e-06, + "loss": 11.7794, + "step": 111 + }, + { + "epoch": 0.5238234434375914, + "grad_norm": 0.28936639428138733, + "learning_rate": 8.247261345852897e-06, + "loss": 10.3819, + "step": 112 + }, + { + "epoch": 0.5285004384682841, + "grad_norm": 0.214036762714386, + "learning_rate": 8.231611893583726e-06, + "loss": 10.3209, + "step": 113 + }, + { + "epoch": 0.533177433498977, + "grad_norm": 0.23764470219612122, + "learning_rate": 8.215962441314555e-06, + "loss": 10.8417, + "step": 114 + }, + { + "epoch": 0.5378544285296697, + "grad_norm": 0.2604602575302124, + "learning_rate": 8.200312989045383e-06, + "loss": 12.534, + "step": 115 + }, + { + "epoch": 0.5425314235603624, + "grad_norm": 0.24597330391407013, + "learning_rate": 8.184663536776214e-06, + "loss": 12.348, + "step": 116 + }, + { + "epoch": 0.5472084185910553, + "grad_norm": 0.2204928994178772, + "learning_rate": 8.169014084507043e-06, + "loss": 10.979, + "step": 117 + }, + { + "epoch": 0.551885413621748, + "grad_norm": 0.15487593412399292, + "learning_rate": 8.153364632237872e-06, + "loss": 11.0756, + "step": 118 + }, + { + "epoch": 0.5565624086524408, + "grad_norm": 0.23864871263504028, + "learning_rate": 8.137715179968702e-06, + "loss": 11.66, + "step": 119 + }, + { + "epoch": 0.5612394036831336, + "grad_norm": 0.22024200856685638, + "learning_rate": 8.122065727699531e-06, + "loss": 10.7713, + "step": 120 + }, + { + "epoch": 0.5659163987138264, + "grad_norm": 0.19292014837265015, + "learning_rate": 8.10641627543036e-06, + "loss": 9.4704, + "step": 121 + }, + { + "epoch": 0.5705933937445191, + "grad_norm": 0.16765080392360687, + "learning_rate": 8.09076682316119e-06, + "loss": 10.7993, + "step": 122 + }, + { + "epoch": 0.575270388775212, + "grad_norm": 0.26758840680122375, + "learning_rate": 8.075117370892019e-06, + "loss": 11.354, + "step": 123 + }, + { + "epoch": 0.5799473838059047, + "grad_norm": 0.25225985050201416, + "learning_rate": 8.059467918622848e-06, + "loss": 11.2162, + "step": 124 + }, + { + "epoch": 0.5846243788365975, + "grad_norm": 0.22062422335147858, + "learning_rate": 8.043818466353678e-06, + "loss": 9.9452, + "step": 125 + }, + { + "epoch": 0.5893013738672903, + "grad_norm": 0.2589726746082306, + "learning_rate": 8.028169014084509e-06, + "loss": 11.6098, + "step": 126 + }, + { + "epoch": 0.5939783688979831, + "grad_norm": 0.23492346704006195, + "learning_rate": 8.012519561815338e-06, + "loss": 10.6918, + "step": 127 + }, + { + "epoch": 0.5986553639286758, + "grad_norm": 0.29631978273391724, + "learning_rate": 7.996870109546167e-06, + "loss": 11.4451, + "step": 128 + }, + { + "epoch": 0.6033323589593687, + "grad_norm": 0.195633202791214, + "learning_rate": 7.981220657276996e-06, + "loss": 11.3396, + "step": 129 + }, + { + "epoch": 0.6080093539900614, + "grad_norm": 0.14094115793704987, + "learning_rate": 7.965571205007824e-06, + "loss": 10.9388, + "step": 130 + }, + { + "epoch": 0.6126863490207541, + "grad_norm": 0.2307533323764801, + "learning_rate": 7.949921752738655e-06, + "loss": 12.2129, + "step": 131 + }, + { + "epoch": 0.617363344051447, + "grad_norm": 0.2004641741514206, + "learning_rate": 7.934272300469484e-06, + "loss": 9.9139, + "step": 132 + }, + { + "epoch": 0.6220403390821397, + "grad_norm": 0.22784000635147095, + "learning_rate": 7.918622848200314e-06, + "loss": 10.2306, + "step": 133 + }, + { + "epoch": 0.6267173341128325, + "grad_norm": 0.21663011610507965, + "learning_rate": 7.902973395931143e-06, + "loss": 9.9467, + "step": 134 + }, + { + "epoch": 0.6313943291435253, + "grad_norm": 0.18714800477027893, + "learning_rate": 7.887323943661972e-06, + "loss": 9.7232, + "step": 135 + }, + { + "epoch": 0.6360713241742181, + "grad_norm": 0.23525570333003998, + "learning_rate": 7.871674491392802e-06, + "loss": 9.9539, + "step": 136 + }, + { + "epoch": 0.6407483192049108, + "grad_norm": 0.22870206832885742, + "learning_rate": 7.856025039123631e-06, + "loss": 11.9964, + "step": 137 + }, + { + "epoch": 0.6454253142356037, + "grad_norm": 0.19730104506015778, + "learning_rate": 7.84037558685446e-06, + "loss": 10.8391, + "step": 138 + }, + { + "epoch": 0.6501023092662964, + "grad_norm": 0.1873929351568222, + "learning_rate": 7.82472613458529e-06, + "loss": 10.7179, + "step": 139 + }, + { + "epoch": 0.6547793042969892, + "grad_norm": 0.14801403880119324, + "learning_rate": 7.809076682316119e-06, + "loss": 10.9041, + "step": 140 + }, + { + "epoch": 0.659456299327682, + "grad_norm": 0.21909023821353912, + "learning_rate": 7.79342723004695e-06, + "loss": 11.5497, + "step": 141 + }, + { + "epoch": 0.6641332943583748, + "grad_norm": 0.20469622313976288, + "learning_rate": 7.77777777777778e-06, + "loss": 11.0387, + "step": 142 + }, + { + "epoch": 0.6688102893890675, + "grad_norm": 0.20616918802261353, + "learning_rate": 7.762128325508608e-06, + "loss": 9.5392, + "step": 143 + }, + { + "epoch": 0.6734872844197604, + "grad_norm": 0.1846546232700348, + "learning_rate": 7.746478873239436e-06, + "loss": 11.5538, + "step": 144 + }, + { + "epoch": 0.6781642794504531, + "grad_norm": 0.17778314650058746, + "learning_rate": 7.730829420970265e-06, + "loss": 12.8435, + "step": 145 + }, + { + "epoch": 0.6828412744811458, + "grad_norm": 0.24238605797290802, + "learning_rate": 7.715179968701096e-06, + "loss": 9.4674, + "step": 146 + }, + { + "epoch": 0.6875182695118387, + "grad_norm": 0.20961545407772064, + "learning_rate": 7.699530516431926e-06, + "loss": 10.1325, + "step": 147 + }, + { + "epoch": 0.6921952645425314, + "grad_norm": 0.20476683974266052, + "learning_rate": 7.683881064162755e-06, + "loss": 11.1375, + "step": 148 + }, + { + "epoch": 0.6968722595732242, + "grad_norm": 0.22241833806037903, + "learning_rate": 7.668231611893584e-06, + "loss": 9.6296, + "step": 149 + }, + { + "epoch": 0.701549254603917, + "grad_norm": 0.2302970439195633, + "learning_rate": 7.652582159624414e-06, + "loss": 10.8763, + "step": 150 + }, + { + "epoch": 0.7062262496346098, + "grad_norm": 0.20484097301959991, + "learning_rate": 7.636932707355243e-06, + "loss": 9.0306, + "step": 151 + }, + { + "epoch": 0.7109032446653025, + "grad_norm": 0.20411114394664764, + "learning_rate": 7.621283255086073e-06, + "loss": 11.5865, + "step": 152 + }, + { + "epoch": 0.7155802396959953, + "grad_norm": 0.37148869037628174, + "learning_rate": 7.6056338028169015e-06, + "loss": 10.4929, + "step": 153 + }, + { + "epoch": 0.7202572347266881, + "grad_norm": 0.19864030182361603, + "learning_rate": 7.589984350547731e-06, + "loss": 10.4561, + "step": 154 + }, + { + "epoch": 0.7249342297573809, + "grad_norm": 0.21187515556812286, + "learning_rate": 7.574334898278561e-06, + "loss": 9.6848, + "step": 155 + }, + { + "epoch": 0.7296112247880736, + "grad_norm": 0.18564990162849426, + "learning_rate": 7.55868544600939e-06, + "loss": 11.2932, + "step": 156 + }, + { + "epoch": 0.7342882198187665, + "grad_norm": 0.21274517476558685, + "learning_rate": 7.54303599374022e-06, + "loss": 10.2206, + "step": 157 + }, + { + "epoch": 0.7389652148494592, + "grad_norm": 0.23622578382492065, + "learning_rate": 7.527386541471049e-06, + "loss": 9.4342, + "step": 158 + }, + { + "epoch": 0.743642209880152, + "grad_norm": 0.21262332797050476, + "learning_rate": 7.511737089201878e-06, + "loss": 11.4181, + "step": 159 + }, + { + "epoch": 0.7483192049108448, + "grad_norm": 0.22142890095710754, + "learning_rate": 7.496087636932708e-06, + "loss": 10.4912, + "step": 160 + }, + { + "epoch": 0.7529961999415375, + "grad_norm": 0.219626322388649, + "learning_rate": 7.480438184663538e-06, + "loss": 10.902, + "step": 161 + }, + { + "epoch": 0.7576731949722303, + "grad_norm": 0.19913645088672638, + "learning_rate": 7.464788732394367e-06, + "loss": 8.9078, + "step": 162 + }, + { + "epoch": 0.7623501900029231, + "grad_norm": 0.19409991800785065, + "learning_rate": 7.449139280125196e-06, + "loss": 10.7111, + "step": 163 + }, + { + "epoch": 0.7670271850336159, + "grad_norm": 0.20056220889091492, + "learning_rate": 7.433489827856026e-06, + "loss": 11.438, + "step": 164 + }, + { + "epoch": 0.7717041800643086, + "grad_norm": 0.19502754509449005, + "learning_rate": 7.417840375586856e-06, + "loss": 10.1837, + "step": 165 + }, + { + "epoch": 0.7763811750950015, + "grad_norm": 0.17272567749023438, + "learning_rate": 7.402190923317685e-06, + "loss": 10.7406, + "step": 166 + }, + { + "epoch": 0.7810581701256942, + "grad_norm": 0.19558610022068024, + "learning_rate": 7.386541471048514e-06, + "loss": 10.1322, + "step": 167 + }, + { + "epoch": 0.785735165156387, + "grad_norm": 0.2161480039358139, + "learning_rate": 7.370892018779343e-06, + "loss": 9.7506, + "step": 168 + }, + { + "epoch": 0.7904121601870798, + "grad_norm": 0.25595343112945557, + "learning_rate": 7.355242566510172e-06, + "loss": 11.0059, + "step": 169 + }, + { + "epoch": 0.7950891552177726, + "grad_norm": 0.21218866109848022, + "learning_rate": 7.339593114241002e-06, + "loss": 11.2122, + "step": 170 + }, + { + "epoch": 0.7997661502484653, + "grad_norm": 0.1922176331281662, + "learning_rate": 7.3239436619718316e-06, + "loss": 11.0585, + "step": 171 + }, + { + "epoch": 0.8044431452791582, + "grad_norm": 0.1726471334695816, + "learning_rate": 7.308294209702661e-06, + "loss": 11.3007, + "step": 172 + }, + { + "epoch": 0.8091201403098509, + "grad_norm": 0.20865805447101593, + "learning_rate": 7.29264475743349e-06, + "loss": 12.5848, + "step": 173 + }, + { + "epoch": 0.8137971353405437, + "grad_norm": 0.2097303569316864, + "learning_rate": 7.2769953051643195e-06, + "loss": 11.694, + "step": 174 + }, + { + "epoch": 0.8184741303712365, + "grad_norm": 0.22343699634075165, + "learning_rate": 7.26134585289515e-06, + "loss": 9.9861, + "step": 175 + }, + { + "epoch": 0.8231511254019293, + "grad_norm": 0.19908592104911804, + "learning_rate": 7.245696400625979e-06, + "loss": 10.7263, + "step": 176 + }, + { + "epoch": 0.827828120432622, + "grad_norm": 0.2062506079673767, + "learning_rate": 7.230046948356808e-06, + "loss": 10.7234, + "step": 177 + }, + { + "epoch": 0.8325051154633148, + "grad_norm": 0.23186688125133514, + "learning_rate": 7.2143974960876376e-06, + "loss": 10.7846, + "step": 178 + }, + { + "epoch": 0.8371821104940076, + "grad_norm": 0.20528610050678253, + "learning_rate": 7.198748043818467e-06, + "loss": 10.6732, + "step": 179 + }, + { + "epoch": 0.8418591055247003, + "grad_norm": 0.21028846502304077, + "learning_rate": 7.183098591549297e-06, + "loss": 9.5007, + "step": 180 + }, + { + "epoch": 0.8465361005553932, + "grad_norm": 0.1943686306476593, + "learning_rate": 7.167449139280126e-06, + "loss": 10.6163, + "step": 181 + }, + { + "epoch": 0.8512130955860859, + "grad_norm": 0.15791501104831696, + "learning_rate": 7.151799687010955e-06, + "loss": 10.4564, + "step": 182 + }, + { + "epoch": 0.8558900906167787, + "grad_norm": 0.15603427588939667, + "learning_rate": 7.136150234741784e-06, + "loss": 11.4006, + "step": 183 + }, + { + "epoch": 0.8605670856474715, + "grad_norm": 0.1737872064113617, + "learning_rate": 7.120500782472613e-06, + "loss": 10.2583, + "step": 184 + }, + { + "epoch": 0.8652440806781643, + "grad_norm": 0.16742144525051117, + "learning_rate": 7.1048513302034435e-06, + "loss": 9.6543, + "step": 185 + }, + { + "epoch": 0.869921075708857, + "grad_norm": 0.2204071581363678, + "learning_rate": 7.089201877934273e-06, + "loss": 10.6068, + "step": 186 + }, + { + "epoch": 0.8745980707395499, + "grad_norm": 0.17526549100875854, + "learning_rate": 7.073552425665102e-06, + "loss": 10.5927, + "step": 187 + }, + { + "epoch": 0.8792750657702426, + "grad_norm": 0.18857762217521667, + "learning_rate": 7.0579029733959315e-06, + "loss": 10.0686, + "step": 188 + }, + { + "epoch": 0.8839520608009354, + "grad_norm": 0.16617538034915924, + "learning_rate": 7.042253521126761e-06, + "loss": 11.0356, + "step": 189 + }, + { + "epoch": 0.8886290558316282, + "grad_norm": 0.20443867146968842, + "learning_rate": 7.026604068857591e-06, + "loss": 9.764, + "step": 190 + }, + { + "epoch": 0.893306050862321, + "grad_norm": 0.16466206312179565, + "learning_rate": 7.01095461658842e-06, + "loss": 8.9783, + "step": 191 + }, + { + "epoch": 0.8979830458930137, + "grad_norm": 0.2051703780889511, + "learning_rate": 6.9953051643192495e-06, + "loss": 10.5345, + "step": 192 + }, + { + "epoch": 0.9026600409237066, + "grad_norm": 0.19935429096221924, + "learning_rate": 6.979655712050079e-06, + "loss": 10.1047, + "step": 193 + }, + { + "epoch": 0.9073370359543993, + "grad_norm": 0.14471961557865143, + "learning_rate": 6.964006259780907e-06, + "loss": 8.9315, + "step": 194 + }, + { + "epoch": 0.912014030985092, + "grad_norm": 0.21026520431041718, + "learning_rate": 6.948356807511738e-06, + "loss": 11.0192, + "step": 195 + }, + { + "epoch": 0.9166910260157849, + "grad_norm": 0.22124925255775452, + "learning_rate": 6.932707355242568e-06, + "loss": 10.7211, + "step": 196 + }, + { + "epoch": 0.9213680210464776, + "grad_norm": 0.6166573166847229, + "learning_rate": 6.917057902973396e-06, + "loss": 10.1654, + "step": 197 + }, + { + "epoch": 0.9260450160771704, + "grad_norm": 0.14892670512199402, + "learning_rate": 6.901408450704225e-06, + "loss": 9.6949, + "step": 198 + }, + { + "epoch": 0.9307220111078632, + "grad_norm": 0.17058013379573822, + "learning_rate": 6.885758998435055e-06, + "loss": 9.9864, + "step": 199 + }, + { + "epoch": 0.935399006138556, + "grad_norm": 0.19176752865314484, + "learning_rate": 6.870109546165885e-06, + "loss": 9.7219, + "step": 200 + }, + { + "epoch": 0.9400760011692487, + "grad_norm": 0.1923060268163681, + "learning_rate": 6.854460093896714e-06, + "loss": 9.0111, + "step": 201 + }, + { + "epoch": 0.9447529961999416, + "grad_norm": 0.22771762311458588, + "learning_rate": 6.8388106416275434e-06, + "loss": 9.9277, + "step": 202 + }, + { + "epoch": 0.9494299912306343, + "grad_norm": 0.21972382068634033, + "learning_rate": 6.823161189358373e-06, + "loss": 10.5451, + "step": 203 + }, + { + "epoch": 0.9541069862613271, + "grad_norm": 0.32944294810295105, + "learning_rate": 6.807511737089203e-06, + "loss": 9.8053, + "step": 204 + }, + { + "epoch": 0.9587839812920199, + "grad_norm": 0.1875985562801361, + "learning_rate": 6.791862284820032e-06, + "loss": 10.3256, + "step": 205 + }, + { + "epoch": 0.9634609763227127, + "grad_norm": 0.17583012580871582, + "learning_rate": 6.7762128325508615e-06, + "loss": 10.4922, + "step": 206 + }, + { + "epoch": 0.9681379713534054, + "grad_norm": 0.22149552404880524, + "learning_rate": 6.760563380281691e-06, + "loss": 10.1547, + "step": 207 + }, + { + "epoch": 0.9728149663840983, + "grad_norm": 0.18506276607513428, + "learning_rate": 6.74491392801252e-06, + "loss": 10.5188, + "step": 208 + }, + { + "epoch": 0.977491961414791, + "grad_norm": 0.21199573576450348, + "learning_rate": 6.72926447574335e-06, + "loss": 11.3258, + "step": 209 + }, + { + "epoch": 0.9821689564454837, + "grad_norm": 0.18747669458389282, + "learning_rate": 6.71361502347418e-06, + "loss": 10.251, + "step": 210 + }, + { + "epoch": 0.9868459514761766, + "grad_norm": 0.1887262761592865, + "learning_rate": 6.697965571205008e-06, + "loss": 9.2012, + "step": 211 + }, + { + "epoch": 0.9915229465068693, + "grad_norm": 0.16557927429676056, + "learning_rate": 6.682316118935837e-06, + "loss": 9.2171, + "step": 212 + }, + { + "epoch": 0.9961999415375621, + "grad_norm": 0.19340123236179352, + "learning_rate": 6.666666666666667e-06, + "loss": 9.4988, + "step": 213 + }, + { + "epoch": 1.0046769950306929, + "grad_norm": 0.3001099228858948, + "learning_rate": 6.651017214397497e-06, + "loss": 11.8577, + "step": 214 + }, + { + "epoch": 1.0093539900613855, + "grad_norm": 0.18085287511348724, + "learning_rate": 6.635367762128326e-06, + "loss": 10.4356, + "step": 215 + }, + { + "epoch": 1.0140309850920783, + "grad_norm": 0.17791183292865753, + "learning_rate": 6.619718309859155e-06, + "loss": 10.3929, + "step": 216 + }, + { + "epoch": 1.0187079801227712, + "grad_norm": 0.20649202167987823, + "learning_rate": 6.604068857589985e-06, + "loss": 9.342, + "step": 217 + }, + { + "epoch": 1.0233849751534638, + "grad_norm": 0.2049955129623413, + "learning_rate": 6.588419405320814e-06, + "loss": 10.656, + "step": 218 + }, + { + "epoch": 1.0280619701841567, + "grad_norm": 0.18064165115356445, + "learning_rate": 6.572769953051644e-06, + "loss": 10.1633, + "step": 219 + }, + { + "epoch": 1.0327389652148495, + "grad_norm": 0.1652020812034607, + "learning_rate": 6.5571205007824735e-06, + "loss": 8.9937, + "step": 220 + }, + { + "epoch": 1.0374159602455422, + "grad_norm": 0.16658996045589447, + "learning_rate": 6.541471048513303e-06, + "loss": 11.0051, + "step": 221 + }, + { + "epoch": 1.042092955276235, + "grad_norm": 0.1875378042459488, + "learning_rate": 6.525821596244132e-06, + "loss": 9.7089, + "step": 222 + }, + { + "epoch": 1.0467699503069279, + "grad_norm": 0.19267050921916962, + "learning_rate": 6.510172143974961e-06, + "loss": 10.0252, + "step": 223 + }, + { + "epoch": 1.0514469453376205, + "grad_norm": 0.2656681537628174, + "learning_rate": 6.4945226917057916e-06, + "loss": 9.7082, + "step": 224 + }, + { + "epoch": 1.0561239403683134, + "grad_norm": 0.16058804094791412, + "learning_rate": 6.478873239436621e-06, + "loss": 9.6689, + "step": 225 + }, + { + "epoch": 1.0608009353990062, + "grad_norm": 0.14145280420780182, + "learning_rate": 6.463223787167449e-06, + "loss": 8.6923, + "step": 226 + }, + { + "epoch": 1.0654779304296988, + "grad_norm": 0.14217382669448853, + "learning_rate": 6.447574334898279e-06, + "loss": 10.4302, + "step": 227 + }, + { + "epoch": 1.0701549254603917, + "grad_norm": 0.18387371301651, + "learning_rate": 6.431924882629108e-06, + "loss": 9.5514, + "step": 228 + }, + { + "epoch": 1.0748319204910846, + "grad_norm": 0.15731996297836304, + "learning_rate": 6.416275430359938e-06, + "loss": 9.2854, + "step": 229 + }, + { + "epoch": 1.0795089155217772, + "grad_norm": 0.1794990450143814, + "learning_rate": 6.400625978090767e-06, + "loss": 11.0837, + "step": 230 + }, + { + "epoch": 1.08418591055247, + "grad_norm": 0.19289837777614594, + "learning_rate": 6.384976525821597e-06, + "loss": 9.3129, + "step": 231 + }, + { + "epoch": 1.088862905583163, + "grad_norm": 0.1858958899974823, + "learning_rate": 6.369327073552426e-06, + "loss": 10.7238, + "step": 232 + }, + { + "epoch": 1.0935399006138555, + "grad_norm": 0.26388686895370483, + "learning_rate": 6.353677621283255e-06, + "loss": 9.2242, + "step": 233 + }, + { + "epoch": 1.0982168956445484, + "grad_norm": 0.17551296949386597, + "learning_rate": 6.3380281690140855e-06, + "loss": 8.3665, + "step": 234 + }, + { + "epoch": 1.1028938906752412, + "grad_norm": 0.20290863513946533, + "learning_rate": 6.322378716744915e-06, + "loss": 9.6916, + "step": 235 + }, + { + "epoch": 1.1075708857059339, + "grad_norm": 0.11323179304599762, + "learning_rate": 6.306729264475744e-06, + "loss": 10.218, + "step": 236 + }, + { + "epoch": 1.1122478807366267, + "grad_norm": 0.22893109917640686, + "learning_rate": 6.291079812206573e-06, + "loss": 10.3068, + "step": 237 + }, + { + "epoch": 1.1169248757673196, + "grad_norm": 0.1943362057209015, + "learning_rate": 6.275430359937402e-06, + "loss": 9.738, + "step": 238 + }, + { + "epoch": 1.1216018707980122, + "grad_norm": 0.22017931938171387, + "learning_rate": 6.259780907668233e-06, + "loss": 8.5765, + "step": 239 + }, + { + "epoch": 1.126278865828705, + "grad_norm": 0.1584814190864563, + "learning_rate": 6.244131455399062e-06, + "loss": 11.0436, + "step": 240 + }, + { + "epoch": 1.130955860859398, + "grad_norm": 0.182816743850708, + "learning_rate": 6.228482003129891e-06, + "loss": 11.1518, + "step": 241 + }, + { + "epoch": 1.1356328558900906, + "grad_norm": 0.21375828981399536, + "learning_rate": 6.21283255086072e-06, + "loss": 10.0972, + "step": 242 + }, + { + "epoch": 1.1403098509207834, + "grad_norm": 0.1926356703042984, + "learning_rate": 6.197183098591549e-06, + "loss": 9.0861, + "step": 243 + }, + { + "epoch": 1.1449868459514763, + "grad_norm": 0.13788476586341858, + "learning_rate": 6.181533646322379e-06, + "loss": 9.1896, + "step": 244 + }, + { + "epoch": 1.149663840982169, + "grad_norm": 0.24886344373226166, + "learning_rate": 6.165884194053209e-06, + "loss": 8.9126, + "step": 245 + }, + { + "epoch": 1.1543408360128617, + "grad_norm": 0.21492387354373932, + "learning_rate": 6.150234741784038e-06, + "loss": 9.1809, + "step": 246 + }, + { + "epoch": 1.1590178310435546, + "grad_norm": 0.20666466653347015, + "learning_rate": 6.134585289514867e-06, + "loss": 9.8609, + "step": 247 + }, + { + "epoch": 1.1636948260742472, + "grad_norm": 0.12884530425071716, + "learning_rate": 6.118935837245697e-06, + "loss": 9.0015, + "step": 248 + }, + { + "epoch": 1.16837182110494, + "grad_norm": 0.2109869420528412, + "learning_rate": 6.103286384976527e-06, + "loss": 8.4398, + "step": 249 + }, + { + "epoch": 1.173048816135633, + "grad_norm": 0.1602170467376709, + "learning_rate": 6.087636932707356e-06, + "loss": 8.9123, + "step": 250 + }, + { + "epoch": 1.1777258111663256, + "grad_norm": 0.1901443898677826, + "learning_rate": 6.071987480438185e-06, + "loss": 9.3279, + "step": 251 + }, + { + "epoch": 1.1824028061970184, + "grad_norm": 0.12106055021286011, + "learning_rate": 6.056338028169015e-06, + "loss": 8.8215, + "step": 252 + }, + { + "epoch": 1.1870798012277113, + "grad_norm": 0.15600277483463287, + "learning_rate": 6.040688575899843e-06, + "loss": 9.5461, + "step": 253 + }, + { + "epoch": 1.191756796258404, + "grad_norm": 0.211564302444458, + "learning_rate": 6.025039123630674e-06, + "loss": 9.9196, + "step": 254 + }, + { + "epoch": 1.1964337912890968, + "grad_norm": 0.16480544209480286, + "learning_rate": 6.0093896713615026e-06, + "loss": 9.488, + "step": 255 + }, + { + "epoch": 1.2011107863197896, + "grad_norm": 0.22194457054138184, + "learning_rate": 5.993740219092332e-06, + "loss": 10.415, + "step": 256 + }, + { + "epoch": 1.2057877813504823, + "grad_norm": 0.27972927689552307, + "learning_rate": 5.978090766823161e-06, + "loss": 9.3022, + "step": 257 + }, + { + "epoch": 1.2104647763811751, + "grad_norm": 0.23484700918197632, + "learning_rate": 5.9624413145539905e-06, + "loss": 8.218, + "step": 258 + }, + { + "epoch": 1.215141771411868, + "grad_norm": 0.20119240880012512, + "learning_rate": 5.946791862284821e-06, + "loss": 8.204, + "step": 259 + }, + { + "epoch": 1.2198187664425606, + "grad_norm": 0.19867953658103943, + "learning_rate": 5.93114241001565e-06, + "loss": 9.4491, + "step": 260 + }, + { + "epoch": 1.2244957614732535, + "grad_norm": 0.19878610968589783, + "learning_rate": 5.915492957746479e-06, + "loss": 10.683, + "step": 261 + }, + { + "epoch": 1.2291727565039463, + "grad_norm": 0.18710929155349731, + "learning_rate": 5.8998435054773086e-06, + "loss": 10.2426, + "step": 262 + }, + { + "epoch": 1.233849751534639, + "grad_norm": 0.1873483806848526, + "learning_rate": 5.884194053208139e-06, + "loss": 10.1553, + "step": 263 + }, + { + "epoch": 1.2385267465653318, + "grad_norm": 0.26153287291526794, + "learning_rate": 5.868544600938968e-06, + "loss": 9.8046, + "step": 264 + }, + { + "epoch": 1.2432037415960246, + "grad_norm": 0.17956022918224335, + "learning_rate": 5.852895148669797e-06, + "loss": 9.2137, + "step": 265 + }, + { + "epoch": 1.2478807366267173, + "grad_norm": 0.15572352707386017, + "learning_rate": 5.837245696400627e-06, + "loss": 9.2382, + "step": 266 + }, + { + "epoch": 1.2525577316574101, + "grad_norm": 0.16768573224544525, + "learning_rate": 5.821596244131456e-06, + "loss": 10.1462, + "step": 267 + }, + { + "epoch": 1.257234726688103, + "grad_norm": 0.14606249332427979, + "learning_rate": 5.805946791862286e-06, + "loss": 9.6735, + "step": 268 + }, + { + "epoch": 1.2619117217187956, + "grad_norm": 0.20985975861549377, + "learning_rate": 5.790297339593115e-06, + "loss": 10.9061, + "step": 269 + }, + { + "epoch": 1.2665887167494885, + "grad_norm": 0.17635460197925568, + "learning_rate": 5.774647887323944e-06, + "loss": 9.1385, + "step": 270 + }, + { + "epoch": 1.271265711780181, + "grad_norm": 0.19080878794193268, + "learning_rate": 5.758998435054773e-06, + "loss": 9.8189, + "step": 271 + }, + { + "epoch": 1.275942706810874, + "grad_norm": 0.1511276364326477, + "learning_rate": 5.7433489827856025e-06, + "loss": 9.9191, + "step": 272 + }, + { + "epoch": 1.2806197018415668, + "grad_norm": 0.2525511085987091, + "learning_rate": 5.727699530516433e-06, + "loss": 8.7398, + "step": 273 + }, + { + "epoch": 1.2852966968722597, + "grad_norm": 0.18259669840335846, + "learning_rate": 5.712050078247262e-06, + "loss": 10.7875, + "step": 274 + }, + { + "epoch": 1.2899736919029523, + "grad_norm": 0.2251911461353302, + "learning_rate": 5.696400625978091e-06, + "loss": 8.9997, + "step": 275 + }, + { + "epoch": 1.2946506869336452, + "grad_norm": 0.17306119203567505, + "learning_rate": 5.6807511737089205e-06, + "loss": 10.0071, + "step": 276 + }, + { + "epoch": 1.2993276819643378, + "grad_norm": 0.23585619032382965, + "learning_rate": 5.66510172143975e-06, + "loss": 9.5575, + "step": 277 + }, + { + "epoch": 1.3040046769950306, + "grad_norm": 0.2100452035665512, + "learning_rate": 5.64945226917058e-06, + "loss": 9.6862, + "step": 278 + }, + { + "epoch": 1.3086816720257235, + "grad_norm": 0.19781209528446198, + "learning_rate": 5.633802816901409e-06, + "loss": 9.6712, + "step": 279 + }, + { + "epoch": 1.3133586670564164, + "grad_norm": 0.20990189909934998, + "learning_rate": 5.618153364632239e-06, + "loss": 9.1145, + "step": 280 + }, + { + "epoch": 1.318035662087109, + "grad_norm": 0.14471188187599182, + "learning_rate": 5.602503912363068e-06, + "loss": 10.0124, + "step": 281 + }, + { + "epoch": 1.3227126571178018, + "grad_norm": 0.181657612323761, + "learning_rate": 5.586854460093896e-06, + "loss": 8.5702, + "step": 282 + }, + { + "epoch": 1.3273896521484945, + "grad_norm": 0.28895941376686096, + "learning_rate": 5.571205007824727e-06, + "loss": 8.7288, + "step": 283 + }, + { + "epoch": 1.3320666471791873, + "grad_norm": 0.19658011198043823, + "learning_rate": 5.555555555555557e-06, + "loss": 10.2721, + "step": 284 + }, + { + "epoch": 1.3367436422098802, + "grad_norm": 0.1778428554534912, + "learning_rate": 5.539906103286385e-06, + "loss": 8.6042, + "step": 285 + }, + { + "epoch": 1.341420637240573, + "grad_norm": 0.1622474491596222, + "learning_rate": 5.5242566510172144e-06, + "loss": 9.0871, + "step": 286 + }, + { + "epoch": 1.3460976322712657, + "grad_norm": 0.17768928408622742, + "learning_rate": 5.508607198748044e-06, + "loss": 9.1438, + "step": 287 + }, + { + "epoch": 1.3507746273019585, + "grad_norm": 0.15472590923309326, + "learning_rate": 5.492957746478874e-06, + "loss": 8.1626, + "step": 288 + }, + { + "epoch": 1.3554516223326512, + "grad_norm": 0.151944100856781, + "learning_rate": 5.477308294209703e-06, + "loss": 10.6628, + "step": 289 + }, + { + "epoch": 1.360128617363344, + "grad_norm": 0.2412179410457611, + "learning_rate": 5.4616588419405325e-06, + "loss": 10.0811, + "step": 290 + }, + { + "epoch": 1.3648056123940369, + "grad_norm": 0.1254899650812149, + "learning_rate": 5.446009389671362e-06, + "loss": 8.7967, + "step": 291 + }, + { + "epoch": 1.3694826074247297, + "grad_norm": 0.1940433233976364, + "learning_rate": 5.430359937402191e-06, + "loss": 10.7896, + "step": 292 + }, + { + "epoch": 1.3741596024554223, + "grad_norm": 0.23099660873413086, + "learning_rate": 5.414710485133021e-06, + "loss": 10.3398, + "step": 293 + }, + { + "epoch": 1.3788365974861152, + "grad_norm": 0.14648781716823578, + "learning_rate": 5.3990610328638506e-06, + "loss": 9.3573, + "step": 294 + }, + { + "epoch": 1.3835135925168078, + "grad_norm": 0.18853303790092468, + "learning_rate": 5.38341158059468e-06, + "loss": 9.8656, + "step": 295 + }, + { + "epoch": 1.3881905875475007, + "grad_norm": 0.20366129279136658, + "learning_rate": 5.367762128325509e-06, + "loss": 10.2061, + "step": 296 + }, + { + "epoch": 1.3928675825781935, + "grad_norm": 0.18720601499080658, + "learning_rate": 5.352112676056338e-06, + "loss": 8.4737, + "step": 297 + }, + { + "epoch": 1.3975445776088864, + "grad_norm": 0.1396239697933197, + "learning_rate": 5.336463223787169e-06, + "loss": 9.3009, + "step": 298 + }, + { + "epoch": 1.402221572639579, + "grad_norm": 0.19741852581501007, + "learning_rate": 5.320813771517997e-06, + "loss": 9.7318, + "step": 299 + }, + { + "epoch": 1.4068985676702719, + "grad_norm": 0.1550920307636261, + "learning_rate": 5.305164319248826e-06, + "loss": 9.0948, + "step": 300 + }, + { + "epoch": 1.4115755627009645, + "grad_norm": 0.20845593512058258, + "learning_rate": 5.289514866979656e-06, + "loss": 8.555, + "step": 301 + }, + { + "epoch": 1.4162525577316574, + "grad_norm": 0.15616929531097412, + "learning_rate": 5.273865414710485e-06, + "loss": 9.293, + "step": 302 + }, + { + "epoch": 1.4209295527623502, + "grad_norm": 0.18581336736679077, + "learning_rate": 5.258215962441315e-06, + "loss": 8.6798, + "step": 303 + }, + { + "epoch": 1.425606547793043, + "grad_norm": 0.14762163162231445, + "learning_rate": 5.2425665101721445e-06, + "loss": 7.7574, + "step": 304 + }, + { + "epoch": 1.4302835428237357, + "grad_norm": 0.11617639660835266, + "learning_rate": 5.226917057902974e-06, + "loss": 9.9937, + "step": 305 + }, + { + "epoch": 1.4349605378544286, + "grad_norm": 0.12888303399085999, + "learning_rate": 5.211267605633803e-06, + "loss": 9.5393, + "step": 306 + }, + { + "epoch": 1.4396375328851212, + "grad_norm": 0.14450183510780334, + "learning_rate": 5.195618153364632e-06, + "loss": 10.9441, + "step": 307 + }, + { + "epoch": 1.444314527915814, + "grad_norm": 0.20856888592243195, + "learning_rate": 5.1799687010954625e-06, + "loss": 9.6833, + "step": 308 + }, + { + "epoch": 1.448991522946507, + "grad_norm": 0.23422713577747345, + "learning_rate": 5.164319248826292e-06, + "loss": 9.2532, + "step": 309 + }, + { + "epoch": 1.4536685179771998, + "grad_norm": 0.19145800173282623, + "learning_rate": 5.148669796557121e-06, + "loss": 9.7285, + "step": 310 + }, + { + "epoch": 1.4583455130078924, + "grad_norm": 0.1990247666835785, + "learning_rate": 5.1330203442879505e-06, + "loss": 7.6512, + "step": 311 + }, + { + "epoch": 1.4630225080385852, + "grad_norm": 0.17829596996307373, + "learning_rate": 5.117370892018779e-06, + "loss": 9.5529, + "step": 312 + }, + { + "epoch": 1.4676995030692779, + "grad_norm": 0.162981778383255, + "learning_rate": 5.10172143974961e-06, + "loss": 10.0274, + "step": 313 + }, + { + "epoch": 1.4723764980999707, + "grad_norm": 0.17965111136436462, + "learning_rate": 5.086071987480438e-06, + "loss": 9.2513, + "step": 314 + }, + { + "epoch": 1.4770534931306636, + "grad_norm": 0.28804492950439453, + "learning_rate": 5.070422535211268e-06, + "loss": 10.0194, + "step": 315 + }, + { + "epoch": 1.4817304881613564, + "grad_norm": 0.1571478545665741, + "learning_rate": 5.054773082942097e-06, + "loss": 10.0889, + "step": 316 + }, + { + "epoch": 1.486407483192049, + "grad_norm": 0.2101372927427292, + "learning_rate": 5.039123630672926e-06, + "loss": 8.6775, + "step": 317 + }, + { + "epoch": 1.491084478222742, + "grad_norm": 0.20323887467384338, + "learning_rate": 5.0234741784037565e-06, + "loss": 9.8082, + "step": 318 + }, + { + "epoch": 1.4957614732534346, + "grad_norm": 0.16192995011806488, + "learning_rate": 5.007824726134586e-06, + "loss": 8.0025, + "step": 319 + }, + { + "epoch": 1.5004384682841274, + "grad_norm": 0.16440463066101074, + "learning_rate": 4.992175273865415e-06, + "loss": 9.579, + "step": 320 + }, + { + "epoch": 1.5051154633148203, + "grad_norm": 0.19055482745170593, + "learning_rate": 4.976525821596244e-06, + "loss": 8.7398, + "step": 321 + }, + { + "epoch": 1.5097924583455131, + "grad_norm": 0.17318573594093323, + "learning_rate": 4.960876369327074e-06, + "loss": 9.7488, + "step": 322 + }, + { + "epoch": 1.5144694533762058, + "grad_norm": 0.24867770075798035, + "learning_rate": 4.945226917057903e-06, + "loss": 10.5706, + "step": 323 + }, + { + "epoch": 1.5191464484068986, + "grad_norm": 0.1796032041311264, + "learning_rate": 4.929577464788733e-06, + "loss": 9.4351, + "step": 324 + }, + { + "epoch": 1.5238234434375912, + "grad_norm": 0.21675661206245422, + "learning_rate": 4.9139280125195624e-06, + "loss": 10.6771, + "step": 325 + }, + { + "epoch": 1.528500438468284, + "grad_norm": 0.17892418801784515, + "learning_rate": 4.898278560250392e-06, + "loss": 7.6976, + "step": 326 + }, + { + "epoch": 1.533177433498977, + "grad_norm": 0.16854748129844666, + "learning_rate": 4.882629107981221e-06, + "loss": 9.0202, + "step": 327 + }, + { + "epoch": 1.5378544285296698, + "grad_norm": 0.20898739993572235, + "learning_rate": 4.86697965571205e-06, + "loss": 9.3772, + "step": 328 + }, + { + "epoch": 1.5425314235603624, + "grad_norm": 0.2980878949165344, + "learning_rate": 4.85133020344288e-06, + "loss": 10.5012, + "step": 329 + }, + { + "epoch": 1.5472084185910553, + "grad_norm": 0.12076615542173386, + "learning_rate": 4.835680751173709e-06, + "loss": 10.1389, + "step": 330 + }, + { + "epoch": 1.551885413621748, + "grad_norm": 0.1814320981502533, + "learning_rate": 4.820031298904539e-06, + "loss": 8.4015, + "step": 331 + }, + { + "epoch": 1.5565624086524408, + "grad_norm": 0.16422027349472046, + "learning_rate": 4.8043818466353684e-06, + "loss": 8.4772, + "step": 332 + }, + { + "epoch": 1.5612394036831336, + "grad_norm": 0.12222316116094589, + "learning_rate": 4.788732394366197e-06, + "loss": 8.7358, + "step": 333 + }, + { + "epoch": 1.5659163987138265, + "grad_norm": 0.20471377670764923, + "learning_rate": 4.773082942097027e-06, + "loss": 8.9805, + "step": 334 + }, + { + "epoch": 1.5705933937445191, + "grad_norm": 0.1602873057126999, + "learning_rate": 4.757433489827856e-06, + "loss": 7.7731, + "step": 335 + }, + { + "epoch": 1.575270388775212, + "grad_norm": 0.1620335578918457, + "learning_rate": 4.741784037558686e-06, + "loss": 8.5971, + "step": 336 + }, + { + "epoch": 1.5799473838059046, + "grad_norm": 0.14822766184806824, + "learning_rate": 4.726134585289515e-06, + "loss": 8.1521, + "step": 337 + }, + { + "epoch": 1.5846243788365975, + "grad_norm": 0.16832107305526733, + "learning_rate": 4.710485133020345e-06, + "loss": 9.0838, + "step": 338 + }, + { + "epoch": 1.5893013738672903, + "grad_norm": 0.1385219246149063, + "learning_rate": 4.694835680751174e-06, + "loss": 7.4367, + "step": 339 + }, + { + "epoch": 1.5939783688979832, + "grad_norm": 0.13664643466472626, + "learning_rate": 4.679186228482004e-06, + "loss": 8.5027, + "step": 340 + }, + { + "epoch": 1.5986553639286758, + "grad_norm": 0.18891537189483643, + "learning_rate": 4.663536776212833e-06, + "loss": 8.6301, + "step": 341 + }, + { + "epoch": 1.6033323589593687, + "grad_norm": 0.19962970912456512, + "learning_rate": 4.647887323943662e-06, + "loss": 10.6293, + "step": 342 + }, + { + "epoch": 1.6080093539900613, + "grad_norm": 0.18747878074645996, + "learning_rate": 4.632237871674492e-06, + "loss": 10.0322, + "step": 343 + }, + { + "epoch": 1.6126863490207541, + "grad_norm": 0.3010605573654175, + "learning_rate": 4.616588419405321e-06, + "loss": 9.1209, + "step": 344 + }, + { + "epoch": 1.617363344051447, + "grad_norm": 0.11245454847812653, + "learning_rate": 4.60093896713615e-06, + "loss": 8.0594, + "step": 345 + }, + { + "epoch": 1.6220403390821398, + "grad_norm": 0.20886649191379547, + "learning_rate": 4.58528951486698e-06, + "loss": 9.1715, + "step": 346 + }, + { + "epoch": 1.6267173341128325, + "grad_norm": 0.14630508422851562, + "learning_rate": 4.56964006259781e-06, + "loss": 8.7735, + "step": 347 + }, + { + "epoch": 1.6313943291435253, + "grad_norm": 0.21093368530273438, + "learning_rate": 4.553990610328639e-06, + "loss": 8.2183, + "step": 348 + }, + { + "epoch": 1.636071324174218, + "grad_norm": 0.22136329114437103, + "learning_rate": 4.538341158059468e-06, + "loss": 9.067, + "step": 349 + }, + { + "epoch": 1.6407483192049108, + "grad_norm": 0.15906454622745514, + "learning_rate": 4.522691705790298e-06, + "loss": 9.3209, + "step": 350 + }, + { + "epoch": 1.6454253142356037, + "grad_norm": 0.2312268763780594, + "learning_rate": 4.507042253521127e-06, + "loss": 9.2316, + "step": 351 + }, + { + "epoch": 1.6501023092662965, + "grad_norm": 0.24528440833091736, + "learning_rate": 4.491392801251956e-06, + "loss": 9.0482, + "step": 352 + }, + { + "epoch": 1.6547793042969892, + "grad_norm": 0.19777342677116394, + "learning_rate": 4.475743348982786e-06, + "loss": 10.1556, + "step": 353 + }, + { + "epoch": 1.659456299327682, + "grad_norm": 0.2033587247133255, + "learning_rate": 4.460093896713616e-06, + "loss": 8.9973, + "step": 354 + }, + { + "epoch": 1.6641332943583746, + "grad_norm": 0.16927585005760193, + "learning_rate": 4.444444444444444e-06, + "loss": 9.5144, + "step": 355 + }, + { + "epoch": 1.6688102893890675, + "grad_norm": 0.16959340870380402, + "learning_rate": 4.428794992175274e-06, + "loss": 9.5447, + "step": 356 + }, + { + "epoch": 1.6734872844197604, + "grad_norm": 0.18593505024909973, + "learning_rate": 4.413145539906104e-06, + "loss": 9.6471, + "step": 357 + }, + { + "epoch": 1.6781642794504532, + "grad_norm": 0.16945506632328033, + "learning_rate": 4.397496087636933e-06, + "loss": 8.5418, + "step": 358 + }, + { + "epoch": 1.6828412744811458, + "grad_norm": 0.16277293860912323, + "learning_rate": 4.381846635367762e-06, + "loss": 9.2884, + "step": 359 + }, + { + "epoch": 1.6875182695118387, + "grad_norm": 0.2155790776014328, + "learning_rate": 4.3661971830985915e-06, + "loss": 9.4547, + "step": 360 + }, + { + "epoch": 1.6921952645425313, + "grad_norm": 0.19257700443267822, + "learning_rate": 4.350547730829422e-06, + "loss": 8.7859, + "step": 361 + }, + { + "epoch": 1.6968722595732242, + "grad_norm": 0.21113352477550507, + "learning_rate": 4.334898278560251e-06, + "loss": 9.3654, + "step": 362 + }, + { + "epoch": 1.701549254603917, + "grad_norm": 0.17781415581703186, + "learning_rate": 4.31924882629108e-06, + "loss": 9.5482, + "step": 363 + }, + { + "epoch": 1.70622624963461, + "grad_norm": 0.14610658586025238, + "learning_rate": 4.30359937402191e-06, + "loss": 9.2182, + "step": 364 + }, + { + "epoch": 1.7109032446653025, + "grad_norm": 0.19297371804714203, + "learning_rate": 4.287949921752739e-06, + "loss": 8.5858, + "step": 365 + }, + { + "epoch": 1.7155802396959952, + "grad_norm": 0.16764657199382782, + "learning_rate": 4.272300469483568e-06, + "loss": 8.6679, + "step": 366 + }, + { + "epoch": 1.720257234726688, + "grad_norm": 0.1740255355834961, + "learning_rate": 4.2566510172143975e-06, + "loss": 8.3984, + "step": 367 + }, + { + "epoch": 1.7249342297573809, + "grad_norm": 0.2171589732170105, + "learning_rate": 4.241001564945228e-06, + "loss": 8.6767, + "step": 368 + }, + { + "epoch": 1.7296112247880737, + "grad_norm": 0.15334008634090424, + "learning_rate": 4.225352112676057e-06, + "loss": 9.0357, + "step": 369 + }, + { + "epoch": 1.7342882198187666, + "grad_norm": 0.1901715248823166, + "learning_rate": 4.209702660406886e-06, + "loss": 9.1397, + "step": 370 + }, + { + "epoch": 1.7389652148494592, + "grad_norm": 0.14479465782642365, + "learning_rate": 4.194053208137716e-06, + "loss": 8.0689, + "step": 371 + }, + { + "epoch": 1.7436422098801518, + "grad_norm": 0.13776177167892456, + "learning_rate": 4.178403755868545e-06, + "loss": 8.2216, + "step": 372 + }, + { + "epoch": 1.7483192049108447, + "grad_norm": 0.13980716466903687, + "learning_rate": 4.162754303599374e-06, + "loss": 10.2694, + "step": 373 + }, + { + "epoch": 1.7529961999415375, + "grad_norm": 0.15243536233901978, + "learning_rate": 4.1471048513302035e-06, + "loss": 8.4832, + "step": 374 + }, + { + "epoch": 1.7576731949722304, + "grad_norm": 0.1408737152814865, + "learning_rate": 4.131455399061034e-06, + "loss": 10.5995, + "step": 375 + }, + { + "epoch": 1.7623501900029233, + "grad_norm": 0.16743288934230804, + "learning_rate": 4.115805946791863e-06, + "loss": 9.0306, + "step": 376 + }, + { + "epoch": 1.7670271850336159, + "grad_norm": 0.13096289336681366, + "learning_rate": 4.100156494522691e-06, + "loss": 8.799, + "step": 377 + }, + { + "epoch": 1.7717041800643085, + "grad_norm": 0.18536189198493958, + "learning_rate": 4.0845070422535216e-06, + "loss": 8.6714, + "step": 378 + }, + { + "epoch": 1.7763811750950014, + "grad_norm": 0.21224500238895416, + "learning_rate": 4.068857589984351e-06, + "loss": 8.8822, + "step": 379 + }, + { + "epoch": 1.7810581701256942, + "grad_norm": 0.15303047001361847, + "learning_rate": 4.05320813771518e-06, + "loss": 8.8666, + "step": 380 + }, + { + "epoch": 1.785735165156387, + "grad_norm": 0.14419591426849365, + "learning_rate": 4.0375586854460095e-06, + "loss": 8.916, + "step": 381 + }, + { + "epoch": 1.79041216018708, + "grad_norm": 0.1363951712846756, + "learning_rate": 4.021909233176839e-06, + "loss": 8.3857, + "step": 382 + }, + { + "epoch": 1.7950891552177726, + "grad_norm": 0.20621058344841003, + "learning_rate": 4.006259780907669e-06, + "loss": 10.1237, + "step": 383 + }, + { + "epoch": 1.7997661502484652, + "grad_norm": 0.21105414628982544, + "learning_rate": 3.990610328638498e-06, + "loss": 9.5554, + "step": 384 + }, + { + "epoch": 1.804443145279158, + "grad_norm": 0.21915097534656525, + "learning_rate": 3.9749608763693276e-06, + "loss": 7.717, + "step": 385 + }, + { + "epoch": 1.809120140309851, + "grad_norm": 0.17555522918701172, + "learning_rate": 3.959311424100157e-06, + "loss": 9.1899, + "step": 386 + }, + { + "epoch": 1.8137971353405438, + "grad_norm": 0.1890765279531479, + "learning_rate": 3.943661971830986e-06, + "loss": 8.0672, + "step": 387 + }, + { + "epoch": 1.8184741303712366, + "grad_norm": 0.16451717913150787, + "learning_rate": 3.9280125195618155e-06, + "loss": 8.8205, + "step": 388 + }, + { + "epoch": 1.8231511254019293, + "grad_norm": 0.16023708879947662, + "learning_rate": 3.912363067292645e-06, + "loss": 9.319, + "step": 389 + }, + { + "epoch": 1.8278281204326219, + "grad_norm": 0.15548115968704224, + "learning_rate": 3.896713615023475e-06, + "loss": 8.2246, + "step": 390 + }, + { + "epoch": 1.8325051154633147, + "grad_norm": 0.21226494014263153, + "learning_rate": 3.881064162754304e-06, + "loss": 9.135, + "step": 391 + }, + { + "epoch": 1.8371821104940076, + "grad_norm": 0.14461496472358704, + "learning_rate": 3.865414710485133e-06, + "loss": 8.962, + "step": 392 + }, + { + "epoch": 1.8418591055247004, + "grad_norm": 0.20766492187976837, + "learning_rate": 3.849765258215963e-06, + "loss": 8.8991, + "step": 393 + }, + { + "epoch": 1.8465361005553933, + "grad_norm": 0.20327630639076233, + "learning_rate": 3.834115805946792e-06, + "loss": 9.1291, + "step": 394 + }, + { + "epoch": 1.851213095586086, + "grad_norm": 0.23052388429641724, + "learning_rate": 3.8184663536776215e-06, + "loss": 8.3602, + "step": 395 + }, + { + "epoch": 1.8558900906167786, + "grad_norm": 0.16140541434288025, + "learning_rate": 3.8028169014084508e-06, + "loss": 9.3176, + "step": 396 + }, + { + "epoch": 1.8605670856474714, + "grad_norm": 0.17049185931682587, + "learning_rate": 3.7871674491392805e-06, + "loss": 8.6602, + "step": 397 + }, + { + "epoch": 1.8652440806781643, + "grad_norm": 0.11496849358081818, + "learning_rate": 3.77151799687011e-06, + "loss": 10.3293, + "step": 398 + }, + { + "epoch": 1.8699210757088571, + "grad_norm": 0.1907191127538681, + "learning_rate": 3.755868544600939e-06, + "loss": 8.4035, + "step": 399 + }, + { + "epoch": 1.87459807073955, + "grad_norm": 0.16409359872341156, + "learning_rate": 3.740219092331769e-06, + "loss": 8.9062, + "step": 400 + }, + { + "epoch": 1.8792750657702426, + "grad_norm": 0.15642918646335602, + "learning_rate": 3.724569640062598e-06, + "loss": 8.8751, + "step": 401 + }, + { + "epoch": 1.8839520608009352, + "grad_norm": 0.1641726940870285, + "learning_rate": 3.708920187793428e-06, + "loss": 8.3851, + "step": 402 + }, + { + "epoch": 1.888629055831628, + "grad_norm": 0.15342937409877777, + "learning_rate": 3.693270735524257e-06, + "loss": 9.3965, + "step": 403 + }, + { + "epoch": 1.893306050862321, + "grad_norm": 0.15916384756565094, + "learning_rate": 3.677621283255086e-06, + "loss": 8.7446, + "step": 404 + }, + { + "epoch": 1.8979830458930138, + "grad_norm": 0.21401815116405487, + "learning_rate": 3.6619718309859158e-06, + "loss": 8.8994, + "step": 405 + }, + { + "epoch": 1.9026600409237067, + "grad_norm": 0.19148550927639008, + "learning_rate": 3.646322378716745e-06, + "loss": 8.5996, + "step": 406 + }, + { + "epoch": 1.9073370359543993, + "grad_norm": 0.1755845844745636, + "learning_rate": 3.630672926447575e-06, + "loss": 8.7611, + "step": 407 + }, + { + "epoch": 1.912014030985092, + "grad_norm": 0.17193089425563812, + "learning_rate": 3.615023474178404e-06, + "loss": 8.9488, + "step": 408 + }, + { + "epoch": 1.9166910260157848, + "grad_norm": 0.17173364758491516, + "learning_rate": 3.5993740219092334e-06, + "loss": 8.0517, + "step": 409 + }, + { + "epoch": 1.9213680210464776, + "grad_norm": 0.22657723724842072, + "learning_rate": 3.583724569640063e-06, + "loss": 8.7361, + "step": 410 + }, + { + "epoch": 1.9260450160771705, + "grad_norm": 0.21941417455673218, + "learning_rate": 3.568075117370892e-06, + "loss": 9.2343, + "step": 411 + }, + { + "epoch": 1.9307220111078633, + "grad_norm": 0.18514755368232727, + "learning_rate": 3.5524256651017218e-06, + "loss": 8.2767, + "step": 412 + }, + { + "epoch": 1.935399006138556, + "grad_norm": 0.13066066801548004, + "learning_rate": 3.536776212832551e-06, + "loss": 8.7371, + "step": 413 + }, + { + "epoch": 1.9400760011692486, + "grad_norm": 0.16903606057167053, + "learning_rate": 3.5211267605633804e-06, + "loss": 9.3067, + "step": 414 + }, + { + "epoch": 1.9447529961999415, + "grad_norm": 0.14286428689956665, + "learning_rate": 3.50547730829421e-06, + "loss": 7.8586, + "step": 415 + }, + { + "epoch": 1.9494299912306343, + "grad_norm": 0.1969095915555954, + "learning_rate": 3.4898278560250394e-06, + "loss": 9.6053, + "step": 416 + }, + { + "epoch": 1.9541069862613272, + "grad_norm": 0.1750202775001526, + "learning_rate": 3.474178403755869e-06, + "loss": 9.0714, + "step": 417 + }, + { + "epoch": 1.95878398129202, + "grad_norm": 0.21293002367019653, + "learning_rate": 3.458528951486698e-06, + "loss": 9.8726, + "step": 418 + }, + { + "epoch": 1.9634609763227127, + "grad_norm": 0.1672164648771286, + "learning_rate": 3.4428794992175273e-06, + "loss": 9.5275, + "step": 419 + }, + { + "epoch": 1.9681379713534053, + "grad_norm": 0.17561869323253632, + "learning_rate": 3.427230046948357e-06, + "loss": 7.2097, + "step": 420 + }, + { + "epoch": 1.9728149663840981, + "grad_norm": 0.16326965391635895, + "learning_rate": 3.4115805946791864e-06, + "loss": 9.3302, + "step": 421 + }, + { + "epoch": 1.977491961414791, + "grad_norm": 0.15163388848304749, + "learning_rate": 3.395931142410016e-06, + "loss": 9.1933, + "step": 422 + }, + { + "epoch": 1.9821689564454839, + "grad_norm": 0.16277414560317993, + "learning_rate": 3.3802816901408454e-06, + "loss": 8.3196, + "step": 423 + }, + { + "epoch": 1.9868459514761767, + "grad_norm": 0.18385657668113708, + "learning_rate": 3.364632237871675e-06, + "loss": 8.1472, + "step": 424 + }, + { + "epoch": 1.9915229465068693, + "grad_norm": 0.1768423169851303, + "learning_rate": 3.348982785602504e-06, + "loss": 8.3639, + "step": 425 + }, + { + "epoch": 1.996199941537562, + "grad_norm": 0.2325451821088791, + "learning_rate": 3.3333333333333333e-06, + "loss": 9.265, + "step": 426 + }, + { + "epoch": 2.004676995030693, + "grad_norm": 0.22825832664966583, + "learning_rate": 3.317683881064163e-06, + "loss": 10.0732, + "step": 427 + }, + { + "epoch": 2.0093539900613857, + "grad_norm": 0.16034899652004242, + "learning_rate": 3.3020344287949924e-06, + "loss": 7.7232, + "step": 428 + }, + { + "epoch": 2.014030985092078, + "grad_norm": 0.1737372726202011, + "learning_rate": 3.286384976525822e-06, + "loss": 8.0928, + "step": 429 + }, + { + "epoch": 2.018707980122771, + "grad_norm": 0.20644846558570862, + "learning_rate": 3.2707355242566514e-06, + "loss": 8.6956, + "step": 430 + }, + { + "epoch": 2.023384975153464, + "grad_norm": 0.3140431344509125, + "learning_rate": 3.2550860719874807e-06, + "loss": 9.0442, + "step": 431 + }, + { + "epoch": 2.0280619701841567, + "grad_norm": 0.2457619458436966, + "learning_rate": 3.2394366197183104e-06, + "loss": 8.6256, + "step": 432 + }, + { + "epoch": 2.0327389652148495, + "grad_norm": 0.2014688104391098, + "learning_rate": 3.2237871674491393e-06, + "loss": 9.7276, + "step": 433 + }, + { + "epoch": 2.0374159602455424, + "grad_norm": 0.1970800757408142, + "learning_rate": 3.208137715179969e-06, + "loss": 8.19, + "step": 434 + }, + { + "epoch": 2.0420929552762352, + "grad_norm": 0.12662629783153534, + "learning_rate": 3.1924882629107983e-06, + "loss": 9.235, + "step": 435 + }, + { + "epoch": 2.0467699503069277, + "grad_norm": 0.15353932976722717, + "learning_rate": 3.1768388106416277e-06, + "loss": 8.8255, + "step": 436 + }, + { + "epoch": 2.0514469453376205, + "grad_norm": 0.2180812507867813, + "learning_rate": 3.1611893583724574e-06, + "loss": 9.1142, + "step": 437 + }, + { + "epoch": 2.0561239403683134, + "grad_norm": 0.18303510546684265, + "learning_rate": 3.1455399061032867e-06, + "loss": 10.1061, + "step": 438 + }, + { + "epoch": 2.060800935399006, + "grad_norm": 0.15254124999046326, + "learning_rate": 3.1298904538341164e-06, + "loss": 8.5431, + "step": 439 + }, + { + "epoch": 2.065477930429699, + "grad_norm": 0.16063688695430756, + "learning_rate": 3.1142410015649453e-06, + "loss": 8.4382, + "step": 440 + }, + { + "epoch": 2.0701549254603915, + "grad_norm": 0.20583708584308624, + "learning_rate": 3.0985915492957746e-06, + "loss": 8.1778, + "step": 441 + }, + { + "epoch": 2.0748319204910843, + "grad_norm": 0.11699045449495316, + "learning_rate": 3.0829420970266043e-06, + "loss": 7.8459, + "step": 442 + }, + { + "epoch": 2.079508915521777, + "grad_norm": 0.1605014204978943, + "learning_rate": 3.0672926447574336e-06, + "loss": 8.4224, + "step": 443 + }, + { + "epoch": 2.08418591055247, + "grad_norm": 0.14405608177185059, + "learning_rate": 3.0516431924882634e-06, + "loss": 8.3442, + "step": 444 + }, + { + "epoch": 2.088862905583163, + "grad_norm": 0.17145852744579315, + "learning_rate": 3.0359937402190927e-06, + "loss": 8.7685, + "step": 445 + }, + { + "epoch": 2.0935399006138558, + "grad_norm": 0.14711640775203705, + "learning_rate": 3.0203442879499216e-06, + "loss": 7.3568, + "step": 446 + }, + { + "epoch": 2.098216895644548, + "grad_norm": 0.13734185695648193, + "learning_rate": 3.0046948356807513e-06, + "loss": 8.4425, + "step": 447 + }, + { + "epoch": 2.102893890675241, + "grad_norm": 0.1571117639541626, + "learning_rate": 2.9890453834115806e-06, + "loss": 7.6952, + "step": 448 + }, + { + "epoch": 2.107570885705934, + "grad_norm": 0.15319029986858368, + "learning_rate": 2.9733959311424103e-06, + "loss": 7.9937, + "step": 449 + }, + { + "epoch": 2.1122478807366267, + "grad_norm": 0.18363691866397858, + "learning_rate": 2.9577464788732396e-06, + "loss": 8.4406, + "step": 450 + }, + { + "epoch": 2.1169248757673196, + "grad_norm": 0.1433074176311493, + "learning_rate": 2.9420970266040694e-06, + "loss": 9.7219, + "step": 451 + }, + { + "epoch": 2.1216018707980124, + "grad_norm": 0.14841365814208984, + "learning_rate": 2.9264475743348987e-06, + "loss": 8.2459, + "step": 452 + }, + { + "epoch": 2.126278865828705, + "grad_norm": 0.18753403425216675, + "learning_rate": 2.910798122065728e-06, + "loss": 8.7057, + "step": 453 + }, + { + "epoch": 2.1309558608593977, + "grad_norm": 0.1748085618019104, + "learning_rate": 2.8951486697965577e-06, + "loss": 8.5651, + "step": 454 + }, + { + "epoch": 2.1356328558900906, + "grad_norm": 0.17874014377593994, + "learning_rate": 2.8794992175273866e-06, + "loss": 8.5838, + "step": 455 + }, + { + "epoch": 2.1403098509207834, + "grad_norm": 0.16495150327682495, + "learning_rate": 2.8638497652582163e-06, + "loss": 9.8249, + "step": 456 + }, + { + "epoch": 2.1449868459514763, + "grad_norm": 0.12347421795129776, + "learning_rate": 2.8482003129890456e-06, + "loss": 7.1875, + "step": 457 + }, + { + "epoch": 2.149663840982169, + "grad_norm": 0.1617746353149414, + "learning_rate": 2.832550860719875e-06, + "loss": 7.7209, + "step": 458 + }, + { + "epoch": 2.154340836012862, + "grad_norm": 0.160769984126091, + "learning_rate": 2.8169014084507046e-06, + "loss": 7.7851, + "step": 459 + }, + { + "epoch": 2.1590178310435544, + "grad_norm": 0.14725424349308014, + "learning_rate": 2.801251956181534e-06, + "loss": 7.8194, + "step": 460 + }, + { + "epoch": 2.1636948260742472, + "grad_norm": 0.11912764608860016, + "learning_rate": 2.7856025039123637e-06, + "loss": 7.7984, + "step": 461 + }, + { + "epoch": 2.16837182110494, + "grad_norm": 0.17748208343982697, + "learning_rate": 2.7699530516431926e-06, + "loss": 8.0672, + "step": 462 + }, + { + "epoch": 2.173048816135633, + "grad_norm": 0.1708259880542755, + "learning_rate": 2.754303599374022e-06, + "loss": 9.2099, + "step": 463 + }, + { + "epoch": 2.177725811166326, + "grad_norm": 0.15187622606754303, + "learning_rate": 2.7386541471048516e-06, + "loss": 8.3165, + "step": 464 + }, + { + "epoch": 2.182402806197018, + "grad_norm": 0.18263490498065948, + "learning_rate": 2.723004694835681e-06, + "loss": 9.9331, + "step": 465 + }, + { + "epoch": 2.187079801227711, + "grad_norm": 0.12427602708339691, + "learning_rate": 2.7073552425665106e-06, + "loss": 8.5229, + "step": 466 + }, + { + "epoch": 2.191756796258404, + "grad_norm": 0.13961510360240936, + "learning_rate": 2.69170579029734e-06, + "loss": 8.3661, + "step": 467 + }, + { + "epoch": 2.1964337912890968, + "grad_norm": 0.14999401569366455, + "learning_rate": 2.676056338028169e-06, + "loss": 7.2095, + "step": 468 + }, + { + "epoch": 2.2011107863197896, + "grad_norm": 0.14472222328186035, + "learning_rate": 2.6604068857589986e-06, + "loss": 8.6861, + "step": 469 + }, + { + "epoch": 2.2057877813504825, + "grad_norm": 0.14089444279670715, + "learning_rate": 2.644757433489828e-06, + "loss": 7.8008, + "step": 470 + }, + { + "epoch": 2.210464776381175, + "grad_norm": 0.13669133186340332, + "learning_rate": 2.6291079812206576e-06, + "loss": 8.5063, + "step": 471 + }, + { + "epoch": 2.2151417714118677, + "grad_norm": 0.1669352799654007, + "learning_rate": 2.613458528951487e-06, + "loss": 8.7907, + "step": 472 + }, + { + "epoch": 2.2198187664425606, + "grad_norm": 0.15821270644664764, + "learning_rate": 2.597809076682316e-06, + "loss": 8.0308, + "step": 473 + }, + { + "epoch": 2.2244957614732535, + "grad_norm": 0.21483926475048065, + "learning_rate": 2.582159624413146e-06, + "loss": 8.502, + "step": 474 + }, + { + "epoch": 2.2291727565039463, + "grad_norm": 0.18459928035736084, + "learning_rate": 2.5665101721439752e-06, + "loss": 9.5206, + "step": 475 + }, + { + "epoch": 2.233849751534639, + "grad_norm": 0.1487099826335907, + "learning_rate": 2.550860719874805e-06, + "loss": 6.9168, + "step": 476 + }, + { + "epoch": 2.2385267465653316, + "grad_norm": 0.2513448894023895, + "learning_rate": 2.535211267605634e-06, + "loss": 9.3783, + "step": 477 + }, + { + "epoch": 2.2432037415960244, + "grad_norm": 0.1873185932636261, + "learning_rate": 2.519561815336463e-06, + "loss": 8.2886, + "step": 478 + }, + { + "epoch": 2.2478807366267173, + "grad_norm": 0.19832056760787964, + "learning_rate": 2.503912363067293e-06, + "loss": 8.1959, + "step": 479 + }, + { + "epoch": 2.25255773165741, + "grad_norm": 0.20701546967029572, + "learning_rate": 2.488262910798122e-06, + "loss": 8.1702, + "step": 480 + }, + { + "epoch": 2.257234726688103, + "grad_norm": 0.12690390646457672, + "learning_rate": 2.4726134585289515e-06, + "loss": 8.7747, + "step": 481 + }, + { + "epoch": 2.261911721718796, + "grad_norm": 0.1636572629213333, + "learning_rate": 2.4569640062597812e-06, + "loss": 7.8555, + "step": 482 + }, + { + "epoch": 2.2665887167494887, + "grad_norm": 0.12632915377616882, + "learning_rate": 2.4413145539906105e-06, + "loss": 7.9758, + "step": 483 + }, + { + "epoch": 2.271265711780181, + "grad_norm": 0.16761943697929382, + "learning_rate": 2.42566510172144e-06, + "loss": 8.0032, + "step": 484 + }, + { + "epoch": 2.275942706810874, + "grad_norm": 0.15796944499015808, + "learning_rate": 2.4100156494522696e-06, + "loss": 8.8154, + "step": 485 + }, + { + "epoch": 2.280619701841567, + "grad_norm": 0.16528886556625366, + "learning_rate": 2.3943661971830984e-06, + "loss": 7.3999, + "step": 486 + }, + { + "epoch": 2.2852966968722597, + "grad_norm": 0.14766015112400055, + "learning_rate": 2.378716744913928e-06, + "loss": 7.8343, + "step": 487 + }, + { + "epoch": 2.2899736919029525, + "grad_norm": 0.12624794244766235, + "learning_rate": 2.3630672926447575e-06, + "loss": 8.0017, + "step": 488 + }, + { + "epoch": 2.294650686933645, + "grad_norm": 0.16594719886779785, + "learning_rate": 2.347417840375587e-06, + "loss": 7.7649, + "step": 489 + }, + { + "epoch": 2.299327681964338, + "grad_norm": 0.1574728637933731, + "learning_rate": 2.3317683881064165e-06, + "loss": 9.2884, + "step": 490 + }, + { + "epoch": 2.3040046769950306, + "grad_norm": 0.1298084557056427, + "learning_rate": 2.316118935837246e-06, + "loss": 8.4339, + "step": 491 + }, + { + "epoch": 2.3086816720257235, + "grad_norm": 0.15643304586410522, + "learning_rate": 2.300469483568075e-06, + "loss": 8.0997, + "step": 492 + }, + { + "epoch": 2.3133586670564164, + "grad_norm": 0.13263966143131256, + "learning_rate": 2.284820031298905e-06, + "loss": 8.109, + "step": 493 + }, + { + "epoch": 2.318035662087109, + "grad_norm": 0.21980319917201996, + "learning_rate": 2.269170579029734e-06, + "loss": 8.2741, + "step": 494 + }, + { + "epoch": 2.322712657117802, + "grad_norm": 0.13680629432201385, + "learning_rate": 2.2535211267605635e-06, + "loss": 8.5315, + "step": 495 + }, + { + "epoch": 2.3273896521484945, + "grad_norm": 0.1529272496700287, + "learning_rate": 2.237871674491393e-06, + "loss": 8.0531, + "step": 496 + }, + { + "epoch": 2.3320666471791873, + "grad_norm": 0.174594908952713, + "learning_rate": 2.222222222222222e-06, + "loss": 7.7507, + "step": 497 + }, + { + "epoch": 2.33674364220988, + "grad_norm": 0.17085200548171997, + "learning_rate": 2.206572769953052e-06, + "loss": 7.1328, + "step": 498 + }, + { + "epoch": 2.341420637240573, + "grad_norm": 0.14975635707378387, + "learning_rate": 2.190923317683881e-06, + "loss": 9.8064, + "step": 499 + }, + { + "epoch": 2.346097632271266, + "grad_norm": 0.15309952199459076, + "learning_rate": 2.175273865414711e-06, + "loss": 8.6898, + "step": 500 + }, + { + "epoch": 2.3507746273019583, + "grad_norm": 0.13084295392036438, + "learning_rate": 2.15962441314554e-06, + "loss": 8.1384, + "step": 501 + }, + { + "epoch": 2.355451622332651, + "grad_norm": 0.16496095061302185, + "learning_rate": 2.1439749608763695e-06, + "loss": 8.9057, + "step": 502 + }, + { + "epoch": 2.360128617363344, + "grad_norm": 0.157500758767128, + "learning_rate": 2.1283255086071988e-06, + "loss": 7.967, + "step": 503 + }, + { + "epoch": 2.364805612394037, + "grad_norm": 0.1988188475370407, + "learning_rate": 2.1126760563380285e-06, + "loss": 7.4129, + "step": 504 + }, + { + "epoch": 2.3694826074247297, + "grad_norm": 0.21104207634925842, + "learning_rate": 2.097026604068858e-06, + "loss": 7.5442, + "step": 505 + }, + { + "epoch": 2.3741596024554226, + "grad_norm": 0.20285457372665405, + "learning_rate": 2.081377151799687e-06, + "loss": 7.3523, + "step": 506 + }, + { + "epoch": 2.378836597486115, + "grad_norm": 0.24479469656944275, + "learning_rate": 2.065727699530517e-06, + "loss": 7.8577, + "step": 507 + }, + { + "epoch": 2.383513592516808, + "grad_norm": 0.150054469704628, + "learning_rate": 2.0500782472613457e-06, + "loss": 8.2585, + "step": 508 + }, + { + "epoch": 2.3881905875475007, + "grad_norm": 0.12602077424526215, + "learning_rate": 2.0344287949921754e-06, + "loss": 7.7554, + "step": 509 + }, + { + "epoch": 2.3928675825781935, + "grad_norm": 0.18626457452774048, + "learning_rate": 2.0187793427230047e-06, + "loss": 7.5464, + "step": 510 + }, + { + "epoch": 2.3975445776088864, + "grad_norm": 0.20931190252304077, + "learning_rate": 2.0031298904538345e-06, + "loss": 7.5159, + "step": 511 + }, + { + "epoch": 2.4022215726395793, + "grad_norm": 0.2555796802043915, + "learning_rate": 1.9874804381846638e-06, + "loss": 7.5645, + "step": 512 + }, + { + "epoch": 2.4068985676702717, + "grad_norm": 0.17398537695407867, + "learning_rate": 1.971830985915493e-06, + "loss": 8.211, + "step": 513 + }, + { + "epoch": 2.4115755627009645, + "grad_norm": 0.19993047416210175, + "learning_rate": 1.9561815336463224e-06, + "loss": 8.3602, + "step": 514 + }, + { + "epoch": 2.4162525577316574, + "grad_norm": 0.15980151295661926, + "learning_rate": 1.940532081377152e-06, + "loss": 7.6245, + "step": 515 + }, + { + "epoch": 2.4209295527623502, + "grad_norm": 0.16947968304157257, + "learning_rate": 1.9248826291079814e-06, + "loss": 8.2847, + "step": 516 + }, + { + "epoch": 2.425606547793043, + "grad_norm": 0.1670764982700348, + "learning_rate": 1.9092331768388107e-06, + "loss": 8.4169, + "step": 517 + }, + { + "epoch": 2.430283542823736, + "grad_norm": 0.17053499817848206, + "learning_rate": 1.8935837245696402e-06, + "loss": 8.8886, + "step": 518 + }, + { + "epoch": 2.4349605378544283, + "grad_norm": 0.16047680377960205, + "learning_rate": 1.8779342723004696e-06, + "loss": 7.9574, + "step": 519 + }, + { + "epoch": 2.439637532885121, + "grad_norm": 0.2619805932044983, + "learning_rate": 1.862284820031299e-06, + "loss": 8.203, + "step": 520 + }, + { + "epoch": 2.444314527915814, + "grad_norm": 0.2122809886932373, + "learning_rate": 1.8466353677621286e-06, + "loss": 7.8094, + "step": 521 + }, + { + "epoch": 2.448991522946507, + "grad_norm": 0.15507692098617554, + "learning_rate": 1.8309859154929579e-06, + "loss": 7.7085, + "step": 522 + }, + { + "epoch": 2.4536685179771998, + "grad_norm": 0.1406126171350479, + "learning_rate": 1.8153364632237874e-06, + "loss": 8.483, + "step": 523 + }, + { + "epoch": 2.4583455130078926, + "grad_norm": 0.19436419010162354, + "learning_rate": 1.7996870109546167e-06, + "loss": 8.271, + "step": 524 + }, + { + "epoch": 2.463022508038585, + "grad_norm": 0.17198602855205536, + "learning_rate": 1.784037558685446e-06, + "loss": 8.3665, + "step": 525 + }, + { + "epoch": 2.467699503069278, + "grad_norm": 0.28165027499198914, + "learning_rate": 1.7683881064162755e-06, + "loss": 7.8636, + "step": 526 + }, + { + "epoch": 2.4723764980999707, + "grad_norm": 0.2032092958688736, + "learning_rate": 1.752738654147105e-06, + "loss": 7.5732, + "step": 527 + }, + { + "epoch": 2.4770534931306636, + "grad_norm": 0.13977749645709991, + "learning_rate": 1.7370892018779346e-06, + "loss": 7.2479, + "step": 528 + }, + { + "epoch": 2.4817304881613564, + "grad_norm": 0.13071084022521973, + "learning_rate": 1.7214397496087637e-06, + "loss": 7.008, + "step": 529 + }, + { + "epoch": 2.4864074831920493, + "grad_norm": 0.15741536021232605, + "learning_rate": 1.7057902973395932e-06, + "loss": 8.0612, + "step": 530 + }, + { + "epoch": 2.4910844782227417, + "grad_norm": 0.16548508405685425, + "learning_rate": 1.6901408450704227e-06, + "loss": 8.0312, + "step": 531 + }, + { + "epoch": 2.4957614732534346, + "grad_norm": 0.16299135982990265, + "learning_rate": 1.674491392801252e-06, + "loss": 8.8502, + "step": 532 + }, + { + "epoch": 2.5004384682841274, + "grad_norm": 0.159685879945755, + "learning_rate": 1.6588419405320815e-06, + "loss": 9.5205, + "step": 533 + }, + { + "epoch": 2.5051154633148203, + "grad_norm": 0.1804819405078888, + "learning_rate": 1.643192488262911e-06, + "loss": 7.683, + "step": 534 + }, + { + "epoch": 2.509792458345513, + "grad_norm": 0.16809211671352386, + "learning_rate": 1.6275430359937403e-06, + "loss": 8.6418, + "step": 535 + }, + { + "epoch": 2.514469453376206, + "grad_norm": 0.17984607815742493, + "learning_rate": 1.6118935837245697e-06, + "loss": 7.68, + "step": 536 + }, + { + "epoch": 2.5191464484068984, + "grad_norm": 0.17649582028388977, + "learning_rate": 1.5962441314553992e-06, + "loss": 8.1753, + "step": 537 + }, + { + "epoch": 2.5238234434375912, + "grad_norm": 0.16467247903347015, + "learning_rate": 1.5805946791862287e-06, + "loss": 7.6117, + "step": 538 + }, + { + "epoch": 2.528500438468284, + "grad_norm": 0.17968781292438507, + "learning_rate": 1.5649452269170582e-06, + "loss": 8.549, + "step": 539 + }, + { + "epoch": 2.533177433498977, + "grad_norm": 0.15423156321048737, + "learning_rate": 1.5492957746478873e-06, + "loss": 8.7104, + "step": 540 + }, + { + "epoch": 2.53785442852967, + "grad_norm": 0.14077003300189972, + "learning_rate": 1.5336463223787168e-06, + "loss": 8.934, + "step": 541 + }, + { + "epoch": 2.542531423560362, + "grad_norm": 0.16637051105499268, + "learning_rate": 1.5179968701095463e-06, + "loss": 7.4252, + "step": 542 + }, + { + "epoch": 2.5472084185910555, + "grad_norm": 0.1724003106355667, + "learning_rate": 1.5023474178403756e-06, + "loss": 7.9955, + "step": 543 + }, + { + "epoch": 2.551885413621748, + "grad_norm": 0.19609539210796356, + "learning_rate": 1.4866979655712052e-06, + "loss": 8.3348, + "step": 544 + }, + { + "epoch": 2.5565624086524408, + "grad_norm": 0.12707825005054474, + "learning_rate": 1.4710485133020347e-06, + "loss": 8.0848, + "step": 545 + }, + { + "epoch": 2.5612394036831336, + "grad_norm": 0.2031966894865036, + "learning_rate": 1.455399061032864e-06, + "loss": 9.8729, + "step": 546 + }, + { + "epoch": 2.5659163987138265, + "grad_norm": 0.18515604734420776, + "learning_rate": 1.4397496087636933e-06, + "loss": 7.8293, + "step": 547 + }, + { + "epoch": 2.5705933937445193, + "grad_norm": 0.15621398389339447, + "learning_rate": 1.4241001564945228e-06, + "loss": 9.3478, + "step": 548 + }, + { + "epoch": 2.5752703887752117, + "grad_norm": 0.22210869193077087, + "learning_rate": 1.4084507042253523e-06, + "loss": 8.1303, + "step": 549 + }, + { + "epoch": 2.5799473838059046, + "grad_norm": 0.27393949031829834, + "learning_rate": 1.3928012519561818e-06, + "loss": 8.4729, + "step": 550 + }, + { + "epoch": 2.5846243788365975, + "grad_norm": 0.13042934238910675, + "learning_rate": 1.377151799687011e-06, + "loss": 8.2335, + "step": 551 + }, + { + "epoch": 2.5893013738672903, + "grad_norm": 0.207389697432518, + "learning_rate": 1.3615023474178405e-06, + "loss": 8.0168, + "step": 552 + }, + { + "epoch": 2.593978368897983, + "grad_norm": 0.14343053102493286, + "learning_rate": 1.34585289514867e-06, + "loss": 7.9552, + "step": 553 + }, + { + "epoch": 2.5986553639286756, + "grad_norm": 0.1722148060798645, + "learning_rate": 1.3302034428794993e-06, + "loss": 7.6877, + "step": 554 + }, + { + "epoch": 2.603332358959369, + "grad_norm": 0.18076814711093903, + "learning_rate": 1.3145539906103288e-06, + "loss": 8.0741, + "step": 555 + }, + { + "epoch": 2.6080093539900613, + "grad_norm": 0.14633478224277496, + "learning_rate": 1.298904538341158e-06, + "loss": 7.3683, + "step": 556 + }, + { + "epoch": 2.612686349020754, + "grad_norm": 0.14783795177936554, + "learning_rate": 1.2832550860719876e-06, + "loss": 7.8992, + "step": 557 + }, + { + "epoch": 2.617363344051447, + "grad_norm": 0.15360093116760254, + "learning_rate": 1.267605633802817e-06, + "loss": 8.8425, + "step": 558 + }, + { + "epoch": 2.62204033908214, + "grad_norm": 0.1691809445619583, + "learning_rate": 1.2519561815336464e-06, + "loss": 8.669, + "step": 559 + }, + { + "epoch": 2.6267173341128327, + "grad_norm": 0.16426807641983032, + "learning_rate": 1.2363067292644757e-06, + "loss": 9.1265, + "step": 560 + }, + { + "epoch": 2.631394329143525, + "grad_norm": 0.1331864446401596, + "learning_rate": 1.2206572769953053e-06, + "loss": 7.4892, + "step": 561 + }, + { + "epoch": 2.636071324174218, + "grad_norm": 0.1330748349428177, + "learning_rate": 1.2050078247261348e-06, + "loss": 8.6181, + "step": 562 + }, + { + "epoch": 2.640748319204911, + "grad_norm": 0.14942462742328644, + "learning_rate": 1.189358372456964e-06, + "loss": 7.8301, + "step": 563 + }, + { + "epoch": 2.6454253142356037, + "grad_norm": 0.16964685916900635, + "learning_rate": 1.1737089201877936e-06, + "loss": 7.1293, + "step": 564 + }, + { + "epoch": 2.6501023092662965, + "grad_norm": 0.1727379858493805, + "learning_rate": 1.158059467918623e-06, + "loss": 7.1773, + "step": 565 + }, + { + "epoch": 2.654779304296989, + "grad_norm": 0.14950168132781982, + "learning_rate": 1.1424100156494524e-06, + "loss": 7.5172, + "step": 566 + }, + { + "epoch": 2.6594562993276822, + "grad_norm": 0.16068300604820251, + "learning_rate": 1.1267605633802817e-06, + "loss": 8.7739, + "step": 567 + }, + { + "epoch": 2.6641332943583746, + "grad_norm": 0.18006567656993866, + "learning_rate": 1.111111111111111e-06, + "loss": 8.2067, + "step": 568 + }, + { + "epoch": 2.6688102893890675, + "grad_norm": 0.19861166179180145, + "learning_rate": 1.0954616588419406e-06, + "loss": 7.1208, + "step": 569 + }, + { + "epoch": 2.6734872844197604, + "grad_norm": 0.13374726474285126, + "learning_rate": 1.07981220657277e-06, + "loss": 7.8631, + "step": 570 + }, + { + "epoch": 2.678164279450453, + "grad_norm": 0.17814220488071442, + "learning_rate": 1.0641627543035994e-06, + "loss": 7.4765, + "step": 571 + }, + { + "epoch": 2.682841274481146, + "grad_norm": 0.22474409639835358, + "learning_rate": 1.048513302034429e-06, + "loss": 7.0754, + "step": 572 + }, + { + "epoch": 2.6875182695118385, + "grad_norm": 0.16655339300632477, + "learning_rate": 1.0328638497652584e-06, + "loss": 7.2505, + "step": 573 + }, + { + "epoch": 2.6921952645425313, + "grad_norm": 0.172933891415596, + "learning_rate": 1.0172143974960877e-06, + "loss": 8.0832, + "step": 574 + }, + { + "epoch": 2.696872259573224, + "grad_norm": 0.14097332954406738, + "learning_rate": 1.0015649452269172e-06, + "loss": 8.0197, + "step": 575 + }, + { + "epoch": 2.701549254603917, + "grad_norm": 0.1363203376531601, + "learning_rate": 9.859154929577465e-07, + "loss": 7.2466, + "step": 576 + }, + { + "epoch": 2.70622624963461, + "grad_norm": 0.17508287727832794, + "learning_rate": 9.70266040688576e-07, + "loss": 9.3567, + "step": 577 + }, + { + "epoch": 2.7109032446653023, + "grad_norm": 0.169004425406456, + "learning_rate": 9.546165884194054e-07, + "loss": 8.0132, + "step": 578 + }, + { + "epoch": 2.715580239695995, + "grad_norm": 0.14103683829307556, + "learning_rate": 9.389671361502348e-07, + "loss": 9.0192, + "step": 579 + }, + { + "epoch": 2.720257234726688, + "grad_norm": 0.197422057390213, + "learning_rate": 9.233176838810643e-07, + "loss": 7.2779, + "step": 580 + }, + { + "epoch": 2.724934229757381, + "grad_norm": 0.1950581669807434, + "learning_rate": 9.076682316118937e-07, + "loss": 7.6089, + "step": 581 + }, + { + "epoch": 2.7296112247880737, + "grad_norm": 0.23691439628601074, + "learning_rate": 8.92018779342723e-07, + "loss": 7.9658, + "step": 582 + }, + { + "epoch": 2.7342882198187666, + "grad_norm": 0.2558799684047699, + "learning_rate": 8.763693270735525e-07, + "loss": 8.4791, + "step": 583 + }, + { + "epoch": 2.7389652148494594, + "grad_norm": 0.17010214924812317, + "learning_rate": 8.607198748043818e-07, + "loss": 8.4854, + "step": 584 + }, + { + "epoch": 2.743642209880152, + "grad_norm": 0.13403132557868958, + "learning_rate": 8.450704225352114e-07, + "loss": 6.7996, + "step": 585 + }, + { + "epoch": 2.7483192049108447, + "grad_norm": 0.14201347529888153, + "learning_rate": 8.294209702660408e-07, + "loss": 8.3937, + "step": 586 + }, + { + "epoch": 2.7529961999415375, + "grad_norm": 0.28258565068244934, + "learning_rate": 8.137715179968702e-07, + "loss": 9.3709, + "step": 587 + }, + { + "epoch": 2.7576731949722304, + "grad_norm": 0.17337313294410706, + "learning_rate": 7.981220657276996e-07, + "loss": 7.6751, + "step": 588 + }, + { + "epoch": 2.7623501900029233, + "grad_norm": 0.1940070241689682, + "learning_rate": 7.824726134585291e-07, + "loss": 7.3925, + "step": 589 + }, + { + "epoch": 2.7670271850336157, + "grad_norm": 0.14429809153079987, + "learning_rate": 7.668231611893584e-07, + "loss": 7.4402, + "step": 590 + }, + { + "epoch": 2.7717041800643085, + "grad_norm": 0.17765949666500092, + "learning_rate": 7.511737089201878e-07, + "loss": 7.6954, + "step": 591 + }, + { + "epoch": 2.7763811750950014, + "grad_norm": 0.15836399793624878, + "learning_rate": 7.355242566510173e-07, + "loss": 7.6582, + "step": 592 + }, + { + "epoch": 2.7810581701256942, + "grad_norm": 0.14881688356399536, + "learning_rate": 7.198748043818466e-07, + "loss": 8.7099, + "step": 593 + }, + { + "epoch": 2.785735165156387, + "grad_norm": 0.21536029875278473, + "learning_rate": 7.042253521126762e-07, + "loss": 8.3015, + "step": 594 + }, + { + "epoch": 2.79041216018708, + "grad_norm": 0.14025098085403442, + "learning_rate": 6.885758998435055e-07, + "loss": 8.7512, + "step": 595 + }, + { + "epoch": 2.795089155217773, + "grad_norm": 0.13290052115917206, + "learning_rate": 6.72926447574335e-07, + "loss": 7.5792, + "step": 596 + }, + { + "epoch": 2.799766150248465, + "grad_norm": 0.3149656057357788, + "learning_rate": 6.572769953051644e-07, + "loss": 8.4982, + "step": 597 + }, + { + "epoch": 2.804443145279158, + "grad_norm": 0.16543497145175934, + "learning_rate": 6.416275430359938e-07, + "loss": 8.3097, + "step": 598 + }, + { + "epoch": 2.809120140309851, + "grad_norm": 0.17708784341812134, + "learning_rate": 6.259780907668232e-07, + "loss": 8.8047, + "step": 599 + }, + { + "epoch": 2.8137971353405438, + "grad_norm": 0.14560888707637787, + "learning_rate": 6.103286384976526e-07, + "loss": 8.0925, + "step": 600 + }, + { + "epoch": 2.8184741303712366, + "grad_norm": 0.1902446448802948, + "learning_rate": 5.94679186228482e-07, + "loss": 7.7629, + "step": 601 + }, + { + "epoch": 2.823151125401929, + "grad_norm": 0.1473388820886612, + "learning_rate": 5.790297339593115e-07, + "loss": 8.1077, + "step": 602 + }, + { + "epoch": 2.827828120432622, + "grad_norm": 0.16258402168750763, + "learning_rate": 5.633802816901409e-07, + "loss": 7.8582, + "step": 603 + }, + { + "epoch": 2.8325051154633147, + "grad_norm": 0.1769980639219284, + "learning_rate": 5.477308294209703e-07, + "loss": 7.1382, + "step": 604 + }, + { + "epoch": 2.8371821104940076, + "grad_norm": 0.1444021314382553, + "learning_rate": 5.320813771517997e-07, + "loss": 7.1383, + "step": 605 + }, + { + "epoch": 2.8418591055247004, + "grad_norm": 0.21616753935813904, + "learning_rate": 5.164319248826292e-07, + "loss": 7.605, + "step": 606 + }, + { + "epoch": 2.8465361005553933, + "grad_norm": 0.20384635031223297, + "learning_rate": 5.007824726134586e-07, + "loss": 8.8131, + "step": 607 + }, + { + "epoch": 2.851213095586086, + "grad_norm": 0.1579245626926422, + "learning_rate": 4.85133020344288e-07, + "loss": 7.8475, + "step": 608 + }, + { + "epoch": 2.8558900906167786, + "grad_norm": 0.20689930021762848, + "learning_rate": 4.694835680751174e-07, + "loss": 7.1765, + "step": 609 + }, + { + "epoch": 2.8605670856474714, + "grad_norm": 0.1589430868625641, + "learning_rate": 4.5383411580594685e-07, + "loss": 8.8879, + "step": 610 + }, + { + "epoch": 2.8652440806781643, + "grad_norm": 0.16509409248828888, + "learning_rate": 4.3818466353677626e-07, + "loss": 7.6589, + "step": 611 + }, + { + "epoch": 2.869921075708857, + "grad_norm": 0.1439896821975708, + "learning_rate": 4.225352112676057e-07, + "loss": 8.0296, + "step": 612 + }, + { + "epoch": 2.87459807073955, + "grad_norm": 0.19501394033432007, + "learning_rate": 4.068857589984351e-07, + "loss": 8.5349, + "step": 613 + }, + { + "epoch": 2.8792750657702424, + "grad_norm": 0.18828211724758148, + "learning_rate": 3.9123630672926455e-07, + "loss": 7.3593, + "step": 614 + }, + { + "epoch": 2.8839520608009352, + "grad_norm": 0.15072734653949738, + "learning_rate": 3.755868544600939e-07, + "loss": 8.2512, + "step": 615 + }, + { + "epoch": 2.888629055831628, + "grad_norm": 0.1598856896162033, + "learning_rate": 3.599374021909233e-07, + "loss": 7.3246, + "step": 616 + }, + { + "epoch": 2.893306050862321, + "grad_norm": 0.15382905304431915, + "learning_rate": 3.4428794992175273e-07, + "loss": 7.674, + "step": 617 + }, + { + "epoch": 2.897983045893014, + "grad_norm": 0.13851745426654816, + "learning_rate": 3.286384976525822e-07, + "loss": 7.6664, + "step": 618 + }, + { + "epoch": 2.9026600409237067, + "grad_norm": 0.12572415173053741, + "learning_rate": 3.129890453834116e-07, + "loss": 8.6634, + "step": 619 + }, + { + "epoch": 2.9073370359543995, + "grad_norm": 0.181121364235878, + "learning_rate": 2.97339593114241e-07, + "loss": 7.5301, + "step": 620 + }, + { + "epoch": 2.912014030985092, + "grad_norm": 0.18877944350242615, + "learning_rate": 2.8169014084507043e-07, + "loss": 7.4353, + "step": 621 + }, + { + "epoch": 2.916691026015785, + "grad_norm": 0.1800297349691391, + "learning_rate": 2.6604068857589984e-07, + "loss": 8.0261, + "step": 622 + }, + { + "epoch": 2.9213680210464776, + "grad_norm": 0.1459706425666809, + "learning_rate": 2.503912363067293e-07, + "loss": 8.5898, + "step": 623 + }, + { + "epoch": 2.9260450160771705, + "grad_norm": 0.19272330403327942, + "learning_rate": 2.347417840375587e-07, + "loss": 7.5655, + "step": 624 + }, + { + "epoch": 2.9307220111078633, + "grad_norm": 0.13995127379894257, + "learning_rate": 2.1909233176838813e-07, + "loss": 8.807, + "step": 625 + }, + { + "epoch": 2.9353990061385558, + "grad_norm": 0.19578878581523895, + "learning_rate": 2.0344287949921754e-07, + "loss": 8.2283, + "step": 626 + }, + { + "epoch": 2.9400760011692486, + "grad_norm": 0.18744409084320068, + "learning_rate": 1.8779342723004696e-07, + "loss": 7.8586, + "step": 627 + }, + { + "epoch": 2.9447529961999415, + "grad_norm": 0.18906202912330627, + "learning_rate": 1.7214397496087637e-07, + "loss": 8.9175, + "step": 628 + }, + { + "epoch": 2.9494299912306343, + "grad_norm": 0.2817856967449188, + "learning_rate": 1.564945226917058e-07, + "loss": 7.8464, + "step": 629 + }, + { + "epoch": 2.954106986261327, + "grad_norm": 0.1482636034488678, + "learning_rate": 1.4084507042253522e-07, + "loss": 8.0478, + "step": 630 + }, + { + "epoch": 2.95878398129202, + "grad_norm": 0.1729574054479599, + "learning_rate": 1.2519561815336465e-07, + "loss": 8.3514, + "step": 631 + }, + { + "epoch": 2.963460976322713, + "grad_norm": 0.23052264750003815, + "learning_rate": 1.0954616588419407e-07, + "loss": 8.397, + "step": 632 + }, + { + "epoch": 2.9681379713534053, + "grad_norm": 0.16747911274433136, + "learning_rate": 9.389671361502348e-08, + "loss": 8.0443, + "step": 633 + }, + { + "epoch": 2.972814966384098, + "grad_norm": 0.14860796928405762, + "learning_rate": 7.82472613458529e-08, + "loss": 7.317, + "step": 634 + }, + { + "epoch": 2.977491961414791, + "grad_norm": 0.13674141466617584, + "learning_rate": 6.259780907668233e-08, + "loss": 7.6038, + "step": 635 + }, + { + "epoch": 2.982168956445484, + "grad_norm": 0.163039892911911, + "learning_rate": 4.694835680751174e-08, + "loss": 7.4459, + "step": 636 + }, + { + "epoch": 2.9868459514761767, + "grad_norm": 0.1598978042602539, + "learning_rate": 3.1298904538341164e-08, + "loss": 9.9334, + "step": 637 + }, + { + "epoch": 2.991522946506869, + "grad_norm": 0.16937094926834106, + "learning_rate": 1.5649452269170582e-08, + "loss": 7.9632, + "step": 638 + }, + { + "epoch": 2.996199941537562, + "grad_norm": 0.11614558100700378, + "learning_rate": 0.0, + "loss": 7.3632, + "step": 639 + }, + { + "epoch": 2.996199941537562, + "step": 639, + "total_flos": 2.8450474856619704e+18, + "train_loss": 9.811768141524146, + "train_runtime": 60574.6064, + "train_samples_per_second": 1.355, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1.0, + "max_steps": 639, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8450474856619704e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}