diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7202 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1024, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009765625, + "grad_norm": 0.6541444063186646, + "learning_rate": 0.0, + "loss": 1.0280990600585938, + "step": 1 + }, + { + "epoch": 0.001953125, + "grad_norm": 0.4356674551963806, + "learning_rate": 4e-05, + "loss": 0.8305179476737976, + "step": 2 + }, + { + "epoch": 0.0029296875, + "grad_norm": 0.3900858759880066, + "learning_rate": 8e-05, + "loss": 0.7835474014282227, + "step": 3 + }, + { + "epoch": 0.00390625, + "grad_norm": 0.3717947006225586, + "learning_rate": 0.00012, + "loss": 1.1571688652038574, + "step": 4 + }, + { + "epoch": 0.0048828125, + "grad_norm": 0.2760661542415619, + "learning_rate": 0.00016, + "loss": 0.8141135573387146, + "step": 5 + }, + { + "epoch": 0.005859375, + "grad_norm": 0.24524882435798645, + "learning_rate": 0.0002, + "loss": 0.29919666051864624, + "step": 6 + }, + { + "epoch": 0.0068359375, + "grad_norm": 0.3155483305454254, + "learning_rate": 0.00019980372914622178, + "loss": 0.916366696357727, + "step": 7 + }, + { + "epoch": 0.0078125, + "grad_norm": 1.0419310331344604, + "learning_rate": 0.00019960745829244357, + "loss": 0.986505389213562, + "step": 8 + }, + { + "epoch": 0.0087890625, + "grad_norm": 0.32395845651626587, + "learning_rate": 0.00019941118743866537, + "loss": 0.7845190167427063, + "step": 9 + }, + { + "epoch": 0.009765625, + "grad_norm": 0.564084529876709, + "learning_rate": 0.00019921491658488717, + "loss": 1.0922366380691528, + "step": 10 + }, + { + "epoch": 0.0107421875, + "grad_norm": 0.4066593647003174, + "learning_rate": 0.00019901864573110893, + "loss": 1.0279463529586792, + "step": 11 + }, + { + "epoch": 0.01171875, + "grad_norm": 0.43442535400390625, + "learning_rate": 0.00019882237487733073, + "loss": 0.9713175892829895, + "step": 12 + }, + { + "epoch": 0.0126953125, + "grad_norm": 0.26689526438713074, + "learning_rate": 0.0001986261040235525, + "loss": 0.38461241126060486, + "step": 13 + }, + { + "epoch": 0.013671875, + "grad_norm": 0.41254541277885437, + "learning_rate": 0.0001984298331697743, + "loss": 0.7746479511260986, + "step": 14 + }, + { + "epoch": 0.0146484375, + "grad_norm": 0.39432424306869507, + "learning_rate": 0.0001982335623159961, + "loss": 0.7843194603919983, + "step": 15 + }, + { + "epoch": 0.015625, + "grad_norm": 0.4303337037563324, + "learning_rate": 0.0001980372914622179, + "loss": 0.6613403558731079, + "step": 16 + }, + { + "epoch": 0.0166015625, + "grad_norm": 0.875269889831543, + "learning_rate": 0.00019784102060843966, + "loss": 1.0992671251296997, + "step": 17 + }, + { + "epoch": 0.017578125, + "grad_norm": 0.21415413916110992, + "learning_rate": 0.00019764474975466145, + "loss": 0.2784216105937958, + "step": 18 + }, + { + "epoch": 0.0185546875, + "grad_norm": 0.4318086504936218, + "learning_rate": 0.00019744847890088322, + "loss": 0.6146124005317688, + "step": 19 + }, + { + "epoch": 0.01953125, + "grad_norm": 0.20149515569210052, + "learning_rate": 0.00019725220804710502, + "loss": 0.3920556306838989, + "step": 20 + }, + { + "epoch": 0.0205078125, + "grad_norm": 0.358688622713089, + "learning_rate": 0.0001970559371933268, + "loss": 0.6672685742378235, + "step": 21 + }, + { + "epoch": 0.021484375, + "grad_norm": 0.5916730165481567, + "learning_rate": 0.00019685966633954858, + "loss": 1.0804443359375, + "step": 22 + }, + { + "epoch": 0.0224609375, + "grad_norm": 0.3139825761318207, + "learning_rate": 0.00019666339548577038, + "loss": 0.7358766794204712, + "step": 23 + }, + { + "epoch": 0.0234375, + "grad_norm": 0.4019712805747986, + "learning_rate": 0.00019646712463199215, + "loss": 0.7362902164459229, + "step": 24 + }, + { + "epoch": 0.0244140625, + "grad_norm": 0.2874290347099304, + "learning_rate": 0.00019627085377821394, + "loss": 0.6446189284324646, + "step": 25 + }, + { + "epoch": 0.025390625, + "grad_norm": 0.357494592666626, + "learning_rate": 0.0001960745829244357, + "loss": 0.2820976972579956, + "step": 26 + }, + { + "epoch": 0.0263671875, + "grad_norm": 0.22216391563415527, + "learning_rate": 0.00019587831207065753, + "loss": 0.6020435094833374, + "step": 27 + }, + { + "epoch": 0.02734375, + "grad_norm": 0.23284995555877686, + "learning_rate": 0.0001956820412168793, + "loss": 0.44151532649993896, + "step": 28 + }, + { + "epoch": 0.0283203125, + "grad_norm": 0.3594605028629303, + "learning_rate": 0.0001954857703631011, + "loss": 0.9414041042327881, + "step": 29 + }, + { + "epoch": 0.029296875, + "grad_norm": 0.4460504353046417, + "learning_rate": 0.00019528949950932287, + "loss": 0.7148531079292297, + "step": 30 + }, + { + "epoch": 0.0302734375, + "grad_norm": 0.3392362892627716, + "learning_rate": 0.00019509322865554466, + "loss": 0.7185512781143188, + "step": 31 + }, + { + "epoch": 0.03125, + "grad_norm": 0.3340625464916229, + "learning_rate": 0.00019489695780176643, + "loss": 0.6613262891769409, + "step": 32 + }, + { + "epoch": 0.0322265625, + "grad_norm": 0.26223355531692505, + "learning_rate": 0.00019470068694798826, + "loss": 0.590149462223053, + "step": 33 + }, + { + "epoch": 0.033203125, + "grad_norm": 0.3481689691543579, + "learning_rate": 0.00019450441609421002, + "loss": 0.5590913891792297, + "step": 34 + }, + { + "epoch": 0.0341796875, + "grad_norm": 0.4775488078594208, + "learning_rate": 0.00019430814524043182, + "loss": 0.927351176738739, + "step": 35 + }, + { + "epoch": 0.03515625, + "grad_norm": 0.4474835693836212, + "learning_rate": 0.0001941118743866536, + "loss": 0.7719380855560303, + "step": 36 + }, + { + "epoch": 0.0361328125, + "grad_norm": 0.3538999855518341, + "learning_rate": 0.00019391560353287536, + "loss": 1.0287561416625977, + "step": 37 + }, + { + "epoch": 0.037109375, + "grad_norm": 0.5018237233161926, + "learning_rate": 0.00019371933267909715, + "loss": 1.049814224243164, + "step": 38 + }, + { + "epoch": 0.0380859375, + "grad_norm": 0.5052743554115295, + "learning_rate": 0.00019352306182531895, + "loss": 0.39767658710479736, + "step": 39 + }, + { + "epoch": 0.0390625, + "grad_norm": 0.46170520782470703, + "learning_rate": 0.00019332679097154075, + "loss": 0.9849376678466797, + "step": 40 + }, + { + "epoch": 0.0400390625, + "grad_norm": 0.5961291193962097, + "learning_rate": 0.00019313052011776251, + "loss": 0.8527336716651917, + "step": 41 + }, + { + "epoch": 0.041015625, + "grad_norm": 0.4002876579761505, + "learning_rate": 0.0001929342492639843, + "loss": 0.7445047497749329, + "step": 42 + }, + { + "epoch": 0.0419921875, + "grad_norm": 0.6382992267608643, + "learning_rate": 0.00019273797841020608, + "loss": 0.7587878704071045, + "step": 43 + }, + { + "epoch": 0.04296875, + "grad_norm": 0.4204530715942383, + "learning_rate": 0.00019254170755642788, + "loss": 0.943995475769043, + "step": 44 + }, + { + "epoch": 0.0439453125, + "grad_norm": 0.29038068652153015, + "learning_rate": 0.00019234543670264967, + "loss": 0.4540131688117981, + "step": 45 + }, + { + "epoch": 0.044921875, + "grad_norm": 0.41968628764152527, + "learning_rate": 0.00019214916584887147, + "loss": 0.3900204300880432, + "step": 46 + }, + { + "epoch": 0.0458984375, + "grad_norm": 0.5870251059532166, + "learning_rate": 0.00019195289499509324, + "loss": 0.8700598478317261, + "step": 47 + }, + { + "epoch": 0.046875, + "grad_norm": 0.3120124042034149, + "learning_rate": 0.00019175662414131503, + "loss": 0.2866731882095337, + "step": 48 + }, + { + "epoch": 0.0478515625, + "grad_norm": 0.31891942024230957, + "learning_rate": 0.0001915603532875368, + "loss": 0.7711223363876343, + "step": 49 + }, + { + "epoch": 0.048828125, + "grad_norm": 0.4250207543373108, + "learning_rate": 0.0001913640824337586, + "loss": 0.7499758005142212, + "step": 50 + }, + { + "epoch": 0.0498046875, + "grad_norm": 0.4769924581050873, + "learning_rate": 0.0001911678115799804, + "loss": 0.8479812145233154, + "step": 51 + }, + { + "epoch": 0.05078125, + "grad_norm": 0.2966979146003723, + "learning_rate": 0.00019097154072620216, + "loss": 0.8125182390213013, + "step": 52 + }, + { + "epoch": 0.0517578125, + "grad_norm": 0.4924452006816864, + "learning_rate": 0.00019077526987242396, + "loss": 1.006331443786621, + "step": 53 + }, + { + "epoch": 0.052734375, + "grad_norm": 0.5558736324310303, + "learning_rate": 0.00019057899901864573, + "loss": 0.8218062520027161, + "step": 54 + }, + { + "epoch": 0.0537109375, + "grad_norm": 0.488903284072876, + "learning_rate": 0.00019038272816486752, + "loss": 0.7451006770133972, + "step": 55 + }, + { + "epoch": 0.0546875, + "grad_norm": 0.6092124581336975, + "learning_rate": 0.00019018645731108932, + "loss": 0.3371097445487976, + "step": 56 + }, + { + "epoch": 0.0556640625, + "grad_norm": 0.34885621070861816, + "learning_rate": 0.00018999018645731111, + "loss": 0.9263520836830139, + "step": 57 + }, + { + "epoch": 0.056640625, + "grad_norm": 0.41470521688461304, + "learning_rate": 0.00018979391560353288, + "loss": 0.8741390109062195, + "step": 58 + }, + { + "epoch": 0.0576171875, + "grad_norm": 0.32286664843559265, + "learning_rate": 0.00018959764474975468, + "loss": 0.6128658056259155, + "step": 59 + }, + { + "epoch": 0.05859375, + "grad_norm": 0.43667954206466675, + "learning_rate": 0.00018940137389597645, + "loss": 0.822106122970581, + "step": 60 + }, + { + "epoch": 0.0595703125, + "grad_norm": 0.5501149892807007, + "learning_rate": 0.00018920510304219824, + "loss": 0.2981743812561035, + "step": 61 + }, + { + "epoch": 0.060546875, + "grad_norm": 0.5234649777412415, + "learning_rate": 0.00018900883218842004, + "loss": 0.710310161113739, + "step": 62 + }, + { + "epoch": 0.0615234375, + "grad_norm": 0.5040559768676758, + "learning_rate": 0.00018881256133464184, + "loss": 1.0355676412582397, + "step": 63 + }, + { + "epoch": 0.0625, + "grad_norm": 0.4435643255710602, + "learning_rate": 0.0001886162904808636, + "loss": 1.031105399131775, + "step": 64 + }, + { + "epoch": 0.0634765625, + "grad_norm": 0.4987465441226959, + "learning_rate": 0.0001884200196270854, + "loss": 0.7753915190696716, + "step": 65 + }, + { + "epoch": 0.064453125, + "grad_norm": 0.3633696436882019, + "learning_rate": 0.00018822374877330717, + "loss": 1.2376799583435059, + "step": 66 + }, + { + "epoch": 0.0654296875, + "grad_norm": 1.0342258214950562, + "learning_rate": 0.00018802747791952894, + "loss": 0.6145737171173096, + "step": 67 + }, + { + "epoch": 0.06640625, + "grad_norm": 0.47045138478279114, + "learning_rate": 0.00018783120706575076, + "loss": 0.8622407913208008, + "step": 68 + }, + { + "epoch": 0.0673828125, + "grad_norm": 0.47864851355552673, + "learning_rate": 0.00018763493621197253, + "loss": 0.6727300882339478, + "step": 69 + }, + { + "epoch": 0.068359375, + "grad_norm": 0.38102060556411743, + "learning_rate": 0.00018743866535819433, + "loss": 0.7417519092559814, + "step": 70 + }, + { + "epoch": 0.0693359375, + "grad_norm": 0.4229515492916107, + "learning_rate": 0.0001872423945044161, + "loss": 0.46951866149902344, + "step": 71 + }, + { + "epoch": 0.0703125, + "grad_norm": 0.4868115186691284, + "learning_rate": 0.0001870461236506379, + "loss": 0.32457292079925537, + "step": 72 + }, + { + "epoch": 0.0712890625, + "grad_norm": 0.298020601272583, + "learning_rate": 0.00018684985279685966, + "loss": 0.2501494288444519, + "step": 73 + }, + { + "epoch": 0.072265625, + "grad_norm": 0.49870651960372925, + "learning_rate": 0.00018665358194308145, + "loss": 0.5599403381347656, + "step": 74 + }, + { + "epoch": 0.0732421875, + "grad_norm": 0.5717479586601257, + "learning_rate": 0.00018645731108930325, + "loss": 0.4725653827190399, + "step": 75 + }, + { + "epoch": 0.07421875, + "grad_norm": 0.5230128765106201, + "learning_rate": 0.00018626104023552505, + "loss": 1.0607699155807495, + "step": 76 + }, + { + "epoch": 0.0751953125, + "grad_norm": 0.4279435873031616, + "learning_rate": 0.00018606476938174682, + "loss": 0.5628142952919006, + "step": 77 + }, + { + "epoch": 0.076171875, + "grad_norm": 0.6166331171989441, + "learning_rate": 0.0001858684985279686, + "loss": 0.44837141036987305, + "step": 78 + }, + { + "epoch": 0.0771484375, + "grad_norm": 0.6329861879348755, + "learning_rate": 0.00018567222767419038, + "loss": 0.5013883709907532, + "step": 79 + }, + { + "epoch": 0.078125, + "grad_norm": 0.2921103239059448, + "learning_rate": 0.00018547595682041218, + "loss": 0.541824996471405, + "step": 80 + }, + { + "epoch": 0.0791015625, + "grad_norm": 0.36744800209999084, + "learning_rate": 0.00018527968596663397, + "loss": 0.3878925144672394, + "step": 81 + }, + { + "epoch": 0.080078125, + "grad_norm": 0.34045904874801636, + "learning_rate": 0.00018508341511285574, + "loss": 0.33476194739341736, + "step": 82 + }, + { + "epoch": 0.0810546875, + "grad_norm": 0.48908546566963196, + "learning_rate": 0.00018488714425907754, + "loss": 1.003555178642273, + "step": 83 + }, + { + "epoch": 0.08203125, + "grad_norm": 0.4683694839477539, + "learning_rate": 0.0001846908734052993, + "loss": 0.7300649285316467, + "step": 84 + }, + { + "epoch": 0.0830078125, + "grad_norm": 0.3560928404331207, + "learning_rate": 0.0001844946025515211, + "loss": 0.4525097608566284, + "step": 85 + }, + { + "epoch": 0.083984375, + "grad_norm": 1.481307864189148, + "learning_rate": 0.0001842983316977429, + "loss": 0.5444833040237427, + "step": 86 + }, + { + "epoch": 0.0849609375, + "grad_norm": 0.42610403895378113, + "learning_rate": 0.0001841020608439647, + "loss": 0.7340827584266663, + "step": 87 + }, + { + "epoch": 0.0859375, + "grad_norm": 0.6035026907920837, + "learning_rate": 0.00018390578999018646, + "loss": 0.5589049458503723, + "step": 88 + }, + { + "epoch": 0.0869140625, + "grad_norm": 0.6075074076652527, + "learning_rate": 0.00018370951913640826, + "loss": 0.4969009757041931, + "step": 89 + }, + { + "epoch": 0.087890625, + "grad_norm": 0.6751372814178467, + "learning_rate": 0.00018351324828263003, + "loss": 0.46451041102409363, + "step": 90 + }, + { + "epoch": 0.0888671875, + "grad_norm": 0.5816373229026794, + "learning_rate": 0.00018331697742885182, + "loss": 1.024427056312561, + "step": 91 + }, + { + "epoch": 0.08984375, + "grad_norm": 0.6644161939620972, + "learning_rate": 0.00018312070657507362, + "loss": 0.778592586517334, + "step": 92 + }, + { + "epoch": 0.0908203125, + "grad_norm": 0.652209997177124, + "learning_rate": 0.00018292443572129541, + "loss": 0.8565710783004761, + "step": 93 + }, + { + "epoch": 0.091796875, + "grad_norm": 0.9109074473381042, + "learning_rate": 0.00018272816486751718, + "loss": 0.6693978309631348, + "step": 94 + }, + { + "epoch": 0.0927734375, + "grad_norm": 0.5235186219215393, + "learning_rate": 0.00018253189401373895, + "loss": 0.8255172967910767, + "step": 95 + }, + { + "epoch": 0.09375, + "grad_norm": 0.8362122178077698, + "learning_rate": 0.00018233562315996075, + "loss": 0.5858157873153687, + "step": 96 + }, + { + "epoch": 0.0947265625, + "grad_norm": 0.6753116846084595, + "learning_rate": 0.00018213935230618254, + "loss": 0.6682421565055847, + "step": 97 + }, + { + "epoch": 0.095703125, + "grad_norm": 0.5394794940948486, + "learning_rate": 0.00018194308145240434, + "loss": 0.3218158781528473, + "step": 98 + }, + { + "epoch": 0.0966796875, + "grad_norm": 3.2796010971069336, + "learning_rate": 0.0001817468105986261, + "loss": 0.681085467338562, + "step": 99 + }, + { + "epoch": 0.09765625, + "grad_norm": 0.38390907645225525, + "learning_rate": 0.0001815505397448479, + "loss": 0.39554187655448914, + "step": 100 + }, + { + "epoch": 0.0986328125, + "grad_norm": 0.5289499759674072, + "learning_rate": 0.00018135426889106967, + "loss": 1.0264520645141602, + "step": 101 + }, + { + "epoch": 0.099609375, + "grad_norm": 0.8211148977279663, + "learning_rate": 0.00018115799803729147, + "loss": 0.8588113784790039, + "step": 102 + }, + { + "epoch": 0.1005859375, + "grad_norm": 0.4771063029766083, + "learning_rate": 0.00018096172718351327, + "loss": 0.7471244931221008, + "step": 103 + }, + { + "epoch": 0.1015625, + "grad_norm": 0.6326794624328613, + "learning_rate": 0.00018076545632973506, + "loss": 0.6081597805023193, + "step": 104 + }, + { + "epoch": 0.1025390625, + "grad_norm": 0.7229248285293579, + "learning_rate": 0.00018056918547595683, + "loss": 0.8315082788467407, + "step": 105 + }, + { + "epoch": 0.103515625, + "grad_norm": 0.6803163290023804, + "learning_rate": 0.00018037291462217863, + "loss": 0.8308911323547363, + "step": 106 + }, + { + "epoch": 0.1044921875, + "grad_norm": 0.5268850326538086, + "learning_rate": 0.0001801766437684004, + "loss": 0.8480656743049622, + "step": 107 + }, + { + "epoch": 0.10546875, + "grad_norm": 0.7849289178848267, + "learning_rate": 0.0001799803729146222, + "loss": 0.8200575113296509, + "step": 108 + }, + { + "epoch": 0.1064453125, + "grad_norm": 0.4259982407093048, + "learning_rate": 0.00017978410206084396, + "loss": 0.44367721676826477, + "step": 109 + }, + { + "epoch": 0.107421875, + "grad_norm": 0.4788619577884674, + "learning_rate": 0.00017958783120706576, + "loss": 0.6017763018608093, + "step": 110 + }, + { + "epoch": 0.1083984375, + "grad_norm": 0.34434452652931213, + "learning_rate": 0.00017939156035328755, + "loss": 0.29681769013404846, + "step": 111 + }, + { + "epoch": 0.109375, + "grad_norm": 1.1506884098052979, + "learning_rate": 0.00017919528949950932, + "loss": 0.6520863771438599, + "step": 112 + }, + { + "epoch": 0.1103515625, + "grad_norm": 0.8348999619483948, + "learning_rate": 0.00017899901864573112, + "loss": 0.6035414934158325, + "step": 113 + }, + { + "epoch": 0.111328125, + "grad_norm": 0.5550518035888672, + "learning_rate": 0.00017880274779195289, + "loss": 0.7711564302444458, + "step": 114 + }, + { + "epoch": 0.1123046875, + "grad_norm": 0.28814634680747986, + "learning_rate": 0.00017860647693817468, + "loss": 0.8325987458229065, + "step": 115 + }, + { + "epoch": 0.11328125, + "grad_norm": 0.3833630084991455, + "learning_rate": 0.00017841020608439648, + "loss": 0.3345921039581299, + "step": 116 + }, + { + "epoch": 0.1142578125, + "grad_norm": 0.8784507513046265, + "learning_rate": 0.00017821393523061827, + "loss": 0.4186948239803314, + "step": 117 + }, + { + "epoch": 0.115234375, + "grad_norm": 0.7263842225074768, + "learning_rate": 0.00017801766437684004, + "loss": 0.5570493936538696, + "step": 118 + }, + { + "epoch": 0.1162109375, + "grad_norm": 0.6391569972038269, + "learning_rate": 0.00017782139352306184, + "loss": 1.0257431268692017, + "step": 119 + }, + { + "epoch": 0.1171875, + "grad_norm": 0.6025450229644775, + "learning_rate": 0.0001776251226692836, + "loss": 0.8676729202270508, + "step": 120 + }, + { + "epoch": 0.1181640625, + "grad_norm": 0.3776579201221466, + "learning_rate": 0.0001774288518155054, + "loss": 0.5870720148086548, + "step": 121 + }, + { + "epoch": 0.119140625, + "grad_norm": 0.40912336111068726, + "learning_rate": 0.0001772325809617272, + "loss": 0.9210044145584106, + "step": 122 + }, + { + "epoch": 0.1201171875, + "grad_norm": 0.5036085247993469, + "learning_rate": 0.000177036310107949, + "loss": 0.47378072142601013, + "step": 123 + }, + { + "epoch": 0.12109375, + "grad_norm": 0.5508134961128235, + "learning_rate": 0.00017684003925417076, + "loss": 0.8295834064483643, + "step": 124 + }, + { + "epoch": 0.1220703125, + "grad_norm": 0.5522392392158508, + "learning_rate": 0.00017664376840039253, + "loss": 0.793156087398529, + "step": 125 + }, + { + "epoch": 0.123046875, + "grad_norm": 1.0098820924758911, + "learning_rate": 0.00017644749754661433, + "loss": 0.5780155658721924, + "step": 126 + }, + { + "epoch": 0.1240234375, + "grad_norm": 0.6178780198097229, + "learning_rate": 0.00017625122669283612, + "loss": 0.5129156708717346, + "step": 127 + }, + { + "epoch": 0.125, + "grad_norm": 0.6224352121353149, + "learning_rate": 0.00017605495583905792, + "loss": 0.8498928546905518, + "step": 128 + }, + { + "epoch": 0.1259765625, + "grad_norm": 0.7869983315467834, + "learning_rate": 0.0001758586849852797, + "loss": 0.9180670976638794, + "step": 129 + }, + { + "epoch": 0.126953125, + "grad_norm": 0.4122680127620697, + "learning_rate": 0.00017566241413150148, + "loss": 0.510919988155365, + "step": 130 + }, + { + "epoch": 0.1279296875, + "grad_norm": 0.7221843004226685, + "learning_rate": 0.00017546614327772325, + "loss": 0.3977488875389099, + "step": 131 + }, + { + "epoch": 0.12890625, + "grad_norm": 1.155800461769104, + "learning_rate": 0.00017526987242394505, + "loss": 0.6549078226089478, + "step": 132 + }, + { + "epoch": 0.1298828125, + "grad_norm": 0.7164724469184875, + "learning_rate": 0.00017507360157016685, + "loss": 0.8306566476821899, + "step": 133 + }, + { + "epoch": 0.130859375, + "grad_norm": 0.7600284814834595, + "learning_rate": 0.00017487733071638864, + "loss": 0.34278520941734314, + "step": 134 + }, + { + "epoch": 0.1318359375, + "grad_norm": 0.8636081218719482, + "learning_rate": 0.0001746810598626104, + "loss": 0.8881778717041016, + "step": 135 + }, + { + "epoch": 0.1328125, + "grad_norm": 1.0904357433319092, + "learning_rate": 0.0001744847890088322, + "loss": 0.4423227310180664, + "step": 136 + }, + { + "epoch": 0.1337890625, + "grad_norm": 0.5639862418174744, + "learning_rate": 0.00017428851815505397, + "loss": 0.8610935211181641, + "step": 137 + }, + { + "epoch": 0.134765625, + "grad_norm": 1.05929696559906, + "learning_rate": 0.00017409224730127577, + "loss": 1.1729753017425537, + "step": 138 + }, + { + "epoch": 0.1357421875, + "grad_norm": 1.0731761455535889, + "learning_rate": 0.00017389597644749757, + "loss": 0.6459341049194336, + "step": 139 + }, + { + "epoch": 0.13671875, + "grad_norm": 0.7464702725410461, + "learning_rate": 0.00017369970559371934, + "loss": 0.5368601083755493, + "step": 140 + }, + { + "epoch": 0.1376953125, + "grad_norm": 0.5722304582595825, + "learning_rate": 0.00017350343473994113, + "loss": 0.9642695784568787, + "step": 141 + }, + { + "epoch": 0.138671875, + "grad_norm": 0.5044945478439331, + "learning_rate": 0.0001733071638861629, + "loss": 0.49555253982543945, + "step": 142 + }, + { + "epoch": 0.1396484375, + "grad_norm": 0.8069168329238892, + "learning_rate": 0.0001731108930323847, + "loss": 0.8796389698982239, + "step": 143 + }, + { + "epoch": 0.140625, + "grad_norm": 0.5269959568977356, + "learning_rate": 0.00017291462217860646, + "loss": 0.9928920269012451, + "step": 144 + }, + { + "epoch": 0.1416015625, + "grad_norm": 0.6606360077857971, + "learning_rate": 0.0001727183513248283, + "loss": 1.0528640747070312, + "step": 145 + }, + { + "epoch": 0.142578125, + "grad_norm": 0.7145242691040039, + "learning_rate": 0.00017252208047105006, + "loss": 1.1252766847610474, + "step": 146 + }, + { + "epoch": 0.1435546875, + "grad_norm": 0.5808660984039307, + "learning_rate": 0.00017232580961727185, + "loss": 0.24914072453975677, + "step": 147 + }, + { + "epoch": 0.14453125, + "grad_norm": 0.8544529676437378, + "learning_rate": 0.00017212953876349362, + "loss": 0.4420434832572937, + "step": 148 + }, + { + "epoch": 0.1455078125, + "grad_norm": 0.899334728717804, + "learning_rate": 0.00017193326790971542, + "loss": 0.7128512263298035, + "step": 149 + }, + { + "epoch": 0.146484375, + "grad_norm": 0.36327579617500305, + "learning_rate": 0.00017173699705593719, + "loss": 0.5503419637680054, + "step": 150 + }, + { + "epoch": 0.1474609375, + "grad_norm": 0.553255021572113, + "learning_rate": 0.000171540726202159, + "loss": 0.5796535015106201, + "step": 151 + }, + { + "epoch": 0.1484375, + "grad_norm": 0.41036659479141235, + "learning_rate": 0.00017134445534838078, + "loss": 0.8935849666595459, + "step": 152 + }, + { + "epoch": 0.1494140625, + "grad_norm": 0.3723013997077942, + "learning_rate": 0.00017114818449460257, + "loss": 0.39106485247612, + "step": 153 + }, + { + "epoch": 0.150390625, + "grad_norm": 0.654262900352478, + "learning_rate": 0.00017095191364082434, + "loss": 1.0176405906677246, + "step": 154 + }, + { + "epoch": 0.1513671875, + "grad_norm": 0.5707812309265137, + "learning_rate": 0.0001707556427870461, + "loss": 0.6580768823623657, + "step": 155 + }, + { + "epoch": 0.15234375, + "grad_norm": 0.35879406332969666, + "learning_rate": 0.0001705593719332679, + "loss": 0.4050876200199127, + "step": 156 + }, + { + "epoch": 0.1533203125, + "grad_norm": 0.5701449513435364, + "learning_rate": 0.0001703631010794897, + "loss": 0.9737375974655151, + "step": 157 + }, + { + "epoch": 0.154296875, + "grad_norm": 0.4461202919483185, + "learning_rate": 0.0001701668302257115, + "loss": 0.9864733815193176, + "step": 158 + }, + { + "epoch": 0.1552734375, + "grad_norm": 0.6229621767997742, + "learning_rate": 0.00016997055937193327, + "loss": 0.35883933305740356, + "step": 159 + }, + { + "epoch": 0.15625, + "grad_norm": 0.5390028357505798, + "learning_rate": 0.00016977428851815506, + "loss": 0.5791765451431274, + "step": 160 + }, + { + "epoch": 0.1572265625, + "grad_norm": 0.7851611375808716, + "learning_rate": 0.00016957801766437683, + "loss": 0.9032300114631653, + "step": 161 + }, + { + "epoch": 0.158203125, + "grad_norm": 0.6211395263671875, + "learning_rate": 0.00016938174681059863, + "loss": 0.5069928765296936, + "step": 162 + }, + { + "epoch": 0.1591796875, + "grad_norm": 0.8290377855300903, + "learning_rate": 0.00016918547595682042, + "loss": 0.8917738795280457, + "step": 163 + }, + { + "epoch": 0.16015625, + "grad_norm": 0.42707324028015137, + "learning_rate": 0.00016898920510304222, + "loss": 0.606585681438446, + "step": 164 + }, + { + "epoch": 0.1611328125, + "grad_norm": 0.49472010135650635, + "learning_rate": 0.000168792934249264, + "loss": 1.0100075006484985, + "step": 165 + }, + { + "epoch": 0.162109375, + "grad_norm": 0.48441267013549805, + "learning_rate": 0.00016859666339548579, + "loss": 0.7145558595657349, + "step": 166 + }, + { + "epoch": 0.1630859375, + "grad_norm": 0.5181763172149658, + "learning_rate": 0.00016840039254170755, + "loss": 0.8088749647140503, + "step": 167 + }, + { + "epoch": 0.1640625, + "grad_norm": 0.4702328145503998, + "learning_rate": 0.00016820412168792935, + "loss": 0.5631542801856995, + "step": 168 + }, + { + "epoch": 0.1650390625, + "grad_norm": 0.35454344749450684, + "learning_rate": 0.00016800785083415115, + "loss": 0.31744396686553955, + "step": 169 + }, + { + "epoch": 0.166015625, + "grad_norm": 0.5193122029304504, + "learning_rate": 0.00016781157998037291, + "loss": 0.7338438034057617, + "step": 170 + }, + { + "epoch": 0.1669921875, + "grad_norm": 0.49799400568008423, + "learning_rate": 0.0001676153091265947, + "loss": 0.7910654544830322, + "step": 171 + }, + { + "epoch": 0.16796875, + "grad_norm": 0.4855571389198303, + "learning_rate": 0.00016741903827281648, + "loss": 0.38415610790252686, + "step": 172 + }, + { + "epoch": 0.1689453125, + "grad_norm": 0.8796041011810303, + "learning_rate": 0.00016722276741903828, + "loss": 0.6042807102203369, + "step": 173 + }, + { + "epoch": 0.169921875, + "grad_norm": 0.6005135774612427, + "learning_rate": 0.00016702649656526007, + "loss": 0.6617047786712646, + "step": 174 + }, + { + "epoch": 0.1708984375, + "grad_norm": 0.6359293460845947, + "learning_rate": 0.00016683022571148187, + "loss": 0.5227914452552795, + "step": 175 + }, + { + "epoch": 0.171875, + "grad_norm": 0.46007266640663147, + "learning_rate": 0.00016663395485770364, + "loss": 0.6881235837936401, + "step": 176 + }, + { + "epoch": 0.1728515625, + "grad_norm": 0.37411797046661377, + "learning_rate": 0.00016643768400392543, + "loss": 0.7384200096130371, + "step": 177 + }, + { + "epoch": 0.173828125, + "grad_norm": 0.4021860659122467, + "learning_rate": 0.0001662414131501472, + "loss": 1.1738500595092773, + "step": 178 + }, + { + "epoch": 0.1748046875, + "grad_norm": 0.3674755096435547, + "learning_rate": 0.000166045142296369, + "loss": 0.37539663910865784, + "step": 179 + }, + { + "epoch": 0.17578125, + "grad_norm": 0.5051441788673401, + "learning_rate": 0.0001658488714425908, + "loss": 0.6273016333580017, + "step": 180 + }, + { + "epoch": 0.1767578125, + "grad_norm": 0.6807597279548645, + "learning_rate": 0.0001656526005888126, + "loss": 0.4195510447025299, + "step": 181 + }, + { + "epoch": 0.177734375, + "grad_norm": 0.3345419466495514, + "learning_rate": 0.00016545632973503436, + "loss": 0.8546851873397827, + "step": 182 + }, + { + "epoch": 0.1787109375, + "grad_norm": 0.33821800351142883, + "learning_rate": 0.00016526005888125615, + "loss": 0.522655725479126, + "step": 183 + }, + { + "epoch": 0.1796875, + "grad_norm": 0.3145562708377838, + "learning_rate": 0.00016506378802747792, + "loss": 0.3799128532409668, + "step": 184 + }, + { + "epoch": 0.1806640625, + "grad_norm": 0.44908636808395386, + "learning_rate": 0.0001648675171736997, + "loss": 0.6263326406478882, + "step": 185 + }, + { + "epoch": 0.181640625, + "grad_norm": 0.7736865282058716, + "learning_rate": 0.00016467124631992151, + "loss": 0.3385460078716278, + "step": 186 + }, + { + "epoch": 0.1826171875, + "grad_norm": 0.5184527635574341, + "learning_rate": 0.00016447497546614328, + "loss": 0.7980771064758301, + "step": 187 + }, + { + "epoch": 0.18359375, + "grad_norm": 0.41774502396583557, + "learning_rate": 0.00016427870461236508, + "loss": 0.7745299339294434, + "step": 188 + }, + { + "epoch": 0.1845703125, + "grad_norm": 0.43824154138565063, + "learning_rate": 0.00016408243375858685, + "loss": 0.9190135598182678, + "step": 189 + }, + { + "epoch": 0.185546875, + "grad_norm": 0.4037880301475525, + "learning_rate": 0.00016388616290480864, + "loss": 0.5671911239624023, + "step": 190 + }, + { + "epoch": 0.1865234375, + "grad_norm": 0.3757816255092621, + "learning_rate": 0.0001636898920510304, + "loss": 0.39916592836380005, + "step": 191 + }, + { + "epoch": 0.1875, + "grad_norm": 0.4747844636440277, + "learning_rate": 0.00016349362119725224, + "loss": 0.9217299818992615, + "step": 192 + }, + { + "epoch": 0.1884765625, + "grad_norm": 0.42307209968566895, + "learning_rate": 0.000163297350343474, + "loss": 0.8852982521057129, + "step": 193 + }, + { + "epoch": 0.189453125, + "grad_norm": 0.47294488549232483, + "learning_rate": 0.0001631010794896958, + "loss": 1.0635476112365723, + "step": 194 + }, + { + "epoch": 0.1904296875, + "grad_norm": 0.3519342243671417, + "learning_rate": 0.00016290480863591757, + "loss": 0.33460623025894165, + "step": 195 + }, + { + "epoch": 0.19140625, + "grad_norm": 0.418151319026947, + "learning_rate": 0.00016270853778213936, + "loss": 0.8776851296424866, + "step": 196 + }, + { + "epoch": 0.1923828125, + "grad_norm": 0.3954712152481079, + "learning_rate": 0.00016251226692836113, + "loss": 0.9358173608779907, + "step": 197 + }, + { + "epoch": 0.193359375, + "grad_norm": 0.35646897554397583, + "learning_rate": 0.00016231599607458293, + "loss": 0.43795716762542725, + "step": 198 + }, + { + "epoch": 0.1943359375, + "grad_norm": 0.41675063967704773, + "learning_rate": 0.00016211972522080473, + "loss": 0.8348654508590698, + "step": 199 + }, + { + "epoch": 0.1953125, + "grad_norm": 0.5800544023513794, + "learning_rate": 0.0001619234543670265, + "loss": 0.5580507516860962, + "step": 200 + }, + { + "epoch": 0.1962890625, + "grad_norm": 0.44925832748413086, + "learning_rate": 0.0001617271835132483, + "loss": 0.47444453835487366, + "step": 201 + }, + { + "epoch": 0.197265625, + "grad_norm": 0.48447439074516296, + "learning_rate": 0.00016153091265947006, + "loss": 0.5927308797836304, + "step": 202 + }, + { + "epoch": 0.1982421875, + "grad_norm": 0.37814846634864807, + "learning_rate": 0.00016133464180569186, + "loss": 0.8504298329353333, + "step": 203 + }, + { + "epoch": 0.19921875, + "grad_norm": 0.4171026051044464, + "learning_rate": 0.00016113837095191365, + "loss": 1.0796414613723755, + "step": 204 + }, + { + "epoch": 0.2001953125, + "grad_norm": 0.4570372402667999, + "learning_rate": 0.00016094210009813545, + "loss": 0.6229358315467834, + "step": 205 + }, + { + "epoch": 0.201171875, + "grad_norm": 0.6294324994087219, + "learning_rate": 0.00016074582924435722, + "loss": 0.8749011158943176, + "step": 206 + }, + { + "epoch": 0.2021484375, + "grad_norm": 0.42371129989624023, + "learning_rate": 0.000160549558390579, + "loss": 0.9866290092468262, + "step": 207 + }, + { + "epoch": 0.203125, + "grad_norm": 0.5329370498657227, + "learning_rate": 0.00016035328753680078, + "loss": 0.7568405270576477, + "step": 208 + }, + { + "epoch": 0.2041015625, + "grad_norm": 0.37205901741981506, + "learning_rate": 0.00016015701668302258, + "loss": 0.7115534543991089, + "step": 209 + }, + { + "epoch": 0.205078125, + "grad_norm": 0.4536517262458801, + "learning_rate": 0.00015996074582924437, + "loss": 0.5152509808540344, + "step": 210 + }, + { + "epoch": 0.2060546875, + "grad_norm": 2.319321393966675, + "learning_rate": 0.00015976447497546617, + "loss": 0.2915653586387634, + "step": 211 + }, + { + "epoch": 0.20703125, + "grad_norm": 0.7047526836395264, + "learning_rate": 0.00015956820412168794, + "loss": 0.3070187568664551, + "step": 212 + }, + { + "epoch": 0.2080078125, + "grad_norm": 0.6068500280380249, + "learning_rate": 0.0001593719332679097, + "loss": 0.8103427290916443, + "step": 213 + }, + { + "epoch": 0.208984375, + "grad_norm": 0.3588794469833374, + "learning_rate": 0.0001591756624141315, + "loss": 0.4655485153198242, + "step": 214 + }, + { + "epoch": 0.2099609375, + "grad_norm": 0.6561040878295898, + "learning_rate": 0.0001589793915603533, + "loss": 0.5353362560272217, + "step": 215 + }, + { + "epoch": 0.2109375, + "grad_norm": 0.6485084891319275, + "learning_rate": 0.0001587831207065751, + "loss": 0.8601769804954529, + "step": 216 + }, + { + "epoch": 0.2119140625, + "grad_norm": 0.4718208909034729, + "learning_rate": 0.00015858684985279686, + "loss": 0.6897189617156982, + "step": 217 + }, + { + "epoch": 0.212890625, + "grad_norm": 0.7453560829162598, + "learning_rate": 0.00015839057899901866, + "loss": 1.0387171506881714, + "step": 218 + }, + { + "epoch": 0.2138671875, + "grad_norm": 0.41157087683677673, + "learning_rate": 0.00015819430814524043, + "loss": 0.4910873770713806, + "step": 219 + }, + { + "epoch": 0.21484375, + "grad_norm": 0.4198990762233734, + "learning_rate": 0.00015799803729146222, + "loss": 0.588080108165741, + "step": 220 + }, + { + "epoch": 0.2158203125, + "grad_norm": 0.7791650295257568, + "learning_rate": 0.00015780176643768402, + "loss": 0.754984974861145, + "step": 221 + }, + { + "epoch": 0.216796875, + "grad_norm": 1.4430909156799316, + "learning_rate": 0.00015760549558390581, + "loss": 0.5313946008682251, + "step": 222 + }, + { + "epoch": 0.2177734375, + "grad_norm": 0.4399142861366272, + "learning_rate": 0.00015740922473012758, + "loss": 0.523280918598175, + "step": 223 + }, + { + "epoch": 0.21875, + "grad_norm": 0.4177611470222473, + "learning_rate": 0.00015721295387634938, + "loss": 0.7598159313201904, + "step": 224 + }, + { + "epoch": 0.2197265625, + "grad_norm": 0.4408816397190094, + "learning_rate": 0.00015701668302257115, + "loss": 0.8131666779518127, + "step": 225 + }, + { + "epoch": 0.220703125, + "grad_norm": 0.4228694438934326, + "learning_rate": 0.00015682041216879294, + "loss": 1.0456180572509766, + "step": 226 + }, + { + "epoch": 0.2216796875, + "grad_norm": 0.6313449144363403, + "learning_rate": 0.00015662414131501474, + "loss": 0.496864914894104, + "step": 227 + }, + { + "epoch": 0.22265625, + "grad_norm": 0.48103493452072144, + "learning_rate": 0.0001564278704612365, + "loss": 0.5967347621917725, + "step": 228 + }, + { + "epoch": 0.2236328125, + "grad_norm": 0.3548172116279602, + "learning_rate": 0.0001562315996074583, + "loss": 0.3325611650943756, + "step": 229 + }, + { + "epoch": 0.224609375, + "grad_norm": 0.41543763875961304, + "learning_rate": 0.00015603532875368007, + "loss": 0.9223452806472778, + "step": 230 + }, + { + "epoch": 0.2255859375, + "grad_norm": 0.6072061061859131, + "learning_rate": 0.00015583905789990187, + "loss": 0.2860236167907715, + "step": 231 + }, + { + "epoch": 0.2265625, + "grad_norm": 0.3232869505882263, + "learning_rate": 0.00015564278704612364, + "loss": 0.7308738231658936, + "step": 232 + }, + { + "epoch": 0.2275390625, + "grad_norm": 0.5271327495574951, + "learning_rate": 0.00015544651619234546, + "loss": 1.0354498624801636, + "step": 233 + }, + { + "epoch": 0.228515625, + "grad_norm": 0.626105546951294, + "learning_rate": 0.00015525024533856723, + "loss": 1.0841856002807617, + "step": 234 + }, + { + "epoch": 0.2294921875, + "grad_norm": 0.5628311634063721, + "learning_rate": 0.00015505397448478903, + "loss": 0.8868529200553894, + "step": 235 + }, + { + "epoch": 0.23046875, + "grad_norm": 0.4290577471256256, + "learning_rate": 0.0001548577036310108, + "loss": 0.5887943506240845, + "step": 236 + }, + { + "epoch": 0.2314453125, + "grad_norm": 0.743786096572876, + "learning_rate": 0.0001546614327772326, + "loss": 0.8314348459243774, + "step": 237 + }, + { + "epoch": 0.232421875, + "grad_norm": 0.34498658776283264, + "learning_rate": 0.00015446516192345436, + "loss": 0.6171099543571472, + "step": 238 + }, + { + "epoch": 0.2333984375, + "grad_norm": 0.7894997596740723, + "learning_rate": 0.00015426889106967616, + "loss": 0.614283561706543, + "step": 239 + }, + { + "epoch": 0.234375, + "grad_norm": 0.4631381034851074, + "learning_rate": 0.00015407262021589795, + "loss": 0.6744101047515869, + "step": 240 + }, + { + "epoch": 0.2353515625, + "grad_norm": 0.44523295760154724, + "learning_rate": 0.00015387634936211975, + "loss": 0.7094103097915649, + "step": 241 + }, + { + "epoch": 0.236328125, + "grad_norm": 0.7059242725372314, + "learning_rate": 0.00015368007850834152, + "loss": 0.6856737732887268, + "step": 242 + }, + { + "epoch": 0.2373046875, + "grad_norm": 1.0360506772994995, + "learning_rate": 0.00015348380765456329, + "loss": 1.101341962814331, + "step": 243 + }, + { + "epoch": 0.23828125, + "grad_norm": 0.6630859375, + "learning_rate": 0.00015328753680078508, + "loss": 0.8815068006515503, + "step": 244 + }, + { + "epoch": 0.2392578125, + "grad_norm": 0.4162105321884155, + "learning_rate": 0.00015309126594700688, + "loss": 0.39801689982414246, + "step": 245 + }, + { + "epoch": 0.240234375, + "grad_norm": 0.5786510109901428, + "learning_rate": 0.00015289499509322867, + "loss": 0.5399383902549744, + "step": 246 + }, + { + "epoch": 0.2412109375, + "grad_norm": 0.5430185794830322, + "learning_rate": 0.00015269872423945044, + "loss": 0.5432325601577759, + "step": 247 + }, + { + "epoch": 0.2421875, + "grad_norm": 0.3750382959842682, + "learning_rate": 0.00015250245338567224, + "loss": 0.49265092611312866, + "step": 248 + }, + { + "epoch": 0.2431640625, + "grad_norm": 0.5081580877304077, + "learning_rate": 0.000152306182531894, + "loss": 0.8720104098320007, + "step": 249 + }, + { + "epoch": 0.244140625, + "grad_norm": 0.5619673728942871, + "learning_rate": 0.0001521099116781158, + "loss": 0.4022529125213623, + "step": 250 + }, + { + "epoch": 0.2451171875, + "grad_norm": 0.3996225893497467, + "learning_rate": 0.0001519136408243376, + "loss": 0.443879097700119, + "step": 251 + }, + { + "epoch": 0.24609375, + "grad_norm": 0.4688915014266968, + "learning_rate": 0.0001517173699705594, + "loss": 0.47562721371650696, + "step": 252 + }, + { + "epoch": 0.2470703125, + "grad_norm": 1.7595641613006592, + "learning_rate": 0.00015152109911678116, + "loss": 0.5174474716186523, + "step": 253 + }, + { + "epoch": 0.248046875, + "grad_norm": 0.47813650965690613, + "learning_rate": 0.00015132482826300296, + "loss": 0.8565359711647034, + "step": 254 + }, + { + "epoch": 0.2490234375, + "grad_norm": 0.49612802267074585, + "learning_rate": 0.00015112855740922473, + "loss": 0.4736977815628052, + "step": 255 + }, + { + "epoch": 0.25, + "grad_norm": 0.4370449483394623, + "learning_rate": 0.00015093228655544652, + "loss": 0.7566809058189392, + "step": 256 + }, + { + "epoch": 0.2509765625, + "grad_norm": 0.43916988372802734, + "learning_rate": 0.00015073601570166832, + "loss": 0.8396226763725281, + "step": 257 + }, + { + "epoch": 0.251953125, + "grad_norm": 0.7745673060417175, + "learning_rate": 0.0001505397448478901, + "loss": 0.3085971772670746, + "step": 258 + }, + { + "epoch": 0.2529296875, + "grad_norm": 0.4097643792629242, + "learning_rate": 0.00015034347399411188, + "loss": 0.2730502188205719, + "step": 259 + }, + { + "epoch": 0.25390625, + "grad_norm": 0.4131183624267578, + "learning_rate": 0.00015014720314033365, + "loss": 0.5422588586807251, + "step": 260 + }, + { + "epoch": 0.2548828125, + "grad_norm": 0.469498872756958, + "learning_rate": 0.00014995093228655545, + "loss": 0.6572885513305664, + "step": 261 + }, + { + "epoch": 0.255859375, + "grad_norm": 0.3662133514881134, + "learning_rate": 0.00014975466143277725, + "loss": 0.9272421598434448, + "step": 262 + }, + { + "epoch": 0.2568359375, + "grad_norm": 0.38194844126701355, + "learning_rate": 0.00014955839057899904, + "loss": 0.6010634303092957, + "step": 263 + }, + { + "epoch": 0.2578125, + "grad_norm": 0.3645467758178711, + "learning_rate": 0.0001493621197252208, + "loss": 0.9131143093109131, + "step": 264 + }, + { + "epoch": 0.2587890625, + "grad_norm": 0.3304290771484375, + "learning_rate": 0.0001491658488714426, + "loss": 0.4593530297279358, + "step": 265 + }, + { + "epoch": 0.259765625, + "grad_norm": 0.7529020309448242, + "learning_rate": 0.00014896957801766437, + "loss": 0.5219628810882568, + "step": 266 + }, + { + "epoch": 0.2607421875, + "grad_norm": 0.4974548816680908, + "learning_rate": 0.00014877330716388617, + "loss": 0.7617945075035095, + "step": 267 + }, + { + "epoch": 0.26171875, + "grad_norm": 0.28884655237197876, + "learning_rate": 0.00014857703631010797, + "loss": 0.4288986921310425, + "step": 268 + }, + { + "epoch": 0.2626953125, + "grad_norm": 0.5195730328559875, + "learning_rate": 0.00014838076545632976, + "loss": 0.830593466758728, + "step": 269 + }, + { + "epoch": 0.263671875, + "grad_norm": 0.40689924359321594, + "learning_rate": 0.00014818449460255153, + "loss": 0.7528857588768005, + "step": 270 + }, + { + "epoch": 0.2646484375, + "grad_norm": 0.33955928683280945, + "learning_rate": 0.00014798822374877333, + "loss": 0.5274187326431274, + "step": 271 + }, + { + "epoch": 0.265625, + "grad_norm": 1.0572726726531982, + "learning_rate": 0.0001477919528949951, + "loss": 0.7389089465141296, + "step": 272 + }, + { + "epoch": 0.2666015625, + "grad_norm": 0.5191348791122437, + "learning_rate": 0.00014759568204121686, + "loss": 0.4842514991760254, + "step": 273 + }, + { + "epoch": 0.267578125, + "grad_norm": 0.3779315650463104, + "learning_rate": 0.00014739941118743866, + "loss": 0.7406666278839111, + "step": 274 + }, + { + "epoch": 0.2685546875, + "grad_norm": 0.6065999865531921, + "learning_rate": 0.00014720314033366046, + "loss": 0.6771246790885925, + "step": 275 + }, + { + "epoch": 0.26953125, + "grad_norm": 0.537529468536377, + "learning_rate": 0.00014700686947988225, + "loss": 0.861257791519165, + "step": 276 + }, + { + "epoch": 0.2705078125, + "grad_norm": 0.3961732089519501, + "learning_rate": 0.00014681059862610402, + "loss": 0.9672999382019043, + "step": 277 + }, + { + "epoch": 0.271484375, + "grad_norm": 0.45974740386009216, + "learning_rate": 0.00014661432777232582, + "loss": 0.5789016485214233, + "step": 278 + }, + { + "epoch": 0.2724609375, + "grad_norm": 0.7211292386054993, + "learning_rate": 0.00014641805691854759, + "loss": 0.867314338684082, + "step": 279 + }, + { + "epoch": 0.2734375, + "grad_norm": 0.6938930749893188, + "learning_rate": 0.00014622178606476938, + "loss": 0.4570122957229614, + "step": 280 + }, + { + "epoch": 0.2744140625, + "grad_norm": 0.5093329548835754, + "learning_rate": 0.00014602551521099118, + "loss": 0.9487482309341431, + "step": 281 + }, + { + "epoch": 0.275390625, + "grad_norm": 0.4403358995914459, + "learning_rate": 0.00014582924435721297, + "loss": 0.5330759286880493, + "step": 282 + }, + { + "epoch": 0.2763671875, + "grad_norm": 0.5305198431015015, + "learning_rate": 0.00014563297350343474, + "loss": 0.8727459907531738, + "step": 283 + }, + { + "epoch": 0.27734375, + "grad_norm": 0.49577099084854126, + "learning_rate": 0.00014543670264965654, + "loss": 0.6166709065437317, + "step": 284 + }, + { + "epoch": 0.2783203125, + "grad_norm": 0.4856763780117035, + "learning_rate": 0.0001452404317958783, + "loss": 0.920722484588623, + "step": 285 + }, + { + "epoch": 0.279296875, + "grad_norm": 0.3397112786769867, + "learning_rate": 0.0001450441609421001, + "loss": 1.001542329788208, + "step": 286 + }, + { + "epoch": 0.2802734375, + "grad_norm": 0.591691792011261, + "learning_rate": 0.0001448478900883219, + "loss": 0.4898494780063629, + "step": 287 + }, + { + "epoch": 0.28125, + "grad_norm": 0.45293164253234863, + "learning_rate": 0.00014465161923454367, + "loss": 0.4958389401435852, + "step": 288 + }, + { + "epoch": 0.2822265625, + "grad_norm": 0.38414305448532104, + "learning_rate": 0.00014445534838076546, + "loss": 0.3971215784549713, + "step": 289 + }, + { + "epoch": 0.283203125, + "grad_norm": 0.5568608045578003, + "learning_rate": 0.00014425907752698723, + "loss": 0.7953230142593384, + "step": 290 + }, + { + "epoch": 0.2841796875, + "grad_norm": 0.3680984377861023, + "learning_rate": 0.00014406280667320903, + "loss": 0.703729510307312, + "step": 291 + }, + { + "epoch": 0.28515625, + "grad_norm": 0.4263870120048523, + "learning_rate": 0.00014386653581943082, + "loss": 0.7433100938796997, + "step": 292 + }, + { + "epoch": 0.2861328125, + "grad_norm": 1.3262213468551636, + "learning_rate": 0.00014367026496565262, + "loss": 0.8011248111724854, + "step": 293 + }, + { + "epoch": 0.287109375, + "grad_norm": 0.44766104221343994, + "learning_rate": 0.0001434739941118744, + "loss": 0.6682827472686768, + "step": 294 + }, + { + "epoch": 0.2880859375, + "grad_norm": 0.7399169206619263, + "learning_rate": 0.00014327772325809619, + "loss": 0.8356127142906189, + "step": 295 + }, + { + "epoch": 0.2890625, + "grad_norm": 0.3582242727279663, + "learning_rate": 0.00014308145240431795, + "loss": 0.7127545475959778, + "step": 296 + }, + { + "epoch": 0.2900390625, + "grad_norm": 0.5251145958900452, + "learning_rate": 0.00014288518155053975, + "loss": 0.7467620968818665, + "step": 297 + }, + { + "epoch": 0.291015625, + "grad_norm": 0.639377772808075, + "learning_rate": 0.00014268891069676155, + "loss": 0.434887170791626, + "step": 298 + }, + { + "epoch": 0.2919921875, + "grad_norm": 0.5007404685020447, + "learning_rate": 0.00014249263984298334, + "loss": 1.028229832649231, + "step": 299 + }, + { + "epoch": 0.29296875, + "grad_norm": 0.41101035475730896, + "learning_rate": 0.0001422963689892051, + "loss": 0.8766242265701294, + "step": 300 + }, + { + "epoch": 0.2939453125, + "grad_norm": 0.3938690721988678, + "learning_rate": 0.0001421000981354269, + "loss": 0.7176960706710815, + "step": 301 + }, + { + "epoch": 0.294921875, + "grad_norm": 0.5939344763755798, + "learning_rate": 0.00014190382728164868, + "loss": 0.6655953526496887, + "step": 302 + }, + { + "epoch": 0.2958984375, + "grad_norm": 0.47224998474121094, + "learning_rate": 0.00014170755642787047, + "loss": 0.9155608415603638, + "step": 303 + }, + { + "epoch": 0.296875, + "grad_norm": 0.41344454884529114, + "learning_rate": 0.00014151128557409227, + "loss": 0.6017557382583618, + "step": 304 + }, + { + "epoch": 0.2978515625, + "grad_norm": 0.514320969581604, + "learning_rate": 0.00014131501472031404, + "loss": 0.6184566617012024, + "step": 305 + }, + { + "epoch": 0.298828125, + "grad_norm": 0.5005887150764465, + "learning_rate": 0.00014111874386653583, + "loss": 0.6652892231941223, + "step": 306 + }, + { + "epoch": 0.2998046875, + "grad_norm": 0.5872619152069092, + "learning_rate": 0.0001409224730127576, + "loss": 0.8618959784507751, + "step": 307 + }, + { + "epoch": 0.30078125, + "grad_norm": 0.5114542245864868, + "learning_rate": 0.0001407262021589794, + "loss": 0.6637990474700928, + "step": 308 + }, + { + "epoch": 0.3017578125, + "grad_norm": 1.141750693321228, + "learning_rate": 0.00014052993130520117, + "loss": 0.6234999299049377, + "step": 309 + }, + { + "epoch": 0.302734375, + "grad_norm": 0.4786873459815979, + "learning_rate": 0.000140333660451423, + "loss": 0.9601540565490723, + "step": 310 + }, + { + "epoch": 0.3037109375, + "grad_norm": 0.6048462390899658, + "learning_rate": 0.00014013738959764476, + "loss": 0.5895652770996094, + "step": 311 + }, + { + "epoch": 0.3046875, + "grad_norm": 0.7435188889503479, + "learning_rate": 0.00013994111874386655, + "loss": 1.196149468421936, + "step": 312 + }, + { + "epoch": 0.3056640625, + "grad_norm": 0.7936303019523621, + "learning_rate": 0.00013974484789008832, + "loss": 0.6073983907699585, + "step": 313 + }, + { + "epoch": 0.306640625, + "grad_norm": 0.5199156403541565, + "learning_rate": 0.00013954857703631012, + "loss": 0.2734944224357605, + "step": 314 + }, + { + "epoch": 0.3076171875, + "grad_norm": 0.38845276832580566, + "learning_rate": 0.0001393523061825319, + "loss": 0.604506254196167, + "step": 315 + }, + { + "epoch": 0.30859375, + "grad_norm": 0.6925122737884521, + "learning_rate": 0.0001391560353287537, + "loss": 1.0446012020111084, + "step": 316 + }, + { + "epoch": 0.3095703125, + "grad_norm": 0.4950433075428009, + "learning_rate": 0.00013895976447497548, + "loss": 1.027349591255188, + "step": 317 + }, + { + "epoch": 0.310546875, + "grad_norm": 0.36179935932159424, + "learning_rate": 0.00013876349362119725, + "loss": 0.6760075688362122, + "step": 318 + }, + { + "epoch": 0.3115234375, + "grad_norm": 0.3730153739452362, + "learning_rate": 0.00013856722276741904, + "loss": 0.47779884934425354, + "step": 319 + }, + { + "epoch": 0.3125, + "grad_norm": 0.6181739568710327, + "learning_rate": 0.0001383709519136408, + "loss": 0.4747524857521057, + "step": 320 + }, + { + "epoch": 0.3134765625, + "grad_norm": 0.8233240246772766, + "learning_rate": 0.0001381746810598626, + "loss": 0.490276575088501, + "step": 321 + }, + { + "epoch": 0.314453125, + "grad_norm": 0.6492604613304138, + "learning_rate": 0.0001379784102060844, + "loss": 0.44847172498703003, + "step": 322 + }, + { + "epoch": 0.3154296875, + "grad_norm": 0.5506369471549988, + "learning_rate": 0.0001377821393523062, + "loss": 0.47955968976020813, + "step": 323 + }, + { + "epoch": 0.31640625, + "grad_norm": 0.4187554717063904, + "learning_rate": 0.00013758586849852797, + "loss": 0.6466250419616699, + "step": 324 + }, + { + "epoch": 0.3173828125, + "grad_norm": 0.3976380527019501, + "learning_rate": 0.00013738959764474977, + "loss": 0.756473183631897, + "step": 325 + }, + { + "epoch": 0.318359375, + "grad_norm": 0.6089552044868469, + "learning_rate": 0.00013719332679097153, + "loss": 0.9309840202331543, + "step": 326 + }, + { + "epoch": 0.3193359375, + "grad_norm": 0.31628501415252686, + "learning_rate": 0.00013699705593719333, + "loss": 0.7739764451980591, + "step": 327 + }, + { + "epoch": 0.3203125, + "grad_norm": 0.6984357237815857, + "learning_rate": 0.00013680078508341513, + "loss": 1.0047030448913574, + "step": 328 + }, + { + "epoch": 0.3212890625, + "grad_norm": 0.42705219984054565, + "learning_rate": 0.00013660451422963692, + "loss": 0.5215034484863281, + "step": 329 + }, + { + "epoch": 0.322265625, + "grad_norm": 0.3548984229564667, + "learning_rate": 0.0001364082433758587, + "loss": 0.777184009552002, + "step": 330 + }, + { + "epoch": 0.3232421875, + "grad_norm": 0.6042805314064026, + "learning_rate": 0.00013621197252208046, + "loss": 0.469806432723999, + "step": 331 + }, + { + "epoch": 0.32421875, + "grad_norm": 0.43482446670532227, + "learning_rate": 0.00013601570166830226, + "loss": 0.8123322129249573, + "step": 332 + }, + { + "epoch": 0.3251953125, + "grad_norm": 0.4851783812046051, + "learning_rate": 0.00013581943081452405, + "loss": 1.1560527086257935, + "step": 333 + }, + { + "epoch": 0.326171875, + "grad_norm": 0.681423008441925, + "learning_rate": 0.00013562315996074585, + "loss": 0.5681013464927673, + "step": 334 + }, + { + "epoch": 0.3271484375, + "grad_norm": 0.43838411569595337, + "learning_rate": 0.00013542688910696762, + "loss": 0.8758999109268188, + "step": 335 + }, + { + "epoch": 0.328125, + "grad_norm": 0.5508302450180054, + "learning_rate": 0.0001352306182531894, + "loss": 0.7725740671157837, + "step": 336 + }, + { + "epoch": 0.3291015625, + "grad_norm": 0.2603519856929779, + "learning_rate": 0.00013503434739941118, + "loss": 0.357033908367157, + "step": 337 + }, + { + "epoch": 0.330078125, + "grad_norm": 0.38098394870758057, + "learning_rate": 0.00013483807654563298, + "loss": 0.41752922534942627, + "step": 338 + }, + { + "epoch": 0.3310546875, + "grad_norm": 0.5308575630187988, + "learning_rate": 0.00013464180569185477, + "loss": 0.6187021732330322, + "step": 339 + }, + { + "epoch": 0.33203125, + "grad_norm": 0.4033392369747162, + "learning_rate": 0.00013444553483807657, + "loss": 0.9481551647186279, + "step": 340 + }, + { + "epoch": 0.3330078125, + "grad_norm": 0.3999135494232178, + "learning_rate": 0.00013424926398429834, + "loss": 0.6853100657463074, + "step": 341 + }, + { + "epoch": 0.333984375, + "grad_norm": 0.4521353840827942, + "learning_rate": 0.00013405299313052013, + "loss": 1.0335659980773926, + "step": 342 + }, + { + "epoch": 0.3349609375, + "grad_norm": 0.3538281321525574, + "learning_rate": 0.0001338567222767419, + "loss": 0.821506142616272, + "step": 343 + }, + { + "epoch": 0.3359375, + "grad_norm": 0.49575889110565186, + "learning_rate": 0.0001336604514229637, + "loss": 0.6124354004859924, + "step": 344 + }, + { + "epoch": 0.3369140625, + "grad_norm": 0.37985700368881226, + "learning_rate": 0.0001334641805691855, + "loss": 0.6803320646286011, + "step": 345 + }, + { + "epoch": 0.337890625, + "grad_norm": 0.3533600866794586, + "learning_rate": 0.00013326790971540726, + "loss": 0.7260403037071228, + "step": 346 + }, + { + "epoch": 0.3388671875, + "grad_norm": 0.49213504791259766, + "learning_rate": 0.00013307163886162906, + "loss": 0.9051091074943542, + "step": 347 + }, + { + "epoch": 0.33984375, + "grad_norm": 0.37704166769981384, + "learning_rate": 0.00013287536800785083, + "loss": 0.4471222460269928, + "step": 348 + }, + { + "epoch": 0.3408203125, + "grad_norm": 0.4309573471546173, + "learning_rate": 0.00013267909715407262, + "loss": 0.749025285243988, + "step": 349 + }, + { + "epoch": 0.341796875, + "grad_norm": 0.7491689920425415, + "learning_rate": 0.0001324828263002944, + "loss": 1.1318167448043823, + "step": 350 + }, + { + "epoch": 0.3427734375, + "grad_norm": 0.3965498208999634, + "learning_rate": 0.00013228655544651622, + "loss": 0.8451839685440063, + "step": 351 + }, + { + "epoch": 0.34375, + "grad_norm": 0.4446418285369873, + "learning_rate": 0.00013209028459273798, + "loss": 0.7875360250473022, + "step": 352 + }, + { + "epoch": 0.3447265625, + "grad_norm": 0.3396705985069275, + "learning_rate": 0.00013189401373895978, + "loss": 0.8446518182754517, + "step": 353 + }, + { + "epoch": 0.345703125, + "grad_norm": 0.3436250388622284, + "learning_rate": 0.00013169774288518155, + "loss": 0.8995112180709839, + "step": 354 + }, + { + "epoch": 0.3466796875, + "grad_norm": 0.33643823862075806, + "learning_rate": 0.00013150147203140334, + "loss": 0.6253601312637329, + "step": 355 + }, + { + "epoch": 0.34765625, + "grad_norm": 0.39978718757629395, + "learning_rate": 0.0001313052011776251, + "loss": 0.31882500648498535, + "step": 356 + }, + { + "epoch": 0.3486328125, + "grad_norm": 0.3054925799369812, + "learning_rate": 0.00013110893032384694, + "loss": 0.3698769807815552, + "step": 357 + }, + { + "epoch": 0.349609375, + "grad_norm": 0.3789948523044586, + "learning_rate": 0.0001309126594700687, + "loss": 0.9039162397384644, + "step": 358 + }, + { + "epoch": 0.3505859375, + "grad_norm": 0.4192582964897156, + "learning_rate": 0.0001307163886162905, + "loss": 0.7852678298950195, + "step": 359 + }, + { + "epoch": 0.3515625, + "grad_norm": 0.5130710601806641, + "learning_rate": 0.00013052011776251227, + "loss": 0.7745686769485474, + "step": 360 + }, + { + "epoch": 0.3525390625, + "grad_norm": 0.39334234595298767, + "learning_rate": 0.00013032384690873404, + "loss": 0.7644802331924438, + "step": 361 + }, + { + "epoch": 0.353515625, + "grad_norm": 0.6141180992126465, + "learning_rate": 0.00013012757605495583, + "loss": 0.6028044819831848, + "step": 362 + }, + { + "epoch": 0.3544921875, + "grad_norm": 0.33263200521469116, + "learning_rate": 0.00012993130520117763, + "loss": 0.6908546090126038, + "step": 363 + }, + { + "epoch": 0.35546875, + "grad_norm": 0.3901807367801666, + "learning_rate": 0.00012973503434739943, + "loss": 0.8896909952163696, + "step": 364 + }, + { + "epoch": 0.3564453125, + "grad_norm": 0.3889808654785156, + "learning_rate": 0.0001295387634936212, + "loss": 0.622492790222168, + "step": 365 + }, + { + "epoch": 0.357421875, + "grad_norm": 0.41004979610443115, + "learning_rate": 0.000129342492639843, + "loss": 0.6293104887008667, + "step": 366 + }, + { + "epoch": 0.3583984375, + "grad_norm": 0.32929369807243347, + "learning_rate": 0.00012914622178606476, + "loss": 0.7049382925033569, + "step": 367 + }, + { + "epoch": 0.359375, + "grad_norm": 0.5189999341964722, + "learning_rate": 0.00012894995093228656, + "loss": 0.9230547547340393, + "step": 368 + }, + { + "epoch": 0.3603515625, + "grad_norm": 0.290991872549057, + "learning_rate": 0.00012875368007850835, + "loss": 0.5716772079467773, + "step": 369 + }, + { + "epoch": 0.361328125, + "grad_norm": 0.3976893126964569, + "learning_rate": 0.00012855740922473015, + "loss": 0.4593455493450165, + "step": 370 + }, + { + "epoch": 0.3623046875, + "grad_norm": 0.38385459780693054, + "learning_rate": 0.00012836113837095192, + "loss": 0.4766542315483093, + "step": 371 + }, + { + "epoch": 0.36328125, + "grad_norm": 0.45652449131011963, + "learning_rate": 0.0001281648675171737, + "loss": 0.9292062520980835, + "step": 372 + }, + { + "epoch": 0.3642578125, + "grad_norm": 0.384463906288147, + "learning_rate": 0.00012796859666339548, + "loss": 0.7896109223365784, + "step": 373 + }, + { + "epoch": 0.365234375, + "grad_norm": 0.43412724137306213, + "learning_rate": 0.00012777232580961728, + "loss": 0.6185650825500488, + "step": 374 + }, + { + "epoch": 0.3662109375, + "grad_norm": 0.4574507772922516, + "learning_rate": 0.00012757605495583907, + "loss": 0.5614027380943298, + "step": 375 + }, + { + "epoch": 0.3671875, + "grad_norm": 0.2921536862850189, + "learning_rate": 0.00012737978410206084, + "loss": 0.26786333322525024, + "step": 376 + }, + { + "epoch": 0.3681640625, + "grad_norm": 0.5887529850006104, + "learning_rate": 0.00012718351324828264, + "loss": 0.4167410433292389, + "step": 377 + }, + { + "epoch": 0.369140625, + "grad_norm": 0.3651127815246582, + "learning_rate": 0.0001269872423945044, + "loss": 1.0140016078948975, + "step": 378 + }, + { + "epoch": 0.3701171875, + "grad_norm": 0.47206228971481323, + "learning_rate": 0.0001267909715407262, + "loss": 0.8293377757072449, + "step": 379 + }, + { + "epoch": 0.37109375, + "grad_norm": 0.6319689154624939, + "learning_rate": 0.000126594700686948, + "loss": 0.7301446795463562, + "step": 380 + }, + { + "epoch": 0.3720703125, + "grad_norm": 0.5163951516151428, + "learning_rate": 0.0001263984298331698, + "loss": 0.9944421648979187, + "step": 381 + }, + { + "epoch": 0.373046875, + "grad_norm": 0.519072949886322, + "learning_rate": 0.00012620215897939156, + "loss": 0.6176541447639465, + "step": 382 + }, + { + "epoch": 0.3740234375, + "grad_norm": 3.0750813484191895, + "learning_rate": 0.00012600588812561336, + "loss": 0.7531320452690125, + "step": 383 + }, + { + "epoch": 0.375, + "grad_norm": 0.3246331512928009, + "learning_rate": 0.00012580961727183513, + "loss": 0.3269459903240204, + "step": 384 + }, + { + "epoch": 0.3759765625, + "grad_norm": 1.1105197668075562, + "learning_rate": 0.00012561334641805692, + "loss": 0.4228656589984894, + "step": 385 + }, + { + "epoch": 0.376953125, + "grad_norm": 0.6776182055473328, + "learning_rate": 0.00012541707556427872, + "loss": 0.791953980922699, + "step": 386 + }, + { + "epoch": 0.3779296875, + "grad_norm": 0.4413786828517914, + "learning_rate": 0.00012522080471050052, + "loss": 0.7953442335128784, + "step": 387 + }, + { + "epoch": 0.37890625, + "grad_norm": 0.4036264419555664, + "learning_rate": 0.00012502453385672228, + "loss": 0.6062744855880737, + "step": 388 + }, + { + "epoch": 0.3798828125, + "grad_norm": 1.0638166666030884, + "learning_rate": 0.00012482826300294408, + "loss": 1.0578093528747559, + "step": 389 + }, + { + "epoch": 0.380859375, + "grad_norm": 0.2518276572227478, + "learning_rate": 0.00012463199214916585, + "loss": 0.5070685148239136, + "step": 390 + }, + { + "epoch": 0.3818359375, + "grad_norm": 0.3338214159011841, + "learning_rate": 0.00012443572129538762, + "loss": 0.7665579915046692, + "step": 391 + }, + { + "epoch": 0.3828125, + "grad_norm": 0.4730507433414459, + "learning_rate": 0.00012423945044160944, + "loss": 0.48353517055511475, + "step": 392 + }, + { + "epoch": 0.3837890625, + "grad_norm": 0.3488924503326416, + "learning_rate": 0.0001240431795878312, + "loss": 0.4422420561313629, + "step": 393 + }, + { + "epoch": 0.384765625, + "grad_norm": 0.2397361695766449, + "learning_rate": 0.000123846908734053, + "loss": 0.7025644183158875, + "step": 394 + }, + { + "epoch": 0.3857421875, + "grad_norm": 0.3638167679309845, + "learning_rate": 0.00012365063788027478, + "loss": 0.5372107625007629, + "step": 395 + }, + { + "epoch": 0.38671875, + "grad_norm": 0.4088346064090729, + "learning_rate": 0.00012345436702649657, + "loss": 0.7636011838912964, + "step": 396 + }, + { + "epoch": 0.3876953125, + "grad_norm": 0.36985111236572266, + "learning_rate": 0.00012325809617271834, + "loss": 0.6720612645149231, + "step": 397 + }, + { + "epoch": 0.388671875, + "grad_norm": 0.37556055188179016, + "learning_rate": 0.00012306182531894016, + "loss": 0.8087592124938965, + "step": 398 + }, + { + "epoch": 0.3896484375, + "grad_norm": 0.6851724982261658, + "learning_rate": 0.00012286555446516193, + "loss": 0.780835747718811, + "step": 399 + }, + { + "epoch": 0.390625, + "grad_norm": 0.3453989326953888, + "learning_rate": 0.00012266928361138373, + "loss": 0.8235517740249634, + "step": 400 + }, + { + "epoch": 0.3916015625, + "grad_norm": 0.43622198700904846, + "learning_rate": 0.0001224730127576055, + "loss": 0.3758167028427124, + "step": 401 + }, + { + "epoch": 0.392578125, + "grad_norm": 0.4364018142223358, + "learning_rate": 0.0001222767419038273, + "loss": 0.7123017907142639, + "step": 402 + }, + { + "epoch": 0.3935546875, + "grad_norm": 0.24169716238975525, + "learning_rate": 0.00012208047105004906, + "loss": 0.48390328884124756, + "step": 403 + }, + { + "epoch": 0.39453125, + "grad_norm": 3.4902851581573486, + "learning_rate": 0.00012188420019627087, + "loss": 0.8519951105117798, + "step": 404 + }, + { + "epoch": 0.3955078125, + "grad_norm": 0.8332751989364624, + "learning_rate": 0.00012168792934249264, + "loss": 0.7562370896339417, + "step": 405 + }, + { + "epoch": 0.396484375, + "grad_norm": 0.3582589030265808, + "learning_rate": 0.00012149165848871442, + "loss": 0.3723471164703369, + "step": 406 + }, + { + "epoch": 0.3974609375, + "grad_norm": 0.48302146792411804, + "learning_rate": 0.00012129538763493622, + "loss": 1.0008171796798706, + "step": 407 + }, + { + "epoch": 0.3984375, + "grad_norm": 0.3510138988494873, + "learning_rate": 0.000121099116781158, + "loss": 0.30772703886032104, + "step": 408 + }, + { + "epoch": 0.3994140625, + "grad_norm": 0.2771015763282776, + "learning_rate": 0.0001209028459273798, + "loss": 0.4403090178966522, + "step": 409 + }, + { + "epoch": 0.400390625, + "grad_norm": 0.42239415645599365, + "learning_rate": 0.00012070657507360156, + "loss": 0.5451241731643677, + "step": 410 + }, + { + "epoch": 0.4013671875, + "grad_norm": 0.27876874804496765, + "learning_rate": 0.00012051030421982336, + "loss": 0.3590753972530365, + "step": 411 + }, + { + "epoch": 0.40234375, + "grad_norm": 0.42854824662208557, + "learning_rate": 0.00012031403336604514, + "loss": 1.0192680358886719, + "step": 412 + }, + { + "epoch": 0.4033203125, + "grad_norm": 0.32980695366859436, + "learning_rate": 0.00012011776251226694, + "loss": 0.6476566195487976, + "step": 413 + }, + { + "epoch": 0.404296875, + "grad_norm": 0.45046037435531616, + "learning_rate": 0.00011992149165848872, + "loss": 0.9548048973083496, + "step": 414 + }, + { + "epoch": 0.4052734375, + "grad_norm": 0.4176082909107208, + "learning_rate": 0.00011972522080471052, + "loss": 0.3793225586414337, + "step": 415 + }, + { + "epoch": 0.40625, + "grad_norm": 0.335823118686676, + "learning_rate": 0.00011952894995093229, + "loss": 0.5807560086250305, + "step": 416 + }, + { + "epoch": 0.4072265625, + "grad_norm": 0.4758591651916504, + "learning_rate": 0.00011933267909715408, + "loss": 0.3924551010131836, + "step": 417 + }, + { + "epoch": 0.408203125, + "grad_norm": 0.21527709066867828, + "learning_rate": 0.00011913640824337586, + "loss": 0.1651245653629303, + "step": 418 + }, + { + "epoch": 0.4091796875, + "grad_norm": 0.31255391240119934, + "learning_rate": 0.00011894013738959766, + "loss": 0.6133516430854797, + "step": 419 + }, + { + "epoch": 0.41015625, + "grad_norm": 0.40668365359306335, + "learning_rate": 0.00011874386653581944, + "loss": 0.894720196723938, + "step": 420 + }, + { + "epoch": 0.4111328125, + "grad_norm": 0.35574087500572205, + "learning_rate": 0.00011854759568204121, + "loss": 0.9017484188079834, + "step": 421 + }, + { + "epoch": 0.412109375, + "grad_norm": 0.3389612138271332, + "learning_rate": 0.00011835132482826301, + "loss": 0.7961660623550415, + "step": 422 + }, + { + "epoch": 0.4130859375, + "grad_norm": 0.8334202766418457, + "learning_rate": 0.00011815505397448479, + "loss": 0.8654063940048218, + "step": 423 + }, + { + "epoch": 0.4140625, + "grad_norm": 0.5917571187019348, + "learning_rate": 0.00011795878312070659, + "loss": 0.631730318069458, + "step": 424 + }, + { + "epoch": 0.4150390625, + "grad_norm": 0.4908443093299866, + "learning_rate": 0.00011776251226692835, + "loss": 0.3205869495868683, + "step": 425 + }, + { + "epoch": 0.416015625, + "grad_norm": 0.8349789381027222, + "learning_rate": 0.00011756624141315016, + "loss": 0.8526176810264587, + "step": 426 + }, + { + "epoch": 0.4169921875, + "grad_norm": 0.38712671399116516, + "learning_rate": 0.00011736997055937193, + "loss": 0.6580482125282288, + "step": 427 + }, + { + "epoch": 0.41796875, + "grad_norm": 0.766034722328186, + "learning_rate": 0.00011717369970559373, + "loss": 0.5494309663772583, + "step": 428 + }, + { + "epoch": 0.4189453125, + "grad_norm": 0.33322349190711975, + "learning_rate": 0.00011697742885181551, + "loss": 0.38351887464523315, + "step": 429 + }, + { + "epoch": 0.419921875, + "grad_norm": 0.411155641078949, + "learning_rate": 0.00011678115799803731, + "loss": 0.8139836192131042, + "step": 430 + }, + { + "epoch": 0.4208984375, + "grad_norm": 0.5857217907905579, + "learning_rate": 0.00011658488714425908, + "loss": 0.5668150186538696, + "step": 431 + }, + { + "epoch": 0.421875, + "grad_norm": 0.8849710822105408, + "learning_rate": 0.00011638861629048087, + "loss": 0.5478008985519409, + "step": 432 + }, + { + "epoch": 0.4228515625, + "grad_norm": 0.6771020293235779, + "learning_rate": 0.00011619234543670265, + "loss": 0.608709454536438, + "step": 433 + }, + { + "epoch": 0.423828125, + "grad_norm": 0.30138713121414185, + "learning_rate": 0.00011599607458292445, + "loss": 0.8240669369697571, + "step": 434 + }, + { + "epoch": 0.4248046875, + "grad_norm": 0.3273598253726959, + "learning_rate": 0.00011579980372914623, + "loss": 0.6287229657173157, + "step": 435 + }, + { + "epoch": 0.42578125, + "grad_norm": 0.5044806003570557, + "learning_rate": 0.000115603532875368, + "loss": 0.735835075378418, + "step": 436 + }, + { + "epoch": 0.4267578125, + "grad_norm": 0.34495776891708374, + "learning_rate": 0.0001154072620215898, + "loss": 0.7688421010971069, + "step": 437 + }, + { + "epoch": 0.427734375, + "grad_norm": 0.41923069953918457, + "learning_rate": 0.00011521099116781158, + "loss": 0.679617166519165, + "step": 438 + }, + { + "epoch": 0.4287109375, + "grad_norm": 0.3509843945503235, + "learning_rate": 0.00011501472031403338, + "loss": 0.7478575110435486, + "step": 439 + }, + { + "epoch": 0.4296875, + "grad_norm": 0.4758707582950592, + "learning_rate": 0.00011481844946025514, + "loss": 0.48871147632598877, + "step": 440 + }, + { + "epoch": 0.4306640625, + "grad_norm": 0.30272597074508667, + "learning_rate": 0.00011462217860647695, + "loss": 0.4311315715312958, + "step": 441 + }, + { + "epoch": 0.431640625, + "grad_norm": 0.5226417779922485, + "learning_rate": 0.00011442590775269872, + "loss": 0.8198300004005432, + "step": 442 + }, + { + "epoch": 0.4326171875, + "grad_norm": 0.41183850169181824, + "learning_rate": 0.00011422963689892052, + "loss": 0.9958367347717285, + "step": 443 + }, + { + "epoch": 0.43359375, + "grad_norm": 0.384048193693161, + "learning_rate": 0.0001140333660451423, + "loss": 0.3194778859615326, + "step": 444 + }, + { + "epoch": 0.4345703125, + "grad_norm": 0.5035115480422974, + "learning_rate": 0.0001138370951913641, + "loss": 0.6455928683280945, + "step": 445 + }, + { + "epoch": 0.435546875, + "grad_norm": 0.4875551462173462, + "learning_rate": 0.00011364082433758587, + "loss": 0.799978494644165, + "step": 446 + }, + { + "epoch": 0.4365234375, + "grad_norm": 0.3395763337612152, + "learning_rate": 0.00011344455348380768, + "loss": 0.47672414779663086, + "step": 447 + }, + { + "epoch": 0.4375, + "grad_norm": 0.5594314932823181, + "learning_rate": 0.00011324828263002944, + "loss": 0.4325803518295288, + "step": 448 + }, + { + "epoch": 0.4384765625, + "grad_norm": 0.44647228717803955, + "learning_rate": 0.00011305201177625124, + "loss": 0.8119433522224426, + "step": 449 + }, + { + "epoch": 0.439453125, + "grad_norm": 0.3190518915653229, + "learning_rate": 0.00011285574092247302, + "loss": 0.4949466288089752, + "step": 450 + }, + { + "epoch": 0.4404296875, + "grad_norm": 0.5943452715873718, + "learning_rate": 0.00011265947006869479, + "loss": 0.8245764374732971, + "step": 451 + }, + { + "epoch": 0.44140625, + "grad_norm": 0.8067309260368347, + "learning_rate": 0.00011246319921491659, + "loss": 0.39331740140914917, + "step": 452 + }, + { + "epoch": 0.4423828125, + "grad_norm": 0.4130857288837433, + "learning_rate": 0.00011226692836113837, + "loss": 1.0005946159362793, + "step": 453 + }, + { + "epoch": 0.443359375, + "grad_norm": 0.6839224100112915, + "learning_rate": 0.00011207065750736017, + "loss": 0.453269362449646, + "step": 454 + }, + { + "epoch": 0.4443359375, + "grad_norm": 0.6282085180282593, + "learning_rate": 0.00011187438665358195, + "loss": 0.7137607932090759, + "step": 455 + }, + { + "epoch": 0.4453125, + "grad_norm": 0.49894508719444275, + "learning_rate": 0.00011167811579980374, + "loss": 0.6289803981781006, + "step": 456 + }, + { + "epoch": 0.4462890625, + "grad_norm": 0.3570895493030548, + "learning_rate": 0.00011148184494602551, + "loss": 0.3711976110935211, + "step": 457 + }, + { + "epoch": 0.447265625, + "grad_norm": 0.28931114077568054, + "learning_rate": 0.00011128557409224731, + "loss": 0.5629679560661316, + "step": 458 + }, + { + "epoch": 0.4482421875, + "grad_norm": 1.2492791414260864, + "learning_rate": 0.00011108930323846909, + "loss": 0.5821082592010498, + "step": 459 + }, + { + "epoch": 0.44921875, + "grad_norm": 0.29861876368522644, + "learning_rate": 0.00011089303238469089, + "loss": 0.4129573106765747, + "step": 460 + }, + { + "epoch": 0.4501953125, + "grad_norm": 0.5244950652122498, + "learning_rate": 0.00011069676153091267, + "loss": 0.8300201296806335, + "step": 461 + }, + { + "epoch": 0.451171875, + "grad_norm": 0.446435809135437, + "learning_rate": 0.00011050049067713446, + "loss": 0.7500958442687988, + "step": 462 + }, + { + "epoch": 0.4521484375, + "grad_norm": 0.4531306028366089, + "learning_rate": 0.00011030421982335623, + "loss": 0.8492609262466431, + "step": 463 + }, + { + "epoch": 0.453125, + "grad_norm": 0.46944308280944824, + "learning_rate": 0.00011010794896957802, + "loss": 0.6209090948104858, + "step": 464 + }, + { + "epoch": 0.4541015625, + "grad_norm": 0.5465651154518127, + "learning_rate": 0.00010991167811579981, + "loss": 0.5176469087600708, + "step": 465 + }, + { + "epoch": 0.455078125, + "grad_norm": 0.36550402641296387, + "learning_rate": 0.00010971540726202158, + "loss": 0.6358295679092407, + "step": 466 + }, + { + "epoch": 0.4560546875, + "grad_norm": 0.48919910192489624, + "learning_rate": 0.00010951913640824338, + "loss": 0.5903019905090332, + "step": 467 + }, + { + "epoch": 0.45703125, + "grad_norm": 0.4378332793712616, + "learning_rate": 0.00010932286555446516, + "loss": 0.6710047721862793, + "step": 468 + }, + { + "epoch": 0.4580078125, + "grad_norm": 0.3095405101776123, + "learning_rate": 0.00010912659470068696, + "loss": 0.6787213683128357, + "step": 469 + }, + { + "epoch": 0.458984375, + "grad_norm": 0.40901967883110046, + "learning_rate": 0.00010893032384690874, + "loss": 0.6371384859085083, + "step": 470 + }, + { + "epoch": 0.4599609375, + "grad_norm": 0.3962486982345581, + "learning_rate": 0.00010873405299313053, + "loss": 0.5823498964309692, + "step": 471 + }, + { + "epoch": 0.4609375, + "grad_norm": 0.4094708263874054, + "learning_rate": 0.0001085377821393523, + "loss": 1.0396480560302734, + "step": 472 + }, + { + "epoch": 0.4619140625, + "grad_norm": 0.5117614269256592, + "learning_rate": 0.0001083415112855741, + "loss": 0.6320610642433167, + "step": 473 + }, + { + "epoch": 0.462890625, + "grad_norm": 0.28345227241516113, + "learning_rate": 0.00010814524043179588, + "loss": 0.33279290795326233, + "step": 474 + }, + { + "epoch": 0.4638671875, + "grad_norm": 0.5475791096687317, + "learning_rate": 0.00010794896957801768, + "loss": 0.359570175409317, + "step": 475 + }, + { + "epoch": 0.46484375, + "grad_norm": 0.44176843762397766, + "learning_rate": 0.00010775269872423946, + "loss": 0.7576714158058167, + "step": 476 + }, + { + "epoch": 0.4658203125, + "grad_norm": 0.473562628030777, + "learning_rate": 0.00010755642787046125, + "loss": 0.8758799433708191, + "step": 477 + }, + { + "epoch": 0.466796875, + "grad_norm": 0.41919219493865967, + "learning_rate": 0.00010736015701668302, + "loss": 0.863654375076294, + "step": 478 + }, + { + "epoch": 0.4677734375, + "grad_norm": 0.4215691089630127, + "learning_rate": 0.0001071638861629048, + "loss": 0.5004569292068481, + "step": 479 + }, + { + "epoch": 0.46875, + "grad_norm": 0.36801034212112427, + "learning_rate": 0.0001069676153091266, + "loss": 0.9330754280090332, + "step": 480 + }, + { + "epoch": 0.4697265625, + "grad_norm": 0.42489972710609436, + "learning_rate": 0.00010677134445534837, + "loss": 1.0529820919036865, + "step": 481 + }, + { + "epoch": 0.470703125, + "grad_norm": 0.4067368507385254, + "learning_rate": 0.00010657507360157018, + "loss": 0.5453970432281494, + "step": 482 + }, + { + "epoch": 0.4716796875, + "grad_norm": 0.28611162304878235, + "learning_rate": 0.00010637880274779195, + "loss": 0.2348572313785553, + "step": 483 + }, + { + "epoch": 0.47265625, + "grad_norm": 0.40047627687454224, + "learning_rate": 0.00010618253189401374, + "loss": 0.4776308834552765, + "step": 484 + }, + { + "epoch": 0.4736328125, + "grad_norm": 0.5168628692626953, + "learning_rate": 0.00010598626104023553, + "loss": 0.9922167062759399, + "step": 485 + }, + { + "epoch": 0.474609375, + "grad_norm": 0.3620246946811676, + "learning_rate": 0.00010578999018645732, + "loss": 0.7285036444664001, + "step": 486 + }, + { + "epoch": 0.4755859375, + "grad_norm": 0.42711782455444336, + "learning_rate": 0.00010559371933267909, + "loss": 0.6387231349945068, + "step": 487 + }, + { + "epoch": 0.4765625, + "grad_norm": 0.2139827311038971, + "learning_rate": 0.0001053974484789009, + "loss": 0.4295338988304138, + "step": 488 + }, + { + "epoch": 0.4775390625, + "grad_norm": 0.31191739439964294, + "learning_rate": 0.00010520117762512267, + "loss": 0.42860671877861023, + "step": 489 + }, + { + "epoch": 0.478515625, + "grad_norm": 0.2909379303455353, + "learning_rate": 0.00010500490677134447, + "loss": 0.47065097093582153, + "step": 490 + }, + { + "epoch": 0.4794921875, + "grad_norm": 0.48990437388420105, + "learning_rate": 0.00010480863591756625, + "loss": 0.8870656490325928, + "step": 491 + }, + { + "epoch": 0.48046875, + "grad_norm": 0.5662127733230591, + "learning_rate": 0.00010461236506378804, + "loss": 0.8007984161376953, + "step": 492 + }, + { + "epoch": 0.4814453125, + "grad_norm": 0.3656634986400604, + "learning_rate": 0.00010441609421000981, + "loss": 0.41389334201812744, + "step": 493 + }, + { + "epoch": 0.482421875, + "grad_norm": 0.39840465784072876, + "learning_rate": 0.0001042198233562316, + "loss": 0.6927056908607483, + "step": 494 + }, + { + "epoch": 0.4833984375, + "grad_norm": 0.641647219657898, + "learning_rate": 0.00010402355250245339, + "loss": 0.7912976145744324, + "step": 495 + }, + { + "epoch": 0.484375, + "grad_norm": 0.4522266685962677, + "learning_rate": 0.00010382728164867517, + "loss": 0.615374743938446, + "step": 496 + }, + { + "epoch": 0.4853515625, + "grad_norm": 0.415444016456604, + "learning_rate": 0.00010363101079489697, + "loss": 0.8559135794639587, + "step": 497 + }, + { + "epoch": 0.486328125, + "grad_norm": 0.4477578401565552, + "learning_rate": 0.00010343473994111874, + "loss": 0.6109384298324585, + "step": 498 + }, + { + "epoch": 0.4873046875, + "grad_norm": 0.33097633719444275, + "learning_rate": 0.00010323846908734053, + "loss": 0.6325762271881104, + "step": 499 + }, + { + "epoch": 0.48828125, + "grad_norm": 0.38771572709083557, + "learning_rate": 0.00010304219823356232, + "loss": 0.5979640483856201, + "step": 500 + }, + { + "epoch": 0.4892578125, + "grad_norm": 0.3339928984642029, + "learning_rate": 0.00010284592737978411, + "loss": 0.6619001626968384, + "step": 501 + }, + { + "epoch": 0.490234375, + "grad_norm": 0.6400135159492493, + "learning_rate": 0.00010264965652600588, + "loss": 0.28338727355003357, + "step": 502 + }, + { + "epoch": 0.4912109375, + "grad_norm": 0.35763970017433167, + "learning_rate": 0.00010245338567222769, + "loss": 0.6373124122619629, + "step": 503 + }, + { + "epoch": 0.4921875, + "grad_norm": 0.2136622965335846, + "learning_rate": 0.00010225711481844946, + "loss": 0.2315329760313034, + "step": 504 + }, + { + "epoch": 0.4931640625, + "grad_norm": 0.6324110627174377, + "learning_rate": 0.00010206084396467126, + "loss": 1.0045514106750488, + "step": 505 + }, + { + "epoch": 0.494140625, + "grad_norm": 0.4471307694911957, + "learning_rate": 0.00010186457311089304, + "loss": 0.5188390016555786, + "step": 506 + }, + { + "epoch": 0.4951171875, + "grad_norm": 0.38222211599349976, + "learning_rate": 0.00010166830225711483, + "loss": 0.7351740598678589, + "step": 507 + }, + { + "epoch": 0.49609375, + "grad_norm": 0.41885000467300415, + "learning_rate": 0.0001014720314033366, + "loss": 0.9071688055992126, + "step": 508 + }, + { + "epoch": 0.4970703125, + "grad_norm": 0.8193621635437012, + "learning_rate": 0.00010127576054955839, + "loss": 0.7240473031997681, + "step": 509 + }, + { + "epoch": 0.498046875, + "grad_norm": 0.2846645712852478, + "learning_rate": 0.00010107948969578018, + "loss": 0.351628839969635, + "step": 510 + }, + { + "epoch": 0.4990234375, + "grad_norm": 0.4778954088687897, + "learning_rate": 0.00010088321884200196, + "loss": 0.7705833911895752, + "step": 511 + }, + { + "epoch": 0.5, + "grad_norm": 0.3384702503681183, + "learning_rate": 0.00010068694798822376, + "loss": 0.5467265248298645, + "step": 512 + }, + { + "epoch": 0.5009765625, + "grad_norm": 0.43917056918144226, + "learning_rate": 0.00010049067713444553, + "loss": 0.9810686707496643, + "step": 513 + }, + { + "epoch": 0.501953125, + "grad_norm": 0.4351615607738495, + "learning_rate": 0.00010029440628066732, + "loss": 0.9716764688491821, + "step": 514 + }, + { + "epoch": 0.5029296875, + "grad_norm": 0.49873459339141846, + "learning_rate": 0.00010009813542688911, + "loss": 0.9183788299560547, + "step": 515 + }, + { + "epoch": 0.50390625, + "grad_norm": 0.36710789799690247, + "learning_rate": 9.990186457311089e-05, + "loss": 0.49884548783302307, + "step": 516 + }, + { + "epoch": 0.5048828125, + "grad_norm": 0.5402531623840332, + "learning_rate": 9.970559371933269e-05, + "loss": 0.6645570993423462, + "step": 517 + }, + { + "epoch": 0.505859375, + "grad_norm": 0.4990559220314026, + "learning_rate": 9.950932286555447e-05, + "loss": 1.0321924686431885, + "step": 518 + }, + { + "epoch": 0.5068359375, + "grad_norm": 0.4634752869606018, + "learning_rate": 9.931305201177625e-05, + "loss": 0.8484972715377808, + "step": 519 + }, + { + "epoch": 0.5078125, + "grad_norm": 0.38584330677986145, + "learning_rate": 9.911678115799805e-05, + "loss": 0.3424939513206482, + "step": 520 + }, + { + "epoch": 0.5087890625, + "grad_norm": 0.41148415207862854, + "learning_rate": 9.892051030421983e-05, + "loss": 0.7890703678131104, + "step": 521 + }, + { + "epoch": 0.509765625, + "grad_norm": 0.35891374945640564, + "learning_rate": 9.872423945044161e-05, + "loss": 0.7387750744819641, + "step": 522 + }, + { + "epoch": 0.5107421875, + "grad_norm": 0.4174203872680664, + "learning_rate": 9.85279685966634e-05, + "loss": 0.5610706806182861, + "step": 523 + }, + { + "epoch": 0.51171875, + "grad_norm": 0.4062010645866394, + "learning_rate": 9.833169774288519e-05, + "loss": 0.6016039252281189, + "step": 524 + }, + { + "epoch": 0.5126953125, + "grad_norm": 0.35915061831474304, + "learning_rate": 9.813542688910697e-05, + "loss": 0.37933990359306335, + "step": 525 + }, + { + "epoch": 0.513671875, + "grad_norm": 0.49826234579086304, + "learning_rate": 9.793915603532877e-05, + "loss": 0.9650976657867432, + "step": 526 + }, + { + "epoch": 0.5146484375, + "grad_norm": 0.4122180938720703, + "learning_rate": 9.774288518155055e-05, + "loss": 0.5477824211120605, + "step": 527 + }, + { + "epoch": 0.515625, + "grad_norm": 0.3824058175086975, + "learning_rate": 9.754661432777233e-05, + "loss": 0.5163108706474304, + "step": 528 + }, + { + "epoch": 0.5166015625, + "grad_norm": 0.4485555589199066, + "learning_rate": 9.735034347399413e-05, + "loss": 0.9402418732643127, + "step": 529 + }, + { + "epoch": 0.517578125, + "grad_norm": 0.4053209722042084, + "learning_rate": 9.715407262021591e-05, + "loss": 0.9314478039741516, + "step": 530 + }, + { + "epoch": 0.5185546875, + "grad_norm": 0.3183811604976654, + "learning_rate": 9.695780176643768e-05, + "loss": 0.6706205606460571, + "step": 531 + }, + { + "epoch": 0.51953125, + "grad_norm": 0.40083932876586914, + "learning_rate": 9.676153091265947e-05, + "loss": 1.102424144744873, + "step": 532 + }, + { + "epoch": 0.5205078125, + "grad_norm": 0.5949054956436157, + "learning_rate": 9.656526005888126e-05, + "loss": 0.8396608829498291, + "step": 533 + }, + { + "epoch": 0.521484375, + "grad_norm": 0.41966959834098816, + "learning_rate": 9.636898920510304e-05, + "loss": 0.5641101002693176, + "step": 534 + }, + { + "epoch": 0.5224609375, + "grad_norm": 0.448281466960907, + "learning_rate": 9.617271835132484e-05, + "loss": 0.44873932003974915, + "step": 535 + }, + { + "epoch": 0.5234375, + "grad_norm": 0.47785645723342896, + "learning_rate": 9.597644749754662e-05, + "loss": 0.8799008131027222, + "step": 536 + }, + { + "epoch": 0.5244140625, + "grad_norm": 0.45459261536598206, + "learning_rate": 9.57801766437684e-05, + "loss": 0.8261788487434387, + "step": 537 + }, + { + "epoch": 0.525390625, + "grad_norm": 0.6168074607849121, + "learning_rate": 9.55839057899902e-05, + "loss": 0.9762136936187744, + "step": 538 + }, + { + "epoch": 0.5263671875, + "grad_norm": 0.6500818133354187, + "learning_rate": 9.538763493621198e-05, + "loss": 0.9044640064239502, + "step": 539 + }, + { + "epoch": 0.52734375, + "grad_norm": 0.31668490171432495, + "learning_rate": 9.519136408243376e-05, + "loss": 0.42503029108047485, + "step": 540 + }, + { + "epoch": 0.5283203125, + "grad_norm": 0.4041314721107483, + "learning_rate": 9.499509322865556e-05, + "loss": 0.6643175482749939, + "step": 541 + }, + { + "epoch": 0.529296875, + "grad_norm": 1.011020541191101, + "learning_rate": 9.479882237487734e-05, + "loss": 0.7636033892631531, + "step": 542 + }, + { + "epoch": 0.5302734375, + "grad_norm": 0.3690396845340729, + "learning_rate": 9.460255152109912e-05, + "loss": 1.0516947507858276, + "step": 543 + }, + { + "epoch": 0.53125, + "grad_norm": 0.288604199886322, + "learning_rate": 9.440628066732092e-05, + "loss": 0.3806208372116089, + "step": 544 + }, + { + "epoch": 0.5322265625, + "grad_norm": 0.4247501790523529, + "learning_rate": 9.42100098135427e-05, + "loss": 0.8651745319366455, + "step": 545 + }, + { + "epoch": 0.533203125, + "grad_norm": 1.1893255710601807, + "learning_rate": 9.401373895976447e-05, + "loss": 0.28601521253585815, + "step": 546 + }, + { + "epoch": 0.5341796875, + "grad_norm": 0.3229619562625885, + "learning_rate": 9.381746810598626e-05, + "loss": 0.8316909670829773, + "step": 547 + }, + { + "epoch": 0.53515625, + "grad_norm": 0.390278160572052, + "learning_rate": 9.362119725220805e-05, + "loss": 0.7263185977935791, + "step": 548 + }, + { + "epoch": 0.5361328125, + "grad_norm": 0.2949998378753662, + "learning_rate": 9.342492639842983e-05, + "loss": 0.5417062044143677, + "step": 549 + }, + { + "epoch": 0.537109375, + "grad_norm": 0.47482210397720337, + "learning_rate": 9.322865554465163e-05, + "loss": 0.6505849361419678, + "step": 550 + }, + { + "epoch": 0.5380859375, + "grad_norm": 0.3653123676776886, + "learning_rate": 9.303238469087341e-05, + "loss": 0.7270935773849487, + "step": 551 + }, + { + "epoch": 0.5390625, + "grad_norm": 0.5652351975440979, + "learning_rate": 9.283611383709519e-05, + "loss": 0.8330069780349731, + "step": 552 + }, + { + "epoch": 0.5400390625, + "grad_norm": 0.448408842086792, + "learning_rate": 9.263984298331699e-05, + "loss": 0.8804951310157776, + "step": 553 + }, + { + "epoch": 0.541015625, + "grad_norm": 0.7700690031051636, + "learning_rate": 9.244357212953877e-05, + "loss": 0.6466813087463379, + "step": 554 + }, + { + "epoch": 0.5419921875, + "grad_norm": 0.45755863189697266, + "learning_rate": 9.224730127576055e-05, + "loss": 0.5548572540283203, + "step": 555 + }, + { + "epoch": 0.54296875, + "grad_norm": 0.4113846719264984, + "learning_rate": 9.205103042198235e-05, + "loss": 0.9286736845970154, + "step": 556 + }, + { + "epoch": 0.5439453125, + "grad_norm": 0.4555431604385376, + "learning_rate": 9.185475956820413e-05, + "loss": 0.8332977890968323, + "step": 557 + }, + { + "epoch": 0.544921875, + "grad_norm": 0.5103408098220825, + "learning_rate": 9.165848871442591e-05, + "loss": 1.0110094547271729, + "step": 558 + }, + { + "epoch": 0.5458984375, + "grad_norm": 0.299912691116333, + "learning_rate": 9.146221786064771e-05, + "loss": 0.3136459290981293, + "step": 559 + }, + { + "epoch": 0.546875, + "grad_norm": 0.40499091148376465, + "learning_rate": 9.126594700686948e-05, + "loss": 0.6785961389541626, + "step": 560 + }, + { + "epoch": 0.5478515625, + "grad_norm": 0.4190375804901123, + "learning_rate": 9.106967615309127e-05, + "loss": 0.9891744256019592, + "step": 561 + }, + { + "epoch": 0.548828125, + "grad_norm": 0.6265519261360168, + "learning_rate": 9.087340529931305e-05, + "loss": 0.48712462186813354, + "step": 562 + }, + { + "epoch": 0.5498046875, + "grad_norm": 0.466420978307724, + "learning_rate": 9.067713444553484e-05, + "loss": 0.5573943257331848, + "step": 563 + }, + { + "epoch": 0.55078125, + "grad_norm": 0.3990301191806793, + "learning_rate": 9.048086359175663e-05, + "loss": 0.5893411040306091, + "step": 564 + }, + { + "epoch": 0.5517578125, + "grad_norm": 0.31471043825149536, + "learning_rate": 9.028459273797842e-05, + "loss": 0.593424379825592, + "step": 565 + }, + { + "epoch": 0.552734375, + "grad_norm": 0.46789905428886414, + "learning_rate": 9.00883218842002e-05, + "loss": 0.9398684501647949, + "step": 566 + }, + { + "epoch": 0.5537109375, + "grad_norm": 0.48358282446861267, + "learning_rate": 8.989205103042198e-05, + "loss": 0.895098865032196, + "step": 567 + }, + { + "epoch": 0.5546875, + "grad_norm": 0.25878453254699707, + "learning_rate": 8.969578017664378e-05, + "loss": 0.4817226231098175, + "step": 568 + }, + { + "epoch": 0.5556640625, + "grad_norm": 0.5319378972053528, + "learning_rate": 8.949950932286556e-05, + "loss": 0.6119651794433594, + "step": 569 + }, + { + "epoch": 0.556640625, + "grad_norm": 0.3002898097038269, + "learning_rate": 8.930323846908734e-05, + "loss": 0.28599199652671814, + "step": 570 + }, + { + "epoch": 0.5576171875, + "grad_norm": 0.37161317467689514, + "learning_rate": 8.910696761530914e-05, + "loss": 0.3307079076766968, + "step": 571 + }, + { + "epoch": 0.55859375, + "grad_norm": 0.4755436182022095, + "learning_rate": 8.891069676153092e-05, + "loss": 0.5868921279907227, + "step": 572 + }, + { + "epoch": 0.5595703125, + "grad_norm": 0.3264123499393463, + "learning_rate": 8.87144259077527e-05, + "loss": 0.6682905554771423, + "step": 573 + }, + { + "epoch": 0.560546875, + "grad_norm": 0.43468573689460754, + "learning_rate": 8.85181550539745e-05, + "loss": 0.6316066980361938, + "step": 574 + }, + { + "epoch": 0.5615234375, + "grad_norm": 0.5759416222572327, + "learning_rate": 8.832188420019627e-05, + "loss": 0.5687480568885803, + "step": 575 + }, + { + "epoch": 0.5625, + "grad_norm": 0.39352041482925415, + "learning_rate": 8.812561334641806e-05, + "loss": 0.3803275525569916, + "step": 576 + }, + { + "epoch": 0.5634765625, + "grad_norm": 0.4155440926551819, + "learning_rate": 8.792934249263984e-05, + "loss": 0.3923049569129944, + "step": 577 + }, + { + "epoch": 0.564453125, + "grad_norm": 0.34934133291244507, + "learning_rate": 8.773307163886163e-05, + "loss": 0.7100962996482849, + "step": 578 + }, + { + "epoch": 0.5654296875, + "grad_norm": 0.3993069529533386, + "learning_rate": 8.753680078508342e-05, + "loss": 0.6711176037788391, + "step": 579 + }, + { + "epoch": 0.56640625, + "grad_norm": 0.3445776700973511, + "learning_rate": 8.73405299313052e-05, + "loss": 0.6986067295074463, + "step": 580 + }, + { + "epoch": 0.5673828125, + "grad_norm": 0.45837292075157166, + "learning_rate": 8.714425907752699e-05, + "loss": 0.9020513892173767, + "step": 581 + }, + { + "epoch": 0.568359375, + "grad_norm": 0.3630208671092987, + "learning_rate": 8.694798822374878e-05, + "loss": 0.42499858140945435, + "step": 582 + }, + { + "epoch": 0.5693359375, + "grad_norm": 0.41205838322639465, + "learning_rate": 8.675171736997057e-05, + "loss": 0.5535018444061279, + "step": 583 + }, + { + "epoch": 0.5703125, + "grad_norm": 0.2596284747123718, + "learning_rate": 8.655544651619235e-05, + "loss": 0.3234618902206421, + "step": 584 + }, + { + "epoch": 0.5712890625, + "grad_norm": 0.3716956079006195, + "learning_rate": 8.635917566241414e-05, + "loss": 0.7567611932754517, + "step": 585 + }, + { + "epoch": 0.572265625, + "grad_norm": 0.42999619245529175, + "learning_rate": 8.616290480863593e-05, + "loss": 0.8695427179336548, + "step": 586 + }, + { + "epoch": 0.5732421875, + "grad_norm": 0.3309305012226105, + "learning_rate": 8.596663395485771e-05, + "loss": 0.989714503288269, + "step": 587 + }, + { + "epoch": 0.57421875, + "grad_norm": 0.40024474263191223, + "learning_rate": 8.57703631010795e-05, + "loss": 1.0608711242675781, + "step": 588 + }, + { + "epoch": 0.5751953125, + "grad_norm": 0.453950434923172, + "learning_rate": 8.557409224730129e-05, + "loss": 0.7340632677078247, + "step": 589 + }, + { + "epoch": 0.576171875, + "grad_norm": 0.4473342299461365, + "learning_rate": 8.537782139352306e-05, + "loss": 0.7264219522476196, + "step": 590 + }, + { + "epoch": 0.5771484375, + "grad_norm": 0.420469731092453, + "learning_rate": 8.518155053974485e-05, + "loss": 0.8141539692878723, + "step": 591 + }, + { + "epoch": 0.578125, + "grad_norm": 0.4068243205547333, + "learning_rate": 8.498527968596663e-05, + "loss": 0.5802872180938721, + "step": 592 + }, + { + "epoch": 0.5791015625, + "grad_norm": 0.4243272840976715, + "learning_rate": 8.478900883218842e-05, + "loss": 0.350595086812973, + "step": 593 + }, + { + "epoch": 0.580078125, + "grad_norm": 0.4519834518432617, + "learning_rate": 8.459273797841021e-05, + "loss": 0.7131458520889282, + "step": 594 + }, + { + "epoch": 0.5810546875, + "grad_norm": 0.34145745635032654, + "learning_rate": 8.4396467124632e-05, + "loss": 0.7618221640586853, + "step": 595 + }, + { + "epoch": 0.58203125, + "grad_norm": 0.46494174003601074, + "learning_rate": 8.420019627085378e-05, + "loss": 0.5102145075798035, + "step": 596 + }, + { + "epoch": 0.5830078125, + "grad_norm": 0.3305060565471649, + "learning_rate": 8.400392541707557e-05, + "loss": 0.7812811732292175, + "step": 597 + }, + { + "epoch": 0.583984375, + "grad_norm": 0.47092583775520325, + "learning_rate": 8.380765456329736e-05, + "loss": 0.7497634887695312, + "step": 598 + }, + { + "epoch": 0.5849609375, + "grad_norm": 0.38902655243873596, + "learning_rate": 8.361138370951914e-05, + "loss": 0.4198119640350342, + "step": 599 + }, + { + "epoch": 0.5859375, + "grad_norm": 0.43659287691116333, + "learning_rate": 8.341511285574093e-05, + "loss": 0.824333667755127, + "step": 600 + }, + { + "epoch": 0.5869140625, + "grad_norm": 0.4277879595756531, + "learning_rate": 8.321884200196272e-05, + "loss": 0.445267915725708, + "step": 601 + }, + { + "epoch": 0.587890625, + "grad_norm": 0.3186829090118408, + "learning_rate": 8.30225711481845e-05, + "loss": 0.9906235337257385, + "step": 602 + }, + { + "epoch": 0.5888671875, + "grad_norm": 0.2983294427394867, + "learning_rate": 8.28263002944063e-05, + "loss": 0.5342146754264832, + "step": 603 + }, + { + "epoch": 0.58984375, + "grad_norm": 0.4127228856086731, + "learning_rate": 8.263002944062808e-05, + "loss": 0.41288450360298157, + "step": 604 + }, + { + "epoch": 0.5908203125, + "grad_norm": 0.3961617052555084, + "learning_rate": 8.243375858684985e-05, + "loss": 0.43576663732528687, + "step": 605 + }, + { + "epoch": 0.591796875, + "grad_norm": 0.4124387502670288, + "learning_rate": 8.223748773307164e-05, + "loss": 0.5837401747703552, + "step": 606 + }, + { + "epoch": 0.5927734375, + "grad_norm": 0.4274151921272278, + "learning_rate": 8.204121687929342e-05, + "loss": 0.8666547536849976, + "step": 607 + }, + { + "epoch": 0.59375, + "grad_norm": 0.3881700932979584, + "learning_rate": 8.18449460255152e-05, + "loss": 0.9063656330108643, + "step": 608 + }, + { + "epoch": 0.5947265625, + "grad_norm": 0.46216556429862976, + "learning_rate": 8.1648675171737e-05, + "loss": 0.4573599696159363, + "step": 609 + }, + { + "epoch": 0.595703125, + "grad_norm": 0.3843960762023926, + "learning_rate": 8.145240431795878e-05, + "loss": 0.6214632391929626, + "step": 610 + }, + { + "epoch": 0.5966796875, + "grad_norm": 0.538301408290863, + "learning_rate": 8.125613346418057e-05, + "loss": 0.8800979852676392, + "step": 611 + }, + { + "epoch": 0.59765625, + "grad_norm": 0.49643319845199585, + "learning_rate": 8.105986261040236e-05, + "loss": 0.48715031147003174, + "step": 612 + }, + { + "epoch": 0.5986328125, + "grad_norm": 0.4753062427043915, + "learning_rate": 8.086359175662415e-05, + "loss": 0.8127011060714722, + "step": 613 + }, + { + "epoch": 0.599609375, + "grad_norm": 0.7572022676467896, + "learning_rate": 8.066732090284593e-05, + "loss": 0.7151535153388977, + "step": 614 + }, + { + "epoch": 0.6005859375, + "grad_norm": 0.35117295384407043, + "learning_rate": 8.047105004906772e-05, + "loss": 0.9221618175506592, + "step": 615 + }, + { + "epoch": 0.6015625, + "grad_norm": 0.2643633186817169, + "learning_rate": 8.02747791952895e-05, + "loss": 0.5025840401649475, + "step": 616 + }, + { + "epoch": 0.6025390625, + "grad_norm": 0.45553916692733765, + "learning_rate": 8.007850834151129e-05, + "loss": 0.452494740486145, + "step": 617 + }, + { + "epoch": 0.603515625, + "grad_norm": 0.386594295501709, + "learning_rate": 7.988223748773308e-05, + "loss": 0.7942792773246765, + "step": 618 + }, + { + "epoch": 0.6044921875, + "grad_norm": 0.3616650700569153, + "learning_rate": 7.968596663395485e-05, + "loss": 0.5697340965270996, + "step": 619 + }, + { + "epoch": 0.60546875, + "grad_norm": 0.3885051906108856, + "learning_rate": 7.948969578017665e-05, + "loss": 0.7082506418228149, + "step": 620 + }, + { + "epoch": 0.6064453125, + "grad_norm": 0.4484117329120636, + "learning_rate": 7.929342492639843e-05, + "loss": 0.5993860960006714, + "step": 621 + }, + { + "epoch": 0.607421875, + "grad_norm": 0.44654563069343567, + "learning_rate": 7.909715407262021e-05, + "loss": 0.5804839134216309, + "step": 622 + }, + { + "epoch": 0.6083984375, + "grad_norm": 0.3943687081336975, + "learning_rate": 7.890088321884201e-05, + "loss": 0.6422688364982605, + "step": 623 + }, + { + "epoch": 0.609375, + "grad_norm": 0.4153381288051605, + "learning_rate": 7.870461236506379e-05, + "loss": 0.6437400579452515, + "step": 624 + }, + { + "epoch": 0.6103515625, + "grad_norm": 0.38221171498298645, + "learning_rate": 7.850834151128557e-05, + "loss": 0.8738820552825928, + "step": 625 + }, + { + "epoch": 0.611328125, + "grad_norm": 0.339599609375, + "learning_rate": 7.831207065750737e-05, + "loss": 0.517478883266449, + "step": 626 + }, + { + "epoch": 0.6123046875, + "grad_norm": 0.7177076935768127, + "learning_rate": 7.811579980372915e-05, + "loss": 0.7372115254402161, + "step": 627 + }, + { + "epoch": 0.61328125, + "grad_norm": 0.47573140263557434, + "learning_rate": 7.791952894995093e-05, + "loss": 0.649010181427002, + "step": 628 + }, + { + "epoch": 0.6142578125, + "grad_norm": 0.44851094484329224, + "learning_rate": 7.772325809617273e-05, + "loss": 0.6269842386245728, + "step": 629 + }, + { + "epoch": 0.615234375, + "grad_norm": 0.3544669449329376, + "learning_rate": 7.752698724239451e-05, + "loss": 0.8870983123779297, + "step": 630 + }, + { + "epoch": 0.6162109375, + "grad_norm": 0.4103491008281708, + "learning_rate": 7.73307163886163e-05, + "loss": 0.8711034059524536, + "step": 631 + }, + { + "epoch": 0.6171875, + "grad_norm": 0.3651062548160553, + "learning_rate": 7.713444553483808e-05, + "loss": 0.8420337438583374, + "step": 632 + }, + { + "epoch": 0.6181640625, + "grad_norm": 0.4135638475418091, + "learning_rate": 7.693817468105987e-05, + "loss": 0.601078450679779, + "step": 633 + }, + { + "epoch": 0.619140625, + "grad_norm": 0.5965299010276794, + "learning_rate": 7.674190382728164e-05, + "loss": 0.604471743106842, + "step": 634 + }, + { + "epoch": 0.6201171875, + "grad_norm": 0.4340416491031647, + "learning_rate": 7.654563297350344e-05, + "loss": 0.905183732509613, + "step": 635 + }, + { + "epoch": 0.62109375, + "grad_norm": 0.361518919467926, + "learning_rate": 7.634936211972522e-05, + "loss": 0.6569675207138062, + "step": 636 + }, + { + "epoch": 0.6220703125, + "grad_norm": 1.04604971408844, + "learning_rate": 7.6153091265947e-05, + "loss": 0.7399482727050781, + "step": 637 + }, + { + "epoch": 0.623046875, + "grad_norm": 0.8039460778236389, + "learning_rate": 7.59568204121688e-05, + "loss": 0.6003617644309998, + "step": 638 + }, + { + "epoch": 0.6240234375, + "grad_norm": 0.5462118983268738, + "learning_rate": 7.576054955839058e-05, + "loss": 0.7750217914581299, + "step": 639 + }, + { + "epoch": 0.625, + "grad_norm": 0.29333505034446716, + "learning_rate": 7.556427870461236e-05, + "loss": 0.47371456027030945, + "step": 640 + }, + { + "epoch": 0.6259765625, + "grad_norm": 0.2468312531709671, + "learning_rate": 7.536800785083416e-05, + "loss": 0.4615188241004944, + "step": 641 + }, + { + "epoch": 0.626953125, + "grad_norm": 0.48467332124710083, + "learning_rate": 7.517173699705594e-05, + "loss": 0.6456693410873413, + "step": 642 + }, + { + "epoch": 0.6279296875, + "grad_norm": 0.5471943020820618, + "learning_rate": 7.497546614327772e-05, + "loss": 0.5899155139923096, + "step": 643 + }, + { + "epoch": 0.62890625, + "grad_norm": 0.3715604841709137, + "learning_rate": 7.477919528949952e-05, + "loss": 0.7910970449447632, + "step": 644 + }, + { + "epoch": 0.6298828125, + "grad_norm": 0.3298327922821045, + "learning_rate": 7.45829244357213e-05, + "loss": 0.5769776701927185, + "step": 645 + }, + { + "epoch": 0.630859375, + "grad_norm": 0.44131916761398315, + "learning_rate": 7.438665358194309e-05, + "loss": 0.8805806636810303, + "step": 646 + }, + { + "epoch": 0.6318359375, + "grad_norm": 0.4686948359012604, + "learning_rate": 7.419038272816488e-05, + "loss": 0.7262091636657715, + "step": 647 + }, + { + "epoch": 0.6328125, + "grad_norm": 0.48123931884765625, + "learning_rate": 7.399411187438666e-05, + "loss": 0.8481992483139038, + "step": 648 + }, + { + "epoch": 0.6337890625, + "grad_norm": 0.5582646131515503, + "learning_rate": 7.379784102060843e-05, + "loss": 0.4963653087615967, + "step": 649 + }, + { + "epoch": 0.634765625, + "grad_norm": 0.30464881658554077, + "learning_rate": 7.360157016683023e-05, + "loss": 0.6772556900978088, + "step": 650 + }, + { + "epoch": 0.6357421875, + "grad_norm": 0.44710803031921387, + "learning_rate": 7.340529931305201e-05, + "loss": 0.5476983189582825, + "step": 651 + }, + { + "epoch": 0.63671875, + "grad_norm": 0.35922887921333313, + "learning_rate": 7.320902845927379e-05, + "loss": 0.8256508111953735, + "step": 652 + }, + { + "epoch": 0.6376953125, + "grad_norm": 0.40085500478744507, + "learning_rate": 7.301275760549559e-05, + "loss": 0.5783500671386719, + "step": 653 + }, + { + "epoch": 0.638671875, + "grad_norm": 0.47579512000083923, + "learning_rate": 7.281648675171737e-05, + "loss": 0.5591031908988953, + "step": 654 + }, + { + "epoch": 0.6396484375, + "grad_norm": 0.5594353675842285, + "learning_rate": 7.262021589793915e-05, + "loss": 0.8133666515350342, + "step": 655 + }, + { + "epoch": 0.640625, + "grad_norm": 0.44030821323394775, + "learning_rate": 7.242394504416095e-05, + "loss": 1.0282940864562988, + "step": 656 + }, + { + "epoch": 0.6416015625, + "grad_norm": 0.7038627862930298, + "learning_rate": 7.222767419038273e-05, + "loss": 0.2322971373796463, + "step": 657 + }, + { + "epoch": 0.642578125, + "grad_norm": 0.223698228597641, + "learning_rate": 7.203140333660451e-05, + "loss": 0.7056642174720764, + "step": 658 + }, + { + "epoch": 0.6435546875, + "grad_norm": 0.3815765976905823, + "learning_rate": 7.183513248282631e-05, + "loss": 1.074477195739746, + "step": 659 + }, + { + "epoch": 0.64453125, + "grad_norm": 0.35606271028518677, + "learning_rate": 7.163886162904809e-05, + "loss": 0.4300801753997803, + "step": 660 + }, + { + "epoch": 0.6455078125, + "grad_norm": 0.32899999618530273, + "learning_rate": 7.144259077526988e-05, + "loss": 0.5923078060150146, + "step": 661 + }, + { + "epoch": 0.646484375, + "grad_norm": 0.49968358874320984, + "learning_rate": 7.124631992149167e-05, + "loss": 0.8295183181762695, + "step": 662 + }, + { + "epoch": 0.6474609375, + "grad_norm": 0.3393777012825012, + "learning_rate": 7.105004906771345e-05, + "loss": 0.30383622646331787, + "step": 663 + }, + { + "epoch": 0.6484375, + "grad_norm": 0.24977968633174896, + "learning_rate": 7.085377821393524e-05, + "loss": 0.429612934589386, + "step": 664 + }, + { + "epoch": 0.6494140625, + "grad_norm": 0.35886242985725403, + "learning_rate": 7.065750736015702e-05, + "loss": 0.9189084768295288, + "step": 665 + }, + { + "epoch": 0.650390625, + "grad_norm": 0.3856249153614044, + "learning_rate": 7.04612365063788e-05, + "loss": 0.4880048930644989, + "step": 666 + }, + { + "epoch": 0.6513671875, + "grad_norm": 0.4439884424209595, + "learning_rate": 7.026496565260058e-05, + "loss": 0.7537186145782471, + "step": 667 + }, + { + "epoch": 0.65234375, + "grad_norm": 0.29563215374946594, + "learning_rate": 7.006869479882238e-05, + "loss": 0.38701343536376953, + "step": 668 + }, + { + "epoch": 0.6533203125, + "grad_norm": 0.1909576952457428, + "learning_rate": 6.987242394504416e-05, + "loss": 0.15140604972839355, + "step": 669 + }, + { + "epoch": 0.654296875, + "grad_norm": 0.3344849944114685, + "learning_rate": 6.967615309126594e-05, + "loss": 0.527427077293396, + "step": 670 + }, + { + "epoch": 0.6552734375, + "grad_norm": 0.3609422743320465, + "learning_rate": 6.947988223748774e-05, + "loss": 0.29116177558898926, + "step": 671 + }, + { + "epoch": 0.65625, + "grad_norm": 0.4419811964035034, + "learning_rate": 6.928361138370952e-05, + "loss": 0.7166855931282043, + "step": 672 + }, + { + "epoch": 0.6572265625, + "grad_norm": 0.31890806555747986, + "learning_rate": 6.90873405299313e-05, + "loss": 0.5259425640106201, + "step": 673 + }, + { + "epoch": 0.658203125, + "grad_norm": 0.39572352170944214, + "learning_rate": 6.88910696761531e-05, + "loss": 0.5964791774749756, + "step": 674 + }, + { + "epoch": 0.6591796875, + "grad_norm": 0.4501058757305145, + "learning_rate": 6.869479882237488e-05, + "loss": 0.2289922833442688, + "step": 675 + }, + { + "epoch": 0.66015625, + "grad_norm": 0.2884235680103302, + "learning_rate": 6.849852796859666e-05, + "loss": 0.2730886936187744, + "step": 676 + }, + { + "epoch": 0.6611328125, + "grad_norm": 0.32970431447029114, + "learning_rate": 6.830225711481846e-05, + "loss": 0.4283568859100342, + "step": 677 + }, + { + "epoch": 0.662109375, + "grad_norm": 0.39025789499282837, + "learning_rate": 6.810598626104023e-05, + "loss": 0.9361288547515869, + "step": 678 + }, + { + "epoch": 0.6630859375, + "grad_norm": 0.48386886715888977, + "learning_rate": 6.790971540726203e-05, + "loss": 0.4907494783401489, + "step": 679 + }, + { + "epoch": 0.6640625, + "grad_norm": 0.41783151030540466, + "learning_rate": 6.771344455348381e-05, + "loss": 0.7485824823379517, + "step": 680 + }, + { + "epoch": 0.6650390625, + "grad_norm": 0.4826144278049469, + "learning_rate": 6.751717369970559e-05, + "loss": 0.6413211226463318, + "step": 681 + }, + { + "epoch": 0.666015625, + "grad_norm": 0.27521079778671265, + "learning_rate": 6.732090284592739e-05, + "loss": 0.5747159123420715, + "step": 682 + }, + { + "epoch": 0.6669921875, + "grad_norm": 0.3745660185813904, + "learning_rate": 6.712463199214917e-05, + "loss": 0.414341002702713, + "step": 683 + }, + { + "epoch": 0.66796875, + "grad_norm": 0.45048731565475464, + "learning_rate": 6.692836113837095e-05, + "loss": 0.3665570318698883, + "step": 684 + }, + { + "epoch": 0.6689453125, + "grad_norm": 0.5048633217811584, + "learning_rate": 6.673209028459275e-05, + "loss": 0.5923498272895813, + "step": 685 + }, + { + "epoch": 0.669921875, + "grad_norm": 0.46423155069351196, + "learning_rate": 6.653581943081453e-05, + "loss": 0.7506915330886841, + "step": 686 + }, + { + "epoch": 0.6708984375, + "grad_norm": 0.42965108156204224, + "learning_rate": 6.633954857703631e-05, + "loss": 0.7576399445533752, + "step": 687 + }, + { + "epoch": 0.671875, + "grad_norm": 0.48331597447395325, + "learning_rate": 6.614327772325811e-05, + "loss": 0.5249682068824768, + "step": 688 + }, + { + "epoch": 0.6728515625, + "grad_norm": 0.4685790240764618, + "learning_rate": 6.594700686947989e-05, + "loss": 0.8056750297546387, + "step": 689 + }, + { + "epoch": 0.673828125, + "grad_norm": 0.46440044045448303, + "learning_rate": 6.575073601570167e-05, + "loss": 0.9252493381500244, + "step": 690 + }, + { + "epoch": 0.6748046875, + "grad_norm": 0.46564289927482605, + "learning_rate": 6.555446516192347e-05, + "loss": 0.8182022571563721, + "step": 691 + }, + { + "epoch": 0.67578125, + "grad_norm": 0.4397750496864319, + "learning_rate": 6.535819430814525e-05, + "loss": 0.7928388118743896, + "step": 692 + }, + { + "epoch": 0.6767578125, + "grad_norm": 0.3233174681663513, + "learning_rate": 6.516192345436702e-05, + "loss": 0.5252426862716675, + "step": 693 + }, + { + "epoch": 0.677734375, + "grad_norm": 0.6012148857116699, + "learning_rate": 6.496565260058882e-05, + "loss": 0.44195663928985596, + "step": 694 + }, + { + "epoch": 0.6787109375, + "grad_norm": 0.6329052448272705, + "learning_rate": 6.47693817468106e-05, + "loss": 0.5354570150375366, + "step": 695 + }, + { + "epoch": 0.6796875, + "grad_norm": 0.47926270961761475, + "learning_rate": 6.457311089303238e-05, + "loss": 0.4950491786003113, + "step": 696 + }, + { + "epoch": 0.6806640625, + "grad_norm": 0.5051383972167969, + "learning_rate": 6.437684003925418e-05, + "loss": 0.6795849204063416, + "step": 697 + }, + { + "epoch": 0.681640625, + "grad_norm": 0.4022398591041565, + "learning_rate": 6.418056918547596e-05, + "loss": 1.0388166904449463, + "step": 698 + }, + { + "epoch": 0.6826171875, + "grad_norm": 0.4309573471546173, + "learning_rate": 6.398429833169774e-05, + "loss": 0.6022897362709045, + "step": 699 + }, + { + "epoch": 0.68359375, + "grad_norm": 0.3301983177661896, + "learning_rate": 6.378802747791954e-05, + "loss": 0.6451660394668579, + "step": 700 + }, + { + "epoch": 0.6845703125, + "grad_norm": 0.6647156476974487, + "learning_rate": 6.359175662414132e-05, + "loss": 0.9699732661247253, + "step": 701 + }, + { + "epoch": 0.685546875, + "grad_norm": 0.37545597553253174, + "learning_rate": 6.33954857703631e-05, + "loss": 0.43181508779525757, + "step": 702 + }, + { + "epoch": 0.6865234375, + "grad_norm": 0.40882429480552673, + "learning_rate": 6.31992149165849e-05, + "loss": 0.665264368057251, + "step": 703 + }, + { + "epoch": 0.6875, + "grad_norm": 0.46597936749458313, + "learning_rate": 6.300294406280668e-05, + "loss": 0.8813620209693909, + "step": 704 + }, + { + "epoch": 0.6884765625, + "grad_norm": 0.4355461597442627, + "learning_rate": 6.280667320902846e-05, + "loss": 0.595770537853241, + "step": 705 + }, + { + "epoch": 0.689453125, + "grad_norm": 0.45896056294441223, + "learning_rate": 6.261040235525026e-05, + "loss": 0.7571601271629333, + "step": 706 + }, + { + "epoch": 0.6904296875, + "grad_norm": 0.37643495202064514, + "learning_rate": 6.241413150147204e-05, + "loss": 0.47930869460105896, + "step": 707 + }, + { + "epoch": 0.69140625, + "grad_norm": 0.49690738320350647, + "learning_rate": 6.221786064769381e-05, + "loss": 0.3727263808250427, + "step": 708 + }, + { + "epoch": 0.6923828125, + "grad_norm": 0.44111907482147217, + "learning_rate": 6.20215897939156e-05, + "loss": 0.7276532649993896, + "step": 709 + }, + { + "epoch": 0.693359375, + "grad_norm": 0.44872644543647766, + "learning_rate": 6.182531894013739e-05, + "loss": 0.5082123279571533, + "step": 710 + }, + { + "epoch": 0.6943359375, + "grad_norm": 0.3345314562320709, + "learning_rate": 6.162904808635917e-05, + "loss": 0.5472716093063354, + "step": 711 + }, + { + "epoch": 0.6953125, + "grad_norm": 0.4269154667854309, + "learning_rate": 6.143277723258097e-05, + "loss": 0.7036910057067871, + "step": 712 + }, + { + "epoch": 0.6962890625, + "grad_norm": 0.5314676761627197, + "learning_rate": 6.123650637880275e-05, + "loss": 0.8663474917411804, + "step": 713 + }, + { + "epoch": 0.697265625, + "grad_norm": 0.2820166349411011, + "learning_rate": 6.104023552502453e-05, + "loss": 0.6397068500518799, + "step": 714 + }, + { + "epoch": 0.6982421875, + "grad_norm": 0.40954726934432983, + "learning_rate": 6.084396467124632e-05, + "loss": 0.5477964282035828, + "step": 715 + }, + { + "epoch": 0.69921875, + "grad_norm": 0.6858615279197693, + "learning_rate": 6.064769381746811e-05, + "loss": 0.694764256477356, + "step": 716 + }, + { + "epoch": 0.7001953125, + "grad_norm": 2.901998281478882, + "learning_rate": 6.04514229636899e-05, + "loss": 0.5803335309028625, + "step": 717 + }, + { + "epoch": 0.701171875, + "grad_norm": 0.6065869927406311, + "learning_rate": 6.025515210991168e-05, + "loss": 0.49790292978286743, + "step": 718 + }, + { + "epoch": 0.7021484375, + "grad_norm": 0.3678690195083618, + "learning_rate": 6.005888125613347e-05, + "loss": 0.38595882058143616, + "step": 719 + }, + { + "epoch": 0.703125, + "grad_norm": 0.32496991753578186, + "learning_rate": 5.986261040235526e-05, + "loss": 0.3554360866546631, + "step": 720 + }, + { + "epoch": 0.7041015625, + "grad_norm": 0.5348960161209106, + "learning_rate": 5.966633954857704e-05, + "loss": 1.0386948585510254, + "step": 721 + }, + { + "epoch": 0.705078125, + "grad_norm": 0.42248818278312683, + "learning_rate": 5.947006869479883e-05, + "loss": 0.4950508177280426, + "step": 722 + }, + { + "epoch": 0.7060546875, + "grad_norm": 0.36575669050216675, + "learning_rate": 5.9273797841020606e-05, + "loss": 0.8793643712997437, + "step": 723 + }, + { + "epoch": 0.70703125, + "grad_norm": 0.30802977085113525, + "learning_rate": 5.9077526987242395e-05, + "loss": 0.7557331919670105, + "step": 724 + }, + { + "epoch": 0.7080078125, + "grad_norm": 0.36057788133621216, + "learning_rate": 5.888125613346418e-05, + "loss": 0.793386697769165, + "step": 725 + }, + { + "epoch": 0.708984375, + "grad_norm": 0.5049283504486084, + "learning_rate": 5.8684985279685966e-05, + "loss": 0.3805343210697174, + "step": 726 + }, + { + "epoch": 0.7099609375, + "grad_norm": 0.4448167681694031, + "learning_rate": 5.8488714425907756e-05, + "loss": 0.8297110199928284, + "step": 727 + }, + { + "epoch": 0.7109375, + "grad_norm": 0.5144803524017334, + "learning_rate": 5.829244357212954e-05, + "loss": 0.8582932949066162, + "step": 728 + }, + { + "epoch": 0.7119140625, + "grad_norm": 0.48559248447418213, + "learning_rate": 5.809617271835133e-05, + "loss": 0.851997971534729, + "step": 729 + }, + { + "epoch": 0.712890625, + "grad_norm": 0.5277959704399109, + "learning_rate": 5.7899901864573116e-05, + "loss": 0.8560271859169006, + "step": 730 + }, + { + "epoch": 0.7138671875, + "grad_norm": 0.39055025577545166, + "learning_rate": 5.77036310107949e-05, + "loss": 0.5023626685142517, + "step": 731 + }, + { + "epoch": 0.71484375, + "grad_norm": 0.4014328718185425, + "learning_rate": 5.750736015701669e-05, + "loss": 0.7782986760139465, + "step": 732 + }, + { + "epoch": 0.7158203125, + "grad_norm": 0.9840988516807556, + "learning_rate": 5.731108930323848e-05, + "loss": 0.5097107887268066, + "step": 733 + }, + { + "epoch": 0.716796875, + "grad_norm": 0.512140691280365, + "learning_rate": 5.711481844946026e-05, + "loss": 0.5448895692825317, + "step": 734 + }, + { + "epoch": 0.7177734375, + "grad_norm": 0.45195046067237854, + "learning_rate": 5.691854759568205e-05, + "loss": 0.7583330273628235, + "step": 735 + }, + { + "epoch": 0.71875, + "grad_norm": 0.4155009090900421, + "learning_rate": 5.672227674190384e-05, + "loss": 0.5220797061920166, + "step": 736 + }, + { + "epoch": 0.7197265625, + "grad_norm": 0.552148699760437, + "learning_rate": 5.652600588812562e-05, + "loss": 0.8043540716171265, + "step": 737 + }, + { + "epoch": 0.720703125, + "grad_norm": 0.30510297417640686, + "learning_rate": 5.6329735034347396e-05, + "loss": 0.5110808610916138, + "step": 738 + }, + { + "epoch": 0.7216796875, + "grad_norm": 0.522339940071106, + "learning_rate": 5.6133464180569185e-05, + "loss": 1.0245096683502197, + "step": 739 + }, + { + "epoch": 0.72265625, + "grad_norm": 0.27751341462135315, + "learning_rate": 5.5937193326790974e-05, + "loss": 0.6376601457595825, + "step": 740 + }, + { + "epoch": 0.7236328125, + "grad_norm": 0.4283340573310852, + "learning_rate": 5.5740922473012756e-05, + "loss": 1.1317777633666992, + "step": 741 + }, + { + "epoch": 0.724609375, + "grad_norm": 0.541248619556427, + "learning_rate": 5.5544651619234545e-05, + "loss": 0.8086187839508057, + "step": 742 + }, + { + "epoch": 0.7255859375, + "grad_norm": 0.24750906229019165, + "learning_rate": 5.5348380765456335e-05, + "loss": 0.4873177409172058, + "step": 743 + }, + { + "epoch": 0.7265625, + "grad_norm": 0.42374616861343384, + "learning_rate": 5.515210991167812e-05, + "loss": 0.41606956720352173, + "step": 744 + }, + { + "epoch": 0.7275390625, + "grad_norm": 0.35455161333084106, + "learning_rate": 5.4955839057899906e-05, + "loss": 0.49936947226524353, + "step": 745 + }, + { + "epoch": 0.728515625, + "grad_norm": 0.4243617653846741, + "learning_rate": 5.475956820412169e-05, + "loss": 0.6650359630584717, + "step": 746 + }, + { + "epoch": 0.7294921875, + "grad_norm": 0.4106060862541199, + "learning_rate": 5.456329735034348e-05, + "loss": 0.37870654463768005, + "step": 747 + }, + { + "epoch": 0.73046875, + "grad_norm": 0.3536394536495209, + "learning_rate": 5.436702649656527e-05, + "loss": 1.0944924354553223, + "step": 748 + }, + { + "epoch": 0.7314453125, + "grad_norm": 0.3067559003829956, + "learning_rate": 5.417075564278705e-05, + "loss": 0.6380996704101562, + "step": 749 + }, + { + "epoch": 0.732421875, + "grad_norm": 0.40423691272735596, + "learning_rate": 5.397448478900884e-05, + "loss": 0.712358295917511, + "step": 750 + }, + { + "epoch": 0.7333984375, + "grad_norm": 0.451038658618927, + "learning_rate": 5.377821393523063e-05, + "loss": 0.6221305727958679, + "step": 751 + }, + { + "epoch": 0.734375, + "grad_norm": 0.32606229186058044, + "learning_rate": 5.35819430814524e-05, + "loss": 0.6600078344345093, + "step": 752 + }, + { + "epoch": 0.7353515625, + "grad_norm": 0.746896505355835, + "learning_rate": 5.3385672227674185e-05, + "loss": 0.5533967614173889, + "step": 753 + }, + { + "epoch": 0.736328125, + "grad_norm": 0.403277724981308, + "learning_rate": 5.3189401373895974e-05, + "loss": 0.7483388185501099, + "step": 754 + }, + { + "epoch": 0.7373046875, + "grad_norm": 0.6016709208488464, + "learning_rate": 5.2993130520117764e-05, + "loss": 0.539909839630127, + "step": 755 + }, + { + "epoch": 0.73828125, + "grad_norm": 0.39885231852531433, + "learning_rate": 5.2796859666339546e-05, + "loss": 0.7900533676147461, + "step": 756 + }, + { + "epoch": 0.7392578125, + "grad_norm": 0.3245362639427185, + "learning_rate": 5.2600588812561335e-05, + "loss": 0.42862433195114136, + "step": 757 + }, + { + "epoch": 0.740234375, + "grad_norm": 0.47334104776382446, + "learning_rate": 5.2404317958783124e-05, + "loss": 0.3249909281730652, + "step": 758 + }, + { + "epoch": 0.7412109375, + "grad_norm": 0.3029737174510956, + "learning_rate": 5.220804710500491e-05, + "loss": 0.4264957308769226, + "step": 759 + }, + { + "epoch": 0.7421875, + "grad_norm": 0.33878564834594727, + "learning_rate": 5.2011776251226696e-05, + "loss": 0.4446904957294464, + "step": 760 + }, + { + "epoch": 0.7431640625, + "grad_norm": 0.3307798206806183, + "learning_rate": 5.1815505397448485e-05, + "loss": 0.461605966091156, + "step": 761 + }, + { + "epoch": 0.744140625, + "grad_norm": 0.4146850109100342, + "learning_rate": 5.161923454367027e-05, + "loss": 0.758568525314331, + "step": 762 + }, + { + "epoch": 0.7451171875, + "grad_norm": 0.3531327545642853, + "learning_rate": 5.1422963689892056e-05, + "loss": 0.4580535292625427, + "step": 763 + }, + { + "epoch": 0.74609375, + "grad_norm": 0.3952695429325104, + "learning_rate": 5.1226692836113846e-05, + "loss": 0.333244651556015, + "step": 764 + }, + { + "epoch": 0.7470703125, + "grad_norm": 0.5774162411689758, + "learning_rate": 5.103042198233563e-05, + "loss": 0.6433362364768982, + "step": 765 + }, + { + "epoch": 0.748046875, + "grad_norm": 0.49668964743614197, + "learning_rate": 5.083415112855742e-05, + "loss": 0.8478100895881653, + "step": 766 + }, + { + "epoch": 0.7490234375, + "grad_norm": 0.3303810954093933, + "learning_rate": 5.063788027477919e-05, + "loss": 0.7296837568283081, + "step": 767 + }, + { + "epoch": 0.75, + "grad_norm": 0.27652832865715027, + "learning_rate": 5.044160942100098e-05, + "loss": 0.6442312598228455, + "step": 768 + }, + { + "epoch": 0.7509765625, + "grad_norm": 1.0828924179077148, + "learning_rate": 5.0245338567222764e-05, + "loss": 0.9848635196685791, + "step": 769 + }, + { + "epoch": 0.751953125, + "grad_norm": 0.38959333300590515, + "learning_rate": 5.0049067713444553e-05, + "loss": 0.722776472568512, + "step": 770 + }, + { + "epoch": 0.7529296875, + "grad_norm": 0.3470323383808136, + "learning_rate": 4.985279685966634e-05, + "loss": 0.6584157943725586, + "step": 771 + }, + { + "epoch": 0.75390625, + "grad_norm": 0.4060254693031311, + "learning_rate": 4.9656526005888125e-05, + "loss": 0.6276923418045044, + "step": 772 + }, + { + "epoch": 0.7548828125, + "grad_norm": 0.34566962718963623, + "learning_rate": 4.9460255152109914e-05, + "loss": 0.972516655921936, + "step": 773 + }, + { + "epoch": 0.755859375, + "grad_norm": 0.41829708218574524, + "learning_rate": 4.92639842983317e-05, + "loss": 0.6937177181243896, + "step": 774 + }, + { + "epoch": 0.7568359375, + "grad_norm": 0.7653974294662476, + "learning_rate": 4.9067713444553486e-05, + "loss": 0.6027823090553284, + "step": 775 + }, + { + "epoch": 0.7578125, + "grad_norm": 1.0477155447006226, + "learning_rate": 4.8871442590775275e-05, + "loss": 0.925806999206543, + "step": 776 + }, + { + "epoch": 0.7587890625, + "grad_norm": 0.43484824895858765, + "learning_rate": 4.8675171736997064e-05, + "loss": 0.7783142328262329, + "step": 777 + }, + { + "epoch": 0.759765625, + "grad_norm": 0.33719849586486816, + "learning_rate": 4.847890088321884e-05, + "loss": 0.6108527779579163, + "step": 778 + }, + { + "epoch": 0.7607421875, + "grad_norm": 0.3983028531074524, + "learning_rate": 4.828263002944063e-05, + "loss": 0.9976012706756592, + "step": 779 + }, + { + "epoch": 0.76171875, + "grad_norm": 0.3278787136077881, + "learning_rate": 4.808635917566242e-05, + "loss": 0.5754845142364502, + "step": 780 + }, + { + "epoch": 0.7626953125, + "grad_norm": 0.42433467507362366, + "learning_rate": 4.78900883218842e-05, + "loss": 0.8455826640129089, + "step": 781 + }, + { + "epoch": 0.763671875, + "grad_norm": 0.33245334029197693, + "learning_rate": 4.769381746810599e-05, + "loss": 0.5207083225250244, + "step": 782 + }, + { + "epoch": 0.7646484375, + "grad_norm": 0.4390372931957245, + "learning_rate": 4.749754661432778e-05, + "loss": 0.7208432555198669, + "step": 783 + }, + { + "epoch": 0.765625, + "grad_norm": 0.325720876455307, + "learning_rate": 4.730127576054956e-05, + "loss": 0.3017955422401428, + "step": 784 + }, + { + "epoch": 0.7666015625, + "grad_norm": 0.3036203980445862, + "learning_rate": 4.710500490677135e-05, + "loss": 0.47869423031806946, + "step": 785 + }, + { + "epoch": 0.767578125, + "grad_norm": 0.4316065013408661, + "learning_rate": 4.690873405299313e-05, + "loss": 0.7984920740127563, + "step": 786 + }, + { + "epoch": 0.7685546875, + "grad_norm": 0.46907728910446167, + "learning_rate": 4.6712463199214915e-05, + "loss": 0.7288491725921631, + "step": 787 + }, + { + "epoch": 0.76953125, + "grad_norm": 0.38269418478012085, + "learning_rate": 4.6516192345436704e-05, + "loss": 0.46745771169662476, + "step": 788 + }, + { + "epoch": 0.7705078125, + "grad_norm": 0.6045718193054199, + "learning_rate": 4.631992149165849e-05, + "loss": 0.5405256152153015, + "step": 789 + }, + { + "epoch": 0.771484375, + "grad_norm": 0.3303053677082062, + "learning_rate": 4.6123650637880275e-05, + "loss": 0.6721948981285095, + "step": 790 + }, + { + "epoch": 0.7724609375, + "grad_norm": 0.42014074325561523, + "learning_rate": 4.5927379784102065e-05, + "loss": 0.9322581887245178, + "step": 791 + }, + { + "epoch": 0.7734375, + "grad_norm": 0.3720149099826813, + "learning_rate": 4.5731108930323854e-05, + "loss": 0.7807843685150146, + "step": 792 + }, + { + "epoch": 0.7744140625, + "grad_norm": 0.31559938192367554, + "learning_rate": 4.5534838076545636e-05, + "loss": 0.8503724336624146, + "step": 793 + }, + { + "epoch": 0.775390625, + "grad_norm": 0.4096013903617859, + "learning_rate": 4.533856722276742e-05, + "loss": 0.6950633525848389, + "step": 794 + }, + { + "epoch": 0.7763671875, + "grad_norm": 0.3791837990283966, + "learning_rate": 4.514229636898921e-05, + "loss": 0.7583197951316833, + "step": 795 + }, + { + "epoch": 0.77734375, + "grad_norm": 0.5274584889411926, + "learning_rate": 4.494602551521099e-05, + "loss": 0.4712093770503998, + "step": 796 + }, + { + "epoch": 0.7783203125, + "grad_norm": 0.29654791951179504, + "learning_rate": 4.474975466143278e-05, + "loss": 0.552979588508606, + "step": 797 + }, + { + "epoch": 0.779296875, + "grad_norm": 0.25629475712776184, + "learning_rate": 4.455348380765457e-05, + "loss": 0.5225521922111511, + "step": 798 + }, + { + "epoch": 0.7802734375, + "grad_norm": 0.2676495611667633, + "learning_rate": 4.435721295387635e-05, + "loss": 0.4382556080818176, + "step": 799 + }, + { + "epoch": 0.78125, + "grad_norm": 0.4117366075515747, + "learning_rate": 4.416094210009813e-05, + "loss": 0.5639417767524719, + "step": 800 + }, + { + "epoch": 0.7822265625, + "grad_norm": 0.26305386424064636, + "learning_rate": 4.396467124631992e-05, + "loss": 0.28840768337249756, + "step": 801 + }, + { + "epoch": 0.783203125, + "grad_norm": 0.7253789305686951, + "learning_rate": 4.376840039254171e-05, + "loss": 0.4104336202144623, + "step": 802 + }, + { + "epoch": 0.7841796875, + "grad_norm": 0.371288001537323, + "learning_rate": 4.3572129538763494e-05, + "loss": 0.609147310256958, + "step": 803 + }, + { + "epoch": 0.78515625, + "grad_norm": 0.634273111820221, + "learning_rate": 4.337585868498528e-05, + "loss": 0.5141665935516357, + "step": 804 + }, + { + "epoch": 0.7861328125, + "grad_norm": 0.4442044496536255, + "learning_rate": 4.317958783120707e-05, + "loss": 0.4882044494152069, + "step": 805 + }, + { + "epoch": 0.787109375, + "grad_norm": 0.3099007308483124, + "learning_rate": 4.2983316977428854e-05, + "loss": 0.3148588538169861, + "step": 806 + }, + { + "epoch": 0.7880859375, + "grad_norm": 0.41893890500068665, + "learning_rate": 4.2787046123650643e-05, + "loss": 0.6678078174591064, + "step": 807 + }, + { + "epoch": 0.7890625, + "grad_norm": 0.47682809829711914, + "learning_rate": 4.2590775269872426e-05, + "loss": 0.46614763140678406, + "step": 808 + }, + { + "epoch": 0.7900390625, + "grad_norm": 0.25193366408348083, + "learning_rate": 4.239450441609421e-05, + "loss": 0.3707652986049652, + "step": 809 + }, + { + "epoch": 0.791015625, + "grad_norm": 0.3425232768058777, + "learning_rate": 4.2198233562316e-05, + "loss": 0.604179859161377, + "step": 810 + }, + { + "epoch": 0.7919921875, + "grad_norm": 0.31459808349609375, + "learning_rate": 4.2001962708537786e-05, + "loss": 0.748989999294281, + "step": 811 + }, + { + "epoch": 0.79296875, + "grad_norm": 0.3478514850139618, + "learning_rate": 4.180569185475957e-05, + "loss": 0.6651142835617065, + "step": 812 + }, + { + "epoch": 0.7939453125, + "grad_norm": 0.3951675295829773, + "learning_rate": 4.160942100098136e-05, + "loss": 0.7293418049812317, + "step": 813 + }, + { + "epoch": 0.794921875, + "grad_norm": 0.26888158917427063, + "learning_rate": 4.141315014720315e-05, + "loss": 0.2181730419397354, + "step": 814 + }, + { + "epoch": 0.7958984375, + "grad_norm": 0.17496585845947266, + "learning_rate": 4.121687929342492e-05, + "loss": 0.18257993459701538, + "step": 815 + }, + { + "epoch": 0.796875, + "grad_norm": 0.3386918306350708, + "learning_rate": 4.102060843964671e-05, + "loss": 0.43010956048965454, + "step": 816 + }, + { + "epoch": 0.7978515625, + "grad_norm": 0.5185137987136841, + "learning_rate": 4.08243375858685e-05, + "loss": 0.9117882251739502, + "step": 817 + }, + { + "epoch": 0.798828125, + "grad_norm": 0.499529093503952, + "learning_rate": 4.0628066732090283e-05, + "loss": 0.8601939678192139, + "step": 818 + }, + { + "epoch": 0.7998046875, + "grad_norm": 0.44401317834854126, + "learning_rate": 4.043179587831207e-05, + "loss": 0.8643960356712341, + "step": 819 + }, + { + "epoch": 0.80078125, + "grad_norm": 0.30553653836250305, + "learning_rate": 4.023552502453386e-05, + "loss": 0.7741817235946655, + "step": 820 + }, + { + "epoch": 0.8017578125, + "grad_norm": 0.443541944026947, + "learning_rate": 4.0039254170755644e-05, + "loss": 0.9571224451065063, + "step": 821 + }, + { + "epoch": 0.802734375, + "grad_norm": 0.2611587643623352, + "learning_rate": 3.9842983316977426e-05, + "loss": 0.4755222201347351, + "step": 822 + }, + { + "epoch": 0.8037109375, + "grad_norm": 0.38695722818374634, + "learning_rate": 3.9646712463199216e-05, + "loss": 0.9597996473312378, + "step": 823 + }, + { + "epoch": 0.8046875, + "grad_norm": 0.505346953868866, + "learning_rate": 3.9450441609421005e-05, + "loss": 0.328266441822052, + "step": 824 + }, + { + "epoch": 0.8056640625, + "grad_norm": 0.38910478353500366, + "learning_rate": 3.925417075564279e-05, + "loss": 0.4758382737636566, + "step": 825 + }, + { + "epoch": 0.806640625, + "grad_norm": 0.4268342852592468, + "learning_rate": 3.9057899901864576e-05, + "loss": 0.6131553649902344, + "step": 826 + }, + { + "epoch": 0.8076171875, + "grad_norm": 0.32205328345298767, + "learning_rate": 3.8861629048086365e-05, + "loss": 0.6047544479370117, + "step": 827 + }, + { + "epoch": 0.80859375, + "grad_norm": 0.6975948214530945, + "learning_rate": 3.866535819430815e-05, + "loss": 0.7599061727523804, + "step": 828 + }, + { + "epoch": 0.8095703125, + "grad_norm": 0.20186780393123627, + "learning_rate": 3.846908734052994e-05, + "loss": 0.3639545738697052, + "step": 829 + }, + { + "epoch": 0.810546875, + "grad_norm": 0.443435937166214, + "learning_rate": 3.827281648675172e-05, + "loss": 0.6933274269104004, + "step": 830 + }, + { + "epoch": 0.8115234375, + "grad_norm": 0.44157811999320984, + "learning_rate": 3.80765456329735e-05, + "loss": 0.5135524272918701, + "step": 831 + }, + { + "epoch": 0.8125, + "grad_norm": 0.3959600031375885, + "learning_rate": 3.788027477919529e-05, + "loss": 0.6713152527809143, + "step": 832 + }, + { + "epoch": 0.8134765625, + "grad_norm": 0.5439519882202148, + "learning_rate": 3.768400392541708e-05, + "loss": 0.3603706359863281, + "step": 833 + }, + { + "epoch": 0.814453125, + "grad_norm": 0.36693719029426575, + "learning_rate": 3.748773307163886e-05, + "loss": 0.8574247360229492, + "step": 834 + }, + { + "epoch": 0.8154296875, + "grad_norm": 0.3476804792881012, + "learning_rate": 3.729146221786065e-05, + "loss": 0.6845530867576599, + "step": 835 + }, + { + "epoch": 0.81640625, + "grad_norm": 0.48850229382514954, + "learning_rate": 3.709519136408244e-05, + "loss": 0.788569450378418, + "step": 836 + }, + { + "epoch": 0.8173828125, + "grad_norm": 0.5997111797332764, + "learning_rate": 3.6898920510304216e-05, + "loss": 0.5885312557220459, + "step": 837 + }, + { + "epoch": 0.818359375, + "grad_norm": 0.43312472105026245, + "learning_rate": 3.6702649656526005e-05, + "loss": 0.5300126075744629, + "step": 838 + }, + { + "epoch": 0.8193359375, + "grad_norm": 0.6505857110023499, + "learning_rate": 3.6506378802747795e-05, + "loss": 0.7164736986160278, + "step": 839 + }, + { + "epoch": 0.8203125, + "grad_norm": 0.34061765670776367, + "learning_rate": 3.631010794896958e-05, + "loss": 0.5405696034431458, + "step": 840 + }, + { + "epoch": 0.8212890625, + "grad_norm": 0.4188057780265808, + "learning_rate": 3.6113837095191366e-05, + "loss": 1.0057684183120728, + "step": 841 + }, + { + "epoch": 0.822265625, + "grad_norm": 0.392007052898407, + "learning_rate": 3.5917566241413155e-05, + "loss": 0.6687936782836914, + "step": 842 + }, + { + "epoch": 0.8232421875, + "grad_norm": 0.44254210591316223, + "learning_rate": 3.572129538763494e-05, + "loss": 0.39150726795196533, + "step": 843 + }, + { + "epoch": 0.82421875, + "grad_norm": 0.41756534576416016, + "learning_rate": 3.552502453385673e-05, + "loss": 0.764665961265564, + "step": 844 + }, + { + "epoch": 0.8251953125, + "grad_norm": 0.9839560985565186, + "learning_rate": 3.532875368007851e-05, + "loss": 0.45259296894073486, + "step": 845 + }, + { + "epoch": 0.826171875, + "grad_norm": 0.3465111553668976, + "learning_rate": 3.513248282630029e-05, + "loss": 0.5895928740501404, + "step": 846 + }, + { + "epoch": 0.8271484375, + "grad_norm": 0.4883447289466858, + "learning_rate": 3.493621197252208e-05, + "loss": 0.8401346802711487, + "step": 847 + }, + { + "epoch": 0.828125, + "grad_norm": 0.3590312898159027, + "learning_rate": 3.473994111874387e-05, + "loss": 0.6134470105171204, + "step": 848 + }, + { + "epoch": 0.8291015625, + "grad_norm": 0.48273324966430664, + "learning_rate": 3.454367026496565e-05, + "loss": 0.6351644992828369, + "step": 849 + }, + { + "epoch": 0.830078125, + "grad_norm": 0.32156500220298767, + "learning_rate": 3.434739941118744e-05, + "loss": 0.5098355412483215, + "step": 850 + }, + { + "epoch": 0.8310546875, + "grad_norm": 0.38239747285842896, + "learning_rate": 3.415112855740923e-05, + "loss": 1.0178660154342651, + "step": 851 + }, + { + "epoch": 0.83203125, + "grad_norm": 0.6875290274620056, + "learning_rate": 3.395485770363101e-05, + "loss": 0.4496825337409973, + "step": 852 + }, + { + "epoch": 0.8330078125, + "grad_norm": 0.27034860849380493, + "learning_rate": 3.3758586849852795e-05, + "loss": 0.41253381967544556, + "step": 853 + }, + { + "epoch": 0.833984375, + "grad_norm": 0.5166223049163818, + "learning_rate": 3.3562315996074584e-05, + "loss": 0.7344639897346497, + "step": 854 + }, + { + "epoch": 0.8349609375, + "grad_norm": 0.39597758650779724, + "learning_rate": 3.3366045142296373e-05, + "loss": 0.6066821217536926, + "step": 855 + }, + { + "epoch": 0.8359375, + "grad_norm": 0.44033098220825195, + "learning_rate": 3.3169774288518156e-05, + "loss": 0.7928174734115601, + "step": 856 + }, + { + "epoch": 0.8369140625, + "grad_norm": 0.3340597450733185, + "learning_rate": 3.2973503434739945e-05, + "loss": 0.4783233404159546, + "step": 857 + }, + { + "epoch": 0.837890625, + "grad_norm": 0.5634653568267822, + "learning_rate": 3.2777232580961734e-05, + "loss": 0.785845935344696, + "step": 858 + }, + { + "epoch": 0.8388671875, + "grad_norm": 0.24581296741962433, + "learning_rate": 3.258096172718351e-05, + "loss": 0.36480462551116943, + "step": 859 + }, + { + "epoch": 0.83984375, + "grad_norm": 0.316773384809494, + "learning_rate": 3.23846908734053e-05, + "loss": 0.886894941329956, + "step": 860 + }, + { + "epoch": 0.8408203125, + "grad_norm": 0.4605409502983093, + "learning_rate": 3.218842001962709e-05, + "loss": 0.7125131487846375, + "step": 861 + }, + { + "epoch": 0.841796875, + "grad_norm": 0.5473557114601135, + "learning_rate": 3.199214916584887e-05, + "loss": 0.45582157373428345, + "step": 862 + }, + { + "epoch": 0.8427734375, + "grad_norm": 0.4604926109313965, + "learning_rate": 3.179587831207066e-05, + "loss": 0.5392733812332153, + "step": 863 + }, + { + "epoch": 0.84375, + "grad_norm": 0.3192322552204132, + "learning_rate": 3.159960745829245e-05, + "loss": 0.3216538727283478, + "step": 864 + }, + { + "epoch": 0.8447265625, + "grad_norm": 0.4225713610649109, + "learning_rate": 3.140333660451423e-05, + "loss": 0.36403900384902954, + "step": 865 + }, + { + "epoch": 0.845703125, + "grad_norm": 0.7738484740257263, + "learning_rate": 3.120706575073602e-05, + "loss": 0.5428112149238586, + "step": 866 + }, + { + "epoch": 0.8466796875, + "grad_norm": 0.7795976400375366, + "learning_rate": 3.10107948969578e-05, + "loss": 0.838668704032898, + "step": 867 + }, + { + "epoch": 0.84765625, + "grad_norm": 0.4240044355392456, + "learning_rate": 3.0814524043179585e-05, + "loss": 0.5039677023887634, + "step": 868 + }, + { + "epoch": 0.8486328125, + "grad_norm": 0.7870606780052185, + "learning_rate": 3.0618253189401374e-05, + "loss": 0.2639703154563904, + "step": 869 + }, + { + "epoch": 0.849609375, + "grad_norm": 4.898192405700684, + "learning_rate": 3.042198233562316e-05, + "loss": 0.9641809463500977, + "step": 870 + }, + { + "epoch": 0.8505859375, + "grad_norm": 0.4090663194656372, + "learning_rate": 3.022571148184495e-05, + "loss": 0.5249053835868835, + "step": 871 + }, + { + "epoch": 0.8515625, + "grad_norm": 0.5761129856109619, + "learning_rate": 3.0029440628066735e-05, + "loss": 0.8987921476364136, + "step": 872 + }, + { + "epoch": 0.8525390625, + "grad_norm": 0.2440023124217987, + "learning_rate": 2.983316977428852e-05, + "loss": 0.3279159367084503, + "step": 873 + }, + { + "epoch": 0.853515625, + "grad_norm": 0.438519150018692, + "learning_rate": 2.9636898920510303e-05, + "loss": 0.8272308111190796, + "step": 874 + }, + { + "epoch": 0.8544921875, + "grad_norm": 0.4011988639831543, + "learning_rate": 2.944062806673209e-05, + "loss": 0.3140803873538971, + "step": 875 + }, + { + "epoch": 0.85546875, + "grad_norm": 0.5748201012611389, + "learning_rate": 2.9244357212953878e-05, + "loss": 0.6699116230010986, + "step": 876 + }, + { + "epoch": 0.8564453125, + "grad_norm": 0.3001462519168854, + "learning_rate": 2.9048086359175664e-05, + "loss": 0.19382989406585693, + "step": 877 + }, + { + "epoch": 0.857421875, + "grad_norm": 0.40844887495040894, + "learning_rate": 2.885181550539745e-05, + "loss": 0.6494845747947693, + "step": 878 + }, + { + "epoch": 0.8583984375, + "grad_norm": 0.3480914235115051, + "learning_rate": 2.865554465161924e-05, + "loss": 0.5555131435394287, + "step": 879 + }, + { + "epoch": 0.859375, + "grad_norm": 0.3903101682662964, + "learning_rate": 2.8459273797841024e-05, + "loss": 0.6830955147743225, + "step": 880 + }, + { + "epoch": 0.8603515625, + "grad_norm": 0.3058629333972931, + "learning_rate": 2.826300294406281e-05, + "loss": 0.3747236728668213, + "step": 881 + }, + { + "epoch": 0.861328125, + "grad_norm": 0.49275287985801697, + "learning_rate": 2.8066732090284592e-05, + "loss": 1.0192487239837646, + "step": 882 + }, + { + "epoch": 0.8623046875, + "grad_norm": 0.4016769826412201, + "learning_rate": 2.7870461236506378e-05, + "loss": 0.4012300372123718, + "step": 883 + }, + { + "epoch": 0.86328125, + "grad_norm": 0.4790811240673065, + "learning_rate": 2.7674190382728167e-05, + "loss": 0.6936056613922119, + "step": 884 + }, + { + "epoch": 0.8642578125, + "grad_norm": 0.39931413531303406, + "learning_rate": 2.7477919528949953e-05, + "loss": 0.3612633943557739, + "step": 885 + }, + { + "epoch": 0.865234375, + "grad_norm": 0.3250795006752014, + "learning_rate": 2.728164867517174e-05, + "loss": 0.5146504640579224, + "step": 886 + }, + { + "epoch": 0.8662109375, + "grad_norm": 0.5216737985610962, + "learning_rate": 2.7085377821393525e-05, + "loss": 0.6185201406478882, + "step": 887 + }, + { + "epoch": 0.8671875, + "grad_norm": 0.5681923031806946, + "learning_rate": 2.6889106967615314e-05, + "loss": 0.9492973685264587, + "step": 888 + }, + { + "epoch": 0.8681640625, + "grad_norm": 0.5284391045570374, + "learning_rate": 2.6692836113837093e-05, + "loss": 0.7801765203475952, + "step": 889 + }, + { + "epoch": 0.869140625, + "grad_norm": 0.42510825395584106, + "learning_rate": 2.6496565260058882e-05, + "loss": 0.4871942102909088, + "step": 890 + }, + { + "epoch": 0.8701171875, + "grad_norm": 0.39092326164245605, + "learning_rate": 2.6300294406280668e-05, + "loss": 0.5123960375785828, + "step": 891 + }, + { + "epoch": 0.87109375, + "grad_norm": 0.37694281339645386, + "learning_rate": 2.6104023552502453e-05, + "loss": 0.3543451428413391, + "step": 892 + }, + { + "epoch": 0.8720703125, + "grad_norm": 0.26519376039505005, + "learning_rate": 2.5907752698724242e-05, + "loss": 0.2388455718755722, + "step": 893 + }, + { + "epoch": 0.873046875, + "grad_norm": 0.6303861141204834, + "learning_rate": 2.5711481844946028e-05, + "loss": 0.7195224761962891, + "step": 894 + }, + { + "epoch": 0.8740234375, + "grad_norm": 0.4436159133911133, + "learning_rate": 2.5515210991167814e-05, + "loss": 0.8888048529624939, + "step": 895 + }, + { + "epoch": 0.875, + "grad_norm": 0.6473313570022583, + "learning_rate": 2.5318940137389596e-05, + "loss": 0.8557075262069702, + "step": 896 + }, + { + "epoch": 0.8759765625, + "grad_norm": 0.6625436544418335, + "learning_rate": 2.5122669283611382e-05, + "loss": 0.7132158279418945, + "step": 897 + }, + { + "epoch": 0.876953125, + "grad_norm": 0.7241202592849731, + "learning_rate": 2.492639842983317e-05, + "loss": 0.9367854595184326, + "step": 898 + }, + { + "epoch": 0.8779296875, + "grad_norm": 0.5321157574653625, + "learning_rate": 2.4730127576054957e-05, + "loss": 1.0013937950134277, + "step": 899 + }, + { + "epoch": 0.87890625, + "grad_norm": 0.3287423253059387, + "learning_rate": 2.4533856722276743e-05, + "loss": 0.4560258984565735, + "step": 900 + }, + { + "epoch": 0.8798828125, + "grad_norm": 0.5040727257728577, + "learning_rate": 2.4337585868498532e-05, + "loss": 0.5655212998390198, + "step": 901 + }, + { + "epoch": 0.880859375, + "grad_norm": 0.4150228202342987, + "learning_rate": 2.4141315014720314e-05, + "loss": 0.43106216192245483, + "step": 902 + }, + { + "epoch": 0.8818359375, + "grad_norm": 0.4006192684173584, + "learning_rate": 2.39450441609421e-05, + "loss": 0.4401901364326477, + "step": 903 + }, + { + "epoch": 0.8828125, + "grad_norm": 0.5145865678787231, + "learning_rate": 2.374877330716389e-05, + "loss": 0.9345691800117493, + "step": 904 + }, + { + "epoch": 0.8837890625, + "grad_norm": 0.7273013591766357, + "learning_rate": 2.3552502453385675e-05, + "loss": 0.27768659591674805, + "step": 905 + }, + { + "epoch": 0.884765625, + "grad_norm": 0.3039482831954956, + "learning_rate": 2.3356231599607457e-05, + "loss": 0.6196010112762451, + "step": 906 + }, + { + "epoch": 0.8857421875, + "grad_norm": 0.35697150230407715, + "learning_rate": 2.3159960745829247e-05, + "loss": 0.34777021408081055, + "step": 907 + }, + { + "epoch": 0.88671875, + "grad_norm": 0.356717050075531, + "learning_rate": 2.2963689892051032e-05, + "loss": 0.4651508331298828, + "step": 908 + }, + { + "epoch": 0.8876953125, + "grad_norm": 0.485963374376297, + "learning_rate": 2.2767419038272818e-05, + "loss": 0.3906201720237732, + "step": 909 + }, + { + "epoch": 0.888671875, + "grad_norm": 0.38827836513519287, + "learning_rate": 2.2571148184494604e-05, + "loss": 0.48782849311828613, + "step": 910 + }, + { + "epoch": 0.8896484375, + "grad_norm": 0.39589494466781616, + "learning_rate": 2.237487733071639e-05, + "loss": 0.5089969635009766, + "step": 911 + }, + { + "epoch": 0.890625, + "grad_norm": 0.6619493365287781, + "learning_rate": 2.2178606476938175e-05, + "loss": 0.9266189932823181, + "step": 912 + }, + { + "epoch": 0.8916015625, + "grad_norm": 0.407817542552948, + "learning_rate": 2.198233562315996e-05, + "loss": 0.3518386483192444, + "step": 913 + }, + { + "epoch": 0.892578125, + "grad_norm": 0.4645719826221466, + "learning_rate": 2.1786064769381747e-05, + "loss": 0.9297075271606445, + "step": 914 + }, + { + "epoch": 0.8935546875, + "grad_norm": 0.434517502784729, + "learning_rate": 2.1589793915603536e-05, + "loss": 0.7716128826141357, + "step": 915 + }, + { + "epoch": 0.89453125, + "grad_norm": 0.49387747049331665, + "learning_rate": 2.1393523061825322e-05, + "loss": 0.5475488901138306, + "step": 916 + }, + { + "epoch": 0.8955078125, + "grad_norm": 0.5593905448913574, + "learning_rate": 2.1197252208047104e-05, + "loss": 0.7304456233978271, + "step": 917 + }, + { + "epoch": 0.896484375, + "grad_norm": 0.3386078178882599, + "learning_rate": 2.1000981354268893e-05, + "loss": 0.7872465252876282, + "step": 918 + }, + { + "epoch": 0.8974609375, + "grad_norm": 0.2872868478298187, + "learning_rate": 2.080471050049068e-05, + "loss": 0.3295198976993561, + "step": 919 + }, + { + "epoch": 0.8984375, + "grad_norm": 0.4897945523262024, + "learning_rate": 2.060843964671246e-05, + "loss": 0.3939395546913147, + "step": 920 + }, + { + "epoch": 0.8994140625, + "grad_norm": 0.5068129897117615, + "learning_rate": 2.041216879293425e-05, + "loss": 0.4646037817001343, + "step": 921 + }, + { + "epoch": 0.900390625, + "grad_norm": 0.3769625425338745, + "learning_rate": 2.0215897939156036e-05, + "loss": 0.811498761177063, + "step": 922 + }, + { + "epoch": 0.9013671875, + "grad_norm": 0.380655974149704, + "learning_rate": 2.0019627085377822e-05, + "loss": 0.6260181665420532, + "step": 923 + }, + { + "epoch": 0.90234375, + "grad_norm": 0.5810602903366089, + "learning_rate": 1.9823356231599608e-05, + "loss": 0.7125158309936523, + "step": 924 + }, + { + "epoch": 0.9033203125, + "grad_norm": 0.4367387592792511, + "learning_rate": 1.9627085377821394e-05, + "loss": 0.7728107571601868, + "step": 925 + }, + { + "epoch": 0.904296875, + "grad_norm": 0.604702353477478, + "learning_rate": 1.9430814524043183e-05, + "loss": 0.5136534571647644, + "step": 926 + }, + { + "epoch": 0.9052734375, + "grad_norm": 0.40865615010261536, + "learning_rate": 1.923454367026497e-05, + "loss": 0.5040115714073181, + "step": 927 + }, + { + "epoch": 0.90625, + "grad_norm": 0.3602078855037689, + "learning_rate": 1.903827281648675e-05, + "loss": 0.4498569965362549, + "step": 928 + }, + { + "epoch": 0.9072265625, + "grad_norm": 0.46351152658462524, + "learning_rate": 1.884200196270854e-05, + "loss": 0.8635745644569397, + "step": 929 + }, + { + "epoch": 0.908203125, + "grad_norm": 0.5490495562553406, + "learning_rate": 1.8645731108930326e-05, + "loss": 0.9265761375427246, + "step": 930 + }, + { + "epoch": 0.9091796875, + "grad_norm": 0.4198157489299774, + "learning_rate": 1.8449460255152108e-05, + "loss": 0.8148217797279358, + "step": 931 + }, + { + "epoch": 0.91015625, + "grad_norm": 0.5183578729629517, + "learning_rate": 1.8253189401373897e-05, + "loss": 0.7837534546852112, + "step": 932 + }, + { + "epoch": 0.9111328125, + "grad_norm": 0.41839340329170227, + "learning_rate": 1.8056918547595683e-05, + "loss": 0.7239848971366882, + "step": 933 + }, + { + "epoch": 0.912109375, + "grad_norm": 0.49158063530921936, + "learning_rate": 1.786064769381747e-05, + "loss": 0.7751527428627014, + "step": 934 + }, + { + "epoch": 0.9130859375, + "grad_norm": 0.20171599090099335, + "learning_rate": 1.7664376840039255e-05, + "loss": 0.181843563914299, + "step": 935 + }, + { + "epoch": 0.9140625, + "grad_norm": 0.36237961053848267, + "learning_rate": 1.746810598626104e-05, + "loss": 0.5150234699249268, + "step": 936 + }, + { + "epoch": 0.9150390625, + "grad_norm": 0.4587535858154297, + "learning_rate": 1.7271835132482826e-05, + "loss": 0.6178685426712036, + "step": 937 + }, + { + "epoch": 0.916015625, + "grad_norm": 0.392635703086853, + "learning_rate": 1.7075564278704615e-05, + "loss": 0.7002321481704712, + "step": 938 + }, + { + "epoch": 0.9169921875, + "grad_norm": 0.28255772590637207, + "learning_rate": 1.6879293424926398e-05, + "loss": 0.6161627769470215, + "step": 939 + }, + { + "epoch": 0.91796875, + "grad_norm": 0.31382182240486145, + "learning_rate": 1.6683022571148187e-05, + "loss": 0.6143029928207397, + "step": 940 + }, + { + "epoch": 0.9189453125, + "grad_norm": 0.5099475383758545, + "learning_rate": 1.6486751717369972e-05, + "loss": 0.9116108417510986, + "step": 941 + }, + { + "epoch": 0.919921875, + "grad_norm": 0.4015892446041107, + "learning_rate": 1.6290480863591755e-05, + "loss": 0.7331390380859375, + "step": 942 + }, + { + "epoch": 0.9208984375, + "grad_norm": 0.4519053101539612, + "learning_rate": 1.6094210009813544e-05, + "loss": 0.6662384867668152, + "step": 943 + }, + { + "epoch": 0.921875, + "grad_norm": 0.5565328598022461, + "learning_rate": 1.589793915603533e-05, + "loss": 0.37386590242385864, + "step": 944 + }, + { + "epoch": 0.9228515625, + "grad_norm": 0.398419588804245, + "learning_rate": 1.5701668302257116e-05, + "loss": 0.9127399325370789, + "step": 945 + }, + { + "epoch": 0.923828125, + "grad_norm": 0.37491804361343384, + "learning_rate": 1.55053974484789e-05, + "loss": 0.47025924921035767, + "step": 946 + }, + { + "epoch": 0.9248046875, + "grad_norm": 0.49557894468307495, + "learning_rate": 1.5309126594700687e-05, + "loss": 0.6349594593048096, + "step": 947 + }, + { + "epoch": 0.92578125, + "grad_norm": 0.2361314743757248, + "learning_rate": 1.5112855740922475e-05, + "loss": 0.3594982922077179, + "step": 948 + }, + { + "epoch": 0.9267578125, + "grad_norm": 0.40022003650665283, + "learning_rate": 1.491658488714426e-05, + "loss": 0.41701436042785645, + "step": 949 + }, + { + "epoch": 0.927734375, + "grad_norm": 0.349528431892395, + "learning_rate": 1.4720314033366044e-05, + "loss": 0.2943156063556671, + "step": 950 + }, + { + "epoch": 0.9287109375, + "grad_norm": 0.4660559892654419, + "learning_rate": 1.4524043179587832e-05, + "loss": 0.3633948564529419, + "step": 951 + }, + { + "epoch": 0.9296875, + "grad_norm": 0.28590673208236694, + "learning_rate": 1.432777232580962e-05, + "loss": 0.4886907935142517, + "step": 952 + }, + { + "epoch": 0.9306640625, + "grad_norm": 0.4388448894023895, + "learning_rate": 1.4131501472031405e-05, + "loss": 0.6123654246330261, + "step": 953 + }, + { + "epoch": 0.931640625, + "grad_norm": 0.4807531237602234, + "learning_rate": 1.3935230618253189e-05, + "loss": 0.32400381565093994, + "step": 954 + }, + { + "epoch": 0.9326171875, + "grad_norm": 0.3903636932373047, + "learning_rate": 1.3738959764474977e-05, + "loss": 0.6839208006858826, + "step": 955 + }, + { + "epoch": 0.93359375, + "grad_norm": 0.2925507426261902, + "learning_rate": 1.3542688910696762e-05, + "loss": 0.5898708701133728, + "step": 956 + }, + { + "epoch": 0.9345703125, + "grad_norm": 0.39300912618637085, + "learning_rate": 1.3346418056918546e-05, + "loss": 0.3898833692073822, + "step": 957 + }, + { + "epoch": 0.935546875, + "grad_norm": 0.4321513772010803, + "learning_rate": 1.3150147203140334e-05, + "loss": 0.5717346668243408, + "step": 958 + }, + { + "epoch": 0.9365234375, + "grad_norm": 0.47681212425231934, + "learning_rate": 1.2953876349362121e-05, + "loss": 0.9711145162582397, + "step": 959 + }, + { + "epoch": 0.9375, + "grad_norm": 0.524958610534668, + "learning_rate": 1.2757605495583907e-05, + "loss": 0.6577808260917664, + "step": 960 + }, + { + "epoch": 0.9384765625, + "grad_norm": 0.40814298391342163, + "learning_rate": 1.2561334641805691e-05, + "loss": 0.5148733258247375, + "step": 961 + }, + { + "epoch": 0.939453125, + "grad_norm": 0.3122687041759491, + "learning_rate": 1.2365063788027479e-05, + "loss": 0.884072482585907, + "step": 962 + }, + { + "epoch": 0.9404296875, + "grad_norm": 0.4473840594291687, + "learning_rate": 1.2168792934249266e-05, + "loss": 0.660685658454895, + "step": 963 + }, + { + "epoch": 0.94140625, + "grad_norm": 0.3491450548171997, + "learning_rate": 1.197252208047105e-05, + "loss": 0.8680378794670105, + "step": 964 + }, + { + "epoch": 0.9423828125, + "grad_norm": 0.6323879957199097, + "learning_rate": 1.1776251226692837e-05, + "loss": 0.8196921348571777, + "step": 965 + }, + { + "epoch": 0.943359375, + "grad_norm": 0.354900062084198, + "learning_rate": 1.1579980372914623e-05, + "loss": 0.5380838513374329, + "step": 966 + }, + { + "epoch": 0.9443359375, + "grad_norm": 0.3235265612602234, + "learning_rate": 1.1383709519136409e-05, + "loss": 0.39993464946746826, + "step": 967 + }, + { + "epoch": 0.9453125, + "grad_norm": 0.3700491786003113, + "learning_rate": 1.1187438665358195e-05, + "loss": 0.6613435745239258, + "step": 968 + }, + { + "epoch": 0.9462890625, + "grad_norm": 0.29880228638648987, + "learning_rate": 1.099116781157998e-05, + "loss": 0.5756196975708008, + "step": 969 + }, + { + "epoch": 0.947265625, + "grad_norm": 0.4585433304309845, + "learning_rate": 1.0794896957801768e-05, + "loss": 0.5012968182563782, + "step": 970 + }, + { + "epoch": 0.9482421875, + "grad_norm": 0.5275799632072449, + "learning_rate": 1.0598626104023552e-05, + "loss": 0.4986013174057007, + "step": 971 + }, + { + "epoch": 0.94921875, + "grad_norm": 0.30642619729042053, + "learning_rate": 1.040235525024534e-05, + "loss": 0.29793277382850647, + "step": 972 + }, + { + "epoch": 0.9501953125, + "grad_norm": 0.7356166243553162, + "learning_rate": 1.0206084396467125e-05, + "loss": 0.6518126726150513, + "step": 973 + }, + { + "epoch": 0.951171875, + "grad_norm": 0.6069150567054749, + "learning_rate": 1.0009813542688911e-05, + "loss": 0.7005544900894165, + "step": 974 + }, + { + "epoch": 0.9521484375, + "grad_norm": 0.500067949295044, + "learning_rate": 9.813542688910697e-06, + "loss": 0.5567950010299683, + "step": 975 + }, + { + "epoch": 0.953125, + "grad_norm": 0.5926097631454468, + "learning_rate": 9.617271835132484e-06, + "loss": 0.6974345445632935, + "step": 976 + }, + { + "epoch": 0.9541015625, + "grad_norm": 0.28873002529144287, + "learning_rate": 9.42100098135427e-06, + "loss": 0.28231939673423767, + "step": 977 + }, + { + "epoch": 0.955078125, + "grad_norm": 0.6644822359085083, + "learning_rate": 9.224730127576054e-06, + "loss": 0.46575701236724854, + "step": 978 + }, + { + "epoch": 0.9560546875, + "grad_norm": 0.34748774766921997, + "learning_rate": 9.028459273797842e-06, + "loss": 0.7192713022232056, + "step": 979 + }, + { + "epoch": 0.95703125, + "grad_norm": 0.4444558024406433, + "learning_rate": 8.832188420019627e-06, + "loss": 0.34014150500297546, + "step": 980 + }, + { + "epoch": 0.9580078125, + "grad_norm": 0.4814091920852661, + "learning_rate": 8.635917566241413e-06, + "loss": 0.8042552471160889, + "step": 981 + }, + { + "epoch": 0.958984375, + "grad_norm": 0.5443412661552429, + "learning_rate": 8.439646712463199e-06, + "loss": 0.6534023880958557, + "step": 982 + }, + { + "epoch": 0.9599609375, + "grad_norm": 0.40025195479393005, + "learning_rate": 8.243375858684986e-06, + "loss": 0.9056930541992188, + "step": 983 + }, + { + "epoch": 0.9609375, + "grad_norm": 0.41958069801330566, + "learning_rate": 8.047105004906772e-06, + "loss": 0.5610394477844238, + "step": 984 + }, + { + "epoch": 0.9619140625, + "grad_norm": 0.33056482672691345, + "learning_rate": 7.850834151128558e-06, + "loss": 0.5796000361442566, + "step": 985 + }, + { + "epoch": 0.962890625, + "grad_norm": 0.5056169629096985, + "learning_rate": 7.654563297350344e-06, + "loss": 0.7795373201370239, + "step": 986 + }, + { + "epoch": 0.9638671875, + "grad_norm": 0.4030667543411255, + "learning_rate": 7.45829244357213e-06, + "loss": 0.761528491973877, + "step": 987 + }, + { + "epoch": 0.96484375, + "grad_norm": 0.22716952860355377, + "learning_rate": 7.262021589793916e-06, + "loss": 0.21712671220302582, + "step": 988 + }, + { + "epoch": 0.9658203125, + "grad_norm": 0.4826786518096924, + "learning_rate": 7.0657507360157025e-06, + "loss": 0.6192560791969299, + "step": 989 + }, + { + "epoch": 0.966796875, + "grad_norm": 0.3611379861831665, + "learning_rate": 6.869479882237488e-06, + "loss": 0.5660407543182373, + "step": 990 + }, + { + "epoch": 0.9677734375, + "grad_norm": 0.44197750091552734, + "learning_rate": 6.673209028459273e-06, + "loss": 0.8223164081573486, + "step": 991 + }, + { + "epoch": 0.96875, + "grad_norm": 0.45650866627693176, + "learning_rate": 6.476938174681061e-06, + "loss": 0.5810177326202393, + "step": 992 + }, + { + "epoch": 0.9697265625, + "grad_norm": 0.6275922060012817, + "learning_rate": 6.2806673209028455e-06, + "loss": 0.46302127838134766, + "step": 993 + }, + { + "epoch": 0.970703125, + "grad_norm": 0.29163289070129395, + "learning_rate": 6.084396467124633e-06, + "loss": 0.49744415283203125, + "step": 994 + }, + { + "epoch": 0.9716796875, + "grad_norm": 0.4289768934249878, + "learning_rate": 5.888125613346419e-06, + "loss": 0.39710360765457153, + "step": 995 + }, + { + "epoch": 0.97265625, + "grad_norm": 0.43311089277267456, + "learning_rate": 5.6918547595682045e-06, + "loss": 0.4934995174407959, + "step": 996 + }, + { + "epoch": 0.9736328125, + "grad_norm": 0.4249640703201294, + "learning_rate": 5.49558390578999e-06, + "loss": 0.6822129487991333, + "step": 997 + }, + { + "epoch": 0.974609375, + "grad_norm": 0.4080635607242584, + "learning_rate": 5.299313052011776e-06, + "loss": 0.2851019501686096, + "step": 998 + }, + { + "epoch": 0.9755859375, + "grad_norm": 0.3082174062728882, + "learning_rate": 5.103042198233563e-06, + "loss": 0.8851650357246399, + "step": 999 + }, + { + "epoch": 0.9765625, + "grad_norm": 0.5285578370094299, + "learning_rate": 4.906771344455348e-06, + "loss": 0.5684286952018738, + "step": 1000 + }, + { + "epoch": 0.9775390625, + "grad_norm": 0.37052616477012634, + "learning_rate": 4.710500490677135e-06, + "loss": 0.8170924782752991, + "step": 1001 + }, + { + "epoch": 0.978515625, + "grad_norm": 0.46926191449165344, + "learning_rate": 4.514229636898921e-06, + "loss": 0.665911853313446, + "step": 1002 + }, + { + "epoch": 0.9794921875, + "grad_norm": 0.38110095262527466, + "learning_rate": 4.3179587831207065e-06, + "loss": 0.9365942478179932, + "step": 1003 + }, + { + "epoch": 0.98046875, + "grad_norm": 0.3803754150867462, + "learning_rate": 4.121687929342493e-06, + "loss": 0.756361722946167, + "step": 1004 + }, + { + "epoch": 0.9814453125, + "grad_norm": 0.6576887965202332, + "learning_rate": 3.925417075564279e-06, + "loss": 0.6846331357955933, + "step": 1005 + }, + { + "epoch": 0.982421875, + "grad_norm": 0.6425113081932068, + "learning_rate": 3.729146221786065e-06, + "loss": 0.7665562629699707, + "step": 1006 + }, + { + "epoch": 0.9833984375, + "grad_norm": 0.28858375549316406, + "learning_rate": 3.5328753680078512e-06, + "loss": 0.2748746871948242, + "step": 1007 + }, + { + "epoch": 0.984375, + "grad_norm": 0.38693365454673767, + "learning_rate": 3.3366045142296366e-06, + "loss": 0.6602081060409546, + "step": 1008 + }, + { + "epoch": 0.9853515625, + "grad_norm": 0.39297735691070557, + "learning_rate": 3.1403336604514228e-06, + "loss": 0.43784576654434204, + "step": 1009 + }, + { + "epoch": 0.986328125, + "grad_norm": 0.4182215929031372, + "learning_rate": 2.9440628066732094e-06, + "loss": 0.7852948307991028, + "step": 1010 + }, + { + "epoch": 0.9873046875, + "grad_norm": 0.4079328775405884, + "learning_rate": 2.747791952894995e-06, + "loss": 0.5413305759429932, + "step": 1011 + }, + { + "epoch": 0.98828125, + "grad_norm": 0.41826963424682617, + "learning_rate": 2.5515210991167813e-06, + "loss": 0.449452668428421, + "step": 1012 + }, + { + "epoch": 0.9892578125, + "grad_norm": 0.31969836354255676, + "learning_rate": 2.3552502453385675e-06, + "loss": 0.26595592498779297, + "step": 1013 + }, + { + "epoch": 0.990234375, + "grad_norm": 0.466192364692688, + "learning_rate": 2.1589793915603533e-06, + "loss": 0.6175995469093323, + "step": 1014 + }, + { + "epoch": 0.9912109375, + "grad_norm": 0.4734349846839905, + "learning_rate": 1.9627085377821394e-06, + "loss": 0.6440984010696411, + "step": 1015 + }, + { + "epoch": 0.9921875, + "grad_norm": 0.4446095824241638, + "learning_rate": 1.7664376840039256e-06, + "loss": 0.5738557577133179, + "step": 1016 + }, + { + "epoch": 0.9931640625, + "grad_norm": 0.24098840355873108, + "learning_rate": 1.5701668302257114e-06, + "loss": 0.6320365071296692, + "step": 1017 + }, + { + "epoch": 0.994140625, + "grad_norm": 0.5342791676521301, + "learning_rate": 1.3738959764474976e-06, + "loss": 0.9431695938110352, + "step": 1018 + }, + { + "epoch": 0.9951171875, + "grad_norm": 0.31406712532043457, + "learning_rate": 1.1776251226692837e-06, + "loss": 0.6406105160713196, + "step": 1019 + }, + { + "epoch": 0.99609375, + "grad_norm": 0.5162865519523621, + "learning_rate": 9.813542688910697e-07, + "loss": 0.7935853004455566, + "step": 1020 + }, + { + "epoch": 0.9970703125, + "grad_norm": 0.4624859690666199, + "learning_rate": 7.850834151128557e-07, + "loss": 0.9667851328849792, + "step": 1021 + }, + { + "epoch": 0.998046875, + "grad_norm": 0.43549951910972595, + "learning_rate": 5.888125613346419e-07, + "loss": 0.73248291015625, + "step": 1022 + }, + { + "epoch": 0.9990234375, + "grad_norm": 0.6080308556556702, + "learning_rate": 3.9254170755642785e-07, + "loss": 0.5045021772384644, + "step": 1023 + }, + { + "epoch": 1.0, + "grad_norm": 0.3927266299724579, + "learning_rate": 1.9627085377821392e-07, + "loss": 0.37262263894081116, + "step": 1024 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.871410239702333e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}