| { |
| "best_metric": 0.239013671875, |
| "best_model_checkpoint": "./results_morgangen_auto/checkpoint-240000", |
| "epoch": 0.0024, |
| "eval_steps": 20000, |
| "global_step": 240000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 1e-06, |
| "grad_norm": 7.192080497741699, |
| "learning_rate": 4.944309013222119e-06, |
| "loss": 3.578, |
| "step": 100 |
| }, |
| { |
| "epoch": 2e-06, |
| "grad_norm": 5.5353922843933105, |
| "learning_rate": 5.725086528406295e-06, |
| "loss": 1.9606, |
| "step": 200 |
| }, |
| { |
| "epoch": 3e-06, |
| "grad_norm": 6.314189910888672, |
| "learning_rate": 6.1745550399454076e-06, |
| "loss": 1.5525, |
| "step": 300 |
| }, |
| { |
| "epoch": 4e-06, |
| "grad_norm": 4.842369556427002, |
| "learning_rate": 6.4914927390661495e-06, |
| "loss": 1.3692, |
| "step": 400 |
| }, |
| { |
| "epoch": 5e-06, |
| "grad_norm": 6.878078460693359, |
| "learning_rate": 6.736512997333922e-06, |
| "loss": 1.2631, |
| "step": 500 |
| }, |
| { |
| "epoch": 6e-06, |
| "grad_norm": 5.2255377769470215, |
| "learning_rate": 6.936292414321374e-06, |
| "loss": 1.2033, |
| "step": 600 |
| }, |
| { |
| "epoch": 7e-06, |
| "grad_norm": 6.255804538726807, |
| "learning_rate": 7.104962011475284e-06, |
| "loss": 1.176, |
| "step": 700 |
| }, |
| { |
| "epoch": 8e-06, |
| "grad_norm": 5.792397499084473, |
| "learning_rate": 7.250917821641176e-06, |
| "loss": 1.1391, |
| "step": 800 |
| }, |
| { |
| "epoch": 9e-06, |
| "grad_norm": 6.366394996643066, |
| "learning_rate": 7.37955758828978e-06, |
| "loss": 1.0937, |
| "step": 900 |
| }, |
| { |
| "epoch": 1e-05, |
| "grad_norm": 5.171053886413574, |
| "learning_rate": 7.494557701864313e-06, |
| "loss": 1.0665, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.1e-05, |
| "grad_norm": 5.637969017028809, |
| "learning_rate": 7.598535297940343e-06, |
| "loss": 1.0483, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.2e-05, |
| "grad_norm": 5.52170467376709, |
| "learning_rate": 7.69341976321039e-06, |
| "loss": 1.0372, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.3e-05, |
| "grad_norm": 4.622586727142334, |
| "learning_rate": 7.780674421043177e-06, |
| "loss": 1.0008, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.4e-05, |
| "grad_norm": 4.899344444274902, |
| "learning_rate": 7.86143551902404e-06, |
| "loss": 0.9829, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.5e-05, |
| "grad_norm": 4.919180870056152, |
| "learning_rate": 7.936602981651121e-06, |
| "loss": 0.9821, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.6e-05, |
| "grad_norm": 5.980068206787109, |
| "learning_rate": 8.006901718483e-06, |
| "loss": 0.9506, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.7e-05, |
| "grad_norm": 5.955260276794434, |
| "learning_rate": 8.072924256347751e-06, |
| "loss": 0.9382, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.8e-05, |
| "grad_norm": 4.828701019287109, |
| "learning_rate": 8.135161132285844e-06, |
| "loss": 0.923, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.9e-05, |
| "grad_norm": 5.517646789550781, |
| "learning_rate": 8.194023035760226e-06, |
| "loss": 0.9019, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2e-05, |
| "grad_norm": 4.49784517288208, |
| "learning_rate": 8.249857250056917e-06, |
| "loss": 0.9013, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.1e-05, |
| "grad_norm": 4.274550914764404, |
| "learning_rate": 8.302960068255784e-06, |
| "loss": 0.9038, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.2e-05, |
| "grad_norm": 5.395852565765381, |
| "learning_rate": 8.35358631144535e-06, |
| "loss": 0.9049, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.3e-05, |
| "grad_norm": 4.8221611976623535, |
| "learning_rate": 8.4019567246832e-06, |
| "loss": 0.8675, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.4e-05, |
| "grad_norm": 4.574971675872803, |
| "learning_rate": 8.448263794376455e-06, |
| "loss": 0.8743, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.5e-05, |
| "grad_norm": 4.795435905456543, |
| "learning_rate": 8.492676374898522e-06, |
| "loss": 0.8473, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.6e-05, |
| "grad_norm": 4.126605987548828, |
| "learning_rate": 8.535343405461191e-06, |
| "loss": 0.8553, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.7e-05, |
| "grad_norm": 4.7121992111206055, |
| "learning_rate": 8.576396923806893e-06, |
| "loss": 0.8496, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.8e-05, |
| "grad_norm": 4.441779136657715, |
| "learning_rate": 8.615954530556104e-06, |
| "loss": 0.8336, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.9e-05, |
| "grad_norm": 4.51931095123291, |
| "learning_rate": 8.654121420158637e-06, |
| "loss": 0.8287, |
| "step": 2900 |
| }, |
| { |
| "epoch": 3e-05, |
| "grad_norm": 4.081902980804443, |
| "learning_rate": 8.690992066813325e-06, |
| "loss": 0.8168, |
| "step": 3000 |
| }, |
| { |
| "epoch": 3.1e-05, |
| "grad_norm": 4.671773910522461, |
| "learning_rate": 8.726651633390342e-06, |
| "loss": 0.8166, |
| "step": 3100 |
| }, |
| { |
| "epoch": 3.2e-05, |
| "grad_norm": 4.348089694976807, |
| "learning_rate": 8.76083727950616e-06, |
| "loss": 0.81, |
| "step": 3200 |
| }, |
| { |
| "epoch": 3.3e-05, |
| "grad_norm": 4.5769829750061035, |
| "learning_rate": 8.794308987084338e-06, |
| "loss": 0.7987, |
| "step": 3300 |
| }, |
| { |
| "epoch": 3.4e-05, |
| "grad_norm": 4.201402187347412, |
| "learning_rate": 8.826459649112795e-06, |
| "loss": 0.8051, |
| "step": 3400 |
| }, |
| { |
| "epoch": 3.5e-05, |
| "grad_norm": 4.153467655181885, |
| "learning_rate": 8.857996464094115e-06, |
| "loss": 0.8102, |
| "step": 3500 |
| }, |
| { |
| "epoch": 3.6e-05, |
| "grad_norm": 4.17486047744751, |
| "learning_rate": 8.888643043011622e-06, |
| "loss": 0.7847, |
| "step": 3600 |
| }, |
| { |
| "epoch": 3.7e-05, |
| "grad_norm": 4.006905555725098, |
| "learning_rate": 8.918448269127447e-06, |
| "loss": 0.7765, |
| "step": 3700 |
| }, |
| { |
| "epoch": 3.8e-05, |
| "grad_norm": 3.959653377532959, |
| "learning_rate": 8.947170821665072e-06, |
| "loss": 0.7704, |
| "step": 3800 |
| }, |
| { |
| "epoch": 3.9e-05, |
| "grad_norm": 5.123909950256348, |
| "learning_rate": 8.975432078990786e-06, |
| "loss": 0.7671, |
| "step": 3900 |
| }, |
| { |
| "epoch": 4e-05, |
| "grad_norm": 3.873758316040039, |
| "learning_rate": 9.002976331538332e-06, |
| "loss": 0.7567, |
| "step": 4000 |
| }, |
| { |
| "epoch": 4.1e-05, |
| "grad_norm": 4.340254783630371, |
| "learning_rate": 9.029839062600307e-06, |
| "loss": 0.7416, |
| "step": 4100 |
| }, |
| { |
| "epoch": 4.2e-05, |
| "grad_norm": 4.056208610534668, |
| "learning_rate": 9.056053184939176e-06, |
| "loss": 0.7445, |
| "step": 4200 |
| }, |
| { |
| "epoch": 4.3e-05, |
| "grad_norm": 3.6428372859954834, |
| "learning_rate": 9.081649283234784e-06, |
| "loss": 0.7456, |
| "step": 4300 |
| }, |
| { |
| "epoch": 4.4e-05, |
| "grad_norm": 3.506011486053467, |
| "learning_rate": 9.106655828605087e-06, |
| "loss": 0.7342, |
| "step": 4400 |
| }, |
| { |
| "epoch": 4.5e-05, |
| "grad_norm": 3.7343637943267822, |
| "learning_rate": 9.13109936897355e-06, |
| "loss": 0.7435, |
| "step": 4500 |
| }, |
| { |
| "epoch": 4.6e-05, |
| "grad_norm": 3.65120267868042, |
| "learning_rate": 9.155004698474792e-06, |
| "loss": 0.7346, |
| "step": 4600 |
| }, |
| { |
| "epoch": 4.7e-05, |
| "grad_norm": 3.66025710105896, |
| "learning_rate": 9.17839500860873e-06, |
| "loss": 0.7394, |
| "step": 4700 |
| }, |
| { |
| "epoch": 4.8e-05, |
| "grad_norm": 3.7857818603515625, |
| "learning_rate": 9.201292023453135e-06, |
| "loss": 0.7137, |
| "step": 4800 |
| }, |
| { |
| "epoch": 4.9e-05, |
| "grad_norm": 4.2525858879089355, |
| "learning_rate": 9.22371612091062e-06, |
| "loss": 0.7179, |
| "step": 4900 |
| }, |
| { |
| "epoch": 5e-05, |
| "grad_norm": 3.6449532508850098, |
| "learning_rate": 9.245686441685918e-06, |
| "loss": 0.7149, |
| "step": 5000 |
| }, |
| { |
| "epoch": 5.1e-05, |
| "grad_norm": 4.4344401359558105, |
| "learning_rate": 9.267220987454044e-06, |
| "loss": 0.7209, |
| "step": 5100 |
| }, |
| { |
| "epoch": 5.2e-05, |
| "grad_norm": 3.7667882442474365, |
| "learning_rate": 9.28833670948078e-06, |
| "loss": 0.7104, |
| "step": 5200 |
| }, |
| { |
| "epoch": 5.3e-05, |
| "grad_norm": 3.8509140014648438, |
| "learning_rate": 9.309049588788657e-06, |
| "loss": 0.6939, |
| "step": 5300 |
| }, |
| { |
| "epoch": 5.4e-05, |
| "grad_norm": 4.200638294219971, |
| "learning_rate": 9.329374708818158e-06, |
| "loss": 0.6934, |
| "step": 5400 |
| }, |
| { |
| "epoch": 5.5e-05, |
| "grad_norm": 3.6815011501312256, |
| "learning_rate": 9.349326321411793e-06, |
| "loss": 0.6841, |
| "step": 5500 |
| }, |
| { |
| "epoch": 5.6e-05, |
| "grad_norm": 4.6741719245910645, |
| "learning_rate": 9.368917906844062e-06, |
| "loss": 0.7124, |
| "step": 5600 |
| }, |
| { |
| "epoch": 5.7e-05, |
| "grad_norm": 3.726712942123413, |
| "learning_rate": 9.388162228530614e-06, |
| "loss": 0.6749, |
| "step": 5700 |
| }, |
| { |
| "epoch": 5.8e-05, |
| "grad_norm": 3.2452657222747803, |
| "learning_rate": 9.407071382972726e-06, |
| "loss": 0.7073, |
| "step": 5800 |
| }, |
| { |
| "epoch": 5.9e-05, |
| "grad_norm": 3.7711005210876465, |
| "learning_rate": 9.425656845426483e-06, |
| "loss": 0.6816, |
| "step": 5900 |
| }, |
| { |
| "epoch": 6e-05, |
| "grad_norm": 3.617072105407715, |
| "learning_rate": 9.443929511728523e-06, |
| "loss": 0.6788, |
| "step": 6000 |
| }, |
| { |
| "epoch": 6.1e-05, |
| "grad_norm": 3.677022933959961, |
| "learning_rate": 9.461721498753552e-06, |
| "loss": 0.6707, |
| "step": 6100 |
| }, |
| { |
| "epoch": 6.2e-05, |
| "grad_norm": 3.563831329345703, |
| "learning_rate": 9.479402010032261e-06, |
| "loss": 0.6688, |
| "step": 6200 |
| }, |
| { |
| "epoch": 6.3e-05, |
| "grad_norm": 5.436884880065918, |
| "learning_rate": 9.496799212962515e-06, |
| "loss": 0.6667, |
| "step": 6300 |
| }, |
| { |
| "epoch": 6.4e-05, |
| "grad_norm": 3.69850754737854, |
| "learning_rate": 9.51392204387139e-06, |
| "loss": 0.665, |
| "step": 6400 |
| }, |
| { |
| "epoch": 6.5e-05, |
| "grad_norm": 4.021072864532471, |
| "learning_rate": 9.530779022827808e-06, |
| "loss": 0.6652, |
| "step": 6500 |
| }, |
| { |
| "epoch": 6.6e-05, |
| "grad_norm": 3.9425106048583984, |
| "learning_rate": 9.547378279100432e-06, |
| "loss": 0.6593, |
| "step": 6600 |
| }, |
| { |
| "epoch": 6.7e-05, |
| "grad_norm": 4.435314655303955, |
| "learning_rate": 9.563727574698575e-06, |
| "loss": 0.6521, |
| "step": 6700 |
| }, |
| { |
| "epoch": 6.8e-05, |
| "grad_norm": 3.426171064376831, |
| "learning_rate": 9.579674435701252e-06, |
| "loss": 0.6592, |
| "step": 6800 |
| }, |
| { |
| "epoch": 6.9e-05, |
| "grad_norm": 3.2786061763763428, |
| "learning_rate": 9.595548054769063e-06, |
| "loss": 0.659, |
| "step": 6900 |
| }, |
| { |
| "epoch": 7e-05, |
| "grad_norm": 3.6172714233398438, |
| "learning_rate": 9.611192939364202e-06, |
| "loss": 0.6425, |
| "step": 7000 |
| }, |
| { |
| "epoch": 7.1e-05, |
| "grad_norm": 3.1984446048736572, |
| "learning_rate": 9.626615587957666e-06, |
| "loss": 0.6476, |
| "step": 7100 |
| }, |
| { |
| "epoch": 7.2e-05, |
| "grad_norm": 4.468976974487305, |
| "learning_rate": 9.641822225957206e-06, |
| "loss": 0.6394, |
| "step": 7200 |
| }, |
| { |
| "epoch": 7.3e-05, |
| "grad_norm": 3.582566261291504, |
| "learning_rate": 9.656818820794936e-06, |
| "loss": 0.6312, |
| "step": 7300 |
| }, |
| { |
| "epoch": 7.4e-05, |
| "grad_norm": 3.2956085205078125, |
| "learning_rate": 9.671611095987065e-06, |
| "loss": 0.6349, |
| "step": 7400 |
| }, |
| { |
| "epoch": 7.5e-05, |
| "grad_norm": 3.747368574142456, |
| "learning_rate": 9.686204544248665e-06, |
| "loss": 0.641, |
| "step": 7500 |
| }, |
| { |
| "epoch": 7.6e-05, |
| "grad_norm": 3.5763564109802246, |
| "learning_rate": 9.7006044397387e-06, |
| "loss": 0.635, |
| "step": 7600 |
| }, |
| { |
| "epoch": 7.7e-05, |
| "grad_norm": 3.662113666534424, |
| "learning_rate": 9.714815849503578e-06, |
| "loss": 0.6216, |
| "step": 7700 |
| }, |
| { |
| "epoch": 7.8e-05, |
| "grad_norm": 4.00654935836792, |
| "learning_rate": 9.728843644181411e-06, |
| "loss": 0.6385, |
| "step": 7800 |
| }, |
| { |
| "epoch": 7.9e-05, |
| "grad_norm": 3.1744225025177, |
| "learning_rate": 9.74269250802355e-06, |
| "loss": 0.6225, |
| "step": 7900 |
| }, |
| { |
| "epoch": 8e-05, |
| "grad_norm": 2.9433505535125732, |
| "learning_rate": 9.756366948284976e-06, |
| "loss": 0.6218, |
| "step": 8000 |
| }, |
| { |
| "epoch": 8.1e-05, |
| "grad_norm": 3.351646661758423, |
| "learning_rate": 9.76987130403068e-06, |
| "loss": 0.6156, |
| "step": 8100 |
| }, |
| { |
| "epoch": 8.2e-05, |
| "grad_norm": 3.297624111175537, |
| "learning_rate": 9.783209754401046e-06, |
| "loss": 0.6105, |
| "step": 8200 |
| }, |
| { |
| "epoch": 8.3e-05, |
| "grad_norm": 3.242459535598755, |
| "learning_rate": 9.796386326375682e-06, |
| "loss": 0.6121, |
| "step": 8300 |
| }, |
| { |
| "epoch": 8.4e-05, |
| "grad_norm": 3.301387071609497, |
| "learning_rate": 9.80940490207175e-06, |
| "loss": 0.6103, |
| "step": 8400 |
| }, |
| { |
| "epoch": 8.5e-05, |
| "grad_norm": 3.537336587905884, |
| "learning_rate": 9.822269225609881e-06, |
| "loss": 0.5999, |
| "step": 8500 |
| }, |
| { |
| "epoch": 8.6e-05, |
| "grad_norm": 3.297769784927368, |
| "learning_rate": 9.834856506853153e-06, |
| "loss": 0.6137, |
| "step": 8600 |
| }, |
| { |
| "epoch": 8.7e-05, |
| "grad_norm": 3.5436480045318604, |
| "learning_rate": 9.847424493057225e-06, |
| "loss": 0.5982, |
| "step": 8700 |
| }, |
| { |
| "epoch": 8.8e-05, |
| "grad_norm": 3.3553879261016846, |
| "learning_rate": 9.85984866118054e-06, |
| "loss": 0.5987, |
| "step": 8800 |
| }, |
| { |
| "epoch": 8.9e-05, |
| "grad_norm": 2.8393940925598145, |
| "learning_rate": 9.872010114832027e-06, |
| "loss": 0.5954, |
| "step": 8900 |
| }, |
| { |
| "epoch": 9e-05, |
| "grad_norm": 3.286961317062378, |
| "learning_rate": 9.884157659367727e-06, |
| "loss": 0.6128, |
| "step": 9000 |
| }, |
| { |
| "epoch": 9.1e-05, |
| "grad_norm": 3.427604913711548, |
| "learning_rate": 9.896170795917358e-06, |
| "loss": 0.5997, |
| "step": 9100 |
| }, |
| { |
| "epoch": 9.2e-05, |
| "grad_norm": 4.3479323387146, |
| "learning_rate": 9.908052466307471e-06, |
| "loss": 0.592, |
| "step": 9200 |
| }, |
| { |
| "epoch": 9.3e-05, |
| "grad_norm": 3.0680465698242188, |
| "learning_rate": 9.919805516826294e-06, |
| "loss": 0.5973, |
| "step": 9300 |
| }, |
| { |
| "epoch": 9.4e-05, |
| "grad_norm": 3.6079111099243164, |
| "learning_rate": 9.931432702316388e-06, |
| "loss": 0.5995, |
| "step": 9400 |
| }, |
| { |
| "epoch": 9.5e-05, |
| "grad_norm": 3.1534416675567627, |
| "learning_rate": 9.942936690050469e-06, |
| "loss": 0.5899, |
| "step": 9500 |
| }, |
| { |
| "epoch": 9.6e-05, |
| "grad_norm": 2.7021210193634033, |
| "learning_rate": 9.95432006340404e-06, |
| "loss": 0.5963, |
| "step": 9600 |
| }, |
| { |
| "epoch": 9.7e-05, |
| "grad_norm": 2.8992557525634766, |
| "learning_rate": 9.965585325337488e-06, |
| "loss": 0.5919, |
| "step": 9700 |
| }, |
| { |
| "epoch": 9.8e-05, |
| "grad_norm": 3.391969680786133, |
| "learning_rate": 9.976734901699378e-06, |
| "loss": 0.5814, |
| "step": 9800 |
| }, |
| { |
| "epoch": 9.9e-05, |
| "grad_norm": 2.826235771179199, |
| "learning_rate": 9.987771144361851e-06, |
| "loss": 0.5675, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.0001, |
| "grad_norm": 3.625103235244751, |
| "learning_rate": 9.998696334198274e-06, |
| "loss": 0.589, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.000101, |
| "grad_norm": 3.538278341293335, |
| "learning_rate": 1e-05, |
| "loss": 0.5863, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.000102, |
| "grad_norm": 3.2610013484954834, |
| "learning_rate": 1e-05, |
| "loss": 0.5853, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.000103, |
| "grad_norm": 3.018453359603882, |
| "learning_rate": 1e-05, |
| "loss": 0.581, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.000104, |
| "grad_norm": 2.997459650039673, |
| "learning_rate": 1e-05, |
| "loss": 0.5749, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.000105, |
| "grad_norm": 3.1793456077575684, |
| "learning_rate": 1e-05, |
| "loss": 0.5624, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.000106, |
| "grad_norm": 3.571202039718628, |
| "learning_rate": 1e-05, |
| "loss": 0.574, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.000107, |
| "grad_norm": 3.742325782775879, |
| "learning_rate": 1e-05, |
| "loss": 0.5736, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.000108, |
| "grad_norm": 3.2514116764068604, |
| "learning_rate": 1e-05, |
| "loss": 0.5611, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.000109, |
| "grad_norm": 3.0863165855407715, |
| "learning_rate": 1e-05, |
| "loss": 0.5564, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.00011, |
| "grad_norm": 3.638606071472168, |
| "learning_rate": 1e-05, |
| "loss": 0.5711, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.000111, |
| "grad_norm": 2.756107807159424, |
| "learning_rate": 1e-05, |
| "loss": 0.5711, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.000112, |
| "grad_norm": 3.2166287899017334, |
| "learning_rate": 1e-05, |
| "loss": 0.5781, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.000113, |
| "grad_norm": 2.7138330936431885, |
| "learning_rate": 1e-05, |
| "loss": 0.5522, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.000114, |
| "grad_norm": 3.1922643184661865, |
| "learning_rate": 1e-05, |
| "loss": 0.5686, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.000115, |
| "grad_norm": 3.2307920455932617, |
| "learning_rate": 1e-05, |
| "loss": 0.5621, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.000116, |
| "grad_norm": 2.9271585941314697, |
| "learning_rate": 1e-05, |
| "loss": 0.554, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.000117, |
| "grad_norm": 2.994710922241211, |
| "learning_rate": 1e-05, |
| "loss": 0.5564, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.000118, |
| "grad_norm": 3.0383167266845703, |
| "learning_rate": 1e-05, |
| "loss": 0.5584, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.000119, |
| "grad_norm": 2.635859489440918, |
| "learning_rate": 1e-05, |
| "loss": 0.5657, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.00012, |
| "grad_norm": 2.850497245788574, |
| "learning_rate": 1e-05, |
| "loss": 0.5538, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.000121, |
| "grad_norm": 3.060102701187134, |
| "learning_rate": 1e-05, |
| "loss": 0.5671, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.000122, |
| "grad_norm": 2.882080316543579, |
| "learning_rate": 1e-05, |
| "loss": 0.5563, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.000123, |
| "grad_norm": 2.516627311706543, |
| "learning_rate": 1e-05, |
| "loss": 0.5499, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.000124, |
| "grad_norm": 3.2647488117218018, |
| "learning_rate": 1e-05, |
| "loss": 0.5523, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.000125, |
| "grad_norm": 3.0820746421813965, |
| "learning_rate": 1e-05, |
| "loss": 0.5551, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.000126, |
| "grad_norm": 3.288663864135742, |
| "learning_rate": 1e-05, |
| "loss": 0.5421, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.000127, |
| "grad_norm": 2.8991341590881348, |
| "learning_rate": 1e-05, |
| "loss": 0.5379, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.000128, |
| "grad_norm": 3.0275886058807373, |
| "learning_rate": 1e-05, |
| "loss": 0.5555, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.000129, |
| "grad_norm": 2.8435568809509277, |
| "learning_rate": 1e-05, |
| "loss": 0.5475, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.00013, |
| "grad_norm": 3.5080063343048096, |
| "learning_rate": 1e-05, |
| "loss": 0.5464, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.000131, |
| "grad_norm": 3.4270200729370117, |
| "learning_rate": 1e-05, |
| "loss": 0.5489, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.000132, |
| "grad_norm": 3.046891689300537, |
| "learning_rate": 1e-05, |
| "loss": 0.5427, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.000133, |
| "grad_norm": 2.9758501052856445, |
| "learning_rate": 1e-05, |
| "loss": 0.547, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.000134, |
| "grad_norm": 2.984278917312622, |
| "learning_rate": 1e-05, |
| "loss": 0.5405, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.000135, |
| "grad_norm": 2.7465741634368896, |
| "learning_rate": 1e-05, |
| "loss": 0.5449, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.000136, |
| "grad_norm": 2.6770498752593994, |
| "learning_rate": 1e-05, |
| "loss": 0.5313, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.000137, |
| "grad_norm": 3.1820075511932373, |
| "learning_rate": 1e-05, |
| "loss": 0.5444, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.000138, |
| "grad_norm": 2.7672953605651855, |
| "learning_rate": 1e-05, |
| "loss": 0.5366, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.000139, |
| "grad_norm": 3.25970458984375, |
| "learning_rate": 1e-05, |
| "loss": 0.5331, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.00014, |
| "grad_norm": 2.8809070587158203, |
| "learning_rate": 1e-05, |
| "loss": 0.5332, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.000141, |
| "grad_norm": 2.7005808353424072, |
| "learning_rate": 1e-05, |
| "loss": 0.5335, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.000142, |
| "grad_norm": 3.1988399028778076, |
| "learning_rate": 1e-05, |
| "loss": 0.533, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.000143, |
| "grad_norm": 2.7792532444000244, |
| "learning_rate": 1e-05, |
| "loss": 0.5277, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.000144, |
| "grad_norm": 3.010068655014038, |
| "learning_rate": 1e-05, |
| "loss": 0.5314, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.000145, |
| "grad_norm": 3.3190596103668213, |
| "learning_rate": 1e-05, |
| "loss": 0.5308, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.000146, |
| "grad_norm": 3.3294529914855957, |
| "learning_rate": 1e-05, |
| "loss": 0.5369, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.000147, |
| "grad_norm": 3.0750784873962402, |
| "learning_rate": 1e-05, |
| "loss": 0.5399, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.000148, |
| "grad_norm": 2.720137357711792, |
| "learning_rate": 1e-05, |
| "loss": 0.5255, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.000149, |
| "grad_norm": 3.3225038051605225, |
| "learning_rate": 1e-05, |
| "loss": 0.5324, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.00015, |
| "grad_norm": 2.890933036804199, |
| "learning_rate": 1e-05, |
| "loss": 0.5356, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.000151, |
| "grad_norm": 3.118818998336792, |
| "learning_rate": 1e-05, |
| "loss": 0.5209, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.000152, |
| "grad_norm": 2.549999237060547, |
| "learning_rate": 1e-05, |
| "loss": 0.5327, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.000153, |
| "grad_norm": 2.8534693717956543, |
| "learning_rate": 1e-05, |
| "loss": 0.5161, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.000154, |
| "grad_norm": 3.0132813453674316, |
| "learning_rate": 1e-05, |
| "loss": 0.5225, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.000155, |
| "grad_norm": 2.934532880783081, |
| "learning_rate": 1e-05, |
| "loss": 0.5185, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.000156, |
| "grad_norm": 3.079315423965454, |
| "learning_rate": 1e-05, |
| "loss": 0.5228, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.000157, |
| "grad_norm": 2.9515817165374756, |
| "learning_rate": 1e-05, |
| "loss": 0.5183, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.000158, |
| "grad_norm": 2.9063994884490967, |
| "learning_rate": 1e-05, |
| "loss": 0.5221, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.000159, |
| "grad_norm": 2.873257875442505, |
| "learning_rate": 1e-05, |
| "loss": 0.5184, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.00016, |
| "grad_norm": 3.2980196475982666, |
| "learning_rate": 1e-05, |
| "loss": 0.5208, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.000161, |
| "grad_norm": 3.3925833702087402, |
| "learning_rate": 1e-05, |
| "loss": 0.5165, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.000162, |
| "grad_norm": 3.8521616458892822, |
| "learning_rate": 1e-05, |
| "loss": 0.525, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.000163, |
| "grad_norm": 2.9561917781829834, |
| "learning_rate": 1e-05, |
| "loss": 0.5266, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.000164, |
| "grad_norm": 3.1145403385162354, |
| "learning_rate": 1e-05, |
| "loss": 0.51, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.000165, |
| "grad_norm": 2.615156888961792, |
| "learning_rate": 1e-05, |
| "loss": 0.5253, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.000166, |
| "grad_norm": 2.6065399646759033, |
| "learning_rate": 1e-05, |
| "loss": 0.5121, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.000167, |
| "grad_norm": 3.8787238597869873, |
| "learning_rate": 1e-05, |
| "loss": 0.518, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.000168, |
| "grad_norm": 2.8339273929595947, |
| "learning_rate": 1e-05, |
| "loss": 0.5098, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.000169, |
| "grad_norm": 3.400110960006714, |
| "learning_rate": 1e-05, |
| "loss": 0.5139, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.00017, |
| "grad_norm": 2.8647141456604004, |
| "learning_rate": 1e-05, |
| "loss": 0.51, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.000171, |
| "grad_norm": 3.0346758365631104, |
| "learning_rate": 1e-05, |
| "loss": 0.52, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.000172, |
| "grad_norm": 2.840468168258667, |
| "learning_rate": 1e-05, |
| "loss": 0.5126, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.000173, |
| "grad_norm": 2.32464861869812, |
| "learning_rate": 1e-05, |
| "loss": 0.5141, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.000174, |
| "grad_norm": 2.6493277549743652, |
| "learning_rate": 1e-05, |
| "loss": 0.5061, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.000175, |
| "grad_norm": 2.9182634353637695, |
| "learning_rate": 1e-05, |
| "loss": 0.5079, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.000176, |
| "grad_norm": 2.817209243774414, |
| "learning_rate": 1e-05, |
| "loss": 0.5209, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.000177, |
| "grad_norm": 3.2138490676879883, |
| "learning_rate": 1e-05, |
| "loss": 0.5008, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.000178, |
| "grad_norm": 2.925663471221924, |
| "learning_rate": 1e-05, |
| "loss": 0.5045, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.000179, |
| "grad_norm": 3.3058528900146484, |
| "learning_rate": 1e-05, |
| "loss": 0.5036, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.00018, |
| "grad_norm": 2.73296856880188, |
| "learning_rate": 1e-05, |
| "loss": 0.5023, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.000181, |
| "grad_norm": 3.112847089767456, |
| "learning_rate": 1e-05, |
| "loss": 0.5178, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.000182, |
| "grad_norm": 2.471736431121826, |
| "learning_rate": 1e-05, |
| "loss": 0.5082, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.000183, |
| "grad_norm": 2.6374621391296387, |
| "learning_rate": 1e-05, |
| "loss": 0.5069, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.000184, |
| "grad_norm": 2.838254690170288, |
| "learning_rate": 1e-05, |
| "loss": 0.5101, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.000185, |
| "grad_norm": 2.9953255653381348, |
| "learning_rate": 1e-05, |
| "loss": 0.5076, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.000186, |
| "grad_norm": 2.7611818313598633, |
| "learning_rate": 1e-05, |
| "loss": 0.4942, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.000187, |
| "grad_norm": 2.739598274230957, |
| "learning_rate": 1e-05, |
| "loss": 0.508, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.000188, |
| "grad_norm": 2.6710898876190186, |
| "learning_rate": 1e-05, |
| "loss": 0.5125, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.000189, |
| "grad_norm": 2.7952322959899902, |
| "learning_rate": 1e-05, |
| "loss": 0.4844, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.00019, |
| "grad_norm": 2.4689576625823975, |
| "learning_rate": 1e-05, |
| "loss": 0.5072, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.000191, |
| "grad_norm": 2.6042397022247314, |
| "learning_rate": 1e-05, |
| "loss": 0.491, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.000192, |
| "grad_norm": 2.915821075439453, |
| "learning_rate": 1e-05, |
| "loss": 0.5029, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.000193, |
| "grad_norm": 3.0724246501922607, |
| "learning_rate": 1e-05, |
| "loss": 0.5085, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.000194, |
| "grad_norm": 2.755842924118042, |
| "learning_rate": 1e-05, |
| "loss": 0.5073, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.000195, |
| "grad_norm": 2.6537370681762695, |
| "learning_rate": 1e-05, |
| "loss": 0.4953, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.000196, |
| "grad_norm": 2.8526852130889893, |
| "learning_rate": 1e-05, |
| "loss": 0.4943, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.000197, |
| "grad_norm": 2.4322919845581055, |
| "learning_rate": 1e-05, |
| "loss": 0.4985, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.000198, |
| "grad_norm": 3.18966007232666, |
| "learning_rate": 1e-05, |
| "loss": 0.4946, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.000199, |
| "grad_norm": 2.5495989322662354, |
| "learning_rate": 1e-05, |
| "loss": 0.4962, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.0002, |
| "grad_norm": 3.0627942085266113, |
| "learning_rate": 1e-05, |
| "loss": 0.4876, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.0002, |
| "eval_loss": 0.46728515625, |
| "eval_runtime": 111.4705, |
| "eval_samples_per_second": 448.549, |
| "eval_steps_per_second": 28.034, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.000201, |
| "grad_norm": 2.799205780029297, |
| "learning_rate": 1e-05, |
| "loss": 0.5117, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.000202, |
| "grad_norm": 2.509645700454712, |
| "learning_rate": 1e-05, |
| "loss": 0.4903, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.000203, |
| "grad_norm": 3.120729684829712, |
| "learning_rate": 1e-05, |
| "loss": 0.5027, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.000204, |
| "grad_norm": 2.390143394470215, |
| "learning_rate": 1e-05, |
| "loss": 0.4989, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.000205, |
| "grad_norm": 2.5487399101257324, |
| "learning_rate": 1e-05, |
| "loss": 0.4945, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.000206, |
| "grad_norm": 2.9931600093841553, |
| "learning_rate": 1e-05, |
| "loss": 0.5048, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.000207, |
| "grad_norm": 2.5760886669158936, |
| "learning_rate": 1e-05, |
| "loss": 0.4923, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.000208, |
| "grad_norm": 3.149047613143921, |
| "learning_rate": 1e-05, |
| "loss": 0.4937, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.000209, |
| "grad_norm": 2.982687473297119, |
| "learning_rate": 1e-05, |
| "loss": 0.4901, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.00021, |
| "grad_norm": 2.6681571006774902, |
| "learning_rate": 1e-05, |
| "loss": 0.4938, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.000211, |
| "grad_norm": 2.9358367919921875, |
| "learning_rate": 1e-05, |
| "loss": 0.4962, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.000212, |
| "grad_norm": 2.8525304794311523, |
| "learning_rate": 1e-05, |
| "loss": 0.4998, |
| "step": 21200 |
| }, |
| { |
| "epoch": 0.000213, |
| "grad_norm": 2.7039895057678223, |
| "learning_rate": 1e-05, |
| "loss": 0.4876, |
| "step": 21300 |
| }, |
| { |
| "epoch": 0.000214, |
| "grad_norm": 2.591728448867798, |
| "learning_rate": 1e-05, |
| "loss": 0.4895, |
| "step": 21400 |
| }, |
| { |
| "epoch": 0.000215, |
| "grad_norm": 4.136421203613281, |
| "learning_rate": 1e-05, |
| "loss": 0.4892, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.000216, |
| "grad_norm": 2.715740919113159, |
| "learning_rate": 1e-05, |
| "loss": 0.4966, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.000217, |
| "grad_norm": 2.804382562637329, |
| "learning_rate": 1e-05, |
| "loss": 0.5002, |
| "step": 21700 |
| }, |
| { |
| "epoch": 0.000218, |
| "grad_norm": 2.568103551864624, |
| "learning_rate": 1e-05, |
| "loss": 0.4925, |
| "step": 21800 |
| }, |
| { |
| "epoch": 0.000219, |
| "grad_norm": 2.671935796737671, |
| "learning_rate": 1e-05, |
| "loss": 0.4855, |
| "step": 21900 |
| }, |
| { |
| "epoch": 0.00022, |
| "grad_norm": 3.6780941486358643, |
| "learning_rate": 1e-05, |
| "loss": 0.4778, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.000221, |
| "grad_norm": 2.6798088550567627, |
| "learning_rate": 1e-05, |
| "loss": 0.4893, |
| "step": 22100 |
| }, |
| { |
| "epoch": 0.000222, |
| "grad_norm": 2.930389642715454, |
| "learning_rate": 1e-05, |
| "loss": 0.4904, |
| "step": 22200 |
| }, |
| { |
| "epoch": 0.000223, |
| "grad_norm": 2.2273404598236084, |
| "learning_rate": 1e-05, |
| "loss": 0.4864, |
| "step": 22300 |
| }, |
| { |
| "epoch": 0.000224, |
| "grad_norm": 2.7305030822753906, |
| "learning_rate": 1e-05, |
| "loss": 0.4962, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.000225, |
| "grad_norm": 2.6182594299316406, |
| "learning_rate": 1e-05, |
| "loss": 0.4843, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.000226, |
| "grad_norm": 2.7195889949798584, |
| "learning_rate": 1e-05, |
| "loss": 0.4848, |
| "step": 22600 |
| }, |
| { |
| "epoch": 0.000227, |
| "grad_norm": 2.842867136001587, |
| "learning_rate": 1e-05, |
| "loss": 0.4868, |
| "step": 22700 |
| }, |
| { |
| "epoch": 0.000228, |
| "grad_norm": 2.6133205890655518, |
| "learning_rate": 1e-05, |
| "loss": 0.4854, |
| "step": 22800 |
| }, |
| { |
| "epoch": 0.000229, |
| "grad_norm": 2.9739954471588135, |
| "learning_rate": 1e-05, |
| "loss": 0.4799, |
| "step": 22900 |
| }, |
| { |
| "epoch": 0.00023, |
| "grad_norm": 2.704303503036499, |
| "learning_rate": 1e-05, |
| "loss": 0.4787, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.000231, |
| "grad_norm": 2.6430766582489014, |
| "learning_rate": 1e-05, |
| "loss": 0.4838, |
| "step": 23100 |
| }, |
| { |
| "epoch": 0.000232, |
| "grad_norm": 2.57578444480896, |
| "learning_rate": 1e-05, |
| "loss": 0.4818, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.000233, |
| "grad_norm": 2.553027868270874, |
| "learning_rate": 1e-05, |
| "loss": 0.4878, |
| "step": 23300 |
| }, |
| { |
| "epoch": 0.000234, |
| "grad_norm": 2.853264331817627, |
| "learning_rate": 1e-05, |
| "loss": 0.4841, |
| "step": 23400 |
| }, |
| { |
| "epoch": 0.000235, |
| "grad_norm": 2.591419219970703, |
| "learning_rate": 1e-05, |
| "loss": 0.4818, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.000236, |
| "grad_norm": 2.8946421146392822, |
| "learning_rate": 1e-05, |
| "loss": 0.4808, |
| "step": 23600 |
| }, |
| { |
| "epoch": 0.000237, |
| "grad_norm": 2.9158196449279785, |
| "learning_rate": 1e-05, |
| "loss": 0.4755, |
| "step": 23700 |
| }, |
| { |
| "epoch": 0.000238, |
| "grad_norm": 2.578831195831299, |
| "learning_rate": 1e-05, |
| "loss": 0.4759, |
| "step": 23800 |
| }, |
| { |
| "epoch": 0.000239, |
| "grad_norm": 2.6290273666381836, |
| "learning_rate": 1e-05, |
| "loss": 0.4858, |
| "step": 23900 |
| }, |
| { |
| "epoch": 0.00024, |
| "grad_norm": 2.525026321411133, |
| "learning_rate": 1e-05, |
| "loss": 0.4913, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.000241, |
| "grad_norm": 4.174901008605957, |
| "learning_rate": 1e-05, |
| "loss": 0.474, |
| "step": 24100 |
| }, |
| { |
| "epoch": 0.000242, |
| "grad_norm": 2.6417720317840576, |
| "learning_rate": 1e-05, |
| "loss": 0.4831, |
| "step": 24200 |
| }, |
| { |
| "epoch": 0.000243, |
| "grad_norm": 2.4943110942840576, |
| "learning_rate": 1e-05, |
| "loss": 0.4763, |
| "step": 24300 |
| }, |
| { |
| "epoch": 0.000244, |
| "grad_norm": 2.609255075454712, |
| "learning_rate": 1e-05, |
| "loss": 0.4868, |
| "step": 24400 |
| }, |
| { |
| "epoch": 0.000245, |
| "grad_norm": 3.1849722862243652, |
| "learning_rate": 1e-05, |
| "loss": 0.48, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.000246, |
| "grad_norm": 2.6401076316833496, |
| "learning_rate": 1e-05, |
| "loss": 0.4873, |
| "step": 24600 |
| }, |
| { |
| "epoch": 0.000247, |
| "grad_norm": 2.936086416244507, |
| "learning_rate": 1e-05, |
| "loss": 0.4769, |
| "step": 24700 |
| }, |
| { |
| "epoch": 0.000248, |
| "grad_norm": 2.6310338973999023, |
| "learning_rate": 1e-05, |
| "loss": 0.4794, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.000249, |
| "grad_norm": 2.3982949256896973, |
| "learning_rate": 1e-05, |
| "loss": 0.4818, |
| "step": 24900 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 2.8052144050598145, |
| "learning_rate": 1e-05, |
| "loss": 0.4674, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.000251, |
| "grad_norm": 3.136012315750122, |
| "learning_rate": 1e-05, |
| "loss": 0.4739, |
| "step": 25100 |
| }, |
| { |
| "epoch": 0.000252, |
| "grad_norm": 2.721803665161133, |
| "learning_rate": 1e-05, |
| "loss": 0.4653, |
| "step": 25200 |
| }, |
| { |
| "epoch": 0.000253, |
| "grad_norm": 2.9835872650146484, |
| "learning_rate": 1e-05, |
| "loss": 0.4752, |
| "step": 25300 |
| }, |
| { |
| "epoch": 0.000254, |
| "grad_norm": 2.4551830291748047, |
| "learning_rate": 1e-05, |
| "loss": 0.4732, |
| "step": 25400 |
| }, |
| { |
| "epoch": 0.000255, |
| "grad_norm": 3.3453078269958496, |
| "learning_rate": 1e-05, |
| "loss": 0.4766, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.000256, |
| "grad_norm": 2.7177910804748535, |
| "learning_rate": 1e-05, |
| "loss": 0.4761, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.000257, |
| "grad_norm": 2.9529471397399902, |
| "learning_rate": 1e-05, |
| "loss": 0.4816, |
| "step": 25700 |
| }, |
| { |
| "epoch": 0.000258, |
| "grad_norm": 2.5339162349700928, |
| "learning_rate": 1e-05, |
| "loss": 0.4719, |
| "step": 25800 |
| }, |
| { |
| "epoch": 0.000259, |
| "grad_norm": 2.5781121253967285, |
| "learning_rate": 1e-05, |
| "loss": 0.4742, |
| "step": 25900 |
| }, |
| { |
| "epoch": 0.00026, |
| "grad_norm": 2.7583415508270264, |
| "learning_rate": 1e-05, |
| "loss": 0.478, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.000261, |
| "grad_norm": 2.307037591934204, |
| "learning_rate": 1e-05, |
| "loss": 0.4767, |
| "step": 26100 |
| }, |
| { |
| "epoch": 0.000262, |
| "grad_norm": 2.9219844341278076, |
| "learning_rate": 1e-05, |
| "loss": 0.4783, |
| "step": 26200 |
| }, |
| { |
| "epoch": 0.000263, |
| "grad_norm": 2.4626011848449707, |
| "learning_rate": 1e-05, |
| "loss": 0.4806, |
| "step": 26300 |
| }, |
| { |
| "epoch": 0.000264, |
| "grad_norm": 2.7708845138549805, |
| "learning_rate": 1e-05, |
| "loss": 0.4717, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.000265, |
| "grad_norm": 2.806086301803589, |
| "learning_rate": 1e-05, |
| "loss": 0.4641, |
| "step": 26500 |
| }, |
| { |
| "epoch": 0.000266, |
| "grad_norm": 2.395228385925293, |
| "learning_rate": 1e-05, |
| "loss": 0.4769, |
| "step": 26600 |
| }, |
| { |
| "epoch": 0.000267, |
| "grad_norm": 2.482196807861328, |
| "learning_rate": 1e-05, |
| "loss": 0.4637, |
| "step": 26700 |
| }, |
| { |
| "epoch": 0.000268, |
| "grad_norm": 2.3239686489105225, |
| "learning_rate": 1e-05, |
| "loss": 0.4728, |
| "step": 26800 |
| }, |
| { |
| "epoch": 0.000269, |
| "grad_norm": 2.8760108947753906, |
| "learning_rate": 1e-05, |
| "loss": 0.4683, |
| "step": 26900 |
| }, |
| { |
| "epoch": 0.00027, |
| "grad_norm": 3.0095269680023193, |
| "learning_rate": 1e-05, |
| "loss": 0.4641, |
| "step": 27000 |
| }, |
| { |
| "epoch": 0.000271, |
| "grad_norm": 2.7603840827941895, |
| "learning_rate": 1e-05, |
| "loss": 0.4824, |
| "step": 27100 |
| }, |
| { |
| "epoch": 0.000272, |
| "grad_norm": 2.6380269527435303, |
| "learning_rate": 1e-05, |
| "loss": 0.4626, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.000273, |
| "grad_norm": 3.095323324203491, |
| "learning_rate": 1e-05, |
| "loss": 0.4671, |
| "step": 27300 |
| }, |
| { |
| "epoch": 0.000274, |
| "grad_norm": 2.6990623474121094, |
| "learning_rate": 1e-05, |
| "loss": 0.4766, |
| "step": 27400 |
| }, |
| { |
| "epoch": 0.000275, |
| "grad_norm": 2.502337694168091, |
| "learning_rate": 1e-05, |
| "loss": 0.4608, |
| "step": 27500 |
| }, |
| { |
| "epoch": 0.000276, |
| "grad_norm": 2.3775081634521484, |
| "learning_rate": 1e-05, |
| "loss": 0.4749, |
| "step": 27600 |
| }, |
| { |
| "epoch": 0.000277, |
| "grad_norm": 2.7666544914245605, |
| "learning_rate": 1e-05, |
| "loss": 0.465, |
| "step": 27700 |
| }, |
| { |
| "epoch": 0.000278, |
| "grad_norm": 2.4668657779693604, |
| "learning_rate": 1e-05, |
| "loss": 0.4605, |
| "step": 27800 |
| }, |
| { |
| "epoch": 0.000279, |
| "grad_norm": 3.91645884513855, |
| "learning_rate": 1e-05, |
| "loss": 0.4684, |
| "step": 27900 |
| }, |
| { |
| "epoch": 0.00028, |
| "grad_norm": 2.781068801879883, |
| "learning_rate": 1e-05, |
| "loss": 0.4612, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.000281, |
| "grad_norm": 2.4313833713531494, |
| "learning_rate": 1e-05, |
| "loss": 0.4661, |
| "step": 28100 |
| }, |
| { |
| "epoch": 0.000282, |
| "grad_norm": 2.236158847808838, |
| "learning_rate": 1e-05, |
| "loss": 0.4728, |
| "step": 28200 |
| }, |
| { |
| "epoch": 0.000283, |
| "grad_norm": 2.7676749229431152, |
| "learning_rate": 1e-05, |
| "loss": 0.4667, |
| "step": 28300 |
| }, |
| { |
| "epoch": 0.000284, |
| "grad_norm": 2.1664578914642334, |
| "learning_rate": 1e-05, |
| "loss": 0.4632, |
| "step": 28400 |
| }, |
| { |
| "epoch": 0.000285, |
| "grad_norm": 2.4924814701080322, |
| "learning_rate": 1e-05, |
| "loss": 0.4699, |
| "step": 28500 |
| }, |
| { |
| "epoch": 0.000286, |
| "grad_norm": 3.364351749420166, |
| "learning_rate": 1e-05, |
| "loss": 0.4563, |
| "step": 28600 |
| }, |
| { |
| "epoch": 0.000287, |
| "grad_norm": 2.29882550239563, |
| "learning_rate": 1e-05, |
| "loss": 0.4689, |
| "step": 28700 |
| }, |
| { |
| "epoch": 0.000288, |
| "grad_norm": 2.626985549926758, |
| "learning_rate": 1e-05, |
| "loss": 0.463, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.000289, |
| "grad_norm": 2.7008321285247803, |
| "learning_rate": 1e-05, |
| "loss": 0.4601, |
| "step": 28900 |
| }, |
| { |
| "epoch": 0.00029, |
| "grad_norm": 2.5816690921783447, |
| "learning_rate": 1e-05, |
| "loss": 0.4597, |
| "step": 29000 |
| }, |
| { |
| "epoch": 0.000291, |
| "grad_norm": 2.4688684940338135, |
| "learning_rate": 1e-05, |
| "loss": 0.4599, |
| "step": 29100 |
| }, |
| { |
| "epoch": 0.000292, |
| "grad_norm": 2.839632749557495, |
| "learning_rate": 1e-05, |
| "loss": 0.4677, |
| "step": 29200 |
| }, |
| { |
| "epoch": 0.000293, |
| "grad_norm": 3.800483465194702, |
| "learning_rate": 1e-05, |
| "loss": 0.4565, |
| "step": 29300 |
| }, |
| { |
| "epoch": 0.000294, |
| "grad_norm": 4.7663726806640625, |
| "learning_rate": 1e-05, |
| "loss": 0.4522, |
| "step": 29400 |
| }, |
| { |
| "epoch": 0.000295, |
| "grad_norm": 2.4576992988586426, |
| "learning_rate": 1e-05, |
| "loss": 0.4638, |
| "step": 29500 |
| }, |
| { |
| "epoch": 0.000296, |
| "grad_norm": 2.6215529441833496, |
| "learning_rate": 1e-05, |
| "loss": 0.4522, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.000297, |
| "grad_norm": 2.35202693939209, |
| "learning_rate": 1e-05, |
| "loss": 0.452, |
| "step": 29700 |
| }, |
| { |
| "epoch": 0.000298, |
| "grad_norm": 2.1658172607421875, |
| "learning_rate": 1e-05, |
| "loss": 0.4638, |
| "step": 29800 |
| }, |
| { |
| "epoch": 0.000299, |
| "grad_norm": 2.6954879760742188, |
| "learning_rate": 1e-05, |
| "loss": 0.4533, |
| "step": 29900 |
| }, |
| { |
| "epoch": 0.0003, |
| "grad_norm": 2.465700626373291, |
| "learning_rate": 1e-05, |
| "loss": 0.4595, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.000301, |
| "grad_norm": 2.5099382400512695, |
| "learning_rate": 1e-05, |
| "loss": 0.464, |
| "step": 30100 |
| }, |
| { |
| "epoch": 0.000302, |
| "grad_norm": 2.632709264755249, |
| "learning_rate": 1e-05, |
| "loss": 0.4583, |
| "step": 30200 |
| }, |
| { |
| "epoch": 0.000303, |
| "grad_norm": 3.451253890991211, |
| "learning_rate": 1e-05, |
| "loss": 0.4641, |
| "step": 30300 |
| }, |
| { |
| "epoch": 0.000304, |
| "grad_norm": 2.5576229095458984, |
| "learning_rate": 1e-05, |
| "loss": 0.4643, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.000305, |
| "grad_norm": 2.63854718208313, |
| "learning_rate": 1e-05, |
| "loss": 0.4509, |
| "step": 30500 |
| }, |
| { |
| "epoch": 0.000306, |
| "grad_norm": 2.7660105228424072, |
| "learning_rate": 1e-05, |
| "loss": 0.4533, |
| "step": 30600 |
| }, |
| { |
| "epoch": 0.000307, |
| "grad_norm": 2.862382411956787, |
| "learning_rate": 1e-05, |
| "loss": 0.4536, |
| "step": 30700 |
| }, |
| { |
| "epoch": 0.000308, |
| "grad_norm": 2.6443052291870117, |
| "learning_rate": 1e-05, |
| "loss": 0.4485, |
| "step": 30800 |
| }, |
| { |
| "epoch": 0.000309, |
| "grad_norm": 2.525301456451416, |
| "learning_rate": 1e-05, |
| "loss": 0.4593, |
| "step": 30900 |
| }, |
| { |
| "epoch": 0.00031, |
| "grad_norm": 2.4334826469421387, |
| "learning_rate": 1e-05, |
| "loss": 0.4495, |
| "step": 31000 |
| }, |
| { |
| "epoch": 0.000311, |
| "grad_norm": 2.443531036376953, |
| "learning_rate": 1e-05, |
| "loss": 0.4563, |
| "step": 31100 |
| }, |
| { |
| "epoch": 0.000312, |
| "grad_norm": 2.3432857990264893, |
| "learning_rate": 1e-05, |
| "loss": 0.4587, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.000313, |
| "grad_norm": 2.470900058746338, |
| "learning_rate": 1e-05, |
| "loss": 0.4466, |
| "step": 31300 |
| }, |
| { |
| "epoch": 0.000314, |
| "grad_norm": 2.719302177429199, |
| "learning_rate": 1e-05, |
| "loss": 0.4633, |
| "step": 31400 |
| }, |
| { |
| "epoch": 0.000315, |
| "grad_norm": 2.7963156700134277, |
| "learning_rate": 1e-05, |
| "loss": 0.4536, |
| "step": 31500 |
| }, |
| { |
| "epoch": 0.000316, |
| "grad_norm": 2.750457763671875, |
| "learning_rate": 1e-05, |
| "loss": 0.4515, |
| "step": 31600 |
| }, |
| { |
| "epoch": 0.000317, |
| "grad_norm": 2.4325835704803467, |
| "learning_rate": 1e-05, |
| "loss": 0.4555, |
| "step": 31700 |
| }, |
| { |
| "epoch": 0.000318, |
| "grad_norm": 2.4605915546417236, |
| "learning_rate": 1e-05, |
| "loss": 0.4571, |
| "step": 31800 |
| }, |
| { |
| "epoch": 0.000319, |
| "grad_norm": 2.3610308170318604, |
| "learning_rate": 1e-05, |
| "loss": 0.455, |
| "step": 31900 |
| }, |
| { |
| "epoch": 0.00032, |
| "grad_norm": 2.4374446868896484, |
| "learning_rate": 1e-05, |
| "loss": 0.4577, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.000321, |
| "grad_norm": 2.618852138519287, |
| "learning_rate": 1e-05, |
| "loss": 0.4434, |
| "step": 32100 |
| }, |
| { |
| "epoch": 0.000322, |
| "grad_norm": 3.4312095642089844, |
| "learning_rate": 1e-05, |
| "loss": 0.448, |
| "step": 32200 |
| }, |
| { |
| "epoch": 0.000323, |
| "grad_norm": 3.933258056640625, |
| "learning_rate": 1e-05, |
| "loss": 0.4542, |
| "step": 32300 |
| }, |
| { |
| "epoch": 0.000324, |
| "grad_norm": 2.4201653003692627, |
| "learning_rate": 1e-05, |
| "loss": 0.4501, |
| "step": 32400 |
| }, |
| { |
| "epoch": 0.000325, |
| "grad_norm": 2.638230085372925, |
| "learning_rate": 1e-05, |
| "loss": 0.4587, |
| "step": 32500 |
| }, |
| { |
| "epoch": 0.000326, |
| "grad_norm": 2.310612201690674, |
| "learning_rate": 1e-05, |
| "loss": 0.4647, |
| "step": 32600 |
| }, |
| { |
| "epoch": 0.000327, |
| "grad_norm": 2.466186285018921, |
| "learning_rate": 1e-05, |
| "loss": 0.4462, |
| "step": 32700 |
| }, |
| { |
| "epoch": 0.000328, |
| "grad_norm": 2.8925211429595947, |
| "learning_rate": 1e-05, |
| "loss": 0.4556, |
| "step": 32800 |
| }, |
| { |
| "epoch": 0.000329, |
| "grad_norm": 2.5643179416656494, |
| "learning_rate": 1e-05, |
| "loss": 0.4479, |
| "step": 32900 |
| }, |
| { |
| "epoch": 0.00033, |
| "grad_norm": 2.372391939163208, |
| "learning_rate": 1e-05, |
| "loss": 0.4583, |
| "step": 33000 |
| }, |
| { |
| "epoch": 0.000331, |
| "grad_norm": 2.6954376697540283, |
| "learning_rate": 1e-05, |
| "loss": 0.4514, |
| "step": 33100 |
| }, |
| { |
| "epoch": 0.000332, |
| "grad_norm": 2.380615234375, |
| "learning_rate": 1e-05, |
| "loss": 0.4468, |
| "step": 33200 |
| }, |
| { |
| "epoch": 0.000333, |
| "grad_norm": 2.5895583629608154, |
| "learning_rate": 1e-05, |
| "loss": 0.4503, |
| "step": 33300 |
| }, |
| { |
| "epoch": 0.000334, |
| "grad_norm": 3.745288133621216, |
| "learning_rate": 1e-05, |
| "loss": 0.4453, |
| "step": 33400 |
| }, |
| { |
| "epoch": 0.000335, |
| "grad_norm": 2.346338987350464, |
| "learning_rate": 1e-05, |
| "loss": 0.4605, |
| "step": 33500 |
| }, |
| { |
| "epoch": 0.000336, |
| "grad_norm": 3.660322666168213, |
| "learning_rate": 1e-05, |
| "loss": 0.4518, |
| "step": 33600 |
| }, |
| { |
| "epoch": 0.000337, |
| "grad_norm": 2.2425618171691895, |
| "learning_rate": 1e-05, |
| "loss": 0.4433, |
| "step": 33700 |
| }, |
| { |
| "epoch": 0.000338, |
| "grad_norm": 2.385923385620117, |
| "learning_rate": 1e-05, |
| "loss": 0.4452, |
| "step": 33800 |
| }, |
| { |
| "epoch": 0.000339, |
| "grad_norm": 2.3236701488494873, |
| "learning_rate": 1e-05, |
| "loss": 0.4428, |
| "step": 33900 |
| }, |
| { |
| "epoch": 0.00034, |
| "grad_norm": 2.2188169956207275, |
| "learning_rate": 1e-05, |
| "loss": 0.4536, |
| "step": 34000 |
| }, |
| { |
| "epoch": 0.000341, |
| "grad_norm": 2.760098934173584, |
| "learning_rate": 1e-05, |
| "loss": 0.4487, |
| "step": 34100 |
| }, |
| { |
| "epoch": 0.000342, |
| "grad_norm": 2.2766711711883545, |
| "learning_rate": 1e-05, |
| "loss": 0.4545, |
| "step": 34200 |
| }, |
| { |
| "epoch": 0.000343, |
| "grad_norm": 2.6107327938079834, |
| "learning_rate": 1e-05, |
| "loss": 0.4503, |
| "step": 34300 |
| }, |
| { |
| "epoch": 0.000344, |
| "grad_norm": 2.5845329761505127, |
| "learning_rate": 1e-05, |
| "loss": 0.4503, |
| "step": 34400 |
| }, |
| { |
| "epoch": 0.000345, |
| "grad_norm": 2.1708152294158936, |
| "learning_rate": 1e-05, |
| "loss": 0.4474, |
| "step": 34500 |
| }, |
| { |
| "epoch": 0.000346, |
| "grad_norm": 2.408508062362671, |
| "learning_rate": 1e-05, |
| "loss": 0.4504, |
| "step": 34600 |
| }, |
| { |
| "epoch": 0.000347, |
| "grad_norm": 2.5341970920562744, |
| "learning_rate": 1e-05, |
| "loss": 0.4531, |
| "step": 34700 |
| }, |
| { |
| "epoch": 0.000348, |
| "grad_norm": 2.5573482513427734, |
| "learning_rate": 1e-05, |
| "loss": 0.4472, |
| "step": 34800 |
| }, |
| { |
| "epoch": 0.000349, |
| "grad_norm": 2.318730115890503, |
| "learning_rate": 1e-05, |
| "loss": 0.4462, |
| "step": 34900 |
| }, |
| { |
| "epoch": 0.00035, |
| "grad_norm": 2.341620922088623, |
| "learning_rate": 1e-05, |
| "loss": 0.4423, |
| "step": 35000 |
| }, |
| { |
| "epoch": 0.000351, |
| "grad_norm": 2.1792995929718018, |
| "learning_rate": 1e-05, |
| "loss": 0.4519, |
| "step": 35100 |
| }, |
| { |
| "epoch": 0.000352, |
| "grad_norm": 2.4928019046783447, |
| "learning_rate": 1e-05, |
| "loss": 0.4519, |
| "step": 35200 |
| }, |
| { |
| "epoch": 0.000353, |
| "grad_norm": 2.512012481689453, |
| "learning_rate": 1e-05, |
| "loss": 0.4578, |
| "step": 35300 |
| }, |
| { |
| "epoch": 0.000354, |
| "grad_norm": 2.510221242904663, |
| "learning_rate": 1e-05, |
| "loss": 0.4506, |
| "step": 35400 |
| }, |
| { |
| "epoch": 0.000355, |
| "grad_norm": 2.637925624847412, |
| "learning_rate": 1e-05, |
| "loss": 0.4454, |
| "step": 35500 |
| }, |
| { |
| "epoch": 0.000356, |
| "grad_norm": 2.6336724758148193, |
| "learning_rate": 1e-05, |
| "loss": 0.4477, |
| "step": 35600 |
| }, |
| { |
| "epoch": 0.000357, |
| "grad_norm": 2.6396801471710205, |
| "learning_rate": 1e-05, |
| "loss": 0.4424, |
| "step": 35700 |
| }, |
| { |
| "epoch": 0.000358, |
| "grad_norm": 2.2982890605926514, |
| "learning_rate": 1e-05, |
| "loss": 0.4434, |
| "step": 35800 |
| }, |
| { |
| "epoch": 0.000359, |
| "grad_norm": 2.617039680480957, |
| "learning_rate": 1e-05, |
| "loss": 0.443, |
| "step": 35900 |
| }, |
| { |
| "epoch": 0.00036, |
| "grad_norm": 2.3763229846954346, |
| "learning_rate": 1e-05, |
| "loss": 0.4462, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.000361, |
| "grad_norm": 2.4981770515441895, |
| "learning_rate": 1e-05, |
| "loss": 0.445, |
| "step": 36100 |
| }, |
| { |
| "epoch": 0.000362, |
| "grad_norm": 2.2644827365875244, |
| "learning_rate": 1e-05, |
| "loss": 0.4423, |
| "step": 36200 |
| }, |
| { |
| "epoch": 0.000363, |
| "grad_norm": 2.1267762184143066, |
| "learning_rate": 1e-05, |
| "loss": 0.4391, |
| "step": 36300 |
| }, |
| { |
| "epoch": 0.000364, |
| "grad_norm": 2.2303924560546875, |
| "learning_rate": 1e-05, |
| "loss": 0.445, |
| "step": 36400 |
| }, |
| { |
| "epoch": 0.000365, |
| "grad_norm": 2.383427619934082, |
| "learning_rate": 1e-05, |
| "loss": 0.4391, |
| "step": 36500 |
| }, |
| { |
| "epoch": 0.000366, |
| "grad_norm": 2.600222587585449, |
| "learning_rate": 1e-05, |
| "loss": 0.446, |
| "step": 36600 |
| }, |
| { |
| "epoch": 0.000367, |
| "grad_norm": 2.557803153991699, |
| "learning_rate": 1e-05, |
| "loss": 0.4519, |
| "step": 36700 |
| }, |
| { |
| "epoch": 0.000368, |
| "grad_norm": 2.5691545009613037, |
| "learning_rate": 1e-05, |
| "loss": 0.4385, |
| "step": 36800 |
| }, |
| { |
| "epoch": 0.000369, |
| "grad_norm": 2.3497157096862793, |
| "learning_rate": 1e-05, |
| "loss": 0.4383, |
| "step": 36900 |
| }, |
| { |
| "epoch": 0.00037, |
| "grad_norm": 2.87781023979187, |
| "learning_rate": 1e-05, |
| "loss": 0.4409, |
| "step": 37000 |
| }, |
| { |
| "epoch": 0.000371, |
| "grad_norm": 2.447251558303833, |
| "learning_rate": 1e-05, |
| "loss": 0.4485, |
| "step": 37100 |
| }, |
| { |
| "epoch": 0.000372, |
| "grad_norm": 2.536022424697876, |
| "learning_rate": 1e-05, |
| "loss": 0.4406, |
| "step": 37200 |
| }, |
| { |
| "epoch": 0.000373, |
| "grad_norm": 2.4963696002960205, |
| "learning_rate": 1e-05, |
| "loss": 0.4311, |
| "step": 37300 |
| }, |
| { |
| "epoch": 0.000374, |
| "grad_norm": 2.701169967651367, |
| "learning_rate": 1e-05, |
| "loss": 0.4385, |
| "step": 37400 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 2.514227867126465, |
| "learning_rate": 1e-05, |
| "loss": 0.4357, |
| "step": 37500 |
| }, |
| { |
| "epoch": 0.000376, |
| "grad_norm": 2.5205540657043457, |
| "learning_rate": 1e-05, |
| "loss": 0.4447, |
| "step": 37600 |
| }, |
| { |
| "epoch": 0.000377, |
| "grad_norm": 2.7338292598724365, |
| "learning_rate": 1e-05, |
| "loss": 0.4452, |
| "step": 37700 |
| }, |
| { |
| "epoch": 0.000378, |
| "grad_norm": 2.3861348628997803, |
| "learning_rate": 1e-05, |
| "loss": 0.4377, |
| "step": 37800 |
| }, |
| { |
| "epoch": 0.000379, |
| "grad_norm": 2.41011381149292, |
| "learning_rate": 1e-05, |
| "loss": 0.4405, |
| "step": 37900 |
| }, |
| { |
| "epoch": 0.00038, |
| "grad_norm": 2.449092388153076, |
| "learning_rate": 1e-05, |
| "loss": 0.4451, |
| "step": 38000 |
| }, |
| { |
| "epoch": 0.000381, |
| "grad_norm": 2.571415662765503, |
| "learning_rate": 1e-05, |
| "loss": 0.439, |
| "step": 38100 |
| }, |
| { |
| "epoch": 0.000382, |
| "grad_norm": 2.169980764389038, |
| "learning_rate": 1e-05, |
| "loss": 0.4384, |
| "step": 38200 |
| }, |
| { |
| "epoch": 0.000383, |
| "grad_norm": 3.838111162185669, |
| "learning_rate": 1e-05, |
| "loss": 0.4322, |
| "step": 38300 |
| }, |
| { |
| "epoch": 0.000384, |
| "grad_norm": 3.1614363193511963, |
| "learning_rate": 1e-05, |
| "loss": 0.4426, |
| "step": 38400 |
| }, |
| { |
| "epoch": 0.000385, |
| "grad_norm": 1.975185513496399, |
| "learning_rate": 1e-05, |
| "loss": 0.4375, |
| "step": 38500 |
| }, |
| { |
| "epoch": 0.000386, |
| "grad_norm": 2.3980181217193604, |
| "learning_rate": 1e-05, |
| "loss": 0.4411, |
| "step": 38600 |
| }, |
| { |
| "epoch": 0.000387, |
| "grad_norm": 2.372525453567505, |
| "learning_rate": 1e-05, |
| "loss": 0.4362, |
| "step": 38700 |
| }, |
| { |
| "epoch": 0.000388, |
| "grad_norm": 2.3161978721618652, |
| "learning_rate": 1e-05, |
| "loss": 0.4405, |
| "step": 38800 |
| }, |
| { |
| "epoch": 0.000389, |
| "grad_norm": 2.4494197368621826, |
| "learning_rate": 1e-05, |
| "loss": 0.4374, |
| "step": 38900 |
| }, |
| { |
| "epoch": 0.00039, |
| "grad_norm": 2.3522799015045166, |
| "learning_rate": 1e-05, |
| "loss": 0.4432, |
| "step": 39000 |
| }, |
| { |
| "epoch": 0.000391, |
| "grad_norm": 2.3715996742248535, |
| "learning_rate": 1e-05, |
| "loss": 0.4237, |
| "step": 39100 |
| }, |
| { |
| "epoch": 0.000392, |
| "grad_norm": 2.429914951324463, |
| "learning_rate": 1e-05, |
| "loss": 0.4332, |
| "step": 39200 |
| }, |
| { |
| "epoch": 0.000393, |
| "grad_norm": 2.4736123085021973, |
| "learning_rate": 1e-05, |
| "loss": 0.4345, |
| "step": 39300 |
| }, |
| { |
| "epoch": 0.000394, |
| "grad_norm": 2.033489942550659, |
| "learning_rate": 1e-05, |
| "loss": 0.4371, |
| "step": 39400 |
| }, |
| { |
| "epoch": 0.000395, |
| "grad_norm": 2.22459077835083, |
| "learning_rate": 1e-05, |
| "loss": 0.4336, |
| "step": 39500 |
| }, |
| { |
| "epoch": 0.000396, |
| "grad_norm": 2.475951910018921, |
| "learning_rate": 1e-05, |
| "loss": 0.4349, |
| "step": 39600 |
| }, |
| { |
| "epoch": 0.000397, |
| "grad_norm": 2.2297749519348145, |
| "learning_rate": 1e-05, |
| "loss": 0.4274, |
| "step": 39700 |
| }, |
| { |
| "epoch": 0.000398, |
| "grad_norm": 2.445439338684082, |
| "learning_rate": 1e-05, |
| "loss": 0.4366, |
| "step": 39800 |
| }, |
| { |
| "epoch": 0.000399, |
| "grad_norm": 2.4138917922973633, |
| "learning_rate": 1e-05, |
| "loss": 0.4351, |
| "step": 39900 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 2.551255226135254, |
| "learning_rate": 1e-05, |
| "loss": 0.4333, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.0004, |
| "eval_loss": 0.4091796875, |
| "eval_runtime": 110.8814, |
| "eval_samples_per_second": 450.932, |
| "eval_steps_per_second": 28.183, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.000401, |
| "grad_norm": 2.6333506107330322, |
| "learning_rate": 1e-05, |
| "loss": 0.4332, |
| "step": 40100 |
| }, |
| { |
| "epoch": 0.000402, |
| "grad_norm": 2.0419421195983887, |
| "learning_rate": 1e-05, |
| "loss": 0.4324, |
| "step": 40200 |
| }, |
| { |
| "epoch": 0.000403, |
| "grad_norm": 2.1599907875061035, |
| "learning_rate": 1e-05, |
| "loss": 0.4261, |
| "step": 40300 |
| }, |
| { |
| "epoch": 0.000404, |
| "grad_norm": 2.1608216762542725, |
| "learning_rate": 1e-05, |
| "loss": 0.4265, |
| "step": 40400 |
| }, |
| { |
| "epoch": 0.000405, |
| "grad_norm": 2.0742979049682617, |
| "learning_rate": 1e-05, |
| "loss": 0.4298, |
| "step": 40500 |
| }, |
| { |
| "epoch": 0.000406, |
| "grad_norm": 2.487959146499634, |
| "learning_rate": 1e-05, |
| "loss": 0.43, |
| "step": 40600 |
| }, |
| { |
| "epoch": 0.000407, |
| "grad_norm": 2.436591148376465, |
| "learning_rate": 1e-05, |
| "loss": 0.4356, |
| "step": 40700 |
| }, |
| { |
| "epoch": 0.000408, |
| "grad_norm": 2.2447760105133057, |
| "learning_rate": 1e-05, |
| "loss": 0.4269, |
| "step": 40800 |
| }, |
| { |
| "epoch": 0.000409, |
| "grad_norm": 2.1390585899353027, |
| "learning_rate": 1e-05, |
| "loss": 0.4268, |
| "step": 40900 |
| }, |
| { |
| "epoch": 0.00041, |
| "grad_norm": 2.390690326690674, |
| "learning_rate": 1e-05, |
| "loss": 0.4233, |
| "step": 41000 |
| }, |
| { |
| "epoch": 0.000411, |
| "grad_norm": 1.8873522281646729, |
| "learning_rate": 1e-05, |
| "loss": 0.4259, |
| "step": 41100 |
| }, |
| { |
| "epoch": 0.000412, |
| "grad_norm": 2.3923439979553223, |
| "learning_rate": 1e-05, |
| "loss": 0.4296, |
| "step": 41200 |
| }, |
| { |
| "epoch": 0.000413, |
| "grad_norm": 2.7505736351013184, |
| "learning_rate": 1e-05, |
| "loss": 0.4265, |
| "step": 41300 |
| }, |
| { |
| "epoch": 0.000414, |
| "grad_norm": 2.2666115760803223, |
| "learning_rate": 1e-05, |
| "loss": 0.4339, |
| "step": 41400 |
| }, |
| { |
| "epoch": 0.000415, |
| "grad_norm": 2.128662586212158, |
| "learning_rate": 1e-05, |
| "loss": 0.4279, |
| "step": 41500 |
| }, |
| { |
| "epoch": 0.000416, |
| "grad_norm": 2.4806056022644043, |
| "learning_rate": 1e-05, |
| "loss": 0.4335, |
| "step": 41600 |
| }, |
| { |
| "epoch": 0.000417, |
| "grad_norm": 2.5022566318511963, |
| "learning_rate": 1e-05, |
| "loss": 0.4321, |
| "step": 41700 |
| }, |
| { |
| "epoch": 0.000418, |
| "grad_norm": 2.682896137237549, |
| "learning_rate": 1e-05, |
| "loss": 0.4373, |
| "step": 41800 |
| }, |
| { |
| "epoch": 0.000419, |
| "grad_norm": 2.7449374198913574, |
| "learning_rate": 1e-05, |
| "loss": 0.4362, |
| "step": 41900 |
| }, |
| { |
| "epoch": 0.00042, |
| "grad_norm": 2.0306496620178223, |
| "learning_rate": 1e-05, |
| "loss": 0.4264, |
| "step": 42000 |
| }, |
| { |
| "epoch": 0.000421, |
| "grad_norm": 2.0226821899414062, |
| "learning_rate": 1e-05, |
| "loss": 0.4301, |
| "step": 42100 |
| }, |
| { |
| "epoch": 0.000422, |
| "grad_norm": 2.372490406036377, |
| "learning_rate": 1e-05, |
| "loss": 0.429, |
| "step": 42200 |
| }, |
| { |
| "epoch": 0.000423, |
| "grad_norm": 2.4113259315490723, |
| "learning_rate": 1e-05, |
| "loss": 0.4347, |
| "step": 42300 |
| }, |
| { |
| "epoch": 0.000424, |
| "grad_norm": 2.3437299728393555, |
| "learning_rate": 1e-05, |
| "loss": 0.4204, |
| "step": 42400 |
| }, |
| { |
| "epoch": 0.000425, |
| "grad_norm": 2.460440158843994, |
| "learning_rate": 1e-05, |
| "loss": 0.4195, |
| "step": 42500 |
| }, |
| { |
| "epoch": 0.000426, |
| "grad_norm": 2.3652024269104004, |
| "learning_rate": 1e-05, |
| "loss": 0.418, |
| "step": 42600 |
| }, |
| { |
| "epoch": 0.000427, |
| "grad_norm": 2.48496150970459, |
| "learning_rate": 1e-05, |
| "loss": 0.4263, |
| "step": 42700 |
| }, |
| { |
| "epoch": 0.000428, |
| "grad_norm": 2.4841647148132324, |
| "learning_rate": 1e-05, |
| "loss": 0.4259, |
| "step": 42800 |
| }, |
| { |
| "epoch": 0.000429, |
| "grad_norm": 2.573284149169922, |
| "learning_rate": 1e-05, |
| "loss": 0.4302, |
| "step": 42900 |
| }, |
| { |
| "epoch": 0.00043, |
| "grad_norm": 1.9319133758544922, |
| "learning_rate": 1e-05, |
| "loss": 0.4227, |
| "step": 43000 |
| }, |
| { |
| "epoch": 0.000431, |
| "grad_norm": 2.4806172847747803, |
| "learning_rate": 1e-05, |
| "loss": 0.4325, |
| "step": 43100 |
| }, |
| { |
| "epoch": 0.000432, |
| "grad_norm": 2.4880504608154297, |
| "learning_rate": 1e-05, |
| "loss": 0.4229, |
| "step": 43200 |
| }, |
| { |
| "epoch": 0.000433, |
| "grad_norm": 2.395817995071411, |
| "learning_rate": 1e-05, |
| "loss": 0.43, |
| "step": 43300 |
| }, |
| { |
| "epoch": 0.000434, |
| "grad_norm": 2.6356828212738037, |
| "learning_rate": 1e-05, |
| "loss": 0.4207, |
| "step": 43400 |
| }, |
| { |
| "epoch": 0.000435, |
| "grad_norm": 2.1054494380950928, |
| "learning_rate": 1e-05, |
| "loss": 0.4284, |
| "step": 43500 |
| }, |
| { |
| "epoch": 0.000436, |
| "grad_norm": 2.3987913131713867, |
| "learning_rate": 1e-05, |
| "loss": 0.4267, |
| "step": 43600 |
| }, |
| { |
| "epoch": 0.000437, |
| "grad_norm": 2.606956720352173, |
| "learning_rate": 1e-05, |
| "loss": 0.4203, |
| "step": 43700 |
| }, |
| { |
| "epoch": 0.000438, |
| "grad_norm": 2.5072622299194336, |
| "learning_rate": 1e-05, |
| "loss": 0.4276, |
| "step": 43800 |
| }, |
| { |
| "epoch": 0.000439, |
| "grad_norm": 2.300851345062256, |
| "learning_rate": 1e-05, |
| "loss": 0.427, |
| "step": 43900 |
| }, |
| { |
| "epoch": 0.00044, |
| "grad_norm": 2.514756441116333, |
| "learning_rate": 1e-05, |
| "loss": 0.4233, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.000441, |
| "grad_norm": 2.5738296508789062, |
| "learning_rate": 1e-05, |
| "loss": 0.4255, |
| "step": 44100 |
| }, |
| { |
| "epoch": 0.000442, |
| "grad_norm": 2.324410915374756, |
| "learning_rate": 1e-05, |
| "loss": 0.4225, |
| "step": 44200 |
| }, |
| { |
| "epoch": 0.000443, |
| "grad_norm": 2.1797661781311035, |
| "learning_rate": 1e-05, |
| "loss": 0.4204, |
| "step": 44300 |
| }, |
| { |
| "epoch": 0.000444, |
| "grad_norm": 2.667961359024048, |
| "learning_rate": 1e-05, |
| "loss": 0.4298, |
| "step": 44400 |
| }, |
| { |
| "epoch": 0.000445, |
| "grad_norm": 2.6222057342529297, |
| "learning_rate": 1e-05, |
| "loss": 0.4159, |
| "step": 44500 |
| }, |
| { |
| "epoch": 0.000446, |
| "grad_norm": 2.4339888095855713, |
| "learning_rate": 1e-05, |
| "loss": 0.4298, |
| "step": 44600 |
| }, |
| { |
| "epoch": 0.000447, |
| "grad_norm": 2.394127368927002, |
| "learning_rate": 1e-05, |
| "loss": 0.4299, |
| "step": 44700 |
| }, |
| { |
| "epoch": 0.000448, |
| "grad_norm": 2.3612658977508545, |
| "learning_rate": 1e-05, |
| "loss": 0.4285, |
| "step": 44800 |
| }, |
| { |
| "epoch": 0.000449, |
| "grad_norm": 2.4719297885894775, |
| "learning_rate": 1e-05, |
| "loss": 0.4218, |
| "step": 44900 |
| }, |
| { |
| "epoch": 0.00045, |
| "grad_norm": 2.1661250591278076, |
| "learning_rate": 1e-05, |
| "loss": 0.4213, |
| "step": 45000 |
| }, |
| { |
| "epoch": 0.000451, |
| "grad_norm": 2.559985637664795, |
| "learning_rate": 1e-05, |
| "loss": 0.4289, |
| "step": 45100 |
| }, |
| { |
| "epoch": 0.000452, |
| "grad_norm": 2.452289342880249, |
| "learning_rate": 1e-05, |
| "loss": 0.4257, |
| "step": 45200 |
| }, |
| { |
| "epoch": 0.000453, |
| "grad_norm": 2.0307326316833496, |
| "learning_rate": 1e-05, |
| "loss": 0.4229, |
| "step": 45300 |
| }, |
| { |
| "epoch": 0.000454, |
| "grad_norm": 2.489323377609253, |
| "learning_rate": 1e-05, |
| "loss": 0.4269, |
| "step": 45400 |
| }, |
| { |
| "epoch": 0.000455, |
| "grad_norm": 2.5684876441955566, |
| "learning_rate": 1e-05, |
| "loss": 0.4173, |
| "step": 45500 |
| }, |
| { |
| "epoch": 0.000456, |
| "grad_norm": 2.4012162685394287, |
| "learning_rate": 1e-05, |
| "loss": 0.4139, |
| "step": 45600 |
| }, |
| { |
| "epoch": 0.000457, |
| "grad_norm": 2.4833133220672607, |
| "learning_rate": 1e-05, |
| "loss": 0.4233, |
| "step": 45700 |
| }, |
| { |
| "epoch": 0.000458, |
| "grad_norm": 2.4413490295410156, |
| "learning_rate": 1e-05, |
| "loss": 0.4089, |
| "step": 45800 |
| }, |
| { |
| "epoch": 0.000459, |
| "grad_norm": 2.192959785461426, |
| "learning_rate": 1e-05, |
| "loss": 0.4156, |
| "step": 45900 |
| }, |
| { |
| "epoch": 0.00046, |
| "grad_norm": 2.3064184188842773, |
| "learning_rate": 1e-05, |
| "loss": 0.4187, |
| "step": 46000 |
| }, |
| { |
| "epoch": 0.000461, |
| "grad_norm": 2.4842922687530518, |
| "learning_rate": 1e-05, |
| "loss": 0.4131, |
| "step": 46100 |
| }, |
| { |
| "epoch": 0.000462, |
| "grad_norm": 2.074312925338745, |
| "learning_rate": 1e-05, |
| "loss": 0.4265, |
| "step": 46200 |
| }, |
| { |
| "epoch": 0.000463, |
| "grad_norm": 2.4513862133026123, |
| "learning_rate": 1e-05, |
| "loss": 0.4153, |
| "step": 46300 |
| }, |
| { |
| "epoch": 0.000464, |
| "grad_norm": 2.21403431892395, |
| "learning_rate": 1e-05, |
| "loss": 0.417, |
| "step": 46400 |
| }, |
| { |
| "epoch": 0.000465, |
| "grad_norm": 2.469252109527588, |
| "learning_rate": 1e-05, |
| "loss": 0.4181, |
| "step": 46500 |
| }, |
| { |
| "epoch": 0.000466, |
| "grad_norm": 2.3535096645355225, |
| "learning_rate": 1e-05, |
| "loss": 0.4159, |
| "step": 46600 |
| }, |
| { |
| "epoch": 0.000467, |
| "grad_norm": 2.8495819568634033, |
| "learning_rate": 1e-05, |
| "loss": 0.4118, |
| "step": 46700 |
| }, |
| { |
| "epoch": 0.000468, |
| "grad_norm": 2.2924575805664062, |
| "learning_rate": 1e-05, |
| "loss": 0.4221, |
| "step": 46800 |
| }, |
| { |
| "epoch": 0.000469, |
| "grad_norm": 2.1148035526275635, |
| "learning_rate": 1e-05, |
| "loss": 0.4221, |
| "step": 46900 |
| }, |
| { |
| "epoch": 0.00047, |
| "grad_norm": 2.5257456302642822, |
| "learning_rate": 1e-05, |
| "loss": 0.4129, |
| "step": 47000 |
| }, |
| { |
| "epoch": 0.000471, |
| "grad_norm": 2.734550714492798, |
| "learning_rate": 1e-05, |
| "loss": 0.4214, |
| "step": 47100 |
| }, |
| { |
| "epoch": 0.000472, |
| "grad_norm": 2.844151496887207, |
| "learning_rate": 1e-05, |
| "loss": 0.4133, |
| "step": 47200 |
| }, |
| { |
| "epoch": 0.000473, |
| "grad_norm": 2.2594943046569824, |
| "learning_rate": 1e-05, |
| "loss": 0.4154, |
| "step": 47300 |
| }, |
| { |
| "epoch": 0.000474, |
| "grad_norm": 2.037102699279785, |
| "learning_rate": 1e-05, |
| "loss": 0.4095, |
| "step": 47400 |
| }, |
| { |
| "epoch": 0.000475, |
| "grad_norm": 2.472301483154297, |
| "learning_rate": 1e-05, |
| "loss": 0.4156, |
| "step": 47500 |
| }, |
| { |
| "epoch": 0.000476, |
| "grad_norm": 2.0751333236694336, |
| "learning_rate": 1e-05, |
| "loss": 0.4196, |
| "step": 47600 |
| }, |
| { |
| "epoch": 0.000477, |
| "grad_norm": 2.197103977203369, |
| "learning_rate": 1e-05, |
| "loss": 0.4129, |
| "step": 47700 |
| }, |
| { |
| "epoch": 0.000478, |
| "grad_norm": 2.5141637325286865, |
| "learning_rate": 1e-05, |
| "loss": 0.4085, |
| "step": 47800 |
| }, |
| { |
| "epoch": 0.000479, |
| "grad_norm": 2.441049337387085, |
| "learning_rate": 1e-05, |
| "loss": 0.4137, |
| "step": 47900 |
| }, |
| { |
| "epoch": 0.00048, |
| "grad_norm": 2.2101807594299316, |
| "learning_rate": 1e-05, |
| "loss": 0.415, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.000481, |
| "grad_norm": 2.235775947570801, |
| "learning_rate": 1e-05, |
| "loss": 0.4169, |
| "step": 48100 |
| }, |
| { |
| "epoch": 0.000482, |
| "grad_norm": 2.0968542098999023, |
| "learning_rate": 1e-05, |
| "loss": 0.4131, |
| "step": 48200 |
| }, |
| { |
| "epoch": 0.000483, |
| "grad_norm": 2.1529128551483154, |
| "learning_rate": 1e-05, |
| "loss": 0.42, |
| "step": 48300 |
| }, |
| { |
| "epoch": 0.000484, |
| "grad_norm": 2.2251384258270264, |
| "learning_rate": 1e-05, |
| "loss": 0.4153, |
| "step": 48400 |
| }, |
| { |
| "epoch": 0.000485, |
| "grad_norm": 2.5039467811584473, |
| "learning_rate": 1e-05, |
| "loss": 0.4133, |
| "step": 48500 |
| }, |
| { |
| "epoch": 0.000486, |
| "grad_norm": 2.3165180683135986, |
| "learning_rate": 1e-05, |
| "loss": 0.4104, |
| "step": 48600 |
| }, |
| { |
| "epoch": 0.000487, |
| "grad_norm": 2.2625648975372314, |
| "learning_rate": 1e-05, |
| "loss": 0.4047, |
| "step": 48700 |
| }, |
| { |
| "epoch": 0.000488, |
| "grad_norm": 2.4777987003326416, |
| "learning_rate": 1e-05, |
| "loss": 0.4112, |
| "step": 48800 |
| }, |
| { |
| "epoch": 0.000489, |
| "grad_norm": 2.794090986251831, |
| "learning_rate": 1e-05, |
| "loss": 0.4143, |
| "step": 48900 |
| }, |
| { |
| "epoch": 0.00049, |
| "grad_norm": 2.3887550830841064, |
| "learning_rate": 1e-05, |
| "loss": 0.4172, |
| "step": 49000 |
| }, |
| { |
| "epoch": 0.000491, |
| "grad_norm": 2.0181326866149902, |
| "learning_rate": 1e-05, |
| "loss": 0.4147, |
| "step": 49100 |
| }, |
| { |
| "epoch": 0.000492, |
| "grad_norm": 2.037066698074341, |
| "learning_rate": 1e-05, |
| "loss": 0.4079, |
| "step": 49200 |
| }, |
| { |
| "epoch": 0.000493, |
| "grad_norm": 2.349827289581299, |
| "learning_rate": 1e-05, |
| "loss": 0.4203, |
| "step": 49300 |
| }, |
| { |
| "epoch": 0.000494, |
| "grad_norm": 2.35591459274292, |
| "learning_rate": 1e-05, |
| "loss": 0.4096, |
| "step": 49400 |
| }, |
| { |
| "epoch": 0.000495, |
| "grad_norm": 2.994199752807617, |
| "learning_rate": 1e-05, |
| "loss": 0.4171, |
| "step": 49500 |
| }, |
| { |
| "epoch": 0.000496, |
| "grad_norm": 2.415408134460449, |
| "learning_rate": 1e-05, |
| "loss": 0.4027, |
| "step": 49600 |
| }, |
| { |
| "epoch": 0.000497, |
| "grad_norm": 2.205004930496216, |
| "learning_rate": 1e-05, |
| "loss": 0.4208, |
| "step": 49700 |
| }, |
| { |
| "epoch": 0.000498, |
| "grad_norm": 2.1636242866516113, |
| "learning_rate": 1e-05, |
| "loss": 0.4131, |
| "step": 49800 |
| }, |
| { |
| "epoch": 0.000499, |
| "grad_norm": 2.4507057666778564, |
| "learning_rate": 1e-05, |
| "loss": 0.4135, |
| "step": 49900 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 2.1614506244659424, |
| "learning_rate": 1e-05, |
| "loss": 0.41, |
| "step": 50000 |
| }, |
| { |
| "epoch": 0.000501, |
| "grad_norm": 2.070063591003418, |
| "learning_rate": 1e-05, |
| "loss": 0.4089, |
| "step": 50100 |
| }, |
| { |
| "epoch": 0.000502, |
| "grad_norm": 2.339935779571533, |
| "learning_rate": 1e-05, |
| "loss": 0.4115, |
| "step": 50200 |
| }, |
| { |
| "epoch": 0.000503, |
| "grad_norm": 2.25191330909729, |
| "learning_rate": 1e-05, |
| "loss": 0.4064, |
| "step": 50300 |
| }, |
| { |
| "epoch": 0.000504, |
| "grad_norm": 2.198077440261841, |
| "learning_rate": 1e-05, |
| "loss": 0.4049, |
| "step": 50400 |
| }, |
| { |
| "epoch": 0.000505, |
| "grad_norm": 2.4001047611236572, |
| "learning_rate": 1e-05, |
| "loss": 0.4146, |
| "step": 50500 |
| }, |
| { |
| "epoch": 0.000506, |
| "grad_norm": 2.311879873275757, |
| "learning_rate": 1e-05, |
| "loss": 0.4149, |
| "step": 50600 |
| }, |
| { |
| "epoch": 0.000507, |
| "grad_norm": 2.249931573867798, |
| "learning_rate": 1e-05, |
| "loss": 0.4091, |
| "step": 50700 |
| }, |
| { |
| "epoch": 0.000508, |
| "grad_norm": 1.957440733909607, |
| "learning_rate": 1e-05, |
| "loss": 0.407, |
| "step": 50800 |
| }, |
| { |
| "epoch": 0.000509, |
| "grad_norm": 2.3248322010040283, |
| "learning_rate": 1e-05, |
| "loss": 0.4138, |
| "step": 50900 |
| }, |
| { |
| "epoch": 0.00051, |
| "grad_norm": 2.2417356967926025, |
| "learning_rate": 1e-05, |
| "loss": 0.4075, |
| "step": 51000 |
| }, |
| { |
| "epoch": 0.000511, |
| "grad_norm": 2.2850210666656494, |
| "learning_rate": 1e-05, |
| "loss": 0.4148, |
| "step": 51100 |
| }, |
| { |
| "epoch": 0.000512, |
| "grad_norm": 2.3242995738983154, |
| "learning_rate": 1e-05, |
| "loss": 0.4134, |
| "step": 51200 |
| }, |
| { |
| "epoch": 0.000513, |
| "grad_norm": 2.0709969997406006, |
| "learning_rate": 1e-05, |
| "loss": 0.4196, |
| "step": 51300 |
| }, |
| { |
| "epoch": 0.000514, |
| "grad_norm": 2.149703025817871, |
| "learning_rate": 1e-05, |
| "loss": 0.4105, |
| "step": 51400 |
| }, |
| { |
| "epoch": 0.000515, |
| "grad_norm": 2.2497308254241943, |
| "learning_rate": 1e-05, |
| "loss": 0.4098, |
| "step": 51500 |
| }, |
| { |
| "epoch": 0.000516, |
| "grad_norm": 2.0328240394592285, |
| "learning_rate": 1e-05, |
| "loss": 0.406, |
| "step": 51600 |
| }, |
| { |
| "epoch": 0.000517, |
| "grad_norm": 2.052591562271118, |
| "learning_rate": 1e-05, |
| "loss": 0.4086, |
| "step": 51700 |
| }, |
| { |
| "epoch": 0.000518, |
| "grad_norm": 2.353180170059204, |
| "learning_rate": 1e-05, |
| "loss": 0.4058, |
| "step": 51800 |
| }, |
| { |
| "epoch": 0.000519, |
| "grad_norm": 2.352935791015625, |
| "learning_rate": 1e-05, |
| "loss": 0.4057, |
| "step": 51900 |
| }, |
| { |
| "epoch": 0.00052, |
| "grad_norm": 2.1475372314453125, |
| "learning_rate": 1e-05, |
| "loss": 0.4043, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.000521, |
| "grad_norm": 2.2819299697875977, |
| "learning_rate": 1e-05, |
| "loss": 0.4079, |
| "step": 52100 |
| }, |
| { |
| "epoch": 0.000522, |
| "grad_norm": 2.23323392868042, |
| "learning_rate": 1e-05, |
| "loss": 0.4142, |
| "step": 52200 |
| }, |
| { |
| "epoch": 0.000523, |
| "grad_norm": 2.1115095615386963, |
| "learning_rate": 1e-05, |
| "loss": 0.4091, |
| "step": 52300 |
| }, |
| { |
| "epoch": 0.000524, |
| "grad_norm": 2.34243106842041, |
| "learning_rate": 1e-05, |
| "loss": 0.4028, |
| "step": 52400 |
| }, |
| { |
| "epoch": 0.000525, |
| "grad_norm": 2.1626434326171875, |
| "learning_rate": 1e-05, |
| "loss": 0.4042, |
| "step": 52500 |
| }, |
| { |
| "epoch": 0.000526, |
| "grad_norm": 2.283756732940674, |
| "learning_rate": 1e-05, |
| "loss": 0.4068, |
| "step": 52600 |
| }, |
| { |
| "epoch": 0.000527, |
| "grad_norm": 2.1026229858398438, |
| "learning_rate": 1e-05, |
| "loss": 0.4082, |
| "step": 52700 |
| }, |
| { |
| "epoch": 0.000528, |
| "grad_norm": 2.1464221477508545, |
| "learning_rate": 1e-05, |
| "loss": 0.4093, |
| "step": 52800 |
| }, |
| { |
| "epoch": 0.000529, |
| "grad_norm": 2.1100659370422363, |
| "learning_rate": 1e-05, |
| "loss": 0.4049, |
| "step": 52900 |
| }, |
| { |
| "epoch": 0.00053, |
| "grad_norm": 2.06082820892334, |
| "learning_rate": 1e-05, |
| "loss": 0.4174, |
| "step": 53000 |
| }, |
| { |
| "epoch": 0.000531, |
| "grad_norm": 2.226346492767334, |
| "learning_rate": 1e-05, |
| "loss": 0.4084, |
| "step": 53100 |
| }, |
| { |
| "epoch": 0.000532, |
| "grad_norm": 2.432999849319458, |
| "learning_rate": 1e-05, |
| "loss": 0.406, |
| "step": 53200 |
| }, |
| { |
| "epoch": 0.000533, |
| "grad_norm": 2.4239957332611084, |
| "learning_rate": 1e-05, |
| "loss": 0.4073, |
| "step": 53300 |
| }, |
| { |
| "epoch": 0.000534, |
| "grad_norm": 2.08341646194458, |
| "learning_rate": 1e-05, |
| "loss": 0.4092, |
| "step": 53400 |
| }, |
| { |
| "epoch": 0.000535, |
| "grad_norm": 2.4562456607818604, |
| "learning_rate": 1e-05, |
| "loss": 0.3995, |
| "step": 53500 |
| }, |
| { |
| "epoch": 0.000536, |
| "grad_norm": 2.1700892448425293, |
| "learning_rate": 1e-05, |
| "loss": 0.4107, |
| "step": 53600 |
| }, |
| { |
| "epoch": 0.000537, |
| "grad_norm": 2.071171998977661, |
| "learning_rate": 1e-05, |
| "loss": 0.4085, |
| "step": 53700 |
| }, |
| { |
| "epoch": 0.000538, |
| "grad_norm": 1.977064847946167, |
| "learning_rate": 1e-05, |
| "loss": 0.4046, |
| "step": 53800 |
| }, |
| { |
| "epoch": 0.000539, |
| "grad_norm": 4.314730644226074, |
| "learning_rate": 1e-05, |
| "loss": 0.4023, |
| "step": 53900 |
| }, |
| { |
| "epoch": 0.00054, |
| "grad_norm": 2.5248172283172607, |
| "learning_rate": 1e-05, |
| "loss": 0.3983, |
| "step": 54000 |
| }, |
| { |
| "epoch": 0.000541, |
| "grad_norm": 2.1219537258148193, |
| "learning_rate": 1e-05, |
| "loss": 0.4068, |
| "step": 54100 |
| }, |
| { |
| "epoch": 0.000542, |
| "grad_norm": 2.4074840545654297, |
| "learning_rate": 1e-05, |
| "loss": 0.4029, |
| "step": 54200 |
| }, |
| { |
| "epoch": 0.000543, |
| "grad_norm": 2.462904930114746, |
| "learning_rate": 1e-05, |
| "loss": 0.4082, |
| "step": 54300 |
| }, |
| { |
| "epoch": 0.000544, |
| "grad_norm": 2.5849449634552, |
| "learning_rate": 1e-05, |
| "loss": 0.3998, |
| "step": 54400 |
| }, |
| { |
| "epoch": 0.000545, |
| "grad_norm": 2.1051547527313232, |
| "learning_rate": 1e-05, |
| "loss": 0.4018, |
| "step": 54500 |
| }, |
| { |
| "epoch": 0.000546, |
| "grad_norm": 2.4176714420318604, |
| "learning_rate": 1e-05, |
| "loss": 0.3965, |
| "step": 54600 |
| }, |
| { |
| "epoch": 0.000547, |
| "grad_norm": 2.1228177547454834, |
| "learning_rate": 1e-05, |
| "loss": 0.4007, |
| "step": 54700 |
| }, |
| { |
| "epoch": 0.000548, |
| "grad_norm": 2.0286078453063965, |
| "learning_rate": 1e-05, |
| "loss": 0.402, |
| "step": 54800 |
| }, |
| { |
| "epoch": 0.000549, |
| "grad_norm": 2.300497531890869, |
| "learning_rate": 1e-05, |
| "loss": 0.4084, |
| "step": 54900 |
| }, |
| { |
| "epoch": 0.00055, |
| "grad_norm": 2.1815927028656006, |
| "learning_rate": 1e-05, |
| "loss": 0.4031, |
| "step": 55000 |
| }, |
| { |
| "epoch": 0.000551, |
| "grad_norm": 2.347383975982666, |
| "learning_rate": 1e-05, |
| "loss": 0.402, |
| "step": 55100 |
| }, |
| { |
| "epoch": 0.000552, |
| "grad_norm": 2.059412717819214, |
| "learning_rate": 1e-05, |
| "loss": 0.4169, |
| "step": 55200 |
| }, |
| { |
| "epoch": 0.000553, |
| "grad_norm": 2.089460849761963, |
| "learning_rate": 1e-05, |
| "loss": 0.4059, |
| "step": 55300 |
| }, |
| { |
| "epoch": 0.000554, |
| "grad_norm": 2.608187675476074, |
| "learning_rate": 1e-05, |
| "loss": 0.4059, |
| "step": 55400 |
| }, |
| { |
| "epoch": 0.000555, |
| "grad_norm": 2.468566656112671, |
| "learning_rate": 1e-05, |
| "loss": 0.4007, |
| "step": 55500 |
| }, |
| { |
| "epoch": 0.000556, |
| "grad_norm": 2.740276336669922, |
| "learning_rate": 1e-05, |
| "loss": 0.4099, |
| "step": 55600 |
| }, |
| { |
| "epoch": 0.000557, |
| "grad_norm": 2.447087526321411, |
| "learning_rate": 1e-05, |
| "loss": 0.4157, |
| "step": 55700 |
| }, |
| { |
| "epoch": 0.000558, |
| "grad_norm": 2.1900322437286377, |
| "learning_rate": 1e-05, |
| "loss": 0.4018, |
| "step": 55800 |
| }, |
| { |
| "epoch": 0.000559, |
| "grad_norm": 2.332939386367798, |
| "learning_rate": 1e-05, |
| "loss": 0.3949, |
| "step": 55900 |
| }, |
| { |
| "epoch": 0.00056, |
| "grad_norm": 2.050628900527954, |
| "learning_rate": 1e-05, |
| "loss": 0.4062, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.000561, |
| "grad_norm": 2.101712226867676, |
| "learning_rate": 1e-05, |
| "loss": 0.4014, |
| "step": 56100 |
| }, |
| { |
| "epoch": 0.000562, |
| "grad_norm": 2.093705177307129, |
| "learning_rate": 1e-05, |
| "loss": 0.3951, |
| "step": 56200 |
| }, |
| { |
| "epoch": 0.000563, |
| "grad_norm": 2.02903413772583, |
| "learning_rate": 1e-05, |
| "loss": 0.4059, |
| "step": 56300 |
| }, |
| { |
| "epoch": 0.000564, |
| "grad_norm": 2.0588796138763428, |
| "learning_rate": 1e-05, |
| "loss": 0.4104, |
| "step": 56400 |
| }, |
| { |
| "epoch": 0.000565, |
| "grad_norm": 1.968138575553894, |
| "learning_rate": 1e-05, |
| "loss": 0.3955, |
| "step": 56500 |
| }, |
| { |
| "epoch": 0.000566, |
| "grad_norm": 2.0863802433013916, |
| "learning_rate": 1e-05, |
| "loss": 0.4056, |
| "step": 56600 |
| }, |
| { |
| "epoch": 0.000567, |
| "grad_norm": 2.0999319553375244, |
| "learning_rate": 1e-05, |
| "loss": 0.3999, |
| "step": 56700 |
| }, |
| { |
| "epoch": 0.000568, |
| "grad_norm": 2.2497940063476562, |
| "learning_rate": 1e-05, |
| "loss": 0.3944, |
| "step": 56800 |
| }, |
| { |
| "epoch": 0.000569, |
| "grad_norm": 2.327509880065918, |
| "learning_rate": 1e-05, |
| "loss": 0.4045, |
| "step": 56900 |
| }, |
| { |
| "epoch": 0.00057, |
| "grad_norm": 1.9509259462356567, |
| "learning_rate": 1e-05, |
| "loss": 0.3937, |
| "step": 57000 |
| }, |
| { |
| "epoch": 0.000571, |
| "grad_norm": 1.9733527898788452, |
| "learning_rate": 1e-05, |
| "loss": 0.3994, |
| "step": 57100 |
| }, |
| { |
| "epoch": 0.000572, |
| "grad_norm": 2.3149795532226562, |
| "learning_rate": 1e-05, |
| "loss": 0.3964, |
| "step": 57200 |
| }, |
| { |
| "epoch": 0.000573, |
| "grad_norm": 2.2869510650634766, |
| "learning_rate": 1e-05, |
| "loss": 0.3972, |
| "step": 57300 |
| }, |
| { |
| "epoch": 0.000574, |
| "grad_norm": 2.807288885116577, |
| "learning_rate": 1e-05, |
| "loss": 0.3979, |
| "step": 57400 |
| }, |
| { |
| "epoch": 0.000575, |
| "grad_norm": 1.9130806922912598, |
| "learning_rate": 1e-05, |
| "loss": 0.3909, |
| "step": 57500 |
| }, |
| { |
| "epoch": 0.000576, |
| "grad_norm": 2.392228841781616, |
| "learning_rate": 1e-05, |
| "loss": 0.4081, |
| "step": 57600 |
| }, |
| { |
| "epoch": 0.000577, |
| "grad_norm": 2.2016382217407227, |
| "learning_rate": 1e-05, |
| "loss": 0.3942, |
| "step": 57700 |
| }, |
| { |
| "epoch": 0.000578, |
| "grad_norm": 1.9153637886047363, |
| "learning_rate": 1e-05, |
| "loss": 0.3856, |
| "step": 57800 |
| }, |
| { |
| "epoch": 0.000579, |
| "grad_norm": 2.334127902984619, |
| "learning_rate": 1e-05, |
| "loss": 0.4011, |
| "step": 57900 |
| }, |
| { |
| "epoch": 0.00058, |
| "grad_norm": 2.0389389991760254, |
| "learning_rate": 1e-05, |
| "loss": 0.3964, |
| "step": 58000 |
| }, |
| { |
| "epoch": 0.000581, |
| "grad_norm": 1.817014217376709, |
| "learning_rate": 1e-05, |
| "loss": 0.401, |
| "step": 58100 |
| }, |
| { |
| "epoch": 0.000582, |
| "grad_norm": 2.2769718170166016, |
| "learning_rate": 1e-05, |
| "loss": 0.4025, |
| "step": 58200 |
| }, |
| { |
| "epoch": 0.000583, |
| "grad_norm": 2.2681713104248047, |
| "learning_rate": 1e-05, |
| "loss": 0.3998, |
| "step": 58300 |
| }, |
| { |
| "epoch": 0.000584, |
| "grad_norm": 2.0518765449523926, |
| "learning_rate": 1e-05, |
| "loss": 0.3958, |
| "step": 58400 |
| }, |
| { |
| "epoch": 0.000585, |
| "grad_norm": 2.0787107944488525, |
| "learning_rate": 1e-05, |
| "loss": 0.3979, |
| "step": 58500 |
| }, |
| { |
| "epoch": 0.000586, |
| "grad_norm": 2.2114005088806152, |
| "learning_rate": 1e-05, |
| "loss": 0.3953, |
| "step": 58600 |
| }, |
| { |
| "epoch": 0.000587, |
| "grad_norm": 2.2382404804229736, |
| "learning_rate": 1e-05, |
| "loss": 0.4011, |
| "step": 58700 |
| }, |
| { |
| "epoch": 0.000588, |
| "grad_norm": 2.2104434967041016, |
| "learning_rate": 1e-05, |
| "loss": 0.3897, |
| "step": 58800 |
| }, |
| { |
| "epoch": 0.000589, |
| "grad_norm": 2.5010359287261963, |
| "learning_rate": 1e-05, |
| "loss": 0.3929, |
| "step": 58900 |
| }, |
| { |
| "epoch": 0.00059, |
| "grad_norm": 2.6456377506256104, |
| "learning_rate": 1e-05, |
| "loss": 0.4048, |
| "step": 59000 |
| }, |
| { |
| "epoch": 0.000591, |
| "grad_norm": 2.2201075553894043, |
| "learning_rate": 1e-05, |
| "loss": 0.4011, |
| "step": 59100 |
| }, |
| { |
| "epoch": 0.000592, |
| "grad_norm": 2.050746440887451, |
| "learning_rate": 1e-05, |
| "loss": 0.4003, |
| "step": 59200 |
| }, |
| { |
| "epoch": 0.000593, |
| "grad_norm": 4.129772663116455, |
| "learning_rate": 1e-05, |
| "loss": 0.3889, |
| "step": 59300 |
| }, |
| { |
| "epoch": 0.000594, |
| "grad_norm": 2.160189628601074, |
| "learning_rate": 1e-05, |
| "loss": 0.4046, |
| "step": 59400 |
| }, |
| { |
| "epoch": 0.000595, |
| "grad_norm": 2.5370826721191406, |
| "learning_rate": 1e-05, |
| "loss": 0.3829, |
| "step": 59500 |
| }, |
| { |
| "epoch": 0.000596, |
| "grad_norm": 2.0123531818389893, |
| "learning_rate": 1e-05, |
| "loss": 0.3976, |
| "step": 59600 |
| }, |
| { |
| "epoch": 0.000597, |
| "grad_norm": 2.175504207611084, |
| "learning_rate": 1e-05, |
| "loss": 0.3908, |
| "step": 59700 |
| }, |
| { |
| "epoch": 0.000598, |
| "grad_norm": 1.993752360343933, |
| "learning_rate": 1e-05, |
| "loss": 0.401, |
| "step": 59800 |
| }, |
| { |
| "epoch": 0.000599, |
| "grad_norm": 2.1103925704956055, |
| "learning_rate": 1e-05, |
| "loss": 0.3886, |
| "step": 59900 |
| }, |
| { |
| "epoch": 0.0006, |
| "grad_norm": 1.866847038269043, |
| "learning_rate": 1e-05, |
| "loss": 0.3929, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.0006, |
| "eval_loss": 0.3701171875, |
| "eval_runtime": 109.3176, |
| "eval_samples_per_second": 457.383, |
| "eval_steps_per_second": 28.586, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.000601, |
| "grad_norm": 2.07293701171875, |
| "learning_rate": 1e-05, |
| "loss": 0.3941, |
| "step": 60100 |
| }, |
| { |
| "epoch": 0.000602, |
| "grad_norm": 2.153665542602539, |
| "learning_rate": 1e-05, |
| "loss": 0.3933, |
| "step": 60200 |
| }, |
| { |
| "epoch": 0.000603, |
| "grad_norm": 1.8806813955307007, |
| "learning_rate": 1e-05, |
| "loss": 0.3908, |
| "step": 60300 |
| }, |
| { |
| "epoch": 0.000604, |
| "grad_norm": 1.9428766965866089, |
| "learning_rate": 1e-05, |
| "loss": 0.3941, |
| "step": 60400 |
| }, |
| { |
| "epoch": 0.000605, |
| "grad_norm": 2.4207301139831543, |
| "learning_rate": 1e-05, |
| "loss": 0.395, |
| "step": 60500 |
| }, |
| { |
| "epoch": 0.000606, |
| "grad_norm": 2.292665958404541, |
| "learning_rate": 1e-05, |
| "loss": 0.3893, |
| "step": 60600 |
| }, |
| { |
| "epoch": 0.000607, |
| "grad_norm": 2.2332205772399902, |
| "learning_rate": 1e-05, |
| "loss": 0.3939, |
| "step": 60700 |
| }, |
| { |
| "epoch": 0.000608, |
| "grad_norm": 2.379991054534912, |
| "learning_rate": 1e-05, |
| "loss": 0.3911, |
| "step": 60800 |
| }, |
| { |
| "epoch": 0.000609, |
| "grad_norm": 2.1357316970825195, |
| "learning_rate": 1e-05, |
| "loss": 0.394, |
| "step": 60900 |
| }, |
| { |
| "epoch": 0.00061, |
| "grad_norm": 2.218677520751953, |
| "learning_rate": 1e-05, |
| "loss": 0.3908, |
| "step": 61000 |
| }, |
| { |
| "epoch": 0.000611, |
| "grad_norm": 2.144749641418457, |
| "learning_rate": 1e-05, |
| "loss": 0.3948, |
| "step": 61100 |
| }, |
| { |
| "epoch": 0.000612, |
| "grad_norm": 1.9096667766571045, |
| "learning_rate": 1e-05, |
| "loss": 0.4055, |
| "step": 61200 |
| }, |
| { |
| "epoch": 0.000613, |
| "grad_norm": 1.813551664352417, |
| "learning_rate": 1e-05, |
| "loss": 0.3909, |
| "step": 61300 |
| }, |
| { |
| "epoch": 0.000614, |
| "grad_norm": 2.0957746505737305, |
| "learning_rate": 1e-05, |
| "loss": 0.3934, |
| "step": 61400 |
| }, |
| { |
| "epoch": 0.000615, |
| "grad_norm": 2.288628578186035, |
| "learning_rate": 1e-05, |
| "loss": 0.3948, |
| "step": 61500 |
| }, |
| { |
| "epoch": 0.000616, |
| "grad_norm": 1.8869370222091675, |
| "learning_rate": 1e-05, |
| "loss": 0.3896, |
| "step": 61600 |
| }, |
| { |
| "epoch": 0.000617, |
| "grad_norm": 1.8751919269561768, |
| "learning_rate": 1e-05, |
| "loss": 0.3946, |
| "step": 61700 |
| }, |
| { |
| "epoch": 0.000618, |
| "grad_norm": 2.0423409938812256, |
| "learning_rate": 1e-05, |
| "loss": 0.3914, |
| "step": 61800 |
| }, |
| { |
| "epoch": 0.000619, |
| "grad_norm": 2.154679536819458, |
| "learning_rate": 1e-05, |
| "loss": 0.3942, |
| "step": 61900 |
| }, |
| { |
| "epoch": 0.00062, |
| "grad_norm": 2.293510913848877, |
| "learning_rate": 1e-05, |
| "loss": 0.3867, |
| "step": 62000 |
| }, |
| { |
| "epoch": 0.000621, |
| "grad_norm": 2.034313678741455, |
| "learning_rate": 1e-05, |
| "loss": 0.3844, |
| "step": 62100 |
| }, |
| { |
| "epoch": 0.000622, |
| "grad_norm": 2.105489730834961, |
| "learning_rate": 1e-05, |
| "loss": 0.3886, |
| "step": 62200 |
| }, |
| { |
| "epoch": 0.000623, |
| "grad_norm": 1.9530473947525024, |
| "learning_rate": 1e-05, |
| "loss": 0.3962, |
| "step": 62300 |
| }, |
| { |
| "epoch": 0.000624, |
| "grad_norm": 2.291125774383545, |
| "learning_rate": 1e-05, |
| "loss": 0.392, |
| "step": 62400 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 2.169159412384033, |
| "learning_rate": 1e-05, |
| "loss": 0.3916, |
| "step": 62500 |
| }, |
| { |
| "epoch": 0.000626, |
| "grad_norm": 2.2920339107513428, |
| "learning_rate": 1e-05, |
| "loss": 0.3886, |
| "step": 62600 |
| }, |
| { |
| "epoch": 0.000627, |
| "grad_norm": 2.248567819595337, |
| "learning_rate": 1e-05, |
| "loss": 0.3997, |
| "step": 62700 |
| }, |
| { |
| "epoch": 0.000628, |
| "grad_norm": 1.9369299411773682, |
| "learning_rate": 1e-05, |
| "loss": 0.3945, |
| "step": 62800 |
| }, |
| { |
| "epoch": 0.000629, |
| "grad_norm": 1.912782073020935, |
| "learning_rate": 1e-05, |
| "loss": 0.3911, |
| "step": 62900 |
| }, |
| { |
| "epoch": 0.00063, |
| "grad_norm": 1.7592915296554565, |
| "learning_rate": 1e-05, |
| "loss": 0.3989, |
| "step": 63000 |
| }, |
| { |
| "epoch": 0.000631, |
| "grad_norm": 2.0076982975006104, |
| "learning_rate": 1e-05, |
| "loss": 0.3918, |
| "step": 63100 |
| }, |
| { |
| "epoch": 0.000632, |
| "grad_norm": 2.0114753246307373, |
| "learning_rate": 1e-05, |
| "loss": 0.3932, |
| "step": 63200 |
| }, |
| { |
| "epoch": 0.000633, |
| "grad_norm": 2.50410795211792, |
| "learning_rate": 1e-05, |
| "loss": 0.3958, |
| "step": 63300 |
| }, |
| { |
| "epoch": 0.000634, |
| "grad_norm": 2.156872510910034, |
| "learning_rate": 1e-05, |
| "loss": 0.3875, |
| "step": 63400 |
| }, |
| { |
| "epoch": 0.000635, |
| "grad_norm": 2.2408478260040283, |
| "learning_rate": 1e-05, |
| "loss": 0.3939, |
| "step": 63500 |
| }, |
| { |
| "epoch": 0.000636, |
| "grad_norm": 2.0988857746124268, |
| "learning_rate": 1e-05, |
| "loss": 0.3867, |
| "step": 63600 |
| }, |
| { |
| "epoch": 0.000637, |
| "grad_norm": 2.140925407409668, |
| "learning_rate": 1e-05, |
| "loss": 0.3878, |
| "step": 63700 |
| }, |
| { |
| "epoch": 0.000638, |
| "grad_norm": 2.2293543815612793, |
| "learning_rate": 1e-05, |
| "loss": 0.3866, |
| "step": 63800 |
| }, |
| { |
| "epoch": 0.000639, |
| "grad_norm": 2.0480923652648926, |
| "learning_rate": 1e-05, |
| "loss": 0.3819, |
| "step": 63900 |
| }, |
| { |
| "epoch": 0.00064, |
| "grad_norm": 2.129159927368164, |
| "learning_rate": 1e-05, |
| "loss": 0.3874, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.000641, |
| "grad_norm": 1.907259225845337, |
| "learning_rate": 1e-05, |
| "loss": 0.3918, |
| "step": 64100 |
| }, |
| { |
| "epoch": 0.000642, |
| "grad_norm": 1.9210904836654663, |
| "learning_rate": 1e-05, |
| "loss": 0.3863, |
| "step": 64200 |
| }, |
| { |
| "epoch": 0.000643, |
| "grad_norm": 2.2560818195343018, |
| "learning_rate": 1e-05, |
| "loss": 0.3986, |
| "step": 64300 |
| }, |
| { |
| "epoch": 0.000644, |
| "grad_norm": 1.8635262250900269, |
| "learning_rate": 1e-05, |
| "loss": 0.3781, |
| "step": 64400 |
| }, |
| { |
| "epoch": 0.000645, |
| "grad_norm": 2.076395273208618, |
| "learning_rate": 1e-05, |
| "loss": 0.3822, |
| "step": 64500 |
| }, |
| { |
| "epoch": 0.000646, |
| "grad_norm": 1.7710347175598145, |
| "learning_rate": 1e-05, |
| "loss": 0.3794, |
| "step": 64600 |
| }, |
| { |
| "epoch": 0.000647, |
| "grad_norm": 2.0143582820892334, |
| "learning_rate": 1e-05, |
| "loss": 0.3825, |
| "step": 64700 |
| }, |
| { |
| "epoch": 0.000648, |
| "grad_norm": 2.2155025005340576, |
| "learning_rate": 1e-05, |
| "loss": 0.3938, |
| "step": 64800 |
| }, |
| { |
| "epoch": 0.000649, |
| "grad_norm": 1.8567825555801392, |
| "learning_rate": 1e-05, |
| "loss": 0.3799, |
| "step": 64900 |
| }, |
| { |
| "epoch": 0.00065, |
| "grad_norm": 2.183415412902832, |
| "learning_rate": 1e-05, |
| "loss": 0.3824, |
| "step": 65000 |
| }, |
| { |
| "epoch": 0.000651, |
| "grad_norm": 2.005911350250244, |
| "learning_rate": 1e-05, |
| "loss": 0.3931, |
| "step": 65100 |
| }, |
| { |
| "epoch": 0.000652, |
| "grad_norm": 1.8332974910736084, |
| "learning_rate": 1e-05, |
| "loss": 0.3811, |
| "step": 65200 |
| }, |
| { |
| "epoch": 0.000653, |
| "grad_norm": 2.1285884380340576, |
| "learning_rate": 1e-05, |
| "loss": 0.3918, |
| "step": 65300 |
| }, |
| { |
| "epoch": 0.000654, |
| "grad_norm": 2.158264636993408, |
| "learning_rate": 1e-05, |
| "loss": 0.3863, |
| "step": 65400 |
| }, |
| { |
| "epoch": 0.000655, |
| "grad_norm": 1.8876869678497314, |
| "learning_rate": 1e-05, |
| "loss": 0.3934, |
| "step": 65500 |
| }, |
| { |
| "epoch": 0.000656, |
| "grad_norm": 1.8769333362579346, |
| "learning_rate": 1e-05, |
| "loss": 0.3923, |
| "step": 65600 |
| }, |
| { |
| "epoch": 0.000657, |
| "grad_norm": 2.019409656524658, |
| "learning_rate": 1e-05, |
| "loss": 0.3826, |
| "step": 65700 |
| }, |
| { |
| "epoch": 0.000658, |
| "grad_norm": 2.0213446617126465, |
| "learning_rate": 1e-05, |
| "loss": 0.3922, |
| "step": 65800 |
| }, |
| { |
| "epoch": 0.000659, |
| "grad_norm": 2.2089147567749023, |
| "learning_rate": 1e-05, |
| "loss": 0.3933, |
| "step": 65900 |
| }, |
| { |
| "epoch": 0.00066, |
| "grad_norm": 1.837319254875183, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "step": 66000 |
| }, |
| { |
| "epoch": 0.000661, |
| "grad_norm": 1.8547362089157104, |
| "learning_rate": 1e-05, |
| "loss": 0.3887, |
| "step": 66100 |
| }, |
| { |
| "epoch": 0.000662, |
| "grad_norm": 1.8269295692443848, |
| "learning_rate": 1e-05, |
| "loss": 0.384, |
| "step": 66200 |
| }, |
| { |
| "epoch": 0.000663, |
| "grad_norm": 2.052025318145752, |
| "learning_rate": 1e-05, |
| "loss": 0.3838, |
| "step": 66300 |
| }, |
| { |
| "epoch": 0.000664, |
| "grad_norm": 2.030297040939331, |
| "learning_rate": 1e-05, |
| "loss": 0.3873, |
| "step": 66400 |
| }, |
| { |
| "epoch": 0.000665, |
| "grad_norm": 2.019329309463501, |
| "learning_rate": 1e-05, |
| "loss": 0.3828, |
| "step": 66500 |
| }, |
| { |
| "epoch": 0.000666, |
| "grad_norm": 1.8459995985031128, |
| "learning_rate": 1e-05, |
| "loss": 0.3879, |
| "step": 66600 |
| }, |
| { |
| "epoch": 0.000667, |
| "grad_norm": 1.9611304998397827, |
| "learning_rate": 1e-05, |
| "loss": 0.3893, |
| "step": 66700 |
| }, |
| { |
| "epoch": 0.000668, |
| "grad_norm": 1.8976935148239136, |
| "learning_rate": 1e-05, |
| "loss": 0.3858, |
| "step": 66800 |
| }, |
| { |
| "epoch": 0.000669, |
| "grad_norm": 1.9818809032440186, |
| "learning_rate": 1e-05, |
| "loss": 0.3838, |
| "step": 66900 |
| }, |
| { |
| "epoch": 0.00067, |
| "grad_norm": 1.7839868068695068, |
| "learning_rate": 1e-05, |
| "loss": 0.3809, |
| "step": 67000 |
| }, |
| { |
| "epoch": 0.000671, |
| "grad_norm": 2.1452698707580566, |
| "learning_rate": 1e-05, |
| "loss": 0.3843, |
| "step": 67100 |
| }, |
| { |
| "epoch": 0.000672, |
| "grad_norm": 2.077277660369873, |
| "learning_rate": 1e-05, |
| "loss": 0.3837, |
| "step": 67200 |
| }, |
| { |
| "epoch": 0.000673, |
| "grad_norm": 2.0192837715148926, |
| "learning_rate": 1e-05, |
| "loss": 0.3974, |
| "step": 67300 |
| }, |
| { |
| "epoch": 0.000674, |
| "grad_norm": 2.134225606918335, |
| "learning_rate": 1e-05, |
| "loss": 0.3777, |
| "step": 67400 |
| }, |
| { |
| "epoch": 0.000675, |
| "grad_norm": 2.0650904178619385, |
| "learning_rate": 1e-05, |
| "loss": 0.3837, |
| "step": 67500 |
| }, |
| { |
| "epoch": 0.000676, |
| "grad_norm": 1.857259750366211, |
| "learning_rate": 1e-05, |
| "loss": 0.3777, |
| "step": 67600 |
| }, |
| { |
| "epoch": 0.000677, |
| "grad_norm": 1.8561601638793945, |
| "learning_rate": 1e-05, |
| "loss": 0.3854, |
| "step": 67700 |
| }, |
| { |
| "epoch": 0.000678, |
| "grad_norm": 2.27827525138855, |
| "learning_rate": 1e-05, |
| "loss": 0.3772, |
| "step": 67800 |
| }, |
| { |
| "epoch": 0.000679, |
| "grad_norm": 1.9206945896148682, |
| "learning_rate": 1e-05, |
| "loss": 0.3882, |
| "step": 67900 |
| }, |
| { |
| "epoch": 0.00068, |
| "grad_norm": 1.849585771560669, |
| "learning_rate": 1e-05, |
| "loss": 0.3903, |
| "step": 68000 |
| }, |
| { |
| "epoch": 0.000681, |
| "grad_norm": 2.1679646968841553, |
| "learning_rate": 1e-05, |
| "loss": 0.3844, |
| "step": 68100 |
| }, |
| { |
| "epoch": 0.000682, |
| "grad_norm": 2.105186700820923, |
| "learning_rate": 1e-05, |
| "loss": 0.3843, |
| "step": 68200 |
| }, |
| { |
| "epoch": 0.000683, |
| "grad_norm": 1.8043280839920044, |
| "learning_rate": 1e-05, |
| "loss": 0.3782, |
| "step": 68300 |
| }, |
| { |
| "epoch": 0.000684, |
| "grad_norm": 1.9731149673461914, |
| "learning_rate": 1e-05, |
| "loss": 0.3876, |
| "step": 68400 |
| }, |
| { |
| "epoch": 0.000685, |
| "grad_norm": 1.9924131631851196, |
| "learning_rate": 1e-05, |
| "loss": 0.386, |
| "step": 68500 |
| }, |
| { |
| "epoch": 0.000686, |
| "grad_norm": 1.8679152727127075, |
| "learning_rate": 1e-05, |
| "loss": 0.3797, |
| "step": 68600 |
| }, |
| { |
| "epoch": 0.000687, |
| "grad_norm": 2.01244854927063, |
| "learning_rate": 1e-05, |
| "loss": 0.3803, |
| "step": 68700 |
| }, |
| { |
| "epoch": 0.000688, |
| "grad_norm": 1.9184852838516235, |
| "learning_rate": 1e-05, |
| "loss": 0.3872, |
| "step": 68800 |
| }, |
| { |
| "epoch": 0.000689, |
| "grad_norm": 2.039447546005249, |
| "learning_rate": 1e-05, |
| "loss": 0.3808, |
| "step": 68900 |
| }, |
| { |
| "epoch": 0.00069, |
| "grad_norm": 2.367798089981079, |
| "learning_rate": 1e-05, |
| "loss": 0.3858, |
| "step": 69000 |
| }, |
| { |
| "epoch": 0.000691, |
| "grad_norm": 2.0003209114074707, |
| "learning_rate": 1e-05, |
| "loss": 0.3808, |
| "step": 69100 |
| }, |
| { |
| "epoch": 0.000692, |
| "grad_norm": 1.9453091621398926, |
| "learning_rate": 1e-05, |
| "loss": 0.3707, |
| "step": 69200 |
| }, |
| { |
| "epoch": 0.000693, |
| "grad_norm": 1.6954456567764282, |
| "learning_rate": 1e-05, |
| "loss": 0.3829, |
| "step": 69300 |
| }, |
| { |
| "epoch": 0.000694, |
| "grad_norm": 2.1012470722198486, |
| "learning_rate": 1e-05, |
| "loss": 0.3833, |
| "step": 69400 |
| }, |
| { |
| "epoch": 0.000695, |
| "grad_norm": 1.8490900993347168, |
| "learning_rate": 1e-05, |
| "loss": 0.3873, |
| "step": 69500 |
| }, |
| { |
| "epoch": 0.000696, |
| "grad_norm": 1.8682618141174316, |
| "learning_rate": 1e-05, |
| "loss": 0.3862, |
| "step": 69600 |
| }, |
| { |
| "epoch": 0.000697, |
| "grad_norm": 2.068352460861206, |
| "learning_rate": 1e-05, |
| "loss": 0.3802, |
| "step": 69700 |
| }, |
| { |
| "epoch": 0.000698, |
| "grad_norm": 2.118117094039917, |
| "learning_rate": 1e-05, |
| "loss": 0.3764, |
| "step": 69800 |
| }, |
| { |
| "epoch": 0.000699, |
| "grad_norm": 1.8571758270263672, |
| "learning_rate": 1e-05, |
| "loss": 0.378, |
| "step": 69900 |
| }, |
| { |
| "epoch": 0.0007, |
| "grad_norm": 2.103874921798706, |
| "learning_rate": 1e-05, |
| "loss": 0.3798, |
| "step": 70000 |
| }, |
| { |
| "epoch": 0.000701, |
| "grad_norm": 2.4420368671417236, |
| "learning_rate": 1e-05, |
| "loss": 0.3796, |
| "step": 70100 |
| }, |
| { |
| "epoch": 0.000702, |
| "grad_norm": 2.143949270248413, |
| "learning_rate": 1e-05, |
| "loss": 0.3735, |
| "step": 70200 |
| }, |
| { |
| "epoch": 0.000703, |
| "grad_norm": 2.070586681365967, |
| "learning_rate": 1e-05, |
| "loss": 0.3813, |
| "step": 70300 |
| }, |
| { |
| "epoch": 0.000704, |
| "grad_norm": 2.0714941024780273, |
| "learning_rate": 1e-05, |
| "loss": 0.383, |
| "step": 70400 |
| }, |
| { |
| "epoch": 0.000705, |
| "grad_norm": 2.0592539310455322, |
| "learning_rate": 1e-05, |
| "loss": 0.3769, |
| "step": 70500 |
| }, |
| { |
| "epoch": 0.000706, |
| "grad_norm": 2.0504090785980225, |
| "learning_rate": 1e-05, |
| "loss": 0.3791, |
| "step": 70600 |
| }, |
| { |
| "epoch": 0.000707, |
| "grad_norm": 1.6406168937683105, |
| "learning_rate": 1e-05, |
| "loss": 0.3758, |
| "step": 70700 |
| }, |
| { |
| "epoch": 0.000708, |
| "grad_norm": 2.1220123767852783, |
| "learning_rate": 1e-05, |
| "loss": 0.3868, |
| "step": 70800 |
| }, |
| { |
| "epoch": 0.000709, |
| "grad_norm": 2.0536298751831055, |
| "learning_rate": 1e-05, |
| "loss": 0.3805, |
| "step": 70900 |
| }, |
| { |
| "epoch": 0.00071, |
| "grad_norm": 2.076979875564575, |
| "learning_rate": 1e-05, |
| "loss": 0.3807, |
| "step": 71000 |
| }, |
| { |
| "epoch": 0.000711, |
| "grad_norm": 2.6225621700286865, |
| "learning_rate": 1e-05, |
| "loss": 0.373, |
| "step": 71100 |
| }, |
| { |
| "epoch": 0.000712, |
| "grad_norm": 2.2727653980255127, |
| "learning_rate": 1e-05, |
| "loss": 0.3762, |
| "step": 71200 |
| }, |
| { |
| "epoch": 0.000713, |
| "grad_norm": 2.0625195503234863, |
| "learning_rate": 1e-05, |
| "loss": 0.3841, |
| "step": 71300 |
| }, |
| { |
| "epoch": 0.000714, |
| "grad_norm": 1.9859055280685425, |
| "learning_rate": 1e-05, |
| "loss": 0.3801, |
| "step": 71400 |
| }, |
| { |
| "epoch": 0.000715, |
| "grad_norm": 1.9635552167892456, |
| "learning_rate": 1e-05, |
| "loss": 0.3848, |
| "step": 71500 |
| }, |
| { |
| "epoch": 0.000716, |
| "grad_norm": 2.121825933456421, |
| "learning_rate": 1e-05, |
| "loss": 0.3745, |
| "step": 71600 |
| }, |
| { |
| "epoch": 0.000717, |
| "grad_norm": 1.9133636951446533, |
| "learning_rate": 1e-05, |
| "loss": 0.3814, |
| "step": 71700 |
| }, |
| { |
| "epoch": 0.000718, |
| "grad_norm": 2.1131491661071777, |
| "learning_rate": 1e-05, |
| "loss": 0.37, |
| "step": 71800 |
| }, |
| { |
| "epoch": 0.000719, |
| "grad_norm": 2.0350754261016846, |
| "learning_rate": 1e-05, |
| "loss": 0.3814, |
| "step": 71900 |
| }, |
| { |
| "epoch": 0.00072, |
| "grad_norm": 2.757786750793457, |
| "learning_rate": 1e-05, |
| "loss": 0.3754, |
| "step": 72000 |
| }, |
| { |
| "epoch": 0.000721, |
| "grad_norm": 1.797782063484192, |
| "learning_rate": 1e-05, |
| "loss": 0.3712, |
| "step": 72100 |
| }, |
| { |
| "epoch": 0.000722, |
| "grad_norm": 2.0632424354553223, |
| "learning_rate": 1e-05, |
| "loss": 0.3881, |
| "step": 72200 |
| }, |
| { |
| "epoch": 0.000723, |
| "grad_norm": 1.7604708671569824, |
| "learning_rate": 1e-05, |
| "loss": 0.3787, |
| "step": 72300 |
| }, |
| { |
| "epoch": 0.000724, |
| "grad_norm": 3.3791792392730713, |
| "learning_rate": 1e-05, |
| "loss": 0.3776, |
| "step": 72400 |
| }, |
| { |
| "epoch": 0.000725, |
| "grad_norm": 2.1998651027679443, |
| "learning_rate": 1e-05, |
| "loss": 0.3701, |
| "step": 72500 |
| }, |
| { |
| "epoch": 0.000726, |
| "grad_norm": 2.309633731842041, |
| "learning_rate": 1e-05, |
| "loss": 0.3742, |
| "step": 72600 |
| }, |
| { |
| "epoch": 0.000727, |
| "grad_norm": 2.0794286727905273, |
| "learning_rate": 1e-05, |
| "loss": 0.3752, |
| "step": 72700 |
| }, |
| { |
| "epoch": 0.000728, |
| "grad_norm": 1.98604154586792, |
| "learning_rate": 1e-05, |
| "loss": 0.3841, |
| "step": 72800 |
| }, |
| { |
| "epoch": 0.000729, |
| "grad_norm": 2.0682222843170166, |
| "learning_rate": 1e-05, |
| "loss": 0.376, |
| "step": 72900 |
| }, |
| { |
| "epoch": 0.00073, |
| "grad_norm": 1.9491254091262817, |
| "learning_rate": 1e-05, |
| "loss": 0.3704, |
| "step": 73000 |
| }, |
| { |
| "epoch": 0.000731, |
| "grad_norm": 1.809173822402954, |
| "learning_rate": 1e-05, |
| "loss": 0.376, |
| "step": 73100 |
| }, |
| { |
| "epoch": 0.000732, |
| "grad_norm": 1.7224321365356445, |
| "learning_rate": 1e-05, |
| "loss": 0.3737, |
| "step": 73200 |
| }, |
| { |
| "epoch": 0.000733, |
| "grad_norm": 1.7145380973815918, |
| "learning_rate": 1e-05, |
| "loss": 0.3806, |
| "step": 73300 |
| }, |
| { |
| "epoch": 0.000734, |
| "grad_norm": 2.0233635902404785, |
| "learning_rate": 1e-05, |
| "loss": 0.3741, |
| "step": 73400 |
| }, |
| { |
| "epoch": 0.000735, |
| "grad_norm": 1.9742248058319092, |
| "learning_rate": 1e-05, |
| "loss": 0.3645, |
| "step": 73500 |
| }, |
| { |
| "epoch": 0.000736, |
| "grad_norm": 1.889393925666809, |
| "learning_rate": 1e-05, |
| "loss": 0.3696, |
| "step": 73600 |
| }, |
| { |
| "epoch": 0.000737, |
| "grad_norm": 2.075669050216675, |
| "learning_rate": 1e-05, |
| "loss": 0.3735, |
| "step": 73700 |
| }, |
| { |
| "epoch": 0.000738, |
| "grad_norm": 2.1420507431030273, |
| "learning_rate": 1e-05, |
| "loss": 0.3701, |
| "step": 73800 |
| }, |
| { |
| "epoch": 0.000739, |
| "grad_norm": 2.1469383239746094, |
| "learning_rate": 1e-05, |
| "loss": 0.379, |
| "step": 73900 |
| }, |
| { |
| "epoch": 0.00074, |
| "grad_norm": 2.0224719047546387, |
| "learning_rate": 1e-05, |
| "loss": 0.372, |
| "step": 74000 |
| }, |
| { |
| "epoch": 0.000741, |
| "grad_norm": 1.8598190546035767, |
| "learning_rate": 1e-05, |
| "loss": 0.3792, |
| "step": 74100 |
| }, |
| { |
| "epoch": 0.000742, |
| "grad_norm": 2.1243066787719727, |
| "learning_rate": 1e-05, |
| "loss": 0.3689, |
| "step": 74200 |
| }, |
| { |
| "epoch": 0.000743, |
| "grad_norm": 1.8850631713867188, |
| "learning_rate": 1e-05, |
| "loss": 0.3762, |
| "step": 74300 |
| }, |
| { |
| "epoch": 0.000744, |
| "grad_norm": 2.0598785877227783, |
| "learning_rate": 1e-05, |
| "loss": 0.3715, |
| "step": 74400 |
| }, |
| { |
| "epoch": 0.000745, |
| "grad_norm": 2.120824098587036, |
| "learning_rate": 1e-05, |
| "loss": 0.3696, |
| "step": 74500 |
| }, |
| { |
| "epoch": 0.000746, |
| "grad_norm": 1.7642192840576172, |
| "learning_rate": 1e-05, |
| "loss": 0.3771, |
| "step": 74600 |
| }, |
| { |
| "epoch": 0.000747, |
| "grad_norm": 2.1491034030914307, |
| "learning_rate": 1e-05, |
| "loss": 0.3883, |
| "step": 74700 |
| }, |
| { |
| "epoch": 0.000748, |
| "grad_norm": 1.8905261754989624, |
| "learning_rate": 1e-05, |
| "loss": 0.3756, |
| "step": 74800 |
| }, |
| { |
| "epoch": 0.000749, |
| "grad_norm": 2.4035165309906006, |
| "learning_rate": 1e-05, |
| "loss": 0.3776, |
| "step": 74900 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 2.104729652404785, |
| "learning_rate": 1e-05, |
| "loss": 0.3762, |
| "step": 75000 |
| }, |
| { |
| "epoch": 0.000751, |
| "grad_norm": 2.0208077430725098, |
| "learning_rate": 1e-05, |
| "loss": 0.3696, |
| "step": 75100 |
| }, |
| { |
| "epoch": 0.000752, |
| "grad_norm": 2.1069564819335938, |
| "learning_rate": 1e-05, |
| "loss": 0.3766, |
| "step": 75200 |
| }, |
| { |
| "epoch": 0.000753, |
| "grad_norm": 1.9399853944778442, |
| "learning_rate": 1e-05, |
| "loss": 0.3767, |
| "step": 75300 |
| }, |
| { |
| "epoch": 0.000754, |
| "grad_norm": 2.072504758834839, |
| "learning_rate": 1e-05, |
| "loss": 0.3787, |
| "step": 75400 |
| }, |
| { |
| "epoch": 0.000755, |
| "grad_norm": 1.6709392070770264, |
| "learning_rate": 1e-05, |
| "loss": 0.3739, |
| "step": 75500 |
| }, |
| { |
| "epoch": 0.000756, |
| "grad_norm": 1.9638621807098389, |
| "learning_rate": 1e-05, |
| "loss": 0.3736, |
| "step": 75600 |
| }, |
| { |
| "epoch": 0.000757, |
| "grad_norm": 1.9661500453948975, |
| "learning_rate": 1e-05, |
| "loss": 0.3742, |
| "step": 75700 |
| }, |
| { |
| "epoch": 0.000758, |
| "grad_norm": 1.7618420124053955, |
| "learning_rate": 1e-05, |
| "loss": 0.3725, |
| "step": 75800 |
| }, |
| { |
| "epoch": 0.000759, |
| "grad_norm": 2.090238332748413, |
| "learning_rate": 1e-05, |
| "loss": 0.3771, |
| "step": 75900 |
| }, |
| { |
| "epoch": 0.00076, |
| "grad_norm": 1.9758362770080566, |
| "learning_rate": 1e-05, |
| "loss": 0.3666, |
| "step": 76000 |
| }, |
| { |
| "epoch": 0.000761, |
| "grad_norm": 2.023850917816162, |
| "learning_rate": 1e-05, |
| "loss": 0.379, |
| "step": 76100 |
| }, |
| { |
| "epoch": 0.000762, |
| "grad_norm": 1.9351980686187744, |
| "learning_rate": 1e-05, |
| "loss": 0.3705, |
| "step": 76200 |
| }, |
| { |
| "epoch": 0.000763, |
| "grad_norm": 1.8853991031646729, |
| "learning_rate": 1e-05, |
| "loss": 0.3743, |
| "step": 76300 |
| }, |
| { |
| "epoch": 0.000764, |
| "grad_norm": 2.0006158351898193, |
| "learning_rate": 1e-05, |
| "loss": 0.3704, |
| "step": 76400 |
| }, |
| { |
| "epoch": 0.000765, |
| "grad_norm": 2.1643075942993164, |
| "learning_rate": 1e-05, |
| "loss": 0.3705, |
| "step": 76500 |
| }, |
| { |
| "epoch": 0.000766, |
| "grad_norm": 1.9577935934066772, |
| "learning_rate": 1e-05, |
| "loss": 0.3742, |
| "step": 76600 |
| }, |
| { |
| "epoch": 0.000767, |
| "grad_norm": 2.128188371658325, |
| "learning_rate": 1e-05, |
| "loss": 0.3698, |
| "step": 76700 |
| }, |
| { |
| "epoch": 0.000768, |
| "grad_norm": 1.9895089864730835, |
| "learning_rate": 1e-05, |
| "loss": 0.3753, |
| "step": 76800 |
| }, |
| { |
| "epoch": 0.000769, |
| "grad_norm": 2.1536855697631836, |
| "learning_rate": 1e-05, |
| "loss": 0.3644, |
| "step": 76900 |
| }, |
| { |
| "epoch": 0.00077, |
| "grad_norm": 1.9444348812103271, |
| "learning_rate": 1e-05, |
| "loss": 0.3671, |
| "step": 77000 |
| }, |
| { |
| "epoch": 0.000771, |
| "grad_norm": 1.8287049531936646, |
| "learning_rate": 1e-05, |
| "loss": 0.3735, |
| "step": 77100 |
| }, |
| { |
| "epoch": 0.000772, |
| "grad_norm": 1.8443069458007812, |
| "learning_rate": 1e-05, |
| "loss": 0.3686, |
| "step": 77200 |
| }, |
| { |
| "epoch": 0.000773, |
| "grad_norm": 1.8012452125549316, |
| "learning_rate": 1e-05, |
| "loss": 0.3751, |
| "step": 77300 |
| }, |
| { |
| "epoch": 0.000774, |
| "grad_norm": 1.9977177381515503, |
| "learning_rate": 1e-05, |
| "loss": 0.3684, |
| "step": 77400 |
| }, |
| { |
| "epoch": 0.000775, |
| "grad_norm": 1.9906736612319946, |
| "learning_rate": 1e-05, |
| "loss": 0.3712, |
| "step": 77500 |
| }, |
| { |
| "epoch": 0.000776, |
| "grad_norm": 1.9918975830078125, |
| "learning_rate": 1e-05, |
| "loss": 0.3687, |
| "step": 77600 |
| }, |
| { |
| "epoch": 0.000777, |
| "grad_norm": 1.9965052604675293, |
| "learning_rate": 1e-05, |
| "loss": 0.3668, |
| "step": 77700 |
| }, |
| { |
| "epoch": 0.000778, |
| "grad_norm": 1.9064897298812866, |
| "learning_rate": 1e-05, |
| "loss": 0.3719, |
| "step": 77800 |
| }, |
| { |
| "epoch": 0.000779, |
| "grad_norm": 1.7971402406692505, |
| "learning_rate": 1e-05, |
| "loss": 0.3645, |
| "step": 77900 |
| }, |
| { |
| "epoch": 0.00078, |
| "grad_norm": 1.8232814073562622, |
| "learning_rate": 1e-05, |
| "loss": 0.3705, |
| "step": 78000 |
| }, |
| { |
| "epoch": 0.000781, |
| "grad_norm": 2.1162238121032715, |
| "learning_rate": 1e-05, |
| "loss": 0.3675, |
| "step": 78100 |
| }, |
| { |
| "epoch": 0.000782, |
| "grad_norm": 1.8850531578063965, |
| "learning_rate": 1e-05, |
| "loss": 0.3668, |
| "step": 78200 |
| }, |
| { |
| "epoch": 0.000783, |
| "grad_norm": 1.864730715751648, |
| "learning_rate": 1e-05, |
| "loss": 0.3652, |
| "step": 78300 |
| }, |
| { |
| "epoch": 0.000784, |
| "grad_norm": 1.9205899238586426, |
| "learning_rate": 1e-05, |
| "loss": 0.3716, |
| "step": 78400 |
| }, |
| { |
| "epoch": 0.000785, |
| "grad_norm": 2.325000524520874, |
| "learning_rate": 1e-05, |
| "loss": 0.3738, |
| "step": 78500 |
| }, |
| { |
| "epoch": 0.000786, |
| "grad_norm": 2.2757534980773926, |
| "learning_rate": 1e-05, |
| "loss": 0.3722, |
| "step": 78600 |
| }, |
| { |
| "epoch": 0.000787, |
| "grad_norm": 1.7619765996932983, |
| "learning_rate": 1e-05, |
| "loss": 0.3799, |
| "step": 78700 |
| }, |
| { |
| "epoch": 0.000788, |
| "grad_norm": 1.802307367324829, |
| "learning_rate": 1e-05, |
| "loss": 0.3744, |
| "step": 78800 |
| }, |
| { |
| "epoch": 0.000789, |
| "grad_norm": 1.8677384853363037, |
| "learning_rate": 1e-05, |
| "loss": 0.3757, |
| "step": 78900 |
| }, |
| { |
| "epoch": 0.00079, |
| "grad_norm": 2.1615066528320312, |
| "learning_rate": 1e-05, |
| "loss": 0.3769, |
| "step": 79000 |
| }, |
| { |
| "epoch": 0.000791, |
| "grad_norm": 1.6998400688171387, |
| "learning_rate": 1e-05, |
| "loss": 0.3705, |
| "step": 79100 |
| }, |
| { |
| "epoch": 0.000792, |
| "grad_norm": 1.7555445432662964, |
| "learning_rate": 1e-05, |
| "loss": 0.373, |
| "step": 79200 |
| }, |
| { |
| "epoch": 0.000793, |
| "grad_norm": 2.0142476558685303, |
| "learning_rate": 1e-05, |
| "loss": 0.3723, |
| "step": 79300 |
| }, |
| { |
| "epoch": 0.000794, |
| "grad_norm": 2.3179373741149902, |
| "learning_rate": 1e-05, |
| "loss": 0.3683, |
| "step": 79400 |
| }, |
| { |
| "epoch": 0.000795, |
| "grad_norm": 1.9455734491348267, |
| "learning_rate": 1e-05, |
| "loss": 0.3677, |
| "step": 79500 |
| }, |
| { |
| "epoch": 0.000796, |
| "grad_norm": 2.0112357139587402, |
| "learning_rate": 1e-05, |
| "loss": 0.3667, |
| "step": 79600 |
| }, |
| { |
| "epoch": 0.000797, |
| "grad_norm": 2.257429361343384, |
| "learning_rate": 1e-05, |
| "loss": 0.366, |
| "step": 79700 |
| }, |
| { |
| "epoch": 0.000798, |
| "grad_norm": 1.7353073358535767, |
| "learning_rate": 1e-05, |
| "loss": 0.3693, |
| "step": 79800 |
| }, |
| { |
| "epoch": 0.000799, |
| "grad_norm": 1.989250898361206, |
| "learning_rate": 1e-05, |
| "loss": 0.3658, |
| "step": 79900 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 1.8010023832321167, |
| "learning_rate": 1e-05, |
| "loss": 0.3687, |
| "step": 80000 |
| }, |
| { |
| "epoch": 0.0008, |
| "eval_loss": 0.343017578125, |
| "eval_runtime": 111.2417, |
| "eval_samples_per_second": 449.472, |
| "eval_steps_per_second": 28.092, |
| "step": 80000 |
| }, |
| { |
| "epoch": 0.000801, |
| "grad_norm": 1.8883432149887085, |
| "learning_rate": 1e-05, |
| "loss": 0.3739, |
| "step": 80100 |
| }, |
| { |
| "epoch": 0.000802, |
| "grad_norm": 1.9652680158615112, |
| "learning_rate": 1e-05, |
| "loss": 0.359, |
| "step": 80200 |
| }, |
| { |
| "epoch": 0.000803, |
| "grad_norm": 2.1151764392852783, |
| "learning_rate": 1e-05, |
| "loss": 0.3626, |
| "step": 80300 |
| }, |
| { |
| "epoch": 0.000804, |
| "grad_norm": 1.736228108406067, |
| "learning_rate": 1e-05, |
| "loss": 0.3737, |
| "step": 80400 |
| }, |
| { |
| "epoch": 0.000805, |
| "grad_norm": 1.800878643989563, |
| "learning_rate": 1e-05, |
| "loss": 0.3731, |
| "step": 80500 |
| }, |
| { |
| "epoch": 0.000806, |
| "grad_norm": 1.8532053232192993, |
| "learning_rate": 1e-05, |
| "loss": 0.3723, |
| "step": 80600 |
| }, |
| { |
| "epoch": 0.000807, |
| "grad_norm": 1.838671088218689, |
| "learning_rate": 1e-05, |
| "loss": 0.3666, |
| "step": 80700 |
| }, |
| { |
| "epoch": 0.000808, |
| "grad_norm": 1.7083035707473755, |
| "learning_rate": 1e-05, |
| "loss": 0.3761, |
| "step": 80800 |
| }, |
| { |
| "epoch": 0.000809, |
| "grad_norm": 1.8676607608795166, |
| "learning_rate": 1e-05, |
| "loss": 0.3684, |
| "step": 80900 |
| }, |
| { |
| "epoch": 0.00081, |
| "grad_norm": 2.0752341747283936, |
| "learning_rate": 1e-05, |
| "loss": 0.3657, |
| "step": 81000 |
| }, |
| { |
| "epoch": 0.000811, |
| "grad_norm": 1.793967604637146, |
| "learning_rate": 1e-05, |
| "loss": 0.3748, |
| "step": 81100 |
| }, |
| { |
| "epoch": 0.000812, |
| "grad_norm": 1.6681337356567383, |
| "learning_rate": 1e-05, |
| "loss": 0.3661, |
| "step": 81200 |
| }, |
| { |
| "epoch": 0.000813, |
| "grad_norm": 2.46724534034729, |
| "learning_rate": 1e-05, |
| "loss": 0.3609, |
| "step": 81300 |
| }, |
| { |
| "epoch": 0.000814, |
| "grad_norm": 1.8258310556411743, |
| "learning_rate": 1e-05, |
| "loss": 0.3632, |
| "step": 81400 |
| }, |
| { |
| "epoch": 0.000815, |
| "grad_norm": 1.9003719091415405, |
| "learning_rate": 1e-05, |
| "loss": 0.365, |
| "step": 81500 |
| }, |
| { |
| "epoch": 0.000816, |
| "grad_norm": 1.8004292249679565, |
| "learning_rate": 1e-05, |
| "loss": 0.3675, |
| "step": 81600 |
| }, |
| { |
| "epoch": 0.000817, |
| "grad_norm": 1.8981424570083618, |
| "learning_rate": 1e-05, |
| "loss": 0.369, |
| "step": 81700 |
| }, |
| { |
| "epoch": 0.000818, |
| "grad_norm": 2.1236822605133057, |
| "learning_rate": 1e-05, |
| "loss": 0.368, |
| "step": 81800 |
| }, |
| { |
| "epoch": 0.000819, |
| "grad_norm": 1.9143141508102417, |
| "learning_rate": 1e-05, |
| "loss": 0.3628, |
| "step": 81900 |
| }, |
| { |
| "epoch": 0.00082, |
| "grad_norm": 1.9660463333129883, |
| "learning_rate": 1e-05, |
| "loss": 0.3627, |
| "step": 82000 |
| }, |
| { |
| "epoch": 0.000821, |
| "grad_norm": 2.1098835468292236, |
| "learning_rate": 1e-05, |
| "loss": 0.3657, |
| "step": 82100 |
| }, |
| { |
| "epoch": 0.000822, |
| "grad_norm": 1.8893609046936035, |
| "learning_rate": 1e-05, |
| "loss": 0.3621, |
| "step": 82200 |
| }, |
| { |
| "epoch": 0.000823, |
| "grad_norm": 1.848253607749939, |
| "learning_rate": 1e-05, |
| "loss": 0.3547, |
| "step": 82300 |
| }, |
| { |
| "epoch": 0.000824, |
| "grad_norm": 1.8891757726669312, |
| "learning_rate": 1e-05, |
| "loss": 0.3654, |
| "step": 82400 |
| }, |
| { |
| "epoch": 0.000825, |
| "grad_norm": 1.9906656742095947, |
| "learning_rate": 1e-05, |
| "loss": 0.37, |
| "step": 82500 |
| }, |
| { |
| "epoch": 0.000826, |
| "grad_norm": 2.026745080947876, |
| "learning_rate": 1e-05, |
| "loss": 0.3598, |
| "step": 82600 |
| }, |
| { |
| "epoch": 0.000827, |
| "grad_norm": 1.8796215057373047, |
| "learning_rate": 1e-05, |
| "loss": 0.3608, |
| "step": 82700 |
| }, |
| { |
| "epoch": 0.000828, |
| "grad_norm": 1.8934880495071411, |
| "learning_rate": 1e-05, |
| "loss": 0.3633, |
| "step": 82800 |
| }, |
| { |
| "epoch": 0.000829, |
| "grad_norm": 2.241187572479248, |
| "learning_rate": 1e-05, |
| "loss": 0.3623, |
| "step": 82900 |
| }, |
| { |
| "epoch": 0.00083, |
| "grad_norm": 1.8311808109283447, |
| "learning_rate": 1e-05, |
| "loss": 0.3643, |
| "step": 83000 |
| }, |
| { |
| "epoch": 0.000831, |
| "grad_norm": 1.7869751453399658, |
| "learning_rate": 1e-05, |
| "loss": 0.3718, |
| "step": 83100 |
| }, |
| { |
| "epoch": 0.000832, |
| "grad_norm": 1.894146203994751, |
| "learning_rate": 1e-05, |
| "loss": 0.3629, |
| "step": 83200 |
| }, |
| { |
| "epoch": 0.000833, |
| "grad_norm": 1.7418984174728394, |
| "learning_rate": 1e-05, |
| "loss": 0.358, |
| "step": 83300 |
| }, |
| { |
| "epoch": 0.000834, |
| "grad_norm": 2.2200584411621094, |
| "learning_rate": 1e-05, |
| "loss": 0.3602, |
| "step": 83400 |
| }, |
| { |
| "epoch": 0.000835, |
| "grad_norm": 1.7402255535125732, |
| "learning_rate": 1e-05, |
| "loss": 0.3648, |
| "step": 83500 |
| }, |
| { |
| "epoch": 0.000836, |
| "grad_norm": 1.7476297616958618, |
| "learning_rate": 1e-05, |
| "loss": 0.3603, |
| "step": 83600 |
| }, |
| { |
| "epoch": 0.000837, |
| "grad_norm": 2.0509250164031982, |
| "learning_rate": 1e-05, |
| "loss": 0.3698, |
| "step": 83700 |
| }, |
| { |
| "epoch": 0.000838, |
| "grad_norm": 1.819290041923523, |
| "learning_rate": 1e-05, |
| "loss": 0.349, |
| "step": 83800 |
| }, |
| { |
| "epoch": 0.000839, |
| "grad_norm": 1.9946727752685547, |
| "learning_rate": 1e-05, |
| "loss": 0.3666, |
| "step": 83900 |
| }, |
| { |
| "epoch": 0.00084, |
| "grad_norm": 1.6956796646118164, |
| "learning_rate": 1e-05, |
| "loss": 0.3686, |
| "step": 84000 |
| }, |
| { |
| "epoch": 0.000841, |
| "grad_norm": 1.8575202226638794, |
| "learning_rate": 1e-05, |
| "loss": 0.3666, |
| "step": 84100 |
| }, |
| { |
| "epoch": 0.000842, |
| "grad_norm": 1.7518588304519653, |
| "learning_rate": 1e-05, |
| "loss": 0.3624, |
| "step": 84200 |
| }, |
| { |
| "epoch": 0.000843, |
| "grad_norm": 1.6281752586364746, |
| "learning_rate": 1e-05, |
| "loss": 0.3567, |
| "step": 84300 |
| }, |
| { |
| "epoch": 0.000844, |
| "grad_norm": 1.8025518655776978, |
| "learning_rate": 1e-05, |
| "loss": 0.355, |
| "step": 84400 |
| }, |
| { |
| "epoch": 0.000845, |
| "grad_norm": 1.787426471710205, |
| "learning_rate": 1e-05, |
| "loss": 0.3666, |
| "step": 84500 |
| }, |
| { |
| "epoch": 0.000846, |
| "grad_norm": 1.8636668920516968, |
| "learning_rate": 1e-05, |
| "loss": 0.3539, |
| "step": 84600 |
| }, |
| { |
| "epoch": 0.000847, |
| "grad_norm": 1.999342441558838, |
| "learning_rate": 1e-05, |
| "loss": 0.3573, |
| "step": 84700 |
| }, |
| { |
| "epoch": 0.000848, |
| "grad_norm": 1.7526439428329468, |
| "learning_rate": 1e-05, |
| "loss": 0.3638, |
| "step": 84800 |
| }, |
| { |
| "epoch": 0.000849, |
| "grad_norm": 1.9818848371505737, |
| "learning_rate": 1e-05, |
| "loss": 0.3622, |
| "step": 84900 |
| }, |
| { |
| "epoch": 0.00085, |
| "grad_norm": 2.2633955478668213, |
| "learning_rate": 1e-05, |
| "loss": 0.3685, |
| "step": 85000 |
| }, |
| { |
| "epoch": 0.000851, |
| "grad_norm": 2.037205696105957, |
| "learning_rate": 1e-05, |
| "loss": 0.3607, |
| "step": 85100 |
| }, |
| { |
| "epoch": 0.000852, |
| "grad_norm": 2.008530616760254, |
| "learning_rate": 1e-05, |
| "loss": 0.3666, |
| "step": 85200 |
| }, |
| { |
| "epoch": 0.000853, |
| "grad_norm": 1.7828373908996582, |
| "learning_rate": 1e-05, |
| "loss": 0.3642, |
| "step": 85300 |
| }, |
| { |
| "epoch": 0.000854, |
| "grad_norm": 2.0201706886291504, |
| "learning_rate": 1e-05, |
| "loss": 0.3597, |
| "step": 85400 |
| }, |
| { |
| "epoch": 0.000855, |
| "grad_norm": 2.0106265544891357, |
| "learning_rate": 1e-05, |
| "loss": 0.3604, |
| "step": 85500 |
| }, |
| { |
| "epoch": 0.000856, |
| "grad_norm": 1.896898627281189, |
| "learning_rate": 1e-05, |
| "loss": 0.366, |
| "step": 85600 |
| }, |
| { |
| "epoch": 0.000857, |
| "grad_norm": 1.9812458753585815, |
| "learning_rate": 1e-05, |
| "loss": 0.3596, |
| "step": 85700 |
| }, |
| { |
| "epoch": 0.000858, |
| "grad_norm": 2.0447208881378174, |
| "learning_rate": 1e-05, |
| "loss": 0.3615, |
| "step": 85800 |
| }, |
| { |
| "epoch": 0.000859, |
| "grad_norm": 1.9249247312545776, |
| "learning_rate": 1e-05, |
| "loss": 0.3532, |
| "step": 85900 |
| }, |
| { |
| "epoch": 0.00086, |
| "grad_norm": 1.700594186782837, |
| "learning_rate": 1e-05, |
| "loss": 0.3567, |
| "step": 86000 |
| }, |
| { |
| "epoch": 0.000861, |
| "grad_norm": 1.9149887561798096, |
| "learning_rate": 1e-05, |
| "loss": 0.3585, |
| "step": 86100 |
| }, |
| { |
| "epoch": 0.000862, |
| "grad_norm": 2.202561855316162, |
| "learning_rate": 1e-05, |
| "loss": 0.3667, |
| "step": 86200 |
| }, |
| { |
| "epoch": 0.000863, |
| "grad_norm": 1.8370987176895142, |
| "learning_rate": 1e-05, |
| "loss": 0.3606, |
| "step": 86300 |
| }, |
| { |
| "epoch": 0.000864, |
| "grad_norm": 2.0995025634765625, |
| "learning_rate": 1e-05, |
| "loss": 0.366, |
| "step": 86400 |
| }, |
| { |
| "epoch": 0.000865, |
| "grad_norm": 1.7918909788131714, |
| "learning_rate": 1e-05, |
| "loss": 0.3524, |
| "step": 86500 |
| }, |
| { |
| "epoch": 0.000866, |
| "grad_norm": 1.870877742767334, |
| "learning_rate": 1e-05, |
| "loss": 0.3631, |
| "step": 86600 |
| }, |
| { |
| "epoch": 0.000867, |
| "grad_norm": 2.0287795066833496, |
| "learning_rate": 1e-05, |
| "loss": 0.3555, |
| "step": 86700 |
| }, |
| { |
| "epoch": 0.000868, |
| "grad_norm": 1.9686987400054932, |
| "learning_rate": 1e-05, |
| "loss": 0.3622, |
| "step": 86800 |
| }, |
| { |
| "epoch": 0.000869, |
| "grad_norm": 1.714966893196106, |
| "learning_rate": 1e-05, |
| "loss": 0.3585, |
| "step": 86900 |
| }, |
| { |
| "epoch": 0.00087, |
| "grad_norm": 2.0388360023498535, |
| "learning_rate": 1e-05, |
| "loss": 0.3605, |
| "step": 87000 |
| }, |
| { |
| "epoch": 0.000871, |
| "grad_norm": 1.8588838577270508, |
| "learning_rate": 1e-05, |
| "loss": 0.3588, |
| "step": 87100 |
| }, |
| { |
| "epoch": 0.000872, |
| "grad_norm": 2.2491447925567627, |
| "learning_rate": 1e-05, |
| "loss": 0.3598, |
| "step": 87200 |
| }, |
| { |
| "epoch": 0.000873, |
| "grad_norm": 2.023857831954956, |
| "learning_rate": 1e-05, |
| "loss": 0.3618, |
| "step": 87300 |
| }, |
| { |
| "epoch": 0.000874, |
| "grad_norm": 1.5411655902862549, |
| "learning_rate": 1e-05, |
| "loss": 0.3681, |
| "step": 87400 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 1.5299054384231567, |
| "learning_rate": 1e-05, |
| "loss": 0.3594, |
| "step": 87500 |
| }, |
| { |
| "epoch": 0.000876, |
| "grad_norm": 1.9162421226501465, |
| "learning_rate": 1e-05, |
| "loss": 0.3569, |
| "step": 87600 |
| }, |
| { |
| "epoch": 0.000877, |
| "grad_norm": 2.160090684890747, |
| "learning_rate": 1e-05, |
| "loss": 0.358, |
| "step": 87700 |
| }, |
| { |
| "epoch": 0.000878, |
| "grad_norm": 2.044666051864624, |
| "learning_rate": 1e-05, |
| "loss": 0.3609, |
| "step": 87800 |
| }, |
| { |
| "epoch": 0.000879, |
| "grad_norm": 1.7112947702407837, |
| "learning_rate": 1e-05, |
| "loss": 0.3544, |
| "step": 87900 |
| }, |
| { |
| "epoch": 0.00088, |
| "grad_norm": 1.8648561239242554, |
| "learning_rate": 1e-05, |
| "loss": 0.365, |
| "step": 88000 |
| }, |
| { |
| "epoch": 0.000881, |
| "grad_norm": 1.8748390674591064, |
| "learning_rate": 1e-05, |
| "loss": 0.3668, |
| "step": 88100 |
| }, |
| { |
| "epoch": 0.000882, |
| "grad_norm": 2.2753427028656006, |
| "learning_rate": 1e-05, |
| "loss": 0.3631, |
| "step": 88200 |
| }, |
| { |
| "epoch": 0.000883, |
| "grad_norm": 1.8260302543640137, |
| "learning_rate": 1e-05, |
| "loss": 0.3614, |
| "step": 88300 |
| }, |
| { |
| "epoch": 0.000884, |
| "grad_norm": 1.8950936794281006, |
| "learning_rate": 1e-05, |
| "loss": 0.3555, |
| "step": 88400 |
| }, |
| { |
| "epoch": 0.000885, |
| "grad_norm": 1.7748656272888184, |
| "learning_rate": 1e-05, |
| "loss": 0.3607, |
| "step": 88500 |
| }, |
| { |
| "epoch": 0.000886, |
| "grad_norm": 1.7580374479293823, |
| "learning_rate": 1e-05, |
| "loss": 0.352, |
| "step": 88600 |
| }, |
| { |
| "epoch": 0.000887, |
| "grad_norm": 1.9820917844772339, |
| "learning_rate": 1e-05, |
| "loss": 0.3549, |
| "step": 88700 |
| }, |
| { |
| "epoch": 0.000888, |
| "grad_norm": 1.842002272605896, |
| "learning_rate": 1e-05, |
| "loss": 0.3606, |
| "step": 88800 |
| }, |
| { |
| "epoch": 0.000889, |
| "grad_norm": 1.8876936435699463, |
| "learning_rate": 1e-05, |
| "loss": 0.3607, |
| "step": 88900 |
| }, |
| { |
| "epoch": 0.00089, |
| "grad_norm": 1.7980290651321411, |
| "learning_rate": 1e-05, |
| "loss": 0.3555, |
| "step": 89000 |
| }, |
| { |
| "epoch": 0.000891, |
| "grad_norm": 2.300755739212036, |
| "learning_rate": 1e-05, |
| "loss": 0.3582, |
| "step": 89100 |
| }, |
| { |
| "epoch": 0.000892, |
| "grad_norm": 2.7439591884613037, |
| "learning_rate": 1e-05, |
| "loss": 0.3545, |
| "step": 89200 |
| }, |
| { |
| "epoch": 0.000893, |
| "grad_norm": 1.7525635957717896, |
| "learning_rate": 1e-05, |
| "loss": 0.3636, |
| "step": 89300 |
| }, |
| { |
| "epoch": 0.000894, |
| "grad_norm": 1.6738208532333374, |
| "learning_rate": 1e-05, |
| "loss": 0.3569, |
| "step": 89400 |
| }, |
| { |
| "epoch": 0.000895, |
| "grad_norm": 1.5723131895065308, |
| "learning_rate": 1e-05, |
| "loss": 0.3619, |
| "step": 89500 |
| }, |
| { |
| "epoch": 0.000896, |
| "grad_norm": 1.865443468093872, |
| "learning_rate": 1e-05, |
| "loss": 0.3546, |
| "step": 89600 |
| }, |
| { |
| "epoch": 0.000897, |
| "grad_norm": 1.8413885831832886, |
| "learning_rate": 1e-05, |
| "loss": 0.3519, |
| "step": 89700 |
| }, |
| { |
| "epoch": 0.000898, |
| "grad_norm": 1.760122537612915, |
| "learning_rate": 1e-05, |
| "loss": 0.3525, |
| "step": 89800 |
| }, |
| { |
| "epoch": 0.000899, |
| "grad_norm": 2.0341832637786865, |
| "learning_rate": 1e-05, |
| "loss": 0.3478, |
| "step": 89900 |
| }, |
| { |
| "epoch": 0.0009, |
| "grad_norm": 1.8548213243484497, |
| "learning_rate": 1e-05, |
| "loss": 0.3486, |
| "step": 90000 |
| }, |
| { |
| "epoch": 0.000901, |
| "grad_norm": 2.114245891571045, |
| "learning_rate": 1e-05, |
| "loss": 0.3494, |
| "step": 90100 |
| }, |
| { |
| "epoch": 0.000902, |
| "grad_norm": 2.117030143737793, |
| "learning_rate": 1e-05, |
| "loss": 0.3514, |
| "step": 90200 |
| }, |
| { |
| "epoch": 0.000903, |
| "grad_norm": 1.8521121740341187, |
| "learning_rate": 1e-05, |
| "loss": 0.3571, |
| "step": 90300 |
| }, |
| { |
| "epoch": 0.000904, |
| "grad_norm": 1.9864593744277954, |
| "learning_rate": 1e-05, |
| "loss": 0.3657, |
| "step": 90400 |
| }, |
| { |
| "epoch": 0.000905, |
| "grad_norm": 1.9219348430633545, |
| "learning_rate": 1e-05, |
| "loss": 0.3541, |
| "step": 90500 |
| }, |
| { |
| "epoch": 0.000906, |
| "grad_norm": 2.13183856010437, |
| "learning_rate": 1e-05, |
| "loss": 0.3566, |
| "step": 90600 |
| }, |
| { |
| "epoch": 0.000907, |
| "grad_norm": 1.7505743503570557, |
| "learning_rate": 1e-05, |
| "loss": 0.351, |
| "step": 90700 |
| }, |
| { |
| "epoch": 0.000908, |
| "grad_norm": 1.7294330596923828, |
| "learning_rate": 1e-05, |
| "loss": 0.3536, |
| "step": 90800 |
| }, |
| { |
| "epoch": 0.000909, |
| "grad_norm": 1.8986823558807373, |
| "learning_rate": 1e-05, |
| "loss": 0.3598, |
| "step": 90900 |
| }, |
| { |
| "epoch": 0.00091, |
| "grad_norm": 1.6649383306503296, |
| "learning_rate": 1e-05, |
| "loss": 0.3568, |
| "step": 91000 |
| }, |
| { |
| "epoch": 0.000911, |
| "grad_norm": 2.0748260021209717, |
| "learning_rate": 1e-05, |
| "loss": 0.3527, |
| "step": 91100 |
| }, |
| { |
| "epoch": 0.000912, |
| "grad_norm": 1.905617117881775, |
| "learning_rate": 1e-05, |
| "loss": 0.3535, |
| "step": 91200 |
| }, |
| { |
| "epoch": 0.000913, |
| "grad_norm": 1.764633059501648, |
| "learning_rate": 1e-05, |
| "loss": 0.3517, |
| "step": 91300 |
| }, |
| { |
| "epoch": 0.000914, |
| "grad_norm": 1.822187900543213, |
| "learning_rate": 1e-05, |
| "loss": 0.3584, |
| "step": 91400 |
| }, |
| { |
| "epoch": 0.000915, |
| "grad_norm": 1.845644235610962, |
| "learning_rate": 1e-05, |
| "loss": 0.3561, |
| "step": 91500 |
| }, |
| { |
| "epoch": 0.000916, |
| "grad_norm": 2.082502603530884, |
| "learning_rate": 1e-05, |
| "loss": 0.3563, |
| "step": 91600 |
| }, |
| { |
| "epoch": 0.000917, |
| "grad_norm": 2.198960065841675, |
| "learning_rate": 1e-05, |
| "loss": 0.3557, |
| "step": 91700 |
| }, |
| { |
| "epoch": 0.000918, |
| "grad_norm": 1.6692492961883545, |
| "learning_rate": 1e-05, |
| "loss": 0.3558, |
| "step": 91800 |
| }, |
| { |
| "epoch": 0.000919, |
| "grad_norm": 2.025036334991455, |
| "learning_rate": 1e-05, |
| "loss": 0.3427, |
| "step": 91900 |
| }, |
| { |
| "epoch": 0.00092, |
| "grad_norm": 1.8072044849395752, |
| "learning_rate": 1e-05, |
| "loss": 0.3534, |
| "step": 92000 |
| }, |
| { |
| "epoch": 0.000921, |
| "grad_norm": 1.989229679107666, |
| "learning_rate": 1e-05, |
| "loss": 0.3579, |
| "step": 92100 |
| }, |
| { |
| "epoch": 0.000922, |
| "grad_norm": 1.943912386894226, |
| "learning_rate": 1e-05, |
| "loss": 0.3625, |
| "step": 92200 |
| }, |
| { |
| "epoch": 0.000923, |
| "grad_norm": 5.172427654266357, |
| "learning_rate": 1e-05, |
| "loss": 0.3467, |
| "step": 92300 |
| }, |
| { |
| "epoch": 0.000924, |
| "grad_norm": 1.854652762413025, |
| "learning_rate": 1e-05, |
| "loss": 0.3526, |
| "step": 92400 |
| }, |
| { |
| "epoch": 0.000925, |
| "grad_norm": 1.7196903228759766, |
| "learning_rate": 1e-05, |
| "loss": 0.3505, |
| "step": 92500 |
| }, |
| { |
| "epoch": 0.000926, |
| "grad_norm": 1.6658947467803955, |
| "learning_rate": 1e-05, |
| "loss": 0.3564, |
| "step": 92600 |
| }, |
| { |
| "epoch": 0.000927, |
| "grad_norm": 2.8138256072998047, |
| "learning_rate": 1e-05, |
| "loss": 0.3571, |
| "step": 92700 |
| }, |
| { |
| "epoch": 0.000928, |
| "grad_norm": 1.700640320777893, |
| "learning_rate": 1e-05, |
| "loss": 0.3576, |
| "step": 92800 |
| }, |
| { |
| "epoch": 0.000929, |
| "grad_norm": 1.738922119140625, |
| "learning_rate": 1e-05, |
| "loss": 0.3482, |
| "step": 92900 |
| }, |
| { |
| "epoch": 0.00093, |
| "grad_norm": 1.7264224290847778, |
| "learning_rate": 1e-05, |
| "loss": 0.3515, |
| "step": 93000 |
| }, |
| { |
| "epoch": 0.000931, |
| "grad_norm": 1.7760035991668701, |
| "learning_rate": 1e-05, |
| "loss": 0.3546, |
| "step": 93100 |
| }, |
| { |
| "epoch": 0.000932, |
| "grad_norm": 1.684767484664917, |
| "learning_rate": 1e-05, |
| "loss": 0.3509, |
| "step": 93200 |
| }, |
| { |
| "epoch": 0.000933, |
| "grad_norm": 1.9357808828353882, |
| "learning_rate": 1e-05, |
| "loss": 0.3613, |
| "step": 93300 |
| }, |
| { |
| "epoch": 0.000934, |
| "grad_norm": 1.853598952293396, |
| "learning_rate": 1e-05, |
| "loss": 0.3523, |
| "step": 93400 |
| }, |
| { |
| "epoch": 0.000935, |
| "grad_norm": 3.272063732147217, |
| "learning_rate": 1e-05, |
| "loss": 0.3515, |
| "step": 93500 |
| }, |
| { |
| "epoch": 0.000936, |
| "grad_norm": 1.8037041425704956, |
| "learning_rate": 1e-05, |
| "loss": 0.3526, |
| "step": 93600 |
| }, |
| { |
| "epoch": 0.000937, |
| "grad_norm": 1.989990472793579, |
| "learning_rate": 1e-05, |
| "loss": 0.3512, |
| "step": 93700 |
| }, |
| { |
| "epoch": 0.000938, |
| "grad_norm": 1.7665644884109497, |
| "learning_rate": 1e-05, |
| "loss": 0.3503, |
| "step": 93800 |
| }, |
| { |
| "epoch": 0.000939, |
| "grad_norm": 2.230848550796509, |
| "learning_rate": 1e-05, |
| "loss": 0.3532, |
| "step": 93900 |
| }, |
| { |
| "epoch": 0.00094, |
| "grad_norm": 1.8637299537658691, |
| "learning_rate": 1e-05, |
| "loss": 0.3558, |
| "step": 94000 |
| }, |
| { |
| "epoch": 0.000941, |
| "grad_norm": 1.9153410196304321, |
| "learning_rate": 1e-05, |
| "loss": 0.3534, |
| "step": 94100 |
| }, |
| { |
| "epoch": 0.000942, |
| "grad_norm": 1.9178539514541626, |
| "learning_rate": 1e-05, |
| "loss": 0.3578, |
| "step": 94200 |
| }, |
| { |
| "epoch": 0.000943, |
| "grad_norm": 1.9506075382232666, |
| "learning_rate": 1e-05, |
| "loss": 0.3557, |
| "step": 94300 |
| }, |
| { |
| "epoch": 0.000944, |
| "grad_norm": 1.97675621509552, |
| "learning_rate": 1e-05, |
| "loss": 0.3496, |
| "step": 94400 |
| }, |
| { |
| "epoch": 0.000945, |
| "grad_norm": 1.571119785308838, |
| "learning_rate": 1e-05, |
| "loss": 0.3549, |
| "step": 94500 |
| }, |
| { |
| "epoch": 0.000946, |
| "grad_norm": 1.84198796749115, |
| "learning_rate": 1e-05, |
| "loss": 0.3564, |
| "step": 94600 |
| }, |
| { |
| "epoch": 0.000947, |
| "grad_norm": 1.6789623498916626, |
| "learning_rate": 1e-05, |
| "loss": 0.3459, |
| "step": 94700 |
| }, |
| { |
| "epoch": 0.000948, |
| "grad_norm": 1.7345160245895386, |
| "learning_rate": 1e-05, |
| "loss": 0.348, |
| "step": 94800 |
| }, |
| { |
| "epoch": 0.000949, |
| "grad_norm": 1.626235008239746, |
| "learning_rate": 1e-05, |
| "loss": 0.3579, |
| "step": 94900 |
| }, |
| { |
| "epoch": 0.00095, |
| "grad_norm": 1.8632274866104126, |
| "learning_rate": 1e-05, |
| "loss": 0.3555, |
| "step": 95000 |
| }, |
| { |
| "epoch": 0.000951, |
| "grad_norm": 1.5302915573120117, |
| "learning_rate": 1e-05, |
| "loss": 0.3515, |
| "step": 95100 |
| }, |
| { |
| "epoch": 0.000952, |
| "grad_norm": 1.759491205215454, |
| "learning_rate": 1e-05, |
| "loss": 0.3451, |
| "step": 95200 |
| }, |
| { |
| "epoch": 0.000953, |
| "grad_norm": 2.1866915225982666, |
| "learning_rate": 1e-05, |
| "loss": 0.3392, |
| "step": 95300 |
| }, |
| { |
| "epoch": 0.000954, |
| "grad_norm": 1.6935898065567017, |
| "learning_rate": 1e-05, |
| "loss": 0.3497, |
| "step": 95400 |
| }, |
| { |
| "epoch": 0.000955, |
| "grad_norm": 1.9268600940704346, |
| "learning_rate": 1e-05, |
| "loss": 0.3463, |
| "step": 95500 |
| }, |
| { |
| "epoch": 0.000956, |
| "grad_norm": 1.9195621013641357, |
| "learning_rate": 1e-05, |
| "loss": 0.351, |
| "step": 95600 |
| }, |
| { |
| "epoch": 0.000957, |
| "grad_norm": 1.845158338546753, |
| "learning_rate": 1e-05, |
| "loss": 0.3465, |
| "step": 95700 |
| }, |
| { |
| "epoch": 0.000958, |
| "grad_norm": 2.0196573734283447, |
| "learning_rate": 1e-05, |
| "loss": 0.3525, |
| "step": 95800 |
| }, |
| { |
| "epoch": 0.000959, |
| "grad_norm": 1.8416608572006226, |
| "learning_rate": 1e-05, |
| "loss": 0.3502, |
| "step": 95900 |
| }, |
| { |
| "epoch": 0.00096, |
| "grad_norm": 1.83146071434021, |
| "learning_rate": 1e-05, |
| "loss": 0.3535, |
| "step": 96000 |
| }, |
| { |
| "epoch": 0.000961, |
| "grad_norm": 1.8110991716384888, |
| "learning_rate": 1e-05, |
| "loss": 0.3536, |
| "step": 96100 |
| }, |
| { |
| "epoch": 0.000962, |
| "grad_norm": 1.798935055732727, |
| "learning_rate": 1e-05, |
| "loss": 0.3513, |
| "step": 96200 |
| }, |
| { |
| "epoch": 0.000963, |
| "grad_norm": 1.7838218212127686, |
| "learning_rate": 1e-05, |
| "loss": 0.353, |
| "step": 96300 |
| }, |
| { |
| "epoch": 0.000964, |
| "grad_norm": 1.8357594013214111, |
| "learning_rate": 1e-05, |
| "loss": 0.35, |
| "step": 96400 |
| }, |
| { |
| "epoch": 0.000965, |
| "grad_norm": 1.9533332586288452, |
| "learning_rate": 1e-05, |
| "loss": 0.3497, |
| "step": 96500 |
| }, |
| { |
| "epoch": 0.000966, |
| "grad_norm": 1.8430505990982056, |
| "learning_rate": 1e-05, |
| "loss": 0.3535, |
| "step": 96600 |
| }, |
| { |
| "epoch": 0.000967, |
| "grad_norm": 1.842871069908142, |
| "learning_rate": 1e-05, |
| "loss": 0.3455, |
| "step": 96700 |
| }, |
| { |
| "epoch": 0.000968, |
| "grad_norm": 1.8501172065734863, |
| "learning_rate": 1e-05, |
| "loss": 0.3469, |
| "step": 96800 |
| }, |
| { |
| "epoch": 0.000969, |
| "grad_norm": 1.8171736001968384, |
| "learning_rate": 1e-05, |
| "loss": 0.3503, |
| "step": 96900 |
| }, |
| { |
| "epoch": 0.00097, |
| "grad_norm": 1.8180707693099976, |
| "learning_rate": 1e-05, |
| "loss": 0.3509, |
| "step": 97000 |
| }, |
| { |
| "epoch": 0.000971, |
| "grad_norm": 1.6564078330993652, |
| "learning_rate": 1e-05, |
| "loss": 0.3449, |
| "step": 97100 |
| }, |
| { |
| "epoch": 0.000972, |
| "grad_norm": 1.9035217761993408, |
| "learning_rate": 1e-05, |
| "loss": 0.3464, |
| "step": 97200 |
| }, |
| { |
| "epoch": 0.000973, |
| "grad_norm": 1.7870876789093018, |
| "learning_rate": 1e-05, |
| "loss": 0.358, |
| "step": 97300 |
| }, |
| { |
| "epoch": 0.000974, |
| "grad_norm": 2.026207447052002, |
| "learning_rate": 1e-05, |
| "loss": 0.3469, |
| "step": 97400 |
| }, |
| { |
| "epoch": 0.000975, |
| "grad_norm": 1.839242935180664, |
| "learning_rate": 1e-05, |
| "loss": 0.3527, |
| "step": 97500 |
| }, |
| { |
| "epoch": 0.000976, |
| "grad_norm": 2.1023123264312744, |
| "learning_rate": 1e-05, |
| "loss": 0.3575, |
| "step": 97600 |
| }, |
| { |
| "epoch": 0.000977, |
| "grad_norm": 1.7062361240386963, |
| "learning_rate": 1e-05, |
| "loss": 0.3427, |
| "step": 97700 |
| }, |
| { |
| "epoch": 0.000978, |
| "grad_norm": 1.8973636627197266, |
| "learning_rate": 1e-05, |
| "loss": 0.3496, |
| "step": 97800 |
| }, |
| { |
| "epoch": 0.000979, |
| "grad_norm": 4.865823745727539, |
| "learning_rate": 1e-05, |
| "loss": 0.3586, |
| "step": 97900 |
| }, |
| { |
| "epoch": 0.00098, |
| "grad_norm": 1.6862282752990723, |
| "learning_rate": 1e-05, |
| "loss": 0.3483, |
| "step": 98000 |
| }, |
| { |
| "epoch": 0.000981, |
| "grad_norm": 1.7278543710708618, |
| "learning_rate": 1e-05, |
| "loss": 0.345, |
| "step": 98100 |
| }, |
| { |
| "epoch": 0.000982, |
| "grad_norm": 1.9642508029937744, |
| "learning_rate": 1e-05, |
| "loss": 0.3552, |
| "step": 98200 |
| }, |
| { |
| "epoch": 0.000983, |
| "grad_norm": 1.6919240951538086, |
| "learning_rate": 1e-05, |
| "loss": 0.3481, |
| "step": 98300 |
| }, |
| { |
| "epoch": 0.000984, |
| "grad_norm": 1.7211792469024658, |
| "learning_rate": 1e-05, |
| "loss": 0.3456, |
| "step": 98400 |
| }, |
| { |
| "epoch": 0.000985, |
| "grad_norm": 1.8794984817504883, |
| "learning_rate": 1e-05, |
| "loss": 0.3547, |
| "step": 98500 |
| }, |
| { |
| "epoch": 0.000986, |
| "grad_norm": 1.7422791719436646, |
| "learning_rate": 1e-05, |
| "loss": 0.3459, |
| "step": 98600 |
| }, |
| { |
| "epoch": 0.000987, |
| "grad_norm": 1.7812235355377197, |
| "learning_rate": 1e-05, |
| "loss": 0.3534, |
| "step": 98700 |
| }, |
| { |
| "epoch": 0.000988, |
| "grad_norm": 1.7994880676269531, |
| "learning_rate": 1e-05, |
| "loss": 0.3406, |
| "step": 98800 |
| }, |
| { |
| "epoch": 0.000989, |
| "grad_norm": 1.766994595527649, |
| "learning_rate": 1e-05, |
| "loss": 0.3454, |
| "step": 98900 |
| }, |
| { |
| "epoch": 0.00099, |
| "grad_norm": 1.9302865266799927, |
| "learning_rate": 1e-05, |
| "loss": 0.3434, |
| "step": 99000 |
| }, |
| { |
| "epoch": 0.000991, |
| "grad_norm": 1.6279524564743042, |
| "learning_rate": 1e-05, |
| "loss": 0.3443, |
| "step": 99100 |
| }, |
| { |
| "epoch": 0.000992, |
| "grad_norm": 1.878088116645813, |
| "learning_rate": 1e-05, |
| "loss": 0.3433, |
| "step": 99200 |
| }, |
| { |
| "epoch": 0.000993, |
| "grad_norm": 1.9811022281646729, |
| "learning_rate": 1e-05, |
| "loss": 0.3444, |
| "step": 99300 |
| }, |
| { |
| "epoch": 0.000994, |
| "grad_norm": 1.9504814147949219, |
| "learning_rate": 1e-05, |
| "loss": 0.3448, |
| "step": 99400 |
| }, |
| { |
| "epoch": 0.000995, |
| "grad_norm": 1.7477716207504272, |
| "learning_rate": 1e-05, |
| "loss": 0.3372, |
| "step": 99500 |
| }, |
| { |
| "epoch": 0.000996, |
| "grad_norm": 1.9687480926513672, |
| "learning_rate": 1e-05, |
| "loss": 0.346, |
| "step": 99600 |
| }, |
| { |
| "epoch": 0.000997, |
| "grad_norm": 2.0356996059417725, |
| "learning_rate": 1e-05, |
| "loss": 0.3508, |
| "step": 99700 |
| }, |
| { |
| "epoch": 0.000998, |
| "grad_norm": 1.816023349761963, |
| "learning_rate": 1e-05, |
| "loss": 0.3524, |
| "step": 99800 |
| }, |
| { |
| "epoch": 0.000999, |
| "grad_norm": 2.0732617378234863, |
| "learning_rate": 1e-05, |
| "loss": 0.3468, |
| "step": 99900 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 1.8265982866287231, |
| "learning_rate": 1e-05, |
| "loss": 0.3485, |
| "step": 100000 |
| }, |
| { |
| "epoch": 0.001, |
| "eval_loss": 0.322509765625, |
| "eval_runtime": 109.8279, |
| "eval_samples_per_second": 455.258, |
| "eval_steps_per_second": 28.454, |
| "step": 100000 |
| }, |
| { |
| "epoch": 0.001001, |
| "grad_norm": 1.6607346534729004, |
| "learning_rate": 1e-05, |
| "loss": 0.35, |
| "step": 100100 |
| }, |
| { |
| "epoch": 0.001002, |
| "grad_norm": 1.8534297943115234, |
| "learning_rate": 1e-05, |
| "loss": 0.3515, |
| "step": 100200 |
| }, |
| { |
| "epoch": 0.001003, |
| "grad_norm": 1.7251813411712646, |
| "learning_rate": 1e-05, |
| "loss": 0.3449, |
| "step": 100300 |
| }, |
| { |
| "epoch": 0.001004, |
| "grad_norm": 2.1961374282836914, |
| "learning_rate": 1e-05, |
| "loss": 0.3442, |
| "step": 100400 |
| }, |
| { |
| "epoch": 0.001005, |
| "grad_norm": 1.8674982786178589, |
| "learning_rate": 1e-05, |
| "loss": 0.357, |
| "step": 100500 |
| }, |
| { |
| "epoch": 0.001006, |
| "grad_norm": 1.9515495300292969, |
| "learning_rate": 1e-05, |
| "loss": 0.3516, |
| "step": 100600 |
| }, |
| { |
| "epoch": 0.001007, |
| "grad_norm": 1.8294038772583008, |
| "learning_rate": 1e-05, |
| "loss": 0.3475, |
| "step": 100700 |
| }, |
| { |
| "epoch": 0.001008, |
| "grad_norm": 1.7040425539016724, |
| "learning_rate": 1e-05, |
| "loss": 0.3403, |
| "step": 100800 |
| }, |
| { |
| "epoch": 0.001009, |
| "grad_norm": 2.464323043823242, |
| "learning_rate": 1e-05, |
| "loss": 0.3437, |
| "step": 100900 |
| }, |
| { |
| "epoch": 0.00101, |
| "grad_norm": 1.5711098909378052, |
| "learning_rate": 1e-05, |
| "loss": 0.3465, |
| "step": 101000 |
| }, |
| { |
| "epoch": 0.001011, |
| "grad_norm": 1.6807917356491089, |
| "learning_rate": 1e-05, |
| "loss": 0.3462, |
| "step": 101100 |
| }, |
| { |
| "epoch": 0.001012, |
| "grad_norm": 2.2576940059661865, |
| "learning_rate": 1e-05, |
| "loss": 0.3389, |
| "step": 101200 |
| }, |
| { |
| "epoch": 0.001013, |
| "grad_norm": 1.7972438335418701, |
| "learning_rate": 1e-05, |
| "loss": 0.3442, |
| "step": 101300 |
| }, |
| { |
| "epoch": 0.001014, |
| "grad_norm": 1.8780492544174194, |
| "learning_rate": 1e-05, |
| "loss": 0.3424, |
| "step": 101400 |
| }, |
| { |
| "epoch": 0.001015, |
| "grad_norm": 1.7459834814071655, |
| "learning_rate": 1e-05, |
| "loss": 0.3403, |
| "step": 101500 |
| }, |
| { |
| "epoch": 0.001016, |
| "grad_norm": 1.549613118171692, |
| "learning_rate": 1e-05, |
| "loss": 0.3354, |
| "step": 101600 |
| }, |
| { |
| "epoch": 0.001017, |
| "grad_norm": 1.737701416015625, |
| "learning_rate": 1e-05, |
| "loss": 0.3447, |
| "step": 101700 |
| }, |
| { |
| "epoch": 0.001018, |
| "grad_norm": 1.936926245689392, |
| "learning_rate": 1e-05, |
| "loss": 0.3457, |
| "step": 101800 |
| }, |
| { |
| "epoch": 0.001019, |
| "grad_norm": 1.579476237297058, |
| "learning_rate": 1e-05, |
| "loss": 0.3388, |
| "step": 101900 |
| }, |
| { |
| "epoch": 0.00102, |
| "grad_norm": 1.620912790298462, |
| "learning_rate": 1e-05, |
| "loss": 0.342, |
| "step": 102000 |
| }, |
| { |
| "epoch": 0.001021, |
| "grad_norm": 1.801440715789795, |
| "learning_rate": 1e-05, |
| "loss": 0.3479, |
| "step": 102100 |
| }, |
| { |
| "epoch": 0.001022, |
| "grad_norm": 1.9294061660766602, |
| "learning_rate": 1e-05, |
| "loss": 0.3535, |
| "step": 102200 |
| }, |
| { |
| "epoch": 0.001023, |
| "grad_norm": 1.7232532501220703, |
| "learning_rate": 1e-05, |
| "loss": 0.3465, |
| "step": 102300 |
| }, |
| { |
| "epoch": 0.001024, |
| "grad_norm": 2.327086925506592, |
| "learning_rate": 1e-05, |
| "loss": 0.3379, |
| "step": 102400 |
| }, |
| { |
| "epoch": 0.001025, |
| "grad_norm": 1.652092695236206, |
| "learning_rate": 1e-05, |
| "loss": 0.3399, |
| "step": 102500 |
| }, |
| { |
| "epoch": 0.001026, |
| "grad_norm": 1.6813795566558838, |
| "learning_rate": 1e-05, |
| "loss": 0.3489, |
| "step": 102600 |
| }, |
| { |
| "epoch": 0.001027, |
| "grad_norm": 1.9206030368804932, |
| "learning_rate": 1e-05, |
| "loss": 0.3476, |
| "step": 102700 |
| }, |
| { |
| "epoch": 0.001028, |
| "grad_norm": 1.5671733617782593, |
| "learning_rate": 1e-05, |
| "loss": 0.337, |
| "step": 102800 |
| }, |
| { |
| "epoch": 0.001029, |
| "grad_norm": 1.8689380884170532, |
| "learning_rate": 1e-05, |
| "loss": 0.3449, |
| "step": 102900 |
| }, |
| { |
| "epoch": 0.00103, |
| "grad_norm": 1.6365469694137573, |
| "learning_rate": 1e-05, |
| "loss": 0.3421, |
| "step": 103000 |
| }, |
| { |
| "epoch": 0.001031, |
| "grad_norm": 2.0219128131866455, |
| "learning_rate": 1e-05, |
| "loss": 0.3402, |
| "step": 103100 |
| }, |
| { |
| "epoch": 0.001032, |
| "grad_norm": 1.6188757419586182, |
| "learning_rate": 1e-05, |
| "loss": 0.3458, |
| "step": 103200 |
| }, |
| { |
| "epoch": 0.001033, |
| "grad_norm": 1.809544324874878, |
| "learning_rate": 1e-05, |
| "loss": 0.3436, |
| "step": 103300 |
| }, |
| { |
| "epoch": 0.001034, |
| "grad_norm": 1.6749653816223145, |
| "learning_rate": 1e-05, |
| "loss": 0.3436, |
| "step": 103400 |
| }, |
| { |
| "epoch": 0.001035, |
| "grad_norm": 1.9452251195907593, |
| "learning_rate": 1e-05, |
| "loss": 0.3355, |
| "step": 103500 |
| }, |
| { |
| "epoch": 0.001036, |
| "grad_norm": 1.790397047996521, |
| "learning_rate": 1e-05, |
| "loss": 0.3489, |
| "step": 103600 |
| }, |
| { |
| "epoch": 0.001037, |
| "grad_norm": 1.5875970125198364, |
| "learning_rate": 1e-05, |
| "loss": 0.3358, |
| "step": 103700 |
| }, |
| { |
| "epoch": 0.001038, |
| "grad_norm": 1.6320905685424805, |
| "learning_rate": 1e-05, |
| "loss": 0.3436, |
| "step": 103800 |
| }, |
| { |
| "epoch": 0.001039, |
| "grad_norm": 1.6067711114883423, |
| "learning_rate": 1e-05, |
| "loss": 0.3438, |
| "step": 103900 |
| }, |
| { |
| "epoch": 0.00104, |
| "grad_norm": 1.8375946283340454, |
| "learning_rate": 1e-05, |
| "loss": 0.344, |
| "step": 104000 |
| }, |
| { |
| "epoch": 0.001041, |
| "grad_norm": 1.708240270614624, |
| "learning_rate": 1e-05, |
| "loss": 0.3491, |
| "step": 104100 |
| }, |
| { |
| "epoch": 0.001042, |
| "grad_norm": 2.3994433879852295, |
| "learning_rate": 1e-05, |
| "loss": 0.3462, |
| "step": 104200 |
| }, |
| { |
| "epoch": 0.001043, |
| "grad_norm": 1.7040139436721802, |
| "learning_rate": 1e-05, |
| "loss": 0.3459, |
| "step": 104300 |
| }, |
| { |
| "epoch": 0.001044, |
| "grad_norm": 2.01163911819458, |
| "learning_rate": 1e-05, |
| "loss": 0.3463, |
| "step": 104400 |
| }, |
| { |
| "epoch": 0.001045, |
| "grad_norm": 1.604658842086792, |
| "learning_rate": 1e-05, |
| "loss": 0.341, |
| "step": 104500 |
| }, |
| { |
| "epoch": 0.001046, |
| "grad_norm": 2.69278883934021, |
| "learning_rate": 1e-05, |
| "loss": 0.3418, |
| "step": 104600 |
| }, |
| { |
| "epoch": 0.001047, |
| "grad_norm": 1.6742432117462158, |
| "learning_rate": 1e-05, |
| "loss": 0.3376, |
| "step": 104700 |
| }, |
| { |
| "epoch": 0.001048, |
| "grad_norm": 1.7139792442321777, |
| "learning_rate": 1e-05, |
| "loss": 0.3448, |
| "step": 104800 |
| }, |
| { |
| "epoch": 0.001049, |
| "grad_norm": 1.9812430143356323, |
| "learning_rate": 1e-05, |
| "loss": 0.3443, |
| "step": 104900 |
| }, |
| { |
| "epoch": 0.00105, |
| "grad_norm": 1.9630818367004395, |
| "learning_rate": 1e-05, |
| "loss": 0.3367, |
| "step": 105000 |
| }, |
| { |
| "epoch": 0.001051, |
| "grad_norm": 1.913673758506775, |
| "learning_rate": 1e-05, |
| "loss": 0.3433, |
| "step": 105100 |
| }, |
| { |
| "epoch": 0.001052, |
| "grad_norm": 1.6645994186401367, |
| "learning_rate": 1e-05, |
| "loss": 0.3483, |
| "step": 105200 |
| }, |
| { |
| "epoch": 0.001053, |
| "grad_norm": 1.6963456869125366, |
| "learning_rate": 1e-05, |
| "loss": 0.3518, |
| "step": 105300 |
| }, |
| { |
| "epoch": 0.001054, |
| "grad_norm": 1.774322748184204, |
| "learning_rate": 1e-05, |
| "loss": 0.339, |
| "step": 105400 |
| }, |
| { |
| "epoch": 0.001055, |
| "grad_norm": 1.6794517040252686, |
| "learning_rate": 1e-05, |
| "loss": 0.3412, |
| "step": 105500 |
| }, |
| { |
| "epoch": 0.001056, |
| "grad_norm": 2.0357189178466797, |
| "learning_rate": 1e-05, |
| "loss": 0.3406, |
| "step": 105600 |
| }, |
| { |
| "epoch": 0.001057, |
| "grad_norm": 1.7818143367767334, |
| "learning_rate": 1e-05, |
| "loss": 0.3379, |
| "step": 105700 |
| }, |
| { |
| "epoch": 0.001058, |
| "grad_norm": 1.819798231124878, |
| "learning_rate": 1e-05, |
| "loss": 0.3328, |
| "step": 105800 |
| }, |
| { |
| "epoch": 0.001059, |
| "grad_norm": 1.751774787902832, |
| "learning_rate": 1e-05, |
| "loss": 0.3403, |
| "step": 105900 |
| }, |
| { |
| "epoch": 0.00106, |
| "grad_norm": 1.720474362373352, |
| "learning_rate": 1e-05, |
| "loss": 0.3427, |
| "step": 106000 |
| }, |
| { |
| "epoch": 0.001061, |
| "grad_norm": 1.7977921962738037, |
| "learning_rate": 1e-05, |
| "loss": 0.3435, |
| "step": 106100 |
| }, |
| { |
| "epoch": 0.001062, |
| "grad_norm": 2.1512701511383057, |
| "learning_rate": 1e-05, |
| "loss": 0.3443, |
| "step": 106200 |
| }, |
| { |
| "epoch": 0.001063, |
| "grad_norm": 1.7027465105056763, |
| "learning_rate": 1e-05, |
| "loss": 0.3362, |
| "step": 106300 |
| }, |
| { |
| "epoch": 0.001064, |
| "grad_norm": 1.663902997970581, |
| "learning_rate": 1e-05, |
| "loss": 0.3459, |
| "step": 106400 |
| }, |
| { |
| "epoch": 0.001065, |
| "grad_norm": 1.7038410902023315, |
| "learning_rate": 1e-05, |
| "loss": 0.3356, |
| "step": 106500 |
| }, |
| { |
| "epoch": 0.001066, |
| "grad_norm": 1.5602421760559082, |
| "learning_rate": 1e-05, |
| "loss": 0.3354, |
| "step": 106600 |
| }, |
| { |
| "epoch": 0.001067, |
| "grad_norm": 1.8753788471221924, |
| "learning_rate": 1e-05, |
| "loss": 0.3342, |
| "step": 106700 |
| }, |
| { |
| "epoch": 0.001068, |
| "grad_norm": 1.731338620185852, |
| "learning_rate": 1e-05, |
| "loss": 0.34, |
| "step": 106800 |
| }, |
| { |
| "epoch": 0.001069, |
| "grad_norm": 1.7717700004577637, |
| "learning_rate": 1e-05, |
| "loss": 0.3404, |
| "step": 106900 |
| }, |
| { |
| "epoch": 0.00107, |
| "grad_norm": 1.5369184017181396, |
| "learning_rate": 1e-05, |
| "loss": 0.3534, |
| "step": 107000 |
| }, |
| { |
| "epoch": 0.001071, |
| "grad_norm": 2.0892210006713867, |
| "learning_rate": 1e-05, |
| "loss": 0.3396, |
| "step": 107100 |
| }, |
| { |
| "epoch": 0.001072, |
| "grad_norm": 1.9275939464569092, |
| "learning_rate": 1e-05, |
| "loss": 0.3403, |
| "step": 107200 |
| }, |
| { |
| "epoch": 0.001073, |
| "grad_norm": 1.9656401872634888, |
| "learning_rate": 1e-05, |
| "loss": 0.3392, |
| "step": 107300 |
| }, |
| { |
| "epoch": 0.001074, |
| "grad_norm": 1.7235068082809448, |
| "learning_rate": 1e-05, |
| "loss": 0.3416, |
| "step": 107400 |
| }, |
| { |
| "epoch": 0.001075, |
| "grad_norm": 1.8416111469268799, |
| "learning_rate": 1e-05, |
| "loss": 0.3367, |
| "step": 107500 |
| }, |
| { |
| "epoch": 0.001076, |
| "grad_norm": 1.7464598417282104, |
| "learning_rate": 1e-05, |
| "loss": 0.3436, |
| "step": 107600 |
| }, |
| { |
| "epoch": 0.001077, |
| "grad_norm": 1.8630284070968628, |
| "learning_rate": 1e-05, |
| "loss": 0.3432, |
| "step": 107700 |
| }, |
| { |
| "epoch": 0.001078, |
| "grad_norm": 1.740233302116394, |
| "learning_rate": 1e-05, |
| "loss": 0.3384, |
| "step": 107800 |
| }, |
| { |
| "epoch": 0.001079, |
| "grad_norm": 1.5509331226348877, |
| "learning_rate": 1e-05, |
| "loss": 0.3332, |
| "step": 107900 |
| }, |
| { |
| "epoch": 0.00108, |
| "grad_norm": 1.7383582592010498, |
| "learning_rate": 1e-05, |
| "loss": 0.3456, |
| "step": 108000 |
| }, |
| { |
| "epoch": 0.001081, |
| "grad_norm": 1.9408977031707764, |
| "learning_rate": 1e-05, |
| "loss": 0.3396, |
| "step": 108100 |
| }, |
| { |
| "epoch": 0.001082, |
| "grad_norm": 1.6888933181762695, |
| "learning_rate": 1e-05, |
| "loss": 0.3404, |
| "step": 108200 |
| }, |
| { |
| "epoch": 0.001083, |
| "grad_norm": 1.9360098838806152, |
| "learning_rate": 1e-05, |
| "loss": 0.3431, |
| "step": 108300 |
| }, |
| { |
| "epoch": 0.001084, |
| "grad_norm": 1.7306195497512817, |
| "learning_rate": 1e-05, |
| "loss": 0.3348, |
| "step": 108400 |
| }, |
| { |
| "epoch": 0.001085, |
| "grad_norm": 1.6970361471176147, |
| "learning_rate": 1e-05, |
| "loss": 0.3381, |
| "step": 108500 |
| }, |
| { |
| "epoch": 0.001086, |
| "grad_norm": 1.664059042930603, |
| "learning_rate": 1e-05, |
| "loss": 0.3403, |
| "step": 108600 |
| }, |
| { |
| "epoch": 0.001087, |
| "grad_norm": 1.7473076581954956, |
| "learning_rate": 1e-05, |
| "loss": 0.3403, |
| "step": 108700 |
| }, |
| { |
| "epoch": 0.001088, |
| "grad_norm": 1.705640196800232, |
| "learning_rate": 1e-05, |
| "loss": 0.3353, |
| "step": 108800 |
| }, |
| { |
| "epoch": 0.001089, |
| "grad_norm": 1.8058274984359741, |
| "learning_rate": 1e-05, |
| "loss": 0.3309, |
| "step": 108900 |
| }, |
| { |
| "epoch": 0.00109, |
| "grad_norm": 1.7639211416244507, |
| "learning_rate": 1e-05, |
| "loss": 0.3322, |
| "step": 109000 |
| }, |
| { |
| "epoch": 0.001091, |
| "grad_norm": 2.020731210708618, |
| "learning_rate": 1e-05, |
| "loss": 0.3421, |
| "step": 109100 |
| }, |
| { |
| "epoch": 0.001092, |
| "grad_norm": 1.7614929676055908, |
| "learning_rate": 1e-05, |
| "loss": 0.3395, |
| "step": 109200 |
| }, |
| { |
| "epoch": 0.001093, |
| "grad_norm": 1.8153364658355713, |
| "learning_rate": 1e-05, |
| "loss": 0.3413, |
| "step": 109300 |
| }, |
| { |
| "epoch": 0.001094, |
| "grad_norm": 1.803002119064331, |
| "learning_rate": 1e-05, |
| "loss": 0.337, |
| "step": 109400 |
| }, |
| { |
| "epoch": 0.001095, |
| "grad_norm": 1.6940698623657227, |
| "learning_rate": 1e-05, |
| "loss": 0.3372, |
| "step": 109500 |
| }, |
| { |
| "epoch": 0.001096, |
| "grad_norm": 1.8647571802139282, |
| "learning_rate": 1e-05, |
| "loss": 0.338, |
| "step": 109600 |
| }, |
| { |
| "epoch": 0.001097, |
| "grad_norm": 1.9329015016555786, |
| "learning_rate": 1e-05, |
| "loss": 0.3397, |
| "step": 109700 |
| }, |
| { |
| "epoch": 0.001098, |
| "grad_norm": 1.8334521055221558, |
| "learning_rate": 1e-05, |
| "loss": 0.3368, |
| "step": 109800 |
| }, |
| { |
| "epoch": 0.001099, |
| "grad_norm": 1.6593636274337769, |
| "learning_rate": 1e-05, |
| "loss": 0.335, |
| "step": 109900 |
| }, |
| { |
| "epoch": 0.0011, |
| "grad_norm": 1.8620237112045288, |
| "learning_rate": 1e-05, |
| "loss": 0.3383, |
| "step": 110000 |
| }, |
| { |
| "epoch": 0.001101, |
| "grad_norm": 1.8764339685440063, |
| "learning_rate": 1e-05, |
| "loss": 0.3338, |
| "step": 110100 |
| }, |
| { |
| "epoch": 0.001102, |
| "grad_norm": 1.9678648710250854, |
| "learning_rate": 1e-05, |
| "loss": 0.3441, |
| "step": 110200 |
| }, |
| { |
| "epoch": 0.001103, |
| "grad_norm": 1.5820297002792358, |
| "learning_rate": 1e-05, |
| "loss": 0.3388, |
| "step": 110300 |
| }, |
| { |
| "epoch": 0.001104, |
| "grad_norm": 1.78269362449646, |
| "learning_rate": 1e-05, |
| "loss": 0.34, |
| "step": 110400 |
| }, |
| { |
| "epoch": 0.001105, |
| "grad_norm": 2.253110647201538, |
| "learning_rate": 1e-05, |
| "loss": 0.3352, |
| "step": 110500 |
| }, |
| { |
| "epoch": 0.001106, |
| "grad_norm": 1.553359866142273, |
| "learning_rate": 1e-05, |
| "loss": 0.3316, |
| "step": 110600 |
| }, |
| { |
| "epoch": 0.001107, |
| "grad_norm": 1.8975584506988525, |
| "learning_rate": 1e-05, |
| "loss": 0.3363, |
| "step": 110700 |
| }, |
| { |
| "epoch": 0.001108, |
| "grad_norm": 1.9635969400405884, |
| "learning_rate": 1e-05, |
| "loss": 0.3339, |
| "step": 110800 |
| }, |
| { |
| "epoch": 0.001109, |
| "grad_norm": 1.6212959289550781, |
| "learning_rate": 1e-05, |
| "loss": 0.3365, |
| "step": 110900 |
| }, |
| { |
| "epoch": 0.00111, |
| "grad_norm": 1.667982578277588, |
| "learning_rate": 1e-05, |
| "loss": 0.3385, |
| "step": 111000 |
| }, |
| { |
| "epoch": 0.001111, |
| "grad_norm": 1.799843192100525, |
| "learning_rate": 1e-05, |
| "loss": 0.3342, |
| "step": 111100 |
| }, |
| { |
| "epoch": 0.001112, |
| "grad_norm": 1.9425872564315796, |
| "learning_rate": 1e-05, |
| "loss": 0.3445, |
| "step": 111200 |
| }, |
| { |
| "epoch": 0.001113, |
| "grad_norm": 1.7052315473556519, |
| "learning_rate": 1e-05, |
| "loss": 0.3321, |
| "step": 111300 |
| }, |
| { |
| "epoch": 0.001114, |
| "grad_norm": 1.8439725637435913, |
| "learning_rate": 1e-05, |
| "loss": 0.3362, |
| "step": 111400 |
| }, |
| { |
| "epoch": 0.001115, |
| "grad_norm": 1.5285687446594238, |
| "learning_rate": 1e-05, |
| "loss": 0.3362, |
| "step": 111500 |
| }, |
| { |
| "epoch": 0.001116, |
| "grad_norm": 1.8127108812332153, |
| "learning_rate": 1e-05, |
| "loss": 0.3351, |
| "step": 111600 |
| }, |
| { |
| "epoch": 0.001117, |
| "grad_norm": 2.099846363067627, |
| "learning_rate": 1e-05, |
| "loss": 0.3348, |
| "step": 111700 |
| }, |
| { |
| "epoch": 0.001118, |
| "grad_norm": 1.8282333612442017, |
| "learning_rate": 1e-05, |
| "loss": 0.3353, |
| "step": 111800 |
| }, |
| { |
| "epoch": 0.001119, |
| "grad_norm": 1.9214797019958496, |
| "learning_rate": 1e-05, |
| "loss": 0.3384, |
| "step": 111900 |
| }, |
| { |
| "epoch": 0.00112, |
| "grad_norm": 1.5597374439239502, |
| "learning_rate": 1e-05, |
| "loss": 0.339, |
| "step": 112000 |
| }, |
| { |
| "epoch": 0.001121, |
| "grad_norm": 1.7652438879013062, |
| "learning_rate": 1e-05, |
| "loss": 0.336, |
| "step": 112100 |
| }, |
| { |
| "epoch": 0.001122, |
| "grad_norm": 1.6596375703811646, |
| "learning_rate": 1e-05, |
| "loss": 0.3375, |
| "step": 112200 |
| }, |
| { |
| "epoch": 0.001123, |
| "grad_norm": 1.7220653295516968, |
| "learning_rate": 1e-05, |
| "loss": 0.3424, |
| "step": 112300 |
| }, |
| { |
| "epoch": 0.001124, |
| "grad_norm": 1.7662781476974487, |
| "learning_rate": 1e-05, |
| "loss": 0.3389, |
| "step": 112400 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.3776822090148926, |
| "learning_rate": 1e-05, |
| "loss": 0.3366, |
| "step": 112500 |
| }, |
| { |
| "epoch": 0.001126, |
| "grad_norm": 1.8062163591384888, |
| "learning_rate": 1e-05, |
| "loss": 0.3366, |
| "step": 112600 |
| }, |
| { |
| "epoch": 0.001127, |
| "grad_norm": 1.6732230186462402, |
| "learning_rate": 1e-05, |
| "loss": 0.3338, |
| "step": 112700 |
| }, |
| { |
| "epoch": 0.001128, |
| "grad_norm": 1.7580362558364868, |
| "learning_rate": 1e-05, |
| "loss": 0.33, |
| "step": 112800 |
| }, |
| { |
| "epoch": 0.001129, |
| "grad_norm": 1.4763306379318237, |
| "learning_rate": 1e-05, |
| "loss": 0.3323, |
| "step": 112900 |
| }, |
| { |
| "epoch": 0.00113, |
| "grad_norm": 1.7444915771484375, |
| "learning_rate": 1e-05, |
| "loss": 0.3368, |
| "step": 113000 |
| }, |
| { |
| "epoch": 0.001131, |
| "grad_norm": 1.487596869468689, |
| "learning_rate": 1e-05, |
| "loss": 0.3372, |
| "step": 113100 |
| }, |
| { |
| "epoch": 0.001132, |
| "grad_norm": 1.7567662000656128, |
| "learning_rate": 1e-05, |
| "loss": 0.3363, |
| "step": 113200 |
| }, |
| { |
| "epoch": 0.001133, |
| "grad_norm": 1.742226243019104, |
| "learning_rate": 1e-05, |
| "loss": 0.3353, |
| "step": 113300 |
| }, |
| { |
| "epoch": 0.001134, |
| "grad_norm": 1.7816177606582642, |
| "learning_rate": 1e-05, |
| "loss": 0.3363, |
| "step": 113400 |
| }, |
| { |
| "epoch": 0.001135, |
| "grad_norm": 1.9755498170852661, |
| "learning_rate": 1e-05, |
| "loss": 0.3418, |
| "step": 113500 |
| }, |
| { |
| "epoch": 0.001136, |
| "grad_norm": 1.867906093597412, |
| "learning_rate": 1e-05, |
| "loss": 0.3334, |
| "step": 113600 |
| }, |
| { |
| "epoch": 0.001137, |
| "grad_norm": 1.6979436874389648, |
| "learning_rate": 1e-05, |
| "loss": 0.3229, |
| "step": 113700 |
| }, |
| { |
| "epoch": 0.001138, |
| "grad_norm": 1.7003270387649536, |
| "learning_rate": 1e-05, |
| "loss": 0.339, |
| "step": 113800 |
| }, |
| { |
| "epoch": 0.001139, |
| "grad_norm": 1.7175750732421875, |
| "learning_rate": 1e-05, |
| "loss": 0.328, |
| "step": 113900 |
| }, |
| { |
| "epoch": 0.00114, |
| "grad_norm": 1.8313535451889038, |
| "learning_rate": 1e-05, |
| "loss": 0.334, |
| "step": 114000 |
| }, |
| { |
| "epoch": 0.001141, |
| "grad_norm": 1.863562822341919, |
| "learning_rate": 1e-05, |
| "loss": 0.3331, |
| "step": 114100 |
| }, |
| { |
| "epoch": 0.001142, |
| "grad_norm": 1.5223082304000854, |
| "learning_rate": 1e-05, |
| "loss": 0.3327, |
| "step": 114200 |
| }, |
| { |
| "epoch": 0.001143, |
| "grad_norm": 1.6789870262145996, |
| "learning_rate": 1e-05, |
| "loss": 0.3313, |
| "step": 114300 |
| }, |
| { |
| "epoch": 0.001144, |
| "grad_norm": 1.6574594974517822, |
| "learning_rate": 1e-05, |
| "loss": 0.3376, |
| "step": 114400 |
| }, |
| { |
| "epoch": 0.001145, |
| "grad_norm": 1.8169411420822144, |
| "learning_rate": 1e-05, |
| "loss": 0.3392, |
| "step": 114500 |
| }, |
| { |
| "epoch": 0.001146, |
| "grad_norm": 2.384134292602539, |
| "learning_rate": 1e-05, |
| "loss": 0.3421, |
| "step": 114600 |
| }, |
| { |
| "epoch": 0.001147, |
| "grad_norm": 1.8304411172866821, |
| "learning_rate": 1e-05, |
| "loss": 0.3356, |
| "step": 114700 |
| }, |
| { |
| "epoch": 0.001148, |
| "grad_norm": 1.5321639776229858, |
| "learning_rate": 1e-05, |
| "loss": 0.3317, |
| "step": 114800 |
| }, |
| { |
| "epoch": 0.001149, |
| "grad_norm": 1.7843445539474487, |
| "learning_rate": 1e-05, |
| "loss": 0.3267, |
| "step": 114900 |
| }, |
| { |
| "epoch": 0.00115, |
| "grad_norm": 1.8861100673675537, |
| "learning_rate": 1e-05, |
| "loss": 0.3318, |
| "step": 115000 |
| }, |
| { |
| "epoch": 0.001151, |
| "grad_norm": 1.8112998008728027, |
| "learning_rate": 1e-05, |
| "loss": 0.3293, |
| "step": 115100 |
| }, |
| { |
| "epoch": 0.001152, |
| "grad_norm": 1.7408936023712158, |
| "learning_rate": 1e-05, |
| "loss": 0.339, |
| "step": 115200 |
| }, |
| { |
| "epoch": 0.001153, |
| "grad_norm": 1.5955983400344849, |
| "learning_rate": 1e-05, |
| "loss": 0.3326, |
| "step": 115300 |
| }, |
| { |
| "epoch": 0.001154, |
| "grad_norm": 1.6836644411087036, |
| "learning_rate": 1e-05, |
| "loss": 0.3377, |
| "step": 115400 |
| }, |
| { |
| "epoch": 0.001155, |
| "grad_norm": 1.7743850946426392, |
| "learning_rate": 1e-05, |
| "loss": 0.3278, |
| "step": 115500 |
| }, |
| { |
| "epoch": 0.001156, |
| "grad_norm": 1.5382933616638184, |
| "learning_rate": 1e-05, |
| "loss": 0.3397, |
| "step": 115600 |
| }, |
| { |
| "epoch": 0.001157, |
| "grad_norm": 1.7360892295837402, |
| "learning_rate": 1e-05, |
| "loss": 0.3207, |
| "step": 115700 |
| }, |
| { |
| "epoch": 0.001158, |
| "grad_norm": 1.7574350833892822, |
| "learning_rate": 1e-05, |
| "loss": 0.3439, |
| "step": 115800 |
| }, |
| { |
| "epoch": 0.001159, |
| "grad_norm": 1.8633227348327637, |
| "learning_rate": 1e-05, |
| "loss": 0.335, |
| "step": 115900 |
| }, |
| { |
| "epoch": 0.00116, |
| "grad_norm": 1.4621351957321167, |
| "learning_rate": 1e-05, |
| "loss": 0.3358, |
| "step": 116000 |
| }, |
| { |
| "epoch": 0.001161, |
| "grad_norm": 1.9157224893569946, |
| "learning_rate": 1e-05, |
| "loss": 0.3406, |
| "step": 116100 |
| }, |
| { |
| "epoch": 0.001162, |
| "grad_norm": 1.6284751892089844, |
| "learning_rate": 1e-05, |
| "loss": 0.3277, |
| "step": 116200 |
| }, |
| { |
| "epoch": 0.001163, |
| "grad_norm": 2.2173221111297607, |
| "learning_rate": 1e-05, |
| "loss": 0.3259, |
| "step": 116300 |
| }, |
| { |
| "epoch": 0.001164, |
| "grad_norm": 1.8805922269821167, |
| "learning_rate": 1e-05, |
| "loss": 0.3304, |
| "step": 116400 |
| }, |
| { |
| "epoch": 0.001165, |
| "grad_norm": 1.5072230100631714, |
| "learning_rate": 1e-05, |
| "loss": 0.3329, |
| "step": 116500 |
| }, |
| { |
| "epoch": 0.001166, |
| "grad_norm": 1.7337315082550049, |
| "learning_rate": 1e-05, |
| "loss": 0.3358, |
| "step": 116600 |
| }, |
| { |
| "epoch": 0.001167, |
| "grad_norm": 1.8346338272094727, |
| "learning_rate": 1e-05, |
| "loss": 0.335, |
| "step": 116700 |
| }, |
| { |
| "epoch": 0.001168, |
| "grad_norm": 2.003572940826416, |
| "learning_rate": 1e-05, |
| "loss": 0.333, |
| "step": 116800 |
| }, |
| { |
| "epoch": 0.001169, |
| "grad_norm": 1.6946192979812622, |
| "learning_rate": 1e-05, |
| "loss": 0.3274, |
| "step": 116900 |
| }, |
| { |
| "epoch": 0.00117, |
| "grad_norm": 1.7123721837997437, |
| "learning_rate": 1e-05, |
| "loss": 0.3346, |
| "step": 117000 |
| }, |
| { |
| "epoch": 0.001171, |
| "grad_norm": 1.8998627662658691, |
| "learning_rate": 1e-05, |
| "loss": 0.3355, |
| "step": 117100 |
| }, |
| { |
| "epoch": 0.001172, |
| "grad_norm": 1.5401489734649658, |
| "learning_rate": 1e-05, |
| "loss": 0.3365, |
| "step": 117200 |
| }, |
| { |
| "epoch": 0.001173, |
| "grad_norm": 1.7201097011566162, |
| "learning_rate": 1e-05, |
| "loss": 0.3382, |
| "step": 117300 |
| }, |
| { |
| "epoch": 0.001174, |
| "grad_norm": 1.8772022724151611, |
| "learning_rate": 1e-05, |
| "loss": 0.3323, |
| "step": 117400 |
| }, |
| { |
| "epoch": 0.001175, |
| "grad_norm": 1.5749614238739014, |
| "learning_rate": 1e-05, |
| "loss": 0.3247, |
| "step": 117500 |
| }, |
| { |
| "epoch": 0.001176, |
| "grad_norm": 1.753891944885254, |
| "learning_rate": 1e-05, |
| "loss": 0.325, |
| "step": 117600 |
| }, |
| { |
| "epoch": 0.001177, |
| "grad_norm": 1.6012095212936401, |
| "learning_rate": 1e-05, |
| "loss": 0.3353, |
| "step": 117700 |
| }, |
| { |
| "epoch": 0.001178, |
| "grad_norm": 1.8432629108428955, |
| "learning_rate": 1e-05, |
| "loss": 0.3342, |
| "step": 117800 |
| }, |
| { |
| "epoch": 0.001179, |
| "grad_norm": 1.583196997642517, |
| "learning_rate": 1e-05, |
| "loss": 0.33, |
| "step": 117900 |
| }, |
| { |
| "epoch": 0.00118, |
| "grad_norm": 1.9006246328353882, |
| "learning_rate": 1e-05, |
| "loss": 0.3282, |
| "step": 118000 |
| }, |
| { |
| "epoch": 0.001181, |
| "grad_norm": 1.6398696899414062, |
| "learning_rate": 1e-05, |
| "loss": 0.3326, |
| "step": 118100 |
| }, |
| { |
| "epoch": 0.001182, |
| "grad_norm": 1.7854382991790771, |
| "learning_rate": 1e-05, |
| "loss": 0.3273, |
| "step": 118200 |
| }, |
| { |
| "epoch": 0.001183, |
| "grad_norm": 2.0176942348480225, |
| "learning_rate": 1e-05, |
| "loss": 0.3296, |
| "step": 118300 |
| }, |
| { |
| "epoch": 0.001184, |
| "grad_norm": 2.0091938972473145, |
| "learning_rate": 1e-05, |
| "loss": 0.3291, |
| "step": 118400 |
| }, |
| { |
| "epoch": 0.001185, |
| "grad_norm": 1.906575322151184, |
| "learning_rate": 1e-05, |
| "loss": 0.3354, |
| "step": 118500 |
| }, |
| { |
| "epoch": 0.001186, |
| "grad_norm": 1.7136719226837158, |
| "learning_rate": 1e-05, |
| "loss": 0.3314, |
| "step": 118600 |
| }, |
| { |
| "epoch": 0.001187, |
| "grad_norm": 1.7901870012283325, |
| "learning_rate": 1e-05, |
| "loss": 0.3236, |
| "step": 118700 |
| }, |
| { |
| "epoch": 0.001188, |
| "grad_norm": 1.7387175559997559, |
| "learning_rate": 1e-05, |
| "loss": 0.3227, |
| "step": 118800 |
| }, |
| { |
| "epoch": 0.001189, |
| "grad_norm": 1.7231628894805908, |
| "learning_rate": 1e-05, |
| "loss": 0.324, |
| "step": 118900 |
| }, |
| { |
| "epoch": 0.00119, |
| "grad_norm": 1.516570806503296, |
| "learning_rate": 1e-05, |
| "loss": 0.3332, |
| "step": 119000 |
| }, |
| { |
| "epoch": 0.001191, |
| "grad_norm": 1.7026876211166382, |
| "learning_rate": 1e-05, |
| "loss": 0.3272, |
| "step": 119100 |
| }, |
| { |
| "epoch": 0.001192, |
| "grad_norm": 1.3457015752792358, |
| "learning_rate": 1e-05, |
| "loss": 0.3306, |
| "step": 119200 |
| }, |
| { |
| "epoch": 0.001193, |
| "grad_norm": 1.9337682723999023, |
| "learning_rate": 1e-05, |
| "loss": 0.3237, |
| "step": 119300 |
| }, |
| { |
| "epoch": 0.001194, |
| "grad_norm": 1.6353681087493896, |
| "learning_rate": 1e-05, |
| "loss": 0.3287, |
| "step": 119400 |
| }, |
| { |
| "epoch": 0.001195, |
| "grad_norm": 1.875755786895752, |
| "learning_rate": 1e-05, |
| "loss": 0.3245, |
| "step": 119500 |
| }, |
| { |
| "epoch": 0.001196, |
| "grad_norm": 2.4236490726470947, |
| "learning_rate": 1e-05, |
| "loss": 0.3274, |
| "step": 119600 |
| }, |
| { |
| "epoch": 0.001197, |
| "grad_norm": 1.7631841897964478, |
| "learning_rate": 1e-05, |
| "loss": 0.3237, |
| "step": 119700 |
| }, |
| { |
| "epoch": 0.001198, |
| "grad_norm": 1.536399245262146, |
| "learning_rate": 1e-05, |
| "loss": 0.3285, |
| "step": 119800 |
| }, |
| { |
| "epoch": 0.001199, |
| "grad_norm": 1.7705007791519165, |
| "learning_rate": 1e-05, |
| "loss": 0.3291, |
| "step": 119900 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 1.8081552982330322, |
| "learning_rate": 1e-05, |
| "loss": 0.3251, |
| "step": 120000 |
| }, |
| { |
| "epoch": 0.0012, |
| "eval_loss": 0.301513671875, |
| "eval_runtime": 114.8107, |
| "eval_samples_per_second": 435.499, |
| "eval_steps_per_second": 27.219, |
| "step": 120000 |
| }, |
| { |
| "epoch": 0.001201, |
| "grad_norm": 1.8171324729919434, |
| "learning_rate": 1e-05, |
| "loss": 0.3342, |
| "step": 120100 |
| }, |
| { |
| "epoch": 0.001202, |
| "grad_norm": 1.6765378713607788, |
| "learning_rate": 1e-05, |
| "loss": 0.3276, |
| "step": 120200 |
| }, |
| { |
| "epoch": 0.001203, |
| "grad_norm": 1.8498486280441284, |
| "learning_rate": 1e-05, |
| "loss": 0.3314, |
| "step": 120300 |
| }, |
| { |
| "epoch": 0.001204, |
| "grad_norm": 1.536418080329895, |
| "learning_rate": 1e-05, |
| "loss": 0.3237, |
| "step": 120400 |
| }, |
| { |
| "epoch": 0.001205, |
| "grad_norm": 1.5730416774749756, |
| "learning_rate": 1e-05, |
| "loss": 0.3263, |
| "step": 120500 |
| }, |
| { |
| "epoch": 0.001206, |
| "grad_norm": 1.7513419389724731, |
| "learning_rate": 1e-05, |
| "loss": 0.3355, |
| "step": 120600 |
| }, |
| { |
| "epoch": 0.001207, |
| "grad_norm": 1.7094500064849854, |
| "learning_rate": 1e-05, |
| "loss": 0.3243, |
| "step": 120700 |
| }, |
| { |
| "epoch": 0.001208, |
| "grad_norm": 1.8792259693145752, |
| "learning_rate": 1e-05, |
| "loss": 0.3283, |
| "step": 120800 |
| }, |
| { |
| "epoch": 0.001209, |
| "grad_norm": 1.7763640880584717, |
| "learning_rate": 1e-05, |
| "loss": 0.3325, |
| "step": 120900 |
| }, |
| { |
| "epoch": 0.00121, |
| "grad_norm": 1.8400176763534546, |
| "learning_rate": 1e-05, |
| "loss": 0.3342, |
| "step": 121000 |
| }, |
| { |
| "epoch": 0.001211, |
| "grad_norm": 1.9107251167297363, |
| "learning_rate": 1e-05, |
| "loss": 0.3314, |
| "step": 121100 |
| }, |
| { |
| "epoch": 0.001212, |
| "grad_norm": 1.4877129793167114, |
| "learning_rate": 1e-05, |
| "loss": 0.3317, |
| "step": 121200 |
| }, |
| { |
| "epoch": 0.001213, |
| "grad_norm": 1.6370038986206055, |
| "learning_rate": 1e-05, |
| "loss": 0.3253, |
| "step": 121300 |
| }, |
| { |
| "epoch": 0.001214, |
| "grad_norm": 1.9504472017288208, |
| "learning_rate": 1e-05, |
| "loss": 0.3242, |
| "step": 121400 |
| }, |
| { |
| "epoch": 0.001215, |
| "grad_norm": 2.077836036682129, |
| "learning_rate": 1e-05, |
| "loss": 0.3302, |
| "step": 121500 |
| }, |
| { |
| "epoch": 0.001216, |
| "grad_norm": 1.4920196533203125, |
| "learning_rate": 1e-05, |
| "loss": 0.332, |
| "step": 121600 |
| }, |
| { |
| "epoch": 0.001217, |
| "grad_norm": 1.7362091541290283, |
| "learning_rate": 1e-05, |
| "loss": 0.3313, |
| "step": 121700 |
| }, |
| { |
| "epoch": 0.001218, |
| "grad_norm": 1.662297248840332, |
| "learning_rate": 1e-05, |
| "loss": 0.3287, |
| "step": 121800 |
| }, |
| { |
| "epoch": 0.001219, |
| "grad_norm": 1.6544877290725708, |
| "learning_rate": 1e-05, |
| "loss": 0.3266, |
| "step": 121900 |
| }, |
| { |
| "epoch": 0.00122, |
| "grad_norm": 1.5775099992752075, |
| "learning_rate": 1e-05, |
| "loss": 0.3263, |
| "step": 122000 |
| }, |
| { |
| "epoch": 0.001221, |
| "grad_norm": 1.704094648361206, |
| "learning_rate": 1e-05, |
| "loss": 0.325, |
| "step": 122100 |
| }, |
| { |
| "epoch": 0.001222, |
| "grad_norm": 1.9005615711212158, |
| "learning_rate": 1e-05, |
| "loss": 0.3268, |
| "step": 122200 |
| }, |
| { |
| "epoch": 0.001223, |
| "grad_norm": 2.027251958847046, |
| "learning_rate": 1e-05, |
| "loss": 0.3266, |
| "step": 122300 |
| }, |
| { |
| "epoch": 0.001224, |
| "grad_norm": 1.8168870210647583, |
| "learning_rate": 1e-05, |
| "loss": 0.3298, |
| "step": 122400 |
| }, |
| { |
| "epoch": 0.001225, |
| "grad_norm": 1.709088921546936, |
| "learning_rate": 1e-05, |
| "loss": 0.322, |
| "step": 122500 |
| }, |
| { |
| "epoch": 0.001226, |
| "grad_norm": 1.7680180072784424, |
| "learning_rate": 1e-05, |
| "loss": 0.3362, |
| "step": 122600 |
| }, |
| { |
| "epoch": 0.001227, |
| "grad_norm": 1.681280493736267, |
| "learning_rate": 1e-05, |
| "loss": 0.3236, |
| "step": 122700 |
| }, |
| { |
| "epoch": 0.001228, |
| "grad_norm": 1.7112354040145874, |
| "learning_rate": 1e-05, |
| "loss": 0.3308, |
| "step": 122800 |
| }, |
| { |
| "epoch": 0.001229, |
| "grad_norm": 1.623887062072754, |
| "learning_rate": 1e-05, |
| "loss": 0.3255, |
| "step": 122900 |
| }, |
| { |
| "epoch": 0.00123, |
| "grad_norm": 1.880348801612854, |
| "learning_rate": 1e-05, |
| "loss": 0.3348, |
| "step": 123000 |
| }, |
| { |
| "epoch": 0.001231, |
| "grad_norm": 1.8015272617340088, |
| "learning_rate": 1e-05, |
| "loss": 0.3263, |
| "step": 123100 |
| }, |
| { |
| "epoch": 0.001232, |
| "grad_norm": 1.794119954109192, |
| "learning_rate": 1e-05, |
| "loss": 0.3215, |
| "step": 123200 |
| }, |
| { |
| "epoch": 0.001233, |
| "grad_norm": 1.3672672510147095, |
| "learning_rate": 1e-05, |
| "loss": 0.3246, |
| "step": 123300 |
| }, |
| { |
| "epoch": 0.001234, |
| "grad_norm": 1.702120304107666, |
| "learning_rate": 1e-05, |
| "loss": 0.3277, |
| "step": 123400 |
| }, |
| { |
| "epoch": 0.001235, |
| "grad_norm": 1.6856110095977783, |
| "learning_rate": 1e-05, |
| "loss": 0.3308, |
| "step": 123500 |
| }, |
| { |
| "epoch": 0.001236, |
| "grad_norm": 1.3940743207931519, |
| "learning_rate": 1e-05, |
| "loss": 0.3224, |
| "step": 123600 |
| }, |
| { |
| "epoch": 0.001237, |
| "grad_norm": 1.4997862577438354, |
| "learning_rate": 1e-05, |
| "loss": 0.3209, |
| "step": 123700 |
| }, |
| { |
| "epoch": 0.001238, |
| "grad_norm": 1.6719286441802979, |
| "learning_rate": 1e-05, |
| "loss": 0.3287, |
| "step": 123800 |
| }, |
| { |
| "epoch": 0.001239, |
| "grad_norm": 1.4933640956878662, |
| "learning_rate": 1e-05, |
| "loss": 0.3275, |
| "step": 123900 |
| }, |
| { |
| "epoch": 0.00124, |
| "grad_norm": 1.6647841930389404, |
| "learning_rate": 1e-05, |
| "loss": 0.3217, |
| "step": 124000 |
| }, |
| { |
| "epoch": 0.001241, |
| "grad_norm": 1.656747817993164, |
| "learning_rate": 1e-05, |
| "loss": 0.3185, |
| "step": 124100 |
| }, |
| { |
| "epoch": 0.001242, |
| "grad_norm": 1.7526649236679077, |
| "learning_rate": 1e-05, |
| "loss": 0.3238, |
| "step": 124200 |
| }, |
| { |
| "epoch": 0.001243, |
| "grad_norm": 1.5294679403305054, |
| "learning_rate": 1e-05, |
| "loss": 0.3282, |
| "step": 124300 |
| }, |
| { |
| "epoch": 0.001244, |
| "grad_norm": 1.5839802026748657, |
| "learning_rate": 1e-05, |
| "loss": 0.3191, |
| "step": 124400 |
| }, |
| { |
| "epoch": 0.001245, |
| "grad_norm": 2.2223639488220215, |
| "learning_rate": 1e-05, |
| "loss": 0.3296, |
| "step": 124500 |
| }, |
| { |
| "epoch": 0.001246, |
| "grad_norm": 1.7450740337371826, |
| "learning_rate": 1e-05, |
| "loss": 0.3275, |
| "step": 124600 |
| }, |
| { |
| "epoch": 0.001247, |
| "grad_norm": 1.591633677482605, |
| "learning_rate": 1e-05, |
| "loss": 0.3152, |
| "step": 124700 |
| }, |
| { |
| "epoch": 0.001248, |
| "grad_norm": 1.8334294557571411, |
| "learning_rate": 1e-05, |
| "loss": 0.325, |
| "step": 124800 |
| }, |
| { |
| "epoch": 0.001249, |
| "grad_norm": 1.8498133420944214, |
| "learning_rate": 1e-05, |
| "loss": 0.3267, |
| "step": 124900 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 1.8751991987228394, |
| "learning_rate": 1e-05, |
| "loss": 0.329, |
| "step": 125000 |
| }, |
| { |
| "epoch": 0.001251, |
| "grad_norm": 1.7075512409210205, |
| "learning_rate": 1e-05, |
| "loss": 0.3233, |
| "step": 125100 |
| }, |
| { |
| "epoch": 0.001252, |
| "grad_norm": 1.7127974033355713, |
| "learning_rate": 1e-05, |
| "loss": 0.3256, |
| "step": 125200 |
| }, |
| { |
| "epoch": 0.001253, |
| "grad_norm": 1.6013959646224976, |
| "learning_rate": 1e-05, |
| "loss": 0.3294, |
| "step": 125300 |
| }, |
| { |
| "epoch": 0.001254, |
| "grad_norm": 1.616365671157837, |
| "learning_rate": 1e-05, |
| "loss": 0.3256, |
| "step": 125400 |
| }, |
| { |
| "epoch": 0.001255, |
| "grad_norm": 1.9536734819412231, |
| "learning_rate": 1e-05, |
| "loss": 0.3268, |
| "step": 125500 |
| }, |
| { |
| "epoch": 0.001256, |
| "grad_norm": 1.725591778755188, |
| "learning_rate": 1e-05, |
| "loss": 0.329, |
| "step": 125600 |
| }, |
| { |
| "epoch": 0.001257, |
| "grad_norm": 1.4925453662872314, |
| "learning_rate": 1e-05, |
| "loss": 0.3284, |
| "step": 125700 |
| }, |
| { |
| "epoch": 0.001258, |
| "grad_norm": 1.718002438545227, |
| "learning_rate": 1e-05, |
| "loss": 0.3269, |
| "step": 125800 |
| }, |
| { |
| "epoch": 0.001259, |
| "grad_norm": 1.6296608448028564, |
| "learning_rate": 1e-05, |
| "loss": 0.3187, |
| "step": 125900 |
| }, |
| { |
| "epoch": 0.00126, |
| "grad_norm": 1.584074854850769, |
| "learning_rate": 1e-05, |
| "loss": 0.3239, |
| "step": 126000 |
| }, |
| { |
| "epoch": 0.001261, |
| "grad_norm": 1.7755796909332275, |
| "learning_rate": 1e-05, |
| "loss": 0.3231, |
| "step": 126100 |
| }, |
| { |
| "epoch": 0.001262, |
| "grad_norm": 1.5260752439498901, |
| "learning_rate": 1e-05, |
| "loss": 0.3227, |
| "step": 126200 |
| }, |
| { |
| "epoch": 0.001263, |
| "grad_norm": 1.7237966060638428, |
| "learning_rate": 1e-05, |
| "loss": 0.3281, |
| "step": 126300 |
| }, |
| { |
| "epoch": 0.001264, |
| "grad_norm": 1.6392747163772583, |
| "learning_rate": 1e-05, |
| "loss": 0.3211, |
| "step": 126400 |
| }, |
| { |
| "epoch": 0.001265, |
| "grad_norm": 1.8122193813323975, |
| "learning_rate": 1e-05, |
| "loss": 0.3264, |
| "step": 126500 |
| }, |
| { |
| "epoch": 0.001266, |
| "grad_norm": 1.8250197172164917, |
| "learning_rate": 1e-05, |
| "loss": 0.32, |
| "step": 126600 |
| }, |
| { |
| "epoch": 0.001267, |
| "grad_norm": 1.7046698331832886, |
| "learning_rate": 1e-05, |
| "loss": 0.3254, |
| "step": 126700 |
| }, |
| { |
| "epoch": 0.001268, |
| "grad_norm": 1.74703049659729, |
| "learning_rate": 1e-05, |
| "loss": 0.3239, |
| "step": 126800 |
| }, |
| { |
| "epoch": 0.001269, |
| "grad_norm": 1.683023452758789, |
| "learning_rate": 1e-05, |
| "loss": 0.3318, |
| "step": 126900 |
| }, |
| { |
| "epoch": 0.00127, |
| "grad_norm": 1.9313528537750244, |
| "learning_rate": 1e-05, |
| "loss": 0.3203, |
| "step": 127000 |
| }, |
| { |
| "epoch": 0.001271, |
| "grad_norm": 1.3307342529296875, |
| "learning_rate": 1e-05, |
| "loss": 0.327, |
| "step": 127100 |
| }, |
| { |
| "epoch": 0.001272, |
| "grad_norm": 1.817285418510437, |
| "learning_rate": 1e-05, |
| "loss": 0.3352, |
| "step": 127200 |
| }, |
| { |
| "epoch": 0.001273, |
| "grad_norm": 1.7280235290527344, |
| "learning_rate": 1e-05, |
| "loss": 0.3207, |
| "step": 127300 |
| }, |
| { |
| "epoch": 0.001274, |
| "grad_norm": 1.7646243572235107, |
| "learning_rate": 1e-05, |
| "loss": 0.3303, |
| "step": 127400 |
| }, |
| { |
| "epoch": 0.001275, |
| "grad_norm": 1.6660072803497314, |
| "learning_rate": 1e-05, |
| "loss": 0.3262, |
| "step": 127500 |
| }, |
| { |
| "epoch": 0.001276, |
| "grad_norm": 1.8076339960098267, |
| "learning_rate": 1e-05, |
| "loss": 0.3195, |
| "step": 127600 |
| }, |
| { |
| "epoch": 0.001277, |
| "grad_norm": 1.6329010725021362, |
| "learning_rate": 1e-05, |
| "loss": 0.3166, |
| "step": 127700 |
| }, |
| { |
| "epoch": 0.001278, |
| "grad_norm": 1.8912221193313599, |
| "learning_rate": 1e-05, |
| "loss": 0.3253, |
| "step": 127800 |
| }, |
| { |
| "epoch": 0.001279, |
| "grad_norm": 1.6600183248519897, |
| "learning_rate": 1e-05, |
| "loss": 0.3328, |
| "step": 127900 |
| }, |
| { |
| "epoch": 0.00128, |
| "grad_norm": 2.2682251930236816, |
| "learning_rate": 1e-05, |
| "loss": 0.322, |
| "step": 128000 |
| }, |
| { |
| "epoch": 0.001281, |
| "grad_norm": 1.9191845655441284, |
| "learning_rate": 1e-05, |
| "loss": 0.3236, |
| "step": 128100 |
| }, |
| { |
| "epoch": 0.001282, |
| "grad_norm": 1.8286381959915161, |
| "learning_rate": 1e-05, |
| "loss": 0.3161, |
| "step": 128200 |
| }, |
| { |
| "epoch": 0.001283, |
| "grad_norm": 1.5892844200134277, |
| "learning_rate": 1e-05, |
| "loss": 0.3176, |
| "step": 128300 |
| }, |
| { |
| "epoch": 0.001284, |
| "grad_norm": 1.5269801616668701, |
| "learning_rate": 1e-05, |
| "loss": 0.3213, |
| "step": 128400 |
| }, |
| { |
| "epoch": 0.001285, |
| "grad_norm": 1.9540048837661743, |
| "learning_rate": 1e-05, |
| "loss": 0.3254, |
| "step": 128500 |
| }, |
| { |
| "epoch": 0.001286, |
| "grad_norm": 1.669771671295166, |
| "learning_rate": 1e-05, |
| "loss": 0.3218, |
| "step": 128600 |
| }, |
| { |
| "epoch": 0.001287, |
| "grad_norm": 1.8254411220550537, |
| "learning_rate": 1e-05, |
| "loss": 0.3241, |
| "step": 128700 |
| }, |
| { |
| "epoch": 0.001288, |
| "grad_norm": 1.9790143966674805, |
| "learning_rate": 1e-05, |
| "loss": 0.323, |
| "step": 128800 |
| }, |
| { |
| "epoch": 0.001289, |
| "grad_norm": 1.7570897340774536, |
| "learning_rate": 1e-05, |
| "loss": 0.3233, |
| "step": 128900 |
| }, |
| { |
| "epoch": 0.00129, |
| "grad_norm": 1.662227988243103, |
| "learning_rate": 1e-05, |
| "loss": 0.3232, |
| "step": 129000 |
| }, |
| { |
| "epoch": 0.001291, |
| "grad_norm": 1.6427834033966064, |
| "learning_rate": 1e-05, |
| "loss": 0.3163, |
| "step": 129100 |
| }, |
| { |
| "epoch": 0.001292, |
| "grad_norm": 1.74818754196167, |
| "learning_rate": 1e-05, |
| "loss": 0.3221, |
| "step": 129200 |
| }, |
| { |
| "epoch": 0.001293, |
| "grad_norm": 1.4715895652770996, |
| "learning_rate": 1e-05, |
| "loss": 0.3199, |
| "step": 129300 |
| }, |
| { |
| "epoch": 0.001294, |
| "grad_norm": 1.5854099988937378, |
| "learning_rate": 1e-05, |
| "loss": 0.3276, |
| "step": 129400 |
| }, |
| { |
| "epoch": 0.001295, |
| "grad_norm": 1.6624189615249634, |
| "learning_rate": 1e-05, |
| "loss": 0.3206, |
| "step": 129500 |
| }, |
| { |
| "epoch": 0.001296, |
| "grad_norm": 1.5426676273345947, |
| "learning_rate": 1e-05, |
| "loss": 0.3215, |
| "step": 129600 |
| }, |
| { |
| "epoch": 0.001297, |
| "grad_norm": 1.7038050889968872, |
| "learning_rate": 1e-05, |
| "loss": 0.3225, |
| "step": 129700 |
| }, |
| { |
| "epoch": 0.001298, |
| "grad_norm": 1.4283004999160767, |
| "learning_rate": 1e-05, |
| "loss": 0.3216, |
| "step": 129800 |
| }, |
| { |
| "epoch": 0.001299, |
| "grad_norm": 1.5062975883483887, |
| "learning_rate": 1e-05, |
| "loss": 0.3221, |
| "step": 129900 |
| }, |
| { |
| "epoch": 0.0013, |
| "grad_norm": 1.7706925868988037, |
| "learning_rate": 1e-05, |
| "loss": 0.3205, |
| "step": 130000 |
| }, |
| { |
| "epoch": 0.001301, |
| "grad_norm": 1.6793439388275146, |
| "learning_rate": 1e-05, |
| "loss": 0.3186, |
| "step": 130100 |
| }, |
| { |
| "epoch": 0.001302, |
| "grad_norm": 1.5680670738220215, |
| "learning_rate": 1e-05, |
| "loss": 0.3203, |
| "step": 130200 |
| }, |
| { |
| "epoch": 0.001303, |
| "grad_norm": 1.8509889841079712, |
| "learning_rate": 1e-05, |
| "loss": 0.3293, |
| "step": 130300 |
| }, |
| { |
| "epoch": 0.001304, |
| "grad_norm": 1.6747349500656128, |
| "learning_rate": 1e-05, |
| "loss": 0.3208, |
| "step": 130400 |
| }, |
| { |
| "epoch": 0.001305, |
| "grad_norm": 1.7840492725372314, |
| "learning_rate": 1e-05, |
| "loss": 0.3176, |
| "step": 130500 |
| }, |
| { |
| "epoch": 0.001306, |
| "grad_norm": 1.4201829433441162, |
| "learning_rate": 1e-05, |
| "loss": 0.3198, |
| "step": 130600 |
| }, |
| { |
| "epoch": 0.001307, |
| "grad_norm": 1.5987930297851562, |
| "learning_rate": 1e-05, |
| "loss": 0.3276, |
| "step": 130700 |
| }, |
| { |
| "epoch": 0.001308, |
| "grad_norm": 1.5990506410598755, |
| "learning_rate": 1e-05, |
| "loss": 0.3216, |
| "step": 130800 |
| }, |
| { |
| "epoch": 0.001309, |
| "grad_norm": 1.563931941986084, |
| "learning_rate": 1e-05, |
| "loss": 0.3202, |
| "step": 130900 |
| }, |
| { |
| "epoch": 0.00131, |
| "grad_norm": 1.9966181516647339, |
| "learning_rate": 1e-05, |
| "loss": 0.3173, |
| "step": 131000 |
| }, |
| { |
| "epoch": 0.001311, |
| "grad_norm": 1.6617968082427979, |
| "learning_rate": 1e-05, |
| "loss": 0.3241, |
| "step": 131100 |
| }, |
| { |
| "epoch": 0.001312, |
| "grad_norm": 1.903935432434082, |
| "learning_rate": 1e-05, |
| "loss": 0.3215, |
| "step": 131200 |
| }, |
| { |
| "epoch": 0.001313, |
| "grad_norm": 1.9104382991790771, |
| "learning_rate": 1e-05, |
| "loss": 0.3183, |
| "step": 131300 |
| }, |
| { |
| "epoch": 0.001314, |
| "grad_norm": 1.5082734823226929, |
| "learning_rate": 1e-05, |
| "loss": 0.3251, |
| "step": 131400 |
| }, |
| { |
| "epoch": 0.001315, |
| "grad_norm": 1.869626522064209, |
| "learning_rate": 1e-05, |
| "loss": 0.3206, |
| "step": 131500 |
| }, |
| { |
| "epoch": 0.001316, |
| "grad_norm": 1.8665653467178345, |
| "learning_rate": 1e-05, |
| "loss": 0.3178, |
| "step": 131600 |
| }, |
| { |
| "epoch": 0.001317, |
| "grad_norm": 1.52765691280365, |
| "learning_rate": 1e-05, |
| "loss": 0.321, |
| "step": 131700 |
| }, |
| { |
| "epoch": 0.001318, |
| "grad_norm": 1.4063327312469482, |
| "learning_rate": 1e-05, |
| "loss": 0.3158, |
| "step": 131800 |
| }, |
| { |
| "epoch": 0.001319, |
| "grad_norm": 1.8856468200683594, |
| "learning_rate": 1e-05, |
| "loss": 0.3144, |
| "step": 131900 |
| }, |
| { |
| "epoch": 0.00132, |
| "grad_norm": 1.7623271942138672, |
| "learning_rate": 1e-05, |
| "loss": 0.3224, |
| "step": 132000 |
| }, |
| { |
| "epoch": 0.001321, |
| "grad_norm": 1.6237845420837402, |
| "learning_rate": 1e-05, |
| "loss": 0.3207, |
| "step": 132100 |
| }, |
| { |
| "epoch": 0.001322, |
| "grad_norm": 1.55039644241333, |
| "learning_rate": 1e-05, |
| "loss": 0.3188, |
| "step": 132200 |
| }, |
| { |
| "epoch": 0.001323, |
| "grad_norm": 1.6823863983154297, |
| "learning_rate": 1e-05, |
| "loss": 0.3204, |
| "step": 132300 |
| }, |
| { |
| "epoch": 0.001324, |
| "grad_norm": 1.6407947540283203, |
| "learning_rate": 1e-05, |
| "loss": 0.3129, |
| "step": 132400 |
| }, |
| { |
| "epoch": 0.001325, |
| "grad_norm": 1.6221039295196533, |
| "learning_rate": 1e-05, |
| "loss": 0.3184, |
| "step": 132500 |
| }, |
| { |
| "epoch": 0.001326, |
| "grad_norm": 1.6350358724594116, |
| "learning_rate": 1e-05, |
| "loss": 0.312, |
| "step": 132600 |
| }, |
| { |
| "epoch": 0.001327, |
| "grad_norm": 1.5436922311782837, |
| "learning_rate": 1e-05, |
| "loss": 0.3194, |
| "step": 132700 |
| }, |
| { |
| "epoch": 0.001328, |
| "grad_norm": 1.4995625019073486, |
| "learning_rate": 1e-05, |
| "loss": 0.3202, |
| "step": 132800 |
| }, |
| { |
| "epoch": 0.001329, |
| "grad_norm": 1.4857275485992432, |
| "learning_rate": 1e-05, |
| "loss": 0.3263, |
| "step": 132900 |
| }, |
| { |
| "epoch": 0.00133, |
| "grad_norm": 1.6003996133804321, |
| "learning_rate": 1e-05, |
| "loss": 0.3185, |
| "step": 133000 |
| }, |
| { |
| "epoch": 0.001331, |
| "grad_norm": 1.6696702241897583, |
| "learning_rate": 1e-05, |
| "loss": 0.3217, |
| "step": 133100 |
| }, |
| { |
| "epoch": 0.001332, |
| "grad_norm": 1.5747042894363403, |
| "learning_rate": 1e-05, |
| "loss": 0.3193, |
| "step": 133200 |
| }, |
| { |
| "epoch": 0.001333, |
| "grad_norm": 1.5583536624908447, |
| "learning_rate": 1e-05, |
| "loss": 0.3216, |
| "step": 133300 |
| }, |
| { |
| "epoch": 0.001334, |
| "grad_norm": 1.7547342777252197, |
| "learning_rate": 1e-05, |
| "loss": 0.3156, |
| "step": 133400 |
| }, |
| { |
| "epoch": 0.001335, |
| "grad_norm": 1.4350770711898804, |
| "learning_rate": 1e-05, |
| "loss": 0.3104, |
| "step": 133500 |
| }, |
| { |
| "epoch": 0.001336, |
| "grad_norm": 1.7420467138290405, |
| "learning_rate": 1e-05, |
| "loss": 0.3268, |
| "step": 133600 |
| }, |
| { |
| "epoch": 0.001337, |
| "grad_norm": 1.7873330116271973, |
| "learning_rate": 1e-05, |
| "loss": 0.3225, |
| "step": 133700 |
| }, |
| { |
| "epoch": 0.001338, |
| "grad_norm": 1.7741400003433228, |
| "learning_rate": 1e-05, |
| "loss": 0.322, |
| "step": 133800 |
| }, |
| { |
| "epoch": 0.001339, |
| "grad_norm": 1.6877092123031616, |
| "learning_rate": 1e-05, |
| "loss": 0.3209, |
| "step": 133900 |
| }, |
| { |
| "epoch": 0.00134, |
| "grad_norm": 1.7802987098693848, |
| "learning_rate": 1e-05, |
| "loss": 0.3206, |
| "step": 134000 |
| }, |
| { |
| "epoch": 0.001341, |
| "grad_norm": 1.8283404111862183, |
| "learning_rate": 1e-05, |
| "loss": 0.3256, |
| "step": 134100 |
| }, |
| { |
| "epoch": 0.001342, |
| "grad_norm": 1.5722459554672241, |
| "learning_rate": 1e-05, |
| "loss": 0.3163, |
| "step": 134200 |
| }, |
| { |
| "epoch": 0.001343, |
| "grad_norm": 1.7147879600524902, |
| "learning_rate": 1e-05, |
| "loss": 0.3161, |
| "step": 134300 |
| }, |
| { |
| "epoch": 0.001344, |
| "grad_norm": 1.8033101558685303, |
| "learning_rate": 1e-05, |
| "loss": 0.3146, |
| "step": 134400 |
| }, |
| { |
| "epoch": 0.001345, |
| "grad_norm": 1.5288691520690918, |
| "learning_rate": 1e-05, |
| "loss": 0.3225, |
| "step": 134500 |
| }, |
| { |
| "epoch": 0.001346, |
| "grad_norm": 1.6482738256454468, |
| "learning_rate": 1e-05, |
| "loss": 0.3136, |
| "step": 134600 |
| }, |
| { |
| "epoch": 0.001347, |
| "grad_norm": 1.5121197700500488, |
| "learning_rate": 1e-05, |
| "loss": 0.3229, |
| "step": 134700 |
| }, |
| { |
| "epoch": 0.001348, |
| "grad_norm": 1.602310061454773, |
| "learning_rate": 1e-05, |
| "loss": 0.3183, |
| "step": 134800 |
| }, |
| { |
| "epoch": 0.001349, |
| "grad_norm": 1.596355676651001, |
| "learning_rate": 1e-05, |
| "loss": 0.3099, |
| "step": 134900 |
| }, |
| { |
| "epoch": 0.00135, |
| "grad_norm": 2.1672589778900146, |
| "learning_rate": 1e-05, |
| "loss": 0.3183, |
| "step": 135000 |
| }, |
| { |
| "epoch": 0.001351, |
| "grad_norm": 1.529840350151062, |
| "learning_rate": 1e-05, |
| "loss": 0.322, |
| "step": 135100 |
| }, |
| { |
| "epoch": 0.001352, |
| "grad_norm": 1.48147714138031, |
| "learning_rate": 1e-05, |
| "loss": 0.3178, |
| "step": 135200 |
| }, |
| { |
| "epoch": 0.001353, |
| "grad_norm": 1.488888144493103, |
| "learning_rate": 1e-05, |
| "loss": 0.3121, |
| "step": 135300 |
| }, |
| { |
| "epoch": 0.001354, |
| "grad_norm": 1.3818961381912231, |
| "learning_rate": 1e-05, |
| "loss": 0.325, |
| "step": 135400 |
| }, |
| { |
| "epoch": 0.001355, |
| "grad_norm": 1.6328446865081787, |
| "learning_rate": 1e-05, |
| "loss": 0.3189, |
| "step": 135500 |
| }, |
| { |
| "epoch": 0.001356, |
| "grad_norm": 1.7009713649749756, |
| "learning_rate": 1e-05, |
| "loss": 0.3188, |
| "step": 135600 |
| }, |
| { |
| "epoch": 0.001357, |
| "grad_norm": 1.5784467458724976, |
| "learning_rate": 1e-05, |
| "loss": 0.3221, |
| "step": 135700 |
| }, |
| { |
| "epoch": 0.001358, |
| "grad_norm": 1.5164488554000854, |
| "learning_rate": 1e-05, |
| "loss": 0.3105, |
| "step": 135800 |
| }, |
| { |
| "epoch": 0.001359, |
| "grad_norm": 1.668848991394043, |
| "learning_rate": 1e-05, |
| "loss": 0.3104, |
| "step": 135900 |
| }, |
| { |
| "epoch": 0.00136, |
| "grad_norm": 1.978113055229187, |
| "learning_rate": 1e-05, |
| "loss": 0.3151, |
| "step": 136000 |
| }, |
| { |
| "epoch": 0.001361, |
| "grad_norm": 1.6246618032455444, |
| "learning_rate": 1e-05, |
| "loss": 0.3165, |
| "step": 136100 |
| }, |
| { |
| "epoch": 0.001362, |
| "grad_norm": 1.5396642684936523, |
| "learning_rate": 1e-05, |
| "loss": 0.3154, |
| "step": 136200 |
| }, |
| { |
| "epoch": 0.001363, |
| "grad_norm": 1.660476565361023, |
| "learning_rate": 1e-05, |
| "loss": 0.313, |
| "step": 136300 |
| }, |
| { |
| "epoch": 0.001364, |
| "grad_norm": 1.4768214225769043, |
| "learning_rate": 1e-05, |
| "loss": 0.3171, |
| "step": 136400 |
| }, |
| { |
| "epoch": 0.001365, |
| "grad_norm": 1.7115814685821533, |
| "learning_rate": 1e-05, |
| "loss": 0.3133, |
| "step": 136500 |
| }, |
| { |
| "epoch": 0.001366, |
| "grad_norm": 1.6259845495224, |
| "learning_rate": 1e-05, |
| "loss": 0.3153, |
| "step": 136600 |
| }, |
| { |
| "epoch": 0.001367, |
| "grad_norm": 1.5243622064590454, |
| "learning_rate": 1e-05, |
| "loss": 0.3151, |
| "step": 136700 |
| }, |
| { |
| "epoch": 0.001368, |
| "grad_norm": 1.8191895484924316, |
| "learning_rate": 1e-05, |
| "loss": 0.3229, |
| "step": 136800 |
| }, |
| { |
| "epoch": 0.001369, |
| "grad_norm": 1.5113294124603271, |
| "learning_rate": 1e-05, |
| "loss": 0.317, |
| "step": 136900 |
| }, |
| { |
| "epoch": 0.00137, |
| "grad_norm": 1.46476149559021, |
| "learning_rate": 1e-05, |
| "loss": 0.3156, |
| "step": 137000 |
| }, |
| { |
| "epoch": 0.001371, |
| "grad_norm": 1.4856374263763428, |
| "learning_rate": 1e-05, |
| "loss": 0.3116, |
| "step": 137100 |
| }, |
| { |
| "epoch": 0.001372, |
| "grad_norm": 1.6964186429977417, |
| "learning_rate": 1e-05, |
| "loss": 0.3155, |
| "step": 137200 |
| }, |
| { |
| "epoch": 0.001373, |
| "grad_norm": 1.8333369493484497, |
| "learning_rate": 1e-05, |
| "loss": 0.3142, |
| "step": 137300 |
| }, |
| { |
| "epoch": 0.001374, |
| "grad_norm": 1.6640217304229736, |
| "learning_rate": 1e-05, |
| "loss": 0.3177, |
| "step": 137400 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 1.566697597503662, |
| "learning_rate": 1e-05, |
| "loss": 0.3114, |
| "step": 137500 |
| }, |
| { |
| "epoch": 0.001376, |
| "grad_norm": 1.9087796211242676, |
| "learning_rate": 1e-05, |
| "loss": 0.3106, |
| "step": 137600 |
| }, |
| { |
| "epoch": 0.001377, |
| "grad_norm": 1.7321326732635498, |
| "learning_rate": 1e-05, |
| "loss": 0.3171, |
| "step": 137700 |
| }, |
| { |
| "epoch": 0.001378, |
| "grad_norm": 1.8865094184875488, |
| "learning_rate": 1e-05, |
| "loss": 0.3154, |
| "step": 137800 |
| }, |
| { |
| "epoch": 0.001379, |
| "grad_norm": 1.7816592454910278, |
| "learning_rate": 1e-05, |
| "loss": 0.3079, |
| "step": 137900 |
| }, |
| { |
| "epoch": 0.00138, |
| "grad_norm": 1.442036509513855, |
| "learning_rate": 1e-05, |
| "loss": 0.3102, |
| "step": 138000 |
| }, |
| { |
| "epoch": 0.001381, |
| "grad_norm": 1.6105217933654785, |
| "learning_rate": 1e-05, |
| "loss": 0.3145, |
| "step": 138100 |
| }, |
| { |
| "epoch": 0.001382, |
| "grad_norm": 1.597864031791687, |
| "learning_rate": 1e-05, |
| "loss": 0.3116, |
| "step": 138200 |
| }, |
| { |
| "epoch": 0.001383, |
| "grad_norm": 1.888089656829834, |
| "learning_rate": 1e-05, |
| "loss": 0.3215, |
| "step": 138300 |
| }, |
| { |
| "epoch": 0.001384, |
| "grad_norm": 1.372367262840271, |
| "learning_rate": 1e-05, |
| "loss": 0.3121, |
| "step": 138400 |
| }, |
| { |
| "epoch": 0.001385, |
| "grad_norm": 1.7237030267715454, |
| "learning_rate": 1e-05, |
| "loss": 0.3066, |
| "step": 138500 |
| }, |
| { |
| "epoch": 0.001386, |
| "grad_norm": 1.8450541496276855, |
| "learning_rate": 1e-05, |
| "loss": 0.3197, |
| "step": 138600 |
| }, |
| { |
| "epoch": 0.001387, |
| "grad_norm": 1.6042810678482056, |
| "learning_rate": 1e-05, |
| "loss": 0.3143, |
| "step": 138700 |
| }, |
| { |
| "epoch": 0.001388, |
| "grad_norm": 1.914825677871704, |
| "learning_rate": 1e-05, |
| "loss": 0.3109, |
| "step": 138800 |
| }, |
| { |
| "epoch": 0.001389, |
| "grad_norm": 1.810681939125061, |
| "learning_rate": 1e-05, |
| "loss": 0.3207, |
| "step": 138900 |
| }, |
| { |
| "epoch": 0.00139, |
| "grad_norm": 1.6505619287490845, |
| "learning_rate": 1e-05, |
| "loss": 0.3112, |
| "step": 139000 |
| }, |
| { |
| "epoch": 0.001391, |
| "grad_norm": 1.5920408964157104, |
| "learning_rate": 1e-05, |
| "loss": 0.3143, |
| "step": 139100 |
| }, |
| { |
| "epoch": 0.001392, |
| "grad_norm": 1.603387713432312, |
| "learning_rate": 1e-05, |
| "loss": 0.3121, |
| "step": 139200 |
| }, |
| { |
| "epoch": 0.001393, |
| "grad_norm": 1.7222926616668701, |
| "learning_rate": 1e-05, |
| "loss": 0.3161, |
| "step": 139300 |
| }, |
| { |
| "epoch": 0.001394, |
| "grad_norm": 1.6358146667480469, |
| "learning_rate": 1e-05, |
| "loss": 0.3083, |
| "step": 139400 |
| }, |
| { |
| "epoch": 0.001395, |
| "grad_norm": 2.0886547565460205, |
| "learning_rate": 1e-05, |
| "loss": 0.3215, |
| "step": 139500 |
| }, |
| { |
| "epoch": 0.001396, |
| "grad_norm": 1.68631911277771, |
| "learning_rate": 1e-05, |
| "loss": 0.3082, |
| "step": 139600 |
| }, |
| { |
| "epoch": 0.001397, |
| "grad_norm": 1.5744168758392334, |
| "learning_rate": 1e-05, |
| "loss": 0.315, |
| "step": 139700 |
| }, |
| { |
| "epoch": 0.001398, |
| "grad_norm": 1.7238872051239014, |
| "learning_rate": 1e-05, |
| "loss": 0.3151, |
| "step": 139800 |
| }, |
| { |
| "epoch": 0.001399, |
| "grad_norm": 1.6450138092041016, |
| "learning_rate": 1e-05, |
| "loss": 0.3129, |
| "step": 139900 |
| }, |
| { |
| "epoch": 0.0014, |
| "grad_norm": 1.4830751419067383, |
| "learning_rate": 1e-05, |
| "loss": 0.3164, |
| "step": 140000 |
| }, |
| { |
| "epoch": 0.0014, |
| "eval_loss": 0.287109375, |
| "eval_runtime": 109.9314, |
| "eval_samples_per_second": 454.829, |
| "eval_steps_per_second": 28.427, |
| "step": 140000 |
| }, |
| { |
| "epoch": 0.001401, |
| "grad_norm": 1.7286059856414795, |
| "learning_rate": 1e-05, |
| "loss": 0.311, |
| "step": 140100 |
| }, |
| { |
| "epoch": 0.001402, |
| "grad_norm": 1.7702032327651978, |
| "learning_rate": 1e-05, |
| "loss": 0.326, |
| "step": 140200 |
| }, |
| { |
| "epoch": 0.001403, |
| "grad_norm": 1.6898577213287354, |
| "learning_rate": 1e-05, |
| "loss": 0.3216, |
| "step": 140300 |
| }, |
| { |
| "epoch": 0.001404, |
| "grad_norm": 1.6577975749969482, |
| "learning_rate": 1e-05, |
| "loss": 0.3145, |
| "step": 140400 |
| }, |
| { |
| "epoch": 0.001405, |
| "grad_norm": 1.444854736328125, |
| "learning_rate": 1e-05, |
| "loss": 0.3216, |
| "step": 140500 |
| }, |
| { |
| "epoch": 0.001406, |
| "grad_norm": 1.6251206398010254, |
| "learning_rate": 1e-05, |
| "loss": 0.3108, |
| "step": 140600 |
| }, |
| { |
| "epoch": 0.001407, |
| "grad_norm": 1.4880505800247192, |
| "learning_rate": 1e-05, |
| "loss": 0.3131, |
| "step": 140700 |
| }, |
| { |
| "epoch": 0.001408, |
| "grad_norm": 1.8925511837005615, |
| "learning_rate": 1e-05, |
| "loss": 0.3117, |
| "step": 140800 |
| }, |
| { |
| "epoch": 0.001409, |
| "grad_norm": 1.6990015506744385, |
| "learning_rate": 1e-05, |
| "loss": 0.3101, |
| "step": 140900 |
| }, |
| { |
| "epoch": 0.00141, |
| "grad_norm": 1.498661756515503, |
| "learning_rate": 1e-05, |
| "loss": 0.3082, |
| "step": 141000 |
| }, |
| { |
| "epoch": 0.001411, |
| "grad_norm": 1.7527713775634766, |
| "learning_rate": 1e-05, |
| "loss": 0.3178, |
| "step": 141100 |
| }, |
| { |
| "epoch": 0.001412, |
| "grad_norm": 1.6200438737869263, |
| "learning_rate": 1e-05, |
| "loss": 0.3228, |
| "step": 141200 |
| }, |
| { |
| "epoch": 0.001413, |
| "grad_norm": 1.3735147714614868, |
| "learning_rate": 1e-05, |
| "loss": 0.3124, |
| "step": 141300 |
| }, |
| { |
| "epoch": 0.001414, |
| "grad_norm": 2.0076656341552734, |
| "learning_rate": 1e-05, |
| "loss": 0.3119, |
| "step": 141400 |
| }, |
| { |
| "epoch": 0.001415, |
| "grad_norm": 1.5308282375335693, |
| "learning_rate": 1e-05, |
| "loss": 0.321, |
| "step": 141500 |
| }, |
| { |
| "epoch": 0.001416, |
| "grad_norm": 1.5367777347564697, |
| "learning_rate": 1e-05, |
| "loss": 0.3064, |
| "step": 141600 |
| }, |
| { |
| "epoch": 0.001417, |
| "grad_norm": 1.64597749710083, |
| "learning_rate": 1e-05, |
| "loss": 0.3116, |
| "step": 141700 |
| }, |
| { |
| "epoch": 0.001418, |
| "grad_norm": 1.976902723312378, |
| "learning_rate": 1e-05, |
| "loss": 0.3187, |
| "step": 141800 |
| }, |
| { |
| "epoch": 0.001419, |
| "grad_norm": 1.8003846406936646, |
| "learning_rate": 1e-05, |
| "loss": 0.3106, |
| "step": 141900 |
| }, |
| { |
| "epoch": 0.00142, |
| "grad_norm": 2.8026585578918457, |
| "learning_rate": 1e-05, |
| "loss": 0.3186, |
| "step": 142000 |
| }, |
| { |
| "epoch": 0.001421, |
| "grad_norm": 3.536267042160034, |
| "learning_rate": 1e-05, |
| "loss": 0.3104, |
| "step": 142100 |
| }, |
| { |
| "epoch": 0.001422, |
| "grad_norm": 1.5805248022079468, |
| "learning_rate": 1e-05, |
| "loss": 0.3136, |
| "step": 142200 |
| }, |
| { |
| "epoch": 0.001423, |
| "grad_norm": 1.5467828512191772, |
| "learning_rate": 1e-05, |
| "loss": 0.3102, |
| "step": 142300 |
| }, |
| { |
| "epoch": 0.001424, |
| "grad_norm": 1.5788090229034424, |
| "learning_rate": 1e-05, |
| "loss": 0.3153, |
| "step": 142400 |
| }, |
| { |
| "epoch": 0.001425, |
| "grad_norm": 1.8822991847991943, |
| "learning_rate": 1e-05, |
| "loss": 0.3124, |
| "step": 142500 |
| }, |
| { |
| "epoch": 0.001426, |
| "grad_norm": 1.547636866569519, |
| "learning_rate": 1e-05, |
| "loss": 0.3141, |
| "step": 142600 |
| }, |
| { |
| "epoch": 0.001427, |
| "grad_norm": 1.628211498260498, |
| "learning_rate": 1e-05, |
| "loss": 0.3182, |
| "step": 142700 |
| }, |
| { |
| "epoch": 0.001428, |
| "grad_norm": 1.645572304725647, |
| "learning_rate": 1e-05, |
| "loss": 0.3106, |
| "step": 142800 |
| }, |
| { |
| "epoch": 0.001429, |
| "grad_norm": 1.5614272356033325, |
| "learning_rate": 1e-05, |
| "loss": 0.3113, |
| "step": 142900 |
| }, |
| { |
| "epoch": 0.00143, |
| "grad_norm": 1.6089304685592651, |
| "learning_rate": 1e-05, |
| "loss": 0.3115, |
| "step": 143000 |
| }, |
| { |
| "epoch": 0.001431, |
| "grad_norm": 1.7061288356781006, |
| "learning_rate": 1e-05, |
| "loss": 0.3097, |
| "step": 143100 |
| }, |
| { |
| "epoch": 0.001432, |
| "grad_norm": 1.629626989364624, |
| "learning_rate": 1e-05, |
| "loss": 0.3127, |
| "step": 143200 |
| }, |
| { |
| "epoch": 0.001433, |
| "grad_norm": 1.5930266380310059, |
| "learning_rate": 1e-05, |
| "loss": 0.3136, |
| "step": 143300 |
| }, |
| { |
| "epoch": 0.001434, |
| "grad_norm": 1.6816060543060303, |
| "learning_rate": 1e-05, |
| "loss": 0.3171, |
| "step": 143400 |
| }, |
| { |
| "epoch": 0.001435, |
| "grad_norm": 1.6803059577941895, |
| "learning_rate": 1e-05, |
| "loss": 0.3162, |
| "step": 143500 |
| }, |
| { |
| "epoch": 0.001436, |
| "grad_norm": 1.4301313161849976, |
| "learning_rate": 1e-05, |
| "loss": 0.3009, |
| "step": 143600 |
| }, |
| { |
| "epoch": 0.001437, |
| "grad_norm": 1.407421588897705, |
| "learning_rate": 1e-05, |
| "loss": 0.3078, |
| "step": 143700 |
| }, |
| { |
| "epoch": 0.001438, |
| "grad_norm": 1.6475402116775513, |
| "learning_rate": 1e-05, |
| "loss": 0.3088, |
| "step": 143800 |
| }, |
| { |
| "epoch": 0.001439, |
| "grad_norm": 1.5251747369766235, |
| "learning_rate": 1e-05, |
| "loss": 0.3086, |
| "step": 143900 |
| }, |
| { |
| "epoch": 0.00144, |
| "grad_norm": 1.5712449550628662, |
| "learning_rate": 1e-05, |
| "loss": 0.3117, |
| "step": 144000 |
| }, |
| { |
| "epoch": 0.001441, |
| "grad_norm": 1.6450409889221191, |
| "learning_rate": 1e-05, |
| "loss": 0.3131, |
| "step": 144100 |
| }, |
| { |
| "epoch": 0.001442, |
| "grad_norm": 1.451005458831787, |
| "learning_rate": 1e-05, |
| "loss": 0.3117, |
| "step": 144200 |
| }, |
| { |
| "epoch": 0.001443, |
| "grad_norm": 1.5392875671386719, |
| "learning_rate": 1e-05, |
| "loss": 0.3186, |
| "step": 144300 |
| }, |
| { |
| "epoch": 0.001444, |
| "grad_norm": 1.8175650835037231, |
| "learning_rate": 1e-05, |
| "loss": 0.3183, |
| "step": 144400 |
| }, |
| { |
| "epoch": 0.001445, |
| "grad_norm": 1.506216287612915, |
| "learning_rate": 1e-05, |
| "loss": 0.318, |
| "step": 144500 |
| }, |
| { |
| "epoch": 0.001446, |
| "grad_norm": 1.4329332113265991, |
| "learning_rate": 1e-05, |
| "loss": 0.3107, |
| "step": 144600 |
| }, |
| { |
| "epoch": 0.001447, |
| "grad_norm": 1.6957765817642212, |
| "learning_rate": 1e-05, |
| "loss": 0.309, |
| "step": 144700 |
| }, |
| { |
| "epoch": 0.001448, |
| "grad_norm": 1.3159312009811401, |
| "learning_rate": 1e-05, |
| "loss": 0.313, |
| "step": 144800 |
| }, |
| { |
| "epoch": 0.001449, |
| "grad_norm": 1.6114338636398315, |
| "learning_rate": 1e-05, |
| "loss": 0.3096, |
| "step": 144900 |
| }, |
| { |
| "epoch": 0.00145, |
| "grad_norm": 1.8013079166412354, |
| "learning_rate": 1e-05, |
| "loss": 0.3154, |
| "step": 145000 |
| }, |
| { |
| "epoch": 0.001451, |
| "grad_norm": 1.7098653316497803, |
| "learning_rate": 1e-05, |
| "loss": 0.3091, |
| "step": 145100 |
| }, |
| { |
| "epoch": 0.001452, |
| "grad_norm": 1.5512733459472656, |
| "learning_rate": 1e-05, |
| "loss": 0.3109, |
| "step": 145200 |
| }, |
| { |
| "epoch": 0.001453, |
| "grad_norm": 1.725237488746643, |
| "learning_rate": 1e-05, |
| "loss": 0.3048, |
| "step": 145300 |
| }, |
| { |
| "epoch": 0.001454, |
| "grad_norm": 4.254234313964844, |
| "learning_rate": 1e-05, |
| "loss": 0.3154, |
| "step": 145400 |
| }, |
| { |
| "epoch": 0.001455, |
| "grad_norm": 1.7910503149032593, |
| "learning_rate": 1e-05, |
| "loss": 0.3085, |
| "step": 145500 |
| }, |
| { |
| "epoch": 0.001456, |
| "grad_norm": 1.4521243572235107, |
| "learning_rate": 1e-05, |
| "loss": 0.316, |
| "step": 145600 |
| }, |
| { |
| "epoch": 0.001457, |
| "grad_norm": 1.7298940420150757, |
| "learning_rate": 1e-05, |
| "loss": 0.3052, |
| "step": 145700 |
| }, |
| { |
| "epoch": 0.001458, |
| "grad_norm": 1.7451497316360474, |
| "learning_rate": 1e-05, |
| "loss": 0.3111, |
| "step": 145800 |
| }, |
| { |
| "epoch": 0.001459, |
| "grad_norm": 1.7721543312072754, |
| "learning_rate": 1e-05, |
| "loss": 0.3124, |
| "step": 145900 |
| }, |
| { |
| "epoch": 0.00146, |
| "grad_norm": 2.228154182434082, |
| "learning_rate": 1e-05, |
| "loss": 0.3114, |
| "step": 146000 |
| }, |
| { |
| "epoch": 0.001461, |
| "grad_norm": 1.6171804666519165, |
| "learning_rate": 1e-05, |
| "loss": 0.3028, |
| "step": 146100 |
| }, |
| { |
| "epoch": 0.001462, |
| "grad_norm": 1.8199244737625122, |
| "learning_rate": 1e-05, |
| "loss": 0.3165, |
| "step": 146200 |
| }, |
| { |
| "epoch": 0.001463, |
| "grad_norm": 1.5894031524658203, |
| "learning_rate": 1e-05, |
| "loss": 0.3069, |
| "step": 146300 |
| }, |
| { |
| "epoch": 0.001464, |
| "grad_norm": 1.5978094339370728, |
| "learning_rate": 1e-05, |
| "loss": 0.3092, |
| "step": 146400 |
| }, |
| { |
| "epoch": 0.001465, |
| "grad_norm": 1.4421255588531494, |
| "learning_rate": 1e-05, |
| "loss": 0.312, |
| "step": 146500 |
| }, |
| { |
| "epoch": 0.001466, |
| "grad_norm": 1.6096898317337036, |
| "learning_rate": 1e-05, |
| "loss": 0.3049, |
| "step": 146600 |
| }, |
| { |
| "epoch": 0.001467, |
| "grad_norm": 1.5457234382629395, |
| "learning_rate": 1e-05, |
| "loss": 0.3096, |
| "step": 146700 |
| }, |
| { |
| "epoch": 0.001468, |
| "grad_norm": 1.6169909238815308, |
| "learning_rate": 1e-05, |
| "loss": 0.3086, |
| "step": 146800 |
| }, |
| { |
| "epoch": 0.001469, |
| "grad_norm": 1.7878210544586182, |
| "learning_rate": 1e-05, |
| "loss": 0.315, |
| "step": 146900 |
| }, |
| { |
| "epoch": 0.00147, |
| "grad_norm": 6.46168327331543, |
| "learning_rate": 1e-05, |
| "loss": 0.305, |
| "step": 147000 |
| }, |
| { |
| "epoch": 0.001471, |
| "grad_norm": 1.6731548309326172, |
| "learning_rate": 1e-05, |
| "loss": 0.3102, |
| "step": 147100 |
| }, |
| { |
| "epoch": 0.001472, |
| "grad_norm": 1.9476267099380493, |
| "learning_rate": 1e-05, |
| "loss": 0.3122, |
| "step": 147200 |
| }, |
| { |
| "epoch": 0.001473, |
| "grad_norm": 1.6077353954315186, |
| "learning_rate": 1e-05, |
| "loss": 0.3084, |
| "step": 147300 |
| }, |
| { |
| "epoch": 0.001474, |
| "grad_norm": 1.7394909858703613, |
| "learning_rate": 1e-05, |
| "loss": 0.3063, |
| "step": 147400 |
| }, |
| { |
| "epoch": 0.001475, |
| "grad_norm": 1.4782484769821167, |
| "learning_rate": 1e-05, |
| "loss": 0.3105, |
| "step": 147500 |
| }, |
| { |
| "epoch": 0.001476, |
| "grad_norm": 1.6981760263442993, |
| "learning_rate": 1e-05, |
| "loss": 0.3119, |
| "step": 147600 |
| }, |
| { |
| "epoch": 0.001477, |
| "grad_norm": 1.697596549987793, |
| "learning_rate": 1e-05, |
| "loss": 0.3132, |
| "step": 147700 |
| }, |
| { |
| "epoch": 0.001478, |
| "grad_norm": 1.630706787109375, |
| "learning_rate": 1e-05, |
| "loss": 0.3147, |
| "step": 147800 |
| }, |
| { |
| "epoch": 0.001479, |
| "grad_norm": 1.685257077217102, |
| "learning_rate": 1e-05, |
| "loss": 0.3096, |
| "step": 147900 |
| }, |
| { |
| "epoch": 0.00148, |
| "grad_norm": 1.8165167570114136, |
| "learning_rate": 1e-05, |
| "loss": 0.307, |
| "step": 148000 |
| }, |
| { |
| "epoch": 0.001481, |
| "grad_norm": 1.6408799886703491, |
| "learning_rate": 1e-05, |
| "loss": 0.3111, |
| "step": 148100 |
| }, |
| { |
| "epoch": 0.001482, |
| "grad_norm": 1.6128547191619873, |
| "learning_rate": 1e-05, |
| "loss": 0.3083, |
| "step": 148200 |
| }, |
| { |
| "epoch": 0.001483, |
| "grad_norm": 1.8511683940887451, |
| "learning_rate": 1e-05, |
| "loss": 0.3014, |
| "step": 148300 |
| }, |
| { |
| "epoch": 0.001484, |
| "grad_norm": 1.576206922531128, |
| "learning_rate": 1e-05, |
| "loss": 0.309, |
| "step": 148400 |
| }, |
| { |
| "epoch": 0.001485, |
| "grad_norm": 1.543514370918274, |
| "learning_rate": 1e-05, |
| "loss": 0.3093, |
| "step": 148500 |
| }, |
| { |
| "epoch": 0.001486, |
| "grad_norm": 1.5939360857009888, |
| "learning_rate": 1e-05, |
| "loss": 0.3105, |
| "step": 148600 |
| }, |
| { |
| "epoch": 0.001487, |
| "grad_norm": 1.4022550582885742, |
| "learning_rate": 1e-05, |
| "loss": 0.311, |
| "step": 148700 |
| }, |
| { |
| "epoch": 0.001488, |
| "grad_norm": 1.8196625709533691, |
| "learning_rate": 1e-05, |
| "loss": 0.3119, |
| "step": 148800 |
| }, |
| { |
| "epoch": 0.001489, |
| "grad_norm": 1.6308430433273315, |
| "learning_rate": 1e-05, |
| "loss": 0.3042, |
| "step": 148900 |
| }, |
| { |
| "epoch": 0.00149, |
| "grad_norm": 1.6367475986480713, |
| "learning_rate": 1e-05, |
| "loss": 0.3142, |
| "step": 149000 |
| }, |
| { |
| "epoch": 0.001491, |
| "grad_norm": 1.5516581535339355, |
| "learning_rate": 1e-05, |
| "loss": 0.3069, |
| "step": 149100 |
| }, |
| { |
| "epoch": 0.001492, |
| "grad_norm": 1.524357557296753, |
| "learning_rate": 1e-05, |
| "loss": 0.3031, |
| "step": 149200 |
| }, |
| { |
| "epoch": 0.001493, |
| "grad_norm": 1.6461905241012573, |
| "learning_rate": 1e-05, |
| "loss": 0.3083, |
| "step": 149300 |
| }, |
| { |
| "epoch": 0.001494, |
| "grad_norm": 1.8886070251464844, |
| "learning_rate": 1e-05, |
| "loss": 0.3137, |
| "step": 149400 |
| }, |
| { |
| "epoch": 0.001495, |
| "grad_norm": 1.7399191856384277, |
| "learning_rate": 1e-05, |
| "loss": 0.3098, |
| "step": 149500 |
| }, |
| { |
| "epoch": 0.001496, |
| "grad_norm": 1.7540628910064697, |
| "learning_rate": 1e-05, |
| "loss": 0.3155, |
| "step": 149600 |
| }, |
| { |
| "epoch": 0.001497, |
| "grad_norm": 1.6544119119644165, |
| "learning_rate": 1e-05, |
| "loss": 0.3034, |
| "step": 149700 |
| }, |
| { |
| "epoch": 0.001498, |
| "grad_norm": 1.6985324621200562, |
| "learning_rate": 1e-05, |
| "loss": 0.3041, |
| "step": 149800 |
| }, |
| { |
| "epoch": 0.001499, |
| "grad_norm": 1.6984387636184692, |
| "learning_rate": 1e-05, |
| "loss": 0.3041, |
| "step": 149900 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 1.5459750890731812, |
| "learning_rate": 1e-05, |
| "loss": 0.3055, |
| "step": 150000 |
| }, |
| { |
| "epoch": 0.001501, |
| "grad_norm": 1.6217613220214844, |
| "learning_rate": 1e-05, |
| "loss": 0.3083, |
| "step": 150100 |
| }, |
| { |
| "epoch": 0.001502, |
| "grad_norm": 1.6481753587722778, |
| "learning_rate": 1e-05, |
| "loss": 0.3045, |
| "step": 150200 |
| }, |
| { |
| "epoch": 0.001503, |
| "grad_norm": 1.5565217733383179, |
| "learning_rate": 1e-05, |
| "loss": 0.3099, |
| "step": 150300 |
| }, |
| { |
| "epoch": 0.001504, |
| "grad_norm": 1.678059458732605, |
| "learning_rate": 1e-05, |
| "loss": 0.3116, |
| "step": 150400 |
| }, |
| { |
| "epoch": 0.001505, |
| "grad_norm": 1.6894927024841309, |
| "learning_rate": 1e-05, |
| "loss": 0.3101, |
| "step": 150500 |
| }, |
| { |
| "epoch": 0.001506, |
| "grad_norm": 1.5071243047714233, |
| "learning_rate": 1e-05, |
| "loss": 0.3049, |
| "step": 150600 |
| }, |
| { |
| "epoch": 0.001507, |
| "grad_norm": 1.6531084775924683, |
| "learning_rate": 1e-05, |
| "loss": 0.31, |
| "step": 150700 |
| }, |
| { |
| "epoch": 0.001508, |
| "grad_norm": 1.5029364824295044, |
| "learning_rate": 1e-05, |
| "loss": 0.2991, |
| "step": 150800 |
| }, |
| { |
| "epoch": 0.001509, |
| "grad_norm": 1.687752366065979, |
| "learning_rate": 1e-05, |
| "loss": 0.3079, |
| "step": 150900 |
| }, |
| { |
| "epoch": 0.00151, |
| "grad_norm": 1.485236406326294, |
| "learning_rate": 1e-05, |
| "loss": 0.2977, |
| "step": 151000 |
| }, |
| { |
| "epoch": 0.001511, |
| "grad_norm": 1.5481332540512085, |
| "learning_rate": 1e-05, |
| "loss": 0.3076, |
| "step": 151100 |
| }, |
| { |
| "epoch": 0.001512, |
| "grad_norm": 1.5191718339920044, |
| "learning_rate": 1e-05, |
| "loss": 0.302, |
| "step": 151200 |
| }, |
| { |
| "epoch": 0.001513, |
| "grad_norm": 1.6339939832687378, |
| "learning_rate": 1e-05, |
| "loss": 0.3012, |
| "step": 151300 |
| }, |
| { |
| "epoch": 0.001514, |
| "grad_norm": 1.5735626220703125, |
| "learning_rate": 1e-05, |
| "loss": 0.3124, |
| "step": 151400 |
| }, |
| { |
| "epoch": 0.001515, |
| "grad_norm": 1.6196351051330566, |
| "learning_rate": 1e-05, |
| "loss": 0.3135, |
| "step": 151500 |
| }, |
| { |
| "epoch": 0.001516, |
| "grad_norm": 1.6257820129394531, |
| "learning_rate": 1e-05, |
| "loss": 0.3034, |
| "step": 151600 |
| }, |
| { |
| "epoch": 0.001517, |
| "grad_norm": 1.5410822629928589, |
| "learning_rate": 1e-05, |
| "loss": 0.3014, |
| "step": 151700 |
| }, |
| { |
| "epoch": 0.001518, |
| "grad_norm": 1.7772650718688965, |
| "learning_rate": 1e-05, |
| "loss": 0.3023, |
| "step": 151800 |
| }, |
| { |
| "epoch": 0.001519, |
| "grad_norm": 1.683762788772583, |
| "learning_rate": 1e-05, |
| "loss": 0.3003, |
| "step": 151900 |
| }, |
| { |
| "epoch": 0.00152, |
| "grad_norm": 2.5986571311950684, |
| "learning_rate": 1e-05, |
| "loss": 0.3066, |
| "step": 152000 |
| }, |
| { |
| "epoch": 0.001521, |
| "grad_norm": 1.802271842956543, |
| "learning_rate": 1e-05, |
| "loss": 0.3066, |
| "step": 152100 |
| }, |
| { |
| "epoch": 0.001522, |
| "grad_norm": 1.8014322519302368, |
| "learning_rate": 1e-05, |
| "loss": 0.3039, |
| "step": 152200 |
| }, |
| { |
| "epoch": 0.001523, |
| "grad_norm": 3.4917097091674805, |
| "learning_rate": 1e-05, |
| "loss": 0.3077, |
| "step": 152300 |
| }, |
| { |
| "epoch": 0.001524, |
| "grad_norm": 1.3952269554138184, |
| "learning_rate": 1e-05, |
| "loss": 0.3063, |
| "step": 152400 |
| }, |
| { |
| "epoch": 0.001525, |
| "grad_norm": 1.4687715768814087, |
| "learning_rate": 1e-05, |
| "loss": 0.3133, |
| "step": 152500 |
| }, |
| { |
| "epoch": 0.001526, |
| "grad_norm": 1.571907877922058, |
| "learning_rate": 1e-05, |
| "loss": 0.3053, |
| "step": 152600 |
| }, |
| { |
| "epoch": 0.001527, |
| "grad_norm": 1.6538187265396118, |
| "learning_rate": 1e-05, |
| "loss": 0.3025, |
| "step": 152700 |
| }, |
| { |
| "epoch": 0.001528, |
| "grad_norm": 1.332327961921692, |
| "learning_rate": 1e-05, |
| "loss": 0.3036, |
| "step": 152800 |
| }, |
| { |
| "epoch": 0.001529, |
| "grad_norm": 1.9874423742294312, |
| "learning_rate": 1e-05, |
| "loss": 0.304, |
| "step": 152900 |
| }, |
| { |
| "epoch": 0.00153, |
| "grad_norm": 1.5692553520202637, |
| "learning_rate": 1e-05, |
| "loss": 0.3054, |
| "step": 153000 |
| }, |
| { |
| "epoch": 0.001531, |
| "grad_norm": 1.6490308046340942, |
| "learning_rate": 1e-05, |
| "loss": 0.3094, |
| "step": 153100 |
| }, |
| { |
| "epoch": 0.001532, |
| "grad_norm": 1.4249401092529297, |
| "learning_rate": 1e-05, |
| "loss": 0.3078, |
| "step": 153200 |
| }, |
| { |
| "epoch": 0.001533, |
| "grad_norm": 1.432947039604187, |
| "learning_rate": 1e-05, |
| "loss": 0.3029, |
| "step": 153300 |
| }, |
| { |
| "epoch": 0.001534, |
| "grad_norm": 1.6225773096084595, |
| "learning_rate": 1e-05, |
| "loss": 0.2992, |
| "step": 153400 |
| }, |
| { |
| "epoch": 0.001535, |
| "grad_norm": 1.6100537776947021, |
| "learning_rate": 1e-05, |
| "loss": 0.3072, |
| "step": 153500 |
| }, |
| { |
| "epoch": 0.001536, |
| "grad_norm": 1.6616079807281494, |
| "learning_rate": 1e-05, |
| "loss": 0.3066, |
| "step": 153600 |
| }, |
| { |
| "epoch": 0.001537, |
| "grad_norm": 1.8805843591690063, |
| "learning_rate": 1e-05, |
| "loss": 0.3005, |
| "step": 153700 |
| }, |
| { |
| "epoch": 0.001538, |
| "grad_norm": 1.3726553916931152, |
| "learning_rate": 1e-05, |
| "loss": 0.3055, |
| "step": 153800 |
| }, |
| { |
| "epoch": 0.001539, |
| "grad_norm": 1.6736137866973877, |
| "learning_rate": 1e-05, |
| "loss": 0.3053, |
| "step": 153900 |
| }, |
| { |
| "epoch": 0.00154, |
| "grad_norm": 1.704338550567627, |
| "learning_rate": 1e-05, |
| "loss": 0.3085, |
| "step": 154000 |
| }, |
| { |
| "epoch": 0.001541, |
| "grad_norm": 1.5360641479492188, |
| "learning_rate": 1e-05, |
| "loss": 0.3051, |
| "step": 154100 |
| }, |
| { |
| "epoch": 0.001542, |
| "grad_norm": 1.5851362943649292, |
| "learning_rate": 1e-05, |
| "loss": 0.3098, |
| "step": 154200 |
| }, |
| { |
| "epoch": 0.001543, |
| "grad_norm": 1.5866998434066772, |
| "learning_rate": 1e-05, |
| "loss": 0.3059, |
| "step": 154300 |
| }, |
| { |
| "epoch": 0.001544, |
| "grad_norm": 1.4236769676208496, |
| "learning_rate": 1e-05, |
| "loss": 0.3052, |
| "step": 154400 |
| }, |
| { |
| "epoch": 0.001545, |
| "grad_norm": 1.5114357471466064, |
| "learning_rate": 1e-05, |
| "loss": 0.3013, |
| "step": 154500 |
| }, |
| { |
| "epoch": 0.001546, |
| "grad_norm": 1.5853383541107178, |
| "learning_rate": 1e-05, |
| "loss": 0.3032, |
| "step": 154600 |
| }, |
| { |
| "epoch": 0.001547, |
| "grad_norm": 1.7222450971603394, |
| "learning_rate": 1e-05, |
| "loss": 0.3004, |
| "step": 154700 |
| }, |
| { |
| "epoch": 0.001548, |
| "grad_norm": 1.832231879234314, |
| "learning_rate": 1e-05, |
| "loss": 0.3022, |
| "step": 154800 |
| }, |
| { |
| "epoch": 0.001549, |
| "grad_norm": 1.5499014854431152, |
| "learning_rate": 1e-05, |
| "loss": 0.307, |
| "step": 154900 |
| }, |
| { |
| "epoch": 0.00155, |
| "grad_norm": 1.6969635486602783, |
| "learning_rate": 1e-05, |
| "loss": 0.306, |
| "step": 155000 |
| }, |
| { |
| "epoch": 0.001551, |
| "grad_norm": 1.4039770364761353, |
| "learning_rate": 1e-05, |
| "loss": 0.2953, |
| "step": 155100 |
| }, |
| { |
| "epoch": 0.001552, |
| "grad_norm": 1.4944506883621216, |
| "learning_rate": 1e-05, |
| "loss": 0.3068, |
| "step": 155200 |
| }, |
| { |
| "epoch": 0.001553, |
| "grad_norm": 1.6612184047698975, |
| "learning_rate": 1e-05, |
| "loss": 0.3011, |
| "step": 155300 |
| }, |
| { |
| "epoch": 0.001554, |
| "grad_norm": 1.3576972484588623, |
| "learning_rate": 1e-05, |
| "loss": 0.2988, |
| "step": 155400 |
| }, |
| { |
| "epoch": 0.001555, |
| "grad_norm": 1.514603853225708, |
| "learning_rate": 1e-05, |
| "loss": 0.3004, |
| "step": 155500 |
| }, |
| { |
| "epoch": 0.001556, |
| "grad_norm": 1.7504587173461914, |
| "learning_rate": 1e-05, |
| "loss": 0.3031, |
| "step": 155600 |
| }, |
| { |
| "epoch": 0.001557, |
| "grad_norm": 1.723211407661438, |
| "learning_rate": 1e-05, |
| "loss": 0.3057, |
| "step": 155700 |
| }, |
| { |
| "epoch": 0.001558, |
| "grad_norm": 1.395255446434021, |
| "learning_rate": 1e-05, |
| "loss": 0.3081, |
| "step": 155800 |
| }, |
| { |
| "epoch": 0.001559, |
| "grad_norm": 1.8021862506866455, |
| "learning_rate": 1e-05, |
| "loss": 0.3039, |
| "step": 155900 |
| }, |
| { |
| "epoch": 0.00156, |
| "grad_norm": 1.6634401082992554, |
| "learning_rate": 1e-05, |
| "loss": 0.3064, |
| "step": 156000 |
| }, |
| { |
| "epoch": 0.001561, |
| "grad_norm": 1.6071007251739502, |
| "learning_rate": 1e-05, |
| "loss": 0.3046, |
| "step": 156100 |
| }, |
| { |
| "epoch": 0.001562, |
| "grad_norm": 1.7638030052185059, |
| "learning_rate": 1e-05, |
| "loss": 0.3073, |
| "step": 156200 |
| }, |
| { |
| "epoch": 0.001563, |
| "grad_norm": 1.6403735876083374, |
| "learning_rate": 1e-05, |
| "loss": 0.304, |
| "step": 156300 |
| }, |
| { |
| "epoch": 0.001564, |
| "grad_norm": 1.8615624904632568, |
| "learning_rate": 1e-05, |
| "loss": 0.2961, |
| "step": 156400 |
| }, |
| { |
| "epoch": 0.001565, |
| "grad_norm": 1.501093864440918, |
| "learning_rate": 1e-05, |
| "loss": 0.2993, |
| "step": 156500 |
| }, |
| { |
| "epoch": 0.001566, |
| "grad_norm": 1.7402315139770508, |
| "learning_rate": 1e-05, |
| "loss": 0.3033, |
| "step": 156600 |
| }, |
| { |
| "epoch": 0.001567, |
| "grad_norm": 1.4010441303253174, |
| "learning_rate": 1e-05, |
| "loss": 0.3079, |
| "step": 156700 |
| }, |
| { |
| "epoch": 0.001568, |
| "grad_norm": 1.894376516342163, |
| "learning_rate": 1e-05, |
| "loss": 0.3006, |
| "step": 156800 |
| }, |
| { |
| "epoch": 0.001569, |
| "grad_norm": 1.381251335144043, |
| "learning_rate": 1e-05, |
| "loss": 0.3053, |
| "step": 156900 |
| }, |
| { |
| "epoch": 0.00157, |
| "grad_norm": 1.8080320358276367, |
| "learning_rate": 1e-05, |
| "loss": 0.2998, |
| "step": 157000 |
| }, |
| { |
| "epoch": 0.001571, |
| "grad_norm": 1.4762428998947144, |
| "learning_rate": 1e-05, |
| "loss": 0.304, |
| "step": 157100 |
| }, |
| { |
| "epoch": 0.001572, |
| "grad_norm": 1.4068925380706787, |
| "learning_rate": 1e-05, |
| "loss": 0.3069, |
| "step": 157200 |
| }, |
| { |
| "epoch": 0.001573, |
| "grad_norm": 1.5252565145492554, |
| "learning_rate": 1e-05, |
| "loss": 0.2932, |
| "step": 157300 |
| }, |
| { |
| "epoch": 0.001574, |
| "grad_norm": 1.5361262559890747, |
| "learning_rate": 1e-05, |
| "loss": 0.3026, |
| "step": 157400 |
| }, |
| { |
| "epoch": 0.001575, |
| "grad_norm": 1.4667400121688843, |
| "learning_rate": 1e-05, |
| "loss": 0.2973, |
| "step": 157500 |
| }, |
| { |
| "epoch": 0.001576, |
| "grad_norm": 1.3710594177246094, |
| "learning_rate": 1e-05, |
| "loss": 0.2958, |
| "step": 157600 |
| }, |
| { |
| "epoch": 0.001577, |
| "grad_norm": 1.4320799112319946, |
| "learning_rate": 1e-05, |
| "loss": 0.3053, |
| "step": 157700 |
| }, |
| { |
| "epoch": 0.001578, |
| "grad_norm": 1.7390029430389404, |
| "learning_rate": 1e-05, |
| "loss": 0.3042, |
| "step": 157800 |
| }, |
| { |
| "epoch": 0.001579, |
| "grad_norm": 1.7083619832992554, |
| "learning_rate": 1e-05, |
| "loss": 0.3051, |
| "step": 157900 |
| }, |
| { |
| "epoch": 0.00158, |
| "grad_norm": 1.5926868915557861, |
| "learning_rate": 1e-05, |
| "loss": 0.3092, |
| "step": 158000 |
| }, |
| { |
| "epoch": 0.001581, |
| "grad_norm": 1.8559361696243286, |
| "learning_rate": 1e-05, |
| "loss": 0.3046, |
| "step": 158100 |
| }, |
| { |
| "epoch": 0.001582, |
| "grad_norm": 1.7098585367202759, |
| "learning_rate": 1e-05, |
| "loss": 0.3016, |
| "step": 158200 |
| }, |
| { |
| "epoch": 0.001583, |
| "grad_norm": 1.5086554288864136, |
| "learning_rate": 1e-05, |
| "loss": 0.3099, |
| "step": 158300 |
| }, |
| { |
| "epoch": 0.001584, |
| "grad_norm": 1.5402973890304565, |
| "learning_rate": 1e-05, |
| "loss": 0.3047, |
| "step": 158400 |
| }, |
| { |
| "epoch": 0.001585, |
| "grad_norm": 1.8630613088607788, |
| "learning_rate": 1e-05, |
| "loss": 0.2992, |
| "step": 158500 |
| }, |
| { |
| "epoch": 0.001586, |
| "grad_norm": 1.4310539960861206, |
| "learning_rate": 1e-05, |
| "loss": 0.3038, |
| "step": 158600 |
| }, |
| { |
| "epoch": 0.001587, |
| "grad_norm": 1.7185118198394775, |
| "learning_rate": 1e-05, |
| "loss": 0.2946, |
| "step": 158700 |
| }, |
| { |
| "epoch": 0.001588, |
| "grad_norm": 1.6692184209823608, |
| "learning_rate": 1e-05, |
| "loss": 0.3042, |
| "step": 158800 |
| }, |
| { |
| "epoch": 0.001589, |
| "grad_norm": 1.501453161239624, |
| "learning_rate": 1e-05, |
| "loss": 0.2964, |
| "step": 158900 |
| }, |
| { |
| "epoch": 0.00159, |
| "grad_norm": 1.6817126274108887, |
| "learning_rate": 1e-05, |
| "loss": 0.3005, |
| "step": 159000 |
| }, |
| { |
| "epoch": 0.001591, |
| "grad_norm": 1.4650346040725708, |
| "learning_rate": 1e-05, |
| "loss": 0.3043, |
| "step": 159100 |
| }, |
| { |
| "epoch": 0.001592, |
| "grad_norm": 1.7585411071777344, |
| "learning_rate": 1e-05, |
| "loss": 0.2997, |
| "step": 159200 |
| }, |
| { |
| "epoch": 0.001593, |
| "grad_norm": 1.440165638923645, |
| "learning_rate": 1e-05, |
| "loss": 0.3077, |
| "step": 159300 |
| }, |
| { |
| "epoch": 0.001594, |
| "grad_norm": 1.6657960414886475, |
| "learning_rate": 1e-05, |
| "loss": 0.301, |
| "step": 159400 |
| }, |
| { |
| "epoch": 0.001595, |
| "grad_norm": 1.4905738830566406, |
| "learning_rate": 1e-05, |
| "loss": 0.3025, |
| "step": 159500 |
| }, |
| { |
| "epoch": 0.001596, |
| "grad_norm": 1.744041085243225, |
| "learning_rate": 1e-05, |
| "loss": 0.3015, |
| "step": 159600 |
| }, |
| { |
| "epoch": 0.001597, |
| "grad_norm": 1.3322395086288452, |
| "learning_rate": 1e-05, |
| "loss": 0.2977, |
| "step": 159700 |
| }, |
| { |
| "epoch": 0.001598, |
| "grad_norm": 1.577319622039795, |
| "learning_rate": 1e-05, |
| "loss": 0.2962, |
| "step": 159800 |
| }, |
| { |
| "epoch": 0.001599, |
| "grad_norm": 1.4697024822235107, |
| "learning_rate": 1e-05, |
| "loss": 0.2978, |
| "step": 159900 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 1.4797179698944092, |
| "learning_rate": 1e-05, |
| "loss": 0.2991, |
| "step": 160000 |
| }, |
| { |
| "epoch": 0.0016, |
| "eval_loss": 0.2783203125, |
| "eval_runtime": 115.6285, |
| "eval_samples_per_second": 432.419, |
| "eval_steps_per_second": 27.026, |
| "step": 160000 |
| }, |
| { |
| "epoch": 0.001601, |
| "grad_norm": 1.489996075630188, |
| "learning_rate": 1e-05, |
| "loss": 0.3065, |
| "step": 160100 |
| }, |
| { |
| "epoch": 0.001602, |
| "grad_norm": 1.6529942750930786, |
| "learning_rate": 1e-05, |
| "loss": 0.2961, |
| "step": 160200 |
| }, |
| { |
| "epoch": 0.001603, |
| "grad_norm": 1.6032297611236572, |
| "learning_rate": 1e-05, |
| "loss": 0.3036, |
| "step": 160300 |
| }, |
| { |
| "epoch": 0.001604, |
| "grad_norm": 1.3672584295272827, |
| "learning_rate": 1e-05, |
| "loss": 0.3034, |
| "step": 160400 |
| }, |
| { |
| "epoch": 0.001605, |
| "grad_norm": 1.5010960102081299, |
| "learning_rate": 1e-05, |
| "loss": 0.3026, |
| "step": 160500 |
| }, |
| { |
| "epoch": 0.001606, |
| "grad_norm": 1.631774663925171, |
| "learning_rate": 1e-05, |
| "loss": 0.3042, |
| "step": 160600 |
| }, |
| { |
| "epoch": 0.001607, |
| "grad_norm": 1.3571579456329346, |
| "learning_rate": 1e-05, |
| "loss": 0.3008, |
| "step": 160700 |
| }, |
| { |
| "epoch": 0.001608, |
| "grad_norm": 1.9333149194717407, |
| "learning_rate": 1e-05, |
| "loss": 0.2979, |
| "step": 160800 |
| }, |
| { |
| "epoch": 0.001609, |
| "grad_norm": 1.5662444829940796, |
| "learning_rate": 1e-05, |
| "loss": 0.2991, |
| "step": 160900 |
| }, |
| { |
| "epoch": 0.00161, |
| "grad_norm": 1.4831576347351074, |
| "learning_rate": 1e-05, |
| "loss": 0.2976, |
| "step": 161000 |
| }, |
| { |
| "epoch": 0.001611, |
| "grad_norm": 1.3023030757904053, |
| "learning_rate": 1e-05, |
| "loss": 0.3031, |
| "step": 161100 |
| }, |
| { |
| "epoch": 0.001612, |
| "grad_norm": 1.3196107149124146, |
| "learning_rate": 1e-05, |
| "loss": 0.2966, |
| "step": 161200 |
| }, |
| { |
| "epoch": 0.001613, |
| "grad_norm": 1.6283353567123413, |
| "learning_rate": 1e-05, |
| "loss": 0.3041, |
| "step": 161300 |
| }, |
| { |
| "epoch": 0.001614, |
| "grad_norm": 1.5330151319503784, |
| "learning_rate": 1e-05, |
| "loss": 0.2953, |
| "step": 161400 |
| }, |
| { |
| "epoch": 0.001615, |
| "grad_norm": 1.583950400352478, |
| "learning_rate": 1e-05, |
| "loss": 0.2976, |
| "step": 161500 |
| }, |
| { |
| "epoch": 0.001616, |
| "grad_norm": 1.557778000831604, |
| "learning_rate": 1e-05, |
| "loss": 0.3016, |
| "step": 161600 |
| }, |
| { |
| "epoch": 0.001617, |
| "grad_norm": 2.2620208263397217, |
| "learning_rate": 1e-05, |
| "loss": 0.3063, |
| "step": 161700 |
| }, |
| { |
| "epoch": 0.001618, |
| "grad_norm": 1.6115883588790894, |
| "learning_rate": 1e-05, |
| "loss": 0.2965, |
| "step": 161800 |
| }, |
| { |
| "epoch": 0.001619, |
| "grad_norm": 1.6807005405426025, |
| "learning_rate": 1e-05, |
| "loss": 0.302, |
| "step": 161900 |
| }, |
| { |
| "epoch": 0.00162, |
| "grad_norm": 1.4891862869262695, |
| "learning_rate": 1e-05, |
| "loss": 0.293, |
| "step": 162000 |
| }, |
| { |
| "epoch": 0.001621, |
| "grad_norm": 1.6026562452316284, |
| "learning_rate": 1e-05, |
| "loss": 0.3031, |
| "step": 162100 |
| }, |
| { |
| "epoch": 0.001622, |
| "grad_norm": 1.4442458152770996, |
| "learning_rate": 1e-05, |
| "loss": 0.3084, |
| "step": 162200 |
| }, |
| { |
| "epoch": 0.001623, |
| "grad_norm": 1.5560252666473389, |
| "learning_rate": 1e-05, |
| "loss": 0.3017, |
| "step": 162300 |
| }, |
| { |
| "epoch": 0.001624, |
| "grad_norm": 1.6526131629943848, |
| "learning_rate": 1e-05, |
| "loss": 0.2969, |
| "step": 162400 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 1.4917162656784058, |
| "learning_rate": 1e-05, |
| "loss": 0.3017, |
| "step": 162500 |
| }, |
| { |
| "epoch": 0.001626, |
| "grad_norm": 1.526892066001892, |
| "learning_rate": 1e-05, |
| "loss": 0.3001, |
| "step": 162600 |
| }, |
| { |
| "epoch": 0.001627, |
| "grad_norm": 1.3089638948440552, |
| "learning_rate": 1e-05, |
| "loss": 0.2965, |
| "step": 162700 |
| }, |
| { |
| "epoch": 0.001628, |
| "grad_norm": 1.6630245447158813, |
| "learning_rate": 1e-05, |
| "loss": 0.2936, |
| "step": 162800 |
| }, |
| { |
| "epoch": 0.001629, |
| "grad_norm": 1.420673131942749, |
| "learning_rate": 1e-05, |
| "loss": 0.3051, |
| "step": 162900 |
| }, |
| { |
| "epoch": 0.00163, |
| "grad_norm": 1.4711486101150513, |
| "learning_rate": 1e-05, |
| "loss": 0.2923, |
| "step": 163000 |
| }, |
| { |
| "epoch": 0.001631, |
| "grad_norm": 1.6381266117095947, |
| "learning_rate": 1e-05, |
| "loss": 0.294, |
| "step": 163100 |
| }, |
| { |
| "epoch": 0.001632, |
| "grad_norm": 1.5917518138885498, |
| "learning_rate": 1e-05, |
| "loss": 0.3024, |
| "step": 163200 |
| }, |
| { |
| "epoch": 0.001633, |
| "grad_norm": 1.6768611669540405, |
| "learning_rate": 1e-05, |
| "loss": 0.2992, |
| "step": 163300 |
| }, |
| { |
| "epoch": 0.001634, |
| "grad_norm": 1.5716297626495361, |
| "learning_rate": 1e-05, |
| "loss": 0.3028, |
| "step": 163400 |
| }, |
| { |
| "epoch": 0.001635, |
| "grad_norm": 1.5690321922302246, |
| "learning_rate": 1e-05, |
| "loss": 0.2989, |
| "step": 163500 |
| }, |
| { |
| "epoch": 0.001636, |
| "grad_norm": 1.698068618774414, |
| "learning_rate": 1e-05, |
| "loss": 0.3009, |
| "step": 163600 |
| }, |
| { |
| "epoch": 0.001637, |
| "grad_norm": 1.7230242490768433, |
| "learning_rate": 1e-05, |
| "loss": 0.304, |
| "step": 163700 |
| }, |
| { |
| "epoch": 0.001638, |
| "grad_norm": 1.6072338819503784, |
| "learning_rate": 1e-05, |
| "loss": 0.2956, |
| "step": 163800 |
| }, |
| { |
| "epoch": 0.001639, |
| "grad_norm": 1.611342430114746, |
| "learning_rate": 1e-05, |
| "loss": 0.304, |
| "step": 163900 |
| }, |
| { |
| "epoch": 0.00164, |
| "grad_norm": 1.4601253271102905, |
| "learning_rate": 1e-05, |
| "loss": 0.3055, |
| "step": 164000 |
| }, |
| { |
| "epoch": 0.001641, |
| "grad_norm": 1.567654013633728, |
| "learning_rate": 1e-05, |
| "loss": 0.2974, |
| "step": 164100 |
| }, |
| { |
| "epoch": 0.001642, |
| "grad_norm": 2.222820281982422, |
| "learning_rate": 1e-05, |
| "loss": 0.2955, |
| "step": 164200 |
| }, |
| { |
| "epoch": 0.001643, |
| "grad_norm": 4.182979106903076, |
| "learning_rate": 1e-05, |
| "loss": 0.3032, |
| "step": 164300 |
| }, |
| { |
| "epoch": 0.001644, |
| "grad_norm": 1.8960726261138916, |
| "learning_rate": 1e-05, |
| "loss": 0.3026, |
| "step": 164400 |
| }, |
| { |
| "epoch": 0.001645, |
| "grad_norm": 1.5564576387405396, |
| "learning_rate": 1e-05, |
| "loss": 0.2951, |
| "step": 164500 |
| }, |
| { |
| "epoch": 0.001646, |
| "grad_norm": 1.519041657447815, |
| "learning_rate": 1e-05, |
| "loss": 0.2961, |
| "step": 164600 |
| }, |
| { |
| "epoch": 0.001647, |
| "grad_norm": 1.6985987424850464, |
| "learning_rate": 1e-05, |
| "loss": 0.2964, |
| "step": 164700 |
| }, |
| { |
| "epoch": 0.001648, |
| "grad_norm": 1.3167078495025635, |
| "learning_rate": 1e-05, |
| "loss": 0.2982, |
| "step": 164800 |
| }, |
| { |
| "epoch": 0.001649, |
| "grad_norm": 1.5005210638046265, |
| "learning_rate": 1e-05, |
| "loss": 0.2936, |
| "step": 164900 |
| }, |
| { |
| "epoch": 0.00165, |
| "grad_norm": 1.4177864789962769, |
| "learning_rate": 1e-05, |
| "loss": 0.2978, |
| "step": 165000 |
| }, |
| { |
| "epoch": 0.001651, |
| "grad_norm": 1.4829902648925781, |
| "learning_rate": 1e-05, |
| "loss": 0.2986, |
| "step": 165100 |
| }, |
| { |
| "epoch": 0.001652, |
| "grad_norm": 1.3919358253479004, |
| "learning_rate": 1e-05, |
| "loss": 0.2917, |
| "step": 165200 |
| }, |
| { |
| "epoch": 0.001653, |
| "grad_norm": 1.3996176719665527, |
| "learning_rate": 1e-05, |
| "loss": 0.298, |
| "step": 165300 |
| }, |
| { |
| "epoch": 0.001654, |
| "grad_norm": 2.0305674076080322, |
| "learning_rate": 1e-05, |
| "loss": 0.2974, |
| "step": 165400 |
| }, |
| { |
| "epoch": 0.001655, |
| "grad_norm": 1.710474967956543, |
| "learning_rate": 1e-05, |
| "loss": 0.2989, |
| "step": 165500 |
| }, |
| { |
| "epoch": 0.001656, |
| "grad_norm": 1.4588967561721802, |
| "learning_rate": 1e-05, |
| "loss": 0.2953, |
| "step": 165600 |
| }, |
| { |
| "epoch": 0.001657, |
| "grad_norm": 1.4981319904327393, |
| "learning_rate": 1e-05, |
| "loss": 0.2997, |
| "step": 165700 |
| }, |
| { |
| "epoch": 0.001658, |
| "grad_norm": 1.4303194284439087, |
| "learning_rate": 1e-05, |
| "loss": 0.2996, |
| "step": 165800 |
| }, |
| { |
| "epoch": 0.001659, |
| "grad_norm": 1.3741976022720337, |
| "learning_rate": 1e-05, |
| "loss": 0.2921, |
| "step": 165900 |
| }, |
| { |
| "epoch": 0.00166, |
| "grad_norm": 1.6370424032211304, |
| "learning_rate": 1e-05, |
| "loss": 0.302, |
| "step": 166000 |
| }, |
| { |
| "epoch": 0.001661, |
| "grad_norm": 1.6333328485488892, |
| "learning_rate": 1e-05, |
| "loss": 0.2957, |
| "step": 166100 |
| }, |
| { |
| "epoch": 0.001662, |
| "grad_norm": 1.5434244871139526, |
| "learning_rate": 1e-05, |
| "loss": 0.2877, |
| "step": 166200 |
| }, |
| { |
| "epoch": 0.001663, |
| "grad_norm": 1.4523191452026367, |
| "learning_rate": 1e-05, |
| "loss": 0.2984, |
| "step": 166300 |
| }, |
| { |
| "epoch": 0.001664, |
| "grad_norm": 1.4161934852600098, |
| "learning_rate": 1e-05, |
| "loss": 0.309, |
| "step": 166400 |
| }, |
| { |
| "epoch": 0.001665, |
| "grad_norm": 1.5231043100357056, |
| "learning_rate": 1e-05, |
| "loss": 0.2942, |
| "step": 166500 |
| }, |
| { |
| "epoch": 0.001666, |
| "grad_norm": 1.4963332414627075, |
| "learning_rate": 1e-05, |
| "loss": 0.2931, |
| "step": 166600 |
| }, |
| { |
| "epoch": 0.001667, |
| "grad_norm": 1.5862305164337158, |
| "learning_rate": 1e-05, |
| "loss": 0.2876, |
| "step": 166700 |
| }, |
| { |
| "epoch": 0.001668, |
| "grad_norm": 1.4414396286010742, |
| "learning_rate": 1e-05, |
| "loss": 0.2971, |
| "step": 166800 |
| }, |
| { |
| "epoch": 0.001669, |
| "grad_norm": 1.5475411415100098, |
| "learning_rate": 1e-05, |
| "loss": 0.3065, |
| "step": 166900 |
| }, |
| { |
| "epoch": 0.00167, |
| "grad_norm": 1.6189428567886353, |
| "learning_rate": 1e-05, |
| "loss": 0.2929, |
| "step": 167000 |
| }, |
| { |
| "epoch": 0.001671, |
| "grad_norm": 3.0786776542663574, |
| "learning_rate": 1e-05, |
| "loss": 0.2959, |
| "step": 167100 |
| }, |
| { |
| "epoch": 0.001672, |
| "grad_norm": 1.6727555990219116, |
| "learning_rate": 1e-05, |
| "loss": 0.302, |
| "step": 167200 |
| }, |
| { |
| "epoch": 0.001673, |
| "grad_norm": 1.6792882680892944, |
| "learning_rate": 1e-05, |
| "loss": 0.2956, |
| "step": 167300 |
| }, |
| { |
| "epoch": 0.001674, |
| "grad_norm": 1.4409688711166382, |
| "learning_rate": 1e-05, |
| "loss": 0.2931, |
| "step": 167400 |
| }, |
| { |
| "epoch": 0.001675, |
| "grad_norm": 1.84697687625885, |
| "learning_rate": 1e-05, |
| "loss": 0.2942, |
| "step": 167500 |
| }, |
| { |
| "epoch": 0.001676, |
| "grad_norm": 1.294731855392456, |
| "learning_rate": 1e-05, |
| "loss": 0.2908, |
| "step": 167600 |
| }, |
| { |
| "epoch": 0.001677, |
| "grad_norm": 1.557396411895752, |
| "learning_rate": 1e-05, |
| "loss": 0.2906, |
| "step": 167700 |
| }, |
| { |
| "epoch": 0.001678, |
| "grad_norm": 1.6044083833694458, |
| "learning_rate": 1e-05, |
| "loss": 0.2953, |
| "step": 167800 |
| }, |
| { |
| "epoch": 0.001679, |
| "grad_norm": 1.5445910692214966, |
| "learning_rate": 1e-05, |
| "loss": 0.2985, |
| "step": 167900 |
| }, |
| { |
| "epoch": 0.00168, |
| "grad_norm": 1.5258064270019531, |
| "learning_rate": 1e-05, |
| "loss": 0.292, |
| "step": 168000 |
| }, |
| { |
| "epoch": 0.001681, |
| "grad_norm": 1.6000378131866455, |
| "learning_rate": 1e-05, |
| "loss": 0.2961, |
| "step": 168100 |
| }, |
| { |
| "epoch": 0.001682, |
| "grad_norm": 1.4779728651046753, |
| "learning_rate": 1e-05, |
| "loss": 0.3035, |
| "step": 168200 |
| }, |
| { |
| "epoch": 0.001683, |
| "grad_norm": 1.8927539587020874, |
| "learning_rate": 1e-05, |
| "loss": 0.2961, |
| "step": 168300 |
| }, |
| { |
| "epoch": 0.001684, |
| "grad_norm": 1.5276615619659424, |
| "learning_rate": 1e-05, |
| "loss": 0.2941, |
| "step": 168400 |
| }, |
| { |
| "epoch": 0.001685, |
| "grad_norm": 1.4742182493209839, |
| "learning_rate": 1e-05, |
| "loss": 0.2977, |
| "step": 168500 |
| }, |
| { |
| "epoch": 0.001686, |
| "grad_norm": 1.3637776374816895, |
| "learning_rate": 1e-05, |
| "loss": 0.2919, |
| "step": 168600 |
| }, |
| { |
| "epoch": 0.001687, |
| "grad_norm": 2.3156306743621826, |
| "learning_rate": 1e-05, |
| "loss": 0.3035, |
| "step": 168700 |
| }, |
| { |
| "epoch": 0.001688, |
| "grad_norm": 1.5192285776138306, |
| "learning_rate": 1e-05, |
| "loss": 0.2936, |
| "step": 168800 |
| }, |
| { |
| "epoch": 0.001689, |
| "grad_norm": 1.6717158555984497, |
| "learning_rate": 1e-05, |
| "loss": 0.2873, |
| "step": 168900 |
| }, |
| { |
| "epoch": 0.00169, |
| "grad_norm": 1.9118905067443848, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 169000 |
| }, |
| { |
| "epoch": 0.001691, |
| "grad_norm": 1.750054955482483, |
| "learning_rate": 1e-05, |
| "loss": 0.2943, |
| "step": 169100 |
| }, |
| { |
| "epoch": 0.001692, |
| "grad_norm": 1.545607566833496, |
| "learning_rate": 1e-05, |
| "loss": 0.2942, |
| "step": 169200 |
| }, |
| { |
| "epoch": 0.001693, |
| "grad_norm": 1.576042652130127, |
| "learning_rate": 1e-05, |
| "loss": 0.2913, |
| "step": 169300 |
| }, |
| { |
| "epoch": 0.001694, |
| "grad_norm": 1.5161222219467163, |
| "learning_rate": 1e-05, |
| "loss": 0.2922, |
| "step": 169400 |
| }, |
| { |
| "epoch": 0.001695, |
| "grad_norm": 1.5191494226455688, |
| "learning_rate": 1e-05, |
| "loss": 0.3026, |
| "step": 169500 |
| }, |
| { |
| "epoch": 0.001696, |
| "grad_norm": 1.807310700416565, |
| "learning_rate": 1e-05, |
| "loss": 0.2954, |
| "step": 169600 |
| }, |
| { |
| "epoch": 0.001697, |
| "grad_norm": 1.5956357717514038, |
| "learning_rate": 1e-05, |
| "loss": 0.3014, |
| "step": 169700 |
| }, |
| { |
| "epoch": 0.001698, |
| "grad_norm": 2.556617259979248, |
| "learning_rate": 1e-05, |
| "loss": 0.2993, |
| "step": 169800 |
| }, |
| { |
| "epoch": 0.001699, |
| "grad_norm": 1.5786460638046265, |
| "learning_rate": 1e-05, |
| "loss": 0.2942, |
| "step": 169900 |
| }, |
| { |
| "epoch": 0.0017, |
| "grad_norm": 1.6583482027053833, |
| "learning_rate": 1e-05, |
| "loss": 0.3006, |
| "step": 170000 |
| }, |
| { |
| "epoch": 0.001701, |
| "grad_norm": 1.8018178939819336, |
| "learning_rate": 1e-05, |
| "loss": 0.296, |
| "step": 170100 |
| }, |
| { |
| "epoch": 0.001702, |
| "grad_norm": 1.3693221807479858, |
| "learning_rate": 1e-05, |
| "loss": 0.2949, |
| "step": 170200 |
| }, |
| { |
| "epoch": 0.001703, |
| "grad_norm": 1.5675960779190063, |
| "learning_rate": 1e-05, |
| "loss": 0.2949, |
| "step": 170300 |
| }, |
| { |
| "epoch": 0.001704, |
| "grad_norm": 1.8481919765472412, |
| "learning_rate": 1e-05, |
| "loss": 0.3014, |
| "step": 170400 |
| }, |
| { |
| "epoch": 0.001705, |
| "grad_norm": 1.6740190982818604, |
| "learning_rate": 1e-05, |
| "loss": 0.2991, |
| "step": 170500 |
| }, |
| { |
| "epoch": 0.001706, |
| "grad_norm": 1.5895060300827026, |
| "learning_rate": 1e-05, |
| "loss": 0.2957, |
| "step": 170600 |
| }, |
| { |
| "epoch": 0.001707, |
| "grad_norm": 1.3283610343933105, |
| "learning_rate": 1e-05, |
| "loss": 0.2902, |
| "step": 170700 |
| }, |
| { |
| "epoch": 0.001708, |
| "grad_norm": 1.542960524559021, |
| "learning_rate": 1e-05, |
| "loss": 0.2922, |
| "step": 170800 |
| }, |
| { |
| "epoch": 0.001709, |
| "grad_norm": 1.5971072912216187, |
| "learning_rate": 1e-05, |
| "loss": 0.2965, |
| "step": 170900 |
| }, |
| { |
| "epoch": 0.00171, |
| "grad_norm": 1.559484601020813, |
| "learning_rate": 1e-05, |
| "loss": 0.2917, |
| "step": 171000 |
| }, |
| { |
| "epoch": 0.001711, |
| "grad_norm": 1.4500508308410645, |
| "learning_rate": 1e-05, |
| "loss": 0.2948, |
| "step": 171100 |
| }, |
| { |
| "epoch": 0.001712, |
| "grad_norm": 1.7252469062805176, |
| "learning_rate": 1e-05, |
| "loss": 0.2978, |
| "step": 171200 |
| }, |
| { |
| "epoch": 0.001713, |
| "grad_norm": 1.3989806175231934, |
| "learning_rate": 1e-05, |
| "loss": 0.2918, |
| "step": 171300 |
| }, |
| { |
| "epoch": 0.001714, |
| "grad_norm": 1.3513588905334473, |
| "learning_rate": 1e-05, |
| "loss": 0.2945, |
| "step": 171400 |
| }, |
| { |
| "epoch": 0.001715, |
| "grad_norm": 1.7322951555252075, |
| "learning_rate": 1e-05, |
| "loss": 0.2937, |
| "step": 171500 |
| }, |
| { |
| "epoch": 0.001716, |
| "grad_norm": 1.5518382787704468, |
| "learning_rate": 1e-05, |
| "loss": 0.2963, |
| "step": 171600 |
| }, |
| { |
| "epoch": 0.001717, |
| "grad_norm": 1.6225837469100952, |
| "learning_rate": 1e-05, |
| "loss": 0.2996, |
| "step": 171700 |
| }, |
| { |
| "epoch": 0.001718, |
| "grad_norm": 1.6591675281524658, |
| "learning_rate": 1e-05, |
| "loss": 0.3009, |
| "step": 171800 |
| }, |
| { |
| "epoch": 0.001719, |
| "grad_norm": 1.6477521657943726, |
| "learning_rate": 1e-05, |
| "loss": 0.2986, |
| "step": 171900 |
| }, |
| { |
| "epoch": 0.00172, |
| "grad_norm": 1.392760992050171, |
| "learning_rate": 1e-05, |
| "loss": 0.2988, |
| "step": 172000 |
| }, |
| { |
| "epoch": 0.001721, |
| "grad_norm": 2.2496235370635986, |
| "learning_rate": 1e-05, |
| "loss": 0.2929, |
| "step": 172100 |
| }, |
| { |
| "epoch": 0.001722, |
| "grad_norm": 1.5061190128326416, |
| "learning_rate": 1e-05, |
| "loss": 0.2886, |
| "step": 172200 |
| }, |
| { |
| "epoch": 0.001723, |
| "grad_norm": 1.567452311515808, |
| "learning_rate": 1e-05, |
| "loss": 0.2995, |
| "step": 172300 |
| }, |
| { |
| "epoch": 0.001724, |
| "grad_norm": 1.5281249284744263, |
| "learning_rate": 1e-05, |
| "loss": 0.3001, |
| "step": 172400 |
| }, |
| { |
| "epoch": 0.001725, |
| "grad_norm": 1.5612703561782837, |
| "learning_rate": 1e-05, |
| "loss": 0.3001, |
| "step": 172500 |
| }, |
| { |
| "epoch": 0.001726, |
| "grad_norm": 1.5059471130371094, |
| "learning_rate": 1e-05, |
| "loss": 0.2931, |
| "step": 172600 |
| }, |
| { |
| "epoch": 0.001727, |
| "grad_norm": 1.598825216293335, |
| "learning_rate": 1e-05, |
| "loss": 0.2935, |
| "step": 172700 |
| }, |
| { |
| "epoch": 0.001728, |
| "grad_norm": 1.605755090713501, |
| "learning_rate": 1e-05, |
| "loss": 0.2971, |
| "step": 172800 |
| }, |
| { |
| "epoch": 0.001729, |
| "grad_norm": 1.6247023344039917, |
| "learning_rate": 1e-05, |
| "loss": 0.2902, |
| "step": 172900 |
| }, |
| { |
| "epoch": 0.00173, |
| "grad_norm": 1.3344452381134033, |
| "learning_rate": 1e-05, |
| "loss": 0.3012, |
| "step": 173000 |
| }, |
| { |
| "epoch": 0.001731, |
| "grad_norm": 1.5707899332046509, |
| "learning_rate": 1e-05, |
| "loss": 0.2924, |
| "step": 173100 |
| }, |
| { |
| "epoch": 0.001732, |
| "grad_norm": 1.3291939496994019, |
| "learning_rate": 1e-05, |
| "loss": 0.2883, |
| "step": 173200 |
| }, |
| { |
| "epoch": 0.001733, |
| "grad_norm": 1.429669737815857, |
| "learning_rate": 1e-05, |
| "loss": 0.2973, |
| "step": 173300 |
| }, |
| { |
| "epoch": 0.001734, |
| "grad_norm": 1.5371525287628174, |
| "learning_rate": 1e-05, |
| "loss": 0.2962, |
| "step": 173400 |
| }, |
| { |
| "epoch": 0.001735, |
| "grad_norm": 1.5190463066101074, |
| "learning_rate": 1e-05, |
| "loss": 0.3017, |
| "step": 173500 |
| }, |
| { |
| "epoch": 0.001736, |
| "grad_norm": 1.4913296699523926, |
| "learning_rate": 1e-05, |
| "loss": 0.3002, |
| "step": 173600 |
| }, |
| { |
| "epoch": 0.001737, |
| "grad_norm": 1.5345001220703125, |
| "learning_rate": 1e-05, |
| "loss": 0.2946, |
| "step": 173700 |
| }, |
| { |
| "epoch": 0.001738, |
| "grad_norm": 1.5316048860549927, |
| "learning_rate": 1e-05, |
| "loss": 0.2978, |
| "step": 173800 |
| }, |
| { |
| "epoch": 0.001739, |
| "grad_norm": 1.597626805305481, |
| "learning_rate": 1e-05, |
| "loss": 0.289, |
| "step": 173900 |
| }, |
| { |
| "epoch": 0.00174, |
| "grad_norm": 1.2749770879745483, |
| "learning_rate": 1e-05, |
| "loss": 0.2933, |
| "step": 174000 |
| }, |
| { |
| "epoch": 0.001741, |
| "grad_norm": 1.5808467864990234, |
| "learning_rate": 1e-05, |
| "loss": 0.2997, |
| "step": 174100 |
| }, |
| { |
| "epoch": 0.001742, |
| "grad_norm": 1.2959426641464233, |
| "learning_rate": 1e-05, |
| "loss": 0.2873, |
| "step": 174200 |
| }, |
| { |
| "epoch": 0.001743, |
| "grad_norm": 1.6337339878082275, |
| "learning_rate": 1e-05, |
| "loss": 0.2862, |
| "step": 174300 |
| }, |
| { |
| "epoch": 0.001744, |
| "grad_norm": 1.528238296508789, |
| "learning_rate": 1e-05, |
| "loss": 0.2847, |
| "step": 174400 |
| }, |
| { |
| "epoch": 0.001745, |
| "grad_norm": 1.4361398220062256, |
| "learning_rate": 1e-05, |
| "loss": 0.2953, |
| "step": 174500 |
| }, |
| { |
| "epoch": 0.001746, |
| "grad_norm": 1.6236249208450317, |
| "learning_rate": 1e-05, |
| "loss": 0.2961, |
| "step": 174600 |
| }, |
| { |
| "epoch": 0.001747, |
| "grad_norm": 1.4904263019561768, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 174700 |
| }, |
| { |
| "epoch": 0.001748, |
| "grad_norm": 1.9077177047729492, |
| "learning_rate": 1e-05, |
| "loss": 0.292, |
| "step": 174800 |
| }, |
| { |
| "epoch": 0.001749, |
| "grad_norm": 1.7309199571609497, |
| "learning_rate": 1e-05, |
| "loss": 0.2952, |
| "step": 174900 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 1.6516369581222534, |
| "learning_rate": 1e-05, |
| "loss": 0.2935, |
| "step": 175000 |
| }, |
| { |
| "epoch": 0.001751, |
| "grad_norm": 1.6567866802215576, |
| "learning_rate": 1e-05, |
| "loss": 0.2913, |
| "step": 175100 |
| }, |
| { |
| "epoch": 0.001752, |
| "grad_norm": 1.4897688627243042, |
| "learning_rate": 1e-05, |
| "loss": 0.2883, |
| "step": 175200 |
| }, |
| { |
| "epoch": 0.001753, |
| "grad_norm": 1.3633755445480347, |
| "learning_rate": 1e-05, |
| "loss": 0.2985, |
| "step": 175300 |
| }, |
| { |
| "epoch": 0.001754, |
| "grad_norm": 1.6736537218093872, |
| "learning_rate": 1e-05, |
| "loss": 0.2924, |
| "step": 175400 |
| }, |
| { |
| "epoch": 0.001755, |
| "grad_norm": 1.5742863416671753, |
| "learning_rate": 1e-05, |
| "loss": 0.285, |
| "step": 175500 |
| }, |
| { |
| "epoch": 0.001756, |
| "grad_norm": 1.5823429822921753, |
| "learning_rate": 1e-05, |
| "loss": 0.2865, |
| "step": 175600 |
| }, |
| { |
| "epoch": 0.001757, |
| "grad_norm": 1.4199198484420776, |
| "learning_rate": 1e-05, |
| "loss": 0.2969, |
| "step": 175700 |
| }, |
| { |
| "epoch": 0.001758, |
| "grad_norm": 1.4227279424667358, |
| "learning_rate": 1e-05, |
| "loss": 0.2889, |
| "step": 175800 |
| }, |
| { |
| "epoch": 0.001759, |
| "grad_norm": 1.3790444135665894, |
| "learning_rate": 1e-05, |
| "loss": 0.2947, |
| "step": 175900 |
| }, |
| { |
| "epoch": 0.00176, |
| "grad_norm": 1.717409372329712, |
| "learning_rate": 1e-05, |
| "loss": 0.2934, |
| "step": 176000 |
| }, |
| { |
| "epoch": 0.001761, |
| "grad_norm": 1.5897624492645264, |
| "learning_rate": 1e-05, |
| "loss": 0.3003, |
| "step": 176100 |
| }, |
| { |
| "epoch": 0.001762, |
| "grad_norm": 1.5490648746490479, |
| "learning_rate": 1e-05, |
| "loss": 0.294, |
| "step": 176200 |
| }, |
| { |
| "epoch": 0.001763, |
| "grad_norm": 1.5469624996185303, |
| "learning_rate": 1e-05, |
| "loss": 0.2915, |
| "step": 176300 |
| }, |
| { |
| "epoch": 0.001764, |
| "grad_norm": 1.6679738759994507, |
| "learning_rate": 1e-05, |
| "loss": 0.2966, |
| "step": 176400 |
| }, |
| { |
| "epoch": 0.001765, |
| "grad_norm": 1.3103872537612915, |
| "learning_rate": 1e-05, |
| "loss": 0.2943, |
| "step": 176500 |
| }, |
| { |
| "epoch": 0.001766, |
| "grad_norm": 1.6076604127883911, |
| "learning_rate": 1e-05, |
| "loss": 0.293, |
| "step": 176600 |
| }, |
| { |
| "epoch": 0.001767, |
| "grad_norm": 1.7157478332519531, |
| "learning_rate": 1e-05, |
| "loss": 0.2908, |
| "step": 176700 |
| }, |
| { |
| "epoch": 0.001768, |
| "grad_norm": 1.6932576894760132, |
| "learning_rate": 1e-05, |
| "loss": 0.2931, |
| "step": 176800 |
| }, |
| { |
| "epoch": 0.001769, |
| "grad_norm": 1.3828284740447998, |
| "learning_rate": 1e-05, |
| "loss": 0.2886, |
| "step": 176900 |
| }, |
| { |
| "epoch": 0.00177, |
| "grad_norm": 1.5653789043426514, |
| "learning_rate": 1e-05, |
| "loss": 0.2952, |
| "step": 177000 |
| }, |
| { |
| "epoch": 0.001771, |
| "grad_norm": 1.7369993925094604, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 177100 |
| }, |
| { |
| "epoch": 0.001772, |
| "grad_norm": 1.541108250617981, |
| "learning_rate": 1e-05, |
| "loss": 0.2885, |
| "step": 177200 |
| }, |
| { |
| "epoch": 0.001773, |
| "grad_norm": 1.5713878870010376, |
| "learning_rate": 1e-05, |
| "loss": 0.2951, |
| "step": 177300 |
| }, |
| { |
| "epoch": 0.001774, |
| "grad_norm": 1.6048654317855835, |
| "learning_rate": 1e-05, |
| "loss": 0.2922, |
| "step": 177400 |
| }, |
| { |
| "epoch": 0.001775, |
| "grad_norm": 1.6117722988128662, |
| "learning_rate": 1e-05, |
| "loss": 0.2909, |
| "step": 177500 |
| }, |
| { |
| "epoch": 0.001776, |
| "grad_norm": 1.759687900543213, |
| "learning_rate": 1e-05, |
| "loss": 0.2967, |
| "step": 177600 |
| }, |
| { |
| "epoch": 0.001777, |
| "grad_norm": 1.4271762371063232, |
| "learning_rate": 1e-05, |
| "loss": 0.2868, |
| "step": 177700 |
| }, |
| { |
| "epoch": 0.001778, |
| "grad_norm": 1.4909316301345825, |
| "learning_rate": 1e-05, |
| "loss": 0.2918, |
| "step": 177800 |
| }, |
| { |
| "epoch": 0.001779, |
| "grad_norm": 1.498526692390442, |
| "learning_rate": 1e-05, |
| "loss": 0.2973, |
| "step": 177900 |
| }, |
| { |
| "epoch": 0.00178, |
| "grad_norm": 1.373579502105713, |
| "learning_rate": 1e-05, |
| "loss": 0.2857, |
| "step": 178000 |
| }, |
| { |
| "epoch": 0.001781, |
| "grad_norm": 1.5439717769622803, |
| "learning_rate": 1e-05, |
| "loss": 0.2907, |
| "step": 178100 |
| }, |
| { |
| "epoch": 0.001782, |
| "grad_norm": 1.6108523607254028, |
| "learning_rate": 1e-05, |
| "loss": 0.294, |
| "step": 178200 |
| }, |
| { |
| "epoch": 0.001783, |
| "grad_norm": 1.3739374876022339, |
| "learning_rate": 1e-05, |
| "loss": 0.2959, |
| "step": 178300 |
| }, |
| { |
| "epoch": 0.001784, |
| "grad_norm": 2.883863925933838, |
| "learning_rate": 1e-05, |
| "loss": 0.2891, |
| "step": 178400 |
| }, |
| { |
| "epoch": 0.001785, |
| "grad_norm": 1.3920929431915283, |
| "learning_rate": 1e-05, |
| "loss": 0.291, |
| "step": 178500 |
| }, |
| { |
| "epoch": 0.001786, |
| "grad_norm": 1.4327913522720337, |
| "learning_rate": 1e-05, |
| "loss": 0.2892, |
| "step": 178600 |
| }, |
| { |
| "epoch": 0.001787, |
| "grad_norm": 1.6962852478027344, |
| "learning_rate": 1e-05, |
| "loss": 0.2852, |
| "step": 178700 |
| }, |
| { |
| "epoch": 0.001788, |
| "grad_norm": 1.5259137153625488, |
| "learning_rate": 1e-05, |
| "loss": 0.2974, |
| "step": 178800 |
| }, |
| { |
| "epoch": 0.001789, |
| "grad_norm": 1.4931055307388306, |
| "learning_rate": 1e-05, |
| "loss": 0.2888, |
| "step": 178900 |
| }, |
| { |
| "epoch": 0.00179, |
| "grad_norm": 1.5622413158416748, |
| "learning_rate": 1e-05, |
| "loss": 0.2997, |
| "step": 179000 |
| }, |
| { |
| "epoch": 0.001791, |
| "grad_norm": 1.5523242950439453, |
| "learning_rate": 1e-05, |
| "loss": 0.2929, |
| "step": 179100 |
| }, |
| { |
| "epoch": 0.001792, |
| "grad_norm": 1.4353841543197632, |
| "learning_rate": 1e-05, |
| "loss": 0.285, |
| "step": 179200 |
| }, |
| { |
| "epoch": 0.001793, |
| "grad_norm": 1.4831793308258057, |
| "learning_rate": 1e-05, |
| "loss": 0.2938, |
| "step": 179300 |
| }, |
| { |
| "epoch": 0.001794, |
| "grad_norm": 1.483508825302124, |
| "learning_rate": 1e-05, |
| "loss": 0.2919, |
| "step": 179400 |
| }, |
| { |
| "epoch": 0.001795, |
| "grad_norm": 1.4768630266189575, |
| "learning_rate": 1e-05, |
| "loss": 0.2892, |
| "step": 179500 |
| }, |
| { |
| "epoch": 0.001796, |
| "grad_norm": 1.329671859741211, |
| "learning_rate": 1e-05, |
| "loss": 0.2902, |
| "step": 179600 |
| }, |
| { |
| "epoch": 0.001797, |
| "grad_norm": 1.4865089654922485, |
| "learning_rate": 1e-05, |
| "loss": 0.2906, |
| "step": 179700 |
| }, |
| { |
| "epoch": 0.001798, |
| "grad_norm": 1.6199544668197632, |
| "learning_rate": 1e-05, |
| "loss": 0.3013, |
| "step": 179800 |
| }, |
| { |
| "epoch": 0.001799, |
| "grad_norm": 1.633739948272705, |
| "learning_rate": 1e-05, |
| "loss": 0.29, |
| "step": 179900 |
| }, |
| { |
| "epoch": 0.0018, |
| "grad_norm": 1.3728777170181274, |
| "learning_rate": 1e-05, |
| "loss": 0.2875, |
| "step": 180000 |
| }, |
| { |
| "epoch": 0.0018, |
| "eval_loss": 0.264892578125, |
| "eval_runtime": 113.9893, |
| "eval_samples_per_second": 438.638, |
| "eval_steps_per_second": 27.415, |
| "step": 180000 |
| }, |
| { |
| "epoch": 0.001801, |
| "grad_norm": 2.310349702835083, |
| "learning_rate": 1e-05, |
| "loss": 0.2925, |
| "step": 180100 |
| }, |
| { |
| "epoch": 0.001802, |
| "grad_norm": 1.3453627824783325, |
| "learning_rate": 1e-05, |
| "loss": 0.292, |
| "step": 180200 |
| }, |
| { |
| "epoch": 0.001803, |
| "grad_norm": 1.8540631532669067, |
| "learning_rate": 1e-05, |
| "loss": 0.2946, |
| "step": 180300 |
| }, |
| { |
| "epoch": 0.001804, |
| "grad_norm": 1.594420075416565, |
| "learning_rate": 1e-05, |
| "loss": 0.2923, |
| "step": 180400 |
| }, |
| { |
| "epoch": 0.001805, |
| "grad_norm": 1.5511283874511719, |
| "learning_rate": 1e-05, |
| "loss": 0.2924, |
| "step": 180500 |
| }, |
| { |
| "epoch": 0.001806, |
| "grad_norm": 1.8114066123962402, |
| "learning_rate": 1e-05, |
| "loss": 0.2868, |
| "step": 180600 |
| }, |
| { |
| "epoch": 0.001807, |
| "grad_norm": 1.5278881788253784, |
| "learning_rate": 1e-05, |
| "loss": 0.2896, |
| "step": 180700 |
| }, |
| { |
| "epoch": 0.001808, |
| "grad_norm": 1.4767954349517822, |
| "learning_rate": 1e-05, |
| "loss": 0.2928, |
| "step": 180800 |
| }, |
| { |
| "epoch": 0.001809, |
| "grad_norm": 1.3067213296890259, |
| "learning_rate": 1e-05, |
| "loss": 0.2918, |
| "step": 180900 |
| }, |
| { |
| "epoch": 0.00181, |
| "grad_norm": 1.7097564935684204, |
| "learning_rate": 1e-05, |
| "loss": 0.2913, |
| "step": 181000 |
| }, |
| { |
| "epoch": 0.001811, |
| "grad_norm": 1.6690146923065186, |
| "learning_rate": 1e-05, |
| "loss": 0.2808, |
| "step": 181100 |
| }, |
| { |
| "epoch": 0.001812, |
| "grad_norm": 1.6829502582550049, |
| "learning_rate": 1e-05, |
| "loss": 0.2874, |
| "step": 181200 |
| }, |
| { |
| "epoch": 0.001813, |
| "grad_norm": 1.2836750745773315, |
| "learning_rate": 1e-05, |
| "loss": 0.2914, |
| "step": 181300 |
| }, |
| { |
| "epoch": 0.001814, |
| "grad_norm": 1.5141675472259521, |
| "learning_rate": 1e-05, |
| "loss": 0.2869, |
| "step": 181400 |
| }, |
| { |
| "epoch": 0.001815, |
| "grad_norm": 1.571880578994751, |
| "learning_rate": 1e-05, |
| "loss": 0.2908, |
| "step": 181500 |
| }, |
| { |
| "epoch": 0.001816, |
| "grad_norm": 1.5643311738967896, |
| "learning_rate": 1e-05, |
| "loss": 0.2905, |
| "step": 181600 |
| }, |
| { |
| "epoch": 0.001817, |
| "grad_norm": 1.9679372310638428, |
| "learning_rate": 1e-05, |
| "loss": 0.2866, |
| "step": 181700 |
| }, |
| { |
| "epoch": 0.001818, |
| "grad_norm": 1.5207774639129639, |
| "learning_rate": 1e-05, |
| "loss": 0.2876, |
| "step": 181800 |
| }, |
| { |
| "epoch": 0.001819, |
| "grad_norm": 1.4971661567687988, |
| "learning_rate": 1e-05, |
| "loss": 0.2837, |
| "step": 181900 |
| }, |
| { |
| "epoch": 0.00182, |
| "grad_norm": 1.3630481958389282, |
| "learning_rate": 1e-05, |
| "loss": 0.2901, |
| "step": 182000 |
| }, |
| { |
| "epoch": 0.001821, |
| "grad_norm": 1.7479013204574585, |
| "learning_rate": 1e-05, |
| "loss": 0.2809, |
| "step": 182100 |
| }, |
| { |
| "epoch": 0.001822, |
| "grad_norm": 1.6308436393737793, |
| "learning_rate": 1e-05, |
| "loss": 0.2869, |
| "step": 182200 |
| }, |
| { |
| "epoch": 0.001823, |
| "grad_norm": 1.6583669185638428, |
| "learning_rate": 1e-05, |
| "loss": 0.2828, |
| "step": 182300 |
| }, |
| { |
| "epoch": 0.001824, |
| "grad_norm": 1.7341161966323853, |
| "learning_rate": 1e-05, |
| "loss": 0.296, |
| "step": 182400 |
| }, |
| { |
| "epoch": 0.001825, |
| "grad_norm": 1.2434451580047607, |
| "learning_rate": 1e-05, |
| "loss": 0.2903, |
| "step": 182500 |
| }, |
| { |
| "epoch": 0.001826, |
| "grad_norm": 1.4031060934066772, |
| "learning_rate": 1e-05, |
| "loss": 0.2871, |
| "step": 182600 |
| }, |
| { |
| "epoch": 0.001827, |
| "grad_norm": 1.417802095413208, |
| "learning_rate": 1e-05, |
| "loss": 0.2849, |
| "step": 182700 |
| }, |
| { |
| "epoch": 0.001828, |
| "grad_norm": 1.6376116275787354, |
| "learning_rate": 1e-05, |
| "loss": 0.2888, |
| "step": 182800 |
| }, |
| { |
| "epoch": 0.001829, |
| "grad_norm": 1.5004040002822876, |
| "learning_rate": 1e-05, |
| "loss": 0.2889, |
| "step": 182900 |
| }, |
| { |
| "epoch": 0.00183, |
| "grad_norm": 1.3705480098724365, |
| "learning_rate": 1e-05, |
| "loss": 0.2916, |
| "step": 183000 |
| }, |
| { |
| "epoch": 0.001831, |
| "grad_norm": 1.4046076536178589, |
| "learning_rate": 1e-05, |
| "loss": 0.2871, |
| "step": 183100 |
| }, |
| { |
| "epoch": 0.001832, |
| "grad_norm": 1.460054874420166, |
| "learning_rate": 1e-05, |
| "loss": 0.2915, |
| "step": 183200 |
| }, |
| { |
| "epoch": 0.001833, |
| "grad_norm": 2.7054927349090576, |
| "learning_rate": 1e-05, |
| "loss": 0.2913, |
| "step": 183300 |
| }, |
| { |
| "epoch": 0.001834, |
| "grad_norm": 1.5564157962799072, |
| "learning_rate": 1e-05, |
| "loss": 0.2803, |
| "step": 183400 |
| }, |
| { |
| "epoch": 0.001835, |
| "grad_norm": 1.4496382474899292, |
| "learning_rate": 1e-05, |
| "loss": 0.2936, |
| "step": 183500 |
| }, |
| { |
| "epoch": 0.001836, |
| "grad_norm": 1.3869458436965942, |
| "learning_rate": 1e-05, |
| "loss": 0.2927, |
| "step": 183600 |
| }, |
| { |
| "epoch": 0.001837, |
| "grad_norm": 1.5351581573486328, |
| "learning_rate": 1e-05, |
| "loss": 0.294, |
| "step": 183700 |
| }, |
| { |
| "epoch": 0.001838, |
| "grad_norm": 1.3545173406600952, |
| "learning_rate": 1e-05, |
| "loss": 0.289, |
| "step": 183800 |
| }, |
| { |
| "epoch": 0.001839, |
| "grad_norm": 2.300602674484253, |
| "learning_rate": 1e-05, |
| "loss": 0.2814, |
| "step": 183900 |
| }, |
| { |
| "epoch": 0.00184, |
| "grad_norm": 1.4842824935913086, |
| "learning_rate": 1e-05, |
| "loss": 0.2895, |
| "step": 184000 |
| }, |
| { |
| "epoch": 0.001841, |
| "grad_norm": 1.6287872791290283, |
| "learning_rate": 1e-05, |
| "loss": 0.2941, |
| "step": 184100 |
| }, |
| { |
| "epoch": 0.001842, |
| "grad_norm": 1.3800750970840454, |
| "learning_rate": 1e-05, |
| "loss": 0.2834, |
| "step": 184200 |
| }, |
| { |
| "epoch": 0.001843, |
| "grad_norm": 1.4979010820388794, |
| "learning_rate": 1e-05, |
| "loss": 0.2903, |
| "step": 184300 |
| }, |
| { |
| "epoch": 0.001844, |
| "grad_norm": 1.5063962936401367, |
| "learning_rate": 1e-05, |
| "loss": 0.2857, |
| "step": 184400 |
| }, |
| { |
| "epoch": 0.001845, |
| "grad_norm": 1.2965285778045654, |
| "learning_rate": 1e-05, |
| "loss": 0.2856, |
| "step": 184500 |
| }, |
| { |
| "epoch": 0.001846, |
| "grad_norm": 1.467738389968872, |
| "learning_rate": 1e-05, |
| "loss": 0.2826, |
| "step": 184600 |
| }, |
| { |
| "epoch": 0.001847, |
| "grad_norm": 1.4696054458618164, |
| "learning_rate": 1e-05, |
| "loss": 0.2938, |
| "step": 184700 |
| }, |
| { |
| "epoch": 0.001848, |
| "grad_norm": 1.6094032526016235, |
| "learning_rate": 1e-05, |
| "loss": 0.2852, |
| "step": 184800 |
| }, |
| { |
| "epoch": 0.001849, |
| "grad_norm": 1.5092664957046509, |
| "learning_rate": 1e-05, |
| "loss": 0.2951, |
| "step": 184900 |
| }, |
| { |
| "epoch": 0.00185, |
| "grad_norm": 1.482487678527832, |
| "learning_rate": 1e-05, |
| "loss": 0.2905, |
| "step": 185000 |
| }, |
| { |
| "epoch": 0.001851, |
| "grad_norm": 1.4276769161224365, |
| "learning_rate": 1e-05, |
| "loss": 0.2852, |
| "step": 185100 |
| }, |
| { |
| "epoch": 0.001852, |
| "grad_norm": 1.756006121635437, |
| "learning_rate": 1e-05, |
| "loss": 0.2904, |
| "step": 185200 |
| }, |
| { |
| "epoch": 0.001853, |
| "grad_norm": 1.7595645189285278, |
| "learning_rate": 1e-05, |
| "loss": 0.2808, |
| "step": 185300 |
| }, |
| { |
| "epoch": 0.001854, |
| "grad_norm": 1.4511970281600952, |
| "learning_rate": 1e-05, |
| "loss": 0.2854, |
| "step": 185400 |
| }, |
| { |
| "epoch": 0.001855, |
| "grad_norm": 1.4957120418548584, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 185500 |
| }, |
| { |
| "epoch": 0.001856, |
| "grad_norm": 1.5508650541305542, |
| "learning_rate": 1e-05, |
| "loss": 0.2868, |
| "step": 185600 |
| }, |
| { |
| "epoch": 0.001857, |
| "grad_norm": 1.4669588804244995, |
| "learning_rate": 1e-05, |
| "loss": 0.2955, |
| "step": 185700 |
| }, |
| { |
| "epoch": 0.001858, |
| "grad_norm": 1.456214189529419, |
| "learning_rate": 1e-05, |
| "loss": 0.2887, |
| "step": 185800 |
| }, |
| { |
| "epoch": 0.001859, |
| "grad_norm": 1.5572723150253296, |
| "learning_rate": 1e-05, |
| "loss": 0.2878, |
| "step": 185900 |
| }, |
| { |
| "epoch": 0.00186, |
| "grad_norm": 1.1924431324005127, |
| "learning_rate": 1e-05, |
| "loss": 0.2872, |
| "step": 186000 |
| }, |
| { |
| "epoch": 0.001861, |
| "grad_norm": 1.5211331844329834, |
| "learning_rate": 1e-05, |
| "loss": 0.2831, |
| "step": 186100 |
| }, |
| { |
| "epoch": 0.001862, |
| "grad_norm": 3.843024969100952, |
| "learning_rate": 1e-05, |
| "loss": 0.2831, |
| "step": 186200 |
| }, |
| { |
| "epoch": 0.001863, |
| "grad_norm": 1.5179247856140137, |
| "learning_rate": 1e-05, |
| "loss": 0.285, |
| "step": 186300 |
| }, |
| { |
| "epoch": 0.001864, |
| "grad_norm": 1.3682702779769897, |
| "learning_rate": 1e-05, |
| "loss": 0.2882, |
| "step": 186400 |
| }, |
| { |
| "epoch": 0.001865, |
| "grad_norm": 1.5251023769378662, |
| "learning_rate": 1e-05, |
| "loss": 0.2915, |
| "step": 186500 |
| }, |
| { |
| "epoch": 0.001866, |
| "grad_norm": 1.627709984779358, |
| "learning_rate": 1e-05, |
| "loss": 0.2847, |
| "step": 186600 |
| }, |
| { |
| "epoch": 0.001867, |
| "grad_norm": 1.5060405731201172, |
| "learning_rate": 1e-05, |
| "loss": 0.2845, |
| "step": 186700 |
| }, |
| { |
| "epoch": 0.001868, |
| "grad_norm": 1.6034202575683594, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 186800 |
| }, |
| { |
| "epoch": 0.001869, |
| "grad_norm": 1.4927774667739868, |
| "learning_rate": 1e-05, |
| "loss": 0.2857, |
| "step": 186900 |
| }, |
| { |
| "epoch": 0.00187, |
| "grad_norm": 1.4778563976287842, |
| "learning_rate": 1e-05, |
| "loss": 0.2844, |
| "step": 187000 |
| }, |
| { |
| "epoch": 0.001871, |
| "grad_norm": 1.2116749286651611, |
| "learning_rate": 1e-05, |
| "loss": 0.2805, |
| "step": 187100 |
| }, |
| { |
| "epoch": 0.001872, |
| "grad_norm": 1.4425694942474365, |
| "learning_rate": 1e-05, |
| "loss": 0.2824, |
| "step": 187200 |
| }, |
| { |
| "epoch": 0.001873, |
| "grad_norm": 1.2100266218185425, |
| "learning_rate": 1e-05, |
| "loss": 0.2836, |
| "step": 187300 |
| }, |
| { |
| "epoch": 0.001874, |
| "grad_norm": 1.548009991645813, |
| "learning_rate": 1e-05, |
| "loss": 0.2854, |
| "step": 187400 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 1.5316060781478882, |
| "learning_rate": 1e-05, |
| "loss": 0.288, |
| "step": 187500 |
| }, |
| { |
| "epoch": 0.001876, |
| "grad_norm": 1.5404126644134521, |
| "learning_rate": 1e-05, |
| "loss": 0.2892, |
| "step": 187600 |
| }, |
| { |
| "epoch": 0.001877, |
| "grad_norm": 1.592418909072876, |
| "learning_rate": 1e-05, |
| "loss": 0.2818, |
| "step": 187700 |
| }, |
| { |
| "epoch": 0.001878, |
| "grad_norm": 1.5386697053909302, |
| "learning_rate": 1e-05, |
| "loss": 0.2812, |
| "step": 187800 |
| }, |
| { |
| "epoch": 0.001879, |
| "grad_norm": 1.7977592945098877, |
| "learning_rate": 1e-05, |
| "loss": 0.2883, |
| "step": 187900 |
| }, |
| { |
| "epoch": 0.00188, |
| "grad_norm": 1.4943695068359375, |
| "learning_rate": 1e-05, |
| "loss": 0.2924, |
| "step": 188000 |
| }, |
| { |
| "epoch": 0.001881, |
| "grad_norm": 1.7356852293014526, |
| "learning_rate": 1e-05, |
| "loss": 0.2861, |
| "step": 188100 |
| }, |
| { |
| "epoch": 0.001882, |
| "grad_norm": 1.300524353981018, |
| "learning_rate": 1e-05, |
| "loss": 0.2871, |
| "step": 188200 |
| }, |
| { |
| "epoch": 0.001883, |
| "grad_norm": 1.5252922773361206, |
| "learning_rate": 1e-05, |
| "loss": 0.293, |
| "step": 188300 |
| }, |
| { |
| "epoch": 0.001884, |
| "grad_norm": 1.6379549503326416, |
| "learning_rate": 1e-05, |
| "loss": 0.2928, |
| "step": 188400 |
| }, |
| { |
| "epoch": 0.001885, |
| "grad_norm": 1.485962152481079, |
| "learning_rate": 1e-05, |
| "loss": 0.2797, |
| "step": 188500 |
| }, |
| { |
| "epoch": 0.001886, |
| "grad_norm": 1.5201560258865356, |
| "learning_rate": 1e-05, |
| "loss": 0.2906, |
| "step": 188600 |
| }, |
| { |
| "epoch": 0.001887, |
| "grad_norm": 1.5450036525726318, |
| "learning_rate": 1e-05, |
| "loss": 0.2903, |
| "step": 188700 |
| }, |
| { |
| "epoch": 0.001888, |
| "grad_norm": 1.6390937566757202, |
| "learning_rate": 1e-05, |
| "loss": 0.2824, |
| "step": 188800 |
| }, |
| { |
| "epoch": 0.001889, |
| "grad_norm": 1.4087883234024048, |
| "learning_rate": 1e-05, |
| "loss": 0.2825, |
| "step": 188900 |
| }, |
| { |
| "epoch": 0.00189, |
| "grad_norm": 1.5856794118881226, |
| "learning_rate": 1e-05, |
| "loss": 0.2836, |
| "step": 189000 |
| }, |
| { |
| "epoch": 0.001891, |
| "grad_norm": 1.4078369140625, |
| "learning_rate": 1e-05, |
| "loss": 0.2774, |
| "step": 189100 |
| }, |
| { |
| "epoch": 0.001892, |
| "grad_norm": 1.7400150299072266, |
| "learning_rate": 1e-05, |
| "loss": 0.2856, |
| "step": 189200 |
| }, |
| { |
| "epoch": 0.001893, |
| "grad_norm": 1.580092191696167, |
| "learning_rate": 1e-05, |
| "loss": 0.2798, |
| "step": 189300 |
| }, |
| { |
| "epoch": 0.001894, |
| "grad_norm": 1.6314994096755981, |
| "learning_rate": 1e-05, |
| "loss": 0.2876, |
| "step": 189400 |
| }, |
| { |
| "epoch": 0.001895, |
| "grad_norm": 1.33416748046875, |
| "learning_rate": 1e-05, |
| "loss": 0.2809, |
| "step": 189500 |
| }, |
| { |
| "epoch": 0.001896, |
| "grad_norm": 1.512454628944397, |
| "learning_rate": 1e-05, |
| "loss": 0.2784, |
| "step": 189600 |
| }, |
| { |
| "epoch": 0.001897, |
| "grad_norm": 1.4232842922210693, |
| "learning_rate": 1e-05, |
| "loss": 0.2867, |
| "step": 189700 |
| }, |
| { |
| "epoch": 0.001898, |
| "grad_norm": 1.3176745176315308, |
| "learning_rate": 1e-05, |
| "loss": 0.284, |
| "step": 189800 |
| }, |
| { |
| "epoch": 0.001899, |
| "grad_norm": 1.394158959388733, |
| "learning_rate": 1e-05, |
| "loss": 0.2833, |
| "step": 189900 |
| }, |
| { |
| "epoch": 0.0019, |
| "grad_norm": 1.4825711250305176, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 190000 |
| }, |
| { |
| "epoch": 0.001901, |
| "grad_norm": 1.6661640405654907, |
| "learning_rate": 1e-05, |
| "loss": 0.2821, |
| "step": 190100 |
| }, |
| { |
| "epoch": 0.001902, |
| "grad_norm": 1.7882932424545288, |
| "learning_rate": 1e-05, |
| "loss": 0.2793, |
| "step": 190200 |
| }, |
| { |
| "epoch": 0.001903, |
| "grad_norm": 1.774411678314209, |
| "learning_rate": 1e-05, |
| "loss": 0.2914, |
| "step": 190300 |
| }, |
| { |
| "epoch": 0.001904, |
| "grad_norm": 1.4190601110458374, |
| "learning_rate": 1e-05, |
| "loss": 0.285, |
| "step": 190400 |
| }, |
| { |
| "epoch": 0.001905, |
| "grad_norm": 1.6410194635391235, |
| "learning_rate": 1e-05, |
| "loss": 0.2836, |
| "step": 190500 |
| }, |
| { |
| "epoch": 0.001906, |
| "grad_norm": 1.3235634565353394, |
| "learning_rate": 1e-05, |
| "loss": 0.2843, |
| "step": 190600 |
| }, |
| { |
| "epoch": 0.001907, |
| "grad_norm": 1.5994374752044678, |
| "learning_rate": 1e-05, |
| "loss": 0.2911, |
| "step": 190700 |
| }, |
| { |
| "epoch": 0.001908, |
| "grad_norm": 1.479262113571167, |
| "learning_rate": 1e-05, |
| "loss": 0.2851, |
| "step": 190800 |
| }, |
| { |
| "epoch": 0.001909, |
| "grad_norm": 1.8383204936981201, |
| "learning_rate": 1e-05, |
| "loss": 0.2861, |
| "step": 190900 |
| }, |
| { |
| "epoch": 0.00191, |
| "grad_norm": 1.7182331085205078, |
| "learning_rate": 1e-05, |
| "loss": 0.2892, |
| "step": 191000 |
| }, |
| { |
| "epoch": 0.001911, |
| "grad_norm": 1.2352073192596436, |
| "learning_rate": 1e-05, |
| "loss": 0.2882, |
| "step": 191100 |
| }, |
| { |
| "epoch": 0.001912, |
| "grad_norm": 1.6550449132919312, |
| "learning_rate": 1e-05, |
| "loss": 0.2857, |
| "step": 191200 |
| }, |
| { |
| "epoch": 0.001913, |
| "grad_norm": 1.4845826625823975, |
| "learning_rate": 1e-05, |
| "loss": 0.2815, |
| "step": 191300 |
| }, |
| { |
| "epoch": 0.001914, |
| "grad_norm": 1.3901734352111816, |
| "learning_rate": 1e-05, |
| "loss": 0.2775, |
| "step": 191400 |
| }, |
| { |
| "epoch": 0.001915, |
| "grad_norm": 1.410360336303711, |
| "learning_rate": 1e-05, |
| "loss": 0.2842, |
| "step": 191500 |
| }, |
| { |
| "epoch": 0.001916, |
| "grad_norm": 1.6031688451766968, |
| "learning_rate": 1e-05, |
| "loss": 0.2809, |
| "step": 191600 |
| }, |
| { |
| "epoch": 0.001917, |
| "grad_norm": 1.6306028366088867, |
| "learning_rate": 1e-05, |
| "loss": 0.2844, |
| "step": 191700 |
| }, |
| { |
| "epoch": 0.001918, |
| "grad_norm": 1.450675129890442, |
| "learning_rate": 1e-05, |
| "loss": 0.2808, |
| "step": 191800 |
| }, |
| { |
| "epoch": 0.001919, |
| "grad_norm": 1.4375156164169312, |
| "learning_rate": 1e-05, |
| "loss": 0.288, |
| "step": 191900 |
| }, |
| { |
| "epoch": 0.00192, |
| "grad_norm": 1.5712558031082153, |
| "learning_rate": 1e-05, |
| "loss": 0.2827, |
| "step": 192000 |
| }, |
| { |
| "epoch": 0.001921, |
| "grad_norm": 1.6321333646774292, |
| "learning_rate": 1e-05, |
| "loss": 0.2853, |
| "step": 192100 |
| }, |
| { |
| "epoch": 0.001922, |
| "grad_norm": 1.3940213918685913, |
| "learning_rate": 1e-05, |
| "loss": 0.2868, |
| "step": 192200 |
| }, |
| { |
| "epoch": 0.001923, |
| "grad_norm": 1.506698489189148, |
| "learning_rate": 1e-05, |
| "loss": 0.2892, |
| "step": 192300 |
| }, |
| { |
| "epoch": 0.001924, |
| "grad_norm": 1.4711730480194092, |
| "learning_rate": 1e-05, |
| "loss": 0.2865, |
| "step": 192400 |
| }, |
| { |
| "epoch": 0.001925, |
| "grad_norm": 1.4492762088775635, |
| "learning_rate": 1e-05, |
| "loss": 0.2858, |
| "step": 192500 |
| }, |
| { |
| "epoch": 0.001926, |
| "grad_norm": 1.5630435943603516, |
| "learning_rate": 1e-05, |
| "loss": 0.285, |
| "step": 192600 |
| }, |
| { |
| "epoch": 0.001927, |
| "grad_norm": 1.4810446500778198, |
| "learning_rate": 1e-05, |
| "loss": 0.283, |
| "step": 192700 |
| }, |
| { |
| "epoch": 0.001928, |
| "grad_norm": 1.5246132612228394, |
| "learning_rate": 1e-05, |
| "loss": 0.2844, |
| "step": 192800 |
| }, |
| { |
| "epoch": 0.001929, |
| "grad_norm": 1.7029883861541748, |
| "learning_rate": 1e-05, |
| "loss": 0.28, |
| "step": 192900 |
| }, |
| { |
| "epoch": 0.00193, |
| "grad_norm": 1.930101990699768, |
| "learning_rate": 1e-05, |
| "loss": 0.28, |
| "step": 193000 |
| }, |
| { |
| "epoch": 0.001931, |
| "grad_norm": 2.439939260482788, |
| "learning_rate": 1e-05, |
| "loss": 0.284, |
| "step": 193100 |
| }, |
| { |
| "epoch": 0.001932, |
| "grad_norm": 1.4808944463729858, |
| "learning_rate": 1e-05, |
| "loss": 0.2805, |
| "step": 193200 |
| }, |
| { |
| "epoch": 0.001933, |
| "grad_norm": 1.3932912349700928, |
| "learning_rate": 1e-05, |
| "loss": 0.2787, |
| "step": 193300 |
| }, |
| { |
| "epoch": 0.001934, |
| "grad_norm": 1.4781297445297241, |
| "learning_rate": 1e-05, |
| "loss": 0.2774, |
| "step": 193400 |
| }, |
| { |
| "epoch": 0.001935, |
| "grad_norm": 1.4157606363296509, |
| "learning_rate": 1e-05, |
| "loss": 0.2855, |
| "step": 193500 |
| }, |
| { |
| "epoch": 0.001936, |
| "grad_norm": 1.5318036079406738, |
| "learning_rate": 1e-05, |
| "loss": 0.281, |
| "step": 193600 |
| }, |
| { |
| "epoch": 0.001937, |
| "grad_norm": 1.4803863763809204, |
| "learning_rate": 1e-05, |
| "loss": 0.2792, |
| "step": 193700 |
| }, |
| { |
| "epoch": 0.001938, |
| "grad_norm": 1.4421052932739258, |
| "learning_rate": 1e-05, |
| "loss": 0.2888, |
| "step": 193800 |
| }, |
| { |
| "epoch": 0.001939, |
| "grad_norm": 1.336422085762024, |
| "learning_rate": 1e-05, |
| "loss": 0.2798, |
| "step": 193900 |
| }, |
| { |
| "epoch": 0.00194, |
| "grad_norm": 1.4723069667816162, |
| "learning_rate": 1e-05, |
| "loss": 0.2826, |
| "step": 194000 |
| }, |
| { |
| "epoch": 0.001941, |
| "grad_norm": 1.549203634262085, |
| "learning_rate": 1e-05, |
| "loss": 0.2804, |
| "step": 194100 |
| }, |
| { |
| "epoch": 0.001942, |
| "grad_norm": 1.5718244314193726, |
| "learning_rate": 1e-05, |
| "loss": 0.2773, |
| "step": 194200 |
| }, |
| { |
| "epoch": 0.001943, |
| "grad_norm": 1.4759576320648193, |
| "learning_rate": 1e-05, |
| "loss": 0.2773, |
| "step": 194300 |
| }, |
| { |
| "epoch": 0.001944, |
| "grad_norm": 1.3965938091278076, |
| "learning_rate": 1e-05, |
| "loss": 0.2771, |
| "step": 194400 |
| }, |
| { |
| "epoch": 0.001945, |
| "grad_norm": 1.3787996768951416, |
| "learning_rate": 1e-05, |
| "loss": 0.2806, |
| "step": 194500 |
| }, |
| { |
| "epoch": 0.001946, |
| "grad_norm": 1.3691178560256958, |
| "learning_rate": 1e-05, |
| "loss": 0.2892, |
| "step": 194600 |
| }, |
| { |
| "epoch": 0.001947, |
| "grad_norm": 1.4109934568405151, |
| "learning_rate": 1e-05, |
| "loss": 0.2902, |
| "step": 194700 |
| }, |
| { |
| "epoch": 0.001948, |
| "grad_norm": 1.4349749088287354, |
| "learning_rate": 1e-05, |
| "loss": 0.2852, |
| "step": 194800 |
| }, |
| { |
| "epoch": 0.001949, |
| "grad_norm": 1.3615974187850952, |
| "learning_rate": 1e-05, |
| "loss": 0.2808, |
| "step": 194900 |
| }, |
| { |
| "epoch": 0.00195, |
| "grad_norm": 1.5208380222320557, |
| "learning_rate": 1e-05, |
| "loss": 0.2804, |
| "step": 195000 |
| }, |
| { |
| "epoch": 0.001951, |
| "grad_norm": 1.5616921186447144, |
| "learning_rate": 1e-05, |
| "loss": 0.2793, |
| "step": 195100 |
| }, |
| { |
| "epoch": 0.001952, |
| "grad_norm": 1.490240454673767, |
| "learning_rate": 1e-05, |
| "loss": 0.2835, |
| "step": 195200 |
| }, |
| { |
| "epoch": 0.001953, |
| "grad_norm": 1.4141552448272705, |
| "learning_rate": 1e-05, |
| "loss": 0.2841, |
| "step": 195300 |
| }, |
| { |
| "epoch": 0.001954, |
| "grad_norm": 1.4841254949569702, |
| "learning_rate": 1e-05, |
| "loss": 0.2899, |
| "step": 195400 |
| }, |
| { |
| "epoch": 0.001955, |
| "grad_norm": 1.3822132349014282, |
| "learning_rate": 1e-05, |
| "loss": 0.2873, |
| "step": 195500 |
| }, |
| { |
| "epoch": 0.001956, |
| "grad_norm": 1.5400711297988892, |
| "learning_rate": 1e-05, |
| "loss": 0.2816, |
| "step": 195600 |
| }, |
| { |
| "epoch": 0.001957, |
| "grad_norm": 3.026294708251953, |
| "learning_rate": 1e-05, |
| "loss": 0.2818, |
| "step": 195700 |
| }, |
| { |
| "epoch": 0.001958, |
| "grad_norm": 1.5581517219543457, |
| "learning_rate": 1e-05, |
| "loss": 0.2866, |
| "step": 195800 |
| }, |
| { |
| "epoch": 0.001959, |
| "grad_norm": 1.292336106300354, |
| "learning_rate": 1e-05, |
| "loss": 0.2807, |
| "step": 195900 |
| }, |
| { |
| "epoch": 0.00196, |
| "grad_norm": 1.3840731382369995, |
| "learning_rate": 1e-05, |
| "loss": 0.2799, |
| "step": 196000 |
| }, |
| { |
| "epoch": 0.001961, |
| "grad_norm": 1.82817542552948, |
| "learning_rate": 1e-05, |
| "loss": 0.2871, |
| "step": 196100 |
| }, |
| { |
| "epoch": 0.001962, |
| "grad_norm": 2.580714702606201, |
| "learning_rate": 1e-05, |
| "loss": 0.2752, |
| "step": 196200 |
| }, |
| { |
| "epoch": 0.001963, |
| "grad_norm": 1.8855743408203125, |
| "learning_rate": 1e-05, |
| "loss": 0.2822, |
| "step": 196300 |
| }, |
| { |
| "epoch": 0.001964, |
| "grad_norm": 1.3740893602371216, |
| "learning_rate": 1e-05, |
| "loss": 0.2811, |
| "step": 196400 |
| }, |
| { |
| "epoch": 0.001965, |
| "grad_norm": 1.2532157897949219, |
| "learning_rate": 1e-05, |
| "loss": 0.2889, |
| "step": 196500 |
| }, |
| { |
| "epoch": 0.001966, |
| "grad_norm": 1.313609004020691, |
| "learning_rate": 1e-05, |
| "loss": 0.2818, |
| "step": 196600 |
| }, |
| { |
| "epoch": 0.001967, |
| "grad_norm": 1.326478123664856, |
| "learning_rate": 1e-05, |
| "loss": 0.287, |
| "step": 196700 |
| }, |
| { |
| "epoch": 0.001968, |
| "grad_norm": 1.5435999631881714, |
| "learning_rate": 1e-05, |
| "loss": 0.2848, |
| "step": 196800 |
| }, |
| { |
| "epoch": 0.001969, |
| "grad_norm": 1.5859767198562622, |
| "learning_rate": 1e-05, |
| "loss": 0.282, |
| "step": 196900 |
| }, |
| { |
| "epoch": 0.00197, |
| "grad_norm": 1.3006385564804077, |
| "learning_rate": 1e-05, |
| "loss": 0.2793, |
| "step": 197000 |
| }, |
| { |
| "epoch": 0.001971, |
| "grad_norm": 1.2609304189682007, |
| "learning_rate": 1e-05, |
| "loss": 0.2791, |
| "step": 197100 |
| }, |
| { |
| "epoch": 0.001972, |
| "grad_norm": 1.4160981178283691, |
| "learning_rate": 1e-05, |
| "loss": 0.2832, |
| "step": 197200 |
| }, |
| { |
| "epoch": 0.001973, |
| "grad_norm": 2.234437942504883, |
| "learning_rate": 1e-05, |
| "loss": 0.2852, |
| "step": 197300 |
| }, |
| { |
| "epoch": 0.001974, |
| "grad_norm": 1.295417070388794, |
| "learning_rate": 1e-05, |
| "loss": 0.2825, |
| "step": 197400 |
| }, |
| { |
| "epoch": 0.001975, |
| "grad_norm": 1.227103352546692, |
| "learning_rate": 1e-05, |
| "loss": 0.2787, |
| "step": 197500 |
| }, |
| { |
| "epoch": 0.001976, |
| "grad_norm": 1.5311518907546997, |
| "learning_rate": 1e-05, |
| "loss": 0.2807, |
| "step": 197600 |
| }, |
| { |
| "epoch": 0.001977, |
| "grad_norm": 1.3764680624008179, |
| "learning_rate": 1e-05, |
| "loss": 0.2772, |
| "step": 197700 |
| }, |
| { |
| "epoch": 0.001978, |
| "grad_norm": 1.43855619430542, |
| "learning_rate": 1e-05, |
| "loss": 0.2806, |
| "step": 197800 |
| }, |
| { |
| "epoch": 0.001979, |
| "grad_norm": 1.5360887050628662, |
| "learning_rate": 1e-05, |
| "loss": 0.2774, |
| "step": 197900 |
| }, |
| { |
| "epoch": 0.00198, |
| "grad_norm": 1.2137932777404785, |
| "learning_rate": 1e-05, |
| "loss": 0.2833, |
| "step": 198000 |
| }, |
| { |
| "epoch": 0.001981, |
| "grad_norm": 1.5310431718826294, |
| "learning_rate": 1e-05, |
| "loss": 0.2795, |
| "step": 198100 |
| }, |
| { |
| "epoch": 0.001982, |
| "grad_norm": 1.6440815925598145, |
| "learning_rate": 1e-05, |
| "loss": 0.2814, |
| "step": 198200 |
| }, |
| { |
| "epoch": 0.001983, |
| "grad_norm": 1.8036118745803833, |
| "learning_rate": 1e-05, |
| "loss": 0.2835, |
| "step": 198300 |
| }, |
| { |
| "epoch": 0.001984, |
| "grad_norm": 1.3439860343933105, |
| "learning_rate": 1e-05, |
| "loss": 0.2829, |
| "step": 198400 |
| }, |
| { |
| "epoch": 0.001985, |
| "grad_norm": 1.4137046337127686, |
| "learning_rate": 1e-05, |
| "loss": 0.2814, |
| "step": 198500 |
| }, |
| { |
| "epoch": 0.001986, |
| "grad_norm": 1.4916012287139893, |
| "learning_rate": 1e-05, |
| "loss": 0.277, |
| "step": 198600 |
| }, |
| { |
| "epoch": 0.001987, |
| "grad_norm": 1.4562140703201294, |
| "learning_rate": 1e-05, |
| "loss": 0.2866, |
| "step": 198700 |
| }, |
| { |
| "epoch": 0.001988, |
| "grad_norm": 1.6629130840301514, |
| "learning_rate": 1e-05, |
| "loss": 0.2775, |
| "step": 198800 |
| }, |
| { |
| "epoch": 0.001989, |
| "grad_norm": 1.524603247642517, |
| "learning_rate": 1e-05, |
| "loss": 0.2808, |
| "step": 198900 |
| }, |
| { |
| "epoch": 0.00199, |
| "grad_norm": 2.1545159816741943, |
| "learning_rate": 1e-05, |
| "loss": 0.2791, |
| "step": 199000 |
| }, |
| { |
| "epoch": 0.001991, |
| "grad_norm": 1.3595882654190063, |
| "learning_rate": 1e-05, |
| "loss": 0.2845, |
| "step": 199100 |
| }, |
| { |
| "epoch": 0.001992, |
| "grad_norm": 1.4160555601119995, |
| "learning_rate": 1e-05, |
| "loss": 0.2779, |
| "step": 199200 |
| }, |
| { |
| "epoch": 0.001993, |
| "grad_norm": 1.612523078918457, |
| "learning_rate": 1e-05, |
| "loss": 0.2878, |
| "step": 199300 |
| }, |
| { |
| "epoch": 0.001994, |
| "grad_norm": 1.4364423751831055, |
| "learning_rate": 1e-05, |
| "loss": 0.2813, |
| "step": 199400 |
| }, |
| { |
| "epoch": 0.001995, |
| "grad_norm": 1.47967529296875, |
| "learning_rate": 1e-05, |
| "loss": 0.2841, |
| "step": 199500 |
| }, |
| { |
| "epoch": 0.001996, |
| "grad_norm": 1.621627688407898, |
| "learning_rate": 1e-05, |
| "loss": 0.2765, |
| "step": 199600 |
| }, |
| { |
| "epoch": 0.001997, |
| "grad_norm": 1.3066225051879883, |
| "learning_rate": 1e-05, |
| "loss": 0.2828, |
| "step": 199700 |
| }, |
| { |
| "epoch": 0.001998, |
| "grad_norm": 1.614108681678772, |
| "learning_rate": 1e-05, |
| "loss": 0.2808, |
| "step": 199800 |
| }, |
| { |
| "epoch": 0.001999, |
| "grad_norm": 1.457350730895996, |
| "learning_rate": 1e-05, |
| "loss": 0.2815, |
| "step": 199900 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 1.6253563165664673, |
| "learning_rate": 1e-05, |
| "loss": 0.2792, |
| "step": 200000 |
| }, |
| { |
| "epoch": 0.002, |
| "eval_loss": 0.256103515625, |
| "eval_runtime": 115.1679, |
| "eval_samples_per_second": 434.149, |
| "eval_steps_per_second": 27.134, |
| "step": 200000 |
| }, |
| { |
| "epoch": 0.002001, |
| "grad_norm": 1.5363661050796509, |
| "learning_rate": 1e-05, |
| "loss": 0.2754, |
| "step": 200100 |
| }, |
| { |
| "epoch": 0.002002, |
| "grad_norm": 1.5348174571990967, |
| "learning_rate": 1e-05, |
| "loss": 0.2851, |
| "step": 200200 |
| }, |
| { |
| "epoch": 0.002003, |
| "grad_norm": 1.4772671461105347, |
| "learning_rate": 1e-05, |
| "loss": 0.279, |
| "step": 200300 |
| }, |
| { |
| "epoch": 0.002004, |
| "grad_norm": 1.5636744499206543, |
| "learning_rate": 1e-05, |
| "loss": 0.2862, |
| "step": 200400 |
| }, |
| { |
| "epoch": 0.002005, |
| "grad_norm": 1.8320351839065552, |
| "learning_rate": 1e-05, |
| "loss": 0.279, |
| "step": 200500 |
| }, |
| { |
| "epoch": 0.002006, |
| "grad_norm": 1.7066452503204346, |
| "learning_rate": 1e-05, |
| "loss": 0.2757, |
| "step": 200600 |
| }, |
| { |
| "epoch": 0.002007, |
| "grad_norm": 1.3273617029190063, |
| "learning_rate": 1e-05, |
| "loss": 0.2815, |
| "step": 200700 |
| }, |
| { |
| "epoch": 0.002008, |
| "grad_norm": 1.492756724357605, |
| "learning_rate": 1e-05, |
| "loss": 0.2779, |
| "step": 200800 |
| }, |
| { |
| "epoch": 0.002009, |
| "grad_norm": 1.5056836605072021, |
| "learning_rate": 1e-05, |
| "loss": 0.2805, |
| "step": 200900 |
| }, |
| { |
| "epoch": 0.00201, |
| "grad_norm": 1.504055142402649, |
| "learning_rate": 1e-05, |
| "loss": 0.2725, |
| "step": 201000 |
| }, |
| { |
| "epoch": 0.002011, |
| "grad_norm": 1.424229621887207, |
| "learning_rate": 1e-05, |
| "loss": 0.2768, |
| "step": 201100 |
| }, |
| { |
| "epoch": 0.002012, |
| "grad_norm": 1.6440435647964478, |
| "learning_rate": 1e-05, |
| "loss": 0.2776, |
| "step": 201200 |
| }, |
| { |
| "epoch": 0.002013, |
| "grad_norm": 1.4276853799819946, |
| "learning_rate": 1e-05, |
| "loss": 0.2737, |
| "step": 201300 |
| }, |
| { |
| "epoch": 0.002014, |
| "grad_norm": 1.479280948638916, |
| "learning_rate": 1e-05, |
| "loss": 0.2772, |
| "step": 201400 |
| }, |
| { |
| "epoch": 0.002015, |
| "grad_norm": 1.7645676136016846, |
| "learning_rate": 1e-05, |
| "loss": 0.2779, |
| "step": 201500 |
| }, |
| { |
| "epoch": 0.002016, |
| "grad_norm": 1.7327306270599365, |
| "learning_rate": 1e-05, |
| "loss": 0.2809, |
| "step": 201600 |
| }, |
| { |
| "epoch": 0.002017, |
| "grad_norm": 1.4628037214279175, |
| "learning_rate": 1e-05, |
| "loss": 0.2815, |
| "step": 201700 |
| }, |
| { |
| "epoch": 0.002018, |
| "grad_norm": 1.2225489616394043, |
| "learning_rate": 1e-05, |
| "loss": 0.286, |
| "step": 201800 |
| }, |
| { |
| "epoch": 0.002019, |
| "grad_norm": 1.4493396282196045, |
| "learning_rate": 1e-05, |
| "loss": 0.2781, |
| "step": 201900 |
| }, |
| { |
| "epoch": 0.00202, |
| "grad_norm": 1.1203590631484985, |
| "learning_rate": 1e-05, |
| "loss": 0.2782, |
| "step": 202000 |
| }, |
| { |
| "epoch": 0.002021, |
| "grad_norm": 1.4502707719802856, |
| "learning_rate": 1e-05, |
| "loss": 0.277, |
| "step": 202100 |
| }, |
| { |
| "epoch": 0.002022, |
| "grad_norm": 1.7395519018173218, |
| "learning_rate": 1e-05, |
| "loss": 0.286, |
| "step": 202200 |
| }, |
| { |
| "epoch": 0.002023, |
| "grad_norm": 1.9898625612258911, |
| "learning_rate": 1e-05, |
| "loss": 0.2721, |
| "step": 202300 |
| }, |
| { |
| "epoch": 0.002024, |
| "grad_norm": 1.6557375192642212, |
| "learning_rate": 1e-05, |
| "loss": 0.2752, |
| "step": 202400 |
| }, |
| { |
| "epoch": 0.002025, |
| "grad_norm": 1.4543732404708862, |
| "learning_rate": 1e-05, |
| "loss": 0.2872, |
| "step": 202500 |
| }, |
| { |
| "epoch": 0.002026, |
| "grad_norm": 1.44635009765625, |
| "learning_rate": 1e-05, |
| "loss": 0.2745, |
| "step": 202600 |
| }, |
| { |
| "epoch": 0.002027, |
| "grad_norm": 1.3892033100128174, |
| "learning_rate": 1e-05, |
| "loss": 0.2804, |
| "step": 202700 |
| }, |
| { |
| "epoch": 0.002028, |
| "grad_norm": 1.2940878868103027, |
| "learning_rate": 1e-05, |
| "loss": 0.2778, |
| "step": 202800 |
| }, |
| { |
| "epoch": 0.002029, |
| "grad_norm": 2.028639554977417, |
| "learning_rate": 1e-05, |
| "loss": 0.2741, |
| "step": 202900 |
| }, |
| { |
| "epoch": 0.00203, |
| "grad_norm": 1.462415099143982, |
| "learning_rate": 1e-05, |
| "loss": 0.2775, |
| "step": 203000 |
| }, |
| { |
| "epoch": 0.002031, |
| "grad_norm": 1.60899019241333, |
| "learning_rate": 1e-05, |
| "loss": 0.2764, |
| "step": 203100 |
| }, |
| { |
| "epoch": 0.002032, |
| "grad_norm": 1.5385807752609253, |
| "learning_rate": 1e-05, |
| "loss": 0.2794, |
| "step": 203200 |
| }, |
| { |
| "epoch": 0.002033, |
| "grad_norm": 1.3882631063461304, |
| "learning_rate": 1e-05, |
| "loss": 0.2736, |
| "step": 203300 |
| }, |
| { |
| "epoch": 0.002034, |
| "grad_norm": 1.2531319856643677, |
| "learning_rate": 1e-05, |
| "loss": 0.2848, |
| "step": 203400 |
| }, |
| { |
| "epoch": 0.002035, |
| "grad_norm": 1.621891736984253, |
| "learning_rate": 1e-05, |
| "loss": 0.2773, |
| "step": 203500 |
| }, |
| { |
| "epoch": 0.002036, |
| "grad_norm": 1.3949509859085083, |
| "learning_rate": 1e-05, |
| "loss": 0.2804, |
| "step": 203600 |
| }, |
| { |
| "epoch": 0.002037, |
| "grad_norm": 1.3484961986541748, |
| "learning_rate": 1e-05, |
| "loss": 0.2785, |
| "step": 203700 |
| }, |
| { |
| "epoch": 0.002038, |
| "grad_norm": 1.524052381515503, |
| "learning_rate": 1e-05, |
| "loss": 0.2817, |
| "step": 203800 |
| }, |
| { |
| "epoch": 0.002039, |
| "grad_norm": 1.4757261276245117, |
| "learning_rate": 1e-05, |
| "loss": 0.2786, |
| "step": 203900 |
| }, |
| { |
| "epoch": 0.00204, |
| "grad_norm": 1.7308425903320312, |
| "learning_rate": 1e-05, |
| "loss": 0.2765, |
| "step": 204000 |
| }, |
| { |
| "epoch": 0.002041, |
| "grad_norm": 1.433393120765686, |
| "learning_rate": 1e-05, |
| "loss": 0.2745, |
| "step": 204100 |
| }, |
| { |
| "epoch": 0.002042, |
| "grad_norm": 1.398602843284607, |
| "learning_rate": 1e-05, |
| "loss": 0.278, |
| "step": 204200 |
| }, |
| { |
| "epoch": 0.002043, |
| "grad_norm": 1.6153137683868408, |
| "learning_rate": 1e-05, |
| "loss": 0.2814, |
| "step": 204300 |
| }, |
| { |
| "epoch": 0.002044, |
| "grad_norm": 1.516076683998108, |
| "learning_rate": 1e-05, |
| "loss": 0.2709, |
| "step": 204400 |
| }, |
| { |
| "epoch": 0.002045, |
| "grad_norm": 1.6530369520187378, |
| "learning_rate": 1e-05, |
| "loss": 0.2824, |
| "step": 204500 |
| }, |
| { |
| "epoch": 0.002046, |
| "grad_norm": 1.4678118228912354, |
| "learning_rate": 1e-05, |
| "loss": 0.2754, |
| "step": 204600 |
| }, |
| { |
| "epoch": 0.002047, |
| "grad_norm": 1.64580237865448, |
| "learning_rate": 1e-05, |
| "loss": 0.2796, |
| "step": 204700 |
| }, |
| { |
| "epoch": 0.002048, |
| "grad_norm": 1.6422346830368042, |
| "learning_rate": 1e-05, |
| "loss": 0.2742, |
| "step": 204800 |
| }, |
| { |
| "epoch": 0.002049, |
| "grad_norm": 1.5554474592208862, |
| "learning_rate": 1e-05, |
| "loss": 0.2838, |
| "step": 204900 |
| }, |
| { |
| "epoch": 0.00205, |
| "grad_norm": 1.4247395992279053, |
| "learning_rate": 1e-05, |
| "loss": 0.2788, |
| "step": 205000 |
| }, |
| { |
| "epoch": 0.002051, |
| "grad_norm": 3.377981662750244, |
| "learning_rate": 1e-05, |
| "loss": 0.2739, |
| "step": 205100 |
| }, |
| { |
| "epoch": 0.002052, |
| "grad_norm": 1.668649435043335, |
| "learning_rate": 1e-05, |
| "loss": 0.2817, |
| "step": 205200 |
| }, |
| { |
| "epoch": 0.002053, |
| "grad_norm": 1.3109833002090454, |
| "learning_rate": 1e-05, |
| "loss": 0.281, |
| "step": 205300 |
| }, |
| { |
| "epoch": 0.002054, |
| "grad_norm": 1.3675384521484375, |
| "learning_rate": 1e-05, |
| "loss": 0.2686, |
| "step": 205400 |
| }, |
| { |
| "epoch": 0.002055, |
| "grad_norm": 1.542527437210083, |
| "learning_rate": 1e-05, |
| "loss": 0.2709, |
| "step": 205500 |
| }, |
| { |
| "epoch": 0.002056, |
| "grad_norm": 1.3900882005691528, |
| "learning_rate": 1e-05, |
| "loss": 0.2841, |
| "step": 205600 |
| }, |
| { |
| "epoch": 0.002057, |
| "grad_norm": 1.5033669471740723, |
| "learning_rate": 1e-05, |
| "loss": 0.2798, |
| "step": 205700 |
| }, |
| { |
| "epoch": 0.002058, |
| "grad_norm": 1.638098120689392, |
| "learning_rate": 1e-05, |
| "loss": 0.275, |
| "step": 205800 |
| }, |
| { |
| "epoch": 0.002059, |
| "grad_norm": 1.4195244312286377, |
| "learning_rate": 1e-05, |
| "loss": 0.2781, |
| "step": 205900 |
| }, |
| { |
| "epoch": 0.00206, |
| "grad_norm": 1.5741572380065918, |
| "learning_rate": 1e-05, |
| "loss": 0.2811, |
| "step": 206000 |
| }, |
| { |
| "epoch": 0.002061, |
| "grad_norm": 1.4734532833099365, |
| "learning_rate": 1e-05, |
| "loss": 0.2757, |
| "step": 206100 |
| }, |
| { |
| "epoch": 0.002062, |
| "grad_norm": 1.4549283981323242, |
| "learning_rate": 1e-05, |
| "loss": 0.2797, |
| "step": 206200 |
| }, |
| { |
| "epoch": 0.002063, |
| "grad_norm": 1.3052780628204346, |
| "learning_rate": 1e-05, |
| "loss": 0.2819, |
| "step": 206300 |
| }, |
| { |
| "epoch": 0.002064, |
| "grad_norm": 1.349663257598877, |
| "learning_rate": 1e-05, |
| "loss": 0.2726, |
| "step": 206400 |
| }, |
| { |
| "epoch": 0.002065, |
| "grad_norm": 1.477702021598816, |
| "learning_rate": 1e-05, |
| "loss": 0.278, |
| "step": 206500 |
| }, |
| { |
| "epoch": 0.002066, |
| "grad_norm": 1.4143027067184448, |
| "learning_rate": 1e-05, |
| "loss": 0.2789, |
| "step": 206600 |
| }, |
| { |
| "epoch": 0.002067, |
| "grad_norm": 1.456390380859375, |
| "learning_rate": 1e-05, |
| "loss": 0.2764, |
| "step": 206700 |
| }, |
| { |
| "epoch": 0.002068, |
| "grad_norm": 1.343540906906128, |
| "learning_rate": 1e-05, |
| "loss": 0.2773, |
| "step": 206800 |
| }, |
| { |
| "epoch": 0.002069, |
| "grad_norm": 1.3482036590576172, |
| "learning_rate": 1e-05, |
| "loss": 0.2727, |
| "step": 206900 |
| }, |
| { |
| "epoch": 0.00207, |
| "grad_norm": 1.5749857425689697, |
| "learning_rate": 1e-05, |
| "loss": 0.2752, |
| "step": 207000 |
| }, |
| { |
| "epoch": 0.002071, |
| "grad_norm": 1.458197832107544, |
| "learning_rate": 1e-05, |
| "loss": 0.2803, |
| "step": 207100 |
| }, |
| { |
| "epoch": 0.002072, |
| "grad_norm": 1.5353132486343384, |
| "learning_rate": 1e-05, |
| "loss": 0.2698, |
| "step": 207200 |
| }, |
| { |
| "epoch": 0.002073, |
| "grad_norm": 1.5403767824172974, |
| "learning_rate": 1e-05, |
| "loss": 0.2768, |
| "step": 207300 |
| }, |
| { |
| "epoch": 0.002074, |
| "grad_norm": 1.4076712131500244, |
| "learning_rate": 1e-05, |
| "loss": 0.2783, |
| "step": 207400 |
| }, |
| { |
| "epoch": 0.002075, |
| "grad_norm": 1.5805490016937256, |
| "learning_rate": 1e-05, |
| "loss": 0.2782, |
| "step": 207500 |
| }, |
| { |
| "epoch": 0.002076, |
| "grad_norm": 1.342297911643982, |
| "learning_rate": 1e-05, |
| "loss": 0.2791, |
| "step": 207600 |
| }, |
| { |
| "epoch": 0.002077, |
| "grad_norm": 1.5067929029464722, |
| "learning_rate": 1e-05, |
| "loss": 0.2768, |
| "step": 207700 |
| }, |
| { |
| "epoch": 0.002078, |
| "grad_norm": 1.2661635875701904, |
| "learning_rate": 1e-05, |
| "loss": 0.2719, |
| "step": 207800 |
| }, |
| { |
| "epoch": 0.002079, |
| "grad_norm": 1.2550525665283203, |
| "learning_rate": 1e-05, |
| "loss": 0.2734, |
| "step": 207900 |
| }, |
| { |
| "epoch": 0.00208, |
| "grad_norm": 1.478445053100586, |
| "learning_rate": 1e-05, |
| "loss": 0.2722, |
| "step": 208000 |
| }, |
| { |
| "epoch": 0.002081, |
| "grad_norm": 1.4284790754318237, |
| "learning_rate": 1e-05, |
| "loss": 0.2698, |
| "step": 208100 |
| }, |
| { |
| "epoch": 0.002082, |
| "grad_norm": 1.6922401189804077, |
| "learning_rate": 1e-05, |
| "loss": 0.2813, |
| "step": 208200 |
| }, |
| { |
| "epoch": 0.002083, |
| "grad_norm": 1.50126314163208, |
| "learning_rate": 1e-05, |
| "loss": 0.2797, |
| "step": 208300 |
| }, |
| { |
| "epoch": 0.002084, |
| "grad_norm": 1.4059314727783203, |
| "learning_rate": 1e-05, |
| "loss": 0.2759, |
| "step": 208400 |
| }, |
| { |
| "epoch": 0.002085, |
| "grad_norm": 1.5741177797317505, |
| "learning_rate": 1e-05, |
| "loss": 0.2758, |
| "step": 208500 |
| }, |
| { |
| "epoch": 0.002086, |
| "grad_norm": 1.491023063659668, |
| "learning_rate": 1e-05, |
| "loss": 0.2769, |
| "step": 208600 |
| }, |
| { |
| "epoch": 0.002087, |
| "grad_norm": 1.4003244638442993, |
| "learning_rate": 1e-05, |
| "loss": 0.2726, |
| "step": 208700 |
| }, |
| { |
| "epoch": 0.002088, |
| "grad_norm": 1.695166826248169, |
| "learning_rate": 1e-05, |
| "loss": 0.2795, |
| "step": 208800 |
| }, |
| { |
| "epoch": 0.002089, |
| "grad_norm": 1.3839768171310425, |
| "learning_rate": 1e-05, |
| "loss": 0.2716, |
| "step": 208900 |
| }, |
| { |
| "epoch": 0.00209, |
| "grad_norm": 1.2503012418746948, |
| "learning_rate": 1e-05, |
| "loss": 0.2749, |
| "step": 209000 |
| }, |
| { |
| "epoch": 0.002091, |
| "grad_norm": 1.3623175621032715, |
| "learning_rate": 1e-05, |
| "loss": 0.2772, |
| "step": 209100 |
| }, |
| { |
| "epoch": 0.002092, |
| "grad_norm": 1.3409157991409302, |
| "learning_rate": 1e-05, |
| "loss": 0.2735, |
| "step": 209200 |
| }, |
| { |
| "epoch": 0.002093, |
| "grad_norm": 1.5793631076812744, |
| "learning_rate": 1e-05, |
| "loss": 0.2813, |
| "step": 209300 |
| }, |
| { |
| "epoch": 0.002094, |
| "grad_norm": 1.5260995626449585, |
| "learning_rate": 1e-05, |
| "loss": 0.2626, |
| "step": 209400 |
| }, |
| { |
| "epoch": 0.002095, |
| "grad_norm": 1.5600001811981201, |
| "learning_rate": 1e-05, |
| "loss": 0.2792, |
| "step": 209500 |
| }, |
| { |
| "epoch": 0.002096, |
| "grad_norm": 1.5654038190841675, |
| "learning_rate": 1e-05, |
| "loss": 0.2796, |
| "step": 209600 |
| }, |
| { |
| "epoch": 0.002097, |
| "grad_norm": 1.3999748229980469, |
| "learning_rate": 1e-05, |
| "loss": 0.2745, |
| "step": 209700 |
| }, |
| { |
| "epoch": 0.002098, |
| "grad_norm": 1.4651896953582764, |
| "learning_rate": 1e-05, |
| "loss": 0.2809, |
| "step": 209800 |
| }, |
| { |
| "epoch": 0.002099, |
| "grad_norm": 2.7390918731689453, |
| "learning_rate": 1e-05, |
| "loss": 0.2699, |
| "step": 209900 |
| }, |
| { |
| "epoch": 0.0021, |
| "grad_norm": 1.6523025035858154, |
| "learning_rate": 1e-05, |
| "loss": 0.2718, |
| "step": 210000 |
| }, |
| { |
| "epoch": 0.002101, |
| "grad_norm": 1.4938172101974487, |
| "learning_rate": 1e-05, |
| "loss": 0.278, |
| "step": 210100 |
| }, |
| { |
| "epoch": 0.002102, |
| "grad_norm": 1.3947250843048096, |
| "learning_rate": 1e-05, |
| "loss": 0.2638, |
| "step": 210200 |
| }, |
| { |
| "epoch": 0.002103, |
| "grad_norm": 1.4332685470581055, |
| "learning_rate": 1e-05, |
| "loss": 0.2776, |
| "step": 210300 |
| }, |
| { |
| "epoch": 0.002104, |
| "grad_norm": 1.5522576570510864, |
| "learning_rate": 1e-05, |
| "loss": 0.2715, |
| "step": 210400 |
| }, |
| { |
| "epoch": 0.002105, |
| "grad_norm": 1.2607855796813965, |
| "learning_rate": 1e-05, |
| "loss": 0.2801, |
| "step": 210500 |
| }, |
| { |
| "epoch": 0.002106, |
| "grad_norm": 1.489503026008606, |
| "learning_rate": 1e-05, |
| "loss": 0.2772, |
| "step": 210600 |
| }, |
| { |
| "epoch": 0.002107, |
| "grad_norm": 1.315061092376709, |
| "learning_rate": 1e-05, |
| "loss": 0.2785, |
| "step": 210700 |
| }, |
| { |
| "epoch": 0.002108, |
| "grad_norm": 1.56254243850708, |
| "learning_rate": 1e-05, |
| "loss": 0.2724, |
| "step": 210800 |
| }, |
| { |
| "epoch": 0.002109, |
| "grad_norm": 1.4691557884216309, |
| "learning_rate": 1e-05, |
| "loss": 0.2741, |
| "step": 210900 |
| }, |
| { |
| "epoch": 0.00211, |
| "grad_norm": 1.551344633102417, |
| "learning_rate": 1e-05, |
| "loss": 0.2802, |
| "step": 211000 |
| }, |
| { |
| "epoch": 0.002111, |
| "grad_norm": 1.3702019453048706, |
| "learning_rate": 1e-05, |
| "loss": 0.275, |
| "step": 211100 |
| }, |
| { |
| "epoch": 0.002112, |
| "grad_norm": 1.563392996788025, |
| "learning_rate": 1e-05, |
| "loss": 0.2778, |
| "step": 211200 |
| }, |
| { |
| "epoch": 0.002113, |
| "grad_norm": 1.403418779373169, |
| "learning_rate": 1e-05, |
| "loss": 0.2751, |
| "step": 211300 |
| }, |
| { |
| "epoch": 0.002114, |
| "grad_norm": 1.391108512878418, |
| "learning_rate": 1e-05, |
| "loss": 0.2762, |
| "step": 211400 |
| }, |
| { |
| "epoch": 0.002115, |
| "grad_norm": 1.3116768598556519, |
| "learning_rate": 1e-05, |
| "loss": 0.2719, |
| "step": 211500 |
| }, |
| { |
| "epoch": 0.002116, |
| "grad_norm": 1.449575424194336, |
| "learning_rate": 1e-05, |
| "loss": 0.2787, |
| "step": 211600 |
| }, |
| { |
| "epoch": 0.002117, |
| "grad_norm": 1.7022771835327148, |
| "learning_rate": 1e-05, |
| "loss": 0.2728, |
| "step": 211700 |
| }, |
| { |
| "epoch": 0.002118, |
| "grad_norm": 1.4799153804779053, |
| "learning_rate": 1e-05, |
| "loss": 0.2734, |
| "step": 211800 |
| }, |
| { |
| "epoch": 0.002119, |
| "grad_norm": 1.2987920045852661, |
| "learning_rate": 1e-05, |
| "loss": 0.2739, |
| "step": 211900 |
| }, |
| { |
| "epoch": 0.00212, |
| "grad_norm": 1.3577724695205688, |
| "learning_rate": 1e-05, |
| "loss": 0.2772, |
| "step": 212000 |
| }, |
| { |
| "epoch": 0.002121, |
| "grad_norm": 1.3344751596450806, |
| "learning_rate": 1e-05, |
| "loss": 0.2714, |
| "step": 212100 |
| }, |
| { |
| "epoch": 0.002122, |
| "grad_norm": 1.6037973165512085, |
| "learning_rate": 1e-05, |
| "loss": 0.2761, |
| "step": 212200 |
| }, |
| { |
| "epoch": 0.002123, |
| "grad_norm": 1.3923399448394775, |
| "learning_rate": 1e-05, |
| "loss": 0.2758, |
| "step": 212300 |
| }, |
| { |
| "epoch": 0.002124, |
| "grad_norm": 8.393562316894531, |
| "learning_rate": 1e-05, |
| "loss": 0.2752, |
| "step": 212400 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 1.412778615951538, |
| "learning_rate": 1e-05, |
| "loss": 0.2771, |
| "step": 212500 |
| }, |
| { |
| "epoch": 0.002126, |
| "grad_norm": 1.3804876804351807, |
| "learning_rate": 1e-05, |
| "loss": 0.2745, |
| "step": 212600 |
| }, |
| { |
| "epoch": 0.002127, |
| "grad_norm": 1.4782977104187012, |
| "learning_rate": 1e-05, |
| "loss": 0.2773, |
| "step": 212700 |
| }, |
| { |
| "epoch": 0.002128, |
| "grad_norm": 1.4774922132492065, |
| "learning_rate": 1e-05, |
| "loss": 0.2782, |
| "step": 212800 |
| }, |
| { |
| "epoch": 0.002129, |
| "grad_norm": 1.4704676866531372, |
| "learning_rate": 1e-05, |
| "loss": 0.2703, |
| "step": 212900 |
| }, |
| { |
| "epoch": 0.00213, |
| "grad_norm": 1.2644281387329102, |
| "learning_rate": 1e-05, |
| "loss": 0.2716, |
| "step": 213000 |
| }, |
| { |
| "epoch": 0.002131, |
| "grad_norm": 1.6625502109527588, |
| "learning_rate": 1e-05, |
| "loss": 0.277, |
| "step": 213100 |
| }, |
| { |
| "epoch": 0.002132, |
| "grad_norm": 1.3630250692367554, |
| "learning_rate": 1e-05, |
| "loss": 0.2773, |
| "step": 213200 |
| }, |
| { |
| "epoch": 0.002133, |
| "grad_norm": 1.6373766660690308, |
| "learning_rate": 1e-05, |
| "loss": 0.2701, |
| "step": 213300 |
| }, |
| { |
| "epoch": 0.002134, |
| "grad_norm": 1.4623923301696777, |
| "learning_rate": 1e-05, |
| "loss": 0.2655, |
| "step": 213400 |
| }, |
| { |
| "epoch": 0.002135, |
| "grad_norm": 1.6090341806411743, |
| "learning_rate": 1e-05, |
| "loss": 0.2739, |
| "step": 213500 |
| }, |
| { |
| "epoch": 0.002136, |
| "grad_norm": 1.5013991594314575, |
| "learning_rate": 1e-05, |
| "loss": 0.2708, |
| "step": 213600 |
| }, |
| { |
| "epoch": 0.002137, |
| "grad_norm": 1.26161527633667, |
| "learning_rate": 1e-05, |
| "loss": 0.2757, |
| "step": 213700 |
| }, |
| { |
| "epoch": 0.002138, |
| "grad_norm": 1.418388843536377, |
| "learning_rate": 1e-05, |
| "loss": 0.2761, |
| "step": 213800 |
| }, |
| { |
| "epoch": 0.002139, |
| "grad_norm": 1.293133020401001, |
| "learning_rate": 1e-05, |
| "loss": 0.2855, |
| "step": 213900 |
| }, |
| { |
| "epoch": 0.00214, |
| "grad_norm": 1.9040838479995728, |
| "learning_rate": 1e-05, |
| "loss": 0.2761, |
| "step": 214000 |
| }, |
| { |
| "epoch": 0.002141, |
| "grad_norm": 1.5187069177627563, |
| "learning_rate": 1e-05, |
| "loss": 0.2758, |
| "step": 214100 |
| }, |
| { |
| "epoch": 0.002142, |
| "grad_norm": 1.3588346242904663, |
| "learning_rate": 1e-05, |
| "loss": 0.2709, |
| "step": 214200 |
| }, |
| { |
| "epoch": 0.002143, |
| "grad_norm": 1.5500917434692383, |
| "learning_rate": 1e-05, |
| "loss": 0.2794, |
| "step": 214300 |
| }, |
| { |
| "epoch": 0.002144, |
| "grad_norm": 1.6189924478530884, |
| "learning_rate": 1e-05, |
| "loss": 0.273, |
| "step": 214400 |
| }, |
| { |
| "epoch": 0.002145, |
| "grad_norm": 1.7019140720367432, |
| "learning_rate": 1e-05, |
| "loss": 0.2763, |
| "step": 214500 |
| }, |
| { |
| "epoch": 0.002146, |
| "grad_norm": 1.259387493133545, |
| "learning_rate": 1e-05, |
| "loss": 0.2776, |
| "step": 214600 |
| }, |
| { |
| "epoch": 0.002147, |
| "grad_norm": 2.1989023685455322, |
| "learning_rate": 1e-05, |
| "loss": 0.2803, |
| "step": 214700 |
| }, |
| { |
| "epoch": 0.002148, |
| "grad_norm": 2.2808122634887695, |
| "learning_rate": 1e-05, |
| "loss": 0.2789, |
| "step": 214800 |
| }, |
| { |
| "epoch": 0.002149, |
| "grad_norm": 1.5754348039627075, |
| "learning_rate": 1e-05, |
| "loss": 0.2754, |
| "step": 214900 |
| }, |
| { |
| "epoch": 0.00215, |
| "grad_norm": 1.4869142770767212, |
| "learning_rate": 1e-05, |
| "loss": 0.2714, |
| "step": 215000 |
| }, |
| { |
| "epoch": 0.002151, |
| "grad_norm": 1.4373691082000732, |
| "learning_rate": 1e-05, |
| "loss": 0.2723, |
| "step": 215100 |
| }, |
| { |
| "epoch": 0.002152, |
| "grad_norm": 1.5293453931808472, |
| "learning_rate": 1e-05, |
| "loss": 0.2758, |
| "step": 215200 |
| }, |
| { |
| "epoch": 0.002153, |
| "grad_norm": 1.556516408920288, |
| "learning_rate": 1e-05, |
| "loss": 0.2691, |
| "step": 215300 |
| }, |
| { |
| "epoch": 0.002154, |
| "grad_norm": 1.6215890645980835, |
| "learning_rate": 1e-05, |
| "loss": 0.2792, |
| "step": 215400 |
| }, |
| { |
| "epoch": 0.002155, |
| "grad_norm": 1.3890748023986816, |
| "learning_rate": 1e-05, |
| "loss": 0.2742, |
| "step": 215500 |
| }, |
| { |
| "epoch": 0.002156, |
| "grad_norm": 1.41310453414917, |
| "learning_rate": 1e-05, |
| "loss": 0.2732, |
| "step": 215600 |
| }, |
| { |
| "epoch": 0.002157, |
| "grad_norm": 1.4331954717636108, |
| "learning_rate": 1e-05, |
| "loss": 0.2775, |
| "step": 215700 |
| }, |
| { |
| "epoch": 0.002158, |
| "grad_norm": 1.3733137845993042, |
| "learning_rate": 1e-05, |
| "loss": 0.2709, |
| "step": 215800 |
| }, |
| { |
| "epoch": 0.002159, |
| "grad_norm": 1.2462104558944702, |
| "learning_rate": 1e-05, |
| "loss": 0.2761, |
| "step": 215900 |
| }, |
| { |
| "epoch": 0.00216, |
| "grad_norm": 1.506518006324768, |
| "learning_rate": 1e-05, |
| "loss": 0.2728, |
| "step": 216000 |
| }, |
| { |
| "epoch": 0.002161, |
| "grad_norm": 1.5095326900482178, |
| "learning_rate": 1e-05, |
| "loss": 0.2656, |
| "step": 216100 |
| }, |
| { |
| "epoch": 0.002162, |
| "grad_norm": 1.406726360321045, |
| "learning_rate": 1e-05, |
| "loss": 0.2774, |
| "step": 216200 |
| }, |
| { |
| "epoch": 0.002163, |
| "grad_norm": 1.5820876359939575, |
| "learning_rate": 1e-05, |
| "loss": 0.2735, |
| "step": 216300 |
| }, |
| { |
| "epoch": 0.002164, |
| "grad_norm": 1.4047499895095825, |
| "learning_rate": 1e-05, |
| "loss": 0.2771, |
| "step": 216400 |
| }, |
| { |
| "epoch": 0.002165, |
| "grad_norm": 1.3778250217437744, |
| "learning_rate": 1e-05, |
| "loss": 0.2708, |
| "step": 216500 |
| }, |
| { |
| "epoch": 0.002166, |
| "grad_norm": 1.4947162866592407, |
| "learning_rate": 1e-05, |
| "loss": 0.2746, |
| "step": 216600 |
| }, |
| { |
| "epoch": 0.002167, |
| "grad_norm": 1.6999680995941162, |
| "learning_rate": 1e-05, |
| "loss": 0.2738, |
| "step": 216700 |
| }, |
| { |
| "epoch": 0.002168, |
| "grad_norm": 1.2014107704162598, |
| "learning_rate": 1e-05, |
| "loss": 0.2782, |
| "step": 216800 |
| }, |
| { |
| "epoch": 0.002169, |
| "grad_norm": 1.5182017087936401, |
| "learning_rate": 1e-05, |
| "loss": 0.2693, |
| "step": 216900 |
| }, |
| { |
| "epoch": 0.00217, |
| "grad_norm": 1.5457252264022827, |
| "learning_rate": 1e-05, |
| "loss": 0.2716, |
| "step": 217000 |
| }, |
| { |
| "epoch": 0.002171, |
| "grad_norm": 1.2823336124420166, |
| "learning_rate": 1e-05, |
| "loss": 0.2699, |
| "step": 217100 |
| }, |
| { |
| "epoch": 0.002172, |
| "grad_norm": 1.5415891408920288, |
| "learning_rate": 1e-05, |
| "loss": 0.2761, |
| "step": 217200 |
| }, |
| { |
| "epoch": 0.002173, |
| "grad_norm": 1.4495307207107544, |
| "learning_rate": 1e-05, |
| "loss": 0.2759, |
| "step": 217300 |
| }, |
| { |
| "epoch": 0.002174, |
| "grad_norm": 1.3674854040145874, |
| "learning_rate": 1e-05, |
| "loss": 0.273, |
| "step": 217400 |
| }, |
| { |
| "epoch": 0.002175, |
| "grad_norm": 1.3650346994400024, |
| "learning_rate": 1e-05, |
| "loss": 0.2745, |
| "step": 217500 |
| }, |
| { |
| "epoch": 0.002176, |
| "grad_norm": 1.835408329963684, |
| "learning_rate": 1e-05, |
| "loss": 0.2683, |
| "step": 217600 |
| }, |
| { |
| "epoch": 0.002177, |
| "grad_norm": 1.332398533821106, |
| "learning_rate": 1e-05, |
| "loss": 0.273, |
| "step": 217700 |
| }, |
| { |
| "epoch": 0.002178, |
| "grad_norm": 1.5285613536834717, |
| "learning_rate": 1e-05, |
| "loss": 0.2775, |
| "step": 217800 |
| }, |
| { |
| "epoch": 0.002179, |
| "grad_norm": 1.9105749130249023, |
| "learning_rate": 1e-05, |
| "loss": 0.2749, |
| "step": 217900 |
| }, |
| { |
| "epoch": 0.00218, |
| "grad_norm": 1.4697386026382446, |
| "learning_rate": 1e-05, |
| "loss": 0.2796, |
| "step": 218000 |
| }, |
| { |
| "epoch": 0.002181, |
| "grad_norm": 1.2347923517227173, |
| "learning_rate": 1e-05, |
| "loss": 0.2715, |
| "step": 218100 |
| }, |
| { |
| "epoch": 0.002182, |
| "grad_norm": 1.539542317390442, |
| "learning_rate": 1e-05, |
| "loss": 0.2731, |
| "step": 218200 |
| }, |
| { |
| "epoch": 0.002183, |
| "grad_norm": 1.3916699886322021, |
| "learning_rate": 1e-05, |
| "loss": 0.2647, |
| "step": 218300 |
| }, |
| { |
| "epoch": 0.002184, |
| "grad_norm": 1.3453309535980225, |
| "learning_rate": 1e-05, |
| "loss": 0.2732, |
| "step": 218400 |
| }, |
| { |
| "epoch": 0.002185, |
| "grad_norm": 1.341639518737793, |
| "learning_rate": 1e-05, |
| "loss": 0.2738, |
| "step": 218500 |
| }, |
| { |
| "epoch": 0.002186, |
| "grad_norm": 1.5608952045440674, |
| "learning_rate": 1e-05, |
| "loss": 0.2731, |
| "step": 218600 |
| }, |
| { |
| "epoch": 0.002187, |
| "grad_norm": 1.3826178312301636, |
| "learning_rate": 1e-05, |
| "loss": 0.272, |
| "step": 218700 |
| }, |
| { |
| "epoch": 0.002188, |
| "grad_norm": 1.3272230625152588, |
| "learning_rate": 1e-05, |
| "loss": 0.2663, |
| "step": 218800 |
| }, |
| { |
| "epoch": 0.002189, |
| "grad_norm": 2.7572853565216064, |
| "learning_rate": 1e-05, |
| "loss": 0.2723, |
| "step": 218900 |
| }, |
| { |
| "epoch": 0.00219, |
| "grad_norm": 1.3882721662521362, |
| "learning_rate": 1e-05, |
| "loss": 0.2683, |
| "step": 219000 |
| }, |
| { |
| "epoch": 0.002191, |
| "grad_norm": 1.5614526271820068, |
| "learning_rate": 1e-05, |
| "loss": 0.2685, |
| "step": 219100 |
| }, |
| { |
| "epoch": 0.002192, |
| "grad_norm": 1.5847523212432861, |
| "learning_rate": 1e-05, |
| "loss": 0.2656, |
| "step": 219200 |
| }, |
| { |
| "epoch": 0.002193, |
| "grad_norm": 1.4185631275177002, |
| "learning_rate": 1e-05, |
| "loss": 0.2745, |
| "step": 219300 |
| }, |
| { |
| "epoch": 0.002194, |
| "grad_norm": 1.476169228553772, |
| "learning_rate": 1e-05, |
| "loss": 0.2712, |
| "step": 219400 |
| }, |
| { |
| "epoch": 0.002195, |
| "grad_norm": 1.512122631072998, |
| "learning_rate": 1e-05, |
| "loss": 0.2721, |
| "step": 219500 |
| }, |
| { |
| "epoch": 0.002196, |
| "grad_norm": 1.4664489030838013, |
| "learning_rate": 1e-05, |
| "loss": 0.2706, |
| "step": 219600 |
| }, |
| { |
| "epoch": 0.002197, |
| "grad_norm": 1.3304619789123535, |
| "learning_rate": 1e-05, |
| "loss": 0.272, |
| "step": 219700 |
| }, |
| { |
| "epoch": 0.002198, |
| "grad_norm": 1.65354585647583, |
| "learning_rate": 1e-05, |
| "loss": 0.2716, |
| "step": 219800 |
| }, |
| { |
| "epoch": 0.002199, |
| "grad_norm": 1.534138798713684, |
| "learning_rate": 1e-05, |
| "loss": 0.2689, |
| "step": 219900 |
| }, |
| { |
| "epoch": 0.0022, |
| "grad_norm": 1.5334886312484741, |
| "learning_rate": 1e-05, |
| "loss": 0.2702, |
| "step": 220000 |
| }, |
| { |
| "epoch": 0.0022, |
| "eval_loss": 0.2470703125, |
| "eval_runtime": 109.4816, |
| "eval_samples_per_second": 456.698, |
| "eval_steps_per_second": 28.544, |
| "step": 220000 |
| }, |
| { |
| "epoch": 0.002201, |
| "grad_norm": 1.6148320436477661, |
| "learning_rate": 1e-05, |
| "loss": 0.2714, |
| "step": 220100 |
| }, |
| { |
| "epoch": 0.002202, |
| "grad_norm": 1.3717103004455566, |
| "learning_rate": 1e-05, |
| "loss": 0.2714, |
| "step": 220200 |
| }, |
| { |
| "epoch": 0.002203, |
| "grad_norm": 1.325392484664917, |
| "learning_rate": 1e-05, |
| "loss": 0.2685, |
| "step": 220300 |
| }, |
| { |
| "epoch": 0.002204, |
| "grad_norm": 1.1573683023452759, |
| "learning_rate": 1e-05, |
| "loss": 0.2777, |
| "step": 220400 |
| }, |
| { |
| "epoch": 0.002205, |
| "grad_norm": 1.4676803350448608, |
| "learning_rate": 1e-05, |
| "loss": 0.2766, |
| "step": 220500 |
| }, |
| { |
| "epoch": 0.002206, |
| "grad_norm": 1.3724902868270874, |
| "learning_rate": 1e-05, |
| "loss": 0.267, |
| "step": 220600 |
| }, |
| { |
| "epoch": 0.002207, |
| "grad_norm": 1.3178821802139282, |
| "learning_rate": 1e-05, |
| "loss": 0.2677, |
| "step": 220700 |
| }, |
| { |
| "epoch": 0.002208, |
| "grad_norm": 1.4839378595352173, |
| "learning_rate": 1e-05, |
| "loss": 0.2751, |
| "step": 220800 |
| }, |
| { |
| "epoch": 0.002209, |
| "grad_norm": 1.4000365734100342, |
| "learning_rate": 1e-05, |
| "loss": 0.2715, |
| "step": 220900 |
| }, |
| { |
| "epoch": 0.00221, |
| "grad_norm": 1.4803142547607422, |
| "learning_rate": 1e-05, |
| "loss": 0.2633, |
| "step": 221000 |
| }, |
| { |
| "epoch": 0.002211, |
| "grad_norm": 1.8429442644119263, |
| "learning_rate": 1e-05, |
| "loss": 0.2721, |
| "step": 221100 |
| }, |
| { |
| "epoch": 0.002212, |
| "grad_norm": 1.3296468257904053, |
| "learning_rate": 1e-05, |
| "loss": 0.2657, |
| "step": 221200 |
| }, |
| { |
| "epoch": 0.002213, |
| "grad_norm": 1.283510446548462, |
| "learning_rate": 1e-05, |
| "loss": 0.2664, |
| "step": 221300 |
| }, |
| { |
| "epoch": 0.002214, |
| "grad_norm": 1.7253391742706299, |
| "learning_rate": 1e-05, |
| "loss": 0.2719, |
| "step": 221400 |
| }, |
| { |
| "epoch": 0.002215, |
| "grad_norm": 1.482222557067871, |
| "learning_rate": 1e-05, |
| "loss": 0.27, |
| "step": 221500 |
| }, |
| { |
| "epoch": 0.002216, |
| "grad_norm": 1.497300148010254, |
| "learning_rate": 1e-05, |
| "loss": 0.2721, |
| "step": 221600 |
| }, |
| { |
| "epoch": 0.002217, |
| "grad_norm": 1.3005281686782837, |
| "learning_rate": 1e-05, |
| "loss": 0.271, |
| "step": 221700 |
| }, |
| { |
| "epoch": 0.002218, |
| "grad_norm": 1.3060221672058105, |
| "learning_rate": 1e-05, |
| "loss": 0.2728, |
| "step": 221800 |
| }, |
| { |
| "epoch": 0.002219, |
| "grad_norm": 1.7013475894927979, |
| "learning_rate": 1e-05, |
| "loss": 0.2626, |
| "step": 221900 |
| }, |
| { |
| "epoch": 0.00222, |
| "grad_norm": 1.3594257831573486, |
| "learning_rate": 1e-05, |
| "loss": 0.2687, |
| "step": 222000 |
| }, |
| { |
| "epoch": 0.002221, |
| "grad_norm": 1.3790507316589355, |
| "learning_rate": 1e-05, |
| "loss": 0.2747, |
| "step": 222100 |
| }, |
| { |
| "epoch": 0.002222, |
| "grad_norm": 1.2899534702301025, |
| "learning_rate": 1e-05, |
| "loss": 0.2636, |
| "step": 222200 |
| }, |
| { |
| "epoch": 0.002223, |
| "grad_norm": 1.3310524225234985, |
| "learning_rate": 1e-05, |
| "loss": 0.2781, |
| "step": 222300 |
| }, |
| { |
| "epoch": 0.002224, |
| "grad_norm": 1.5631316900253296, |
| "learning_rate": 1e-05, |
| "loss": 0.2668, |
| "step": 222400 |
| }, |
| { |
| "epoch": 0.002225, |
| "grad_norm": 1.5219013690948486, |
| "learning_rate": 1e-05, |
| "loss": 0.2705, |
| "step": 222500 |
| }, |
| { |
| "epoch": 0.002226, |
| "grad_norm": 1.2552835941314697, |
| "learning_rate": 1e-05, |
| "loss": 0.2717, |
| "step": 222600 |
| }, |
| { |
| "epoch": 0.002227, |
| "grad_norm": 1.4404419660568237, |
| "learning_rate": 1e-05, |
| "loss": 0.2696, |
| "step": 222700 |
| }, |
| { |
| "epoch": 0.002228, |
| "grad_norm": 1.3716353178024292, |
| "learning_rate": 1e-05, |
| "loss": 0.2623, |
| "step": 222800 |
| }, |
| { |
| "epoch": 0.002229, |
| "grad_norm": 1.5366202592849731, |
| "learning_rate": 1e-05, |
| "loss": 0.2639, |
| "step": 222900 |
| }, |
| { |
| "epoch": 0.00223, |
| "grad_norm": 1.366859793663025, |
| "learning_rate": 1e-05, |
| "loss": 0.2639, |
| "step": 223000 |
| }, |
| { |
| "epoch": 0.002231, |
| "grad_norm": 1.3707561492919922, |
| "learning_rate": 1e-05, |
| "loss": 0.2705, |
| "step": 223100 |
| }, |
| { |
| "epoch": 0.002232, |
| "grad_norm": 1.5594546794891357, |
| "learning_rate": 1e-05, |
| "loss": 0.2729, |
| "step": 223200 |
| }, |
| { |
| "epoch": 0.002233, |
| "grad_norm": 1.404626488685608, |
| "learning_rate": 1e-05, |
| "loss": 0.2649, |
| "step": 223300 |
| }, |
| { |
| "epoch": 0.002234, |
| "grad_norm": 1.6944884061813354, |
| "learning_rate": 1e-05, |
| "loss": 0.2686, |
| "step": 223400 |
| }, |
| { |
| "epoch": 0.002235, |
| "grad_norm": 1.9831916093826294, |
| "learning_rate": 1e-05, |
| "loss": 0.274, |
| "step": 223500 |
| }, |
| { |
| "epoch": 0.002236, |
| "grad_norm": 1.7819033861160278, |
| "learning_rate": 1e-05, |
| "loss": 0.2734, |
| "step": 223600 |
| }, |
| { |
| "epoch": 0.002237, |
| "grad_norm": 1.5690064430236816, |
| "learning_rate": 1e-05, |
| "loss": 0.2737, |
| "step": 223700 |
| }, |
| { |
| "epoch": 0.002238, |
| "grad_norm": 1.2317233085632324, |
| "learning_rate": 1e-05, |
| "loss": 0.2679, |
| "step": 223800 |
| }, |
| { |
| "epoch": 0.002239, |
| "grad_norm": 2.0383543968200684, |
| "learning_rate": 1e-05, |
| "loss": 0.2679, |
| "step": 223900 |
| }, |
| { |
| "epoch": 0.00224, |
| "grad_norm": 1.4997539520263672, |
| "learning_rate": 1e-05, |
| "loss": 0.2727, |
| "step": 224000 |
| }, |
| { |
| "epoch": 0.002241, |
| "grad_norm": 1.3992339372634888, |
| "learning_rate": 1e-05, |
| "loss": 0.273, |
| "step": 224100 |
| }, |
| { |
| "epoch": 0.002242, |
| "grad_norm": 1.5344079732894897, |
| "learning_rate": 1e-05, |
| "loss": 0.2712, |
| "step": 224200 |
| }, |
| { |
| "epoch": 0.002243, |
| "grad_norm": 1.6358779668807983, |
| "learning_rate": 1e-05, |
| "loss": 0.2728, |
| "step": 224300 |
| }, |
| { |
| "epoch": 0.002244, |
| "grad_norm": 1.6222172975540161, |
| "learning_rate": 1e-05, |
| "loss": 0.2621, |
| "step": 224400 |
| }, |
| { |
| "epoch": 0.002245, |
| "grad_norm": 1.5196174383163452, |
| "learning_rate": 1e-05, |
| "loss": 0.2696, |
| "step": 224500 |
| }, |
| { |
| "epoch": 0.002246, |
| "grad_norm": 1.477639079093933, |
| "learning_rate": 1e-05, |
| "loss": 0.2822, |
| "step": 224600 |
| }, |
| { |
| "epoch": 0.002247, |
| "grad_norm": 1.6515791416168213, |
| "learning_rate": 1e-05, |
| "loss": 0.2716, |
| "step": 224700 |
| }, |
| { |
| "epoch": 0.002248, |
| "grad_norm": 1.387439489364624, |
| "learning_rate": 1e-05, |
| "loss": 0.2674, |
| "step": 224800 |
| }, |
| { |
| "epoch": 0.002249, |
| "grad_norm": 1.264535665512085, |
| "learning_rate": 1e-05, |
| "loss": 0.2708, |
| "step": 224900 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 1.4407367706298828, |
| "learning_rate": 1e-05, |
| "loss": 0.2631, |
| "step": 225000 |
| }, |
| { |
| "epoch": 0.002251, |
| "grad_norm": 1.5102108716964722, |
| "learning_rate": 1e-05, |
| "loss": 0.272, |
| "step": 225100 |
| }, |
| { |
| "epoch": 0.002252, |
| "grad_norm": 1.5774545669555664, |
| "learning_rate": 1e-05, |
| "loss": 0.2675, |
| "step": 225200 |
| }, |
| { |
| "epoch": 0.002253, |
| "grad_norm": 1.331253170967102, |
| "learning_rate": 1e-05, |
| "loss": 0.2638, |
| "step": 225300 |
| }, |
| { |
| "epoch": 0.002254, |
| "grad_norm": 1.443367600440979, |
| "learning_rate": 1e-05, |
| "loss": 0.2756, |
| "step": 225400 |
| }, |
| { |
| "epoch": 0.002255, |
| "grad_norm": 1.3950369358062744, |
| "learning_rate": 1e-05, |
| "loss": 0.2666, |
| "step": 225500 |
| }, |
| { |
| "epoch": 0.002256, |
| "grad_norm": 2.172563314437866, |
| "learning_rate": 1e-05, |
| "loss": 0.2777, |
| "step": 225600 |
| }, |
| { |
| "epoch": 0.002257, |
| "grad_norm": 1.1899489164352417, |
| "learning_rate": 1e-05, |
| "loss": 0.2651, |
| "step": 225700 |
| }, |
| { |
| "epoch": 0.002258, |
| "grad_norm": 1.459839105606079, |
| "learning_rate": 1e-05, |
| "loss": 0.2691, |
| "step": 225800 |
| }, |
| { |
| "epoch": 0.002259, |
| "grad_norm": 1.577430248260498, |
| "learning_rate": 1e-05, |
| "loss": 0.274, |
| "step": 225900 |
| }, |
| { |
| "epoch": 0.00226, |
| "grad_norm": 2.454540729522705, |
| "learning_rate": 1e-05, |
| "loss": 0.2731, |
| "step": 226000 |
| }, |
| { |
| "epoch": 0.002261, |
| "grad_norm": 1.4699666500091553, |
| "learning_rate": 1e-05, |
| "loss": 0.2684, |
| "step": 226100 |
| }, |
| { |
| "epoch": 0.002262, |
| "grad_norm": 1.745713472366333, |
| "learning_rate": 1e-05, |
| "loss": 0.2742, |
| "step": 226200 |
| }, |
| { |
| "epoch": 0.002263, |
| "grad_norm": 1.7182310819625854, |
| "learning_rate": 1e-05, |
| "loss": 0.2695, |
| "step": 226300 |
| }, |
| { |
| "epoch": 0.002264, |
| "grad_norm": 1.3382000923156738, |
| "learning_rate": 1e-05, |
| "loss": 0.2638, |
| "step": 226400 |
| }, |
| { |
| "epoch": 0.002265, |
| "grad_norm": 1.5027062892913818, |
| "learning_rate": 1e-05, |
| "loss": 0.2738, |
| "step": 226500 |
| }, |
| { |
| "epoch": 0.002266, |
| "grad_norm": 1.4339383840560913, |
| "learning_rate": 1e-05, |
| "loss": 0.2651, |
| "step": 226600 |
| }, |
| { |
| "epoch": 0.002267, |
| "grad_norm": 1.5018837451934814, |
| "learning_rate": 1e-05, |
| "loss": 0.2681, |
| "step": 226700 |
| }, |
| { |
| "epoch": 0.002268, |
| "grad_norm": 1.4683713912963867, |
| "learning_rate": 1e-05, |
| "loss": 0.267, |
| "step": 226800 |
| }, |
| { |
| "epoch": 0.002269, |
| "grad_norm": 1.5575743913650513, |
| "learning_rate": 1e-05, |
| "loss": 0.2668, |
| "step": 226900 |
| }, |
| { |
| "epoch": 0.00227, |
| "grad_norm": 1.3830864429473877, |
| "learning_rate": 1e-05, |
| "loss": 0.2701, |
| "step": 227000 |
| }, |
| { |
| "epoch": 0.002271, |
| "grad_norm": 1.3803704977035522, |
| "learning_rate": 1e-05, |
| "loss": 0.266, |
| "step": 227100 |
| }, |
| { |
| "epoch": 0.002272, |
| "grad_norm": 1.6529074907302856, |
| "learning_rate": 1e-05, |
| "loss": 0.2724, |
| "step": 227200 |
| }, |
| { |
| "epoch": 0.002273, |
| "grad_norm": 1.5334415435791016, |
| "learning_rate": 1e-05, |
| "loss": 0.272, |
| "step": 227300 |
| }, |
| { |
| "epoch": 0.002274, |
| "grad_norm": 1.4912134408950806, |
| "learning_rate": 1e-05, |
| "loss": 0.2723, |
| "step": 227400 |
| }, |
| { |
| "epoch": 0.002275, |
| "grad_norm": 1.375284194946289, |
| "learning_rate": 1e-05, |
| "loss": 0.2702, |
| "step": 227500 |
| }, |
| { |
| "epoch": 0.002276, |
| "grad_norm": 1.2836390733718872, |
| "learning_rate": 1e-05, |
| "loss": 0.2738, |
| "step": 227600 |
| }, |
| { |
| "epoch": 0.002277, |
| "grad_norm": 1.2948681116104126, |
| "learning_rate": 1e-05, |
| "loss": 0.275, |
| "step": 227700 |
| }, |
| { |
| "epoch": 0.002278, |
| "grad_norm": 1.3349876403808594, |
| "learning_rate": 1e-05, |
| "loss": 0.2673, |
| "step": 227800 |
| }, |
| { |
| "epoch": 0.002279, |
| "grad_norm": 1.422441840171814, |
| "learning_rate": 1e-05, |
| "loss": 0.267, |
| "step": 227900 |
| }, |
| { |
| "epoch": 0.00228, |
| "grad_norm": 1.4071296453475952, |
| "learning_rate": 1e-05, |
| "loss": 0.2688, |
| "step": 228000 |
| }, |
| { |
| "epoch": 0.002281, |
| "grad_norm": 1.3052148818969727, |
| "learning_rate": 1e-05, |
| "loss": 0.2674, |
| "step": 228100 |
| }, |
| { |
| "epoch": 0.002282, |
| "grad_norm": 1.4938229322433472, |
| "learning_rate": 1e-05, |
| "loss": 0.2693, |
| "step": 228200 |
| }, |
| { |
| "epoch": 0.002283, |
| "grad_norm": 1.48699951171875, |
| "learning_rate": 1e-05, |
| "loss": 0.2681, |
| "step": 228300 |
| }, |
| { |
| "epoch": 0.002284, |
| "grad_norm": 1.5458961725234985, |
| "learning_rate": 1e-05, |
| "loss": 0.2697, |
| "step": 228400 |
| }, |
| { |
| "epoch": 0.002285, |
| "grad_norm": 1.394895315170288, |
| "learning_rate": 1e-05, |
| "loss": 0.2678, |
| "step": 228500 |
| }, |
| { |
| "epoch": 0.002286, |
| "grad_norm": 1.3542561531066895, |
| "learning_rate": 1e-05, |
| "loss": 0.2669, |
| "step": 228600 |
| }, |
| { |
| "epoch": 0.002287, |
| "grad_norm": 1.5436086654663086, |
| "learning_rate": 1e-05, |
| "loss": 0.2681, |
| "step": 228700 |
| }, |
| { |
| "epoch": 0.002288, |
| "grad_norm": 1.3657710552215576, |
| "learning_rate": 1e-05, |
| "loss": 0.2731, |
| "step": 228800 |
| }, |
| { |
| "epoch": 0.002289, |
| "grad_norm": 1.2138234376907349, |
| "learning_rate": 1e-05, |
| "loss": 0.2706, |
| "step": 228900 |
| }, |
| { |
| "epoch": 0.00229, |
| "grad_norm": 1.4697974920272827, |
| "learning_rate": 1e-05, |
| "loss": 0.2695, |
| "step": 229000 |
| }, |
| { |
| "epoch": 0.002291, |
| "grad_norm": 1.532043695449829, |
| "learning_rate": 1e-05, |
| "loss": 0.2691, |
| "step": 229100 |
| }, |
| { |
| "epoch": 0.002292, |
| "grad_norm": 1.5469400882720947, |
| "learning_rate": 1e-05, |
| "loss": 0.2635, |
| "step": 229200 |
| }, |
| { |
| "epoch": 0.002293, |
| "grad_norm": 1.3268221616744995, |
| "learning_rate": 1e-05, |
| "loss": 0.2703, |
| "step": 229300 |
| }, |
| { |
| "epoch": 0.002294, |
| "grad_norm": 1.3366742134094238, |
| "learning_rate": 1e-05, |
| "loss": 0.2658, |
| "step": 229400 |
| }, |
| { |
| "epoch": 0.002295, |
| "grad_norm": 1.3616935014724731, |
| "learning_rate": 1e-05, |
| "loss": 0.2728, |
| "step": 229500 |
| }, |
| { |
| "epoch": 0.002296, |
| "grad_norm": 1.3293488025665283, |
| "learning_rate": 1e-05, |
| "loss": 0.2703, |
| "step": 229600 |
| }, |
| { |
| "epoch": 0.002297, |
| "grad_norm": 1.58241605758667, |
| "learning_rate": 1e-05, |
| "loss": 0.2629, |
| "step": 229700 |
| }, |
| { |
| "epoch": 0.002298, |
| "grad_norm": 1.240764856338501, |
| "learning_rate": 1e-05, |
| "loss": 0.2664, |
| "step": 229800 |
| }, |
| { |
| "epoch": 0.002299, |
| "grad_norm": 1.480238676071167, |
| "learning_rate": 1e-05, |
| "loss": 0.2737, |
| "step": 229900 |
| }, |
| { |
| "epoch": 0.0023, |
| "grad_norm": 1.6399883031845093, |
| "learning_rate": 1e-05, |
| "loss": 0.265, |
| "step": 230000 |
| }, |
| { |
| "epoch": 0.002301, |
| "grad_norm": 1.5711274147033691, |
| "learning_rate": 1e-05, |
| "loss": 0.265, |
| "step": 230100 |
| }, |
| { |
| "epoch": 0.002302, |
| "grad_norm": 1.3830393552780151, |
| "learning_rate": 1e-05, |
| "loss": 0.2708, |
| "step": 230200 |
| }, |
| { |
| "epoch": 0.002303, |
| "grad_norm": 1.3761115074157715, |
| "learning_rate": 1e-05, |
| "loss": 0.2665, |
| "step": 230300 |
| }, |
| { |
| "epoch": 0.002304, |
| "grad_norm": 1.4468395709991455, |
| "learning_rate": 1e-05, |
| "loss": 0.2665, |
| "step": 230400 |
| }, |
| { |
| "epoch": 0.002305, |
| "grad_norm": 1.2549031972885132, |
| "learning_rate": 1e-05, |
| "loss": 0.2704, |
| "step": 230500 |
| }, |
| { |
| "epoch": 0.002306, |
| "grad_norm": 1.5276696681976318, |
| "learning_rate": 1e-05, |
| "loss": 0.2617, |
| "step": 230600 |
| }, |
| { |
| "epoch": 0.002307, |
| "grad_norm": 1.222853660583496, |
| "learning_rate": 1e-05, |
| "loss": 0.2692, |
| "step": 230700 |
| }, |
| { |
| "epoch": 0.002308, |
| "grad_norm": 1.504705548286438, |
| "learning_rate": 1e-05, |
| "loss": 0.2743, |
| "step": 230800 |
| }, |
| { |
| "epoch": 0.002309, |
| "grad_norm": 1.380125880241394, |
| "learning_rate": 1e-05, |
| "loss": 0.273, |
| "step": 230900 |
| }, |
| { |
| "epoch": 0.00231, |
| "grad_norm": 1.5362893342971802, |
| "learning_rate": 1e-05, |
| "loss": 0.2617, |
| "step": 231000 |
| }, |
| { |
| "epoch": 0.002311, |
| "grad_norm": 1.308255672454834, |
| "learning_rate": 1e-05, |
| "loss": 0.2607, |
| "step": 231100 |
| }, |
| { |
| "epoch": 0.002312, |
| "grad_norm": 1.46437406539917, |
| "learning_rate": 1e-05, |
| "loss": 0.2623, |
| "step": 231200 |
| }, |
| { |
| "epoch": 0.002313, |
| "grad_norm": 1.7176834344863892, |
| "learning_rate": 1e-05, |
| "loss": 0.2733, |
| "step": 231300 |
| }, |
| { |
| "epoch": 0.002314, |
| "grad_norm": 1.2125487327575684, |
| "learning_rate": 1e-05, |
| "loss": 0.2661, |
| "step": 231400 |
| }, |
| { |
| "epoch": 0.002315, |
| "grad_norm": 1.656404972076416, |
| "learning_rate": 1e-05, |
| "loss": 0.2719, |
| "step": 231500 |
| }, |
| { |
| "epoch": 0.002316, |
| "grad_norm": 1.2319536209106445, |
| "learning_rate": 1e-05, |
| "loss": 0.2662, |
| "step": 231600 |
| }, |
| { |
| "epoch": 0.002317, |
| "grad_norm": 1.4529757499694824, |
| "learning_rate": 1e-05, |
| "loss": 0.2668, |
| "step": 231700 |
| }, |
| { |
| "epoch": 0.002318, |
| "grad_norm": 1.3441050052642822, |
| "learning_rate": 1e-05, |
| "loss": 0.2664, |
| "step": 231800 |
| }, |
| { |
| "epoch": 0.002319, |
| "grad_norm": 1.3161287307739258, |
| "learning_rate": 1e-05, |
| "loss": 0.2664, |
| "step": 231900 |
| }, |
| { |
| "epoch": 0.00232, |
| "grad_norm": 1.5237764120101929, |
| "learning_rate": 1e-05, |
| "loss": 0.2661, |
| "step": 232000 |
| }, |
| { |
| "epoch": 0.002321, |
| "grad_norm": 1.4569116830825806, |
| "learning_rate": 1e-05, |
| "loss": 0.2683, |
| "step": 232100 |
| }, |
| { |
| "epoch": 0.002322, |
| "grad_norm": 1.4047276973724365, |
| "learning_rate": 1e-05, |
| "loss": 0.2658, |
| "step": 232200 |
| }, |
| { |
| "epoch": 0.002323, |
| "grad_norm": 1.6638625860214233, |
| "learning_rate": 1e-05, |
| "loss": 0.2632, |
| "step": 232300 |
| }, |
| { |
| "epoch": 0.002324, |
| "grad_norm": 1.2283929586410522, |
| "learning_rate": 1e-05, |
| "loss": 0.2639, |
| "step": 232400 |
| }, |
| { |
| "epoch": 0.002325, |
| "grad_norm": 1.2033153772354126, |
| "learning_rate": 1e-05, |
| "loss": 0.2679, |
| "step": 232500 |
| }, |
| { |
| "epoch": 0.002326, |
| "grad_norm": 1.37595534324646, |
| "learning_rate": 1e-05, |
| "loss": 0.2671, |
| "step": 232600 |
| }, |
| { |
| "epoch": 0.002327, |
| "grad_norm": 1.4282982349395752, |
| "learning_rate": 1e-05, |
| "loss": 0.2645, |
| "step": 232700 |
| }, |
| { |
| "epoch": 0.002328, |
| "grad_norm": 1.3374396562576294, |
| "learning_rate": 1e-05, |
| "loss": 0.2702, |
| "step": 232800 |
| }, |
| { |
| "epoch": 0.002329, |
| "grad_norm": 1.2464615106582642, |
| "learning_rate": 1e-05, |
| "loss": 0.2704, |
| "step": 232900 |
| }, |
| { |
| "epoch": 0.00233, |
| "grad_norm": 1.4305086135864258, |
| "learning_rate": 1e-05, |
| "loss": 0.2675, |
| "step": 233000 |
| }, |
| { |
| "epoch": 0.002331, |
| "grad_norm": 1.404268503189087, |
| "learning_rate": 1e-05, |
| "loss": 0.2673, |
| "step": 233100 |
| }, |
| { |
| "epoch": 0.002332, |
| "grad_norm": 1.2417124509811401, |
| "learning_rate": 1e-05, |
| "loss": 0.2652, |
| "step": 233200 |
| }, |
| { |
| "epoch": 0.002333, |
| "grad_norm": 1.4305444955825806, |
| "learning_rate": 1e-05, |
| "loss": 0.264, |
| "step": 233300 |
| }, |
| { |
| "epoch": 0.002334, |
| "grad_norm": 1.4114798307418823, |
| "learning_rate": 1e-05, |
| "loss": 0.2721, |
| "step": 233400 |
| }, |
| { |
| "epoch": 0.002335, |
| "grad_norm": 1.5132521390914917, |
| "learning_rate": 1e-05, |
| "loss": 0.2662, |
| "step": 233500 |
| }, |
| { |
| "epoch": 0.002336, |
| "grad_norm": 1.2459546327590942, |
| "learning_rate": 1e-05, |
| "loss": 0.2608, |
| "step": 233600 |
| }, |
| { |
| "epoch": 0.002337, |
| "grad_norm": 1.3642733097076416, |
| "learning_rate": 1e-05, |
| "loss": 0.2673, |
| "step": 233700 |
| }, |
| { |
| "epoch": 0.002338, |
| "grad_norm": 1.2351726293563843, |
| "learning_rate": 1e-05, |
| "loss": 0.2655, |
| "step": 233800 |
| }, |
| { |
| "epoch": 0.002339, |
| "grad_norm": 1.2862097024917603, |
| "learning_rate": 1e-05, |
| "loss": 0.2726, |
| "step": 233900 |
| }, |
| { |
| "epoch": 0.00234, |
| "grad_norm": 1.286604642868042, |
| "learning_rate": 1e-05, |
| "loss": 0.2626, |
| "step": 234000 |
| }, |
| { |
| "epoch": 0.002341, |
| "grad_norm": 1.5336260795593262, |
| "learning_rate": 1e-05, |
| "loss": 0.2654, |
| "step": 234100 |
| }, |
| { |
| "epoch": 0.002342, |
| "grad_norm": 1.5840083360671997, |
| "learning_rate": 1e-05, |
| "loss": 0.2664, |
| "step": 234200 |
| }, |
| { |
| "epoch": 0.002343, |
| "grad_norm": 1.7138983011245728, |
| "learning_rate": 1e-05, |
| "loss": 0.2649, |
| "step": 234300 |
| }, |
| { |
| "epoch": 0.002344, |
| "grad_norm": 1.624403953552246, |
| "learning_rate": 1e-05, |
| "loss": 0.2604, |
| "step": 234400 |
| }, |
| { |
| "epoch": 0.002345, |
| "grad_norm": 1.43915593624115, |
| "learning_rate": 1e-05, |
| "loss": 0.2656, |
| "step": 234500 |
| }, |
| { |
| "epoch": 0.002346, |
| "grad_norm": 1.4849474430084229, |
| "learning_rate": 1e-05, |
| "loss": 0.2651, |
| "step": 234600 |
| }, |
| { |
| "epoch": 0.002347, |
| "grad_norm": 1.5203824043273926, |
| "learning_rate": 1e-05, |
| "loss": 0.2628, |
| "step": 234700 |
| }, |
| { |
| "epoch": 0.002348, |
| "grad_norm": 1.2643849849700928, |
| "learning_rate": 1e-05, |
| "loss": 0.2661, |
| "step": 234800 |
| }, |
| { |
| "epoch": 0.002349, |
| "grad_norm": 1.2797199487686157, |
| "learning_rate": 1e-05, |
| "loss": 0.2624, |
| "step": 234900 |
| }, |
| { |
| "epoch": 0.00235, |
| "grad_norm": 1.4327815771102905, |
| "learning_rate": 1e-05, |
| "loss": 0.2635, |
| "step": 235000 |
| }, |
| { |
| "epoch": 0.002351, |
| "grad_norm": 1.5249953269958496, |
| "learning_rate": 1e-05, |
| "loss": 0.265, |
| "step": 235100 |
| }, |
| { |
| "epoch": 0.002352, |
| "grad_norm": 1.46501624584198, |
| "learning_rate": 1e-05, |
| "loss": 0.2676, |
| "step": 235200 |
| }, |
| { |
| "epoch": 0.002353, |
| "grad_norm": 1.5722047090530396, |
| "learning_rate": 1e-05, |
| "loss": 0.2574, |
| "step": 235300 |
| }, |
| { |
| "epoch": 0.002354, |
| "grad_norm": 1.2025928497314453, |
| "learning_rate": 1e-05, |
| "loss": 0.2671, |
| "step": 235400 |
| }, |
| { |
| "epoch": 0.002355, |
| "grad_norm": 1.554432988166809, |
| "learning_rate": 1e-05, |
| "loss": 0.2703, |
| "step": 235500 |
| }, |
| { |
| "epoch": 0.002356, |
| "grad_norm": 1.4228944778442383, |
| "learning_rate": 1e-05, |
| "loss": 0.2728, |
| "step": 235600 |
| }, |
| { |
| "epoch": 0.002357, |
| "grad_norm": 1.5761821269989014, |
| "learning_rate": 1e-05, |
| "loss": 0.2637, |
| "step": 235700 |
| }, |
| { |
| "epoch": 0.002358, |
| "grad_norm": 1.1505320072174072, |
| "learning_rate": 1e-05, |
| "loss": 0.2675, |
| "step": 235800 |
| }, |
| { |
| "epoch": 0.002359, |
| "grad_norm": 1.4779144525527954, |
| "learning_rate": 1e-05, |
| "loss": 0.2716, |
| "step": 235900 |
| }, |
| { |
| "epoch": 0.00236, |
| "grad_norm": 1.3939759731292725, |
| "learning_rate": 1e-05, |
| "loss": 0.2616, |
| "step": 236000 |
| }, |
| { |
| "epoch": 0.002361, |
| "grad_norm": 1.5327190160751343, |
| "learning_rate": 1e-05, |
| "loss": 0.263, |
| "step": 236100 |
| }, |
| { |
| "epoch": 0.002362, |
| "grad_norm": 1.338335394859314, |
| "learning_rate": 1e-05, |
| "loss": 0.2696, |
| "step": 236200 |
| }, |
| { |
| "epoch": 0.002363, |
| "grad_norm": 1.4766395092010498, |
| "learning_rate": 1e-05, |
| "loss": 0.2657, |
| "step": 236300 |
| }, |
| { |
| "epoch": 0.002364, |
| "grad_norm": 1.292716145515442, |
| "learning_rate": 1e-05, |
| "loss": 0.2611, |
| "step": 236400 |
| }, |
| { |
| "epoch": 0.002365, |
| "grad_norm": 1.3678349256515503, |
| "learning_rate": 1e-05, |
| "loss": 0.2663, |
| "step": 236500 |
| }, |
| { |
| "epoch": 0.002366, |
| "grad_norm": 1.5033866167068481, |
| "learning_rate": 1e-05, |
| "loss": 0.2693, |
| "step": 236600 |
| }, |
| { |
| "epoch": 0.002367, |
| "grad_norm": 1.1508349180221558, |
| "learning_rate": 1e-05, |
| "loss": 0.2631, |
| "step": 236700 |
| }, |
| { |
| "epoch": 0.002368, |
| "grad_norm": 1.627375602722168, |
| "learning_rate": 1e-05, |
| "loss": 0.2661, |
| "step": 236800 |
| }, |
| { |
| "epoch": 0.002369, |
| "grad_norm": 1.5816211700439453, |
| "learning_rate": 1e-05, |
| "loss": 0.2655, |
| "step": 236900 |
| }, |
| { |
| "epoch": 0.00237, |
| "grad_norm": 1.6481306552886963, |
| "learning_rate": 1e-05, |
| "loss": 0.2649, |
| "step": 237000 |
| }, |
| { |
| "epoch": 0.002371, |
| "grad_norm": 1.6917251348495483, |
| "learning_rate": 1e-05, |
| "loss": 0.2653, |
| "step": 237100 |
| }, |
| { |
| "epoch": 0.002372, |
| "grad_norm": 2.067993640899658, |
| "learning_rate": 1e-05, |
| "loss": 0.2593, |
| "step": 237200 |
| }, |
| { |
| "epoch": 0.002373, |
| "grad_norm": 1.3310753107070923, |
| "learning_rate": 1e-05, |
| "loss": 0.2667, |
| "step": 237300 |
| }, |
| { |
| "epoch": 0.002374, |
| "grad_norm": 1.6005228757858276, |
| "learning_rate": 1e-05, |
| "loss": 0.2658, |
| "step": 237400 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 1.285658836364746, |
| "learning_rate": 1e-05, |
| "loss": 0.2605, |
| "step": 237500 |
| }, |
| { |
| "epoch": 0.002376, |
| "grad_norm": 1.37336003780365, |
| "learning_rate": 1e-05, |
| "loss": 0.2691, |
| "step": 237600 |
| }, |
| { |
| "epoch": 0.002377, |
| "grad_norm": 1.3081128597259521, |
| "learning_rate": 1e-05, |
| "loss": 0.2597, |
| "step": 237700 |
| }, |
| { |
| "epoch": 0.002378, |
| "grad_norm": 1.4721745252609253, |
| "learning_rate": 1e-05, |
| "loss": 0.2679, |
| "step": 237800 |
| }, |
| { |
| "epoch": 0.002379, |
| "grad_norm": 1.4306268692016602, |
| "learning_rate": 1e-05, |
| "loss": 0.267, |
| "step": 237900 |
| }, |
| { |
| "epoch": 0.00238, |
| "grad_norm": 1.323388934135437, |
| "learning_rate": 1e-05, |
| "loss": 0.2656, |
| "step": 238000 |
| }, |
| { |
| "epoch": 0.002381, |
| "grad_norm": 1.5063282251358032, |
| "learning_rate": 1e-05, |
| "loss": 0.2647, |
| "step": 238100 |
| }, |
| { |
| "epoch": 0.002382, |
| "grad_norm": 1.50105619430542, |
| "learning_rate": 1e-05, |
| "loss": 0.2627, |
| "step": 238200 |
| }, |
| { |
| "epoch": 0.002383, |
| "grad_norm": 1.325971245765686, |
| "learning_rate": 1e-05, |
| "loss": 0.2691, |
| "step": 238300 |
| }, |
| { |
| "epoch": 0.002384, |
| "grad_norm": 1.3668450117111206, |
| "learning_rate": 1e-05, |
| "loss": 0.2607, |
| "step": 238400 |
| }, |
| { |
| "epoch": 0.002385, |
| "grad_norm": 1.299739122390747, |
| "learning_rate": 1e-05, |
| "loss": 0.2658, |
| "step": 238500 |
| }, |
| { |
| "epoch": 0.002386, |
| "grad_norm": 1.3577795028686523, |
| "learning_rate": 1e-05, |
| "loss": 0.267, |
| "step": 238600 |
| }, |
| { |
| "epoch": 0.002387, |
| "grad_norm": 2.8680946826934814, |
| "learning_rate": 1e-05, |
| "loss": 0.2659, |
| "step": 238700 |
| }, |
| { |
| "epoch": 0.002388, |
| "grad_norm": 1.344599962234497, |
| "learning_rate": 1e-05, |
| "loss": 0.2677, |
| "step": 238800 |
| }, |
| { |
| "epoch": 0.002389, |
| "grad_norm": 1.3766945600509644, |
| "learning_rate": 1e-05, |
| "loss": 0.269, |
| "step": 238900 |
| }, |
| { |
| "epoch": 0.00239, |
| "grad_norm": 1.4144660234451294, |
| "learning_rate": 1e-05, |
| "loss": 0.2602, |
| "step": 239000 |
| }, |
| { |
| "epoch": 0.002391, |
| "grad_norm": 1.5478261709213257, |
| "learning_rate": 1e-05, |
| "loss": 0.2665, |
| "step": 239100 |
| }, |
| { |
| "epoch": 0.002392, |
| "grad_norm": 2.6023824214935303, |
| "learning_rate": 1e-05, |
| "loss": 0.267, |
| "step": 239200 |
| }, |
| { |
| "epoch": 0.002393, |
| "grad_norm": 1.681246042251587, |
| "learning_rate": 1e-05, |
| "loss": 0.2617, |
| "step": 239300 |
| }, |
| { |
| "epoch": 0.002394, |
| "grad_norm": 1.3988147974014282, |
| "learning_rate": 1e-05, |
| "loss": 0.2642, |
| "step": 239400 |
| }, |
| { |
| "epoch": 0.002395, |
| "grad_norm": 1.481540560722351, |
| "learning_rate": 1e-05, |
| "loss": 0.2671, |
| "step": 239500 |
| }, |
| { |
| "epoch": 0.002396, |
| "grad_norm": 1.4419687986373901, |
| "learning_rate": 1e-05, |
| "loss": 0.2587, |
| "step": 239600 |
| }, |
| { |
| "epoch": 0.002397, |
| "grad_norm": 1.3948259353637695, |
| "learning_rate": 1e-05, |
| "loss": 0.2675, |
| "step": 239700 |
| }, |
| { |
| "epoch": 0.002398, |
| "grad_norm": 1.5281838178634644, |
| "learning_rate": 1e-05, |
| "loss": 0.2645, |
| "step": 239800 |
| }, |
| { |
| "epoch": 0.002399, |
| "grad_norm": 1.4255881309509277, |
| "learning_rate": 1e-05, |
| "loss": 0.2655, |
| "step": 239900 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 1.332002878189087, |
| "learning_rate": 1e-05, |
| "loss": 0.2671, |
| "step": 240000 |
| }, |
| { |
| "epoch": 0.0024, |
| "eval_loss": 0.239013671875, |
| "eval_runtime": 114.9087, |
| "eval_samples_per_second": 435.128, |
| "eval_steps_per_second": 27.195, |
| "step": 240000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 100000000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 20000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 200, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.0168070144e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|