| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.030597377367654, |
| "eval_steps": 500, |
| "global_step": 1950, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007770762506070908, |
| "grad_norm": 0.1171875, |
| "learning_rate": 0.0001, |
| "loss": 0.6041, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.015541525012141816, |
| "grad_norm": 0.162109375, |
| "learning_rate": 0.0001, |
| "loss": 0.5642, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.023312287518212724, |
| "grad_norm": 0.0859375, |
| "learning_rate": 0.0001, |
| "loss": 0.2715, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03108305002428363, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.0001, |
| "loss": 0.2318, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.03885381253035454, |
| "grad_norm": 0.11767578125, |
| "learning_rate": 0.0001, |
| "loss": 0.3031, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.04662457503642545, |
| "grad_norm": 0.1875, |
| "learning_rate": 0.0001, |
| "loss": 0.3717, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.054395337542496355, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.0001, |
| "loss": 0.3045, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.06216610004856726, |
| "grad_norm": 0.2109375, |
| "learning_rate": 0.0001, |
| "loss": 0.2982, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.06993686255463817, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.0001, |
| "loss": 0.2151, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.07770762506070908, |
| "grad_norm": 0.1474609375, |
| "learning_rate": 0.0001, |
| "loss": 0.2622, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08547838756677999, |
| "grad_norm": 0.059814453125, |
| "learning_rate": 0.0001, |
| "loss": 0.1472, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.0932491500728509, |
| "grad_norm": 0.029541015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0501, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.1010199125789218, |
| "grad_norm": 0.050048828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0447, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.10879067508499271, |
| "grad_norm": 0.0546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0352, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.11656143759106362, |
| "grad_norm": 0.0751953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0901, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.12433220009713453, |
| "grad_norm": 0.078125, |
| "learning_rate": 0.0001, |
| "loss": 0.1973, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.13210296260320545, |
| "grad_norm": 0.1201171875, |
| "learning_rate": 0.0001, |
| "loss": 0.1561, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.13987372510927634, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 0.0001, |
| "loss": 0.205, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14764448761534726, |
| "grad_norm": 0.1123046875, |
| "learning_rate": 0.0001, |
| "loss": 0.1718, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.15541525012141816, |
| "grad_norm": 0.2490234375, |
| "learning_rate": 0.0001, |
| "loss": 0.2611, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16318601262748908, |
| "grad_norm": 0.044921875, |
| "learning_rate": 0.0001, |
| "loss": 0.1393, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.17095677513355997, |
| "grad_norm": 0.036865234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0525, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1787275376396309, |
| "grad_norm": 0.0257568359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0409, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.1864983001457018, |
| "grad_norm": 0.021728515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0512, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1942690626517727, |
| "grad_norm": 0.0947265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0869, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2020398251578436, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 0.0001, |
| "loss": 0.1108, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.20981058766391453, |
| "grad_norm": 0.1201171875, |
| "learning_rate": 0.0001, |
| "loss": 0.1604, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.21758135016998542, |
| "grad_norm": 0.109375, |
| "learning_rate": 0.0001, |
| "loss": 0.1491, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.22535211267605634, |
| "grad_norm": 0.134765625, |
| "learning_rate": 0.0001, |
| "loss": 0.2048, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.23312287518212724, |
| "grad_norm": 0.1572265625, |
| "learning_rate": 0.0001, |
| "loss": 0.2367, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.24089363768819816, |
| "grad_norm": 0.040283203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0978, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.24866440019426905, |
| "grad_norm": 0.04052734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0538, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.25643516270033995, |
| "grad_norm": 0.034912109375, |
| "learning_rate": 0.0001, |
| "loss": 0.0387, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.2642059252064109, |
| "grad_norm": 0.038330078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0319, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2719766877124818, |
| "grad_norm": 0.059814453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0817, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.2797474502185527, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.0001, |
| "loss": 0.1498, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2875182127246236, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.0001, |
| "loss": 0.1394, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.29528897523069453, |
| "grad_norm": 0.0625, |
| "learning_rate": 0.0001, |
| "loss": 0.1344, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3030597377367654, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 0.0001, |
| "loss": 0.1596, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.3108305002428363, |
| "grad_norm": 0.142578125, |
| "learning_rate": 0.0001, |
| "loss": 0.1519, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3186012627489072, |
| "grad_norm": 0.0242919921875, |
| "learning_rate": 0.0001, |
| "loss": 0.1061, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.32637202525497816, |
| "grad_norm": 0.0225830078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0473, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.33414278776104905, |
| "grad_norm": 0.029541015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0288, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.34191355026711995, |
| "grad_norm": 0.0250244140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0286, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.34968431277319084, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0894, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.3574550752792618, |
| "grad_norm": 0.095703125, |
| "learning_rate": 0.0001, |
| "loss": 0.1708, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3652258377853327, |
| "grad_norm": 0.126953125, |
| "learning_rate": 0.0001, |
| "loss": 0.1294, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.3729966002914036, |
| "grad_norm": 0.10107421875, |
| "learning_rate": 0.0001, |
| "loss": 0.1137, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.38076736279747453, |
| "grad_norm": 0.189453125, |
| "learning_rate": 0.0001, |
| "loss": 0.139, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.3885381253035454, |
| "grad_norm": 0.1845703125, |
| "learning_rate": 0.0001, |
| "loss": 0.1938, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.3963088878096163, |
| "grad_norm": 0.04248046875, |
| "learning_rate": 0.0001, |
| "loss": 0.1162, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.4040796503156872, |
| "grad_norm": 0.060302734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0764, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.41185041282175816, |
| "grad_norm": 0.0478515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0444, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.41962117532782905, |
| "grad_norm": 0.042724609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0358, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.42739193783389995, |
| "grad_norm": 0.08251953125, |
| "learning_rate": 0.0001, |
| "loss": 0.1245, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.43516270033997084, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.0001, |
| "loss": 0.1191, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4429334628460418, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.0001, |
| "loss": 0.1455, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.4507042253521127, |
| "grad_norm": 0.12109375, |
| "learning_rate": 0.0001, |
| "loss": 0.1288, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4584749878581836, |
| "grad_norm": 0.09814453125, |
| "learning_rate": 0.0001, |
| "loss": 0.121, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.4662457503642545, |
| "grad_norm": 0.146484375, |
| "learning_rate": 0.0001, |
| "loss": 0.1668, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4740165128703254, |
| "grad_norm": 0.047607421875, |
| "learning_rate": 0.0001, |
| "loss": 0.1261, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.4817872753763963, |
| "grad_norm": 0.0167236328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0443, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.4895580378824672, |
| "grad_norm": 0.031494140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0155, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.4973288003885381, |
| "grad_norm": 0.0322265625, |
| "learning_rate": 0.0001, |
| "loss": 0.02, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.505099562894609, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.0001, |
| "loss": 0.1097, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.5128703254006799, |
| "grad_norm": 0.111328125, |
| "learning_rate": 0.0001, |
| "loss": 0.1322, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5206410879067509, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 0.0001, |
| "loss": 0.1746, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.5284118504128218, |
| "grad_norm": 0.11328125, |
| "learning_rate": 0.0001, |
| "loss": 0.1127, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5361826129188927, |
| "grad_norm": 0.1318359375, |
| "learning_rate": 0.0001, |
| "loss": 0.1345, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.5439533754249636, |
| "grad_norm": 0.1611328125, |
| "learning_rate": 0.0001, |
| "loss": 0.1526, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5517241379310345, |
| "grad_norm": 0.0576171875, |
| "learning_rate": 0.0001, |
| "loss": 0.1136, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.5594949004371054, |
| "grad_norm": 0.039306640625, |
| "learning_rate": 0.0001, |
| "loss": 0.0372, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5672656629431763, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0323, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.5750364254492472, |
| "grad_norm": 0.0303955078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0473, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5828071879553182, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0727, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.5905779504613891, |
| "grad_norm": 0.1103515625, |
| "learning_rate": 0.0001, |
| "loss": 0.1386, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.59834871296746, |
| "grad_norm": 0.109375, |
| "learning_rate": 0.0001, |
| "loss": 0.1246, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.6061194754735308, |
| "grad_norm": 0.138671875, |
| "learning_rate": 0.0001, |
| "loss": 0.1188, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6138902379796017, |
| "grad_norm": 0.154296875, |
| "learning_rate": 0.0001, |
| "loss": 0.1368, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.6216610004856726, |
| "grad_norm": 0.15625, |
| "learning_rate": 0.0001, |
| "loss": 0.175, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6294317629917435, |
| "grad_norm": 0.046630859375, |
| "learning_rate": 0.0001, |
| "loss": 0.1122, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.6372025254978144, |
| "grad_norm": 0.038330078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0534, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6449732880038854, |
| "grad_norm": 0.0196533203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0212, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.6527440505099563, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0569, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6605148130160272, |
| "grad_norm": 0.058349609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0649, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.6682855755220981, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 0.0001, |
| "loss": 0.1247, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.676056338028169, |
| "grad_norm": 0.09130859375, |
| "learning_rate": 0.0001, |
| "loss": 0.1236, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.6838271005342399, |
| "grad_norm": 0.1201171875, |
| "learning_rate": 0.0001, |
| "loss": 0.1132, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6915978630403108, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 0.0001, |
| "loss": 0.1421, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.6993686255463817, |
| "grad_norm": 0.220703125, |
| "learning_rate": 0.0001, |
| "loss": 0.2113, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7071393880524527, |
| "grad_norm": 0.044189453125, |
| "learning_rate": 0.0001, |
| "loss": 0.1112, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.7149101505585236, |
| "grad_norm": 0.0302734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0665, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7226809130645945, |
| "grad_norm": 0.0267333984375, |
| "learning_rate": 0.0001, |
| "loss": 0.0225, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.7304516755706654, |
| "grad_norm": 0.042724609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0432, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7382224380767363, |
| "grad_norm": 0.08203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0552, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.7459932005828072, |
| "grad_norm": 0.080078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0621, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.753763963088878, |
| "grad_norm": 0.0888671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0817, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.7615347255949491, |
| "grad_norm": 0.123046875, |
| "learning_rate": 0.0001, |
| "loss": 0.1427, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.76930548810102, |
| "grad_norm": 0.10205078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0819, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.7770762506070908, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 0.0001, |
| "loss": 0.1177, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7848470131131617, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 0.0001, |
| "loss": 0.1115, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.7926177756192326, |
| "grad_norm": 0.008544921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0491, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8003885381253035, |
| "grad_norm": 0.01611328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0136, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.8081593006313744, |
| "grad_norm": 0.04150390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0249, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8159300631374453, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0612, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.8237008256435163, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.0001, |
| "loss": 0.1199, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8314715881495872, |
| "grad_norm": 0.111328125, |
| "learning_rate": 0.0001, |
| "loss": 0.1041, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.8392423506556581, |
| "grad_norm": 0.11279296875, |
| "learning_rate": 0.0001, |
| "loss": 0.0985, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.847013113161729, |
| "grad_norm": 0.142578125, |
| "learning_rate": 0.0001, |
| "loss": 0.1422, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.8547838756677999, |
| "grad_norm": 0.11181640625, |
| "learning_rate": 0.0001, |
| "loss": 0.1043, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8625546381738708, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 0.0001, |
| "loss": 0.1327, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.8703254006799417, |
| "grad_norm": 0.00714111328125, |
| "learning_rate": 0.0001, |
| "loss": 0.026, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8780961631860126, |
| "grad_norm": 0.059326171875, |
| "learning_rate": 0.0001, |
| "loss": 0.0192, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.8858669256920836, |
| "grad_norm": 0.03076171875, |
| "learning_rate": 0.0001, |
| "loss": 0.0314, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8936376881981545, |
| "grad_norm": 0.07421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0738, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.9014084507042254, |
| "grad_norm": 0.095703125, |
| "learning_rate": 0.0001, |
| "loss": 0.1326, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9091792132102963, |
| "grad_norm": 0.12890625, |
| "learning_rate": 0.0001, |
| "loss": 0.1109, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.9169499757163672, |
| "grad_norm": 0.1181640625, |
| "learning_rate": 0.0001, |
| "loss": 0.1411, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.924720738222438, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.0001, |
| "loss": 0.1349, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.932491500728509, |
| "grad_norm": 0.169921875, |
| "learning_rate": 0.0001, |
| "loss": 0.1544, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9402622632345798, |
| "grad_norm": 0.045654296875, |
| "learning_rate": 0.0001, |
| "loss": 0.1043, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.9480330257406508, |
| "grad_norm": 0.03369140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0226, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9558037882467217, |
| "grad_norm": 0.03515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0314, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.9635745507527926, |
| "grad_norm": 0.056396484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0446, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9713453132588635, |
| "grad_norm": 0.1728515625, |
| "learning_rate": 0.0001, |
| "loss": 0.1445, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.9791160757649344, |
| "grad_norm": 0.09130859375, |
| "learning_rate": 0.0001, |
| "loss": 0.1178, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9868868382710053, |
| "grad_norm": 0.119140625, |
| "learning_rate": 0.0001, |
| "loss": 0.106, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.9946576007770762, |
| "grad_norm": 0.11474609375, |
| "learning_rate": 0.0001, |
| "loss": 0.1867, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.0024283632831472, |
| "grad_norm": 0.04736328125, |
| "learning_rate": 0.0001, |
| "loss": 0.1249, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.010199125789218, |
| "grad_norm": 0.05078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0703, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.017969888295289, |
| "grad_norm": 0.009765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0179, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.0257406508013598, |
| "grad_norm": 0.025390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0105, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.0335114133074308, |
| "grad_norm": 0.064453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0362, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.0412821758135018, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0466, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.0490529383195726, |
| "grad_norm": 0.11669921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0759, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.0568237008256436, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0641, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.0645944633317144, |
| "grad_norm": 0.1923828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0589, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.0723652258377854, |
| "grad_norm": 0.283203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0743, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0801359883438562, |
| "grad_norm": 0.06494140625, |
| "learning_rate": 0.0001, |
| "loss": 0.092, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.0879067508499272, |
| "grad_norm": 0.056396484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0677, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.095677513355998, |
| "grad_norm": 0.0498046875, |
| "learning_rate": 0.0001, |
| "loss": 0.0382, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.103448275862069, |
| "grad_norm": 0.033447265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0133, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.11121903836814, |
| "grad_norm": 0.017578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0194, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.1189898008742107, |
| "grad_norm": 0.08544921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0609, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.1267605633802817, |
| "grad_norm": 0.10595703125, |
| "learning_rate": 0.0001, |
| "loss": 0.1082, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.1345313258863525, |
| "grad_norm": 0.142578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0827, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.1423020883924235, |
| "grad_norm": 0.130859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0636, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.1500728508984945, |
| "grad_norm": 0.111328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0613, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.1578436134045653, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0959, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.1656143759106363, |
| "grad_norm": 0.058349609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0715, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.173385138416707, |
| "grad_norm": 0.0224609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0226, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.1811559009227781, |
| "grad_norm": 0.04833984375, |
| "learning_rate": 0.0001, |
| "loss": 0.0205, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.188926663428849, |
| "grad_norm": 0.072265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0301, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.19669742593492, |
| "grad_norm": 0.091796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0634, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.2044681884409907, |
| "grad_norm": 0.0712890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0743, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.2122389509470617, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.08, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.2200097134531327, |
| "grad_norm": 0.09765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0629, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.2277804759592035, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0688, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.2355512384652745, |
| "grad_norm": 0.049560546875, |
| "learning_rate": 0.0001, |
| "loss": 0.1087, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.2433220009713453, |
| "grad_norm": 0.025146484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0625, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.2510927634774163, |
| "grad_norm": 0.034912109375, |
| "learning_rate": 0.0001, |
| "loss": 0.0114, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.258863525983487, |
| "grad_norm": 0.033203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0178, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.266634288489558, |
| "grad_norm": 0.05908203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0397, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.2744050509956288, |
| "grad_norm": 0.130859375, |
| "learning_rate": 0.0001, |
| "loss": 0.1098, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.2821758135016998, |
| "grad_norm": 0.0869140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0609, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.2899465760077709, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0674, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.2977173385138416, |
| "grad_norm": 0.11328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0529, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.3054881010199126, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0828, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3132588635259834, |
| "grad_norm": 0.05810546875, |
| "learning_rate": 0.0001, |
| "loss": 0.1091, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.3210296260320544, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0774, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.3288003885381254, |
| "grad_norm": 0.050537109375, |
| "learning_rate": 0.0001, |
| "loss": 0.0348, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.3365711510441962, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0143, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.344341913550267, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0361, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.352112676056338, |
| "grad_norm": 0.08251953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0504, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.359883438562409, |
| "grad_norm": 0.062255859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0382, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.3676542010684798, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0457, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.3754249635745508, |
| "grad_norm": 0.142578125, |
| "learning_rate": 0.0001, |
| "loss": 0.057, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.3831957260806216, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0573, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.3909664885866926, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 0.0001, |
| "loss": 0.1187, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.3987372510927636, |
| "grad_norm": 0.04248046875, |
| "learning_rate": 0.0001, |
| "loss": 0.0609, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4065080135988344, |
| "grad_norm": 0.0634765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0245, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.4142787761049052, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.0001, |
| "loss": 0.019, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.4220495386109762, |
| "grad_norm": 0.0419921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0323, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.4298203011170472, |
| "grad_norm": 0.0888671875, |
| "learning_rate": 0.0001, |
| "loss": 0.088, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.437591063623118, |
| "grad_norm": 0.08154296875, |
| "learning_rate": 0.0001, |
| "loss": 0.0811, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.445361826129189, |
| "grad_norm": 0.091796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0622, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.4531325886352597, |
| "grad_norm": 0.125, |
| "learning_rate": 0.0001, |
| "loss": 0.0593, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.4609033511413307, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0409, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.4686741136474017, |
| "grad_norm": 0.058837890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0869, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.4764448761534725, |
| "grad_norm": 0.05712890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0825, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.4842156386595435, |
| "grad_norm": 0.050048828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0306, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.4919864011656143, |
| "grad_norm": 0.0361328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0195, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.4997571636716853, |
| "grad_norm": 0.05859375, |
| "learning_rate": 0.0001, |
| "loss": 0.027, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.5075279261777563, |
| "grad_norm": 0.060791015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0545, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.515298688683827, |
| "grad_norm": 0.044189453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0669, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.523069451189898, |
| "grad_norm": 0.07421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0696, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.530840213695969, |
| "grad_norm": 0.1494140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0513, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.53861097620204, |
| "grad_norm": 0.140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0707, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.5463817387081107, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0757, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.5541525012141817, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 0.0001, |
| "loss": 0.0811, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.5619232637202525, |
| "grad_norm": 0.0166015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0255, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.5696940262263235, |
| "grad_norm": 0.0283203125, |
| "learning_rate": 0.0001, |
| "loss": 0.017, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.5774647887323945, |
| "grad_norm": 0.05859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0335, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.5852355512384653, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.066, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.593006313744536, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0724, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.600777076250607, |
| "grad_norm": 0.126953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0806, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.608547838756678, |
| "grad_norm": 0.140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0364, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.616318601262749, |
| "grad_norm": 0.13671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0837, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.6240893637688198, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0804, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.6318601262748906, |
| "grad_norm": 0.05078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0758, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.6396308887809616, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0228, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.6474016512870326, |
| "grad_norm": 0.0458984375, |
| "learning_rate": 0.0001, |
| "loss": 0.0148, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.6551724137931034, |
| "grad_norm": 0.053466796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0215, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.6629431762991742, |
| "grad_norm": 0.1015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0641, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.6707139388052452, |
| "grad_norm": 0.0927734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0718, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.6784847013113162, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0645, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.6862554638173872, |
| "grad_norm": 0.177734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0505, |
| "step": 1085 |
| }, |
| { |
| "epoch": 1.694026226323458, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0681, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.7017969888295288, |
| "grad_norm": 0.0625, |
| "learning_rate": 0.0001, |
| "loss": 0.0998, |
| "step": 1095 |
| }, |
| { |
| "epoch": 1.7095677513355998, |
| "grad_norm": 0.05859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0534, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7173385138416708, |
| "grad_norm": 0.0263671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0128, |
| "step": 1105 |
| }, |
| { |
| "epoch": 1.7251092763477416, |
| "grad_norm": 0.2421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0107, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.7328800388538124, |
| "grad_norm": 0.08251953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0282, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.7406508013598834, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0447, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.7484215638659544, |
| "grad_norm": 0.099609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0675, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.7561923263720254, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0636, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.7639630888780962, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0734, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.771733851384167, |
| "grad_norm": 0.1591796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0522, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.779504613890238, |
| "grad_norm": 0.058837890625, |
| "learning_rate": 0.0001, |
| "loss": 0.1037, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.787275376396309, |
| "grad_norm": 0.03369140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0727, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.79504613890238, |
| "grad_norm": 0.05224609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0248, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.8028169014084507, |
| "grad_norm": 0.0556640625, |
| "learning_rate": 0.0001, |
| "loss": 0.0138, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.8105876639145215, |
| "grad_norm": 0.046875, |
| "learning_rate": 0.0001, |
| "loss": 0.024, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.8183584264205925, |
| "grad_norm": 0.1279296875, |
| "learning_rate": 0.0001, |
| "loss": 0.0717, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.8261291889266635, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0663, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.8338999514327343, |
| "grad_norm": 0.1591796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0842, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.841670713938805, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.042, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.849441476444876, |
| "grad_norm": 0.1689453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0693, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.8572122389509471, |
| "grad_norm": 0.052490234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0942, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.8649830014570181, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0794, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.872753763963089, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0166, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.8805245264691597, |
| "grad_norm": 0.060546875, |
| "learning_rate": 0.0001, |
| "loss": 0.017, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.8882952889752307, |
| "grad_norm": 0.08203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0253, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.8960660514813017, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0536, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.9038368139873725, |
| "grad_norm": 0.150390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0708, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.9116075764934433, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0588, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.9193783389995143, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0636, |
| "step": 1235 |
| }, |
| { |
| "epoch": 1.9271491015055853, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0656, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.9349198640116563, |
| "grad_norm": 0.05615234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0788, |
| "step": 1245 |
| }, |
| { |
| "epoch": 1.942690626517727, |
| "grad_norm": 0.0322265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0549, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.9504613890237978, |
| "grad_norm": 0.0281982421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0288, |
| "step": 1255 |
| }, |
| { |
| "epoch": 1.9582321515298688, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0362, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.9660029140359399, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0301, |
| "step": 1265 |
| }, |
| { |
| "epoch": 1.9737736765420106, |
| "grad_norm": 0.095703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0656, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.9815444390480816, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0648, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.9893152015541524, |
| "grad_norm": 0.1044921875, |
| "learning_rate": 0.0001, |
| "loss": 0.061, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.9970859640602234, |
| "grad_norm": 0.1259765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0743, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.0048567265662944, |
| "grad_norm": 0.05859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0826, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.0126274890723654, |
| "grad_norm": 0.0380859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0243, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.020398251578436, |
| "grad_norm": 0.028076171875, |
| "learning_rate": 0.0001, |
| "loss": 0.012, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.028169014084507, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0312, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.035939776590578, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0355, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.043710539096649, |
| "grad_norm": 0.11962890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0518, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.0514813016027196, |
| "grad_norm": 0.1044921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0245, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.0592520641087906, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0452, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.0670228266148616, |
| "grad_norm": 0.1376953125, |
| "learning_rate": 0.0001, |
| "loss": 0.026, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.0747935891209326, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0341, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.0825643516270036, |
| "grad_norm": 0.06396484375, |
| "learning_rate": 0.0001, |
| "loss": 0.071, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.090335114133074, |
| "grad_norm": 0.0341796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0255, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.098105876639145, |
| "grad_norm": 0.00909423828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0063, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.105876639145216, |
| "grad_norm": 0.060791015625, |
| "learning_rate": 0.0001, |
| "loss": 0.008, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.113647401651287, |
| "grad_norm": 0.0274658203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0121, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.1214181641573577, |
| "grad_norm": 0.09765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0331, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.1291889266634287, |
| "grad_norm": 0.10302734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0302, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.1369596891694997, |
| "grad_norm": 0.1025390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0351, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.1447304516755707, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0158, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.1525012141816418, |
| "grad_norm": 0.107421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0316, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.1602719766877123, |
| "grad_norm": 0.08740234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0693, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.1680427391937833, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 0.0001, |
| "loss": 0.0268, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.1758135016998543, |
| "grad_norm": 0.035400390625, |
| "learning_rate": 0.0001, |
| "loss": 0.009, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.1835842642059253, |
| "grad_norm": 0.05712890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0267, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.191355026711996, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0121, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.199125789218067, |
| "grad_norm": 0.1376953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0377, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.206896551724138, |
| "grad_norm": 0.1552734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0444, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.214667314230209, |
| "grad_norm": 0.057373046875, |
| "learning_rate": 0.0001, |
| "loss": 0.0215, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.22243807673628, |
| "grad_norm": 0.115234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0168, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.2302088392423505, |
| "grad_norm": 0.1494140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0218, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.2379796017484215, |
| "grad_norm": 0.08544921875, |
| "learning_rate": 0.0001, |
| "loss": 0.0652, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.2457503642544925, |
| "grad_norm": 0.0419921875, |
| "learning_rate": 0.0001, |
| "loss": 0.035, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.2535211267605635, |
| "grad_norm": 0.033935546875, |
| "learning_rate": 0.0001, |
| "loss": 0.01, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.2612918892666345, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0073, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.269062651772705, |
| "grad_norm": 0.044677734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0171, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.276833414278776, |
| "grad_norm": 0.05078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0466, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.284604176784847, |
| "grad_norm": 0.09228515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0217, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.292374939290918, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0309, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.300145701796989, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0324, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.3079164643030596, |
| "grad_norm": 0.091796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0283, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.3156872268091306, |
| "grad_norm": 0.08642578125, |
| "learning_rate": 0.0001, |
| "loss": 0.074, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.3234579893152016, |
| "grad_norm": 0.0390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0184, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.3312287518212726, |
| "grad_norm": 0.039306640625, |
| "learning_rate": 0.0001, |
| "loss": 0.0105, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.338999514327343, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0145, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.346770276833414, |
| "grad_norm": 0.068359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0206, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.354541039339485, |
| "grad_norm": 0.0986328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0332, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.3623118018455562, |
| "grad_norm": 0.12158203125, |
| "learning_rate": 0.0001, |
| "loss": 0.0245, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.370082564351627, |
| "grad_norm": 0.0751953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0281, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.377853326857698, |
| "grad_norm": 0.150390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0434, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.385624089363769, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0225, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.39339485186984, |
| "grad_norm": 0.0634765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0531, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.401165614375911, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0398, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.4089363768819814, |
| "grad_norm": 0.01312255859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0073, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.4167071393880524, |
| "grad_norm": 0.0177001953125, |
| "learning_rate": 0.0001, |
| "loss": 0.0077, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.4244779018941234, |
| "grad_norm": 0.047607421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0212, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.4322486644001944, |
| "grad_norm": 0.1484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0377, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.4400194269062654, |
| "grad_norm": 0.1240234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0312, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.447790189412336, |
| "grad_norm": 0.103515625, |
| "learning_rate": 0.0001, |
| "loss": 0.029, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.455560951918407, |
| "grad_norm": 0.1103515625, |
| "learning_rate": 0.0001, |
| "loss": 0.026, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.463331714424478, |
| "grad_norm": 0.1572265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0353, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.471102476930549, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0554, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.4788732394366195, |
| "grad_norm": 0.05517578125, |
| "learning_rate": 0.0001, |
| "loss": 0.035, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.4866440019426905, |
| "grad_norm": 0.041015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0063, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.4944147644487615, |
| "grad_norm": 0.035400390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0118, |
| "step": 1605 |
| }, |
| { |
| "epoch": 2.5021855269548325, |
| "grad_norm": 0.064453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0185, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.509956289460903, |
| "grad_norm": 0.03955078125, |
| "learning_rate": 0.0001, |
| "loss": 0.031, |
| "step": 1615 |
| }, |
| { |
| "epoch": 2.517727051966974, |
| "grad_norm": 0.1484375, |
| "learning_rate": 0.0001, |
| "loss": 0.0395, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.525497814473045, |
| "grad_norm": 0.162109375, |
| "learning_rate": 0.0001, |
| "loss": 0.023, |
| "step": 1625 |
| }, |
| { |
| "epoch": 2.533268576979116, |
| "grad_norm": 0.043701171875, |
| "learning_rate": 0.0001, |
| "loss": 0.0247, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.541039339485187, |
| "grad_norm": 0.134765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0325, |
| "step": 1635 |
| }, |
| { |
| "epoch": 2.5488101019912577, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0723, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.5565808644973287, |
| "grad_norm": 0.03857421875, |
| "learning_rate": 0.0001, |
| "loss": 0.0274, |
| "step": 1645 |
| }, |
| { |
| "epoch": 2.5643516270033997, |
| "grad_norm": 0.035888671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0099, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.5721223895094707, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0105, |
| "step": 1655 |
| }, |
| { |
| "epoch": 2.5798931520155417, |
| "grad_norm": 0.1015625, |
| "learning_rate": 0.0001, |
| "loss": 0.0139, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.5876639145216123, |
| "grad_norm": 0.06640625, |
| "learning_rate": 0.0001, |
| "loss": 0.0377, |
| "step": 1665 |
| }, |
| { |
| "epoch": 2.5954346770276833, |
| "grad_norm": 0.1025390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0373, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.6032054395337543, |
| "grad_norm": 0.11962890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0217, |
| "step": 1675 |
| }, |
| { |
| "epoch": 2.6109762020398253, |
| "grad_norm": 0.1962890625, |
| "learning_rate": 0.0001, |
| "loss": 0.019, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.6187469645458963, |
| "grad_norm": 0.10400390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0288, |
| "step": 1685 |
| }, |
| { |
| "epoch": 2.626517727051967, |
| "grad_norm": 0.078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0726, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.634288489558038, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0245, |
| "step": 1695 |
| }, |
| { |
| "epoch": 2.642059252064109, |
| "grad_norm": 0.00738525390625, |
| "learning_rate": 0.0001, |
| "loss": 0.0108, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.64983001457018, |
| "grad_norm": 0.058837890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0174, |
| "step": 1705 |
| }, |
| { |
| "epoch": 2.657600777076251, |
| "grad_norm": 0.0712890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0284, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.6653715395823214, |
| "grad_norm": 0.091796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0384, |
| "step": 1715 |
| }, |
| { |
| "epoch": 2.6731423020883924, |
| "grad_norm": 0.064453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0258, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.6809130645944634, |
| "grad_norm": 0.14453125, |
| "learning_rate": 0.0001, |
| "loss": 0.0377, |
| "step": 1725 |
| }, |
| { |
| "epoch": 2.688683827100534, |
| "grad_norm": 0.154296875, |
| "learning_rate": 0.0001, |
| "loss": 0.0337, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.696454589606605, |
| "grad_norm": 0.162109375, |
| "learning_rate": 0.0001, |
| "loss": 0.0527, |
| "step": 1735 |
| }, |
| { |
| "epoch": 2.704225352112676, |
| "grad_norm": 0.099609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0659, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.711996114618747, |
| "grad_norm": 0.013671875, |
| "learning_rate": 0.0001, |
| "loss": 0.028, |
| "step": 1745 |
| }, |
| { |
| "epoch": 2.719766877124818, |
| "grad_norm": 0.041259765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0105, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.7275376396308886, |
| "grad_norm": 0.056884765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0127, |
| "step": 1755 |
| }, |
| { |
| "epoch": 2.7353084021369596, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0199, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.7430791646430306, |
| "grad_norm": 0.125, |
| "learning_rate": 0.0001, |
| "loss": 0.0409, |
| "step": 1765 |
| }, |
| { |
| "epoch": 2.7508499271491016, |
| "grad_norm": 0.060546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0359, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.7586206896551726, |
| "grad_norm": 0.140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0382, |
| "step": 1775 |
| }, |
| { |
| "epoch": 2.766391452161243, |
| "grad_norm": 0.0703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0427, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.774162214667314, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0324, |
| "step": 1785 |
| }, |
| { |
| "epoch": 2.781932977173385, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0669, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.789703739679456, |
| "grad_norm": 0.053466796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0351, |
| "step": 1795 |
| }, |
| { |
| "epoch": 2.797474502185527, |
| "grad_norm": 0.0341796875, |
| "learning_rate": 0.0001, |
| "loss": 0.0243, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.8052452646915977, |
| "grad_norm": 0.05517578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0125, |
| "step": 1805 |
| }, |
| { |
| "epoch": 2.8130160271976687, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 0.0001, |
| "loss": 0.0152, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.8207867897037397, |
| "grad_norm": 0.17578125, |
| "learning_rate": 0.0001, |
| "loss": 0.0389, |
| "step": 1815 |
| }, |
| { |
| "epoch": 2.8285575522098103, |
| "grad_norm": 0.1455078125, |
| "learning_rate": 0.0001, |
| "loss": 0.035, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.8363283147158818, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 0.0001, |
| "loss": 0.0279, |
| "step": 1825 |
| }, |
| { |
| "epoch": 2.8440990772219523, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0387, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.8518698397280233, |
| "grad_norm": 0.07958984375, |
| "learning_rate": 0.0001, |
| "loss": 0.02, |
| "step": 1835 |
| }, |
| { |
| "epoch": 2.8596406022340943, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.0001, |
| "loss": 0.0896, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.867411364740165, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0397, |
| "step": 1845 |
| }, |
| { |
| "epoch": 2.875182127246236, |
| "grad_norm": 0.059326171875, |
| "learning_rate": 0.0001, |
| "loss": 0.0325, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.882952889752307, |
| "grad_norm": 0.038818359375, |
| "learning_rate": 0.0001, |
| "loss": 0.0085, |
| "step": 1855 |
| }, |
| { |
| "epoch": 2.890723652258378, |
| "grad_norm": 0.0225830078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0244, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.898494414764449, |
| "grad_norm": 0.119140625, |
| "learning_rate": 0.0001, |
| "loss": 0.0438, |
| "step": 1865 |
| }, |
| { |
| "epoch": 2.9062651772705195, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.0001, |
| "loss": 0.0269, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.9140359397765905, |
| "grad_norm": 0.060302734375, |
| "learning_rate": 0.0001, |
| "loss": 0.0297, |
| "step": 1875 |
| }, |
| { |
| "epoch": 2.9218067022826615, |
| "grad_norm": 0.10498046875, |
| "learning_rate": 0.0001, |
| "loss": 0.0267, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.9295774647887325, |
| "grad_norm": 0.134765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0237, |
| "step": 1885 |
| }, |
| { |
| "epoch": 2.9373482272948035, |
| "grad_norm": 0.05615234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0836, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.945118989800874, |
| "grad_norm": 0.0245361328125, |
| "learning_rate": 0.0001, |
| "loss": 0.0273, |
| "step": 1895 |
| }, |
| { |
| "epoch": 2.952889752306945, |
| "grad_norm": 0.006561279296875, |
| "learning_rate": 0.0001, |
| "loss": 0.0249, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.960660514813016, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 0.0001, |
| "loss": 0.0156, |
| "step": 1905 |
| }, |
| { |
| "epoch": 2.968431277319087, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.0001, |
| "loss": 0.0509, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.976202039825158, |
| "grad_norm": 0.1455078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0393, |
| "step": 1915 |
| }, |
| { |
| "epoch": 2.9839728023312286, |
| "grad_norm": 0.06103515625, |
| "learning_rate": 0.0001, |
| "loss": 0.0253, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.9917435648372996, |
| "grad_norm": 0.056884765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0268, |
| "step": 1925 |
| }, |
| { |
| "epoch": 2.9995143273433706, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.0001, |
| "loss": 0.0255, |
| "step": 1930 |
| }, |
| { |
| "epoch": 3.0072850898494417, |
| "grad_norm": 0.052490234375, |
| "learning_rate": 0.0001, |
| "loss": 0.0451, |
| "step": 1935 |
| }, |
| { |
| "epoch": 3.015055852355512, |
| "grad_norm": 0.0303955078125, |
| "learning_rate": 0.0001, |
| "loss": 0.0123, |
| "step": 1940 |
| }, |
| { |
| "epoch": 3.022826614861583, |
| "grad_norm": 0.048828125, |
| "learning_rate": 0.0001, |
| "loss": 0.0099, |
| "step": 1945 |
| }, |
| { |
| "epoch": 3.030597377367654, |
| "grad_norm": 0.09130859375, |
| "learning_rate": 0.0001, |
| "loss": 0.0092, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.030597377367654, |
| "step": 1950, |
| "total_flos": 3.211514870667264e+17, |
| "train_loss": 0.06899887980558933, |
| "train_runtime": 32484.0674, |
| "train_samples_per_second": 0.96, |
| "train_steps_per_second": 0.06 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1950, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 90, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.211514870667264e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|