| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 1404, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0106951871657754, |
| "grad_norm": 5.869461536407471, |
| "learning_rate": 9.30232558139535e-07, |
| "loss": 0.3054, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0213903743315508, |
| "grad_norm": 4.7690653800964355, |
| "learning_rate": 2.0930232558139536e-06, |
| "loss": 0.2661, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03208556149732621, |
| "grad_norm": 4.248436450958252, |
| "learning_rate": 3.2558139534883724e-06, |
| "loss": 0.2409, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0427807486631016, |
| "grad_norm": 4.08921480178833, |
| "learning_rate": 4.418604651162791e-06, |
| "loss": 0.2228, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.053475935828877004, |
| "grad_norm": 4.846882343292236, |
| "learning_rate": 5.58139534883721e-06, |
| "loss": 0.2258, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06417112299465241, |
| "grad_norm": 4.230620384216309, |
| "learning_rate": 6.744186046511628e-06, |
| "loss": 0.2103, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0748663101604278, |
| "grad_norm": 4.4064130783081055, |
| "learning_rate": 7.906976744186048e-06, |
| "loss": 0.2122, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0855614973262032, |
| "grad_norm": 4.86531400680542, |
| "learning_rate": 9.069767441860465e-06, |
| "loss": 0.1996, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0962566844919786, |
| "grad_norm": 5.53223991394043, |
| "learning_rate": 9.999986679414613e-06, |
| "loss": 0.2311, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.10695187165775401, |
| "grad_norm": 4.2916178703308105, |
| "learning_rate": 9.999520466378376e-06, |
| "loss": 0.2319, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.11764705882352941, |
| "grad_norm": 4.200897216796875, |
| "learning_rate": 9.998388295046227e-06, |
| "loss": 0.2368, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.12834224598930483, |
| "grad_norm": 4.808861255645752, |
| "learning_rate": 9.996590316228402e-06, |
| "loss": 0.2354, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.13903743315508021, |
| "grad_norm": 4.5134077072143555, |
| "learning_rate": 9.994126769423656e-06, |
| "loss": 0.234, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.1497326203208556, |
| "grad_norm": 4.348900318145752, |
| "learning_rate": 9.990997982787348e-06, |
| "loss": 0.2541, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.16042780748663102, |
| "grad_norm": 4.4039306640625, |
| "learning_rate": 9.98720437308773e-06, |
| "loss": 0.2496, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.1711229946524064, |
| "grad_norm": 4.726532936096191, |
| "learning_rate": 9.982746445650437e-06, |
| "loss": 0.2797, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.18181818181818182, |
| "grad_norm": 4.256592273712158, |
| "learning_rate": 9.977624794291172e-06, |
| "loss": 0.2518, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.1925133689839572, |
| "grad_norm": 4.330168724060059, |
| "learning_rate": 9.97184010123661e-06, |
| "loss": 0.261, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.20320855614973263, |
| "grad_norm": 4.281119346618652, |
| "learning_rate": 9.965393137033512e-06, |
| "loss": 0.2564, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.21390374331550802, |
| "grad_norm": 4.247673511505127, |
| "learning_rate": 9.958284760446104e-06, |
| "loss": 0.2501, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.22459893048128343, |
| "grad_norm": 3.930504322052002, |
| "learning_rate": 9.950515918341666e-06, |
| "loss": 0.2504, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.23529411764705882, |
| "grad_norm": 4.084280967712402, |
| "learning_rate": 9.942087645564415e-06, |
| "loss": 0.2706, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.24598930481283424, |
| "grad_norm": 3.869400978088379, |
| "learning_rate": 9.93300106479766e-06, |
| "loss": 0.2499, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.25668449197860965, |
| "grad_norm": 4.0887675285339355, |
| "learning_rate": 9.923257386414253e-06, |
| "loss": 0.2628, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.26737967914438504, |
| "grad_norm": 4.140570163726807, |
| "learning_rate": 9.912857908315363e-06, |
| "loss": 0.2642, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.27807486631016043, |
| "grad_norm": 4.731294631958008, |
| "learning_rate": 9.901804015757588e-06, |
| "loss": 0.2669, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2887700534759358, |
| "grad_norm": 4.113650321960449, |
| "learning_rate": 9.89009718116843e-06, |
| "loss": 0.2809, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.2994652406417112, |
| "grad_norm": 3.8770499229431152, |
| "learning_rate": 9.877738963950175e-06, |
| "loss": 0.2705, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.31016042780748665, |
| "grad_norm": 4.077112197875977, |
| "learning_rate": 9.864731010272152e-06, |
| "loss": 0.2548, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.32085561497326204, |
| "grad_norm": 4.771427631378174, |
| "learning_rate": 9.851075052851476e-06, |
| "loss": 0.2534, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3315508021390374, |
| "grad_norm": 3.84387469291687, |
| "learning_rate": 9.83677291072223e-06, |
| "loss": 0.2438, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.3422459893048128, |
| "grad_norm": 4.427935600280762, |
| "learning_rate": 9.821826488993168e-06, |
| "loss": 0.2791, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.35294117647058826, |
| "grad_norm": 4.1868767738342285, |
| "learning_rate": 9.806237778593941e-06, |
| "loss": 0.2734, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 3.8055779933929443, |
| "learning_rate": 9.790008856009902e-06, |
| "loss": 0.2463, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.37433155080213903, |
| "grad_norm": 4.041962623596191, |
| "learning_rate": 9.773141883005507e-06, |
| "loss": 0.2813, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3850267379679144, |
| "grad_norm": 3.6363329887390137, |
| "learning_rate": 9.755639106336347e-06, |
| "loss": 0.2724, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.39572192513368987, |
| "grad_norm": 3.50486421585083, |
| "learning_rate": 9.737502857449894e-06, |
| "loss": 0.271, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.40641711229946526, |
| "grad_norm": 3.4967920780181885, |
| "learning_rate": 9.718735552174923e-06, |
| "loss": 0.2467, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.41711229946524064, |
| "grad_norm": 3.3797831535339355, |
| "learning_rate": 9.699339690399717e-06, |
| "loss": 0.2621, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.42780748663101603, |
| "grad_norm": 15.109771728515625, |
| "learning_rate": 9.679317855739073e-06, |
| "loss": 0.2791, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4385026737967914, |
| "grad_norm": 3.828474760055542, |
| "learning_rate": 9.658672715190151e-06, |
| "loss": 0.2829, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.44919786096256686, |
| "grad_norm": 3.2750980854034424, |
| "learning_rate": 9.637407018777224e-06, |
| "loss": 0.2866, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.45989304812834225, |
| "grad_norm": 3.615290880203247, |
| "learning_rate": 9.615523599185353e-06, |
| "loss": 0.2422, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.47058823529411764, |
| "grad_norm": 3.5919339656829834, |
| "learning_rate": 9.593025371383064e-06, |
| "loss": 0.269, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.48128342245989303, |
| "grad_norm": 4.708985805511475, |
| "learning_rate": 9.569915332234068e-06, |
| "loss": 0.2686, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.4919786096256685, |
| "grad_norm": 3.5988545417785645, |
| "learning_rate": 9.546196560098062e-06, |
| "loss": 0.273, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.5026737967914439, |
| "grad_norm": 3.4772067070007324, |
| "learning_rate": 9.521872214420668e-06, |
| "loss": 0.2395, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5133689839572193, |
| "grad_norm": 3.7974278926849365, |
| "learning_rate": 9.496945535312597e-06, |
| "loss": 0.2682, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5240641711229946, |
| "grad_norm": 3.69970440864563, |
| "learning_rate": 9.471419843118036e-06, |
| "loss": 0.2557, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5347593582887701, |
| "grad_norm": 3.470737934112549, |
| "learning_rate": 9.44529853797238e-06, |
| "loss": 0.266, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5454545454545454, |
| "grad_norm": 3.733236312866211, |
| "learning_rate": 9.418585099349306e-06, |
| "loss": 0.2779, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.5561497326203209, |
| "grad_norm": 3.434349298477173, |
| "learning_rate": 9.391283085597299e-06, |
| "loss": 0.2508, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5668449197860963, |
| "grad_norm": 3.5163183212280273, |
| "learning_rate": 9.36339613346565e-06, |
| "loss": 0.2727, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.5775401069518716, |
| "grad_norm": 4.07936429977417, |
| "learning_rate": 9.33492795762005e-06, |
| "loss": 0.2714, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5882352941176471, |
| "grad_norm": 3.171149492263794, |
| "learning_rate": 9.305882350147763e-06, |
| "loss": 0.2585, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5989304812834224, |
| "grad_norm": 3.424302101135254, |
| "learning_rate": 9.276263180052498e-06, |
| "loss": 0.2584, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.6096256684491979, |
| "grad_norm": 3.800421714782715, |
| "learning_rate": 9.246074392739057e-06, |
| "loss": 0.2771, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6203208556149733, |
| "grad_norm": 3.4711737632751465, |
| "learning_rate": 9.21532000948778e-06, |
| "loss": 0.265, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6310160427807486, |
| "grad_norm": 3.5598926544189453, |
| "learning_rate": 9.184004126918891e-06, |
| "loss": 0.2653, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.6417112299465241, |
| "grad_norm": 3.320875644683838, |
| "learning_rate": 9.152130916446817e-06, |
| "loss": 0.2373, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6524064171122995, |
| "grad_norm": 3.606067419052124, |
| "learning_rate": 9.119704623724528e-06, |
| "loss": 0.2779, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.6631016042780749, |
| "grad_norm": 3.189316511154175, |
| "learning_rate": 9.086729568078006e-06, |
| "loss": 0.2813, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6737967914438503, |
| "grad_norm": 3.725550651550293, |
| "learning_rate": 9.05321014193089e-06, |
| "loss": 0.284, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.6844919786096256, |
| "grad_norm": 3.6915953159332275, |
| "learning_rate": 9.019150810219376e-06, |
| "loss": 0.2646, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6951871657754011, |
| "grad_norm": 3.499095916748047, |
| "learning_rate": 8.984556109797484e-06, |
| "loss": 0.2693, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.7058823529411765, |
| "grad_norm": 3.671689510345459, |
| "learning_rate": 8.949430648832716e-06, |
| "loss": 0.2662, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7165775401069518, |
| "grad_norm": 3.549403429031372, |
| "learning_rate": 8.91377910619223e-06, |
| "loss": 0.2745, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 3.2107975482940674, |
| "learning_rate": 8.8776062308196e-06, |
| "loss": 0.2462, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7379679144385026, |
| "grad_norm": 3.040982246398926, |
| "learning_rate": 8.84091684110223e-06, |
| "loss": 0.2663, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.7486631016042781, |
| "grad_norm": 3.460141658782959, |
| "learning_rate": 8.803715824229525e-06, |
| "loss": 0.2595, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7593582887700535, |
| "grad_norm": 3.5115394592285156, |
| "learning_rate": 8.766008135541896e-06, |
| "loss": 0.2476, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.7700534759358288, |
| "grad_norm": 3.2345032691955566, |
| "learning_rate": 8.727798797870688e-06, |
| "loss": 0.2399, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7807486631016043, |
| "grad_norm": 3.489104986190796, |
| "learning_rate": 8.689092900869112e-06, |
| "loss": 0.2568, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.7914438502673797, |
| "grad_norm": 3.4822070598602295, |
| "learning_rate": 8.649895600334284e-06, |
| "loss": 0.2779, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.8021390374331551, |
| "grad_norm": 3.6014904975891113, |
| "learning_rate": 8.610212117520453e-06, |
| "loss": 0.2738, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.8128342245989305, |
| "grad_norm": 3.3353493213653564, |
| "learning_rate": 8.570047738443502e-06, |
| "loss": 0.2564, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8235294117647058, |
| "grad_norm": 3.1294198036193848, |
| "learning_rate": 8.52940781317683e-06, |
| "loss": 0.2459, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.8342245989304813, |
| "grad_norm": 3.2122206687927246, |
| "learning_rate": 8.48829775513869e-06, |
| "loss": 0.2639, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8449197860962567, |
| "grad_norm": 3.2475640773773193, |
| "learning_rate": 8.446723040371114e-06, |
| "loss": 0.2545, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.8556149732620321, |
| "grad_norm": 3.113609552383423, |
| "learning_rate": 8.40468920681047e-06, |
| "loss": 0.2712, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8663101604278075, |
| "grad_norm": 3.231935501098633, |
| "learning_rate": 8.362201853549777e-06, |
| "loss": 0.27, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.8770053475935828, |
| "grad_norm": 3.3826744556427, |
| "learning_rate": 8.319266640092899e-06, |
| "loss": 0.2823, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8877005347593583, |
| "grad_norm": 3.3691983222961426, |
| "learning_rate": 8.275889285600656e-06, |
| "loss": 0.2555, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.8983957219251337, |
| "grad_norm": 3.4460556507110596, |
| "learning_rate": 8.23207556812902e-06, |
| "loss": 0.2615, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 3.0713820457458496, |
| "learning_rate": 8.187831323859445e-06, |
| "loss": 0.2512, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.9197860962566845, |
| "grad_norm": 2.781736373901367, |
| "learning_rate": 8.143162446321465e-06, |
| "loss": 0.2568, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.93048128342246, |
| "grad_norm": 3.0802292823791504, |
| "learning_rate": 8.098074885607646e-06, |
| "loss": 0.2506, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.9411764705882353, |
| "grad_norm": 3.0498411655426025, |
| "learning_rate": 8.052574647581009e-06, |
| "loss": 0.2586, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9518716577540107, |
| "grad_norm": 3.0598437786102295, |
| "learning_rate": 8.006667793075026e-06, |
| "loss": 0.2576, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.9625668449197861, |
| "grad_norm": 2.935920238494873, |
| "learning_rate": 7.960360437086287e-06, |
| "loss": 0.2363, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9732620320855615, |
| "grad_norm": 3.2348110675811768, |
| "learning_rate": 7.91365874795995e-06, |
| "loss": 0.2677, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.983957219251337, |
| "grad_norm": 3.4138996601104736, |
| "learning_rate": 7.866568946568107e-06, |
| "loss": 0.2702, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9946524064171123, |
| "grad_norm": 2.949201822280884, |
| "learning_rate": 7.819097305481112e-06, |
| "loss": 0.2487, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.0042780748663103, |
| "grad_norm": 2.0031890869140625, |
| "learning_rate": 7.771250148132067e-06, |
| "loss": 0.1638, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.0149732620320855, |
| "grad_norm": 2.1339635848999023, |
| "learning_rate": 7.723033847974503e-06, |
| "loss": 0.1013, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.025668449197861, |
| "grad_norm": 2.822927474975586, |
| "learning_rate": 7.674454827633413e-06, |
| "loss": 0.0924, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0363636363636364, |
| "grad_norm": 3.7076971530914307, |
| "learning_rate": 7.625519558049722e-06, |
| "loss": 0.1075, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.0470588235294118, |
| "grad_norm": 2.7152915000915527, |
| "learning_rate": 7.576234557618336e-06, |
| "loss": 0.0926, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0577540106951873, |
| "grad_norm": 3.0349481105804443, |
| "learning_rate": 7.526606391319862e-06, |
| "loss": 0.1054, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.0684491978609625, |
| "grad_norm": 2.7987284660339355, |
| "learning_rate": 7.476641669846121e-06, |
| "loss": 0.0999, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.079144385026738, |
| "grad_norm": 2.2019879817962646, |
| "learning_rate": 7.426347048719577e-06, |
| "loss": 0.1082, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.0898395721925134, |
| "grad_norm": 2.8885598182678223, |
| "learning_rate": 7.375729227406789e-06, |
| "loss": 0.1227, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.1005347593582888, |
| "grad_norm": 2.8309147357940674, |
| "learning_rate": 7.324794948426015e-06, |
| "loss": 0.0943, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.1112299465240643, |
| "grad_norm": 2.788823366165161, |
| "learning_rate": 7.273550996449077e-06, |
| "loss": 0.1038, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.1219251336898395, |
| "grad_norm": 2.1967334747314453, |
| "learning_rate": 7.222004197397613e-06, |
| "loss": 0.0956, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.132620320855615, |
| "grad_norm": 2.6466832160949707, |
| "learning_rate": 7.170161417533836e-06, |
| "loss": 0.0946, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1433155080213904, |
| "grad_norm": 2.3864052295684814, |
| "learning_rate": 7.118029562545915e-06, |
| "loss": 0.0868, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.1540106951871658, |
| "grad_norm": 2.6868040561676025, |
| "learning_rate": 7.065615576628107e-06, |
| "loss": 0.1009, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1647058823529413, |
| "grad_norm": 2.748037815093994, |
| "learning_rate": 7.0129264415557585e-06, |
| "loss": 0.1067, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.1754010695187165, |
| "grad_norm": 3.183288097381592, |
| "learning_rate": 6.959969175755306e-06, |
| "loss": 0.1009, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.186096256684492, |
| "grad_norm": 2.472684383392334, |
| "learning_rate": 6.906750833369386e-06, |
| "loss": 0.0912, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.1967914438502674, |
| "grad_norm": 3.301072359085083, |
| "learning_rate": 6.8532785033171975e-06, |
| "loss": 0.1026, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.2074866310160428, |
| "grad_norm": 2.607532262802124, |
| "learning_rate": 6.799559308350219e-06, |
| "loss": 0.0985, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.2181818181818183, |
| "grad_norm": 2.714355230331421, |
| "learning_rate": 6.745600404103431e-06, |
| "loss": 0.0982, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.2288770053475937, |
| "grad_norm": 3.1020829677581787, |
| "learning_rate": 6.6914089781421535e-06, |
| "loss": 0.1084, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.239572192513369, |
| "grad_norm": 3.3140673637390137, |
| "learning_rate": 6.636992249004629e-06, |
| "loss": 0.1054, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.2502673796791444, |
| "grad_norm": 2.7101094722747803, |
| "learning_rate": 6.582357465240488e-06, |
| "loss": 0.1045, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.2609625668449198, |
| "grad_norm": 2.9788272380828857, |
| "learning_rate": 6.527511904445194e-06, |
| "loss": 0.0883, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.2716577540106953, |
| "grad_norm": 3.2309136390686035, |
| "learning_rate": 6.472462872290654e-06, |
| "loss": 0.0946, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.2823529411764705, |
| "grad_norm": 2.4168930053710938, |
| "learning_rate": 6.417217701552059e-06, |
| "loss": 0.0996, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.293048128342246, |
| "grad_norm": 2.6719398498535156, |
| "learning_rate": 6.36178375113113e-06, |
| "loss": 0.0963, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.3037433155080214, |
| "grad_norm": 2.4402902126312256, |
| "learning_rate": 6.3061684050758776e-06, |
| "loss": 0.0986, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.3144385026737968, |
| "grad_norm": 3.2404541969299316, |
| "learning_rate": 6.250379071597018e-06, |
| "loss": 0.0925, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.3251336898395722, |
| "grad_norm": 2.904754161834717, |
| "learning_rate": 6.194423182081161e-06, |
| "loss": 0.1125, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.3358288770053477, |
| "grad_norm": 3.329714059829712, |
| "learning_rate": 6.138308190100918e-06, |
| "loss": 0.1038, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.346524064171123, |
| "grad_norm": 3.2199363708496094, |
| "learning_rate": 6.082041570422059e-06, |
| "loss": 0.1097, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.3572192513368984, |
| "grad_norm": 2.6863086223602295, |
| "learning_rate": 6.025630818007833e-06, |
| "loss": 0.1001, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.3679144385026738, |
| "grad_norm": 2.825577735900879, |
| "learning_rate": 5.969083447020606e-06, |
| "loss": 0.1018, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.3786096256684492, |
| "grad_norm": 2.606800079345703, |
| "learning_rate": 5.912406989820948e-06, |
| "loss": 0.1043, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.3893048128342245, |
| "grad_norm": 2.4628851413726807, |
| "learning_rate": 5.855608995964283e-06, |
| "loss": 0.0961, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 2.584479331970215, |
| "learning_rate": 5.798697031195257e-06, |
| "loss": 0.1025, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.4106951871657754, |
| "grad_norm": 2.4871041774749756, |
| "learning_rate": 5.741678676439946e-06, |
| "loss": 0.0993, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.4213903743315508, |
| "grad_norm": 2.876145601272583, |
| "learning_rate": 5.684561526796045e-06, |
| "loss": 0.0968, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.4320855614973262, |
| "grad_norm": 3.0748062133789062, |
| "learning_rate": 5.627353190521168e-06, |
| "loss": 0.0824, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.4427807486631017, |
| "grad_norm": 2.5074338912963867, |
| "learning_rate": 5.570061288019385e-06, |
| "loss": 0.0994, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.4534759358288771, |
| "grad_norm": 2.8043177127838135, |
| "learning_rate": 5.51269345082617e-06, |
| "loss": 0.1007, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.4641711229946524, |
| "grad_norm": 2.6914560794830322, |
| "learning_rate": 5.455257320591825e-06, |
| "loss": 0.1021, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.4748663101604278, |
| "grad_norm": 2.208425521850586, |
| "learning_rate": 5.397760548063591e-06, |
| "loss": 0.0918, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.4855614973262032, |
| "grad_norm": 2.6503446102142334, |
| "learning_rate": 5.340210792066531e-06, |
| "loss": 0.0967, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.4962566844919787, |
| "grad_norm": 2.4122114181518555, |
| "learning_rate": 5.282615718483344e-06, |
| "loss": 0.0859, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.506951871657754, |
| "grad_norm": 2.084665060043335, |
| "learning_rate": 5.224982999233228e-06, |
| "loss": 0.0858, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.5176470588235293, |
| "grad_norm": 3.171128034591675, |
| "learning_rate": 5.167320311249951e-06, |
| "loss": 0.1037, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.5283422459893048, |
| "grad_norm": 3.304917573928833, |
| "learning_rate": 5.109635335459256e-06, |
| "loss": 0.0845, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.5390374331550802, |
| "grad_norm": 2.538336992263794, |
| "learning_rate": 5.051935755755713e-06, |
| "loss": 0.0875, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.5497326203208557, |
| "grad_norm": 3.2048041820526123, |
| "learning_rate": 4.9942292579791965e-06, |
| "loss": 0.0957, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.5604278074866311, |
| "grad_norm": 2.32966685295105, |
| "learning_rate": 4.936523528891111e-06, |
| "loss": 0.08, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.5711229946524066, |
| "grad_norm": 2.5276477336883545, |
| "learning_rate": 4.878826255150453e-06, |
| "loss": 0.1025, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.5818181818181818, |
| "grad_norm": 2.489506244659424, |
| "learning_rate": 4.821145122289941e-06, |
| "loss": 0.0905, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.5925133689839572, |
| "grad_norm": 2.0386905670166016, |
| "learning_rate": 4.763487813692252e-06, |
| "loss": 0.0863, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.6032085561497325, |
| "grad_norm": 3.1131086349487305, |
| "learning_rate": 4.705862009566564e-06, |
| "loss": 0.0884, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.613903743315508, |
| "grad_norm": 2.19130277633667, |
| "learning_rate": 4.648275385925522e-06, |
| "loss": 0.0919, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.6245989304812833, |
| "grad_norm": 2.7279891967773438, |
| "learning_rate": 4.5907356135627605e-06, |
| "loss": 0.0956, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.6352941176470588, |
| "grad_norm": 2.4997522830963135, |
| "learning_rate": 4.533250357031104e-06, |
| "loss": 0.1017, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.6459893048128342, |
| "grad_norm": 2.488523006439209, |
| "learning_rate": 4.475827273621639e-06, |
| "loss": 0.0968, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6566844919786097, |
| "grad_norm": 2.9442784786224365, |
| "learning_rate": 4.418474012343711e-06, |
| "loss": 0.0973, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.6673796791443851, |
| "grad_norm": 2.7047040462493896, |
| "learning_rate": 4.361198212906048e-06, |
| "loss": 0.0884, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.6780748663101606, |
| "grad_norm": 2.4104325771331787, |
| "learning_rate": 4.304007504699118e-06, |
| "loss": 0.087, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.6887700534759358, |
| "grad_norm": 2.9229371547698975, |
| "learning_rate": 4.246909505778862e-06, |
| "loss": 0.0772, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.6994652406417112, |
| "grad_norm": 2.7569501399993896, |
| "learning_rate": 4.189911821851928e-06, |
| "loss": 0.0973, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.7101604278074867, |
| "grad_norm": 2.2701845169067383, |
| "learning_rate": 4.1330220452625644e-06, |
| "loss": 0.0869, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.720855614973262, |
| "grad_norm": 2.91239857673645, |
| "learning_rate": 4.076247753981285e-06, |
| "loss": 0.0985, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.7315508021390373, |
| "grad_norm": 2.3714444637298584, |
| "learning_rate": 4.019596510595447e-06, |
| "loss": 0.0862, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.7422459893048128, |
| "grad_norm": 2.617003917694092, |
| "learning_rate": 3.963075861301886e-06, |
| "loss": 0.0933, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.7529411764705882, |
| "grad_norm": 2.6759443283081055, |
| "learning_rate": 3.9066933349017165e-06, |
| "loss": 0.0883, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.7636363636363637, |
| "grad_norm": 3.302290439605713, |
| "learning_rate": 3.8504564417974795e-06, |
| "loss": 0.0988, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.7743315508021391, |
| "grad_norm": 2.1597912311553955, |
| "learning_rate": 3.7943726729927154e-06, |
| "loss": 0.0922, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.7850267379679146, |
| "grad_norm": 2.2167508602142334, |
| "learning_rate": 3.738449499094121e-06, |
| "loss": 0.0807, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.79572192513369, |
| "grad_norm": 2.180342197418213, |
| "learning_rate": 3.682694369316446e-06, |
| "loss": 0.0741, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.8064171122994652, |
| "grad_norm": 3.1924726963043213, |
| "learning_rate": 3.6271147104902192e-06, |
| "loss": 0.0836, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.8171122994652407, |
| "grad_norm": 3.1651058197021484, |
| "learning_rate": 3.5717179260724544e-06, |
| "loss": 0.0911, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.8278074866310159, |
| "grad_norm": 3.11163330078125, |
| "learning_rate": 3.5165113951604874e-06, |
| "loss": 0.094, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.8385026737967913, |
| "grad_norm": 2.2718892097473145, |
| "learning_rate": 3.461502471509045e-06, |
| "loss": 0.0818, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.8491978609625668, |
| "grad_norm": 2.625342607498169, |
| "learning_rate": 3.4066984825506855e-06, |
| "loss": 0.0891, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.8598930481283422, |
| "grad_norm": 2.286201000213623, |
| "learning_rate": 3.35210672841976e-06, |
| "loss": 0.0906, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.8705882352941177, |
| "grad_norm": 2.516383171081543, |
| "learning_rate": 3.297734480980002e-06, |
| "loss": 0.0856, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.881283422459893, |
| "grad_norm": 3.1512904167175293, |
| "learning_rate": 3.2435889828558753e-06, |
| "loss": 0.0845, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.8919786096256686, |
| "grad_norm": 3.0289194583892822, |
| "learning_rate": 3.1896774464678327e-06, |
| "loss": 0.0956, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.902673796791444, |
| "grad_norm": 2.4393064975738525, |
| "learning_rate": 3.1360070530715885e-06, |
| "loss": 0.0824, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.9133689839572192, |
| "grad_norm": 2.774142026901245, |
| "learning_rate": 3.0825849518015334e-06, |
| "loss": 0.0873, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.9240641711229947, |
| "grad_norm": 2.386697292327881, |
| "learning_rate": 3.029418258718454e-06, |
| "loss": 0.0836, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.93475935828877, |
| "grad_norm": 2.7990691661834717, |
| "learning_rate": 2.9765140558616287e-06, |
| "loss": 0.0875, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.9454545454545453, |
| "grad_norm": 2.4611012935638428, |
| "learning_rate": 2.9238793903054757e-06, |
| "loss": 0.0791, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.9561497326203208, |
| "grad_norm": 2.292494058609009, |
| "learning_rate": 2.8715212732208523e-06, |
| "loss": 0.0777, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.9668449197860962, |
| "grad_norm": 2.5736472606658936, |
| "learning_rate": 2.819446678941126e-06, |
| "loss": 0.0853, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.9775401069518717, |
| "grad_norm": 2.792259693145752, |
| "learning_rate": 2.7676625440331756e-06, |
| "loss": 0.08, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.988235294117647, |
| "grad_norm": 2.7590394020080566, |
| "learning_rate": 2.7161757663734012e-06, |
| "loss": 0.0876, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.9989304812834225, |
| "grad_norm": 3.141918420791626, |
| "learning_rate": 2.6649932042288994e-06, |
| "loss": 0.0874, |
| "step": 935 |
| }, |
| { |
| "epoch": 2.0085561497326205, |
| "grad_norm": 1.2265775203704834, |
| "learning_rate": 2.6141216753439115e-06, |
| "loss": 0.0265, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.0192513368983955, |
| "grad_norm": 1.1748132705688477, |
| "learning_rate": 2.5635679560316703e-06, |
| "loss": 0.026, |
| "step": 945 |
| }, |
| { |
| "epoch": 2.029946524064171, |
| "grad_norm": 1.9212442636489868, |
| "learning_rate": 2.5133387802717585e-06, |
| "loss": 0.0245, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.0406417112299464, |
| "grad_norm": 1.0849891901016235, |
| "learning_rate": 2.4634408388131255e-06, |
| "loss": 0.0213, |
| "step": 955 |
| }, |
| { |
| "epoch": 2.051336898395722, |
| "grad_norm": 2.341118812561035, |
| "learning_rate": 2.4138807782828423e-06, |
| "loss": 0.0217, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.0620320855614973, |
| "grad_norm": 1.3080765008926392, |
| "learning_rate": 2.3646652003007348e-06, |
| "loss": 0.0213, |
| "step": 965 |
| }, |
| { |
| "epoch": 2.0727272727272728, |
| "grad_norm": 1.187409520149231, |
| "learning_rate": 2.3158006606000345e-06, |
| "loss": 0.0278, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.083422459893048, |
| "grad_norm": 2.2584495544433594, |
| "learning_rate": 2.267293668154114e-06, |
| "loss": 0.0239, |
| "step": 975 |
| }, |
| { |
| "epoch": 2.0941176470588236, |
| "grad_norm": 2.2861719131469727, |
| "learning_rate": 2.2191506843094628e-06, |
| "loss": 0.0254, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.104812834224599, |
| "grad_norm": 1.3212783336639404, |
| "learning_rate": 2.1713781219250157e-06, |
| "loss": 0.022, |
| "step": 985 |
| }, |
| { |
| "epoch": 2.1155080213903745, |
| "grad_norm": 1.6825640201568604, |
| "learning_rate": 2.1239823445179236e-06, |
| "loss": 0.0216, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.1262032085561495, |
| "grad_norm": 2.0671329498291016, |
| "learning_rate": 2.076969665415908e-06, |
| "loss": 0.0211, |
| "step": 995 |
| }, |
| { |
| "epoch": 2.136898395721925, |
| "grad_norm": 1.9904191493988037, |
| "learning_rate": 2.0303463469163004e-06, |
| "loss": 0.0246, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.1475935828877004, |
| "grad_norm": 1.6527910232543945, |
| "learning_rate": 1.984118599451866e-06, |
| "loss": 0.024, |
| "step": 1005 |
| }, |
| { |
| "epoch": 2.158288770053476, |
| "grad_norm": 1.7971508502960205, |
| "learning_rate": 1.938292580763561e-06, |
| "loss": 0.025, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.1689839572192513, |
| "grad_norm": 1.7470921277999878, |
| "learning_rate": 1.8928743950802863e-06, |
| "loss": 0.0295, |
| "step": 1015 |
| }, |
| { |
| "epoch": 2.1796791443850267, |
| "grad_norm": 1.0844634771347046, |
| "learning_rate": 1.847870092305773e-06, |
| "loss": 0.0158, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.190374331550802, |
| "grad_norm": 1.632693886756897, |
| "learning_rate": 1.8032856672127191e-06, |
| "loss": 0.0211, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.2010695187165776, |
| "grad_norm": 1.0532259941101074, |
| "learning_rate": 1.7591270586442517e-06, |
| "loss": 0.0189, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.211764705882353, |
| "grad_norm": 1.5501878261566162, |
| "learning_rate": 1.7154001487228478e-06, |
| "loss": 0.0175, |
| "step": 1035 |
| }, |
| { |
| "epoch": 2.2224598930481285, |
| "grad_norm": 1.4238228797912598, |
| "learning_rate": 1.672110762066811e-06, |
| "loss": 0.0215, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.233155080213904, |
| "grad_norm": 1.4649953842163086, |
| "learning_rate": 1.6292646650144072e-06, |
| "loss": 0.0182, |
| "step": 1045 |
| }, |
| { |
| "epoch": 2.243850267379679, |
| "grad_norm": 2.2017998695373535, |
| "learning_rate": 1.5868675648557568e-06, |
| "loss": 0.0176, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.2545454545454544, |
| "grad_norm": 1.6893876791000366, |
| "learning_rate": 1.544925109072607e-06, |
| "loss": 0.02, |
| "step": 1055 |
| }, |
| { |
| "epoch": 2.26524064171123, |
| "grad_norm": 1.3076281547546387, |
| "learning_rate": 1.50344288458606e-06, |
| "loss": 0.0185, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.2759358288770053, |
| "grad_norm": 1.342536449432373, |
| "learning_rate": 1.462426417012361e-06, |
| "loss": 0.0165, |
| "step": 1065 |
| }, |
| { |
| "epoch": 2.2866310160427807, |
| "grad_norm": 2.3762521743774414, |
| "learning_rate": 1.4218811699268752e-06, |
| "loss": 0.019, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.297326203208556, |
| "grad_norm": 1.6990896463394165, |
| "learning_rate": 1.38181254413631e-06, |
| "loss": 0.0179, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.3080213903743316, |
| "grad_norm": 1.9363665580749512, |
| "learning_rate": 1.3422258769592967e-06, |
| "loss": 0.0199, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.318716577540107, |
| "grad_norm": 1.7887779474258423, |
| "learning_rate": 1.3031264415154477e-06, |
| "loss": 0.0194, |
| "step": 1085 |
| }, |
| { |
| "epoch": 2.3294117647058825, |
| "grad_norm": 1.9284300804138184, |
| "learning_rate": 1.2645194460229454e-06, |
| "loss": 0.0213, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.3401069518716575, |
| "grad_norm": 2.6546132564544678, |
| "learning_rate": 1.2264100331047878e-06, |
| "loss": 0.0216, |
| "step": 1095 |
| }, |
| { |
| "epoch": 2.350802139037433, |
| "grad_norm": 1.0779314041137695, |
| "learning_rate": 1.1888032791037697e-06, |
| "loss": 0.0198, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.3614973262032084, |
| "grad_norm": 2.0439534187316895, |
| "learning_rate": 1.1517041934062834e-06, |
| "loss": 0.019, |
| "step": 1105 |
| }, |
| { |
| "epoch": 2.372192513368984, |
| "grad_norm": 1.9515999555587769, |
| "learning_rate": 1.115117717775056e-06, |
| "loss": 0.0169, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.3828877005347593, |
| "grad_norm": 1.5731124877929688, |
| "learning_rate": 1.0790487256908826e-06, |
| "loss": 0.0219, |
| "step": 1115 |
| }, |
| { |
| "epoch": 2.3935828877005347, |
| "grad_norm": 2.9910495281219482, |
| "learning_rate": 1.043502021703449e-06, |
| "loss": 0.0273, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.40427807486631, |
| "grad_norm": 1.1649227142333984, |
| "learning_rate": 1.0084823407913563e-06, |
| "loss": 0.0242, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.4149732620320856, |
| "grad_norm": 1.618787407875061, |
| "learning_rate": 9.739943477313918e-07, |
| "loss": 0.0171, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.425668449197861, |
| "grad_norm": 1.1850640773773193, |
| "learning_rate": 9.400426364771648e-07, |
| "loss": 0.0207, |
| "step": 1135 |
| }, |
| { |
| "epoch": 2.4363636363636365, |
| "grad_norm": 1.6684306859970093, |
| "learning_rate": 9.066317295471688e-07, |
| "loss": 0.0154, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.447058823529412, |
| "grad_norm": 1.5794020891189575, |
| "learning_rate": 8.737660774223655e-07, |
| "loss": 0.017, |
| "step": 1145 |
| }, |
| { |
| "epoch": 2.4577540106951874, |
| "grad_norm": 1.608679175376892, |
| "learning_rate": 8.414500579533536e-07, |
| "loss": 0.0174, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.4684491978609624, |
| "grad_norm": 2.3195552825927734, |
| "learning_rate": 8.096879757772297e-07, |
| "loss": 0.0268, |
| "step": 1155 |
| }, |
| { |
| "epoch": 2.479144385026738, |
| "grad_norm": 1.937566876411438, |
| "learning_rate": 7.784840617441858e-07, |
| "loss": 0.0192, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.4898395721925133, |
| "grad_norm": 1.0479762554168701, |
| "learning_rate": 7.47842472353939e-07, |
| "loss": 0.0163, |
| "step": 1165 |
| }, |
| { |
| "epoch": 2.5005347593582887, |
| "grad_norm": 1.6187900304794312, |
| "learning_rate": 7.177672892020743e-07, |
| "loss": 0.0195, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.511229946524064, |
| "grad_norm": 2.198720932006836, |
| "learning_rate": 6.882625184363534e-07, |
| "loss": 0.0182, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.5219251336898396, |
| "grad_norm": 1.8759390115737915, |
| "learning_rate": 6.593320902230748e-07, |
| "loss": 0.0232, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.532620320855615, |
| "grad_norm": 1.2502473592758179, |
| "learning_rate": 6.309798582235671e-07, |
| "loss": 0.0178, |
| "step": 1185 |
| }, |
| { |
| "epoch": 2.5433155080213905, |
| "grad_norm": 1.6022732257843018, |
| "learning_rate": 6.032095990808567e-07, |
| "loss": 0.0193, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.5540106951871655, |
| "grad_norm": 1.390811562538147, |
| "learning_rate": 5.760250119166061e-07, |
| "loss": 0.0203, |
| "step": 1195 |
| }, |
| { |
| "epoch": 2.564705882352941, |
| "grad_norm": 1.4281519651412964, |
| "learning_rate": 5.494297178383729e-07, |
| "loss": 0.0194, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.5754010695187164, |
| "grad_norm": 1.2555127143859863, |
| "learning_rate": 5.234272594572604e-07, |
| "loss": 0.0172, |
| "step": 1205 |
| }, |
| { |
| "epoch": 2.586096256684492, |
| "grad_norm": 2.223867654800415, |
| "learning_rate": 4.980211004160306e-07, |
| "loss": 0.0222, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.5967914438502673, |
| "grad_norm": 0.8818890452384949, |
| "learning_rate": 4.7321462492772975e-07, |
| "loss": 0.0168, |
| "step": 1215 |
| }, |
| { |
| "epoch": 2.6074866310160427, |
| "grad_norm": 2.0135138034820557, |
| "learning_rate": 4.490111373248918e-07, |
| "loss": 0.0188, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.618181818181818, |
| "grad_norm": 2.228736400604248, |
| "learning_rate": 4.2541386161939424e-07, |
| "loss": 0.024, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.6288770053475936, |
| "grad_norm": 1.993802547454834, |
| "learning_rate": 4.024259410730008e-07, |
| "loss": 0.0204, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.639572192513369, |
| "grad_norm": 1.5609642267227173, |
| "learning_rate": 3.800504377786651e-07, |
| "loss": 0.0207, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.6502673796791445, |
| "grad_norm": 2.098813772201538, |
| "learning_rate": 3.582903322526482e-07, |
| "loss": 0.0199, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.66096256684492, |
| "grad_norm": 1.1627248525619507, |
| "learning_rate": 3.371485230374988e-07, |
| "loss": 0.0226, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.6716577540106954, |
| "grad_norm": 1.6938384771347046, |
| "learning_rate": 3.166278263159539e-07, |
| "loss": 0.0166, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.682352941176471, |
| "grad_norm": 2.4418842792510986, |
| "learning_rate": 2.9673097553581385e-07, |
| "loss": 0.0226, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.693048128342246, |
| "grad_norm": 1.8153109550476074, |
| "learning_rate": 2.7746062104583473e-07, |
| "loss": 0.0164, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.7037433155080213, |
| "grad_norm": 1.7912057638168335, |
| "learning_rate": 2.588193297426844e-07, |
| "loss": 0.0222, |
| "step": 1265 |
| }, |
| { |
| "epoch": 2.7144385026737967, |
| "grad_norm": 1.2618776559829712, |
| "learning_rate": 2.4080958472902873e-07, |
| "loss": 0.0226, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.725133689839572, |
| "grad_norm": 1.1699613332748413, |
| "learning_rate": 2.2343378498276391e-07, |
| "loss": 0.0145, |
| "step": 1275 |
| }, |
| { |
| "epoch": 2.7358288770053476, |
| "grad_norm": 1.2414964437484741, |
| "learning_rate": 2.0669424503746871e-07, |
| "loss": 0.0173, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.746524064171123, |
| "grad_norm": 1.5833706855773926, |
| "learning_rate": 1.9059319467409244e-07, |
| "loss": 0.0185, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.7572192513368985, |
| "grad_norm": 1.5452425479888916, |
| "learning_rate": 1.7513277862394263e-07, |
| "loss": 0.0143, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.767914438502674, |
| "grad_norm": 2.001614809036255, |
| "learning_rate": 1.603150562829936e-07, |
| "loss": 0.0212, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.778609625668449, |
| "grad_norm": 1.535340428352356, |
| "learning_rate": 1.4614200143756975e-07, |
| "loss": 0.0156, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.7893048128342244, |
| "grad_norm": 2.188175678253174, |
| "learning_rate": 1.3261550200142415e-07, |
| "loss": 0.0163, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 1.8125184774398804, |
| "learning_rate": 1.197373597642637e-07, |
| "loss": 0.0176, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.8106951871657753, |
| "grad_norm": 1.2068591117858887, |
| "learning_rate": 1.0750929015174216e-07, |
| "loss": 0.0127, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.8213903743315507, |
| "grad_norm": 2.0283000469207764, |
| "learning_rate": 9.593292199695403e-08, |
| "loss": 0.0166, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.832085561497326, |
| "grad_norm": 1.5842273235321045, |
| "learning_rate": 8.500979732347259e-08, |
| "loss": 0.0189, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.8427807486631016, |
| "grad_norm": 1.509087085723877, |
| "learning_rate": 7.47413711399414e-08, |
| "loss": 0.0178, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.853475935828877, |
| "grad_norm": 1.6988673210144043, |
| "learning_rate": 6.51290112462627e-08, |
| "loss": 0.0139, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.8641711229946525, |
| "grad_norm": 1.3389947414398193, |
| "learning_rate": 5.617399805139867e-08, |
| "loss": 0.0144, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.874866310160428, |
| "grad_norm": 1.2916324138641357, |
| "learning_rate": 4.7877524402817345e-08, |
| "loss": 0.0129, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.8855614973262034, |
| "grad_norm": 1.6901161670684814, |
| "learning_rate": 4.024069542759801e-08, |
| "loss": 0.0191, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.896256684491979, |
| "grad_norm": 2.0415706634521484, |
| "learning_rate": 3.3264528385225094e-08, |
| "loss": 0.0169, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.9069518716577543, |
| "grad_norm": 1.594831943511963, |
| "learning_rate": 2.6949952532083768e-08, |
| "loss": 0.0157, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.9176470588235293, |
| "grad_norm": 1.7152241468429565, |
| "learning_rate": 2.1297808997678417e-08, |
| "loss": 0.0194, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.9283422459893047, |
| "grad_norm": 1.6763319969177246, |
| "learning_rate": 1.6308850672591692e-08, |
| "loss": 0.0148, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.93903743315508, |
| "grad_norm": 1.905222773551941, |
| "learning_rate": 1.1983742108195862e-08, |
| "loss": 0.0168, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.9497326203208556, |
| "grad_norm": 0.8023597002029419, |
| "learning_rate": 8.323059428130831e-09, |
| "loss": 0.0153, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.960427807486631, |
| "grad_norm": 1.5736637115478516, |
| "learning_rate": 5.3272902515622e-09, |
| "loss": 0.0208, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.9711229946524065, |
| "grad_norm": 1.540812373161316, |
| "learning_rate": 2.996833628228779e-09, |
| "loss": 0.017, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.981818181818182, |
| "grad_norm": 1.0774410963058472, |
| "learning_rate": 1.3319999852867692e-09, |
| "loss": 0.0144, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.9925133689839574, |
| "grad_norm": 1.853932499885559, |
| "learning_rate": 3.330110859600666e-10, |
| "loss": 0.02, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 1404, |
| "total_flos": 1.0868263403315528e+18, |
| "train_loss": 0.12404776059961387, |
| "train_runtime": 1235.844, |
| "train_samples_per_second": 72.563, |
| "train_steps_per_second": 1.136 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1404, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.0868263403315528e+18, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |