| {"loss": 2.0693, "learning_rate": 2.3999999999999997e-05, "epoch": 0.03, "step": 10} |
| {"loss": 1.9946, "learning_rate": 5.399999999999999e-05, "epoch": 0.06, "step": 20} |
| {"loss": 1.8837, "learning_rate": 8.1e-05, "epoch": 0.08, "step": 30} |
| {"loss": 1.6655, "learning_rate": 0.00011099999999999999, "epoch": 0.11, "step": 40} |
| {"loss": 1.6095, "learning_rate": 0.00014099999999999998, "epoch": 0.14, "step": 50} |
| {"loss": 1.3838, "learning_rate": 0.00017099999999999998, "epoch": 0.17, "step": 60} |
| {"loss": 1.3548, "learning_rate": 0.000201, "epoch": 0.2, "step": 70} |
| {"loss": 1.0789, "learning_rate": 0.00023099999999999998, "epoch": 0.22, "step": 80} |
| {"loss": 1.0589, "learning_rate": 0.000261, "epoch": 0.25, "step": 90} |
| {"loss": 1.0259, "learning_rate": 0.00029099999999999997, "epoch": 0.28, "step": 100} |
| {"loss": 0.9936, "learning_rate": 0.000299393063583815, "epoch": 0.31, "step": 110} |
| {"loss": 0.9633, "learning_rate": 0.0002985260115606936, "epoch": 0.34, "step": 120} |
| {"loss": 1.0018, "learning_rate": 0.00029765895953757224, "epoch": 0.37, "step": 130} |
| {"loss": 0.893, "learning_rate": 0.0002967919075144509, "epoch": 0.39, "step": 140} |
| {"loss": 0.8079, "learning_rate": 0.00029592485549132945, "epoch": 0.42, "step": 150} |
| {"loss": 0.8668, "learning_rate": 0.0002950578034682081, "epoch": 0.45, "step": 160} |
| {"loss": 0.773, "learning_rate": 0.00029419075144508666, "epoch": 0.48, "step": 170} |
| {"loss": 0.8139, "learning_rate": 0.0002933236994219653, "epoch": 0.51, "step": 180} |
| {"loss": 0.9139, "learning_rate": 0.0002924566473988439, "epoch": 0.53, "step": 190} |
| {"loss": 0.7371, "learning_rate": 0.0002915895953757225, "epoch": 0.56, "step": 200} |
| {"loss": 0.8604, "learning_rate": 0.0002907225433526011, "epoch": 0.59, "step": 210} |
| {"loss": 0.8597, "learning_rate": 0.0002898554913294797, "epoch": 0.62, "step": 220} |
| {"loss": 0.7954, "learning_rate": 0.0002889884393063584, "epoch": 0.65, "step": 230} |
| {"loss": 0.7559, "learning_rate": 0.00028812138728323696, "epoch": 0.67, "step": 240} |
| {"loss": 0.8488, "learning_rate": 0.0002872543352601156, "epoch": 0.7, "step": 250} |
| {"loss": 0.7252, "learning_rate": 0.00028638728323699417, "epoch": 0.73, "step": 260} |
| {"loss": 0.6751, "learning_rate": 0.0002855202312138728, "epoch": 0.76, "step": 270} |
| {"loss": 0.7852, "learning_rate": 0.00028465317919075143, "epoch": 0.79, "step": 280} |
| {"loss": 0.7843, "learning_rate": 0.00028378612716763, "epoch": 0.81, "step": 290} |
| {"loss": 0.7886, "learning_rate": 0.00028291907514450864, "epoch": 0.84, "step": 300} |
| {"loss": 0.7218, "learning_rate": 0.00028205202312138727, "epoch": 0.87, "step": 310} |
| {"loss": 0.7919, "learning_rate": 0.0002811849710982659, "epoch": 0.9, "step": 320} |
| {"loss": 0.7248, "learning_rate": 0.0002803179190751445, "epoch": 0.93, "step": 330} |
| {"loss": 0.7699, "learning_rate": 0.0002794508670520231, "epoch": 0.96, "step": 340} |
| {"loss": 0.6909, "learning_rate": 0.00027858381502890174, "epoch": 0.98, "step": 350} |
| {"loss": 0.6992, "learning_rate": 0.0002777167630057803, "epoch": 1.01, "step": 360} |
| {"loss": 0.6747, "learning_rate": 0.00027684971098265894, "epoch": 1.04, "step": 370} |
| {"loss": 0.672, "learning_rate": 0.0002759826589595375, "epoch": 1.07, "step": 380} |
| {"loss": 0.5997, "learning_rate": 0.0002751156069364162, "epoch": 1.1, "step": 390} |
| {"loss": 0.747, "learning_rate": 0.0002742485549132948, "epoch": 1.12, "step": 400} |
| {"loss": 0.7217, "learning_rate": 0.0002733815028901734, "epoch": 1.15, "step": 410} |
| {"loss": 0.683, "learning_rate": 0.000272514450867052, "epoch": 1.18, "step": 420} |
| {"loss": 0.63, "learning_rate": 0.0002716473988439306, "epoch": 1.21, "step": 430} |
| {"loss": 0.6889, "learning_rate": 0.00027078034682080925, "epoch": 1.24, "step": 440} |
| {"loss": 0.6582, "learning_rate": 0.0002699132947976878, "epoch": 1.26, "step": 450} |
| {"loss": 0.6366, "learning_rate": 0.00026904624277456645, "epoch": 1.29, "step": 460} |
| {"loss": 0.7249, "learning_rate": 0.00026817919075144503, "epoch": 1.32, "step": 470} |
| {"loss": 0.53, "learning_rate": 0.0002673121387283237, "epoch": 1.35, "step": 480} |
| {"loss": 0.6172, "learning_rate": 0.0002664450867052023, "epoch": 1.38, "step": 490} |
| {"loss": 0.6033, "learning_rate": 0.0002655780346820809, "epoch": 1.4, "step": 500} |
| {"eval_loss": 0.5882205367088318, "eval_runtime": 14.2208, "eval_samples_per_second": 3.516, "eval_steps_per_second": 0.492, "epoch": 1.4, "step": 500} |
| {"loss": 0.7452, "learning_rate": 0.0002647109826589595, "epoch": 1.43, "step": 510} |
| {"loss": 0.6689, "learning_rate": 0.00026384393063583813, "epoch": 1.46, "step": 520} |
| {"loss": 0.6339, "learning_rate": 0.00026297687861271676, "epoch": 1.49, "step": 530} |
| {"loss": 0.6538, "learning_rate": 0.00026210982658959533, "epoch": 1.52, "step": 540} |
| {"loss": 0.6864, "learning_rate": 0.00026124277456647397, "epoch": 1.54, "step": 550} |
| {"loss": 0.6466, "learning_rate": 0.0002603757225433526, "epoch": 1.57, "step": 560} |
| {"loss": 0.6186, "learning_rate": 0.00025950867052023117, "epoch": 1.6, "step": 570} |
| {"loss": 0.6872, "learning_rate": 0.0002586416184971098, "epoch": 1.63, "step": 580} |
| {"loss": 0.7206, "learning_rate": 0.00025777456647398843, "epoch": 1.66, "step": 590} |
| {"loss": 0.5848, "learning_rate": 0.000256907514450867, "epoch": 1.69, "step": 600} |
| {"loss": 0.711, "learning_rate": 0.00025604046242774564, "epoch": 1.71, "step": 610} |
| {"loss": 0.628, "learning_rate": 0.00025517341040462427, "epoch": 1.74, "step": 620} |
| {"loss": 0.7035, "learning_rate": 0.00025430635838150285, "epoch": 1.77, "step": 630} |
| {"loss": 0.5965, "learning_rate": 0.0002534393063583815, "epoch": 1.8, "step": 640} |
| {"loss": 0.6486, "learning_rate": 0.0002525722543352601, "epoch": 1.83, "step": 650} |
| {"loss": 0.6264, "learning_rate": 0.0002517052023121387, "epoch": 1.85, "step": 660} |
| {"loss": 0.6469, "learning_rate": 0.0002508381502890173, "epoch": 1.88, "step": 670} |
| {"loss": 0.61, "learning_rate": 0.00024997109826589595, "epoch": 1.91, "step": 680} |
| {"loss": 0.6664, "learning_rate": 0.0002491040462427746, "epoch": 1.94, "step": 690} |
| {"loss": 0.5312, "learning_rate": 0.00024823699421965315, "epoch": 1.97, "step": 700} |
| {"loss": 0.4703, "learning_rate": 0.0002473699421965318, "epoch": 1.99, "step": 710} |
| {"loss": 0.535, "learning_rate": 0.00024650289017341036, "epoch": 2.02, "step": 720} |
| {"loss": 0.6568, "learning_rate": 0.000245635838150289, "epoch": 2.05, "step": 730} |
| {"loss": 0.6303, "learning_rate": 0.0002447687861271676, "epoch": 2.08, "step": 740} |
| {"loss": 0.4911, "learning_rate": 0.00024390173410404622, "epoch": 2.11, "step": 750} |
| {"loss": 0.5043, "learning_rate": 0.00024303468208092483, "epoch": 2.13, "step": 760} |
| {"loss": 0.5248, "learning_rate": 0.00024216763005780346, "epoch": 2.16, "step": 770} |
| {"loss": 0.6274, "learning_rate": 0.00024130057803468206, "epoch": 2.19, "step": 780} |
| {"loss": 0.5617, "learning_rate": 0.00024043352601156066, "epoch": 2.22, "step": 790} |
| {"loss": 0.5978, "learning_rate": 0.00023956647398843927, "epoch": 2.25, "step": 800} |
| {"loss": 0.6027, "learning_rate": 0.00023869942196531787, "epoch": 2.28, "step": 810} |
| {"loss": 0.5925, "learning_rate": 0.00023783236994219653, "epoch": 2.3, "step": 820} |
| {"loss": 0.593, "learning_rate": 0.00023696531791907513, "epoch": 2.33, "step": 830} |
| {"loss": 0.6181, "learning_rate": 0.00023609826589595373, "epoch": 2.36, "step": 840} |
| {"loss": 0.4752, "learning_rate": 0.00023523121387283234, "epoch": 2.39, "step": 850} |
| {"loss": 0.655, "learning_rate": 0.00023436416184971097, "epoch": 2.42, "step": 860} |
| {"loss": 0.5577, "learning_rate": 0.00023349710982658957, "epoch": 2.44, "step": 870} |
| {"loss": 0.6132, "learning_rate": 0.00023263005780346818, "epoch": 2.47, "step": 880} |
| {"loss": 0.5003, "learning_rate": 0.00023176300578034678, "epoch": 2.5, "step": 890} |
| {"loss": 0.5323, "learning_rate": 0.00023089595375722544, "epoch": 2.53, "step": 900} |
| {"loss": 0.5908, "learning_rate": 0.00023002890173410404, "epoch": 2.56, "step": 910} |
| {"loss": 0.5911, "learning_rate": 0.00022916184971098264, "epoch": 2.58, "step": 920} |
| {"loss": 0.565, "learning_rate": 0.00022829479768786125, "epoch": 2.61, "step": 930} |
| {"loss": 0.5789, "learning_rate": 0.00022742774566473988, "epoch": 2.64, "step": 940} |
| {"loss": 0.5242, "learning_rate": 0.00022656069364161848, "epoch": 2.67, "step": 950} |
| {"loss": 0.5082, "learning_rate": 0.00022569364161849708, "epoch": 2.7, "step": 960} |
| {"loss": 0.5184, "learning_rate": 0.0002248265895953757, "epoch": 2.72, "step": 970} |
| {"loss": 0.6131, "learning_rate": 0.0002239595375722543, "epoch": 2.75, "step": 980} |
| {"loss": 0.5129, "learning_rate": 0.00022309248554913295, "epoch": 2.78, "step": 990} |
| {"loss": 0.6016, "learning_rate": 0.00022222543352601155, "epoch": 2.81, "step": 1000} |
| {"eval_loss": 0.50592440366745, "eval_runtime": 14.2225, "eval_samples_per_second": 3.516, "eval_steps_per_second": 0.492, "epoch": 2.81, "step": 1000} |
| {"loss": 0.5728, "learning_rate": 0.00022135838150289016, "epoch": 2.84, "step": 1010} |
| {"loss": 0.5155, "learning_rate": 0.00022049132947976876, "epoch": 2.87, "step": 1020} |
| {"loss": 0.5522, "learning_rate": 0.0002196242774566474, "epoch": 2.89, "step": 1030} |
| {"loss": 0.5176, "learning_rate": 0.000218757225433526, "epoch": 2.92, "step": 1040} |
| {"train_runtime": 3194.7197, "train_samples_per_second": 4.454, "train_steps_per_second": 1.114, "total_flos": 2.2017658953498624e+17, "train_loss": 0.7449809693580173, "epoch": 2.92, "step": 1041} |